SECRET: multinormal.h Source File

00001 /*
00002 
00003 Copyright (c) 2003, Cornell University
00004 All rights reserved.
00005 
00006 Redistribution and use in source and binary forms, with or without
00007 modification, are permitted provided that the following conditions are met:
00008 
00009    - Redistributions of source code must retain the above copyright notice,
00010        this list of conditions and the following disclaimer.
00011    - Redistributions in binary form must reproduce the above copyright
00012        notice, this list of conditions and the following disclaimer in the
00013        documentation and/or other materials provided with the distribution.
00014    - Neither the name of Cornell University nor the names of its
00015        contributors may be used to endorse or promote products derived from
00016        this software without specific prior written permission.
00017 
00018 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
00019 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00020 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
00021 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
00022 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
00023 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
00024 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
00025 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
00026 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
00027 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
00028 THE POSSIBILITY OF SUCH DAMAGE.
00029 
00030 */
00031 
00032 // -*- C++ -*-
00033 
00034 #if !defined _CLUS_MULTINORMAL_H_
00035 #define _CLUS_MULTINORMAL_H_
00036 
00037 #include "general.h"
00038 #include "distribution.h"
00039 #include "vec.h"
00040 #include "cholesky.h"
00041 #include <math.h>
00042 #include "linearregressor.h"
00043 #include "misc.h"
00044 #include "trislv.h"
00045 
00046 using namespace TNT;
00047 
00048 namespace CLUS
00049 {
00050 
00051 /** Implements a multidimentional normal distribution. */
00052 class MultiDimNormal : public Distribution
00053 {
00054 protected:
00055     /// the expected value
00056     Vector<double> mu;
00057     
00058     /// the equation of the hyperplane
00059     Vector<double> C;
00060     
00061     /// the covariance matrix's cholesky, upper diagonal
00062     /// lower diagonal
00063     Fortran_Matrix<double> sigmaChol;
00064 
00065     /// the determinant of the cholesky factor of Sigma
00066     double alpha_over_detChol_inDimp1;    
00067     double alpha_over_detChol_inDim;
00068 
00069     /// number of datapoints
00070     int N;
00071 
00072     /// sum probabilities
00073     double S;
00074 
00075     /// sum probab*x
00076     Vector<double> S_x;
00077 
00078     /// sum prob*xx^T, upper symm diag
00079     Fortran_Matrix<double> S_xxT;
00080 
00081     /// the radius of the distribution. Used to generate distributions close by
00082     double radius; 
00083 
00084     /** Computes alpha_over_detChol_inDimp1 and alpha_over_detChol_inDim given sigmaChol */
00085     void ComputeAlphas(void)
00086     {
00087         double detChol=1.0;
00088         for (int i=1; i<=inDim; i++)
00089             detChol*=sigmaChol(i,i);
00090         alpha_over_detChol_inDim=weight*pow(2*M_PI,-inDim/2.0)/detChol;
00091         detChol*=sigmaChol(inDim+1,inDim+1);
00092         alpha_over_detChol_inDimp1=weight*pow(2*M_PI,-(inDim+1)/2.0)/detChol;
00093         // print debug info
00094         if (alpha_over_detChol_inDimp1<0.0)
00095         {
00096             cerr << "Problem with alphas" << endl;
00097             cerr << "mu: " << mu << " sigmaChol: " << sigmaChol << " ao=" << alpha_over_detChol_inDim;
00098             cerr << " aop1=" << alpha_over_detChol_inDimp1 << endl;
00099             cerr << "S_xxT=" << S_xxT;
00100         }
00101     }
00102 
00103     /** Computes the equation of the hyperplane using Sigma=S_xxT.
00104     Must be called when S_xxT contains Sigma not the partial sums only.
00105     
00106     @return true if anything is fine, false if something is wrong and the computation is bogus.*/
00107     bool ComputeCCoef(Fortran_Matrix<double>& Sigma, bool isCholesky)
00108     {
00109         if (weight==0.0)
00110             return false;
00111 
00112 #if 0
00113 
00114         Fortran_Matrix<double> S(inDim+1,inDim+1);
00115         if (isCholesky)
00116         {
00117             for (int i=1; i<=inDim+1; i++)
00118             {
00119                 for (int j=i; j<=inDim+1; j++)
00120                 {
00121                     // compute G(i,:)*G'(j,:)
00122                     double tmp=0.0;
00123                     for (int k=1; k<=i; k++)
00124                         tmp+=Sigma(i,k)*Sigma(j,k);
00125                     S(i,j)=tmp;
00126                 }
00127             }
00128         }
00129         else
00130         {
00131             S=Sigma;
00132         }
00133 
00134         Vector<double> u(inDim+1);
00135 
00136         // compute u
00137         double lambda,lambda2;
00138         double slimness;
00139         if (MinEigenvalueUpperSymmetric(S,lambda,lambda2,slimness))
00140         {
00141             if (!EigenVector(S,lambda,u,1))
00142             {
00143                 cerr << "OOPS could not find the eigenvector for " << lambda << endl;
00144                 return false;
00145             }
00146         }
00147         else
00148         {
00149             cerr << "OOPS-eigenvalue problem not solved" << endl;
00150             return false;
00151         }
00152         if (lambda < -SMALL_POZ_VALUE)
00153         {
00154             cerr << "Matrix not positive definite since the smallest eigv=" << lambda << endl;
00155             return false;
00156         }
00157 
00158         // slimness is the average eigvalue nonminimal. The square of this is the size of the cluster
00159         radius = sqrt(slimness);
00160 
00161         // compute C from u
00162         double uy=u[inDim];
00163 
00164         if ( fabs(uy) < 1.0/(inDim*1000) )
00165         {
00166             cerr << "Smallest eigenvector oriented badly. Something is wrong, killing distrib" << endl;
00167             return false;
00168         }
00169 
00170         C[0]=mu[inDim];
00171         for(int j=0;j<inDim;j++)
00172         {
00173             C[j+1]=-u[j]/uy;
00174             C[0]+=mu[j]*u[j]/uy;
00175         }
00176         return true;
00177 
00178 #else
00179 
00180         ExitIf(!isCholesky,"Not implemented");
00181 
00182         // use least square
00183         // form b
00184         Vector<double> b(inDim+1);
00185         // compute in radius the trace of original S
00186         radius=0.0;
00187         for (int i=1; i<=inDim; i++)
00188         {
00189             b(i)=Sigma(inDim+1,i);
00190             radius+=pow2(Sigma(i,i));
00191         }
00192         b(inDim+1)=0.0;
00193         radius+=pow2(Sigma(inDim+1,inDim+1));
00194         radius=sqrt(radius/inDim); // suppose one of the eigenvalues is small.
00195 
00196         // solve for c=G\b
00197         Vector<double> c=Transposed_Upper_triangular_solve< Fortran_Matrix<double>,Vector<double> >(Sigma,b);
00198 
00199         // form C
00200         C[0]=mu[inDim];
00201         for(int i=1; i<=inDim; i++)
00202         {
00203             C[0]-=mu[i-1]*c[i-1];
00204             C[i]=c[i-1];
00205         }
00206         return true;
00207 
00208 #endif
00209 
00210     }
00211 
00212 public:
00213     MultiDimNormal(int InDim=0):Distribution(InDim),mu(InDim+1),C(InDim+1),
00214             sigmaChol(InDim+1,InDim+1),S_x(InDim+1),S_xxT(InDim+1,InDim+1)
00215     {
00216         alpha_over_detChol_inDimp1=1.0;
00217         alpha_over_detChol_inDim=1.0;
00218         N=0;
00219         S=0;
00220         S_x=0;
00221         S_xxT=0;
00222         weight=1.0;
00223     }
00224 
00225     MultiDimNormal(int InDim,int):Distribution(InDim),mu(InDim+1),C(InDim+1),
00226             sigmaChol(InDim+1,InDim+1),S_x(0),S_xxT(0,0)
00227     {
00228         alpha_over_detChol_inDimp1=1.0;
00229         alpha_over_detChol_inDim=1.0;
00230         N=0;
00231         S=0;
00232         S_x=0;
00233         S_xxT=0;
00234         weight=1.0;
00235     }
00236 
00237     MultiDimNormal(int InDim, double Alpha, Vector<double>& Mu, Fortran_Matrix<double>& SigmaChol):
00238             Distribution(InDim), mu(Mu), C(0), sigmaChol(SigmaChol), S_x(0),S_xxT(0,0)
00239     {
00240         N=0;
00241         S=0;
00242         S_x=0;
00243         S_xxT=0;
00244         weight=Alpha;
00245         ComputeAlphas();
00246     }
00247 
00248     Regressor* CreateRegressor(void)
00249     {
00250         if ( ComputeCCoef(sigmaChol,true) )
00251             return new LinearRegressor(C);
00252         else
00253             return new LinearRegressor();
00254     }
00255 
00256     /** Reverses the normalization transformation in the original space
00257     mu=pD.sigmaChol*mu+pD.mu;
00258     sigmaChol=pD.sigmaChol*sigmaChol;
00259     */
00260     void DenormalizeParameters(MultiDimNormal* pD)
00261     {
00262         // compute mu=pD.sigmaChol*mu+pD.mu
00263         // do it in reverse order to avoid keeping a copy of mu
00264         for(int i=inDim+1; i>=1; i--)
00265         {
00266             double tmp=0.0;
00267             for (int j=1; j<=i; j++)
00268                 tmp+=pD->sigmaChol(i,j)*mu[j-1];
00269             mu[i-1]=tmp+pD->mu[i-1];
00270         }
00271 
00272         // compute sigmaChol=pD.sigmaChol*sigmaChol
00273         // do it in reverse order on i to avoid keeping a copy of sigmaChol
00274         for(int i=inDim+1; i>=1; i--)
00275             for (int j=1; j<=i; j++)
00276             {
00277                 double tmp=0.0;
00278                 for (int k=j; k<=i; k++)
00279                     tmp+=pD->sigmaChol(i,k)*sigmaChol(k,j);
00280                 sigmaChol(i,j)=tmp;
00281             }
00282     }
00283 
00284     /** Transform data such that it looks like a 0 centered sphere
00285     of radius 1
00286     */
00287     void NormalizeData(const double* DataCache, double* X)
00288     {
00289         // X=(DataCache-mu)*sigmaChol^-1
00290         for (int i=0; i<=inDim; i++)
00291             X[i]=DataCache[i]-mu[i];
00292 
00293         // solve X=GY for Y into X
00294         for(int i=1; i<=inDim+1; i++)
00295         {
00296             double tmp=0.0;
00297             for (int j=1; j<i; j++)
00298                 tmp+=sigmaChol(i,j)*X[j-1];
00299 
00300             X[i-1]=(X[i-1]-tmp)/sigmaChol(i,i);
00301         }
00302     }
00303 
00304     double LearnProbability(const double* DataCache)
00305     {
00306         dataCache=DataCache;
00307         double X[MAX_IN_DIM];
00308         // double* Xm1=X-1; // pointer to the element before the start of X such that Xm1[i]=X[i+1]
00309 
00310         if (weight==0.0)
00311         {
00312             probabilityLearn=0.0;
00313             return 0.0;
00314         }
00315 
00316         for (int i=0; i<=inDim; i++)
00317             X[i]=dataCache[i]-mu[i];
00318         double norm2=0.0;
00319         // solve X=GY for Y into X and compute X^T*X
00320         for(int i=1; i<=inDim+1; i++)
00321         {
00322             double tmp=0.0;
00323             for (int j=1; j<i; j++)
00324                 tmp+=sigmaChol(i,j)*X[j-1];
00325 
00326             double sCii=sigmaChol(i,i);
00327             if (sCii == 0.0)
00328                 if (X[i-1]!=tmp)
00329                     return (probabilityLearn=0.0); // since norm2=inf => e^{-norm2/2)=0.0
00330                 else // 0.0/0.0 situation, point right on the hyperplane, probability=HUGE
00331                     return (probabilityLearn=MAXREAL);
00332 
00333             X[i-1]=(X[i-1]-tmp)/sCii;
00334             norm2+=pow2(X[i-1]);
00335         }
00336 
00337         probabilityLearn=alpha_over_detChol_inDimp1*exp(-norm2/2.0);
00338 
00339         // assert(probabilityLearn>=0.0 && finite(probabilityLearn) );
00340 
00341         return probabilityLearn;
00342     }
00343 
00344     double InferProbability(const double* DataCache)
00345     {
00346         dataCache=DataCache;
00347         double X[MAX_IN_DIM];
00348         // double* Xm1=X-1;
00349 
00350         if (weight==0.0)
00351             return 0.0;
00352 
00353         for (int i=0; i<inDim; i++)
00354             X[i]=dataCache[i]-mu[i];
00355         double norm2=0.0;
00356         // solve X=GY for Y into X and compute X^T*X, ignore last line in G
00357         for(int i=1; i<=inDim; i++)
00358         {
00359             double tmp=0.0;
00360             for (int j=1; j<i; j++)
00361                 tmp+=sigmaChol(i,j)*X[j-1];
00362             X[i-1]=(X[i-1]-tmp)/sigmaChol(i,i);
00363             norm2+=pow2(X[i-1]);
00364         }
00365 
00366         probabilityInfer=alpha_over_detChol_inDim*exp(-norm2/2.0);
00367         return probabilityInfer;
00368     }
00369 
00370     void UpdateStatistics(const double* DataCache, double prob)
00371     {
00372         N++;
00373         S+=prob;
00374         for (int i=0; i<=inDim; i++)
00375         {
00376             double aux=prob*DataCache[i];
00377             S_x[i]+=aux;
00378             // fill just the upperpart of S_xxT
00379             for (int j=i; j<=inDim; j++)
00380                 S_xxT(i+1,j+1)+=aux*DataCache[j];
00381         }
00382     }
00383 
00384     double NormalizeLearnProbability(double Coef, int nrClus=1)
00385     {
00386         double pLearn=Distribution::NormalizeLearnProbability(Coef,nrClus);
00387 
00388         assert(pLearn>=0.0 && pLearn <=1.0 && finite(pLearn) );
00389 
00390         UpdateStatistics(dataCache, pLearn);
00391         return 0.0;
00392     }
00393 
00394     double UpdateParameters(void)
00395     {
00396         double oldMu, distP=0.0;
00397 
00398         // if the cluster is dead do nothing
00399         if (weight==0.0)
00400             return 0.0;
00401 
00402         if (S<=(inDim+1))
00403         {
00404             cerr << "Cluster got too small. We have to throw it out since is not anymore reliable." << endl;
00405             weight=0.0;
00406             distP=1.0;
00407             goto cleanup;
00408         }
00409 
00410         // don't change the weight to have clusters of about the same size
00411         // weight=S/N; // S is the number of points on which this clusterdepends
00412 
00413 
00414         for (int i=1; i<=inDim+1; i++)
00415         {
00416             oldMu=mu(i);
00417             mu(i)=S_x(i)/S;
00418             for (int j=i; j <=inDim+1; j++)
00419             {
00420                 S_xxT(i,j)=(S_xxT(i,j)-mu(i)*S_x(j))/S;
00421 
00422                 // fix in S_xxT the almost 0.0 entries. This helpes a lot
00423                 if (fabs(S_xxT(i,j))<SMALL_POZ_VALUE)
00424                 {
00425                     S_xxT(i,j)=0.0;
00426                 }
00427             }
00428             distP+=pow2(oldMu-mu(i));
00429         }
00430 
00431 
00432         // compute Cholesky factorization of S_xxT into sigmaChol
00433         int cholRet;
00434         if ( (cholRet=Cholesky_upper_factorization(S_xxT,sigmaChol)) )
00435         {
00436             // If we got in here we have trouble
00437             // If the cluster does not depend on at least 2*(inDim+1) points we just throw it out
00438             if (S<2*(inDim+1))
00439             {
00440                 cerr << "Troublesome cluster too small to bother. We throw it out" << endl;
00441                 weight=0.0;
00442                 distP=1.0;
00443                 goto cleanup;
00444             }
00445             else
00446             {
00447                 cerr << "Matrix S_xxT is not positive definite: " << S_xxT ;
00448                 cerr << "sigmaChol: " << sigmaChol;
00449                 cerr << "weight=" << weight << endl;
00450                 // fix the last value of sigmaChol
00451                 if (cholRet==2)
00452                 {
00453                     cout << "Fixing sigmaChol" << endl;
00454                     sigmaChol(inDim+1,inDim+1)=0.0; // only the last value is bad, we can fix it
00455                     distP=-1.0; // this will make the returned value nan and terminate anything
00456                 }
00457                 else
00458                 {
00459                     // probably the cluster is a "squeezy"(has one less dimention in the input space).
00460                     cerr << "The cluster is a squeezy, setting weight to 0." << endl;
00461                     weight=0.0;
00462                     distP=1.0;
00463                     goto cleanup;
00464                 }
00465             }
00466         }
00467 
00468         ComputeAlphas();
00469 
00470         /* // need this to have all the time a regressor
00471         if (!ComputeCCoef(S_xxT))
00472         weight=0.0;
00473 
00474         */
00475 cleanup:
00476         // erase N, S, S_x, S_xxT
00477         N=0;
00478         S=0.0;
00479         S_x=0; // the operator= takes care of this
00480         S_xxT=0;
00481 
00482         return sqrt(distP)/(inDim+1); //how much the center has moved
00483     }
00484 
00485     void AdjustYwithYoverD(double& y)
00486     {
00487         double dy=C[0];
00488         for(int j=0; j<inDim; j++)
00489             dy+=C[j+1]*dataCache[j];
00490         dy*=probabilityInfer;
00491         y+=dy;
00492     }
00493 
00494     static string TypeName(void)
00495     {
00496         return string("MultiDimNormal");
00497     }
00498 
00499     void RandomDistribution(int NrClusters)
00500     {
00501         const double smallness=0.1;
00502         const double sizeDist=1/pow(NrClusters,1.0/(inDim+1));
00503         int d=inDim+1;
00504 
00505         // reset weithts to 1.
00506         weight = 1.0/NrClusters;
00507         // choose mu random
00508         for(int i=0; i<d ;i++)
00509         {
00510             mu[i]=RANDOM01FLOAT*2-1.0;
00511         }
00512 
00513         do
00514         {
00515             // choose n randomly and build V, set of vectors orthogonal on n
00516             Vector<double> n(d);
00517             for (int i=0; i<d; i++)
00518                 n[i]=RANDOM01FLOAT*2-1.0;
00519             // normalize n
00520             double norm_n=dot_prod(n,n);
00521             for (int i=0; i<d; i++)
00522                 n[i]/=norm_n;
00523 
00524             // set Sigma as I-0.1n*n'
00525             // in this way Sigma has n as the eigenvector coresp to eightvalue 0.1
00526             // and d-1 other eigenvectors orth. on n with eigvals 1.0
00527             for (int i=1; i<=d; i++)
00528             {
00529                 sigmaChol(i,i)=sizeDist*(1-smallness*pow2(n(i)));
00530                 for (int j=i+1; j<=d; j++)
00531                     sigmaChol(i,j)=-sizeDist*(1-smallness)*n(i)*n(j);
00532             }
00533 
00534         }
00535         while (Cholesky_upper_factorization(sigmaChol,sigmaChol)!=0);
00536 
00537         ComputeAlphas();
00538     }
00539 
00540     /** For hierarchies or trees we need to produce offsprings close to the parent*/
00541     void RandomDistribution(int NrClusters, MultiDimNormal& parent)
00542     {
00543         // reset weitht to 1.0. It might have been 0.
00544         weight = 1.0;
00545 
00546         // Create a distribution 1/NrClusters as small as the parent with a center
00547         // around the center of the parent
00548         radius=parent.radius/pow(NrClusters,1.0/inDim);
00549         mu[inDim]=parent.C[0];
00550         for(int i=0;i<inDim;i++)
00551         {
00552             mu[i]=parent.mu[i]+(2*parent.radius-radius)*RANDOM01FLOAT;
00553             mu[inDim]+=mu[i]*parent.C[i+1];
00554         }
00555 
00556         // comment this line if the parent.C is used
00557         // mu[inDim]=parent.mu[inDim];
00558 
00559         // put sigma=I => sigmaChol=I*radius
00560         sigmaChol=0.0;
00561         for( int i=1; i<=inDim+1; i++)
00562             sigmaChol(i,i)=radius;
00563 
00564         ComputeAlphas();
00565     }
00566 
00567     void SaveToStream(ostream& out)
00568     {
00569         out << " [ ";
00570         // write mu
00571         for (int i=0; i<inDim+1; i++)
00572             out << mu[i] << " ";
00573         out << " ] ( ";
00574         // write sigmaChol
00575         for (int i=1; i<=inDim+1; i++)
00576             for (int j=1; j<=i; j++)
00577                 out << sigmaChol(i,j) << " ";
00578         out << ") { " << weight << " } " << endl;
00579         /* for (int i=0; i<inDim+1; i++)
00580         out << C[i] << " ";
00581         out << endl;*/
00582     }
00583 
00584     void LoadFromStream(istream& in)
00585     {
00586         string s;
00587 
00588         in >> s;
00589         if (s!="[")
00590             throw ErrMsg("[ missing in MultiDimNormal");
00591         for (int i=0; i<inDim+1; i++)
00592             in >> mu[i];
00593         in >> s;
00594         if (s!="]")
00595             throw ErrMsg("] missing in MultiDimNormal");
00596         in >> s;
00597         if (s!="(")
00598             throw ErrMsg("( missing in MultiDimNormal");
00599         for (int i=1; i<=inDim+1; i++)
00600             for (int j=1; j<=i; j++)
00601                 in >>  sigmaChol(i,j);
00602         in >> s;
00603         if (s!=")")
00604             throw ErrMsg(") missing MultiDimNormal");
00605         in >> s;
00606         if (s!="{")
00607             throw ErrMsg("{ missing in MultiDimNormal");
00608         in >> weight;
00609         in >> s;
00610         if (s!="}")
00611             throw ErrMsg("} missing in MultiDimNormal");
00612 
00613         ComputeCCoef(sigmaChol,true);
00614         ComputeAlphas();
00615 
00616     }
00617 };
00618 
00619 }
00620 #endif // _CLUS_MULTINORMAL_H_