SECRET: splitpointcomputation.h Source File

00001 /*
00002 
00003 Copyright (c) 2003, Cornell University
00004 All rights reserved.
00005 
00006 Redistribution and use in source and binary forms, with or without
00007 modification, are permitted provided that the following conditions are met:
00008 
00009    - Redistributions of source code must retain the above copyright notice,
00010        this list of conditions and the following disclaimer.
00011    - Redistributions in binary form must reproduce the above copyright
00012        notice, this list of conditions and the following disclaimer in the
00013        documentation and/or other materials provided with the distribution.
00014    - Neither the name of Cornell University nor the names of its
00015        contributors may be used to endorse or promote products derived from
00016        this software without specific prior written permission.
00017 
00018 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
00019 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00020 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
00021 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
00022 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
00023 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
00024 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
00025 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
00026 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
00027 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
00028 THE POSSIBILITY OF SUCH DAMAGE.
00029 
00030 */
00031 
00032 // -*- C++ -*-
00033 
00034 /* Implement functions for split point computation together with
00035    necessary auxiliary functions
00036 */
00037 
00038 #if !defined _CLUS_SPLITPOINTCOMPUTATION_H_
00039 #define _CLUS_SPLITPOINTCOMPUTATION_H_
00040 
00041 #include "general.h"
00042 #include <math.h>
00043 #include <iostream>
00044 #include "trislv.h"
00045 #include "fmat.h"
00046 #include "cholesky.h"
00047 
00048 #include <gsl/gsl_sf_gamma.h> // for incomplete regularized beta function
00049 #include <gsl/gsl_errno.h>
00050 
00051 #include <stdlib.h>
00052 
00053 using namespace TNT;
00054 using namespace std;
00055 
00056 namespace CLUS
00057 {
00058 
00059 /// Determines in log time if a point is in a set.
00060 /// The set is a vector of sorted values.
00061 template< class T>
00062 bool IsPointInSet(T value, Vector<T> set)
00063 {
00064     int l=0, r=set.dim()-1;
00065 
00066     assert(r>=0);
00067 
00068     while ( l<=r )
00069     {
00070         int m=(l+r)/2;
00071         T vm=set
00072              [m];
00073 
00074         if ( vm ==value )
00075             return true;
00076 
00077         if ( vm < value )
00078             l=m+1;
00079         else
00080             r=m-1;
00081     }
00082     return false;
00083 }
00084 
00085 
00086 /** Compute P[X>=val] for X~Binomial(N,p).
00087     This is exactly the normalized incomplete beta function (see Eric's encyclopedia) .
00088  
00089     We use gsl to get this function. The result is IB_p(val,N-val-1.0)
00090 */
00091 double PValueBinomialDistribution(double N, double p, double val)
00092 {
00093 
00094     double mean=N*p;
00095     double std=sqrt(N*p*(1-p)); // standard deviantion
00096 
00097     // cout << "mean=" << mean << " std=" << std << endl;
00098 
00099     // if val is not withing 10 standard deviations do not bother with the approximation
00100     if (val<mean-10*std)
00101         return 1.0;
00102 
00103     if (val>mean+10*std)
00104         return 0.0;
00105 
00106     gsl_sf_result result;
00107 
00108     int status=gsl_sf_beta_inc_e(val+1,N-val,p,&result);
00109 
00110     switch(status)
00111     {
00112     case 0:
00113         // everything is fine. Do nothing
00114         break;
00115 
00116     default:
00117         // deal with the error
00118         // use the normal approximation to the binomial
00119         {
00120             cout << "gsl_sf_beta_inc_e failed for N=" << N << " val=" << val << " p=" << p << " .Approximating with normal" << endl;
00121             return .5*(1.0+erf( (mean-val)/(std*sqrt(2.0)) ));
00122         }
00123         break;
00124     }
00125 
00126     return result.val;
00127 
00128 }
00129 
00130 /** Computes int_{x>=eta} N(mu,var) dx */
00131 double PValueNormalDistribution(double mu, double sigma, double eta)
00132 {
00133     if (sigma==0.0)
00134         if (mu>eta)
00135             return 1.0;
00136         else
00137             if (mu==eta)
00138                 return .5;
00139             else
00140                 return 0.0;
00141     return .5*(1.0+erf( (mu-eta)/sqrt(2*pow2(sigma)) ));
00142 }
00143 
00144 /** Computes int_{n'*(x-xc)>=0} N(mu,Sigma) dx */
00145 double PValueNormalDistribution(const Vector<double> mu,
00146                                 const Fortran_Matrix<double> cholSigma,
00147                                 const Vector<double> n,
00148                                 const Vector<double> xc)
00149 {
00150     /* Crafted after pvalue.m and thing.m (Matlab prototyping)
00151        The final formula is: 
00152                                1                   /       / sigma v \ \
00153            P=------------------------------------- | 1+ Erf| ------- | |
00154       2 sigma det(cholSigma) Sqrt[ det(S) ] \       \ Sqrt[2] / /
00155       
00156        where:
00157            v=n'*(mu-xc)/norm(n),
00158            sigma=Sqrt[ s-w'S^-1w ],
00159                                           / s w'\           
00160     (M'*cholSigma*cholSigma'*M)^-1=|     |
00161                                           \ w S /
00162        adn M is picked such that  M'n=e1
00163 
00164     */
00165 
00166     int d=n.dim(); // dimentionality of the space
00167 
00168     // compute v
00169     double v=dot_prod(n,mu-xc)/sqrt(dot_prod(n,n));
00170 
00171     // build initial M
00172     Fortran_Matrix<double> M(d,d); // already initialized to 0
00173     /* pick robustly the other d-1 vectors, eliminate ei, i=max(abs(n))
00174        since is the most "paralel" with n */
00175     // find i first
00176     int i=0;
00177     double max=fabs(n[i]);
00178     for(int j=1; j<d; j++)
00179     {
00180         double aux=fabs(n[j]);
00181         if (aux>max)
00182         {
00183             max=aux;
00184             i=j;
00185         }
00186     }
00187     i++; // adsust for the Fortran indexing
00188     // put n in the first line and 1 on the diagonal
00189     for (int j=1; j<=d; j++)
00190     {
00191         M(j,j)=1.0;
00192         M(j,1)=n[j-1];
00193     }
00194     // put e1 in row i
00195     if (i!=1)
00196     {
00197         M(i,i)=0.0;
00198         M(1,i)=1.0;
00199     }
00200 
00201     // Gram-Schmidt ortogonalize M
00202     for (int j=1; j<=d; j++)
00203     {
00204         for (int k=1; k<j; k++)
00205         {
00206             // mult=M(:,j)*M(:,k)'
00207             double mult=0.0;
00208             for (int l=1; l<=d; l++)
00209                 mult+=M(l,j)*M(l,k);
00210             // M(:,j)=M(:,j)-mult*M(:,k);
00211             for (int l=1; l<=d; l++)
00212                 M(l,j)-=mult*M(l,k);
00213         }
00214         // compute norm(M(j,:))
00215         double norm=0.0;
00216         for (int k=1; k<=d; k++)
00217             norm+=pow2(M(k,j));
00218         norm=sqrt(norm);
00219         // normalizde M(j,:)
00220         for (int k=1; k<=d; k++)
00221             M(k,j)/=norm;
00222     }
00223 
00224     // compute cholSigma^-1 M into M
00225     for (int i=1; i<=d; i++)
00226     {
00227         // for each column of M solve cholSigma^-1 z in z
00228         for (int j=1; j<=d; j++)
00229         {
00230             double tmp=0.0;
00231             for (int k=1; k<j; k++)
00232                 tmp+=cholSigma(j,k)*M(k,i);
00233             M(j,i)=(M(j,i)-tmp)/cholSigma(j,j);
00234         }
00235     }
00236 
00237     // compute s
00238     double s=0.0;
00239     for (int i=1; i<=d; i++)
00240         s+=pow2(M(i,1));
00241 
00242     // compute w
00243     Vector<double> w(d-1);
00244     for (int i=0; i<d-1; i++)
00245     {
00246         // w[i]=<z1,zi>
00247         double tmp=0.0;
00248         for (int j=1; j<=d; j++)
00249             tmp+=M(j,1)*M(j,i+2);
00250         w[i]=tmp;
00251     }
00252 
00253     // compute S
00254     Fortran_Matrix<double> S(d-1,d-1);
00255     for (int i=1; i<=d-1; i++)
00256         for (int j=i; j<=d-1; j++)
00257         {
00258             // multiply z(i+1) with z(j+1)
00259             double tmp=0.0;
00260             for (int k=1; k<=d; k++)
00261                 tmp+=M(k,i+1)*M(k,j+1);
00262             S(i,j)=tmp;
00263         }
00264 
00265     Cholesky_upper_factorization(S,S);
00266     // compute cholS^-1 * w
00267     Vector< double > aux = Lower_triangular_solve(S,w);
00268 
00269     double sigma=s-dot_prod(aux,aux);
00270 
00271     sigma=sqrt(sigma);
00272     double det_cholS=1.0;
00273     for (int i=1; i<=d-1; i++)
00274         det_cholS*=S(i,i);
00275     double det_cholSigma=1.0;
00276     for (int i=1; i<=d; i++)
00277         det_cholSigma*=cholSigma(i,i);
00278 
00279     // we can compute now the PValue
00280     return 1/( 2*sigma*det_cholSigma*det_cholS )*( 1+erf(sigma*v/sqrt(2.0)) );
00281 }
00282 
00283 /** auxiliary type */
00284 struct T_array
00285 {
00286     int index;
00287     double crit;
00288 };
00289 
00290 /** auxiliary function to sort elements */
00291 int compare_array_elements(const void* x, const void* y)
00292 {
00293     const T_array ax = *( (T_array*)x );
00294     const T_array ay = *( (T_array*)y );
00295     if (ax.crit<ay.crit)
00296         return -1;
00297     else
00298         if (ax.crit == ay.crit)
00299             return 0;
00300         else
00301             return 1;
00302 }
00303 
00304 /** Computes /Delta g(T).
00305     C1 is cluster 1; S1 is split 1
00306     
00307     @param p11     P[x /in C1 ^ x /in S1]
00308     @param p_1     P[x /in C1]
00309     @param p1_     P[x /in S1]
00310 */
00311 double BinaryGiniGain(double p11, double p_1, double p1_)
00312 {
00313     return p1_>0.0 ? 2*pow2(p11-p_1*p1_)/( p1_*(1-p1_) ) : 0.0;
00314 }
00315 
00316 /** Computes the maximum gain in gini by splitting on a discrete variable and the actual split
00317     Split: return the best split here
00318     Return: the new gini
00319     
00320     If d_s_p1[i]=d_N[i]=0 we don't know anything about value i. To avoid biases we distribute
00321     this values among the two splits.
00322  
00323     Theorem 9.4 from Breiman et al. justifies the linear algorithm.
00324  */
00325 double DiscreteGiniGain(Vector<double>& d_s_p1, Vector<double>& d_N, double N, double alpha_1,
00326                         Vector<int>& Split)
00327 {
00328 
00329     int n=d_N.dim();
00330 
00331 #if 0
00332 
00333     Vector<int> B_order(n);
00334     for(int i=0; i<n; i++)
00335         B_order[i]=i;
00336     // order values in vector B_order. We use bubble sort since is the simplest one to implement and acceptable
00337     bool flag;
00338     do
00339     {
00340         flag=false;
00341         for(int i=0; i<n-1; i++)
00342         {
00343             int i1=B_order[i];
00344             double v1=d_N[i1]>0 ? d_s_p1[i1]/d_N[i1] : -1.0;
00345             int i2=B_order[i+1];
00346             double v2=d_N[i2]>0 ? d_s_p1[i2]/d_N[i2] : -1.0;
00347             if (v1>v2)
00348             {
00349                 B_order[i]=i2;
00350                 B_order[i+1]=i1;
00351                 flag=true;
00352             }
00353         }
00354     }
00355     while (flag);
00356 #endif
00357 
00358     T_array* B_order=new T_array[n];
00359 
00360     for (int i=0; i<n; i++)
00361     {
00362         B_order[i].index=i;
00363         B_order[i].crit=d_N[i]>0 ? d_s_p1[i]/d_N[i] : -1.0;
00364         ;
00365     }
00366 
00367     qsort((void*)B_order, n, sizeof(T_array), compare_array_elements );
00368 
00369     //    cout << "B_order ";
00370     //     for (int i=0; i<n; i++)
00371     //       cout << " [" << B_order[i].index << "," << B_order[i].crit << "] ";
00372     //     cout << endl;
00373 
00374 
00375     // find the splitting point
00376     int pos=-1;  // the splitting point is between pos and pos+1
00377     double p11=0.0;
00378     double p1_=0.0;
00379     double maxgini=0.0;
00380     bool partonefirst=true; // if true partition one is at the left
00381     for (int i=0; i<n-1; i++)
00382     {
00383         p11+=d_s_p1[B_order[i].index]/N;
00384         p1_ +=1.0*d_N[B_order[i].index]/N;
00385         double gini=BinaryGiniGain(p11,alpha_1,p1_);
00386 
00387         //cout << "XXXXXX " << p11 << " " << p1_ << " " << gini << endl;
00388 
00389         if (p1_>=0.0 && p1_<=1.0 && gini>=maxgini)
00390         {
00391             maxgini=gini;
00392             pos=i;
00393             if ( p11<(p1_-p11) )
00394                 partonefirst=false;
00395             //cout << "maxgini=" << maxgini << " i=" << i << endl;
00396 
00397         }
00398     }
00399 
00400     double boundary;
00401     if (pos==-1 || B_order[pos].crit==-1.0)
00402         return 0.0;
00403     else
00404         boundary=d_s_p1[B_order[pos].index]/d_N[B_order[pos].index];
00405 
00406     // cout << "boundary=" << boundary << " pos=" << pos << endl;
00407 
00408     int nr=0;
00409     int buff[MAX_CATEGORIES];
00410     for (int i=0; i<n; i++)
00411         if ( (d_N[i]==0.0 && rand()*1.0/RAND_MAX > .5) ||
00412                 ((!partonefirst) && d_s_p1[i]/d_N[i]>boundary)
00413                 && NonEqual(d_s_p1[i]/d_N[i],boundary) ||
00414                 (partonefirst &&
00415                  (d_s_p1[i]/d_N[i]<boundary ||
00416                   !NonEqual(d_s_p1[i]/d_N[i],boundary) )) )
00417         {
00418             // cout << "Adding " << i << " " << nr << endl;
00419             buff[nr]=i;
00420             nr++;
00421         }
00422 
00423     //cout << "BUFFER ";
00424     //for (int i=0; i<nr; i++)
00425     //cout << buff[i] << " ";
00426     //cout << endl;
00427 
00428     // put the result in Split
00429     Split.newsize(nr);
00430     for (int i=0; i<nr; i++)
00431         Split[i]=buff[i];
00432 
00433     //cout << "Split ";
00434     // for (int i=0; i<Split.size(); i++)
00435     //cout << Split[i] << " ";
00436     //cout << " ----- " << nr << endl;
00437 
00438     return (nr>0 && nr<=n-1) ? maxgini : 0.0; // to avoid pseudo nonzero ginis
00439 }
00440 
00441 /** Sister function of DiscreteGiniGain. Instead of finding a split set it finds
00442     probabilities that a point belongs to the left set 
00443 */
00444 double ProbabilisticDiscreteGiniGain(const Vector<double>& d_s_p1, const Vector<double>& d_N,
00445                                      double N, double alpha_1,
00446                                      Vector<double>& probSet)
00447 {
00448 
00449     int n=d_N.dim();
00450 
00451     T_array* B_order=new T_array[n];
00452 
00453     for (int i=0; i<n; i++)
00454     {
00455         B_order[i].index=i;
00456         B_order[i].crit=d_N[i]>0.0 ? d_s_p1[i]/d_N[i] : -1.0;
00457         ;
00458     }
00459 
00460     qsort((void*)B_order, n, sizeof(T_array), compare_array_elements );
00461 
00462     // find the splitting point
00463     int pos=-1;  // the splitting point is between pos and pos+1
00464     double p11=0.0;
00465     double p1_=0.0;
00466     double maxgini=0.0;
00467     bool partonefirst=true; // if true partition one is at the left
00468     for (int i=0; i<n-1; i++)
00469     {
00470         p11+=d_s_p1[B_order[i].index]/N;
00471         p1_ +=1.0*d_N[B_order[i].index]/N;
00472         double gini=BinaryGiniGain(p11,alpha_1,p1_);
00473 
00474         if (p1_>=0.0 && p1_<=1.0 && gini>=maxgini)
00475         {
00476             maxgini=gini;
00477             pos=i;
00478             if ( p11<(p1_-p11) )
00479                 partonefirst=false;
00480         }
00481     }
00482 
00483     // take the boundary to be the average of the ratios where maximum occurs
00484     double boundary;
00485     probSet.newsize(n);
00486     if (pos==-1 || B_order[pos].crit==-1.0)
00487     {
00488         for (int i=0; i<n; i++)
00489             probSet[i]=0.5;
00490         return 0.0;
00491     }
00492     else
00493         boundary=(d_s_p1[B_order[pos].index]/d_N[B_order[pos].index]+
00494                   d_s_p1[B_order[pos+1].index]/d_N[B_order[pos+1].index])/2.0;
00495 
00496     delete [] B_order;
00497 
00498     // for every element in the domain compute the probability that the
00499     // observed ratio is at the left of the boundary point assuming Binomial distribution
00500     for (int i=0; i<n; i++)
00501         if (d_N[i]==0.0)
00502             probSet[i]=.5;
00503         else
00504         {
00505             double p=d_s_p1[i]/d_N[i];
00506             probSet[i]=1.0-PValueBinomialDistribution(d_N[i], p, boundary*d_N[i]);
00507             //cout << "i:" << i << " s_p1=" << d_s_p1[i] << " d_N=" << d_N[i]
00508             //     << " probSet[]=" << probSet[i] << endl;
00509 
00510         }
00511 
00512     return maxgini;
00513 }
00514 
00515 
00516 
00517 /**
00518    Form equation 
00519    eta^2(1/var1 - 1/var2)-2eta(eta1/var1-eta2/var2)+eta1^2/var1-eta2^2/var2
00520    = 2ln(alpha1/alpha2)-ln(var1/var2)
00521    and solve it.
00522 
00523    @param alpha_1
00524    @param eta1
00525    @param var1
00526    @param alpha_2
00527    @param eta2
00528    @param var2
00529    @param whichSol           set to 0 if first order equation solved, 1 if first sol of second 
00530                              order equation and 2 for second solution. Is set to 3 if the default
00531    @return                   weighted mean of averages
00532 */
00533 double UnidimensionalQDA(double alpha_1, double eta1, double var1,
00534                          double alpha_2, double eta2, double var2, int& whichSol)
00535 {
00536     whichSol=4; // special solution: eta1 or eta2
00537 
00538     if (isnan(alpha_1+eta1+var1))
00539     {
00540         assert( !isnan(alpha_2+eta2+var2) );
00541         return eta2;
00542     }
00543 
00544     if (isnan(alpha_2+eta2+var2))
00545     {
00546         assert( !isnan(alpha_1+eta1+var1) );
00547         return eta1;
00548     }
00549 
00550     if (!NonEqual(eta1,eta2))
00551         return eta1;
00552 
00553     double a=1.0/var1-1.0/var2;
00554     double b=-2.0*(eta1/var1-eta2/var2);
00555     double c=pow2(eta1)/var1-pow2(eta2)/var2-2.0*log(alpha_1/alpha_2)/log(exp(1.0))+
00556              log(var1/var2)/log(exp(1.0));
00557 
00558 
00559     if (var1<=SMALL_POZ_VALUE || var2<=SMALL_POZ_VALUE)
00560     {
00561         whichSol=5;
00562         return alpha_1*eta1+alpha_2*eta2;
00563     }
00564 
00565     // Alin:fixthis
00566     goto default_solution;
00567 
00568     if ( fabs(a) < 1e-8*(1.0/var1+1.0/var2) )
00569     {
00570         // first order equation
00571 
00572         if (NonZero(b))
00573         {
00574             double eta_l=-c/b;
00575             if ( (eta1-eta_l)*(eta2-eta_l)<0.0 )
00576             {
00577                 whichSol=0;
00578                 return eta_l;
00579             }
00580         } // otherwise return default solution; fallover
00581     }
00582     else
00583     {
00584         // second order equation
00585         double disc=pow2(b)-4.0*a*c;
00586         if (disc<0.0) // no solution, return default
00587             goto default_solution;
00588         else
00589         {
00590             // the two solutions
00591             double eta_1=(-b+sqrt(disc))/(2.0*a);
00592             double eta_2=(-b-sqrt(disc))/(2.0*a);
00593             // find the one in between
00594             if ( (eta1-eta_1)*(eta2-eta_1) < 0.0)
00595             {
00596                 whichSol=1;
00597                 return eta_1;
00598             }
00599             else
00600                 if ( (eta1-eta_2)*(eta2-eta_2) < 0.0)
00601                 {
00602                     whichSol=2;
00603                     return eta_2;
00604                 } // otherwise return default solution; fallover
00605         }
00606     }
00607 
00608 default_solution:
00609     whichSol=3; // default solution
00610     return (alpha_1*sqrt(var1)*eta1+alpha_2*eta2*sqrt(var2))/
00611            (alpha_1*sqrt(var1)+alpha_2*sqrt(var2));
00612     //return alpha_1*eta1+alpha_2*eta2;
00613 }
00614 
00615 /** Computes the variance of the split point.
00616    The prototype is in the Mathematica file StatisticsSplitPoint2.nb
00617    which has the computations of the variance using the delta method.
00618 */
00619 double UnidimensionalQDAVariance(double n1, double m1, double v1,
00620                                  double n2, double m2, double v2, int whichSol)
00621 {
00622 
00623     whichSol=6;
00624 
00625 #ifdef DEBUG_PRINT
00626 
00627     cout << "UnidimentionalQDAVariance " << n1 << " " << n2 << " " << v1 << " " << v2;
00628     cout << " " << m1 << " " << m2 << "\t";
00629 #endif
00630 
00631     double variance;
00632 
00633     if (v1<SMALL_POZ_VALUE  || v2<SMALL_POZ_VALUE )
00634     {
00635         if (n1<3 || n2<3)
00636             variance=pow2(m1-m2);
00637         else
00638             variance=SMALL_POZ_VALUE;
00639         return variance;
00640     }
00641 
00642     if (isnan(m1) || isnan(m2) || isnan(v1) || isnan(v2))
00643     {
00644         variance=LARGE_POZ_VALUE;
00645         return variance;
00646     }
00647 
00648 
00649     if ( IsEqual(v1,v2) && IsEqual(m1,m2) )
00650         variance = SMALL_POZ_VALUE;
00651     else
00652     {
00653         double CLog=log(n1/n2*sqrt(v2/v1));
00654         double SR=sqrt(v1*v2*( pow2(m1-m2)+2*(-v1+v2)*CLog ));
00655         double Q1=v1*(pow2(m1-m2)-v1+v2-2*(v1-2*v2)*CLog);
00656         double Q2=v2*(pow2(m1-m2)+v1-v2-2*(2*v1-v2)*CLog);
00657         double Q3=2*(pow(m2,4.0)-2*pow2(m2)*n2*v2+n2*pow2(v2));
00658         double Q4=2*(pow(m1,4.0)-2*pow2(m1)*n1*v1+n1*pow2(v1));
00659         double Q5=-m2*v1+m1*v2;
00660 
00661         // we compute all solutions even if not needed. any decent
00662         // compiler should get rid of unnecessary code
00663 
00664         double Sol1=
00665             ( v1*pow2(v2) ) / ( n1*pow2(v1-v2) ) * pow2(-1+(m1-m2)*v1/SR)+
00666             ( v2*pow2(v1) ) / ( n2*pow2(v1-v2) ) * pow2( 1-(m1-m2)*v2/SR)-
00667             Q4/( n1*(n1-1)*pow(v1-v2,4.0) ) * pow2(  Q5-SR+(v1-v2)*( m2+Q2/SR) )-
00668             Q3/( n2*(n2-1)*pow(v1-v2,4.0) ) * pow2( -Q5+SR+(v1-v2)*(-m1+Q1/SR) );
00669 
00670         double Sol2=
00671             ( v1*pow2(v2) ) / ( n1*pow2(v1-v2) ) * pow2( 1+(m1-m2)*v1/SR)+
00672             ( v2*pow2(v1) ) / ( n2*pow2(v1-v2) ) * pow2(-1-(m1-m2)*v2/SR)-
00673             Q4/( n1*(n1-1)*pow(v1-v2,4.0) ) * pow2( Q5+SR-(v1-v2)*(-m2+Q2/SR) )-
00674             Q3/( n2*(n2-1)*pow(v1-v2,4.0) ) * pow2( Q5+SR+(v1-v2)*( m1+Q1/SR) );
00675 
00676         // solution for the first order equation. Here v1=v2
00677         double QL=v1*log(n1/n2)/pow2(m1-m2);
00678         double Sol0=2*pow2(m1-m2)*pow2(QL)*(1.0/n1+1.0/n2)+v1/(4*n1)*pow2(1-2*QL)+
00679                     +v1/(4*n2)*pow2(1+2*QL);
00680 
00681         double SolDefault=(n1*pow2(v1)+n2*pow2(v2))/pow2(n1*sqrt(v1)+n2*sqrt(v2));
00682         //double SolDefault=(n1*v1+n2*v2)/pow2(n1+n2);
00683 
00684         switch(whichSol)
00685         {
00686         case 0:
00687             variance=Sol0;
00688             break;
00689 
00690         case 1:
00691             variance=Sol1;
00692             break;
00693 
00694         case 2:
00695             variance=Sol2;
00696             break;
00697 
00698         case 3:
00699             variance=SolDefault;
00700             break;
00701 
00702         case 5:
00703             variance=(n1*v1+n2*v2)/pow2(n1+n2);
00704             break;
00705 
00706         case 6: // paper version
00707             {
00708                 double s1=sqrt(v1);
00709                 double s2=sqrt(v2);
00710                 variance=s1*s2/(s1+s2)*(s1/n1+s2/n2);
00711             }
00712             break;
00713 
00714         default: // very degenerate solution
00715             variance=LARGE_POZ_VALUE;
00716         }
00717 #ifdef DEBUG_PRINT
00718         cout << "Sol0=" << Sol0 << "\tSol1=" << Sol1 << "\tSol2=" << Sol2;
00719         cout << "\tSolDefault=" << SolDefault << "\twhichSol=" << whichSol;
00720 #endif
00721 
00722         // assert (isfinite(variance)); //NaN?
00723     }
00724 
00725 #ifdef DEBUG_PRINT
00726     cout << "\tvariance=" << variance << endl;
00727 #endif
00728 
00729     return (finite(variance)==true) ? variance : LARGE_POZ_VALUE;
00730 }
00731 
00732 
00733 /**
00734    The hyperplane is orthogonal on one of the axis not oblique.
00735    the best axis to split on is determined by comparing the value of 
00736    the t-test for the directions, i.e.
00737    
00738        (eta1-eta2)^2
00739    t=-----------------
00740      sigma1^2+sigma2^2
00741  
00742      the split point is determined by solvind QDA on the unidimentional 
00743      space like for the general case
00744 */
00745 double ComputeSeparatingHyperplane_Anova(double mass /* the total mass of the two distributions*/,
00746         double alpha_1, Vector<double>& mu1, Fortran_Matrix<double>& S1,
00747         double alpha_2, Vector<double>& mu2, Fortran_Matrix<double>& S2,
00748         Vector<double>& SeparatingHyperplane)
00749 {
00750     double maxt=0.0;
00751     int maxi=-1;
00752     int N=mu1.dim();
00753 
00754     for (int i=0; i<N; i++)
00755     {
00756         double t=pow2(mu1[i]-mu2[i])/(S1(i+1,i+1)+S2(i+1,i+1));
00757 
00758         //cout << "i=" << i << "\tt=" << t << "\tmaxt=" << maxt << endl;
00759         //cout << "mu1=" << mu1[i] << " mu2=" << mu2[i]
00760         // << " s1=" << S1(i+1,i+1) << " s2=" << S2(i+1,i+1) << endl;
00761 
00762         if (t>maxt)
00763         {
00764             maxi=i;
00765             maxt=t;
00766         }
00767     }
00768 
00769     if (maxi==-1)
00770         return 0.0; // none of the directions is any good
00771 
00772     // solve QDA on the projection maxi
00773     int whichSol; //
00774     double eta=UnidimensionalQDA(alpha_1, mu1[maxi], S1(maxi+1, maxi+1),
00775                                  alpha_1, mu2[maxi], S2(maxi+1, maxi+1), whichSol);
00776     double variance=UnidimensionalQDAVariance(mass*alpha_1, mu1[maxi], S1(maxi+1, maxi+1),
00777                     mass*alpha_1, mu2[maxi], S2(maxi+1, maxi+1), whichSol);
00778 
00779     assert(variance>0.0);
00780 
00781     // form the separating hyperplane
00782     // by divideing all components by sqrt(variance) we get the
00783     // quantities computed with the Separating Hyperplane to be
00784     // centered on 0 and have the standard distribution of the
00785     // split point 1.0
00786     SeparatingHyperplane.newsize(N+1);
00787     SeparatingHyperplane=0.0;
00788     double sgn= (mu1[maxi]>eta) ? 1.0 : -1.0;
00789     SeparatingHyperplane[0]=-sgn*eta/sqrt(variance);
00790     SeparatingHyperplane[maxi+1]=sgn/sqrt(variance);
00791 
00792     double p11=alpha_1*PValueNormalDistribution(mu1[maxi], sqrt(S1(maxi+1, maxi+1)), eta);
00793     double p1_=p11+alpha_2*PValueNormalDistribution(mu2[maxi], sqrt(S2(maxi+1, maxi+1)), eta);
00794     double gini=BinaryGiniGain(p11,alpha_1,p1_);
00795     return gini;
00796 }
00797 
00798 /**
00799     Compute the quadratic that separates two distributions
00800     and take the separating hyperplane to be the tangent to it
00801     in the intersection point with the line between the centers.
00802 
00803     @param mass              the total mass of the two distributions
00804     @param alpha_1            alpha, mu and S (sigma) describe a multinormal distribution
00805     @param mu1
00806     @param S1
00807     @param alpha_2            alpha, mu and S (sigma) describe a multinormal distribution
00808     @param mu2
00809     @param S2
00810     @param SeparatingHyperplane
00811     
00812     @return                  SeparatingHiperplane and gini of the split
00813 */
00814 double ComputeSeparatingHyperplane_QDA(double mass,
00815                                        double alpha_1, Vector<double>& mu1, Fortran_Matrix<double>& S1,
00816                                        double alpha_2, Vector<double>& mu2, Fortran_Matrix<double>& S2,
00817                                        Vector<double>& SeparatingHyperplane)
00818 {
00819 
00820     // Sice we got here we clearly have 2 clusters. So we should better produce a split at least
00821     // into the regressor space.
00822 
00823     int N=mu1.dim();
00824 
00825     /* Check if one of the matrices S1 or S2 has a 0 on the diagonal. If this is the case
00826        then the variable corresponding to that position is either extremelly predictive (one
00827        value of the variable determines exactly one of the clusters) or the variable is useless
00828        (all the values are identical). In the first case we split on it in the seccond we
00829        substibute the 0 with same small value in both matrices. */
00830 
00831     for (int i=1; i<=N; i++)
00832     {
00833         if (S1(i,i)==0.0 && S2(i,i)==0.0)
00834         {
00835             S1(i,i)=S2(i,i)=SMALL_POZ_VALUE;
00836             continue;
00837         }
00838         if (S1(i,i)==0.0 || S2(i,i)==0.0)
00839         {
00840             /* split on variable i exclusivelly. Do the split between the projections of the
00841                centers of the clusters on this direction proportional with the sizes of the clusters.
00842             */
00843 
00844             SeparatingHyperplane.newsize(N+1);
00845             SeparatingHyperplane[0]=-(alpha_1*mu1[i-1]+alpha_2*mu2[i-1]);
00846             for (int j=1; j<=N; j++)
00847                 SeparatingHyperplane[j]= (i==j) ? 1.0 : 0.0;
00848 
00849             return 1-pow2(alpha_1)-pow2(alpha_2);
00850         }
00851     }
00852 
00853     // compute the Cholesky of S1 in lower part of S1
00854     if ( Cholesky_upper_factorization(S1,S1) || Cholesky_upper_factorization(S2,S2) )
00855     {
00856         // one of the matrices S1 and S2 is badly conditioned
00857         cerr << "Badly conditioned matrices in ComputeSeparatingHyperplane" << endl;
00858 
00859     }
00860 
00861     // compute the square root of determinant of S1 and S2
00862     double detS1=1.0;
00863     double detS2=1.0;
00864     for (int i=1; i<=N; i++)
00865     {
00866         detS1*=S1(i,i);
00867         detS2*=S2(i,i);
00868     }
00869 
00870     /* compute the quadratic that separates the distributions
00871     and take the separating hyperplane to be the tangent to it
00872     in the intersection point with the line between the centers */
00873 
00874     // Compute C
00875     double C=2.0*log(detS2/detS1)+log(alpha_1/alpha_2);
00876 
00877     Vector<double> dif=mu2-mu1;
00878     Vector<double> G1inv_dif=
00879         Lower_triangular_solve< Fortran_Matrix<double>,Vector<double> > (S1,dif);
00880     double I1=dot_prod(G1inv_dif, G1inv_dif);
00881 
00882     Vector<double> G2inv_dif=
00883         Lower_triangular_solve< Fortran_Matrix<double>,Vector<double> > (S2,dif);
00884     double I2=dot_prod(G2inv_dif, G2inv_dif);
00885 
00886     // compute t as solution of (1-t)^2 I1-t^2 I2=C
00887     double t, delta;
00888     if (fabs(I1-I2)<SMALL_POZ_VALUE)
00889     {
00890         // Liner equation
00891         t=(I1-C)/(2*I1);
00892     }
00893     else
00894     {
00895         // quadratic equation
00896         delta=C*(I1-I2)+I1*I2;
00897         if (delta<0.0)
00898         {
00899             t=-1.0;
00900         }
00901         else
00902         {
00903             t=(I1-sqrt(delta))/(I1-I2);
00904             if (t<0.0 || t>1.0)
00905                 t=(I1+sqrt(delta))/(I1-I2);
00906         }
00907     }
00908 
00909     Vector<double> n; // the normal to the Separating Hyperplane
00910 
00911     /* check if the solution makes any sense. If not take a point proportional
00912        with the sizes (alpha) on the line that goes through the two centers
00913     */
00914     if (t<0.0 || t>1.0)
00915     {
00916         // fix t
00917         t=alpha_1/(alpha_1+alpha_2); // in case alphas are not normalized
00918         n=dif;
00919     }
00920     else
00921     {
00922         // compute n as the tangent to the the quadratic in point mu
00923         n=
00924             Transposed_Upper_triangular_solve(S1,G1inv_dif)*(-1.0+t)-
00925             Transposed_Upper_triangular_solve(S2,G2inv_dif)*t;
00926     }
00927     n=n*(1/sqrt(dot_prod(n,n)));  //normalize n
00928 
00929     // compute the point which the  Separating Hyperplane crosses
00930     Vector<double> mu=t*mu1+(1-t)*mu2;
00931 
00932     // compute etai, vari
00933     double eta1=dot_prod(n,mu1);
00934     double eta2=dot_prod(n,mu2);
00935 
00936     // compute vari=n^T*S1*n directly into vari
00937     double var1=0.0, var2=0.0;
00938     for (int j=1; j<=N; j++)
00939     {
00940         double aux1=0.0, aux2=0.0;
00941         // compute n^T*Si(:,j) into auxi
00942         for (int i=1; i<=j; i++)
00943         {
00944             aux1+=n(i)*S1(i,j);
00945             aux2+=n(i)*S2(i,j);
00946         }
00947         // use only upper part of S1 and S2
00948         for (int i=j+1; i<=N; i++)
00949         {
00950             aux1+=n(i)*S1(j,i);
00951             aux2+=n(i)*S2(j,2);
00952         }
00953         var1+=aux1*n(j);
00954         var2+=aux2*n(j);
00955     }
00956 
00957     double variance=UnidimensionalQDAVariance(mass*alpha_1, eta1, var1,
00958                     mass*alpha_2, eta2, var2, 5);
00959 
00960     assert(variance>0.0);
00961 
00962     // compute the equation of the SeparatingHyperplane
00963     double eta=dot_prod(n,mu);
00964     double sgn= (eta1>eta) ? 1.0 : -1.0;
00965     SeparatingHyperplane.newsize(N+1);
00966     SeparatingHyperplane[0]=-sgn*eta/sqrt(variance);
00967     for(int i=1; i<=N; i++)
00968         SeparatingHyperplane[i]=sgn*n[i-1]/sqrt(variance);
00969 
00970     double p11=alpha_1*PValueNormalDistribution(mu1, S1, n, mu);
00971     double p1_=p11+alpha_2*PValueNormalDistribution(mu2, S2, n, mu);
00972     double gini=BinaryGiniGain(p11,alpha_1,p1_);
00973     return gini;
00974 }
00975 
00976 
00977 /**
00978     The normal of the hyperplane is the best separating direction, i.e.
00979     the vector on which the projection of the two gaussians is as separated 
00980     as possible as measured by Fisher's discriminant.
00981  
00982     The equations are:
00983         - normal: n=(alpha_1*S1+alpha_2*S2)^{-1}(mu1-mu2)
00984         - separating equations: n^T*x-eta=0
00985         - eta is solution of the equation
00986 
00987     eta^2(1/var1 - 1/var2)-2eta(eta1/var1-eta2/var2)+eta1^2/var1-eta2^2/var2
00988            = 2ln(alpha1/alpha2)-ln(var1/var2)
00989  
00990     with etai=n^T*mu1 and vari=n^T*Si*n
00991  
00992     Parameters: alpha,mu and S (sigma) describe a multinormal distribution.
00993     There are 2 of them.
00994     Output: SeparatingHiperplane and gini of the split
00995  
00996     S1 and S2 are modified and have their respective Cholesky factorization in them
00997     
00998 */
00999 double ComputeSeparatingHyperplane_LDA(double mass /* the total mass of the two distributions*/,
01000                                        double alpha_1, Vector<double>& mu1, Fortran_Matrix<double>& S1,
01001                                        double alpha_2, Vector<double>& mu2, Fortran_Matrix<double>& S2,
01002                                        Vector<double>& SeparatingHyperplane)
01003 {
01004 
01005     if (alpha_1<TNNearlyZero || alpha_2<TNNearlyZero)
01006         return 0.0; // there's nothing we can do. One of the clusters is nonexistent
01007 
01008     int N = S1.num_rows();
01009     Vector<double> n(N); // normal to the hyperplane
01010     double eta1, eta2;
01011     double var1, var2;
01012 
01013     Fortran_Matrix<double> Sw(N,N);
01014 
01015     // Compute upper part of Sw
01016     for (int j=1; j<=N; j++)
01017         for (int i=1; i<=j; i++)
01018             Sw(i,j)=alpha_1*S1(i,j)+alpha_2*S2(i,j);
01019 
01020     // Put mu1-mu2 in n
01021     for (int i=1; i<=N; i++)
01022         n(i)=mu1(i)-mu2(i);
01023 
01024     // Solve Sw^{-1}*n into n
01025     // First compute Cholesky of Sw into lower Sw
01026     if ( Cholesky_upper_factorization(Sw,Sw) )
01027     {
01028         cerr << "Badly conditioned matrices in ComputeSeparatingHyperplane" << endl;
01029         cerr << "We use normal anova analysis to find the Separating Hyperplane" << endl;
01030         return ComputeSeparatingHyperplane_Anova(mass,alpha_1, mu1, S1, alpha_2, mu2, S2, SeparatingHyperplane);
01031     }
01032 
01033     Lower_triangular_solve(Sw,n);
01034     Transposed_Upper_triangular_solve(Sw,n);
01035     // normalize n
01036     if (IsZero(dot_prod(n,n)))
01037         return 0.0; // nothing we can do. Clusters are identical
01038 
01039     n=n*(1.0/sqrt(dot_prod(n,n)));
01040 
01041     // compute etai, vari
01042     eta1=dot_prod(n,mu1);
01043     eta2=dot_prod(n,mu2);
01044 
01045     // compute vari=n^T*S1*n directly into vari
01046     var1=0.0;
01047     var2=0.0;
01048     for (int j=1; j<=N; j++)
01049     {
01050         double aux1=0.0, aux2=0.0;
01051         // compute n^T*Si(:,j) into auxi
01052         for (int i=1; i<=j; i++)
01053         {
01054             aux1+=n(i)*S1(i,j);
01055             aux2+=n(i)*S2(i,j);
01056         }
01057         // use only upper part of S1 and S2
01058         for (int i=j+1; i<=N; i++)
01059         {
01060             aux1+=n(i)*S1(j,i);
01061             aux2+=n(i)*S2(j,2);
01062         }
01063         var1+=aux1*n(j);
01064         var2+=aux2*n(j);
01065     }
01066 
01067 
01068     int whichSol;
01069     double eta=UnidimensionalQDA(alpha_1, eta1, var1, alpha_2, eta2, var2, whichSol);
01070     double variance=UnidimensionalQDAVariance(mass*alpha_1, eta1, var1,
01071                     mass*alpha_2, eta2, var2, whichSol);
01072 
01073     assert(variance>0.0);
01074 
01075     // form the separating hyperplane
01076     // by divideing all components by sqrt(variance) we get the
01077     // quantities computed with the Separating Hyperplane to be
01078     // centered on 0 and have the standard distribution of the
01079     // split point 1.0
01080     SeparatingHyperplane.newsize(N+1);
01081     double sgn= (eta1>eta) ? 1.0 : -1.0;
01082     SeparatingHyperplane[0]=-sgn*eta/sqrt(variance);
01083     for (int i=1; i<=N; i++)
01084         SeparatingHyperplane[i]=sgn*n[i-1]/sqrt(variance);
01085 
01086     Vector<double> mu(N);
01087     mu=eta*n;
01088 
01089 
01090     // compute the Cholesky of S1 in lower part of S1
01091     if ( Cholesky_upper_factorization(S1,S1) || Cholesky_upper_factorization(S2,S2) )
01092     {
01093         // one of the matrices S1 and S2 is badly conditioned
01094         cerr << "Badly conditioned matrices in ComputeSeparatingHyperplane" << endl;
01095         return 0.0; // worst gini
01096     }
01097 
01098     double p11=alpha_1*PValueNormalDistribution(mu1, S1, n, mu);
01099     double p1_=p11+alpha_2*PValueNormalDistribution(mu2, S2, n, mu);
01100     double gini=BinaryGiniGain(p11,alpha_1,p1_);
01101     return gini;
01102 }
01103 
01104 
01105 }
01106 
01107 #endif //_CLUS_SPLITPOINTCOMPUTATION_H_