Main Page | Namespace List | Class Hierarchy | Data Structures | File List | Namespace Members | Data Fields | Globals | Related Pages

binaryobliquesplitter.h

Go to the documentation of this file.
00001 /*
00002 
00003 Copyright (c) 2003, Cornell University
00004 All rights reserved.
00005 
00006 Redistribution and use in source and binary forms, with or without
00007 modification, are permitted provided that the following conditions are met:
00008 
00009    - Redistributions of source code must retain the above copyright notice,
00010        this list of conditions and the following disclaimer.
00011    - Redistributions in binary form must reproduce the above copyright
00012        notice, this list of conditions and the following disclaimer in the
00013        documentation and/or other materials provided with the distribution.
00014    - Neither the name of Cornell University nor the names of its
00015        contributors may be used to endorse or promote products derived from
00016        this software without specific prior written permission.
00017 
00018 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
00019 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00020 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
00021 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
00022 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
00023 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
00024 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
00025 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
00026 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
00027 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
00028 THE POSSIBILITY OF SUCH DAMAGE.
00029 
00030 */
00031 
00032 // -*- C++ -*-
00033 
00034 #if !defined _CLUS_BINARYOBLIQUESPLITTER_H
00035 #define _CLUS_BINARYOBLIQUESPLITTER_H
00036 
00037 #include "general.h"
00038 #include "binarysplitter.h"
00039 #include "splitpointcomputation.h"
00040 #include "statisticsgatherers.h"
00041 #include <math.h>
00042 
00043 #include <stdlib.h>
00044 #include <iostream>
00045 
00046 using namespace TNT;
00047 
00048 namespace CLUS
00049 {
00050 
00051 class BinaryObliqueSplitter: public BinarySplitter
00052 {
00053 protected:
00054     int N;
00055     Vector<ProbabilisticBinomialStatistics>  discreteStatistics;
00056     MultidimNormalStatistics continuousStatistics;
00057 
00058     bool purePart;
00059 
00060     double ProbabilityLeftPrivate(const int* Dvars, const double* Cvars)
00061     {
00062         /* use the fact that SeparatingHyperplane is normalized */
00063         if (SplitVariable==-1)
00064         {
00065             // split on a hyperplane
00066             double crit=SeparatingHyperplane[0];
00067             for (int i=0; i<csplitDim+regDim; i++)
00068                 crit+=Cvars[i]*SeparatingHyperplane[i+1];
00069 
00070             return 1.0-PValueNormalDistribution(0.0,1.0,crit);
00071         }
00072         else
00073             if (SplitVariable <=-2)
00074             {
00075                 // split on a continuous variable
00076                 double crit=SeparatingHyperplane[0]+
00077                             Cvars[-SplitVariable-2]*SeparatingHyperplane[-SplitVariable-1];
00078 
00079                 return  1.0-PValueNormalDistribution(0.0,1.0,crit);
00080             }
00081             else
00082             {
00083                 // split on a discrete variable
00084                 int value=Dvars[SplitVariable];
00085                 return splitSetProbability[value];
00086             }
00087     }
00088 
00089 public:
00090     BinaryObliqueSplitter():BinarySplitter(), N(0), purePart(false)
00091     {}
00092     BinaryObliqueSplitter(const Vector<int>& DDomainSize,int CsplitDim, int RegDim):
00093             BinarySplitter(DDomainSize,CsplitDim,RegDim), N(0),purePart(false)
00094     {}
00095     BinaryObliqueSplitter(BinarySplitter& aux):
00096             BinarySplitter(aux)
00097     { }
00098 
00099 
00100     /// which of the three methods is used for computation of separating hyperplane
00101     enum MultidimNormalStatistics::SeparationType CSepHypType; 
00102 
00103     /// Should be called after ComputeSplitVariable()
00104     ///
00105     /// @return true if the children given by branch will be splitted in the future
00106     bool MoreSplits(int branch, int Min_no_datapoints)
00107     {
00108         if (purePart)
00109             return false;
00110 
00111         // cout << "s_p1=" << continuousStatistics.GetS_P1() << "\ts_p2=" << continuousStatistics.GetS_P2() << "\tmin_no_datapoints=" << Min_no_datapoints << endl;
00112 
00113         if (branch==0)
00114             return continuousStatistics.GetS_P1()>=Min_no_datapoints;
00115         else
00116             return continuousStatistics.GetS_P2()>=Min_no_datapoints;
00117     }
00118 
00119     /// Compute the probability to take the left branch
00120     double ProbabilityLeft(const int* Dvars, const double* Cvars)
00121     {
00122         if (ProbabilityLeftPrivate(Dvars,Cvars)>.5)
00123             return 1.0;
00124         else
00125             return 0.0;
00126     }
00127 
00128     int ChooseBranch( const int* Dvars, const double* Cvars)
00129     {
00130         if (ProbabilityLeftPrivate(Dvars,Cvars)>.5)
00131             return 0;
00132         else
00133             return 1;
00134     }
00135 
00136     void InitializeSplitStatistics(void)
00137     {
00138         discreteStatistics.newsize(dsplitDim);
00139         for (int i=0; i<dsplitDim; i++)
00140         {
00141             discreteStatistics[i].ResetDomainSize(dDomainSize[i]);
00142         }
00143 
00144         continuousStatistics.Resize(csplitDim+regDim);
00145     }
00146 
00147     void UpdateSplitStatistics( const int* Dvars, const double* Cvars,
00148                                 double p1I, double p2I, double probability)
00149     {
00150         N++;
00151 
00152         double p1=p1I*probability;
00153         double p2=p2I*probability;
00154 
00155         // update the discrete var statistics
00156         for(int i=0; i<dsplitDim; i++)
00157         {
00158             discreteStatistics[i].UpdateStatisticsP(Dvars[i],p1,p2);
00159         }
00160 
00161         continuousStatistics.UpdateStatistics(Cvars,p1,p2);
00162     }
00163 
00164     void DeleteTemporaryStatistics(void)
00165     {
00166         // free the space ocupied by the suficient statistics
00167         discreteStatistics.newsize(0);
00168         continuousStatistics.Resize(0);
00169     }
00170 
00171     int ComputeSplitVariable(int type = MultidimNormalStatistics::LDA)
00172     {
00173         double maxgini;
00174 
00175         if (N==0)
00176             goto error;
00177 
00178         // compute the best oblique split
00179         maxgini=continuousStatistics.ComputeGiniGain(type);
00180         SplitVariable=continuousStatistics.GetSplitVariable();
00181 
00182         // look for a cathegorical split that is better
00183 
00184         for (int i=0; i<dsplitDim; i++)
00185         {
00186             double gini=discreteStatistics[i].ComputeGiniGain();
00187 
00188             //cout << "Var: " << i << " gini=" << gini << endl;
00189 
00190             if (gini>maxgini)
00191             {
00192                 maxgini=gini;
00193                 SplitVariable=i;
00194             }
00195         }
00196 
00197         if (maxgini==0.0)
00198             goto error; // nobody makes any improvement
00199 
00200         if (SplitVariable>=0)
00201         {
00202             splitSetProbability=discreteStatistics[SplitVariable].GetProbabilitySet();
00203         }
00204         else
00205         {
00206             SeparatingHyperplane=continuousStatistics.GetSeparatingHyperplane();
00207         }
00208 
00209         // anything is hopefully donne at this point
00210 
00211         // if we are very close to maximum achievable gini we do no more splits in future
00212         if (fabs(maxgini-continuousStatistics.MaxGini()) < TNNearlyZero )
00213             purePart=true;
00214 
00215 #if 0
00216         //cout << "maxgini=" << maxgini << " maxpossible=" << 2*alpha_1*alpha_2 << endl;
00217         cout << "SplitVariable=" << SplitVariable << endl;
00218         if (SplitVariable>=0)
00219         {
00220             cout << " [[ ";
00221             for (int i=0; i<splitSetProbability.size(); i++)
00222                 cout << splitSetProbability[i] << " ";
00223             cout << " ]]" << endl;
00224             ;
00225         }
00226 #endif
00227 
00228         return 0;
00229 
00230 error:
00231         cout << "Error encountered, killing node" << endl;
00232 
00233         return -1;
00234     }
00235 };
00236 
00237 }
00238 
00239 #endif // _CLUS_BINARYOBLIQUESPLITTER_H

Generated on Mon Jul 21 16:57:23 2003 for SECRET by doxygen 1.3.2