/*

Copyright (c) 2003, Cornell University
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

   - Redistributions of source code must retain the above copyright notice,
       this list of conditions and the following disclaimer.
   - Redistributions in binary form must reproduce the above copyright
       notice, this list of conditions and the following disclaimer in the
       documentation and/or other materials provided with the distribution.
   - Neither the name of Cornell University nor the names of its
       contributors may be used to endorse or promote products derived from
       this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
THE POSSIBILITY OF SUCH DAMAGE.

*/

// -*- C++ -*-

#if !defined _CLUS_MULTICLASSDISTRIBUTION_H
#define _CLUS_MULTICLASSDISTRIBUTION_H

// Standard headers for std::log and std::ostream/std::endl, so that this
// header compiles without depending on what the including file pulled in.
#include <cmath>
#include <ostream>

#ifdef CLUS_USE_XML
#include "xml.h"
#endif

namespace CLUS
{

/** Base class for all distributions that can predict a discrete variable.
    The methods are virtual since more than one subclass may be used in any
    given tree and it is easier to let the virtual function mechanism kick in.

    The base class itself behaves as a uniform distribution over the
    noClasses classes and keeps no sufficient statistics: all the learning
    hooks are empty.
*/
class MulticlassDistribution
{
protected:
    /// the number of classes the predicted variable has
    int noClasses;

    /// the value of the test that is usually computed during StopLearning
    double statisticalTest;

public:
    /** Builds a distribution over NoClasses classes.

        statisticalTest is set to 0.0 so that it never holds an
        indeterminate value before a subclass assigns it.

        @param NoClasses number of classes of the predicted variable
    */
    MulticlassDistribution(int NoClasses): noClasses(NoClasses), statisticalTest(0.0)
    { }

    virtual ~MulticlassDistribution(void)
    { }

    /** Infer will use data to produce noClasses normalized probabilities
        into result.

        The base implementation ignores both inputs and writes the uniform
        probability 1.0/noClasses into each of the first noClasses entries.

        @param cdata  values of the continuous variables (unused here)
        @param ddata  values of the discrete variables (unused here)
        @param result output array; entries 0..noClasses-1 are overwritten
    */
    virtual void Infer(const double* cdata, const int* ddata, double* result)
    {
        const double uniformProbability = 1.0/noClasses;
        for (int i=0; i<noClasses; i++)
            result[i]=uniformProbability;
    }

    /** MultiplicativeInfer uses data to produce probabilities and
        multiplies these probabilities with the ones in result.

        The base implementation implements just a uniform distribution
        regardless of the inputs: each of the first noClasses entries of
        result is divided by noClasses.

        @param cdata  values of the continuous variables (unused here)
        @param ddata  values of the discrete variables (unused here)
        @param result in/out array; entries 0..noClasses-1 are scaled in place
    */
    virtual void MultiplicativeInfer(const double* cdata, const int* ddata, double* result)
    {
        for (int i=0; i<noClasses; i++)
            result[i]/=noClasses;
    }

    /** Initialize the sufficient statistics that are maintained.
        The base implementation does nothing.
    */
    virtual void StartLearning(void)
    { }

    /** Update the sufficient statistics according to the current input.
        Should be used if the class label is known for sure.
        The base implementation does nothing.

        @param cdata contains values for the continuous variables
        @param ddata for the discrete ones
        @param classLabel known classification label
        @param weightSample used to give different importance
        to the samples (magnifying glass effect).
    */
    virtual void LearnSample(const double* cdata, const int* ddata, int classLabel, double weightSample=1.0)
    { }

    /** Update the sufficient statistics according to the current input.
        Should be used if the class label cannot be determined with
        certainty.
        The base implementation does nothing.

        NOTE(review): classProbabilities is a single double passed by value
        although the description speaks of "probabilities"; confirm against
        the subclasses whether a pointer to noClasses values was intended.

        @param cdata contains values for the continuous variables
        @param ddata for the discrete ones
        @param classProbabilities classification probabilities
        @param weightSample used to give different importance
        to the samples (magnifying glass effect).
    */
    virtual void LearnSample(const double* cdata, const int* ddata, double classProbabilities, double weightSample=1.0)
    { }

    /** Uses the sufficient statistics to compute estimates of the parameters of the
        distribution.
        The base implementation does nothing.
    */
    virtual void StopLearning(void)
    { }

    /** Returns the log of p-value=1-cdf of the appropriate statistical test.
        In other words it returns the probability that randomly (no correlations between
        input and class label) we do as well. The smaller the p-value the more predictive
        the distribution. The criterion depends on the distribution.
        This function should be called only after StopLearning.

        @return for the base class, log(1.0/noClasses), the value used for
        the uniform distribution
    */
    virtual double PValueStatisticalTest(void)
    {
        return std::log(1.0/noClasses); // p-value for uniform distribution
    }

    /** Writes nothing to output in the base class.
        Note that, unlike the other methods, this one is not declared virtual.

        @param output stream that would receive the saved state
    */
    void SaveToStream(std::ostream& output)
    {}

#ifdef CLUS_USE_XML
    /** Prints the distribution in a stream in XML, as a single
        MulticlassDistribution element with a noClasses attribute.

        @param output stream the element is written to
    */
    virtual void PrintToXmlStream(std::ostream& output)
    {
        output << "<MulticlassDistribution";
        PrintAttribute(output, "noClasses", noClasses);
        output << "/>" << std::endl;
    }
#endif

    /** @param index class label to check
        @return true if the classLabel index has no significant appearance;
        the base class always answers false
    */
    virtual bool IsClassLabelAbsent(int index)
    {
        return false;
    }

};

}

#endif // _CLUS_MULTICLASSDISTRIBUTION_H