CLUS Namespace Reference

Data Structures

class BasicBinomialStatistics

class BinaryDecisionTree

Implements the binary decision tree. More...

class BinaryDecisionTreeNode

Implements a node of the binary decision tree. More...

class BinaryMultiClassificationSplitter

class BinaryObliqueProbabilisticSplitter

The class was completely redesigned as of May 27, 2003 to incorporate fluctuations in splits, and no longer uses normal distributions to determine the probability functions. More...

class BinaryObliqueSplitter

class BinaryProbabilisticDecisionTree

class BinaryProbabilisticDecisionTreeNode

class BinaryProbabilisticRegressionTree

class BinaryProbabilisticRegressionTreeNode

Class used in building regression trees. More...

class BinaryProbabilisticSplitter

class BinaryRegressionTree

class BinaryRegressionTreeNode

Class used in building regression trees. More...

class BinarySplitter

Base class for all the splitters. More...

class BinomialStatistics

class Cluster

Cluster is the abstract base class for cluster hierarchy. More...

class ContinuousLinearTransformation

Applies linear shifts on continuous data. More...

class DataConsumer

class DataProducer

class DCTrainingData

Ancestor of all Training Data generators that can manipulate both discrete and continuous entries. More...

class DiscretePermutationTransformation

class Distribution

Base class for all the continuous distributions that have sufficient statistics. More...

class DynamicBuffer

Class to keep data temporarily that can grow automatically, only doubles can be stored inside. More...

class EMHiperPlan

class ErrMsg

class FileDataConsumer

class FileDataProducer

class Filter

class GridInputProducer

class HiperPlanCluster

This class implements hiperclusters with only one possible output. More...

class IndexedValue

class LinearRegressor

class Machine

Every machine has an input vector, an output vector, and a real output vector, and should provide a constructor from file. More...

class MulticlassContinuousDistribution

The class is a repository of continuous distributions that each predict one of the class labels of a discrete variable. More...

class MulticlassDistribution

Base class for all distributions that can predict a discrete variable. More...

class MultiDecisionTree

class MultiDecisionTreeNode

class MultiDimNormal

Implements a multidimensional normal distribution. More...

class MultidimNormalStatistics

Class implements a multidimensional normal distribution. More...

class NormalStatistics

class Permutation

Permutation[i] is the permuted value of i. More...

class ProbabilisticBinomialStatistics

class Regressor

class RPMSConsumer

class Scale

The following structure is used for scaling the inputs and the outputs: newVal=adit+mult*oldVal. More...

class SimpleBinarySplitter

Splitter for decision trees. More...

class SimpleNormalDistribution

Implements a unidimensional normal distribution but the "active" dimension can be specified. More...

class SkinyMultiDimNormal

For now make EMHiperPlanCluster look like a Distribution. More...

class SphericCluster

Class that describes Spheric Clusters. More...

class StreamDataConsumer

class StreamDataProducer

class StreamDCTrainingData

class SyncObj

class SyncObjList

struct SyncObjList::listel

struct T_array

auxiliary type More...

class TrainingData

Enumerations

enum ShiftType { labeled, unlabeled }

Functions

template<class T> Vector< T > & GenerateRandomVector2 (Vector< T > &vec)

template<class T> void Min (Vector< T > &vec, Subscript N, T *data)

template<class T> void Max (Vector< T > &vec, Subscript N, T data[])

template<class T> void Sum (Vector< T > &vec, Subscript N, T data[])

template<class T> void Dif (Vector< T > &vec, Subscript N, Vector< T > &data)

template<class T> void SumPow2 (Vector< T > &vec, Subscript N, T data[])

template<class T> Vector< T > & operator *= (Vector< T > &vec, double mult)

template<class T> Vector< T > & SetNormTo (Vector< T > &vec, double norm)

Machine * MachineFactory (int nrpar,...)

this function constructs a machine when given a list of its parameters

Machine * MachineFactory (char *filename)

this function constructs a machine when given its definition file

template<class T> bool IsPointInSet (T value, Vector< T > set)

Determines in log time if a point is in a set.

double PValueBinomialDistribution (double N, double p, double val)

Compute P[X>=val] for X~Binomial(N,p).

double PValueNormalDistribution (double mu, double sigma, double eta)

Computes int_{x>=eta} N(mu,var) dx.

double PValueNormalDistribution (const Vector< double > mu, const Fortran_Matrix< double > cholSigma, const Vector< double > n, const Vector< double > xc)

Computes int_{n'*(x-xc)>=0} N(mu,Sigma) dx.

int compare_array_elements (const void *x, const void *y)

auxiliary function to sort elements

double BinaryGiniGain (double p11, double p_1, double p1_)

Computes \Delta g(T).

double DiscreteGiniGain (Vector< double > &d_s_p1, Vector< double > &d_N, double N, double alpha_1, Vector< int > &Split)

Computes the maximum gain in gini by splitting on a discrete variable, and the actual split. Split: returns the best split here. Return: the new gini.

double ProbabilisticDiscreteGiniGain (const Vector< double > &d_s_p1, const Vector< double > &d_N, double N, double alpha_1, Vector< double > &probSet)

Sister function of DiscreteGiniGain.

double UnidimensionalQDA (double alpha_1, double eta1, double var1, double alpha_2, double eta2, double var2, int &whichSol)

Form equation eta^2(1/var1 - 1/var2)-2eta(eta1/var1-eta2/var2)+eta1^2/var1-eta2^2/var2 = 2ln(alpha1/alpha2)-ln(var1/var2) and solve it.

double UnidimensionalQDAVariance (double n1, double m1, double v1, double n2, double m2, double v2, int whichSol)

Computes the variance of the split point.

double ComputeSeparatingHyperplane_Anova (double mass, double alpha_1, Vector< double > &mu1, Fortran_Matrix< double > &S1, double alpha_2, Vector< double > &mu2, Fortran_Matrix< double > &S2, Vector< double > &SeparatingHyperplane)

The hyperplane is orthogonal to one of the axes, not oblique.

double ComputeSeparatingHyperplane_QDA (double mass, double alpha_1, Vector< double > &mu1, Fortran_Matrix< double > &S1, double alpha_2, Vector< double > &mu2, Fortran_Matrix< double > &S2, Vector< double > &SeparatingHyperplane)

Compute the quadratic that separates two distributions and take the separating hyperplane to be the tangent to it in the intersection point with the line between the centers.

double ComputeSeparatingHyperplane_LDA (double mass, double alpha_1, Vector< double > &mu1, Fortran_Matrix< double > &S1, double alpha_2, Vector< double > &mu2, Fortran_Matrix< double > &S2, Vector< double > &SeparatingHyperplane)

The normal of the hyperplane is the best separating direction, i.e.

StreamDCTrainingData * CreateStreamDCTrainingDataFromFile (char *filename)

Detailed Description

Todo: This solution is very ugly. Use virtual functions to get the same result.

Enumeration Type Documentation

enum CLUS::ShiftType

Enumeration values:

labeled

unlabeled

Definition at line 55 of file statisticsgatherers.h.

Function Documentation

double BinaryGiniGain ( double p11,

double p_1,

double p1_

)

Computes \Delta g(T).
C1 is cluster 1; S1 is split 1

Parameters:

p11 P[x \in C1 \wedge x \in S1]

p_1 P[x \in C1]

p1_ P[x \in S1]

Definition at line 311 of file splitpointcomputation.h.
Referenced by CLUS::NormalStatistics::ComputeGiniGain(), ComputeSeparatingHyperplane_Anova(), ComputeSeparatingHyperplane_LDA(), ComputeSeparatingHyperplane_QDA(), DiscreteGiniGain(), and ProbabilisticDiscreteGiniGain().

int compare_array_elements ( const void * x,

const void * y

)

auxiliary function to sort elements

Definition at line 291 of file splitpointcomputation.h.
Referenced by DiscreteGiniGain(), and ProbabilisticDiscreteGiniGain().

double ComputeSeparatingHyperplane_Anova ( double mass,

double alpha_1,

Vector< double > & mu1,

Fortran_Matrix< double > & S1,

double alpha_2,

Vector< double > & mu2,

Fortran_Matrix< double > & S2,

Vector< double > & SeparatingHyperplane

)

The hyperplane is orthogonal to one of the axes, not oblique.
the best axis to split on is determined by comparing the value of the t-test for the directions, i.e.
$t = \frac{(\eta_1 - \eta_2)^2}{\sigma_1^2 + \sigma_2^2}$
the split point is determined by solving QDA on the unidimensional space, as in the general case
Definition at line 745 of file splitpointcomputation.h.
Referenced by CLUS::MultidimNormalStatistics::ComputeGiniGain(), and ComputeSeparatingHyperplane_LDA().

double ComputeSeparatingHyperplane_LDA ( double mass,

double alpha_1,

Vector< double > & mu1,

Fortran_Matrix< double > & S1,

double alpha_2,

Vector< double > & mu2,

Fortran_Matrix< double > & S2,

Vector< double > & SeparatingHyperplane

)

The normal of the hyperplane is the best separating direction, i.e.
the vector on which the projection of the two gaussians is as separated as possible as measured by Fisher's discriminant.
The equations are:

normal: n=(alpha_1*S1+alpha_2*S2)^{-1}(mu1-mu2)
separating equations: n^T*x-eta=0
eta is solution of the equation

eta^2(1/var1 - 1/var2)-2eta(eta1/var1-eta2/var2)+eta1^2/var1-eta2^2/var2 = 2ln(alpha1/alpha2)-ln(var1/var2)
with etai=n^T*mui and vari=n^T*Si*n
Parameters: alpha, mu and S (sigma) describe a multinormal distribution. There are 2 of them. Output: SeparatingHyperplane and gini of the split
S1 and S2 are modified and have their respective Cholesky factorization in them
Definition at line 999 of file splitpointcomputation.h.
Referenced by CLUS::MultidimNormalStatistics::ComputeGiniGain().

double ComputeSeparatingHyperplane_QDA ( double mass,

double alpha_1,

Vector< double > & mu1,

Fortran_Matrix< double > & S1,

double alpha_2,

Vector< double > & mu2,

Fortran_Matrix< double > & S2,

Vector< double > & SeparatingHyperplane

)

Compute the quadratic that separates two distributions and take the separating hyperplane to be the tangent to it in the intersection point with the line between the centers.

Parameters:

mass the total mass of the two distributions

alpha_1 alpha, mu and S (sigma) describe a multinormal distribution

mu1

S1

alpha_2 alpha, mu and S (sigma) describe a multinormal distribution

mu2

S2

SeparatingHyperplane

Returns:
SeparatingHyperplane and gini of the split

Definition at line 814 of file splitpointcomputation.h.
Referenced by CLUS::MultidimNormalStatistics::ComputeGiniGain().

StreamDCTrainingData* CreateStreamDCTrainingDataFromFile ( char * filename )

Definition at line 81 of file streamdctrain.h.
Referenced by main().

template<class T>

void Dif ( Vector< T > & vec,

Subscript N,

Vector< T > & data

) [inline]

Definition at line 75 of file extravec.h.
Referenced by CLUS::SphericCluster::Split(), and CLUS::HiperPlanCluster::Split().

double DiscreteGiniGain ( Vector< double > & d_s_p1,

Vector< double > & d_N,

double N,

double alpha_1,

Vector< int > & Split

)

Computes the maximum gain in gini by splitting on a discrete variable, and the actual split. Split: returns the best split here. Return: the new gini.
If d_s_p1[i]=d_N[i]=0 we don't know anything about value i. To avoid biases we distribute these values among the two splits.
Theorem 9.4 from Breiman et al. justifies the linear algorithm.
Definition at line 325 of file splitpointcomputation.h.
Referenced by CLUS::BinomialStatistics::ComputeGiniGain(), and CLUS::BinomialStatistics::ComputeSplitPoint().

template<class T>

Vector<T>& GenerateRandomVector2 ( Vector< T > & vec ) [inline]

Definition at line 44 of file extravec.h.
Referenced by CLUS::SphericCluster::Split(), and CLUS::HiperPlanCluster::Split().

template<class T>

bool IsPointInSet ( T value,

Vector< T > set

)

Determines in log time if a point is in a set.
The set is a vector of sorted values.
Definition at line 62 of file splitpointcomputation.h.

Machine* MachineFactory ( char * filename )

this function constructs a machine when given its definition file

Machine* MachineFactory ( int nrpar,

...

)

this function constructs a machine when given a list of its parameters

template<class T>

void Max ( Vector< T > & vec,

Subscript N,

T data[]

) [inline]

Definition at line 60 of file extravec.h.
Referenced by CLUS::TrainingData::Normalize().

template<class T>

void Min ( Vector< T > & vec,

Subscript N,

T * data

) [inline]

Definition at line 52 of file extravec.h.
Referenced by CLUS::TrainingData::Normalize().

template<class T>

Vector<T>& operator *= ( Vector< T > & vec,

double mult

) [inline]

Definition at line 90 of file extravec.h.

double ProbabilisticDiscreteGiniGain ( const Vector< double > & d_s_p1,

const Vector< double > & d_N,

double N,

double alpha_1,

Vector< double > & probSet

)

Sister function of DiscreteGiniGain.
Instead of finding a split set it finds probabilities that a point belongs to the left set
Definition at line 444 of file splitpointcomputation.h.
Referenced by CLUS::ProbabilisticBinomialStatistics::ComputeGiniGain().

double PValueBinomialDistribution ( double N,

double p,

double val

)

Compute P[X>=val] for X~Binomial(N,p).
This is exactly the normalized incomplete beta function (see Eric's encyclopedia).
We use gsl to get this function. The result is IB_p(val,N-val-1.0)
Definition at line 91 of file splitpointcomputation.h.
Referenced by ProbabilisticDiscreteGiniGain().

double PValueNormalDistribution ( const Vector< double > mu,

const Fortran_Matrix< double > cholSigma,

const Vector< double > n,

const Vector< double > xc

)

Computes int_{n'*(x-xc)>=0} N(mu,Sigma) dx.

Definition at line 145 of file splitpointcomputation.h.

double PValueNormalDistribution ( double mu,

double sigma,

double eta

)

Computes int_{x>=eta} N(mu,var) dx.

Definition at line 131 of file splitpointcomputation.h.
Referenced by CLUS::NormalStatistics::ComputeGiniGain(), ComputeSeparatingHyperplane_Anova(), ComputeSeparatingHyperplane_LDA(), ComputeSeparatingHyperplane_QDA(), CLUS::BinaryProbabilisticSplitter::ProbabilityLeft(), and CLUS::BinaryObliqueSplitter::ProbabilityLeftPrivate().

template<class T>

Vector<T>& SetNormTo ( Vector< T > & vec,

double norm

) [inline]

Definition at line 98 of file extravec.h.
Referenced by CLUS::SphericCluster::Split(), and CLUS::HiperPlanCluster::Split().

template<class T>

void Sum ( Vector< T > & vec,

Subscript N,

T data[]

) [inline]

Definition at line 68 of file extravec.h.
Referenced by CLUS::TrainingData::Normalize().

template<class T>

void SumPow2 ( Vector< T > & vec,

Subscript N,

T data[]

) [inline]

Definition at line 83 of file extravec.h.
Referenced by CLUS::TrainingData::Normalize().

double UnidimensionalQDA ( double alpha_1,

double eta1,

double var1,

double alpha_2,

double eta2,

double var2,

int & whichSol

)

Form equation eta^2(1/var1 - 1/var2)-2eta(eta1/var1-eta2/var2)+eta1^2/var1-eta2^2/var2 = 2ln(alpha1/alpha2)-ln(var1/var2) and solve it.

Parameters:

alpha_1

eta1

var1

alpha_2

eta2

var2

whichSol set to 0 if first order equation solved, 1 if first sol of second order equation and 2 for second solution. Is set to 3 if the default

Returns:
weighted mean of averages

Definition at line 533 of file splitpointcomputation.h.
Referenced by CLUS::NormalStatistics::ComputeGiniGain(), ComputeSeparatingHyperplane_Anova(), and ComputeSeparatingHyperplane_LDA().

double UnidimensionalQDAVariance ( double n1,

double m1,

double v1,

double n2,

double m2,

double v2,

int whichSol

)

Computes the variance of the split point.
The prototype is in the Mathematica file StatisticsSplitPoint2.nb which has the computations of the variance using the delta method.
Definition at line 619 of file splitpointcomputation.h.
Referenced by CLUS::NormalStatistics::ComputeGiniGain(), ComputeSeparatingHyperplane_Anova(), ComputeSeparatingHyperplane_LDA(), and ComputeSeparatingHyperplane_QDA().

Generated on Mon Jul 21 16:57:43 2003 for SECRET by

1.3.2


Data Structures
class	BasicBinomialStatistics
class	BinaryDecisionTree
	Implements the binary decision tree. More...
class	BinaryDecisionTreeNode
	Implements a node of the binary decision tree. More...
class	BinaryMultiClassificationSplitter
class	BinaryObliqueProbabilisticSplitter
	The class was completely redesigned as of May 27, 2003 to incorporate fluctuations in splits, and no longer uses normal distributions to determine the probability functions. More...
class	BinaryObliqueSplitter
class	BinaryProbabilisticDecisionTree
class	BinaryProbabilisticDecisionTreeNode
class	BinaryProbabilisticRegressionTree
class	BinaryProbabilisticRegressionTreeNode
	Class used in building regression trees. More...
class	BinaryProbabilisticSplitter
class	BinaryRegressionTree
class	BinaryRegressionTreeNode
	Class used in building regression trees. More...
class	BinarySplitter
	Base class for all the splitters. More...
class	BinomialStatistics
class	Cluster
	Cluster is the abstract base class for cluster hierarchy. More...
class	ContinuousLinearTransformation
	Applies linear shifts on continuous data. More...
class	DataConsumer
class	DataProducer
class	DCTrainingData
	Ancestor of all Training Data generators that can manipulate both discrete and continuous entries. More...
class	DiscretePermutationTransformation
class	Distribution
	Base class for all the continuous distributions that have sufficient statistics. More...
class	DynamicBuffer
	Class to keep data temporarily that can grow automatically, only doubles can be stored inside. More...
class	EMHiperPlan
class	ErrMsg
class	FileDataConsumer
class	FileDataProducer
class	Filter
class	GridInputProducer
class	HiperPlanCluster
	This class implements hiperclusters with only one possible output. More...
class	IndexedValue
class	LinearRegressor
class	Machine
	Every machine has an input vector, an output vector, and a real output vector, and should provide a constructor from file. More...
class	MulticlassContinuousDistribution
	The class is a repository of continuous distributions that each predict one of the class labels of a discrete variable. More...
class	MulticlassDistribution
	Base class for all distributions that can predict a discrete variable. More...
class	MultiDecisionTree
class	MultiDecisionTreeNode
class	MultiDimNormal
	Implements a multidimensional normal distribution. More...
class	MultidimNormalStatistics
	Class implements a multidimensional normal distribution. More...
class	NormalStatistics
class	Permutation
	Permutation[i] is the permuted value of i. More...
class	ProbabilisticBinomialStatistics
class	Regressor
class	RPMSConsumer
class	Scale
	The following structure is used for scaling the inputs and the outputs: newVal=adit+mult*oldVal. More...
class	SimpleBinarySplitter
	Splitter for decision trees. More...
class	SimpleNormalDistribution
	Implements a unidimensional normal distribution but the "active" dimension can be specified. More...
class	SkinyMultiDimNormal
	For now make EMHiperPlanCluster look like a Distribution. More...
class	SphericCluster
	Class that describes Spheric Clusters. More...
class	StreamDataConsumer
class	StreamDataProducer
class	StreamDCTrainingData
class	SyncObj
class	SyncObjList
struct	SyncObjList::listel
struct	T_array
	auxiliary type More...
class	TrainingData
Enumerations
enum	ShiftType { labeled, unlabeled }
Functions
template<class T> Vector< T > &	GenerateRandomVector2 (Vector< T > &vec)
template<class T> void	Min (Vector< T > &vec, Subscript N, T *data)
template<class T> void	Max (Vector< T > &vec, Subscript N, T data[])
template<class T> void	Sum (Vector< T > &vec, Subscript N, T data[])
template<class T> void	Dif (Vector< T > &vec, Subscript N, Vector< T > &data)
template<class T> void	SumPow2 (Vector< T > &vec, Subscript N, T data[])
template<class T> Vector< T > &	operator *= (Vector< T > &vec, double mult)
template<class T> Vector< T > &	SetNormTo (Vector< T > &vec, double norm)
Machine *	MachineFactory (int nrpar,...)
	this function constructs a machine when given a list of its parameters
Machine *	MachineFactory (char *filename)
	this function constructs a machine when given its definition file
template<class T> bool	IsPointInSet (T value, Vector< T > set)
	Determines in log time if a point is in a set.
double	PValueBinomialDistribution (double N, double p, double val)
	Compute P[X>=val] for X~Binomial(N,p).
double	PValueNormalDistribution (double mu, double sigma, double eta)
	Computes int_{x>=eta} N(mu,var) dx.
double	PValueNormalDistribution (const Vector< double > mu, const Fortran_Matrix< double > cholSigma, const Vector< double > n, const Vector< double > xc)
	Computes int_{n'*(x-xc)>=0} N(mu,Sigma) dx.
int	compare_array_elements (const void *x, const void *y)
	auxiliary function to sort elements
double	BinaryGiniGain (double p11, double p_1, double p1_)
	Computes \Delta g(T).
double	DiscreteGiniGain (Vector< double > &d_s_p1, Vector< double > &d_N, double N, double alpha_1, Vector< int > &Split)
	Computes the maximum gain in gini by splitting on a discrete variable, and the actual split. Split: returns the best split here. Return: the new gini.
double	ProbabilisticDiscreteGiniGain (const Vector< double > &d_s_p1, const Vector< double > &d_N, double N, double alpha_1, Vector< double > &probSet)
	Sister function of DiscreteGiniGain.
double	UnidimensionalQDA (double alpha_1, double eta1, double var1, double alpha_2, double eta2, double var2, int &whichSol)
	Form equation eta^2(1/var1 - 1/var2)-2eta(eta1/var1-eta2/var2)+eta1^2/var1-eta2^2/var2 = 2ln(alpha1/alpha2)-ln(var1/var2) and solve it.
double	UnidimensionalQDAVariance (double n1, double m1, double v1, double n2, double m2, double v2, int whichSol)
	Computes the variance of the split point.
double	ComputeSeparatingHyperplane_Anova (double mass, double alpha_1, Vector< double > &mu1, Fortran_Matrix< double > &S1, double alpha_2, Vector< double > &mu2, Fortran_Matrix< double > &S2, Vector< double > &SeparatingHyperplane)
	The hyperplane is orthogonal to one of the axes, not oblique.
double	ComputeSeparatingHyperplane_QDA (double mass, double alpha_1, Vector< double > &mu1, Fortran_Matrix< double > &S1, double alpha_2, Vector< double > &mu2, Fortran_Matrix< double > &S2, Vector< double > &SeparatingHyperplane)
	Compute the quadratic that separates two distributions and take the separating hyperplane to be the tangent to it in the intersection point with the line between the centers.
double	ComputeSeparatingHyperplane_LDA (double mass, double alpha_1, Vector< double > &mu1, Fortran_Matrix< double > &S1, double alpha_2, Vector< double > &mu2, Fortran_Matrix< double > &S2, Vector< double > &SeparatingHyperplane)
	The normal of the hyperplane is the best separating direction, i.e.
StreamDCTrainingData *	CreateStreamDCTrainingDataFromFile (char *filename)