Main Page | Namespace List | Class Hierarchy | Data Structures | File List | Namespace Members | Data Fields | Globals | Related Pages

UCItranslator.cc

Go to the documentation of this file.
00001 /*
00002 
00003 Copyright (c) 2003, Cornell University
00004 All rights reserved.
00005 
00006 Redistribution and use in source and binary forms, with or without
00007 modification, are permitted provided that the following conditions are met:
00008 
00009    - Redistributions of source code must retain the above copyright notice,
00010        this list of conditions and the following disclaimer.
00011    - Redistributions in binary form must reproduce the above copyright
00012        notice, this list of conditions and the following disclaimer in the
00013        documentation and/or other materials provided with the distribution.
00014    - Neither the name of Cornell University nor the names of its
00015        contributors may be used to endorse or promote products derived from
00016        this software without specific prior written permission.
00017 
00018 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
00019 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00020 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
00021 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
00022 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
00023 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
00024 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
00025 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
00026 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
00027 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
00028 THE POSSIBILITY OF SUCH DAMAGE.
00029 
00030 */
00031 
#include <string.h>
#include <time.h>

#include <cassert>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <list>
#include <string>
#include <vector>

using namespace std;
00041 
// One parsed input record (tuple): the encoded values of its discrete
// attributes and the numeric values of its continuous attributes, each kept
// in the order the attributes appear in the input.
struct elemList
{
    std::vector<int> disc;     // integer codes of the discrete attributes
    std::vector<double> cont;  // values of the continuous attributes

    // Stores copies of both vectors. The arguments are taken by const
    // reference because they are only read, so const vectors and
    // temporaries are accepted as well as plain lvalues.
    elemList(const std::vector<int>& Disc, const std::vector<double>& Cont):
            disc(Disc), cont(Cont)
    { }
};
00052 
00053 
00054 int main(int argc, char** argv)
00055 {
00056     // what percent of the data to give to train, prune, and test sets.
00057     double trainFrac = .5;
00058     double pruneFrac =.25;
00059     int threshold=1; // default threshold
00060 
00061     if (argc<2 || argc>5)
00062     {
00063         cerr << "Usage: UCItranslator description(c=continouous, d=discrete) [ threshold trainFrac pruneFrac] " << endl;
00064         return 1;
00065     }
00066 
00067     srandom(time(0));
00068 
00069     char description[256];
00070     strcpy(description,argv[1]);
00071 
00072     if (argc>=3)
00073         threshold=atoi(argv[2]);
00074     if (argc>=4)
00075         trainFrac=atof(argv[3]);
00076     if (argc>=5)
00077         pruneFrac=atof(argv[4]);
00078 
00079     int noVars = strlen(description);
00080     // nuber of variables
00081     int noDiscrete;
00082     int noContinuous;
00083     int noLines=0;
00084 
00085     list<elemList*> Repository;
00086 
00087     /* If variable is discrete or classLabel build the dictionary
00088        on the fly 
00089     */
00090 
00091     noDiscrete=0;
00092     noContinuous=0;
00093     for (int i=0; i<noVars; i++)
00094     {
00095         switch (description[i])
00096         {
00097         case 'c':
00098             noContinuous++;
00099             break;
00100 
00101         case 'd':
00102         case 's':
00103         case 'n':
00104             noDiscrete++;
00105             break;
00106 
00107         default:
00108             cerr << "Wrong decription" << endl;
00109             return 1;
00110         }
00111     }
00112 
00113     // for discrete variables keep a dictionary
00114     vector< list<string> > Dictionary(noDiscrete);
00115     vector< int > sizeDiscrete(noDiscrete); // for substraction variables
00116 
00117     // data/tuple
00118     vector< int > discreteData(noDiscrete);
00119     vector< double > continuousData(noContinuous);
00120 
00121 
00122     while (!cin.eof())
00123     {
00124         noLines++;
00125 
00126         // parse the line
00127         int curr_cont=0;
00128         int curr_disc=0;
00129 
00130         char c;
00131         // skip separators
00132         while ( (c=cin.get())==' ' || c=='\t' || c=='\n' || c==',')
00133         { }
00134         cin.putback(c);
00135 
00136         for (int i=0; i<noDiscrete+noContinuous; i++)
00137         {
00138             char buffer[10000];
00139             int pos=0;
00140 
00141             while ( (c=cin.get())!=' ' && c!='\t' && c!='\n' && c!=',' && !cin.eof())
00142                 buffer[pos++]=c;
00143             buffer[pos]=0;
00144             if (cin.eof())
00145                 goto EOFencountered;
00146             cin.putback(c);
00147 
00148             // skip separators
00149             while ( (c=cin.get())==' ' || c=='\t' || c=='\n' || c==',')
00150             { }
00151             cin.putback(c);
00152             if (cin.eof())
00153                 goto EOFencountered;
00154 
00155 
00156             string token(buffer);
00157 
00158             if (cin.eof())
00159                 goto EOFencountered;
00160 
00161             switch (description[i])
00162             {
00163             case 'c':
00164                 {
00165                     double value=atof(token.c_str());
00166                     continuousData[curr_cont]=value;
00167 
00168                     curr_cont++;
00169                 }
00170                 break;
00171 
00172             case 'd':
00173                 {
00174                     // discrete data. Look token in Dictionary. If not there insert it
00175                     int c_pos=0;
00176                     discreteData[curr_disc]=-1;
00177                     list<string>& dict = Dictionary[curr_disc];
00178                     list<string>::iterator itrt;
00179                     for (itrt=dict.begin();
00180                             itrt!=dict.end(); itrt++)
00181                     {
00182                         string entry=*itrt;
00183                         if (entry==token)
00184                         {
00185                             discreteData[curr_disc]=c_pos;
00186                             break;
00187                         }
00188                         c_pos++;
00189                     }
00190 
00191                     if (discreteData[curr_disc]==-1)
00192                     {
00193                         dict.push_back(token);
00194                         discreteData[curr_disc]=c_pos;
00195                         sizeDiscrete[curr_disc]++;
00196                     }
00197 
00198                     curr_disc++;
00199                 }
00200                 break;
00201 
00202             case 's':
00203                 {
00204                     // discrete but already between 1 and n; just substract -1
00205                     int value=atoi(token.c_str())-1;
00206                     discreteData[curr_disc]=value;
00207 
00208                     if (sizeDiscrete[curr_disc]<value+1)
00209                         sizeDiscrete[curr_disc]=value+1;
00210 
00211                     curr_disc++;
00212                 }
00213                 break;
00214 
00215             case 'n':
00216                 {
00217                     // discrete but already between 1 and n; just substract -1
00218                     int value=atoi(token.c_str());
00219                     discreteData[curr_disc]=value;
00220 
00221                     if (sizeDiscrete[curr_disc]<value+1)
00222                         sizeDiscrete[curr_disc]=value+1;
00223 
00224                     curr_disc++;
00225                 }
00226 
00227             }
00228         }
00229 
00230         assert(curr_disc+curr_cont==noDiscrete+noContinuous);
00231 
00232         elemList* elp = new elemList(discreteData, continuousData);
00233         Repository.push_back(elp );
00234     }
00235 
00236 EOFencountered:
00237 
00238     int trainLines=0;
00239     int pruneLines=0;
00240     int testLines=0;
00241 
00242     vector<int> whichSet(noLines);
00243     // decide whether each line will be put in training, test, or pruning file.
00244     for (int i=0; i < noLines; i++)
00245     {
00246         double d =1.0*rand()/RAND_MAX;
00247         if (d<trainFrac)
00248         {
00249             whichSet[i] = 0;
00250             trainLines++;
00251         }
00252         else if (d < trainFrac+pruneFrac)
00253         {
00254             whichSet[i]=1;
00255             pruneLines++;
00256         }
00257         else
00258         {
00259             whichSet[i]=2;
00260             testLines++;
00261         }
00262     }
00263 
00264     ofstream trainFile("trainset");
00265     ofstream pruneFile("pruneset");
00266     ofstream testFile("testset");
00267 
00268     // put headers on each of the files
00269 
00270     trainFile << "# " << trainLines << " " << noDiscrete << " " << noContinuous << endl;
00271 
00272     if (noDiscrete>0)
00273     {
00274         trainFile << "# " ;
00275         for (int i=0; i<noDiscrete-1; i++)
00276             trainFile << sizeDiscrete[i] << " ";
00277         trainFile << 2 << endl;
00278     }
00279 
00280     pruneFile << "# " << pruneLines << " " << noDiscrete << " " << noContinuous << endl;
00281     if (noDiscrete>0)
00282     {
00283         pruneFile << "# " ;
00284         for (int i=0; i<noDiscrete-1; i++)
00285             pruneFile << sizeDiscrete[i] << " ";
00286         pruneFile << 2 << endl;
00287     }
00288 
00289     testFile << "# " << testLines << " " << noDiscrete << " " << noContinuous << endl;
00290     if (noDiscrete>0)
00291     {
00292         testFile << "# " ;
00293         for (int i=0; i<noDiscrete-1; i++)
00294             testFile << sizeDiscrete[i] << " ";
00295         testFile << 2 << endl;
00296     }
00297 
00298     //int n = (int)Dictionary[noDiscrete-1].size();
00299     //int threshold = rand()%( n>2 ? n-1 : 2);
00300 
00301     int cLine=0;
00302     list<elemList*>::iterator itrt;
00303     for (itrt=Repository.begin();
00304             itrt!=Repository.end(); itrt++)
00305     {
00306         elemList& eL = *(*itrt);
00307         vector<int> discreteData = eL.disc;
00308         vector<double> continuousData = eL.cont;
00309 
00310         if(Dictionary[noDiscrete-1].size() > 2)
00311         {
00312             if (discreteData[noDiscrete-1] <= threshold)
00313                 discreteData[noDiscrete-1] = 0;
00314             else
00315                 discreteData[noDiscrete-1]=1;
00316         }
00317 
00318 
00319         if (whichSet[cLine]==0)
00320         {
00321             for (int i=0; i<noDiscrete; i++)
00322                 trainFile << discreteData[i] << " ";
00323 
00324             for (int i=0; i<noContinuous; i++)
00325                 trainFile << continuousData[i] << " ";
00326 
00327             trainFile << endl;
00328         }
00329         else if (whichSet[cLine]==1)
00330         {
00331             for (int i=0; i<noDiscrete; i++)
00332                 pruneFile << discreteData[i] << " ";
00333 
00334             for (int i=0; i<noContinuous; i++)
00335                 pruneFile << continuousData[i] << " ";
00336 
00337             pruneFile << endl;
00338         }
00339         else
00340         { // testing; format last discrete variable is the class label; put it last
00341             for (int i=0; i<noDiscrete-1; i++)
00342                 testFile << discreteData[i] << " ";
00343 
00344             for (int i=0; i<noContinuous; i++)
00345                 testFile << continuousData[i] << " ";
00346 
00347             testFile << discreteData[noDiscrete-1];
00348             testFile << endl;
00349         }
00350 
00351         cLine++;
00352     }
00353 
00354     return 0;
00355 }

Generated on Mon Jul 21 16:57:25 2003 for SECRET by doxygen 1.3.2