00001 
00002 
00003 
00004 
00005 
00006 
00007 
00008 
00009 
00010 
00011 
00012 
00013 
00014 
00015 
00016 
00017 
00018 
00019 
00020 
00021 
00022 
00023 
00024 
00025 
00026 
00027 
00028 
00029 
00030 
00031 
#include <string.h>
#include <time.h>

#include <cassert>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <list>
#include <string>
#include <vector>
00039 
00040 using namespace std;
00041 
/**
 * One parsed input row, split by variable kind.
 *
 * `disc` holds the integer codes of the discrete variables and `cont` the
 * values of the continuous variables, each in the order the variables were
 * encountered in the row.
 */
struct elemList
{
    std::vector<int> disc;      // codes of the discrete variables
    std::vector<double> cont;   // values of the continuous variables

    /**
     * Stores copies of both vectors.
     *
     * The arguments are only read, so they are taken by const reference;
     * this also lets callers pass const vectors and temporaries.
     */
    elemList(const std::vector<int>& Disc, const std::vector<double>& Cont)
        : disc(Disc), cont(Cont)
    { }
};
00052 
00053 
00054 int main(int argc, char** argv)
00055 {
00056     
00057     double trainFrac = .5;
00058     double pruneFrac =.25;
00059     int threshold=1; 
00060 
00061     if (argc<2 || argc>5)
00062     {
00063         cerr << "Usage: UCItranslator description(c=continouous, d=discrete) [ threshold trainFrac pruneFrac] " << endl;
00064         return 1;
00065     }
00066 
00067     srandom(time(0));
00068 
00069     char description[256];
00070     strcpy(description,argv[1]);
00071 
00072     if (argc>=3)
00073         threshold=atoi(argv[2]);
00074     if (argc>=4)
00075         trainFrac=atof(argv[3]);
00076     if (argc>=5)
00077         pruneFrac=atof(argv[4]);
00078 
00079     int noVars = strlen(description);
00080     
00081     int noDiscrete;
00082     int noContinuous;
00083     int noLines=0;
00084 
00085     list<elemList*> Repository;
00086 
00087     
00088 
00089 
00090 
00091     noDiscrete=0;
00092     noContinuous=0;
00093     for (int i=0; i<noVars; i++)
00094     {
00095         switch (description[i])
00096         {
00097         case 'c':
00098             noContinuous++;
00099             break;
00100 
00101         case 'd':
00102         case 's':
00103         case 'n':
00104             noDiscrete++;
00105             break;
00106 
00107         default:
00108             cerr << "Wrong decription" << endl;
00109             return 1;
00110         }
00111     }
00112 
00113     
00114     vector< list<string> > Dictionary(noDiscrete);
00115     vector< int > sizeDiscrete(noDiscrete); 
00116 
00117     
00118     vector< int > discreteData(noDiscrete);
00119     vector< double > continuousData(noContinuous);
00120 
00121 
00122     while (!cin.eof())
00123     {
00124         noLines++;
00125 
00126         
00127         int curr_cont=0;
00128         int curr_disc=0;
00129 
00130         char c;
00131         
00132         while ( (c=cin.get())==' ' || c=='\t' || c=='\n' || c==',')
00133         { }
00134         cin.putback(c);
00135 
00136         for (int i=0; i<noDiscrete+noContinuous; i++)
00137         {
00138             char buffer[10000];
00139             int pos=0;
00140 
00141             while ( (c=cin.get())!=' ' && c!='\t' && c!='\n' && c!=',' && !cin.eof())
00142                 buffer[pos++]=c;
00143             buffer[pos]=0;
00144             if (cin.eof())
00145                 goto EOFencountered;
00146             cin.putback(c);
00147 
00148             
00149             while ( (c=cin.get())==' ' || c=='\t' || c=='\n' || c==',')
00150             { }
00151             cin.putback(c);
00152             if (cin.eof())
00153                 goto EOFencountered;
00154 
00155 
00156             string token(buffer);
00157 
00158             if (cin.eof())
00159                 goto EOFencountered;
00160 
00161             switch (description[i])
00162             {
00163             case 'c':
00164                 {
00165                     double value=atof(token.c_str());
00166                     continuousData[curr_cont]=value;
00167 
00168                     curr_cont++;
00169                 }
00170                 break;
00171 
00172             case 'd':
00173                 {
00174                     
00175                     int c_pos=0;
00176                     discreteData[curr_disc]=-1;
00177                     list<string>& dict = Dictionary[curr_disc];
00178                     list<string>::iterator itrt;
00179                     for (itrt=dict.begin();
00180                             itrt!=dict.end(); itrt++)
00181                     {
00182                         string entry=*itrt;
00183                         if (entry==token)
00184                         {
00185                             discreteData[curr_disc]=c_pos;
00186                             break;
00187                         }
00188                         c_pos++;
00189                     }
00190 
00191                     if (discreteData[curr_disc]==-1)
00192                     {
00193                         dict.push_back(token);
00194                         discreteData[curr_disc]=c_pos;
00195                         sizeDiscrete[curr_disc]++;
00196                     }
00197 
00198                     curr_disc++;
00199                 }
00200                 break;
00201 
00202             case 's':
00203                 {
00204                     
00205                     int value=atoi(token.c_str())-1;
00206                     discreteData[curr_disc]=value;
00207 
00208                     if (sizeDiscrete[curr_disc]<value+1)
00209                         sizeDiscrete[curr_disc]=value+1;
00210 
00211                     curr_disc++;
00212                 }
00213                 break;
00214 
00215             case 'n':
00216                 {
00217                     
00218                     int value=atoi(token.c_str());
00219                     discreteData[curr_disc]=value;
00220 
00221                     if (sizeDiscrete[curr_disc]<value+1)
00222                         sizeDiscrete[curr_disc]=value+1;
00223 
00224                     curr_disc++;
00225                 }
00226 
00227             }
00228         }
00229 
00230         assert(curr_disc+curr_cont==noDiscrete+noContinuous);
00231 
00232         elemList* elp = new elemList(discreteData, continuousData);
00233         Repository.push_back(elp );
00234     }
00235 
00236 EOFencountered:
00237 
00238     int trainLines=0;
00239     int pruneLines=0;
00240     int testLines=0;
00241 
00242     vector<int> whichSet(noLines);
00243     
00244     for (int i=0; i < noLines; i++)
00245     {
00246         double d =1.0*rand()/RAND_MAX;
00247         if (d<trainFrac)
00248         {
00249             whichSet[i] = 0;
00250             trainLines++;
00251         }
00252         else if (d < trainFrac+pruneFrac)
00253         {
00254             whichSet[i]=1;
00255             pruneLines++;
00256         }
00257         else
00258         {
00259             whichSet[i]=2;
00260             testLines++;
00261         }
00262     }
00263 
00264     ofstream trainFile("trainset");
00265     ofstream pruneFile("pruneset");
00266     ofstream testFile("testset");
00267 
00268     
00269 
00270     trainFile << "# " << trainLines << " " << noDiscrete << " " << noContinuous << endl;
00271 
00272     if (noDiscrete>0)
00273     {
00274         trainFile << "# " ;
00275         for (int i=0; i<noDiscrete-1; i++)
00276             trainFile << sizeDiscrete[i] << " ";
00277         trainFile << 2 << endl;
00278     }
00279 
00280     pruneFile << "# " << pruneLines << " " << noDiscrete << " " << noContinuous << endl;
00281     if (noDiscrete>0)
00282     {
00283         pruneFile << "# " ;
00284         for (int i=0; i<noDiscrete-1; i++)
00285             pruneFile << sizeDiscrete[i] << " ";
00286         pruneFile << 2 << endl;
00287     }
00288 
00289     testFile << "# " << testLines << " " << noDiscrete << " " << noContinuous << endl;
00290     if (noDiscrete>0)
00291     {
00292         testFile << "# " ;
00293         for (int i=0; i<noDiscrete-1; i++)
00294             testFile << sizeDiscrete[i] << " ";
00295         testFile << 2 << endl;
00296     }
00297 
00298     
00299     
00300 
00301     int cLine=0;
00302     list<elemList*>::iterator itrt;
00303     for (itrt=Repository.begin();
00304             itrt!=Repository.end(); itrt++)
00305     {
00306         elemList& eL = *(*itrt);
00307         vector<int> discreteData = eL.disc;
00308         vector<double> continuousData = eL.cont;
00309 
00310         if(Dictionary[noDiscrete-1].size() > 2)
00311         {
00312             if (discreteData[noDiscrete-1] <= threshold)
00313                 discreteData[noDiscrete-1] = 0;
00314             else
00315                 discreteData[noDiscrete-1]=1;
00316         }
00317 
00318 
00319         if (whichSet[cLine]==0)
00320         {
00321             for (int i=0; i<noDiscrete; i++)
00322                 trainFile << discreteData[i] << " ";
00323 
00324             for (int i=0; i<noContinuous; i++)
00325                 trainFile << continuousData[i] << " ";
00326 
00327             trainFile << endl;
00328         }
00329         else if (whichSet[cLine]==1)
00330         {
00331             for (int i=0; i<noDiscrete; i++)
00332                 pruneFile << discreteData[i] << " ";
00333 
00334             for (int i=0; i<noContinuous; i++)
00335                 pruneFile << continuousData[i] << " ";
00336 
00337             pruneFile << endl;
00338         }
00339         else
00340         { 
00341             for (int i=0; i<noDiscrete-1; i++)
00342                 testFile << discreteData[i] << " ";
00343 
00344             for (int i=0; i<noContinuous; i++)
00345                 testFile << continuousData[i] << " ";
00346 
00347             testFile << discreteData[noDiscrete-1];
00348             testFile << endl;
00349         }
00350 
00351         cLine++;
00352     }
00353 
00354     return 0;
00355 }