Main Page | Namespace List | Class Hierarchy | Data Structures | File List | Namespace Members | Data Fields | Globals | Related Pages

UCItranslator-val.cc

Go to the documentation of this file.
00001 /*
00002 
00003 Copyright (c) 2003, Cornell University
00004 All rights reserved.
00005 
00006 Redistribution and use in source and binary forms, with or without
00007 modification, are permitted provided that the following conditions are met:
00008 
00009    - Redistributions of source code must retain the above copyright notice,
00010        this list of conditions and the following disclaimer.
00011    - Redistributions in binary form must reproduce the above copyright
00012        notice, this list of conditions and the following disclaimer in the
00013        documentation and/or other materials provided with the distribution.
00014    - Neither the name of Cornell University nor the names of its
00015        contributors may be used to endorse or promote products derived from
00016        this software without specific prior written permission.
00017 
00018 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
00019 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00020 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
00021 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
00022 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
00023 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
00024 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
00025 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
00026 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
00027 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
00028 THE POSSIBILITY OF SUCH DAMAGE.
00029 
00030 */
00031 
#include <assert.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#include <fstream>
#include <iostream>
#include <list>
#include <string>
#include <vector>
00039 
00040 using namespace std;
00041 
// One parsed input tuple, stored by value: the integer-coded discrete
// attributes and the continuous attributes, each in description order.
struct elemList
{
    std::vector<int> disc;     // values of the discrete attributes
    std::vector<double> cont;  // values of the continuous attributes

    // Copies both vectors into the new element. The arguments are only read,
    // so they are taken by const reference; this also allows construction
    // from const vectors and from temporaries.
    elemList(const std::vector<int>& Disc, const std::vector<double>& Cont):
            disc(Disc), cont(Cont)
    { }
};
00052 
00053 
00054 int main(int argc, char** argv)
00055 {
00056     // what percent of the data to give to train, prune, and test sets.
00057     double trainFrac = .5;
00058     double pruneFrac =.25;
00059 
00060     if (argc<3 || argc>5)
00061     {
00062         cerr << "Usage: UCItranslator description(c=continouous, d=discrete) valC0 [trainFrac pruneFrac] " << endl;
00063         return 1;
00064     }
00065 
00066     /* valC0 has to be from 0 up. When is out of range,
00067        1 is returned so that the script that is calling 
00068        UCItranslator-val does not have to know about the 
00069        maximum value admissible */
00070 
00071     srandom(time(0));
00072 
00073     char description[256];
00074     strcpy(description,argv[1]);
00075     int valC0=atoi(argv[2]);
00076 
00077     if (argc>=4)
00078         trainFrac=atof(argv[3]);
00079     if (argc>=5)
00080         pruneFrac=atof(argv[4]);
00081 
00082     int noVars = strlen(description);
00083     // nuber of variables
00084     int noDiscrete;
00085     int noContinuous;
00086     int noLines=0;
00087 
00088     list<elemList*> Repository;
00089 
00090     /* If variable is discrete or classLabel build the dictionary
00091        on the fly 
00092     */
00093 
00094     noDiscrete=0;
00095     noContinuous=0;
00096     for (int i=0; i<noVars; i++)
00097     {
00098         switch (description[i])
00099         {
00100         case 'c':
00101             noContinuous++;
00102             break;
00103 
00104         case 'd':
00105         case 's':
00106         case 'n':
00107             noDiscrete++;
00108             break;
00109 
00110         default:
00111             cerr << "Wrong decription" << endl;
00112             return 1;
00113         }
00114     }
00115 
00116     // for discrete variables keep a dictionary
00117     vector< list<string> > Dictionary(noDiscrete);
00118     vector< int > sizeDiscrete(noDiscrete); // for substraction variables
00119 
00120     // data/tuple
00121     vector< int > discreteData(noDiscrete);
00122     vector< double > continuousData(noContinuous);
00123 
00124 
00125     while (!cin.eof())
00126     {
00127         noLines++;
00128 
00129         // parse the line
00130         int curr_cont=0;
00131         int curr_disc=0;
00132 
00133         char c;
00134         // skip separators
00135         while ( (c=cin.get())==' ' || c=='\t' || c=='\n' || c==',')
00136         { }
00137         cin.putback(c);
00138 
00139         for (int i=0; i<noDiscrete+noContinuous; i++)
00140         {
00141             char buffer[10000];
00142             int pos=0;
00143 
00144             while ( (c=cin.get())!=' ' && c!='\t' && c!='\n' && c!=',' && !cin.eof())
00145                 buffer[pos++]=c;
00146             buffer[pos]=0;
00147             if (cin.eof())
00148                 goto EOFencountered;
00149             cin.putback(c);
00150 
00151             // skip separators
00152             while ( (c=cin.get())==' ' || c=='\t' || c=='\n' || c==',')
00153             { }
00154             cin.putback(c);
00155             if (cin.eof())
00156                 goto EOFencountered;
00157 
00158 
00159             string token(buffer);
00160 
00161             if (cin.eof())
00162                 goto EOFencountered;
00163 
00164             switch (description[i])
00165             {
00166             case 'c':
00167                 {
00168                     double value=atof(token.c_str());
00169                     continuousData[curr_cont]=value;
00170 
00171                     curr_cont++;
00172                 }
00173                 break;
00174 
00175             case 'd':
00176                 {
00177                     // discrete data. Look token in Dictionary. If not there insert it
00178                     int c_pos=0;
00179                     discreteData[curr_disc]=-1;
00180                     list<string>& dict = Dictionary[curr_disc];
00181                     list<string>::iterator itrt;
00182                     for (itrt=dict.begin();
00183                             itrt!=dict.end(); itrt++)
00184                     {
00185                         string entry=*itrt;
00186                         if (entry==token)
00187                         {
00188                             discreteData[curr_disc]=c_pos;
00189                             break;
00190                         }
00191                         c_pos++;
00192                     }
00193 
00194                     if (discreteData[curr_disc]==-1)
00195                     {
00196                         dict.push_back(token);
00197                         discreteData[curr_disc]=c_pos;
00198                         sizeDiscrete[curr_disc]++;
00199                     }
00200 
00201                     curr_disc++;
00202                 }
00203                 break;
00204 
00205             case 's':
00206                 {
00207                     // discrete but already between 1 and n; just substract -1
00208                     int value=atoi(token.c_str())-1;
00209                     discreteData[curr_disc]=value;
00210 
00211                     if (sizeDiscrete[curr_disc]<value+1)
00212                         sizeDiscrete[curr_disc]=value+1;
00213 
00214                     curr_disc++;
00215                 }
00216                 break;
00217 
00218             case 'n':
00219                 {
00220                     // discrete but already between 1 and n; just substract -1
00221                     int value=atoi(token.c_str());
00222                     discreteData[curr_disc]=value;
00223 
00224                     if (sizeDiscrete[curr_disc]<value+1)
00225                         sizeDiscrete[curr_disc]=value+1;
00226 
00227                     curr_disc++;
00228                 }
00229 
00230             }
00231         }
00232 
00233         assert(curr_disc+curr_cont==noDiscrete+noContinuous);
00234 
00235         elemList* elp = new elemList(discreteData, continuousData);
00236         Repository.push_back(elp );
00237     }
00238 
00239 EOFencountered:
00240 
00241     // Check if valC0 is within limits
00242     if (valC0<0 || valC0>=(int)Dictionary[noDiscrete-1].size())
00243         return 1;
00244 
00245     int trainLines=0;
00246     int pruneLines=0;
00247     int testLines=0;
00248 
00249     vector<int> whichSet(noLines);
00250     // decide whether each line will be put in training, test, or pruning file.
00251     for (int i=0; i < noLines; i++)
00252     {
00253         double d =1.0*rand()/RAND_MAX;
00254         if (d<trainFrac)
00255         {
00256             whichSet[i] = 0;
00257             trainLines++;
00258         }
00259         else if (d < trainFrac+pruneFrac)
00260         {
00261             whichSet[i]=1;
00262             pruneLines++;
00263         }
00264         else
00265         {
00266             whichSet[i]=2;
00267             testLines++;
00268         }
00269     }
00270 
00271     ofstream trainFile("trainset");
00272     ofstream pruneFile("pruneset");
00273     ofstream testFile("testset");
00274     ofstream trainQuest("trainset-quest");
00275     ofstream testQuest("testset-quest");
00276     ofstream pruneQuest("pruneset-quest");
00277 
00278     // put headers on each of the files
00279 
00280     trainFile << "# " << trainLines << " " << noDiscrete << " " << noContinuous << endl;
00281 
00282     if (noDiscrete>0)
00283     {
00284         trainFile << "# " ;
00285         for (int i=0; i<noDiscrete-1; i++)
00286             trainFile << sizeDiscrete[i] << " ";
00287         trainFile << 2 << endl;
00288     }
00289 
00290     pruneFile << "# " << pruneLines << " " << noDiscrete << " " << noContinuous << endl;
00291     if (noDiscrete>0)
00292     {
00293         pruneFile << "# " ;
00294         for (int i=0; i<noDiscrete-1; i++)
00295             pruneFile << sizeDiscrete[i] << " ";
00296         pruneFile << 2 << endl;
00297     }
00298 
00299     testFile << "# " << testLines << " " << noDiscrete << " " << noContinuous << endl;
00300     if (noDiscrete>0)
00301     {
00302         testFile << "# " ;
00303         for (int i=0; i<noDiscrete-1; i++)
00304             testFile << sizeDiscrete[i] << " ";
00305         testFile << 2 << endl;
00306     }
00307 
00308     //int n = (int)Dictionary[noDiscrete-1].size();
00309     //int threshold = rand()%( n>2 ? n-1 : 2);
00310 
00311     int cLine=0;
00312     list<elemList*>::iterator itrt;
00313     for (itrt=Repository.begin();
00314             itrt!=Repository.end(); itrt++)
00315     {
00316         elemList& eL = *(*itrt);
00317         vector<int> discreteData = eL.disc;
00318         vector<double> continuousData = eL.cont;
00319 
00320         if (discreteData[noDiscrete-1] == valC0)
00321             discreteData[noDiscrete-1] = 0;
00322         else
00323             discreteData[noDiscrete-1]=1;
00324 
00325         if (whichSet[cLine]==0)
00326         {
00327             for (int i=0; i<noDiscrete; i++)
00328                 trainFile << discreteData[i] << " ";
00329 
00330             for (int i=0; i<noContinuous; i++)
00331                 trainFile << continuousData[i] << " ";
00332 
00333             trainFile << endl;
00334 
00335 
00336             for (int i=0; i<noDiscrete; i++)
00337                 trainQuest << discreteData[i] << " ";
00338 
00339             for (int i=0; i<noContinuous; i++)
00340                 trainQuest << continuousData[i] << " ";
00341 
00342             trainQuest << 1 << endl;
00343 
00344         }
00345         else if (whichSet[cLine]==1)
00346         {
00347             for (int i=0; i<noDiscrete; i++)
00348                 pruneFile << discreteData[i] << " ";
00349 
00350             for (int i=0; i<noContinuous; i++)
00351                 pruneFile << continuousData[i] << " ";
00352 
00353             pruneFile << endl;
00354 
00355             for (int i=0; i<noDiscrete; i++)
00356                 pruneQuest << discreteData[i] << " ";
00357 
00358             for (int i=0; i<noContinuous; i++)
00359                 pruneQuest << continuousData[i] << " ";
00360 
00361             pruneQuest << 1 << endl;
00362 
00363         }
00364         else
00365         { // testing; format last discrete variable is the class label; put it last
00366             for (int i=0; i<noDiscrete-1; i++)
00367                 testFile << discreteData[i] << " ";
00368 
00369             for (int i=0; i<noContinuous; i++)
00370                 testFile << continuousData[i] << " ";
00371 
00372             testFile << discreteData[noDiscrete-1];
00373             testFile << endl;
00374 
00375             for (int i=0; i<noDiscrete; i++)
00376                 testQuest << discreteData[i] << " ";
00377 
00378             for (int i=0; i<noContinuous; i++)
00379                 testQuest << continuousData[i] << " ";
00380 
00381             testQuest << 0 << endl;
00382 
00383         }
00384 
00385         cLine++;
00386     }
00387 
00388     return 0;
00389 }

Generated on Mon Jul 21 16:57:25 2003 for SECRET by doxygen 1.3.2