00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
#include <string.h>
#include <time.h>

#include <cassert>
#include <cstddef>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <list>
#include <string>
#include <vector>
00039
00040 using namespace std;
00041
/// One parsed input record, split by variable kind.
///
/// `disc` holds the integer-encoded discrete values and `cont` the
/// continuous values, each in the order the corresponding columns
/// appear in the input.
struct elemList
{
    std::vector<int> disc;     ///< encoded discrete values of the record
    std::vector<double> cont;  ///< continuous values of the record

    /// Copies both value vectors into the record.
    ///
    /// The arguments are only read, so they are taken by const reference;
    /// this also allows const vectors and temporaries to be passed.
    elemList(const std::vector<int>& Disc, const std::vector<double>& Cont)
        : disc(Disc), cont(Cont)
    { }
};
00052
00053
00054 int main(int argc, char** argv)
00055 {
00056
00057 double trainFrac = .5;
00058 double pruneFrac =.25;
00059
00060 if (argc<3 || argc>5)
00061 {
00062 cerr << "Usage: UCItranslator description(c=continouous, d=discrete) valC0 [trainFrac pruneFrac] " << endl;
00063 return 1;
00064 }
00065
00066
00067
00068
00069
00070
00071 srandom(time(0));
00072
00073 char description[256];
00074 strcpy(description,argv[1]);
00075 int valC0=atoi(argv[2]);
00076
00077 if (argc>=4)
00078 trainFrac=atof(argv[3]);
00079 if (argc>=5)
00080 pruneFrac=atof(argv[4]);
00081
00082 int noVars = strlen(description);
00083
00084 int noDiscrete;
00085 int noContinuous;
00086 int noLines=0;
00087
00088 list<elemList*> Repository;
00089
00090
00091
00092
00093
00094 noDiscrete=0;
00095 noContinuous=0;
00096 for (int i=0; i<noVars; i++)
00097 {
00098 switch (description[i])
00099 {
00100 case 'c':
00101 noContinuous++;
00102 break;
00103
00104 case 'd':
00105 case 's':
00106 case 'n':
00107 noDiscrete++;
00108 break;
00109
00110 default:
00111 cerr << "Wrong decription" << endl;
00112 return 1;
00113 }
00114 }
00115
00116
00117 vector< list<string> > Dictionary(noDiscrete);
00118 vector< int > sizeDiscrete(noDiscrete);
00119
00120
00121 vector< int > discreteData(noDiscrete);
00122 vector< double > continuousData(noContinuous);
00123
00124
00125 while (!cin.eof())
00126 {
00127 noLines++;
00128
00129
00130 int curr_cont=0;
00131 int curr_disc=0;
00132
00133 char c;
00134
00135 while ( (c=cin.get())==' ' || c=='\t' || c=='\n' || c==',')
00136 { }
00137 cin.putback(c);
00138
00139 for (int i=0; i<noDiscrete+noContinuous; i++)
00140 {
00141 char buffer[10000];
00142 int pos=0;
00143
00144 while ( (c=cin.get())!=' ' && c!='\t' && c!='\n' && c!=',' && !cin.eof())
00145 buffer[pos++]=c;
00146 buffer[pos]=0;
00147 if (cin.eof())
00148 goto EOFencountered;
00149 cin.putback(c);
00150
00151
00152 while ( (c=cin.get())==' ' || c=='\t' || c=='\n' || c==',')
00153 { }
00154 cin.putback(c);
00155 if (cin.eof())
00156 goto EOFencountered;
00157
00158
00159 string token(buffer);
00160
00161 if (cin.eof())
00162 goto EOFencountered;
00163
00164 switch (description[i])
00165 {
00166 case 'c':
00167 {
00168 double value=atof(token.c_str());
00169 continuousData[curr_cont]=value;
00170
00171 curr_cont++;
00172 }
00173 break;
00174
00175 case 'd':
00176 {
00177
00178 int c_pos=0;
00179 discreteData[curr_disc]=-1;
00180 list<string>& dict = Dictionary[curr_disc];
00181 list<string>::iterator itrt;
00182 for (itrt=dict.begin();
00183 itrt!=dict.end(); itrt++)
00184 {
00185 string entry=*itrt;
00186 if (entry==token)
00187 {
00188 discreteData[curr_disc]=c_pos;
00189 break;
00190 }
00191 c_pos++;
00192 }
00193
00194 if (discreteData[curr_disc]==-1)
00195 {
00196 dict.push_back(token);
00197 discreteData[curr_disc]=c_pos;
00198 sizeDiscrete[curr_disc]++;
00199 }
00200
00201 curr_disc++;
00202 }
00203 break;
00204
00205 case 's':
00206 {
00207
00208 int value=atoi(token.c_str())-1;
00209 discreteData[curr_disc]=value;
00210
00211 if (sizeDiscrete[curr_disc]<value+1)
00212 sizeDiscrete[curr_disc]=value+1;
00213
00214 curr_disc++;
00215 }
00216 break;
00217
00218 case 'n':
00219 {
00220
00221 int value=atoi(token.c_str());
00222 discreteData[curr_disc]=value;
00223
00224 if (sizeDiscrete[curr_disc]<value+1)
00225 sizeDiscrete[curr_disc]=value+1;
00226
00227 curr_disc++;
00228 }
00229
00230 }
00231 }
00232
00233 assert(curr_disc+curr_cont==noDiscrete+noContinuous);
00234
00235 elemList* elp = new elemList(discreteData, continuousData);
00236 Repository.push_back(elp );
00237 }
00238
00239 EOFencountered:
00240
00241
00242 if (valC0<0 || valC0>=(int)Dictionary[noDiscrete-1].size())
00243 return 1;
00244
00245 int trainLines=0;
00246 int pruneLines=0;
00247 int testLines=0;
00248
00249 vector<int> whichSet(noLines);
00250
00251 for (int i=0; i < noLines; i++)
00252 {
00253 double d =1.0*rand()/RAND_MAX;
00254 if (d<trainFrac)
00255 {
00256 whichSet[i] = 0;
00257 trainLines++;
00258 }
00259 else if (d < trainFrac+pruneFrac)
00260 {
00261 whichSet[i]=1;
00262 pruneLines++;
00263 }
00264 else
00265 {
00266 whichSet[i]=2;
00267 testLines++;
00268 }
00269 }
00270
00271 ofstream trainFile("trainset");
00272 ofstream pruneFile("pruneset");
00273 ofstream testFile("testset");
00274 ofstream trainQuest("trainset-quest");
00275 ofstream testQuest("testset-quest");
00276 ofstream pruneQuest("pruneset-quest");
00277
00278
00279
00280 trainFile << "# " << trainLines << " " << noDiscrete << " " << noContinuous << endl;
00281
00282 if (noDiscrete>0)
00283 {
00284 trainFile << "# " ;
00285 for (int i=0; i<noDiscrete-1; i++)
00286 trainFile << sizeDiscrete[i] << " ";
00287 trainFile << 2 << endl;
00288 }
00289
00290 pruneFile << "# " << pruneLines << " " << noDiscrete << " " << noContinuous << endl;
00291 if (noDiscrete>0)
00292 {
00293 pruneFile << "# " ;
00294 for (int i=0; i<noDiscrete-1; i++)
00295 pruneFile << sizeDiscrete[i] << " ";
00296 pruneFile << 2 << endl;
00297 }
00298
00299 testFile << "# " << testLines << " " << noDiscrete << " " << noContinuous << endl;
00300 if (noDiscrete>0)
00301 {
00302 testFile << "# " ;
00303 for (int i=0; i<noDiscrete-1; i++)
00304 testFile << sizeDiscrete[i] << " ";
00305 testFile << 2 << endl;
00306 }
00307
00308
00309
00310
00311 int cLine=0;
00312 list<elemList*>::iterator itrt;
00313 for (itrt=Repository.begin();
00314 itrt!=Repository.end(); itrt++)
00315 {
00316 elemList& eL = *(*itrt);
00317 vector<int> discreteData = eL.disc;
00318 vector<double> continuousData = eL.cont;
00319
00320 if (discreteData[noDiscrete-1] == valC0)
00321 discreteData[noDiscrete-1] = 0;
00322 else
00323 discreteData[noDiscrete-1]=1;
00324
00325 if (whichSet[cLine]==0)
00326 {
00327 for (int i=0; i<noDiscrete; i++)
00328 trainFile << discreteData[i] << " ";
00329
00330 for (int i=0; i<noContinuous; i++)
00331 trainFile << continuousData[i] << " ";
00332
00333 trainFile << endl;
00334
00335
00336 for (int i=0; i<noDiscrete; i++)
00337 trainQuest << discreteData[i] << " ";
00338
00339 for (int i=0; i<noContinuous; i++)
00340 trainQuest << continuousData[i] << " ";
00341
00342 trainQuest << 1 << endl;
00343
00344 }
00345 else if (whichSet[cLine]==1)
00346 {
00347 for (int i=0; i<noDiscrete; i++)
00348 pruneFile << discreteData[i] << " ";
00349
00350 for (int i=0; i<noContinuous; i++)
00351 pruneFile << continuousData[i] << " ";
00352
00353 pruneFile << endl;
00354
00355 for (int i=0; i<noDiscrete; i++)
00356 pruneQuest << discreteData[i] << " ";
00357
00358 for (int i=0; i<noContinuous; i++)
00359 pruneQuest << continuousData[i] << " ";
00360
00361 pruneQuest << 1 << endl;
00362
00363 }
00364 else
00365 {
00366 for (int i=0; i<noDiscrete-1; i++)
00367 testFile << discreteData[i] << " ";
00368
00369 for (int i=0; i<noContinuous; i++)
00370 testFile << continuousData[i] << " ";
00371
00372 testFile << discreteData[noDiscrete-1];
00373 testFile << endl;
00374
00375 for (int i=0; i<noDiscrete; i++)
00376 testQuest << discreteData[i] << " ";
00377
00378 for (int i=0; i<noContinuous; i++)
00379 testQuest << continuousData[i] << " ";
00380
00381 testQuest << 0 << endl;
00382
00383 }
00384
00385 cLine++;
00386 }
00387
00388 return 0;
00389 }