00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
#include <assert.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#include <fstream>
#include <iostream>
#include <list>
#include <string>
#include <vector>
00039
00040 using namespace std;
00041
/// One parsed input record: the encoded values of its discrete columns and
/// the numeric values of its continuous columns, each in column order.
struct elemList
{
    std::vector<int> disc;     ///< Encoded discrete values.
    std::vector<double> cont;  ///< Continuous values.

    /// Copies both value vectors into the record.
    ///
    /// The arguments are only read, so they are taken by const reference;
    /// this also lets callers pass const vectors or temporaries.
    elemList(const std::vector<int>& Disc, const std::vector<double>& Cont)
        : disc(Disc), cont(Cont)
    { }
};
00052
00053
00054 int main(int argc, char** argv)
00055 {
00056
00057 double trainFrac = .5;
00058 double pruneFrac =.25;
00059 int threshold=1;
00060
00061 if (argc<2 || argc>5)
00062 {
00063 cerr << "Usage: UCItranslator description(c=continouous, d=discrete) [ threshold trainFrac pruneFrac] " << endl;
00064 return 1;
00065 }
00066
00067 srandom(time(0));
00068
00069 char description[256];
00070 strcpy(description,argv[1]);
00071
00072 if (argc>=3)
00073 threshold=atoi(argv[2]);
00074 if (argc>=4)
00075 trainFrac=atof(argv[3]);
00076 if (argc>=5)
00077 pruneFrac=atof(argv[4]);
00078
00079 int noVars = strlen(description);
00080
00081 int noDiscrete;
00082 int noContinuous;
00083 int noLines=0;
00084
00085 list<elemList*> Repository;
00086
00087
00088
00089
00090
00091 noDiscrete=0;
00092 noContinuous=0;
00093 for (int i=0; i<noVars; i++)
00094 {
00095 switch (description[i])
00096 {
00097 case 'c':
00098 noContinuous++;
00099 break;
00100
00101 case 'd':
00102 case 's':
00103 case 'n':
00104 noDiscrete++;
00105 break;
00106
00107 default:
00108 cerr << "Wrong decription" << endl;
00109 return 1;
00110 }
00111 }
00112
00113
00114 vector< list<string> > Dictionary(noDiscrete);
00115 vector< int > sizeDiscrete(noDiscrete);
00116
00117
00118 vector< int > discreteData(noDiscrete);
00119 vector< double > continuousData(noContinuous);
00120
00121
00122 while (!cin.eof())
00123 {
00124 noLines++;
00125
00126
00127 int curr_cont=0;
00128 int curr_disc=0;
00129
00130 char c;
00131
00132 while ( (c=cin.get())==' ' || c=='\t' || c=='\n' || c==',')
00133 { }
00134 cin.putback(c);
00135
00136 for (int i=0; i<noDiscrete+noContinuous; i++)
00137 {
00138 char buffer[10000];
00139 int pos=0;
00140
00141 while ( (c=cin.get())!=' ' && c!='\t' && c!='\n' && c!=',' && !cin.eof())
00142 buffer[pos++]=c;
00143 buffer[pos]=0;
00144 if (cin.eof())
00145 goto EOFencountered;
00146 cin.putback(c);
00147
00148
00149 while ( (c=cin.get())==' ' || c=='\t' || c=='\n' || c==',')
00150 { }
00151 cin.putback(c);
00152 if (cin.eof())
00153 goto EOFencountered;
00154
00155
00156 string token(buffer);
00157
00158 if (cin.eof())
00159 goto EOFencountered;
00160
00161 switch (description[i])
00162 {
00163 case 'c':
00164 {
00165 double value=atof(token.c_str());
00166 continuousData[curr_cont]=value;
00167
00168 curr_cont++;
00169 }
00170 break;
00171
00172 case 'd':
00173 {
00174
00175 int c_pos=0;
00176 discreteData[curr_disc]=-1;
00177 list<string>& dict = Dictionary[curr_disc];
00178 list<string>::iterator itrt;
00179 for (itrt=dict.begin();
00180 itrt!=dict.end(); itrt++)
00181 {
00182 string entry=*itrt;
00183 if (entry==token)
00184 {
00185 discreteData[curr_disc]=c_pos;
00186 break;
00187 }
00188 c_pos++;
00189 }
00190
00191 if (discreteData[curr_disc]==-1)
00192 {
00193 dict.push_back(token);
00194 discreteData[curr_disc]=c_pos;
00195 sizeDiscrete[curr_disc]++;
00196 }
00197
00198 curr_disc++;
00199 }
00200 break;
00201
00202 case 's':
00203 {
00204
00205 int value=atoi(token.c_str())-1;
00206 discreteData[curr_disc]=value;
00207
00208 if (sizeDiscrete[curr_disc]<value+1)
00209 sizeDiscrete[curr_disc]=value+1;
00210
00211 curr_disc++;
00212 }
00213 break;
00214
00215 case 'n':
00216 {
00217
00218 int value=atoi(token.c_str());
00219 discreteData[curr_disc]=value;
00220
00221 if (sizeDiscrete[curr_disc]<value+1)
00222 sizeDiscrete[curr_disc]=value+1;
00223
00224 curr_disc++;
00225 }
00226
00227 }
00228 }
00229
00230 assert(curr_disc+curr_cont==noDiscrete+noContinuous);
00231
00232 elemList* elp = new elemList(discreteData, continuousData);
00233 Repository.push_back(elp );
00234 }
00235
00236 EOFencountered:
00237
00238 int trainLines=0;
00239 int pruneLines=0;
00240 int testLines=0;
00241
00242 vector<int> whichSet(noLines);
00243
00244 for (int i=0; i < noLines; i++)
00245 {
00246 double d =1.0*rand()/RAND_MAX;
00247 if (d<trainFrac)
00248 {
00249 whichSet[i] = 0;
00250 trainLines++;
00251 }
00252 else if (d < trainFrac+pruneFrac)
00253 {
00254 whichSet[i]=1;
00255 pruneLines++;
00256 }
00257 else
00258 {
00259 whichSet[i]=2;
00260 testLines++;
00261 }
00262 }
00263
00264 ofstream trainFile("trainset");
00265 ofstream pruneFile("pruneset");
00266 ofstream testFile("testset");
00267
00268
00269
00270 trainFile << "# " << trainLines << " " << noDiscrete << " " << noContinuous << endl;
00271
00272 if (noDiscrete>0)
00273 {
00274 trainFile << "# " ;
00275 for (int i=0; i<noDiscrete-1; i++)
00276 trainFile << sizeDiscrete[i] << " ";
00277 trainFile << 2 << endl;
00278 }
00279
00280 pruneFile << "# " << pruneLines << " " << noDiscrete << " " << noContinuous << endl;
00281 if (noDiscrete>0)
00282 {
00283 pruneFile << "# " ;
00284 for (int i=0; i<noDiscrete-1; i++)
00285 pruneFile << sizeDiscrete[i] << " ";
00286 pruneFile << 2 << endl;
00287 }
00288
00289 testFile << "# " << testLines << " " << noDiscrete << " " << noContinuous << endl;
00290 if (noDiscrete>0)
00291 {
00292 testFile << "# " ;
00293 for (int i=0; i<noDiscrete-1; i++)
00294 testFile << sizeDiscrete[i] << " ";
00295 testFile << 2 << endl;
00296 }
00297
00298
00299
00300
00301 int cLine=0;
00302 list<elemList*>::iterator itrt;
00303 for (itrt=Repository.begin();
00304 itrt!=Repository.end(); itrt++)
00305 {
00306 elemList& eL = *(*itrt);
00307 vector<int> discreteData = eL.disc;
00308 vector<double> continuousData = eL.cont;
00309
00310 if(Dictionary[noDiscrete-1].size() > 2)
00311 {
00312 if (discreteData[noDiscrete-1] <= threshold)
00313 discreteData[noDiscrete-1] = 0;
00314 else
00315 discreteData[noDiscrete-1]=1;
00316 }
00317
00318
00319 if (whichSet[cLine]==0)
00320 {
00321 for (int i=0; i<noDiscrete; i++)
00322 trainFile << discreteData[i] << " ";
00323
00324 for (int i=0; i<noContinuous; i++)
00325 trainFile << continuousData[i] << " ";
00326
00327 trainFile << endl;
00328 }
00329 else if (whichSet[cLine]==1)
00330 {
00331 for (int i=0; i<noDiscrete; i++)
00332 pruneFile << discreteData[i] << " ";
00333
00334 for (int i=0; i<noContinuous; i++)
00335 pruneFile << continuousData[i] << " ";
00336
00337 pruneFile << endl;
00338 }
00339 else
00340 {
00341 for (int i=0; i<noDiscrete-1; i++)
00342 testFile << discreteData[i] << " ";
00343
00344 for (int i=0; i<noContinuous; i++)
00345 testFile << continuousData[i] << " ";
00346
00347 testFile << discreteData[noDiscrete-1];
00348 testFile << endl;
00349 }
00350
00351 cLine++;
00352 }
00353
00354 return 0;
00355 }