00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041
00042
00043
00044
00045
00046
00047
00048
00049
00050
00051
00052
00053
00054
00055
00056
00057
00058
00059
00060 #include "StringMap.h"
00061 #include "SeqBitmap.h"
00062 #include "TreeNode.h"
00063 #include "DatasetInfo.h"
00064 #include "Stats.h"
00065 #include <math.h>
00066 #include <stdlib.h>
00067 #include <stdio.h>
00068 #include <string.h>
00069 #include <iostream>
00070
00071 using namespace std;
00072
00073
00074
00075
00076
00077
00078 CountInfo* TreeNode::countList = 0;
00079
00080 ofstream summaryFile;
00081
00082
00083 int minSup;
00084 TreeNode** nodeBuff;
00085 int* tempIndexList;
00086 bool* indexExists;
00087
00088
00089 ofstream outFile;
00090 int **sequentialPatterns;
00091 int *elementSize;
00092 int sequenceLength;
00093 bool outputSeq;
00094 bool stdoutSpecified;
00095
00096 StringMap *custStrMap;
00097 StringMap *transStrMap;
00098 StringMap *itemStrMap;
00099 bool isStringFile;
00100
00101
00102
00103
00104
00105
00106 int minCompSize;
00107
00108
00109 double emptySpaceRatio;
00110
00111
00112 int compLevel;
00113
00114 int totalCust;
00115 int testCount = 0;
00116 int numCompress = 0;
00117
00118
00119 int globalLevel;
00120 #ifdef __STATS__
00121 Stats *statistics;
00122 #endif
00123
00124
00125
00126
00127
00128
00129
00130
00131 DatasetInfo* ReadDataset(
00132 bool isBinaryFile,
00133 bool isStringFile,
00134 char *filename,
00135 double minSupPercent,
00136 StringMap *& custStrMap,
00137 StringMap *& transStrMap,
00138 StringMap *& itemStrMap);
00139
00140 int compare( const void *arg1, const void *arg2 )
00141 {
00142
00143 return ( ((CountInfo* ) arg1)->count - (( CountInfo* ) arg2)->count );
00144 }
00145
00146 void LogStdoutSequence(const int c);
00147 void LogFileSequence(const int c);
00148
00149 void LogSequence(const int c)
00150 {
00151 if (stdoutSpecified)
00152 LogStdoutSequence(c);
00153 else
00154 LogFileSequence(c);
00155 }
00156
00157
00158 void LogStdoutSequence(const int c)
00159 {
00160
00161
00162 #ifdef ___PREFIXSPAN___
00163
00164 char buf[256];
00165
00166
00167 for (int j = 0; j <= sequenceLength; j++)
00168 {
00169
00170 cout << "(";
00171
00172
00173
00174
00175
00176
00177
00178
00179 if (isStringFile)
00180 cout << itemStrMap->GetValue(sequentialPatterns[j][0]);
00181 else
00182 cout << sequentialPatterns[j][0];
00183
00184 for (int m = 1; m <= elementSize[j]; m++)
00185 {
00186 if (isStringFile)
00187 cout << " " << itemStrMap->GetValue(sequentialPatterns[j][m]);
00188 else
00189 cout << " " << sequentialPatterns[j][m];
00190 }
00191 cout << ") ";
00192 }
00193
00194 sprintf(buf, "%.6f", (double)c / (double)totalCust);
00195 cout << ": " << buf << endl;
00196
00197 #else
00198
00199 int m;
00200
00201
00202 int* curItemset;
00203 int temp;
00204
00205 if (elementSize[0] >= 1)
00206 {
00207
00208 curItemset = new int[elementSize[0] + 1];
00209
00210 for (m = 0; m <= elementSize[0]; m++)
00211 curItemset[m] = sequentialPatterns[0][m];
00212
00213
00214 for (int kk = 0; kk <= elementSize[0]; kk++)
00215 {
00216 for (int jj = kk + 1; jj <= elementSize[0]; jj++)
00217 {
00218 if (curItemset[kk] > curItemset[jj])
00219 {
00220 temp = curItemset[kk];
00221 curItemset[kk] = curItemset[jj];
00222 curItemset[jj] = temp;
00223 }
00224 }
00225 }
00226
00227 for (m = 0; m <= elementSize[0]; m++)
00228 {
00229 if (isStringFile)
00230 cout << itemStrMap->GetValue(curItemset[m]) << " ";
00231 else
00232 cout << curItemset[m] << " " ;
00233 }
00234
00235 delete [] curItemset;
00236
00237 }
00238 else
00239 if (isStringFile)
00240 cout << itemStrMap->GetValue(sequentialPatterns[0][0]) << " ";
00241 else
00242 cout << sequentialPatterns[0][0] << " " ;
00243
00244
00245 for (int j = 1; j <= sequenceLength; j++)
00246 {
00247 cout << "-1 ";
00248
00249
00250 if (elementSize[j] >= 1)
00251 {
00252 curItemset = new int[elementSize[j] + 1];
00253
00254 for (m = 0; m <= elementSize[j]; m++)
00255 curItemset[m] = sequentialPatterns[j][m];
00256
00257 for (int kk = 0; kk <= elementSize[j]; kk++)
00258 for (int jj = kk + 1; jj <= elementSize[j]; jj++)
00259 if (curItemset[kk] > curItemset[jj])
00260 {
00261 temp = curItemset[kk];
00262 curItemset[kk] = curItemset[jj];
00263 curItemset[jj] = temp;
00264 }
00265
00266 if (isStringFile)
00267 cout << itemStrMap->GetValue(curItemset[0]) << " ";
00268 else
00269 cout << curItemset[0] << " " ;
00270
00271 for (m = 1; m <= elementSize[j]; m++)
00272 {
00273 if (isStringFile)
00274 cout << itemStrMap->GetValue(curItemset[m]) << " ";
00275 else
00276 cout << curItemset[m] << " " ;
00277 }
00278
00279 delete [] curItemset;
00280 }
00281 else
00282 if (isStringFile)
00283 cout << itemStrMap->GetValue(sequentialPatterns[j][0]);
00284 else
00285 cout << sequentialPatterns[j][0] << " " ;
00286
00287 }
00288 cout << "- " << c << endl;
00289
00290
00291 #endif
00292
00293 }
00294
00295
00296
00297 void LogFileSequence(const int c)
00298 {
00299
00300
00301 #ifdef ___PREFIXSPAN___
00302
00303 char buf[256];
00304
00305
00306 for (int j = 0; j <= sequenceLength; j++)
00307 {
00308
00309 outFile << "(";
00310
00311
00312
00313
00314
00315
00316
00317
00318 if (isStringFile)
00319 outFile << itemStrMap->GetValue(sequentialPatterns[j][0]);
00320 else
00321 outFile << sequentialPatterns[j][0];
00322
00323 for (int m = 1; m <= elementSize[j]; m++)
00324 {
00325 if (isStringFile)
00326 outFile << " " << itemStrMap->GetValue(sequentialPatterns[j][m]);
00327 else
00328 outFile << " " << sequentialPatterns[j][m];
00329 }
00330 outFile << ") ";
00331 }
00332
00333 sprintf(buf, "%.6f", (double)c / (double)totalCust);
00334 outFile << ": " << buf << endl;
00335
00336 #else
00337
00338 int m;
00339
00340
00341 int* curItemset;
00342 int temp;
00343
00344 if (elementSize[0] >= 1)
00345 {
00346
00347 curItemset = new int[elementSize[0] + 1];
00348
00349 for (m = 0; m <= elementSize[0]; m++)
00350 curItemset[m] = sequentialPatterns[0][m];
00351
00352
00353 for (int kk = 0; kk <= elementSize[0]; kk++)
00354 {
00355 for (int jj = kk + 1; jj <= elementSize[0]; jj++)
00356 {
00357 if (curItemset[kk] > curItemset[jj])
00358 {
00359 temp = curItemset[kk];
00360 curItemset[kk] = curItemset[jj];
00361 curItemset[jj] = temp;
00362 }
00363 }
00364 }
00365
00366 for (m = 0; m <= elementSize[0]; m++)
00367 {
00368 if (isStringFile)
00369 outFile << itemStrMap->GetValue(curItemset[m]) << " ";
00370 else
00371 outFile << curItemset[m] << " " ;
00372 }
00373
00374 delete [] curItemset;
00375
00376 }
00377 else
00378 if (isStringFile)
00379 outFile << itemStrMap->GetValue(sequentialPatterns[0][0]) << " ";
00380 else
00381 outFile << sequentialPatterns[0][0] << " " ;
00382
00383
00384 for (int j = 1; j <= sequenceLength; j++)
00385 {
00386 outFile << "-1 ";
00387
00388
00389 if (elementSize[j] >= 1)
00390 {
00391 curItemset = new int[elementSize[j] + 1];
00392
00393 for (m = 0; m <= elementSize[j]; m++)
00394 curItemset[m] = sequentialPatterns[j][m];
00395
00396 for (int kk = 0; kk <= elementSize[j]; kk++)
00397 for (int jj = kk + 1; jj <= elementSize[j]; jj++)
00398 if (curItemset[kk] > curItemset[jj])
00399 {
00400 temp = curItemset[kk];
00401 curItemset[kk] = curItemset[jj];
00402 curItemset[jj] = temp;
00403 }
00404
00405 if (isStringFile)
00406 outFile << itemStrMap->GetValue(curItemset[0]) << " ";
00407 else
00408 outFile << curItemset[0] << " " ;
00409
00410 for (m = 1; m <= elementSize[j]; m++)
00411 {
00412 if (isStringFile)
00413 outFile << itemStrMap->GetValue(curItemset[m]) << " ";
00414 else
00415 outFile << curItemset[m] << " " ;
00416 }
00417
00418 delete [] curItemset;
00419 }
00420 else
00421 if (isStringFile)
00422 outFile << itemStrMap->GetValue(sequentialPatterns[j][0]);
00423 else
00424 outFile << sequentialPatterns[j][0] << " " ;
00425
00426 }
00427 outFile << "- " << c << endl;
00428
00429
00430 #endif
00431
00432 }
00433
00434
00435
00436
00437
00438
00439
00440
00441
00442
00443
00444
00445
00446
00447
00448
00449 void CreateOrBitmap(
00450 SeqBitmap** f1,
00451 int* indexList,
00452 int indexLength,
00453 SeqBitmap*& orBitmap)
00454 {
00455
00456 orBitmap = new SeqBitmap(*f1[indexList[0]]);
00457 for (int i = 1; i < indexLength; i++)
00458 orBitmap->Or(*orBitmap, *f1[indexList[i]]);
00459 }
00460
00461
00462
00463
00464
00465
00466
00467
00468
00469
00470
00471
00472
00473
00474
00475
00476
00477
00478
00479
00480
00481 void Compress(
00482 SeqBitmap* refBitmap,
00483 SeqBitmap* tempAndBitmap,
00484 SeqBitmap*& returnBitmap,
00485 SeqBitmap** f1,
00486 SeqBitmap** newF1,
00487 int* indexList,
00488 int indexLength)
00489 {
00490
00491 numCompress++;
00492
00493
00494 int size4 = 0;
00495 int size8 = 0;
00496 int size16 = 0;
00497 int size32 = 0;
00498 int size64 = 0;
00499
00500
00501 int pos4;
00502 int pos8;
00503 int pos16;
00504 int pos32;
00505 int pos64;
00506
00507 int i;
00508
00509
00510
00511
00512
00513
00514
00515
00516
00517
00518
00519
00520
00521
00522
00523
00524
00525
00526
00527
00528
00529
00530
00531
00532
00533
00534
00535 refBitmap->CountSmaller(size4, size8, size16, size32, size64, summaryFile);
00536
00537
00538 for (i = 0; i < indexLength; i++)
00539 {
00540 newF1[indexList[i]] =
00541 new SeqBitmap(size4, size8, size16, size32, size64);
00542 }
00543
00544 returnBitmap = new SeqBitmap(size4, size8, size16, size32, size64);
00545
00546
00547
00548
00549
00550
00551
00552
00553
00554
00555
00556
00557
00558
00559
00560
00561
00562
00563
00564
00565
00566
00567
00568 pos4 = 0;
00569 pos8 = 0;
00570 pos16 = 0;
00571 pos32 = 0;
00572 pos64 = 0;
00573
00574
00575
00576 if (refBitmap->_bitmap4 != 0)
00577 returnBitmap->Compress4(
00578 refBitmap, tempAndBitmap,
00579 pos4);
00580
00581
00582
00583 if (refBitmap->_bitmap8 != 0)
00584 returnBitmap->Compress8(
00585 refBitmap, tempAndBitmap,
00586 pos4, pos8);
00587
00588
00589
00590 if (refBitmap->_bitmap16 != 0)
00591 returnBitmap->Compress16(
00592 refBitmap, tempAndBitmap,
00593 pos4, pos8, pos16);
00594
00595
00596
00597 if (refBitmap->_bitmap32 != 0)
00598 returnBitmap->Compress32(
00599 refBitmap,
00600 tempAndBitmap,
00601 pos4, pos8, pos16, pos32);
00602
00603
00604
00605 if (refBitmap->_bitmap64 != 0)
00606 returnBitmap->Compress64(
00607 refBitmap, tempAndBitmap,
00608 pos4, pos8, pos16, pos32, pos64);
00609
00610
00611
00612
00613
00614
00615
00616 assert(pos64 == size64*8);
00617 assert(pos32 == size32*4);
00618 assert(pos16 == size16*2);
00619 assert(pos8 == size8);
00620 assert(pos4*2 == size4);
00621
00622
00623 for (i = 0; i < indexLength; i++)
00624 {
00625
00626 pos4 = 0;
00627 pos8 = 0;
00628 pos16 = 0;
00629 pos32 = 0;
00630 pos64 = 0;
00631
00632
00633
00634 if (refBitmap->_bitmap4 != 0)
00635 newF1[indexList[i]]->Compress4(refBitmap, f1[indexList[i]],
00636 pos4);
00637
00638
00639
00640 if (refBitmap->_bitmap8 != 0)
00641 newF1[indexList[i]]->Compress8(refBitmap, f1[indexList[i]],
00642 pos4, pos8);
00643
00644
00645
00646 if (refBitmap->_bitmap16 != 0)
00647 newF1[indexList[i]]->Compress16(refBitmap, f1[indexList[i]],
00648 pos4, pos8, pos16);
00649
00650
00651
00652 if (refBitmap->_bitmap32 != 0)
00653 newF1[indexList[i]]->Compress32(refBitmap, f1[indexList[i]],
00654 pos4, pos8, pos16, pos32);
00655
00656
00657
00658 if (refBitmap->_bitmap64 != 0)
00659 newF1[indexList[i]]->Compress64(refBitmap, f1[indexList[i]],
00660 pos4, pos8, pos16, pos32, pos64);
00661
00662
00663 assert(pos64 == size64*8);
00664 assert(pos32 == size32*4);
00665 assert(pos16 == size16*2);
00666 assert(pos8 == size8);
00667 assert(pos4*2 == size4);
00668
00669 }
00670
00671 }
00672
00673
00674
00675
00676
00677
00678
00679
00680
00681 void FindSequentialPatterns(TreeNode* curNode)
00682 {
00683
00684 testCount++;
00685
00686
00687
00688
00689 #ifdef __STATS__
00690
00691 statistics->startLevelTimer();
00692 statistics->updateNumNodes(curNode->level, 1);
00693 #endif
00694
00695
00696 int* nextLevelSList;
00697 int* nextLevelIList;
00698 int nextLevelSLength;
00699 int nextLevelILength;
00700
00701
00702 TreeNode* nextNode = nodeBuff[curNode->level + 1];
00703 SeqBitmap* tempAndBitmap = new SeqBitmap(*curNode->iBitmap);
00704 SeqBitmap* sBitmap = new SeqBitmap(*curNode->iBitmap);
00705
00706
00707
00708
00709
00710 int count = 0;
00711 int i;
00712
00713 #ifdef __COMPRESSION__
00714
00715 bool prevComp = false;
00716 int j;
00717 int memSize;
00718 #endif
00719
00720
00721
00722
00723
00724 sBitmap->CreateSBitmap(*curNode->iBitmap);
00725
00726
00727 nextLevelSList = curNode->sList + curNode->sLength;
00728 nextLevelIList = curNode->iList + curNode->iLength;
00729 nextLevelSLength = 0;
00730
00731
00732 for (i = 0; i < curNode->sLength; i++)
00733 {
00734
00735
00736 tempAndBitmap->And(*sBitmap, *curNode->f1[curNode->sList[i]]);
00737 count = tempAndBitmap->Count();
00738
00739
00740
00741 if (count >= minSup)
00742 {
00743 curNode->countList[nextLevelSLength].count = count;
00744 curNode->countList[nextLevelSLength].index = curNode->sList[i];
00745 nextLevelSLength++;
00746 }
00747
00748 }
00749
00750 #ifdef __STATS__
00751 statistics->updateSExt(curNode->level, nextLevelSLength);
00752 #endif
00753
00754
00755 #ifdef ___REORDERING___
00756
00757
00758 qsort((void *) curNode->countList, (size_t)nextLevelSLength,
00759 sizeof(int) + sizeof(int), compare );
00760 #endif
00761
00762
00763 for (i = 0;i < nextLevelSLength;i++)
00764 nextLevelSList[i] = curNode->countList[i].index;
00765
00766
00767 for (i = 0;i < nextLevelSLength - 1;i++)
00768 nextLevelIList[i] = nextLevelSList[i + 1];
00769
00770
00771 nextNode->level = curNode->level + 1;
00772 nextNode->sList = nextLevelSList;
00773 nextNode->sLength = nextLevelSLength;
00774 nextNode->f1 = curNode->f1;
00775 nextNode->f1Name = curNode->f1Name;
00776 nextNode->f1Size = curNode->f1Size;
00777 nextNode->compress = false;
00778 #ifdef __COMPRESSION__
00779
00780
00781
00782
00783 if ((nextNode->sLevel % 15) == 1)
00784 {
00785 nextNode->compress = true;
00786 }
00787 #ifdef __ALLCOMPRESS__
00788 nextNode->compress = true;
00789 #endif
00790 #else
00791
00792 nextNode->compress = false;
00793 #endif
00794
00795
00796
00797 if (outputSeq)
00798 sequenceLength++;
00799
00800 #ifdef __COMPRESSION__
00801 #ifdef __STATS__
00802
00803 statistics->startCompTimer();
00804 #endif
00805
00806 SeqBitmap *orBitmap = 0;
00807 SeqBitmap *specialBitmap = 0;
00808
00809
00810 if (curNode->compress && nextLevelSLength > 0)
00811 {
00812 CreateOrBitmap(
00813 curNode->f1,
00814 nextLevelSList,
00815 nextLevelSLength,
00816 orBitmap);
00817
00818 specialBitmap = new SeqBitmap(*tempAndBitmap);
00819 }
00820
00821 #ifdef __STATS__
00822 statistics->stopCompTimer(curNode->level);
00823 #endif
00824 #endif
00825
00826
00827 for (i = 0; i < nextLevelSLength; i++)
00828 {
00829
00830 nextNode->sLevel = curNode->sLevel + 1;
00831
00832
00833 nextNode->iList = nextLevelIList + i;
00834 nextNode->iLength = nextLevelSLength - 1 - i;
00835 tempAndBitmap->And(*sBitmap, *curNode->f1[nextLevelSList[i]]);
00836
00837 if (outputSeq)
00838 count = tempAndBitmap->Count();
00839
00840 #ifdef __COMPRESSION__
00841 #ifdef __STATS__
00842
00843 statistics->startCompTimer();
00844 #endif
00845
00846
00847
00848 memSize = tempAndBitmap->memSize();
00849
00850 if ((memSize >= minCompSize) && curNode->compress)
00851 {
00852
00853 prevComp = true;
00854
00855
00856 nextNode->f1 = curNode->f1 + curNode->f1Size;
00857
00858 specialBitmap->CreateCBitmap(*tempAndBitmap);
00859 specialBitmap->And(*specialBitmap, *orBitmap);
00860
00861
00862 globalLevel = curNode->level;
00863
00864
00865
00866 #ifdef __STATS__
00867
00868 statistics->updateNumComp(curNode->level, 1);
00869 #endif
00870
00871
00872
00873
00874
00875
00876
00877
00878
00879
00880
00881 Compress(
00882 specialBitmap,
00883 tempAndBitmap,
00884 returnBitmap,
00885 curNode->f1,
00886 nextNode->f1,
00887 nextLevelSList,
00888 nextLevelSLength);
00889
00890 nextNode->iBitmap = returnBitmap;
00891 }
00892 else
00893 {
00894
00895
00896 if (prevComp)
00897 {
00898 prevComp = false;
00899 nextNode->f1 = curNode->f1;
00900 }
00901 nextNode->iBitmap = tempAndBitmap;
00902 }
00903 #ifdef __STATS__
00904 statistics->stopCompTimer(curNode->level);
00905 #endif
00906 #else
00907
00908 nextNode->iBitmap = tempAndBitmap;
00909 #endif
00910
00911
00912 if (outputSeq)
00913 {
00914 elementSize[sequenceLength] = 0;
00915 sequentialPatterns[sequenceLength][0] =
00916 curNode->f1Name[nextLevelSList[i]];
00917 LogSequence(count);
00918 }
00919
00920 #ifdef __STATS__
00921 statistics->stopLevelTimer(curNode->level);
00922 #endif
00923
00924
00925 FindSequentialPatterns(nextNode);
00926
00927 #ifdef __STATS__
00928
00929 statistics->startLevelTimer();
00930 #endif
00931
00932 #ifdef __COMPRESSION__
00933 #ifdef __STATS__
00934
00935 statistics->startCompTimer();
00936 #endif
00937
00938 if (prevComp)
00939 {
00940
00941 returnBitmap->Deallocate();
00942 delete returnBitmap;
00943
00944
00945
00946
00947 for (j = nextNode->f1Size - 1; j >= 0; j--)
00948 {
00949 if (nextNode->f1[j] != 0)
00950 {
00951 assert(nextNode->f1[j] != (SeqBitmap *)0xfdfdfdfd);
00952 nextNode->f1[j]->Deallocate();
00953 delete nextNode->f1[j];
00954 }
00955 nextNode->f1[j] = 0;
00956 }
00957
00958 for (j = 0; j < nextNode->f1Size; j++)
00959 assert(nextNode->f1[j] == 0);
00960 }
00961 #ifdef __STATS__
00962 statistics->stopCompTimer(curNode->level);
00963 #endif
00964 #endif
00965
00966 }
00967
00968 #ifdef __COMPRESSION__
00969 #ifdef __STATS__
00970 statistics->startCompTimer();
00971 #endif
00972
00973 if (curNode->compress && nextLevelSLength != 0)
00974 {
00975
00976 specialBitmap->Deallocate();
00977 orBitmap->Deallocate();
00978 delete orBitmap;
00979 delete specialBitmap;
00980 }
00981 #ifdef __STATS__
00982 statistics->stopCompTimer(curNode->level);
00983 #endif
00984 #endif
00985
00986
00987 if (outputSeq)
00988 sequenceLength--;
00989
00990
00991
00992
00993 nextLevelIList = curNode->iList + curNode->iLength;
00994 nextLevelILength = 0;
00995
00996
00997 for (i = 0; i < curNode->iLength; i++)
00998 {
00999
01000
01001 nextNode->sLevel = curNode->sLevel;
01002
01003 tempAndBitmap->And(*curNode->iBitmap, *curNode->f1[curNode->iList[i]]);
01004 count = tempAndBitmap->Count();
01005
01006
01007
01008 if (count >= minSup)
01009 {
01010 curNode->countList[nextLevelILength].count = count;
01011 curNode->countList[nextLevelILength].index = curNode->iList[i];
01012 nextLevelILength++;
01013 }
01014 }
01015
01016 #ifdef __STATS__
01017 statistics->updateIExt(curNode->level, nextLevelILength);
01018 #endif
01019 #ifdef ___REORDERING___
01020
01021 qsort( (void *) curNode->countList, (size_t)nextLevelILength,
01022 sizeof(int) + sizeof(int), compare );
01023 #endif
01024
01025
01026 for (i = 0;i < nextLevelILength;i++)
01027 nextLevelIList[i] = curNode->countList[i].index;
01028
01029
01030
01031
01032 if (outputSeq)
01033 elementSize[sequenceLength]++;
01034
01035
01036 for (i = 0; i < nextLevelILength; i++)
01037 {
01038 nextNode->iList = nextLevelIList + i + 1;
01039 nextNode->iLength = nextLevelILength - i - 1;
01040
01041 tempAndBitmap->And(*curNode->iBitmap, *curNode->f1[nextLevelIList[i]]);
01042 if (outputSeq)
01043 count = tempAndBitmap->Count();
01044
01045
01046
01047 #ifdef __COMPRESSION__
01048
01049 memSize = tempAndBitmap->memSize();
01050
01051 if ((memSize >= minCompSize) && curNode->compress && false)
01052 {
01053 prevComp = true;
01054
01055
01056 nextNode->f1 = curNode->f1 + curNode->f1Size;
01057
01058 int j;
01059
01060 for (j = 0; j < nextNode->f1Size; j++)
01061 indexExists[j] = false;
01062
01063
01064 for (j = 0; j < nextLevelSLength; j++)
01065 {
01066 tempIndexList[j] = nextLevelSList[j];
01067 indexExists[nextLevelSList[j]] = true;
01068 }
01069 int indexLength = nextLevelSLength;
01070
01071
01072
01073 for (j = 0; j < nextNode->iLength; j++)
01074 {
01075 if (!indexExists[nextNode->iList[j]])
01076 {
01077 tempIndexList[indexLength] = nextNode->iList[j];
01078 indexLength++;
01079 }
01080 }
01081
01082
01083
01084 nextNode->iBitmap = tempAndBitmap;
01085 }
01086 else
01087 {
01088
01089 if (prevComp)
01090 {
01091 prevComp = false;
01092 nextNode->f1 = curNode->f1;
01093 }
01094 nextNode->iBitmap = tempAndBitmap;
01095
01096 }
01097 #else
01098 nextNode->iBitmap = tempAndBitmap;
01099 #endif
01100
01101
01102
01103 if (outputSeq)
01104 {
01105 sequentialPatterns[sequenceLength][elementSize[sequenceLength]] =
01106 curNode->f1Name[nextLevelIList[i]];
01107 LogSequence(count);
01108 }
01109
01110 #ifdef __STATS__
01111 statistics->stopLevelTimer(curNode->level);
01112 #endif
01113
01114
01115 FindSequentialPatterns(nextNode);
01116 #ifdef __STATS__
01117
01118 statistics->startLevelTimer();
01119 #endif
01120
01121 #ifdef __COMPRESSION__
01122
01123 if (prevComp)
01124 {
01125 delete returnBitmap;
01126 for (j = 0; j < nextNode->f1Size; j++)
01127 {
01128 if (nextNode->f1[j] != 0)
01129 delete nextNode->f1[j];
01130 nextNode->f1[j] = 0;
01131 }
01132 }
01133 #endif
01134
01135 }
01136
01137
01138 if (outputSeq)
01139 elementSize[sequenceLength]--;
01140
01141
01142
01143
01144 sBitmap->Deallocate();
01145 tempAndBitmap->Deallocate();
01146 delete tempAndBitmap;
01147 delete sBitmap;
01148
01149 #ifdef __STATS__
01150
01151 statistics->stopLevelTimer(curNode->level);
01152 #endif
01153
01154 }
01155
01156
01157
01158
01159
01160
01161
01162
01163 void StartMining(DatasetInfo* info)
01164 {
01165 #ifdef __STATS__
01166 statistics->startLevelTimer();
01167 statistics->updateNumNodes(0, 1);
01168 statistics->updateSExt(0, info->f1Size);
01169 #endif
01170
01171 int i;
01172 int count;
01173
01174 TreeNode* curNode = nodeBuff[0];
01175 curNode->f1 = info->f1Buff;
01176 curNode->f1Name = info->f1NameBuff;
01177 curNode->f1Size = info->f1Size;
01178 curNode->sList = info->sListBuff;
01179 curNode->sLength = info->f1Size;
01180 curNode->countList = info->countBuff;
01181 curNode->level = 0;
01182 curNode->sLevel = 0;
01183 curNode->compress = false;
01184
01185
01186 for (i = 0; i < info->f1Size; i++)
01187 {
01188 count = info->f1Buff[i]->Count();
01189 curNode->countList[i].count = count;
01190 curNode->countList[i].index = i;
01191
01192 }
01193
01194 #ifdef ___REORDERING___
01195
01196 qsort( (void *) curNode->countList, (size_t)info->f1Size,
01197 sizeof(int) + sizeof(int), compare );
01198 #endif
01199
01200 for (i = 0; i < info->f1Size; i++)
01201 {
01202 info->sListBuff[i] = curNode->countList[i].index;
01203 info->iListBuff[i] = curNode->countList[i].index;
01204 }
01205
01206
01207
01208 for (i = 0; i < info->f1Size; i++)
01209 {
01210
01211
01212 if (outputSeq)
01213 {
01214 count = info->f1Buff[info->sListBuff[i]]->Count();
01215 elementSize[sequenceLength] = 0;
01216 sequentialPatterns[sequenceLength][0] =
01217 info->f1NameBuff[info->sListBuff[i]];
01218 LogSequence(count);
01219 }
01220
01221 curNode->iList = info->iListBuff + i + 1;
01222 curNode->iLength = info->f1Size - i - 1;
01223 curNode->iBitmap = info->f1Buff[info->sListBuff[i]];
01224
01225 #ifdef __STATS__
01226
01227 statistics->stopLevelTimer(0);
01228 #endif
01229
01230 FindSequentialPatterns(curNode);
01231
01232 #ifdef __STATS__
01233
01234 statistics->startLevelTimer();
01235 #endif
01236
01237 }
01238
01239 #ifdef __STATS__
01240 statistics->stopLevelTimer(0);
01241 #endif
01242
01243 }
01244
01245
01246 void PrintError()
01247 {
01248 cerr << "\nUsage: spam -sup <minSup> [-fn <infile>] [-stdin] [-ascii] [-str]\n";
01249 cerr << " [-outFile <outfile>] [-stdout]\n\n";
01250
01251
01252
01253 cerr << " minSup - The minimum support (between 0.0 and 1.0)\n";
01254 cerr << " infile - The data file to read in (see below for specifications)\n";
01255 cerr << " stdin - Use this flag if the data should be read in from stdin.\n";
01256 cerr << " Must use if -fn is not specified.\n";
01257 cerr << " Overrides any file specified via -fn\n";
01258 cerr << " ascii - Use this flag if your input is ASCII text;\n";
01259 cerr << " otherwise SPAM assumes it is in a binary file format\n";
01260 cerr << " str - Use this flag if your input is a list of strings\n";
01261 cerr << " representing customers, transactions, and items\n";
01262 cerr << " (see documentation for full file-format description)\n";
01263 cerr << " outfile - The file to place the output in. If -outFile and -stdout\n";
01264 cerr << " are not present, no output will be produced\n";
01265 cerr << " stdout - Use this flag if you want the output to go to stdout.\n";
01266 cerr << " Overrides any file specified via -outFile\n\n";
01267 cerr << "See documentation for examples of how to use SPAM with various data formats.\n";
01268
01269
01270
01271
01272
01273
01274
01275
01276
01277
01278
01279
01280
01281 exit(0);
01282 }
01283
01284
01285 int main(int argc, char **argv)
01286 {
01287
01288
01289 clock_t programStart = clock();
01290
01291
01292 bool isBinaryFile;
01293 double minSupPercent = 0;
01294 char* inFilename = 0;
01295 char outFilename[50];
01296 bool fnSpecified = false;
01297 bool supSpecified = false;
01298 bool stdinSpecified = false;
01299 outputSeq = false;
01300
01301
01302 clock_t programEnd;
01303 clock_t miningStart, miningEnd;
01304 double programDuration;
01305 double miningDuration;
01306
01307 DatasetInfo* info;
01308
01309
01310
01311
01312
01313
01314 strcpy(outFilename, "output.txt");
01315 emptySpaceRatio = 0.8;
01316 minCompSize = 30;
01317 compLevel = 1;
01318 isBinaryFile = true;
01319 isStringFile = false;
01320
01321
01322 int i = 1;
01323 while (i < argc)
01324 {
01325
01326 if (strcmp(argv[i], "-fn") == 0)
01327 {
01328
01329
01330 i++;
01331 if (i == argc)
01332 PrintError();
01333 else
01334 {
01335 inFilename = argv[i];
01336 fnSpecified = true;
01337 }
01338
01339 }
01340 else if (strcmp(argv[i], "-sup") == 0 )
01341 {
01342
01343
01344 i++;
01345 if (i == argc)
01346 PrintError();
01347 else
01348 {
01349 minSupPercent = atof(argv[i]);
01350 supSpecified = true;
01351 }
01352
01353 }
01354 else if (strcmp(argv[i], "-emptRatio") == 0 )
01355 {
01356
01357
01358 i++;
01359 if (i == argc)
01360 PrintError();
01361 else
01362 emptySpaceRatio = atof(argv[i]);
01363
01364 }
01365 else if (strcmp(argv[i], "-minSize") == 0)
01366 {
01367
01368
01369 i++;
01370 if (i == argc)
01371 PrintError();
01372 else
01373 minCompSize = atoi(argv[i]);
01374
01375 }
01376 else if (strcmp(argv[i], "-compLevel") == 0)
01377 {
01378
01379
01380 i++;
01381 if (i == argc)
01382 PrintError();
01383 else
01384 compLevel = atoi(argv[i]);
01385
01386 }
01387 else if (strcmp(argv[i], "-outFile") == 0)
01388 {
01389
01390
01391 i++;
01392 if (i == argc)
01393 PrintError();
01394 else
01395 {
01396 strcpy(outFilename, argv[i]);
01397 outputSeq = true;
01398 }
01399
01400 }
01401 else if (strcmp(argv[i], "-ascii") == 0)
01402 {
01403
01404
01405 isBinaryFile = false;
01406
01407 }
01408 else if (strcmp(argv[i], "-str") == 0)
01409 {
01410 isBinaryFile = false;
01411 isStringFile = true;
01412 }
01413 else if (strcmp(argv[i], "-stdin") == 0)
01414 {
01415 stdinSpecified = true;
01416 }
01417 else if (strcmp(argv[i], "-stdout") == 0)
01418 {
01419 stdoutSpecified = true;
01420 outputSeq = true;
01421 }
01422 else
01423 {
01424 PrintError();
01425 }
01426
01427 i++;
01428 }
01429
01430 if (!supSpecified)
01431 {
01432 cout << "Must specify minimum support" << endl;
01433 PrintError();
01434 }
01435
01436
01437 if (!fnSpecified || stdinSpecified)
01438 inFilename = 0;
01439
01440 if (!fnSpecified && !stdinSpecified)
01441 {
01442
01443
01444 cout << "No input data source selected; use either -stdin or -fn" << endl;
01445 PrintError();
01446 }
01447
01448 if (stdinSpecified && isBinaryFile)
01449 {
01450 cout << "Cannot use binary files with -stdin; use -ascii or -str" << endl;
01451 PrintError();
01452 }
01453
01454 bool suppressOutput = false;
01455
01456
01457 if (stdinSpecified || stdoutSpecified)
01458 suppressOutput = true;
01459
01460 if (isBinaryFile == true)
01461 {
01462 if (!suppressOutput)
01463 {
01464 cout << "Input file will be interpreted as binary, ";
01465 cout << "if SPAM crashes use the -ascii flag\n";
01466 }
01467 }
01468 else
01469 {
01470 if (!suppressOutput)
01471 cout << "Input file will be interpreted as ASCII...\n";
01472 }
01473
01474
01475 if (!suppressOutput)
01476 cout << "Reading Data... " << endl;
01477
01478 info = ReadDataset(isBinaryFile,
01479 isStringFile,
01480 inFilename,
01481 minSupPercent,
01482 custStrMap,
01483 transStrMap,
01484 itemStrMap);
01485 if (info == 0)
01486 {
01487 cout << "Error reading dataset" << endl;
01488 exit(1);
01489 }
01490
01491
01492 if (outputSeq && !stdoutSpecified)
01493 outFile.open(outFilename);
01494 summaryFile.open("summary.txt");
01495
01496 if (info->f1Size > 0)
01497 {
01498
01499
01500 minSup = info->minSup;
01501 totalCust = info->custCount;
01502
01503
01504 SeqBitmap::Init();
01505
01506
01507
01508
01509
01510
01511
01512 int bufSize = info->f1Size * info->maxCustTrans + 1;
01513 nodeBuff = new TreeNode * [bufSize];
01514
01515
01516
01517
01518
01519 for (i = 0; i < bufSize; i++)
01520 {
01521 nodeBuff[i] = new TreeNode();
01522
01523
01524
01525
01526
01527
01528
01529 }
01530
01531
01532
01533
01534 tempIndexList = new int[info->f1Size * 2];
01535 indexExists = new bool[info->f1Size];
01536
01537
01538
01539 if (outputSeq)
01540 {
01541 sequentialPatterns = new int * [info->maxCustTrans];
01542 elementSize = new int[info->maxCustTrans];
01543 for (i = 0; i < info->maxCustTrans; i++)
01544 {
01545 sequentialPatterns[i] = new int[info->f1Size];
01546 elementSize[i] = 0;
01547 }
01548 sequenceLength = 0;
01549 }
01550
01551
01552 #ifdef __STATS__
01553 statistics = new Stats(bufSize);
01554 #endif
01555
01556
01557 if (!suppressOutput)
01558 cout << "Running the algorithm..." << endl;
01559 miningStart = clock();
01560 StartMining(info);
01561 miningEnd = clock();
01562 if (!suppressOutput)
01563 cout << "Done." << endl;
01564
01565 #ifdef __STATS__
01566
01567 statistics->printStats(summaryFile);
01568 #endif
01569
01570 #ifdef __STATS__
01571
01572 delete statistics;
01573 #endif
01574
01575
01576 for (i = 0; i < bufSize; i++)
01577 {
01578 delete nodeBuff[i];
01579
01580
01581 }
01582
01583 delete [] nodeBuff;
01584
01585
01586 delete [] tempIndexList;
01587 delete [] indexExists;
01588
01589 delete custStrMap;
01590 delete transStrMap;
01591 delete itemStrMap;
01592
01593
01594 if (outputSeq)
01595 {
01596
01597 for (i = 0; i < info->maxCustTrans; i++)
01598 delete[] sequentialPatterns[i];
01599 delete[] sequentialPatterns;
01600 delete[] elementSize;
01601 }
01602
01603 SeqBitmap::Destroy();
01604
01605 }
01606 else
01607 {
01608
01609 miningStart = 0;
01610 miningEnd = 0;
01611
01612 }
01613
01614
01615 miningDuration = (miningEnd - miningStart) / (double)CLOCKS_PER_SEC;
01616
01617 programEnd = clock();
01618 programDuration = (programEnd - programStart) / (double)CLOCKS_PER_SEC;
01619
01620 summaryFile << "Number of customer: " << info->custCount << endl;
01621 summaryFile << "Minimum support: " << minSup << " ( " << minSupPercent
01622 << " )" << endl;
01623 summaryFile << "Mining Duration:" << miningDuration << endl;
01624 summaryFile << "Program Duration: " << programDuration << endl << endl;
01625
01626 summaryFile << "Number of Compression: " << numCompress << endl;
01627 summaryFile << "Number of Ors: " << SeqBitmap::_countOr << endl;
01628 summaryFile << "Number of Ands: " << SeqBitmap::_countAnd << endl;
01629 summaryFile << "Number of Count: " << SeqBitmap::_countCount << endl;
01630 summaryFile << "Number of CountZeros: " <<
01631 SeqBitmap::_countCountZeros << endl;
01632 summaryFile << "Number of CountSmaller: " <<
01633 SeqBitmap::_countCountSmaller << endl;
01634 summaryFile << "Number of CreateSBitmap: " <<
01635 SeqBitmap::_countCreateSBitmap << endl;
01636 summaryFile << "Number of CreateCBitmaps: " <<
01637 SeqBitmap::_countCreateCBitmap << endl;
01638
01639
01640 if (outputSeq)
01641 outFile.close();
01642 summaryFile.close();
01643
01644
01645 for (i = info->f1Size - 1; i >= 0; i--)
01646 info->f1Buff[i]->Deallocate();
01647
01648 delete info;
01649
01650
01651 SeqBitmap::MemDealloc();
01652 return 0;
01653 }
01654
01655
01656
01657
01658
01659
01660
01661
01662
01663
01664
01665
01666
01667
01668
01669
01670
01671
01672
01673
01674
01675
01676
01677
01678
01679
01680
01681
01682
01683
01684
01685
01686
01687
01688
01689
01690
01691
01692
01693
01694
01695
01696
01697
01698
01699
01700
01701
01702
01703
01704
01705
01706
01707
01708
01709
01710
01711
01712
01713
01714
01715
01716
01717
01718
01719
01720
01721
01722
01723
01724
01725
01726
01727
01728
01729
01730
01731
01732
01733
01734
01735
01736
01737
01738
01739
01740
01741
01742
01743
01744
01745
01746
01747
01748
01749
01750
01751
01752
01753
01754
01755
01756
01757
01758
01759
01760
01761
01762
01763
01764
01765
01766
01767
01768
01769
01770
01771
01772
01773
01774
01775
01776
01777
01778
01779
01780
01781
01782
01783
01784
01785
01786
01787
01788
01789
01790
01791
01792
01793
01794
01795
01796
01797
01798
01799
01800
01801
01802
01803
01804
01805
01806
01807
01808
01809
01810
01811
01812
01813
01814
01815
01816
01817
01818
01819
01820
01821
01822
01823
01824
01825
01826
01827
01828
01829
01830
01831
01832
01833
01834
01835
01836
01837