Main Page | Modules | Namespace List | Data Structures | File List | Data Fields | Globals

Spam.cpp

Go to the documentation of this file.
00001 /*
00002  
00003 Copyright (c) 2004, Cornell University
00004 All rights reserved.
00005  
00006 Redistribution and use in source and binary forms, with or without
00007 modification, are permitted provided that the following conditions are met:
00008  
00009    - Redistributions of source code must retain the above copyright notice,
00010        this list of conditions and the following disclaimer.
00011    - Redistributions in binary form must reproduce the above copyright
00012        notice, this list of conditions and the following disclaimer in the
00013        documentation and/or other materials provided with the distribution.
00014    - Neither the name of Cornell University nor the names of its
00015        contributors may be used to endorse or promote products derived from
00016        this software without specific prior written permission.
00017  
00018 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
00019 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00020 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
00021 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
00022 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
00023 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
00024 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
00025 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
00026 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
00027 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
00028 THE POSSIBILITY OF SUCH DAMAGE.
00029  
00030 */
00031 
00032 
00033 /////////////////////////////////////////////////////////////////////
00034 ///
00035 /// Spam.cpp
00036 ///
00037 /////////////////////////////////////////////////////////////////////
00038 
00039 // Toggle whether the output file format should be the same as that of
00040 // PrefixSpan (for Perlscript regression test purposes)
00041 //#define ___PREFIXSPAN___
00042 
00043 // Toggle whether at each step, the frequent i-step and s-step extensions
00044 // should be traversed in order of decreasing support
00045 //#define ___REORDERING___
00046 
00047 // Toggle whether bitmap compression will be used. If not, all bitmaps
00048 // encapsulated within SeqBitmap will be Bitmap32s
00049 //#define __COMPRESSION__
00050 
00051 // Toggle whether various debugging statistics will be included with the
00052 // output file
00053 //#define __STATS__
00054 
00055 // Toggle whether or not to compress the bitmaps at every single level
00056 // (when compression is turned on)
00057 //#define __ALLCOMPRESS__
00058 
00059 
00060 #include "StringMap.h"
00061 #include "SeqBitmap.h"
00062 #include "TreeNode.h"
00063 #include "DatasetInfo.h"
00064 #include "Stats.h"
00065 #include <math.h>
00066 #include <stdlib.h>
00067 #include <stdio.h>
00068 #include <string.h>
00069 #include <iostream>
00070 
00071 using namespace std;
00072 
00073 /// @defgroup GlobalVariables Global Variables
00074 /// Global variables
00075 /** @{ */
00076 
00077 // Support count of the current node
00078 CountInfo* TreeNode::countList = 0;
00079 
00080 ofstream summaryFile;
00081 
00082 // Variables for tree nodes
00083 int minSup;             ///< min sup as a transaction count
00084 TreeNode** nodeBuff;    ///< node buffer
00085 int* tempIndexList;     ///< stored the result of combining i-list and s-list
00086 bool* indexExists;      ///< for combining i-list and s-list
00087 
00088 // Variables for printing output
00089 ofstream outFile;
00090 int **sequentialPatterns;
00091 int *elementSize;
00092 int sequenceLength;
00093 bool outputSeq;
00094 bool stdoutSpecified;
00095 
00096 StringMap *custStrMap;// StringMap objects that map the IDs into their string names
00097 StringMap *transStrMap;
00098 StringMap *itemStrMap;
00099 bool isStringFile;      // Whether or not we should print the output as strings (because the input was strings)
00100 
00101 
00102 // Misc global variables
00103 
00104 /// no compression will be done if the bitmap size (_sizeShort)
00105 /// is small than this
00106 int minCompSize;
00107 
00108 /// the ratio at which we compress the bitmaps
00109 double emptySpaceRatio;
00110 
00111 /// Used to test whether compression should be done at every level
00112 int compLevel;
00113 
00114 int totalCust;
00115 int testCount = 0;
00116 int numCompress = 0;
00117 
00118 // Jay: Temporary global for compression debugging
00119 int globalLevel;
00120 #ifdef __STATS__
00121 Stats *statistics;
00122 #endif
00123 /** @} */
00124 
00125 
00126 /// @defgroup GlobalFunctions Global Functions
00127 /// Global functions
00128 /** @{ */
00129 
00130 // function for reading input, refer to FileInput.cpp for details
00131 DatasetInfo* ReadDataset(
00132     bool isBinaryFile,
00133     bool isStringFile,
00134     char *filename,
00135     double minSupPercent,
00136     StringMap *& custStrMap,
00137     StringMap *& transStrMap,
00138     StringMap *& itemStrMap);
00139 
00140 int compare( const void *arg1, const void *arg2 )
00141 {
00142     // Compare all of both strings:
00143     return ( ((CountInfo* ) arg1)->count - (( CountInfo* ) arg2)->count );
00144 }
00145 
00146 void LogStdoutSequence(const int c);
00147 void LogFileSequence(const int c);
00148 
00149 void LogSequence(const int c)
00150 {
00151     if (stdoutSpecified)
00152         LogStdoutSequence(c);
00153     else
00154         LogFileSequence(c);
00155 }
00156 
00157 
00158 void LogStdoutSequence(const int c)
00159 {
00160     // Output the sequence to the outFile
00161 
00162 #ifdef ___PREFIXSPAN___
00163 
00164     char buf[256];
00165     // compare with prefix span, need to output the sequence in the same
00166     // way that PrefixSpan does
00167     for (int j = 0; j <= sequenceLength; j++)
00168     {
00169 
00170         cout << "(";
00171 
00172         //  assert(sequentialPatterns[j][0] < indexLength);
00173 
00174         //  cout << indexList[sequentialPatterns[j][0]];
00175         //  for (int m = 1; m <= elementSize[j]; m++) {
00176         //   assert(sequentialPatterns[j][m] < indexLength);
00177         //   cout << " " << indexList[sequentialPatterns[j][m]];
00178 
00179         if (isStringFile)
00180             cout << itemStrMap->GetValue(sequentialPatterns[j][0]);
00181         else
00182             cout << sequentialPatterns[j][0];
00183 
00184         for (int m = 1; m <= elementSize[j]; m++)
00185         {
00186             if (isStringFile)
00187                 cout << " " << itemStrMap->GetValue(sequentialPatterns[j][m]);
00188             else
00189                 cout << " " << sequentialPatterns[j][m];
00190         }
00191         cout << ") ";
00192     }
00193 
00194     sprintf(buf, "%.6f", (double)c / (double)totalCust);
00195     cout << ": " << buf << endl;
00196 
00197 #else
00198 
00199     int m;
00200     // compare with spade, output the data in the same format that Spade does
00201 
00202     int* curItemset;
00203     int temp;
00204 
00205     if (elementSize[0] >= 1)
00206     {
00207 
00208         curItemset = new int[elementSize[0] + 1];
00209 
00210         for (m = 0; m <= elementSize[0]; m++)
00211             curItemset[m] = sequentialPatterns[0][m];
00212 
00213 
00214         for (int kk = 0; kk <= elementSize[0]; kk++)
00215         {
00216             for (int jj = kk + 1; jj <= elementSize[0]; jj++)
00217             {
00218                 if (curItemset[kk] > curItemset[jj])
00219                 {
00220                     temp = curItemset[kk];
00221                     curItemset[kk] = curItemset[jj];
00222                     curItemset[jj] = temp;
00223                 }
00224             }
00225         }
00226 
00227         for (m = 0; m <= elementSize[0]; m++)
00228         {
00229             if (isStringFile)
00230                 cout << itemStrMap->GetValue(curItemset[m]) << " ";
00231             else
00232                 cout << curItemset[m] << " " ;
00233         }
00234 
00235         delete [] curItemset;
00236 
00237     }
00238     else
00239         if (isStringFile)
00240             cout << itemStrMap->GetValue(sequentialPatterns[0][0]) << " ";
00241         else
00242             cout << sequentialPatterns[0][0] << " " ;
00243 
00244 
00245     for (int j = 1; j <= sequenceLength; j++)
00246     {
00247         cout << "-1 ";
00248 
00249 
00250         if (elementSize[j] >= 1)
00251         {
00252             curItemset = new int[elementSize[j] + 1];
00253 
00254             for (m = 0; m <= elementSize[j]; m++)
00255                 curItemset[m] = sequentialPatterns[j][m];
00256 
00257             for (int kk = 0; kk <= elementSize[j]; kk++)
00258                 for (int jj = kk + 1; jj <= elementSize[j]; jj++)
00259                     if (curItemset[kk] > curItemset[jj])
00260                     {
00261                         temp = curItemset[kk];
00262                         curItemset[kk] = curItemset[jj];
00263                         curItemset[jj] = temp;
00264                     }
00265 
00266             if (isStringFile)
00267                 cout << itemStrMap->GetValue(curItemset[0]) << " ";
00268             else
00269                 cout << curItemset[0] << " " ;
00270 
00271             for (m = 1; m <= elementSize[j]; m++)
00272             {
00273                 if (isStringFile)
00274                     cout << itemStrMap->GetValue(curItemset[m]) << " ";
00275                 else
00276                     cout << curItemset[m] << " " ;
00277             }
00278 
00279             delete [] curItemset;
00280         }
00281         else
00282             if (isStringFile)
00283                 cout << itemStrMap->GetValue(sequentialPatterns[j][0]);
00284             else
00285                 cout << sequentialPatterns[j][0] << " " ;
00286 
00287     }
00288     cout << "- " << c << endl;
00289 
00290 
00291 #endif
00292 
00293 }
00294 
00295 
00296 
00297 void LogFileSequence(const int c)
00298 {
00299     // Output the sequence to the outFile
00300 
00301 #ifdef ___PREFIXSPAN___
00302 
00303     char buf[256];
00304     // compare with prefix span, need to output the sequence in the same
00305     // way that PrefixSpan does
00306     for (int j = 0; j <= sequenceLength; j++)
00307     {
00308 
00309         outFile << "(";
00310 
00311         //  assert(sequentialPatterns[j][0] < indexLength);
00312 
00313         //  outFile << indexList[sequentialPatterns[j][0]];
00314         //  for (int m = 1; m <= elementSize[j]; m++) {
00315         //   assert(sequentialPatterns[j][m] < indexLength);
00316         //   outFile << " " << indexList[sequentialPatterns[j][m]];
00317 
00318         if (isStringFile)
00319             outFile << itemStrMap->GetValue(sequentialPatterns[j][0]);
00320         else
00321             outFile << sequentialPatterns[j][0];
00322 
00323         for (int m = 1; m <= elementSize[j]; m++)
00324         {
00325             if (isStringFile)
00326                 outFile << " " << itemStrMap->GetValue(sequentialPatterns[j][m]);
00327             else
00328                 outFile << " " << sequentialPatterns[j][m];
00329         }
00330         outFile << ") ";
00331     }
00332 
00333     sprintf(buf, "%.6f", (double)c / (double)totalCust);
00334     outFile << ": " << buf << endl;
00335 
00336 #else
00337 
00338     int m;
00339     // compare with spade, output the data in the same format that Spade does
00340 
00341     int* curItemset;
00342     int temp;
00343 
00344     if (elementSize[0] >= 1)
00345     {
00346 
00347         curItemset = new int[elementSize[0] + 1];
00348 
00349         for (m = 0; m <= elementSize[0]; m++)
00350             curItemset[m] = sequentialPatterns[0][m];
00351 
00352 
00353         for (int kk = 0; kk <= elementSize[0]; kk++)
00354         {
00355             for (int jj = kk + 1; jj <= elementSize[0]; jj++)
00356             {
00357                 if (curItemset[kk] > curItemset[jj])
00358                 {
00359                     temp = curItemset[kk];
00360                     curItemset[kk] = curItemset[jj];
00361                     curItemset[jj] = temp;
00362                 }
00363             }
00364         }
00365 
00366         for (m = 0; m <= elementSize[0]; m++)
00367         {
00368             if (isStringFile)
00369                 outFile << itemStrMap->GetValue(curItemset[m]) << " ";
00370             else
00371                 outFile << curItemset[m] << " " ;
00372         }
00373 
00374         delete [] curItemset;
00375 
00376     }
00377     else
00378         if (isStringFile)
00379             outFile << itemStrMap->GetValue(sequentialPatterns[0][0]) << " ";
00380         else
00381             outFile << sequentialPatterns[0][0] << " " ;
00382 
00383 
00384     for (int j = 1; j <= sequenceLength; j++)
00385     {
00386         outFile << "-1 ";
00387 
00388 
00389         if (elementSize[j] >= 1)
00390         {
00391             curItemset = new int[elementSize[j] + 1];
00392 
00393             for (m = 0; m <= elementSize[j]; m++)
00394                 curItemset[m] = sequentialPatterns[j][m];
00395 
00396             for (int kk = 0; kk <= elementSize[j]; kk++)
00397                 for (int jj = kk + 1; jj <= elementSize[j]; jj++)
00398                     if (curItemset[kk] > curItemset[jj])
00399                     {
00400                         temp = curItemset[kk];
00401                         curItemset[kk] = curItemset[jj];
00402                         curItemset[jj] = temp;
00403                     }
00404 
00405             if (isStringFile)
00406                 outFile << itemStrMap->GetValue(curItemset[0]) << " ";
00407             else
00408                 outFile << curItemset[0] << " " ;
00409 
00410             for (m = 1; m <= elementSize[j]; m++)
00411             {
00412                 if (isStringFile)
00413                     outFile << itemStrMap->GetValue(curItemset[m]) << " ";
00414                 else
00415                     outFile << curItemset[m] << " " ;
00416             }
00417 
00418             delete [] curItemset;
00419         }
00420         else
00421             if (isStringFile)
00422                 outFile << itemStrMap->GetValue(sequentialPatterns[j][0]);
00423             else
00424                 outFile << sequentialPatterns[j][0] << " " ;
00425 
00426     }
00427     outFile << "- " << c << endl;
00428 
00429 
00430 #endif
00431 
00432 }
00433 
00434 
00435 
00436 ///////////////////////////////////////////////////////
00437 /// OR's all of the frequent-1 itemset bitmaps together
00438 /// This is used to create the refBitmap for bitmap compression
00439 ///
00440 /// @param f1                   The frequent-1 itemset bitmaps
00441 /// @param indexList            The list of item names for the 1 itemsets
00442 ///                                 that are still frequent
00443 /// @param indexLength          The length of indexList
00444 ///
00445 /// @param orBitmap             [output]The OR'd bitmap. This tells us which
00446 ///                                 bits can be used in operations further
00447 ///                                 down this branch of the sequence tree
00448 ///////////////////////////////////////////////////////
00449 void CreateOrBitmap(
00450     SeqBitmap** f1,
00451     int* indexList,
00452     int indexLength,
00453     SeqBitmap*& orBitmap)
00454 {
00455 
00456     orBitmap = new SeqBitmap(*f1[indexList[0]]);
00457     for (int i = 1; i < indexLength; i++)
00458         orBitmap->Or(*orBitmap, *f1[indexList[i]]);
00459 }
00460 
00461 ///////////////////////////////////////////////////////
00462 /// Perform compression on a sequence bitmap. Compression converts as many
00463 /// Bitmap64 bits as possible to Bitmap32 bits, Bitmap32 to Bitmap16, and so
00464 /// on, with the goal of speeding up support counting. We can only get rid
00465 /// of a given bit in a bitmap if it will never be used further on down the
00466 /// tree. The refBitmap parameter should contain a 0 for every bit that
00467 /// is never again used, and a 1 for a bit that is used again. Note that
00468 /// the frequent-1 itemset bitmaps are also compressed so that their bits
00469 /// are still aligned with the sequence bitmap after compression.
00470 ///
00471 /// @param refBitmap            Bitmap specifying which bits are compressible
00472 /// @param tempAndBitmap        The sequence bitmap to compress
00473 /// @param returnBitmap         The compressed sequence bitmap
00474 /// @param f1                   Frequent-1 itemset bitmaps prior to compression
00475 /// @param newF1                The compressed versions of the frequent-1
00476 ///                                 itemset bitmaps
00477 /// @param indexList            The list of item names for the 1 itemsets that
00478 ///                                 are still frequent
00479 /// @param indexLength          The length of indexList
00480 ///////////////////////////////////////////////////////
00481 void Compress(
00482     SeqBitmap* refBitmap,
00483     SeqBitmap* tempAndBitmap,
00484     SeqBitmap*& returnBitmap,
00485     SeqBitmap** f1,
00486     SeqBitmap** newF1,
00487     int* indexList,
00488     int indexLength)
00489 {
00490 
00491     numCompress++;
00492 
00493     // Allocation sizes
00494     int size4 = 0;
00495     int size8 = 0;
00496     int size16 = 0;
00497     int size32 = 0;
00498     int size64 = 0;
00499 
00500     // Bitmap position pointers
00501     int pos4;
00502     int pos8;
00503     int pos16;
00504     int pos32;
00505     int pos64;
00506 
00507     int i;
00508 
00509     //testFile << " Sepcial Bitmap " << endl;
00510     //specialBitmap->PrintBitmap(testFile);
00511 
00512     /*    SeqBitmap *refBitmap = new SeqBitmap(*f1[indexList[0]]);
00513      for (i=1; i < indexLength; i++)
00514       refBitmap->Or(*refBitmap, *f1[indexList[i]]);
00515 
00516     */
00517     //testFile << " Reference Bitmap " << endl;
00518     //refBitmap->PrintBitmap(testFile);
00519 
00520     /* SeqBitmap* specialBitmap = new SeqBitmap(*tempAndBitmap);
00521      specialBitmap->CreateSpecialBitmap(*tempAndBitmap);
00522      refBitmap->And(*refBitmap, *specialBitmap);
00523     */
00524 
00525     /* SeqBitmap* refBitmap = new SeqBitmap(*tempAndBitmap);
00526      refBitmap->CreateSpecialBitmap(*tempAndBitmap);
00527      refBitmap->And(*refBitmap, *orBitmap);
00528     */
00529     //testFile << " Reference Bitmap (after and)" << endl;
00530     //refBitmap->PrintBitmap(testFile);
00531 
00532 
00533 
00534     // Find the allocation sizes
00535     refBitmap->CountSmaller(size4, size8, size16, size32, size64, summaryFile);
00536 
00537     // Allocate memory for the compressed bitmaps
00538     for (i = 0; i < indexLength; i++)
00539     {
00540         newF1[indexList[i]] =
00541             new SeqBitmap(size4, size8, size16, size32, size64);
00542     }
00543 
00544     returnBitmap = new SeqBitmap(size4, size8, size16, size32, size64);
00545 
00546     // summaryFile <<
00547     // "SPAM::COMPRESS: Outputting old sizes, new sizes, and the current level"
00548     // << endl;
00549 
00550     // ofstream debugFile;
00551     // debugFile.open("testOutput.txt");
00552     // debugFile << "tempANdBitmap"<<endl;
00553     // tempAndBitmap->PrintBitmap(debugFile);
00554     // debugFile << "refBitmap"<<endl;
00555     // refBitmap->PrintBitmap(debugFile);
00556 
00557 
00558 
00559     // refBitmap->printSizes(summaryFile);
00560     // summaryFile <<
00561     // size4 << " " <<
00562     // size8 << " " <<
00563     // size16 << " " << size32 << " " << size64 << endl;
00564     // returnBitmap->printSizes(summaryFile);
00565 
00566     // summaryFile << globalLevel << endl;
00567 
00568     pos4 = 0;
00569     pos8 = 0;
00570     pos16 = 0;
00571     pos32 = 0;
00572     pos64 = 0;
00573 
00574     // Compress bitmap4
00575     // pos4 is the first position in bitmap4 free after compress4 is run
00576     if (refBitmap->_bitmap4 != 0)
00577         returnBitmap->Compress4(
00578                   refBitmap, tempAndBitmap,
00579                   pos4);
00580 
00581     // Compress bitmap8
00582     // pos8 is the first position in bitmap8 free after compress8 is run
00583     if (refBitmap->_bitmap8 != 0)
00584         returnBitmap->Compress8(
00585                   refBitmap, tempAndBitmap,
00586                   pos4, pos8);
00587 
00588     // Compress bitmap16
00589     // pos16 is the first position in bitmap16 free after compress16 is run
00590     if (refBitmap->_bitmap16 != 0)
00591         returnBitmap->Compress16(
00592                   refBitmap, tempAndBitmap,
00593                   pos4, pos8, pos16);
00594 
00595     // Compress bitmap32
00596     // pos32 is the first position in bitmap32 free after compress32 is run
00597     if (refBitmap->_bitmap32 != 0)
00598         returnBitmap->Compress32(
00599                   refBitmap,
00600                   tempAndBitmap,
00601                   pos4, pos8, pos16, pos32);
00602 
00603     // Compress bitmap64
00604     // pos64 is the first position in bitmap64 free after compress64 is run
00605     if (refBitmap->_bitmap64 != 0)
00606         returnBitmap->Compress64(
00607                   refBitmap, tempAndBitmap,
00608                   pos4, pos8, pos16, pos32, pos64);
00609 
00610 
00611     // debugFile << "compress bitmap"<<endl;
00612     // returnBitmap->PrintBitmap(debugFile);
00613 
00614 
00615 
00616     assert(pos64 == size64*8);
00617     assert(pos32 == size32*4);
00618     assert(pos16 == size16*2);
00619     assert(pos8 == size8);
00620     assert(pos4*2 == size4);
00621 
00622     // Now start the compression
00623     for (i = 0; i < indexLength; i++)
00624     {
00625 
00626         pos4 = 0;
00627         pos8 = 0;
00628         pos16 = 0;
00629         pos32 = 0;
00630         pos64 = 0;
00631 
00632         // Compress bitmap4
00633         // pos4 is the first position in bitmap4 free after compress4 is run
00634         if (refBitmap->_bitmap4 != 0)
00635             newF1[indexList[i]]->Compress4(refBitmap, f1[indexList[i]],
00636                                            pos4);
00637 
00638         // Compress bitmap8
00639         // pos8 is the first position in bitmap8 free after compress8 is run
00640         if (refBitmap->_bitmap8 != 0)
00641             newF1[indexList[i]]->Compress8(refBitmap, f1[indexList[i]],
00642                                            pos4, pos8);
00643 
00644         // Compress bitmap16
00645         // pos16 is the first position in bitmap16 free after compress16 is run
00646         if (refBitmap->_bitmap16 != 0)
00647             newF1[indexList[i]]->Compress16(refBitmap, f1[indexList[i]],
00648                                             pos4, pos8, pos16);
00649 
00650         // Compress bitmap32
00651         // pos32 is the first position in bitmap32 free after compress32 is run
00652         if (refBitmap->_bitmap32 != 0)
00653             newF1[indexList[i]]->Compress32(refBitmap, f1[indexList[i]],
00654                                             pos4, pos8, pos16, pos32);
00655 
00656         // Compress bitmap64
00657         // pos64 is the first position in bitmap64 free after compress64 is run
00658         if (refBitmap->_bitmap64 != 0)
00659             newF1[indexList[i]]->Compress64(refBitmap, f1[indexList[i]],
00660                                             pos4, pos8, pos16, pos32, pos64);
00661 
00662 
00663         assert(pos64 == size64*8);
00664         assert(pos32 == size32*4);
00665         assert(pos16 == size16*2);
00666         assert(pos8 == size8);
00667         assert(pos4*2 == size4);
00668 
00669     }
00670 
00671 }
00672 
00673 
00674 
00675 /////////////////////////////////////////////////////////////////////
00676 /// A recursive call that goes down the search lattice to find sequential
00677 ///    patterns.
00678 ///
00679 /// @param curNode           information about the current node
00680 /////////////////////////////////////////////////////////////////////
00681 void FindSequentialPatterns(TreeNode* curNode)
00682 {
00683     //cout << curNode->level << " " << testCount << endl;
00684     testCount++;
00685     //if (testCount >= 100) {
00686     // exit(0);
00687     //}
00688 
00689 #ifdef __STATS__
00690 
00691     statistics->startLevelTimer();
00692     statistics->updateNumNodes(curNode->level, 1);
00693 #endif
00694 
00695     // temp variables to store s-list/i-list of next level
00696     int* nextLevelSList;
00697     int* nextLevelIList;
00698     int nextLevelSLength;
00699     int nextLevelILength;
00700 
00701     // get node and bitmaps from buffers
00702     TreeNode* nextNode = nodeBuff[curNode->level + 1];
00703     SeqBitmap* tempAndBitmap = new SeqBitmap(*curNode->iBitmap);
00704     SeqBitmap* sBitmap = new SeqBitmap(*curNode->iBitmap);
00705     //SeqBitmap* returnBitmap = 0;
00706 
00707     // SeqBitmap* tempAndBitmap = tempBitmapBuff[curNode->level];
00708     // SeqBitmap* sBitmap = sBitmapBuff[curNode->level];
00709 
00710     int count = 0;     // count the support of a bitmap
00711     int i;       // counter
00712 
00713 #ifdef __COMPRESSION__
00714 
00715     bool prevComp = false;  // a flag to tell if we compress the previous node
00716     int j;
00717     int memSize;
00718 #endif
00719 
00720     // -- s-step
00721     // create the s-bitmap, using the post-processing step of setting the
00722     // first 1 in each bit slice to 0,
00723     // and all subsequent bits to 1
00724     sBitmap->CreateSBitmap(*curNode->iBitmap);
00725 
00726     // find s-extensions for the next node
00727     nextLevelSList = curNode->sList + curNode->sLength;
00728     nextLevelIList = curNode->iList + curNode->iLength;
00729     nextLevelSLength = 0;
00730 
00731     // for each possible s-extension from this level
00732     for (i = 0; i < curNode->sLength; i++)
00733     {
00734         // AND the post-processed s-step bitmap with the candidate
00735         // frequent-1 itemset
00736         tempAndBitmap->And(*sBitmap, *curNode->f1[curNode->sList[i]]);
00737         count = tempAndBitmap->Count();
00738 
00739         // If the AND'd result is frequent, specify in curNode->countList that
00740         // the s-extended sequence should be traversed further
00741         if (count >= minSup)
00742         {
00743             curNode->countList[nextLevelSLength].count = count;
00744             curNode->countList[nextLevelSLength].index = curNode->sList[i];
00745             nextLevelSLength++;
00746         }
00747 
00748     }
00749 
00750 #ifdef __STATS__
00751     statistics->updateSExt(curNode->level, nextLevelSLength);
00752 #endif
00753 
00754 
00755 #ifdef ___REORDERING___
00756     // Sort nextLevelSList based on curNode->countList so that extensions
00757     // with higher support are traversed first
00758     qsort((void *) curNode->countList, (size_t)nextLevelSLength,
00759           sizeof(int) + sizeof(int), compare );
00760 #endif
00761 
00762     // Set values in nextLevelSList based on the countList indices
00763     for (i = 0;i < nextLevelSLength;i++)
00764         nextLevelSList[i] = curNode->countList[i].index;
00765 
00766     // create i-extensions for the next node
00767     for (i = 0;i < nextLevelSLength - 1;i++)
00768         nextLevelIList[i] = nextLevelSList[i + 1];
00769 
00770     // initialize nextNode
00771     nextNode->level = curNode->level + 1;
00772     nextNode->sList = nextLevelSList;
00773     nextNode->sLength = nextLevelSLength;
00774     nextNode->f1 = curNode->f1;
00775     nextNode->f1Name = curNode->f1Name;
00776     nextNode->f1Size = curNode->f1Size;
00777     nextNode->compress = false;
00778 #ifdef __COMPRESSION__
00779     // Do we want to compress every 5 levels, 10 levels, or 15 levels?
00780     // Not sure of optimal times to compress.
00781     //if ((nextNode->sLevel % 5) == 1) { nextNode->compress = true; }
00782     //if ((nextNode->sLevel % 10) == 1) { nextNode->compress = true; }
00783     if ((nextNode->sLevel % 15) == 1)
00784     {
00785         nextNode->compress = true;
00786     }
00787 #ifdef __ALLCOMPRESS__
00788     nextNode->compress = true;
00789 #endif
00790 #else
00791 
00792     nextNode->compress = false;
00793 #endif
00794 
00795 
00796     // for output
00797     if (outputSeq)
00798         sequenceLength++;
00799 
00800 #ifdef __COMPRESSION__
00801 #ifdef __STATS__
00802 
00803     statistics->startCompTimer();
00804 #endif
00805 
00806     SeqBitmap *orBitmap = 0;
00807     SeqBitmap *specialBitmap = 0;
00808 
00809 
00810     if (curNode->compress && nextLevelSLength > 0)
00811     {
00812         CreateOrBitmap(
00813             curNode->f1,
00814             nextLevelSList,
00815             nextLevelSLength,
00816             orBitmap);
00817 
00818         specialBitmap = new SeqBitmap(*tempAndBitmap);
00819     }
00820 
00821 #ifdef __STATS__
00822     statistics->stopCompTimer(curNode->level);
00823 #endif
00824 #endif
00825 
00826     // perform s-step
00827     for (i = 0; i < nextLevelSLength; i++)
00828     {
00829         // Update the sLevel for the next node
00830         nextNode->sLevel = curNode->sLevel + 1;
00831 
00832         // i-list only contains extensions after the current node
00833         nextNode->iList = nextLevelIList + i;
00834         nextNode->iLength = nextLevelSLength - 1 - i;
00835         tempAndBitmap->And(*sBitmap, *curNode->f1[nextLevelSList[i]]);
00836 
00837         if (outputSeq)
00838             count = tempAndBitmap->Count();
00839 
00840 #ifdef __COMPRESSION__
00841 #ifdef __STATS__
00842 
00843         statistics->startCompTimer();
00844 #endif
00845         //int nextLevelSLengthLocal;
00846         //int * nextLevelSListLocal;
00847         //SeqBitmap **tempF1Local;
00848         memSize = tempAndBitmap->memSize();
00849         // compress the data when condition holds
00850         if ((memSize >= minCompSize) && curNode->compress)
00851         {
00852 
00853             prevComp = true;
00854 
00855             // update the current frequent-one itemset buffer
00856             nextNode->f1 = curNode->f1 + curNode->f1Size;
00857 
00858             specialBitmap->CreateCBitmap(*tempAndBitmap);
00859             specialBitmap->And(*specialBitmap, *orBitmap);
00860 
00861             // Jay: For debugging of compression
00862             globalLevel = curNode->level;
00863 
00864 
00865 
00866 #ifdef __STATS__
00867 
00868             statistics->updateNumComp(curNode->level, 1);
00869 #endif
00870             //nextLevelSLengthLocal = nextLevelSLength;
00871             //nextLevelSListLocal = new int[nextLevelSLength];
00872             //for (j = 0; j < nextLevelSLength; j++) {
00873             // nextLevelSListLocal[j] = nextLevelSList[j];
00874             //}
00875 
00876             //tempF1Local = new SeqBitmap*[nextLevelSLength];
00877             //for (j = 0; j < nextLevelSLength; j++) {
00878             // tempF1Local[j] = nextNode->f1[j];
00879             //}
00880 
00881             Compress(
00882                 specialBitmap,
00883                 tempAndBitmap,
00884                 returnBitmap,
00885                 curNode->f1,
00886                 nextNode->f1,
00887                 nextLevelSList,
00888                 nextLevelSLength);
00889 
00890             nextNode->iBitmap = returnBitmap;
00891         }
00892         else
00893         {
00894             // If compression was done for the last extension,
00895             // reset the values for nextNode
00896             if (prevComp)
00897             {
00898                 prevComp = false;
00899                 nextNode->f1 = curNode->f1;
00900             }
00901             nextNode->iBitmap = tempAndBitmap;
00902         }
00903 #ifdef __STATS__
00904         statistics->stopCompTimer(curNode->level);
00905 #endif
00906 #else
00907 
00908         nextNode->iBitmap = tempAndBitmap;
00909 #endif
00910 
00911         // output sequence
00912         if (outputSeq)
00913         {
00914             elementSize[sequenceLength] = 0;
00915             sequentialPatterns[sequenceLength][0] =
00916                 curNode->f1Name[nextLevelSList[i]];
00917             LogSequence(count);
00918         }
00919 
00920 #ifdef __STATS__
00921         statistics->stopLevelTimer(curNode->level);
00922 #endif
00923 
00924         // Recurse on the next node
00925         FindSequentialPatterns(nextNode);
00926 
00927 #ifdef __STATS__
00928 
00929         statistics->startLevelTimer();
00930 #endif
00931 
00932 #ifdef __COMPRESSION__
00933 #ifdef __STATS__
00934 
00935         statistics->startCompTimer();
00936 #endif
00937 
00938         if (prevComp)
00939         {
00940             // Deallocate/delete returnBitmap
00941             returnBitmap->Deallocate();
00942             delete returnBitmap;
00943 
00944             // Deallocate the F1s
00945             //assert(nextLevelSLengthLocal == nextLevelSLength);
00946 
00947             for (j = nextNode->f1Size - 1; j >= 0; j--)
00948             {
00949                 if (nextNode->f1[j] != 0)
00950                 {
00951                     assert(nextNode->f1[j] != (SeqBitmap *)0xfdfdfdfd);
00952                     nextNode->f1[j]->Deallocate();
00953                     delete nextNode->f1[j];
00954                 }
00955                 nextNode->f1[j] = 0;
00956             }
00957 
00958             for (j = 0; j < nextNode->f1Size; j++)
00959                 assert(nextNode->f1[j] == 0);
00960         }
00961 #ifdef __STATS__
00962         statistics->stopCompTimer(curNode->level);
00963 #endif
00964 #endif
00965 
00966     }
00967 
00968 #ifdef __COMPRESSION__
00969 #ifdef __STATS__
00970     statistics->startCompTimer();
00971 #endif
00972 
00973     if (curNode->compress && nextLevelSLength != 0)
00974     {
00975         // Deallocate specialBitmap and orBitmap in reverse order of allocation
00976         specialBitmap->Deallocate();
00977         orBitmap->Deallocate();
00978         delete orBitmap;
00979         delete specialBitmap;
00980     }
00981 #ifdef __STATS__
00982     statistics->stopCompTimer(curNode->level);
00983 #endif
00984 #endif
00985 
00986     // for output
00987     if (outputSeq)
00988         sequenceLength--;
00989 
00990 
00991     // -- i-step
00992     // find i-extensions
00993     nextLevelIList = curNode->iList + curNode->iLength;
00994     nextLevelILength = 0;
00995 
00996     // For each possible i-step
00997     for (i = 0; i < curNode->iLength; i++)
00998     {
00999         // Update the sLevel for the next node
01000         // Not increasing sLevel, just copy it over
01001         nextNode->sLevel = curNode->sLevel;
01002 
01003         tempAndBitmap->And(*curNode->iBitmap, *curNode->f1[curNode->iList[i]]);
01004         count = tempAndBitmap->Count();
01005 
01006         // If the i-step was frequent, the i-step sequence is considered
01007         // a candidate
01008         if (count >= minSup)
01009         {
01010             curNode->countList[nextLevelILength].count = count;
01011             curNode->countList[nextLevelILength].index = curNode->iList[i];
01012             nextLevelILength++;
01013         }
01014     }
01015 
01016 #ifdef __STATS__
01017     statistics->updateIExt(curNode->level, nextLevelILength);
01018 #endif
01019 #ifdef ___REORDERING___
01020     // Sort nextLevelSList based on curNode->countList
01021     qsort( (void *) curNode->countList, (size_t)nextLevelILength,
01022            sizeof(int) + sizeof(int), compare );
01023 #endif
01024 
01025     // Set values in nextLevelSList based on the reordered indices
01026     for (i = 0;i < nextLevelILength;i++)
01027         nextLevelIList[i] = curNode->countList[i].index;
01028 
01029 
01030     // i-step
01031     // for output
01032     if (outputSeq)
01033         elementSize[sequenceLength]++;
01034 
01035 
01036     for (i = 0; i < nextLevelILength; i++)
01037     {
01038         nextNode->iList = nextLevelIList + i + 1;
01039         nextNode->iLength = nextLevelILength - i - 1;
01040 
01041         tempAndBitmap->And(*curNode->iBitmap, *curNode->f1[nextLevelIList[i]]);
01042         if (outputSeq)
01043             count = tempAndBitmap->Count();
01044 
01045 
01046         // not doing compression for i-step
01047 #ifdef __COMPRESSION__
01048 
01049         memSize = tempAndBitmap->memSize();
01050 
01051         if ((memSize >= minCompSize) && curNode->compress && false)
01052         {
01053             prevComp = true;
01054 
01055             // update the current frequent-one itemset buffer
01056             nextNode->f1 = curNode->f1 + curNode->f1Size;
01057 
01058             int j;
01059             // find the indices of f1's which have to be compressed
01060             for (j = 0; j < nextNode->f1Size; j++)
01061                 indexExists[j] = false;
01062 
01063             // copy indicies from s-list
01064             for (j = 0; j < nextLevelSLength; j++)
01065             {
01066                 tempIndexList[j] = nextLevelSList[j];
01067                 indexExists[nextLevelSList[j]] = true;
01068             }
01069             int indexLength = nextLevelSLength;
01070 
01071             // copy indices from i-list but making sure that it doesn't already
01072             // exist in the s-list
01073             for (j = 0; j < nextNode->iLength; j++)
01074             {
01075                 if (!indexExists[nextNode->iList[j]])
01076                 {
01077                     tempIndexList[indexLength] = nextNode->iList[j];
01078                     indexLength++;
01079                 }
01080             }
01081 
01082             // Compress(tempAndBitmap, returnBitmap, curNode->f1, nextNode->f1,
01083             // tempIndexList, indexLength);
01084             nextNode->iBitmap = tempAndBitmap;
01085         }
01086         else
01087         {
01088 
01089             if (prevComp)
01090             {
01091                 prevComp = false;
01092                 nextNode->f1 = curNode->f1;
01093             }
01094             nextNode->iBitmap = tempAndBitmap;
01095 
01096         }
01097 #else
01098         nextNode->iBitmap = tempAndBitmap;
01099 #endif
01100 
01101 
01102         // output
01103         if (outputSeq)
01104         {
01105             sequentialPatterns[sequenceLength][elementSize[sequenceLength]] =
01106                 curNode->f1Name[nextLevelIList[i]];
01107             LogSequence(count);
01108         }
01109 
01110 #ifdef __STATS__
01111         statistics->stopLevelTimer(curNode->level);
01112 #endif
01113 
01114         // Recurse on the next node
01115         FindSequentialPatterns(nextNode);
01116 #ifdef __STATS__
01117 
01118         statistics->startLevelTimer();
01119 #endif
01120 
01121 #ifdef __COMPRESSION__
01122 
01123         if (prevComp)
01124         {
01125             delete returnBitmap;
01126             for (j = 0; j < nextNode->f1Size; j++)
01127             {
01128                 if (nextNode->f1[j] != 0)
01129                     delete nextNode->f1[j];
01130                 nextNode->f1[j] = 0;
01131             }
01132         }
01133 #endif
01134 
01135     }
01136 
01137     // for output
01138     if (outputSeq)
01139         elementSize[sequenceLength]--;
01140 
01141 
01142     // Deallocate the SeqBitmaps used in this node in reverse order of
01143     // allocation
01144     sBitmap->Deallocate();
01145     tempAndBitmap->Deallocate();
01146     delete tempAndBitmap;
01147     delete sBitmap;
01148 
01149 #ifdef __STATS__
01150 
01151     statistics->stopLevelTimer(curNode->level);
01152 #endif
01153 
01154 }
01155 
01156 
01157 /////////////////////////////////////////////////////////////////////
01158 /// Start the mining algorithm by generating the initial TreeNode to
01159 /// start recursing from.
01160 ///
01161 /// @param info              information about the data set
01162 /////////////////////////////////////////////////////////////////////
01163 void StartMining(DatasetInfo* info)
01164 {
01165 #ifdef __STATS__
01166     statistics->startLevelTimer();
01167     statistics->updateNumNodes(0, 1);
01168     statistics->updateSExt(0, info->f1Size);
01169 #endif
01170 
01171     int i;
01172     int count;
01173 
01174     TreeNode* curNode = nodeBuff[0];
01175     curNode->f1 = info->f1Buff;
01176     curNode->f1Name = info->f1NameBuff;
01177     curNode->f1Size = info->f1Size;
01178     curNode->sList = info->sListBuff;
01179     curNode->sLength = info->f1Size;
01180     curNode->countList = info->countBuff;
01181     curNode->level = 0;
01182     curNode->sLevel = 0;
01183     curNode->compress = false;
01184 
01185 
01186     for (i = 0; i < info->f1Size; i++)
01187     {
01188         count = info->f1Buff[i]->Count();
01189         curNode->countList[i].count = count;
01190         curNode->countList[i].index = i;
01191 
01192     }
01193 
01194 #ifdef ___REORDERING___
01195     // Sort SList based on curNode->countList
01196     qsort( (void *) curNode->countList, (size_t)info->f1Size,
01197            sizeof(int) + sizeof(int), compare );
01198 #endif
01199 
01200     for (i = 0; i < info->f1Size; i++)
01201     {
01202         info->sListBuff[i] = curNode->countList[i].index;
01203         info->iListBuff[i] = curNode->countList[i].index;
01204     }
01205 
01206 
01207     // call FindSequentialPatterns
01208     for (i = 0; i < info->f1Size; i++)
01209     {
01210 
01211         // output sequence
01212         if (outputSeq)
01213         {
01214             count = info->f1Buff[info->sListBuff[i]]->Count();
01215             elementSize[sequenceLength] = 0;
01216             sequentialPatterns[sequenceLength][0] =
01217                 info->f1NameBuff[info->sListBuff[i]];
01218             LogSequence(count);
01219         }
01220 
01221         curNode->iList = info->iListBuff + i + 1;
01222         curNode->iLength = info->f1Size - i - 1;
01223         curNode->iBitmap = info->f1Buff[info->sListBuff[i]];
01224 
01225 #ifdef __STATS__
01226 
01227         statistics->stopLevelTimer(0);
01228 #endif
01229 
01230         FindSequentialPatterns(curNode);
01231 
01232 #ifdef __STATS__
01233 
01234         statistics->startLevelTimer();
01235 #endif
01236 
01237     }
01238 
01239 #ifdef __STATS__
01240     statistics->stopLevelTimer(0);
01241 #endif
01242 
01243 }
01244 
01245 
01246 void PrintError()
01247 {
01248     cerr << "\nUsage: spam -sup <minSup> [-fn <infile>] [-stdin] [-ascii] [-str]\n";
01249     cerr << "             [-outFile <outfile>] [-stdout]\n\n";
01250 
01251     // (See below comment re emptRatio, compLevel, and minSize)
01252     // cerr << "             [-emptRatio e] [-compLevel c] [-minSize m]\n\n";
01253     cerr << "    minSup    - The minimum support (between 0.0 and 1.0)\n";
01254     cerr << "    infile    - The data file to read in (see below for specifications)\n";
01255     cerr << "    stdin     - Use this flag if the data should be read in from stdin.\n";
01256     cerr << "                Must use if -fn is not specified.\n";
01257     cerr << "                Overrides any file specified via -fn\n";
01258     cerr << "    ascii     - Use this flag if your input is ASCII text;\n";
01259     cerr << "                otherwise SPAM assumes it is in a binary file format\n";
01260     cerr << "    str       - Use this flag if your input is a list of strings\n";
01261     cerr << "                representing customers, transactions, and items\n";
01262     cerr << "                (see documentation for full file-format description)\n";
01263     cerr << "    outfile   - The file to place the output in. If -outFile and -stdout\n";
01264     cerr << "                are not present, no output will be produced\n";
01265     cerr << "    stdout    - Use this flag if you want the output to go to stdout.\n";
01266     cerr << "                Overrides any file specified via -outFile\n\n";
01267     cerr << "See documentation for examples of how to use SPAM with various data formats.\n";
01268 
01269     // emptRatio, minSize, and compLevel are not needed to just run the
01270     // algorithm; for simplicity's sake they are not documented in the
01271     // usage instructions
01272     /*
01273     cerr << "    emptRatio - The percentage (0.0 to 1.0) of empty spaces";
01274     cerr << " a bitmap \n";
01275     cerr << "                  must have to warrant compression\n";
01276     cerr << "    minSize   - The size (in # of shorts) a bitmap must be";
01277     cerr << " reduced to in \n";
01278     cerr << "                  order to stop compression\n";
01279     cerr << "    compLevel - do compression every c levels \n";
01280     */
01281     exit(0);
01282 }
01283 
01284 
01285 int main(int argc, char **argv)
01286 {
01287 
01288     // start timing
01289     clock_t programStart = clock();
01290 
01291     // -- command prompt variables
01292     bool isBinaryFile;    // whether the input file is a binary file
01293     double minSupPercent = 0;  // user-defined min sup as percentage
01294     char* inFilename = 0;
01295     char outFilename[50];
01296     bool fnSpecified = false;
01297     bool supSpecified = false;
01298     bool stdinSpecified = false;
01299     outputSeq = false;
01300 
01301     // -- variables for timing
01302     clock_t programEnd;
01303     clock_t miningStart, miningEnd;
01304     double programDuration;
01305     double miningDuration;
01306 
01307     DatasetInfo* info;
01308 
01309     // -- summaryFile
01310     //ofstream summaryFile;
01311 
01312     // --- Parse the command line
01313     // default value
01314     strcpy(outFilename, "output.txt");
01315     emptySpaceRatio = 0.8;
01316     minCompSize = 30;
01317     compLevel = 1;
01318     isBinaryFile = true;
01319     isStringFile = false;
01320 
01321     // parse input
01322     int i = 1;
01323     while (i < argc)
01324     {
01325 
01326         if (strcmp(argv[i], "-fn") == 0)
01327         {
01328 
01329             // input filename
01330             i++;
01331             if (i == argc)
01332                 PrintError();
01333             else
01334             {
01335                 inFilename = argv[i];
01336                 fnSpecified = true;
01337             }
01338 
01339         }
01340         else if (strcmp(argv[i], "-sup") == 0 )
01341         {
01342 
01343             // support percentage
01344             i++;
01345             if (i == argc)
01346                 PrintError();
01347             else
01348             {
01349                 minSupPercent = atof(argv[i]);
01350                 supSpecified = true;
01351             }
01352 
01353         }
01354         else if (strcmp(argv[i], "-emptRatio") == 0 )
01355         {
01356 
01357             // empty space ratio
01358             i++;
01359             if (i == argc)
01360                 PrintError();
01361             else
01362                 emptySpaceRatio = atof(argv[i]);
01363 
01364         }
01365         else if (strcmp(argv[i], "-minSize") == 0)
01366         {
01367 
01368             // minimum compression size
01369             i++;
01370             if (i == argc)
01371                 PrintError();
01372             else
01373                 minCompSize = atoi(argv[i]);
01374 
01375         }
01376         else if (strcmp(argv[i], "-compLevel") == 0)
01377         {
01378 
01379             // comp level
01380             i++;
01381             if (i == argc)
01382                 PrintError();
01383             else
01384                 compLevel = atoi(argv[i]);
01385 
01386         }
01387         else if (strcmp(argv[i], "-outFile") == 0)
01388         {
01389 
01390             // output file name
01391             i++;
01392             if (i == argc)
01393                 PrintError();
01394             else
01395             {
01396                 strcpy(outFilename, argv[i]);
01397                 outputSeq = true;
01398             }
01399 
01400         }
01401         else if (strcmp(argv[i], "-ascii") == 0)
01402         {
01403 
01404             // whether the input file is ascii
01405             isBinaryFile = false;
01406 
01407         }
01408         else if (strcmp(argv[i], "-str") == 0)
01409         {
01410             isBinaryFile = false;
01411             isStringFile = true;
01412         }
01413         else if (strcmp(argv[i], "-stdin") == 0)
01414         {
01415             stdinSpecified = true;
01416         }
01417         else if (strcmp(argv[i], "-stdout") == 0)
01418         {
01419             stdoutSpecified = true;
01420             outputSeq = true;
01421         }
01422         else
01423         {
01424             PrintError();
01425         }
01426 
01427         i++;
01428     }
01429 
01430     if (!supSpecified)
01431     {
01432         cout << "Must specify minimum support" << endl;
01433         PrintError();
01434     }
01435     // If the filename was not specified or stdin was specified,
01436     // attempt to read in from stdin
01437     if (!fnSpecified || stdinSpecified)
01438         inFilename = 0;
01439 
01440     if (!fnSpecified && !stdinSpecified)
01441     {
01442         // The user didn't select an input file
01443         // Let them know that we are reading from stdin by default
01444         cout << "No input data source selected; use either -stdin or -fn" << endl;
01445         PrintError();
01446     }
01447 
01448     if (stdinSpecified && isBinaryFile)
01449     {
01450         cout << "Cannot use binary files with -stdin; use -ascii or -str" << endl;
01451         PrintError();
01452     }
01453 
01454     bool suppressOutput = false;
01455     // If either reading from stdin or writing to stdout, we
01456     // want to suppress all extraneous output to the screen
01457     if (stdinSpecified || stdoutSpecified)
01458         suppressOutput = true;
01459 
01460     if (isBinaryFile == true)
01461     {
01462         if (!suppressOutput)
01463         {
01464             cout << "Input file will be interpreted as binary, ";
01465             cout << "if SPAM crashes use the -ascii flag\n";
01466         }
01467     }
01468     else
01469     {
01470         if (!suppressOutput)
01471             cout << "Input file will be interpreted as ASCII...\n";
01472     }
01473 
01474     // --- Read in data
01475     if (!suppressOutput)
01476         cout << "Reading Data... " << endl;
01477 
01478     info = ReadDataset(isBinaryFile,
01479                        isStringFile,
01480                        inFilename,
01481                        minSupPercent,
01482                        custStrMap,
01483                        transStrMap,
01484                        itemStrMap);
01485     if (info == 0)
01486     {
01487         cout << "Error reading dataset" << endl;
01488         exit(1);
01489     }
01490 
01491     // --- Open output file
01492     if (outputSeq && !stdoutSpecified)
01493         outFile.open(outFilename);
01494     summaryFile.open("summary.txt");
01495 
01496     if (info->f1Size > 0)
01497     {
01498 
01499         // --- Preparing for mining sequential patterns
01500         minSup = info->minSup;
01501         totalCust = info->custCount;
01502 
01503         // build sbitmap process table
01504         SeqBitmap::Init();
01505 
01506 
01507         // initialize buffers
01508 
01509         // max depth of the tree is the maximum number of transaction
01510         // a customer (s-tree) has plus
01511         // the size of f1 (i-tree)
01512         int bufSize = info->f1Size * info->maxCustTrans + 1;
01513         nodeBuff = new TreeNode * [bufSize];
01514         //tempBitmapBuff = new SeqBitmap*[bufSize];
01515         //sBitmapBuff = new SeqBitmap*[bufSize];
01516 
01517 
01518 
01519         for (i = 0; i < bufSize; i++)
01520         {
01521             nodeBuff[i] = new TreeNode();
01522             //tempBitmapBuff[i] =
01523             // new SeqBitmap(info->bitmapSizes[0], info->bitmapSizes[1],
01524             // info->bitmapSizes[2], info->bitmapSizes[3],
01525             // info->bitmapSizes[4]);
01526             //sBitmapBuff[i] = new SeqBitmap(info->bitmapSizes[0],
01527             // info->bitmapSizes[1], info->bitmapSizes[2],
01528             // info->bitmapSizes[3], info->bitmapSizes[4]);
01529         }
01530 
01531         // In doing compression, we have to know what f1 to compress, and
01532         // hence we have to combine the i-list and s-list. These two buffers
01533         // are for that purpose.
01534         tempIndexList = new int[info->f1Size * 2];
01535         indexExists = new bool[info->f1Size];
01536 
01537         // If output sequence,
01538         // initialize array for printing sequential patterns
01539         if (outputSeq)
01540         {
01541             sequentialPatterns = new int * [info->maxCustTrans];
01542             elementSize = new int[info->maxCustTrans];
01543             for (i = 0; i < info->maxCustTrans; i++)
01544             {
01545                 sequentialPatterns[i] = new int[info->f1Size];
01546                 elementSize[i] = 0;
01547             }
01548             sequenceLength = 0;
01549         }
01550 
01551 
01552 #ifdef __STATS__
01553         statistics = new Stats(bufSize);
01554 #endif
01555 
01556         // -- Mine sequential patterns
01557         if (!suppressOutput)
01558             cout << "Running the algorithm..." << endl;
01559         miningStart = clock();
01560         StartMining(info);
01561         miningEnd = clock();
01562         if (!suppressOutput)
01563             cout << "Done." << endl;
01564 
01565 #ifdef __STATS__
01566 
01567         statistics->printStats(summaryFile);
01568 #endif
01569 
01570 #ifdef __STATS__
01571 
01572         delete statistics;
01573 #endif
01574 
01575         // deallocate memory
01576         for (i = 0; i < bufSize; i++)
01577         {
01578             delete nodeBuff[i];
01579             //delete tempBitmapBuff[i];
01580             //delete sBitmapBuff[i];
01581         }
01582 
01583         delete [] nodeBuff;
01584         //delete [] tempBitmapBuff;
01585         //delete [] sBitmapBuff;
01586         delete [] tempIndexList;
01587         delete [] indexExists;
01588 
01589         delete custStrMap;
01590         delete transStrMap;
01591         delete itemStrMap;
01592 
01593 
01594         if (outputSeq)
01595         {
01596 
01597             for (i = 0; i < info->maxCustTrans; i++)
01598                 delete[] sequentialPatterns[i];
01599             delete[] sequentialPatterns;
01600             delete[] elementSize;
01601         }
01602 
01603         SeqBitmap::Destroy();
01604 
01605     }
01606     else
01607     {
01608 
01609         miningStart = 0;
01610         miningEnd = 0;
01611 
01612     }
01613 
01614     // -- time calculation
01615     miningDuration = (miningEnd - miningStart) / (double)CLOCKS_PER_SEC;
01616 
01617     programEnd = clock();
01618     programDuration = (programEnd - programStart) / (double)CLOCKS_PER_SEC;
01619 
01620     summaryFile << "Number of customer: " << info->custCount << endl;
01621     summaryFile << "Minimum support: " << minSup << " ( " << minSupPercent
01622     << " )" << endl;
01623     summaryFile << "Mining Duration:" << miningDuration << endl;
01624     summaryFile << "Program Duration: " << programDuration << endl << endl;
01625 
01626     summaryFile << "Number of Compression: " << numCompress << endl;
01627     summaryFile << "Number of Ors: " << SeqBitmap::_countOr << endl;
01628     summaryFile << "Number of Ands: " << SeqBitmap::_countAnd << endl;
01629     summaryFile << "Number of Count: " << SeqBitmap::_countCount << endl;
01630     summaryFile << "Number of CountZeros: " <<
01631     SeqBitmap::_countCountZeros << endl;
01632     summaryFile << "Number of CountSmaller: " <<
01633     SeqBitmap::_countCountSmaller << endl;
01634     summaryFile << "Number of CreateSBitmap: " <<
01635     SeqBitmap::_countCreateSBitmap << endl;
01636     summaryFile << "Number of CreateCBitmaps: " <<
01637     SeqBitmap::_countCreateCBitmap << endl;
01638 
01639     // deallocate memory
01640     if (outputSeq)
01641         outFile.close();
01642     summaryFile.close();
01643 
01644     // Deallocate initial f1 buff in reverse order of allocation
01645     for (i = info->f1Size - 1; i >= 0; i--)
01646         info->f1Buff[i]->Deallocate();
01647 
01648     delete info;
01649 
01650     // Do final deallocation of SeqBitmap memory
01651     SeqBitmap::MemDealloc();
01652     return 0;
01653 }
01654 
01655 /** @} */
01656 
01657 
01658 /////////////////////////////////////////////////////////////////////
01659 /// @mainpage SPAM Code Documentation
01660 /// \image html SpamLogo.jpg
01661 ///
01662 /// \section ContactSection Contact
01663 /// - Jay Ayres (kja9@cornell.edu)
01664 /// - Manuel Calimlim (calimlim@cs.cornell.edu)
01665 /// - Johannes Gehrke (johannes@cs.cornell.edu)
01666 ///
01667 /// \section DownloadSection Download
01668 /// - Main webpage at
01669 ///   http://himalaya-tools.sourceforge.net/Spam
01670 /// - Older webpage at
01671 ///   http://www.cs.cornell.edu/database/himalaya/spam/spam.htm
01672 ///
01673 /// \section LinuxCompilationSection Linux Compilation
01674 /// -# ./configure
01675 /// -# make
01676 ///     - executable is created as 'src/spam'
01677 /// -# make install
01678 ///     - will copy the executable to /usr/local/bin
01679 ///     - make sure you have the right permission to install there
01680 ///
01681 /// \section WindowsCompilationSection Windows Compilation
01682 /// -# Open MAFIA.dsw in Visual Studio
01683 /// -# Build the program
01684 ///
01685 /// \section DirectoryStructureSection Directory Structure
01686 /// <table border=1 cellpadding=5 cellspacing=0>
01687 /// <tr>
01688 /// <td>admin/</td>
01689 /// <td>Contains config files for compiling.  Should
01690 ///     not be altered.</td>
01691 /// </tr>
01692 /// <tr>
01693 /// <td>datasets/</td>
01694 /// <td>Contains several datasets for testing</td>
01695 /// </tr>
01696 /// <tr>
01697 /// <td>src/</td>
01698 /// <td>Contains all of source code for SPAM</td>
01699 /// </tr>
01700 /// <tr>
01701 /// <td>src/Bitmap4.h
01702 /// <br>src/Bitmap4.cpp
01703 /// <br>src/Bitmap8.h
01704 /// <br>src/Bitmap8.cpp
01705 /// <br>src/Bitmap16.h
01706 /// <br>src/Bitmap16.cpp
01707 /// <br>src/Bitmap32.h
01708 /// <br>src/Bitmap32.cpp
01709 /// <br>src/Bitmap64.h
01710 /// <br>src/Bitmap64.cpp
01711 /// </td>
01712 /// <td>
01713 /// Vertical representations of the data. They handle both uncompressed
01714 ///  and compressed data.
01715 ///  <p>
01716 /// Each bitmap can represent customers with up to x transactions since it
01717 ///  allocates exactly x bits per customer
01718 /// </td>
01719 /// </tr>
01720 /// <tr>
01721 /// <td>src/DatasetInfo.h</td>
01722 /// <td>A class for representing the info gathered from the dataset</td>
01723 /// </tr>
01724 /// <tr>
01725 /// <td>src/FileInput.cpp</td>
01726 /// <td>The functions necessary for reading in datasets.</td>
01727 /// </tr>
01728 /// <tr>
01729 /// <td>src/ResizableArray.h</td>
01730 /// <td>A class containing a resizable array data structure</td>
01731 /// </tr>
01732 /// <tr>
01733 /// <td>src/SeqBitmap.h
01734 /// <br>src/SeqBitmap.cpp</td>
01735 /// <td>A representation of a sequence (or an item)</td>
01736 /// </tr>
01737 /// <tr>
01738 /// <td>src/Spam.cpp</td>
01739 /// <td>Main file with most of the algorithmic code</td>
01740 /// </tr>
01741 /// <tr>
01742 /// <td>src/Stats.h
01743 /// <br>src/Stats.cpp</td>
01744 /// <td>Collects statistics about SPAM as it is run</td>
01745 /// </tr>
01746 /// <tr>
01747 /// <td>src/StringMap.h</td>
01748 /// <td>A class containing a data structure that allows for two way mapping between ints and strings</td>
01749 /// </tr>
01750 /// <tr>
01751 /// <td>src/Tables.h</td>
01752 /// <td>For gathering the lookup tables in one place</td>
01753 /// </tr>
01754 /// <tr>
01755 /// <td>src/TreeNode.h</td>
01756 /// <td>Class for representing nodes in the search tree</td>
01757 /// </tr>
01758 /// <tr>
01759 /// <td>INSTALL</td>
01760 /// <td>Generic installation instructions</td>
01761 /// </tr>
01762 /// <tr>
01763 /// <td>spam.{kdevprj,kdevses}</td>
01764 /// <td>KDevelop project files for Linux</td>
01765 /// </tr>
01766 /// <tr>
01767 /// <td>Spam.{sln,vcproj}</td>
01768 /// <td>Visual Studio .NET project files for Windows</td>
01769 /// </tr>
01770 /// <tr>
01771 /// <td>README</td>
01772 /// <td>Pointer to this page</td>
01773 /// </tr>
01774 /// <tr>
01775 /// <td>test</td>
01776 /// <td>Contains perlscripts and executables to test SPAM</td>
01777 /// </tr>
01778 /// </table>
01779 ///
01780 /// \section UsageSection Program Usage
01781 /// <pre>
01782 /// Usage: spam -sup &lt;minSup&gt; [-fn &lt;infile&gt;] [-stdin] [-ascii] [-str]
01783 ///             [-outFile &lt;outfile&gt;] [-stdout]
01784 ///
01785 ///    minSup    - The minimum support (between 0.0 and 1.0)
01786 ///    infile    - The data file to read in (see below for specifications)
01787 ///    stdin     - Use this flag if the data should be read in from stdin.
01788 ///                Must use if -fn is not specified.
01789 ///                Overrides any file specified via -fn
01790 ///    ascii     - Use this flag if your input is ASCII text;
01791 ///                otherwise SPAM assumes it is in a binary file format
01792 ///    str       - Use this flag if your input is a list of strings
01793 ///                representing customers, transactions, and items
01794 ///                (see documentation for full file-format description)
01795 ///    outfile   - The file to place the output in. If -outFile and -stdout
01796 ///                are not present, no output will be produced
01797 ///    stdout    - Use this flag if you want the output to go to stdout.
01798 ///                Overrides any file specified via -outFile
01799 ///
01800 /// There are three input data formats:
01801 /// 1. ASCII numbers (use -ascii)
01802 ///    This data format is ASCII text with each line containing the customer ID,
01803 ///    the transaction ID, and the item ID separated by spaces. The data must be
01804 ///    sorted in ascending order first by cust ID, then by trans ID, then by item ID.
01805 ///    Note that SPAM has the limitation that transactions can contain no more than
01806 ///    64 items.
01807 ///
01808 /// 2. ASCII strings (use -str)
01809 ///    This data format is also ASCII text, but each customer, transaction, and item
01810 ///    is an actual string instead of a number. Since the strings may have spaces,
01811 ///    the customer, transaction, and item must be separated by newline characters
01812 ///    instead of by spaces as for format #1. This option should not be used with
01813 ///    extremely large files because the input and output is slow compared to #1.
01814 ///
01815 /// 3. Binary file (don't use -ascii or -str)
01816 ///    This data format is present to support AssocGen-generated data files. See
01817 ///    the perlscripts in the test directory for information on how to generate
01818 ///    binary files.
01819 ///
01820 ///
01821 /// New in release 1.3: -stdin and -stdout:
01822 /// Now you can have files come in through standard input and output go through
01823 /// standard output. Note that SPAM does not support having binary files come
01824 /// in through stdin.
01825 /// </pre>
01826 ///
01827 /// \section TestingSection Program Testing
01828 /// All datasets used by SPAM were generated by the IBM AssocGen synthetic
01829 /// data generator. Several sample datasets are included in the datasets
01830 /// directory, and the AssocGen executable can be used along with the
01831 /// perlscripts in the test directory to generate custom datasets with
01832 /// varying parameters. Please view the perlscripts for instructions on
01833 /// how to use them.
01834 ///
01835 /////////////////////////////////////////////////////////////////////
01836 
01837 

Generated on Thu Mar 11 12:01:52 2004 for SPAM by doxygen 1.3.4