Main Page   Modules   Namespace List   Class Hierarchy   Data Structures   File List   Data Fields   Globals  

Mafia.cpp

Go to the documentation of this file.
00001 /*
00002  
00003 Copyright (c) 2003, Cornell University
00004 All rights reserved.
00005  
00006 Redistribution and use in source and binary forms, with or without
00007 modification, are permitted provided that the following conditions are met:
00008  
00009   - Redistributions of source code must retain the above copyright notice,
00010       this list of conditions and the following disclaimer.
00011   - Redistributions in binary form must reproduce the above copyright
00012       notice, this list of conditions and the following disclaimer in the
00013       documentation and/or other materials provided with the distribution.
00014   - Neither the name of Cornell University nor the names of its
00015       contributors may be used to endorse or promote products derived from
00016       this software without specific prior written permission.
00017  
00018 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
00019 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00020 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
00021 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
00022 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
00023 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
00024 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
00025 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
00026 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
00027 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
00028 THE POSSIBILITY OF SUCH DAMAGE.
00029  
00030 */
00031 
00032 /////////////////////////////////////////////////////////////////////
00033 //
00034 // Mafia.cpp
00035 //
00036 /////////////////////////////////////////////////////////////////////
00037 
00038 //#define DEBUG
00039 
00040 #include <iostream>
00041 #include <fstream>
00042 #include <time.h>
00043 #include <stdio.h>
00044 #include <list>
00045 #include <vector>
00046 #include <algorithm>
00047 #include <string>
00048 #include <assert.h>
00049 #include <math.h>
00050 #include <map>
00051 #include "Bitmap.h"
00052 #include "TreeNode.h"
00053 #include "Transaction.h"
00054 #include "ItemsetOutput.h"
00055 
00056 using namespace std;
00057 
00058 /// @defgroup GlobalVariables Global Variables
00059 /// Global variables
00060 /** @{ */
00061 typedef vector<TreeNode *> NodeList;
00062 typedef vector<TreeNode *> BranchList;
00063 typedef vector<Bitmap *> BitmapList;
00064 typedef vector<BaseBitmap *> BaseBitmapList;
00065 typedef vector<int> ItemSet;
00066 typedef map<long, ItemSet *> HashTable;
00067 
00068 /// Simple class for storing subtree size estimates
00069 class SubtreeEstimate {
00070 public:
00071     int Count;              ///< Count of actual subtrees counted
00072     int Sum;                ///< Sum of subtree sizes
00073     SubtreeEstimate () {
00074         Count = 0;
00075         Sum = 0;
00076     }
00077 };
00078 
00079 /// Simple class for storing tail elements of each node of the tree
00080 class TailElement {
00081 public:
00082     int Count;              ///< Support of the 1-extension
00083     int Item;               ///< Item-id for this1 -extension
00084 
00085     TailElement () {
00086         Count = 0;
00087         Item = 0;
00088     }
00089 
00090     bool operator < (const TailElement& rhs) const {
00091         return this->Count < rhs.Count;
00092     };
00093 };
00094 
00095 /// @defgroup CommandLineParameters Command Line Parameters/Variables
00096 /// Commmand line parameters from user or inferred
00097 /** @{ */
00098 string method;               ///< either -mfi or -fci or -fi
00099 char* outFilename;           ///< filename for output
00100 ItemsetOutput *outFile;      ///< file for ouput
00101 bool outputMFI = false;      ///< true if MFI should be saved to a file
00102 bool MethodIsFI = false;     ///< true if the method is -fi
00103 bool MethodIsFCI = false;    ///< true if the method is -fci
00104 int ItemCount;               ///< # of items in the file
00105 int TransCount;              ///< # of transactions in the file
00106 double MSF;                  ///< user-defined min sup as percentage
00107 int MS;                      ///< min sup as a transaction count
00108 int VerScale = 1;            ///< Scaling factor for transactions
00109 int HorScale = 1;            ///< Scaling factor for items
00110 bool GoFHUT = true;          ///< FHUT flag
00111 bool HUTMFI = true;          ///< HUTMFI flag
00112 bool PEPrune = true;         ///< PEPrune flag -- parent equivalent pruning
00113 bool Reorder = true;         ///< Reorder flag
00114 /** @} */
00115 
00116 
00117 /// @defgroup CounterVariables Counter Variables
00118 /// Counter variables for information gathering
00119 /** @{ */
00120 int CountFHUT = 0;           ///< # of times FHUT was successful
00121 int CountNodes = 0;          ///< # of frequent nodes in the tree
00122 int CountCounts = 0;         ///< # of Counts or all nodes in the tree
00123 int CountAnds = 0;           ///< # of ANDs of normal bitmaps
00124 int CountSmallAnds = 0;      ///< # of compressed bitmap ANDs
00125 int CountPEPrunes = 0;         ///< # of PEPruning
00126 int CountCheckPosition = 0;  ///< # of CheckPosition calls
00127 int CountHUTMFI = 0;         ///< # of HUTMFI attempts
00128 int CountHUTMFISuccess = 0;  ///< # of HUTMFI successes
00129 int CountRebuilds;           ///< # of Rebuilds
00130 /** @} */
00131 
00132 
00133 /// @defgroup ProgramVariables Program Parameters/Variables
00134 /// Useful program parameters/counters
00135 /** @{ */
00136 int maxtail = 0;
00137 int MFISize = 0;             ///< MFI size before pruning
00138 int MFIDepth = 0;            ///< The aggregated depth of the all MFI elements
00139 int F1size = 0;              ///< # of frequent 1-itemsets after merging repeats
00140 int FullF1size = 0;          ///< # of frequent 1-itemsets
00141 int k = 50;                  ///< # of items checked for a MFI lookup
00142 int MAX_compID = 1;          ///< max compression ID
00143 int projectDepth = -1;       ///< depth of the bitmap you're projecting from
00144 int EstimateSize;            ///< size of subtree estimation buffer
00145 int EstimateDiv = 5;         ///< bucket size by frequent tail length
00146 int maxItemsetSize = 0;      ///< max size of a frequent itemset
00147 /** @} */
00148 
00149 
00150 /// @defgroup DataVariables Data variables
00151 /// Complex data structure variables
00152 /** @{ */
00153 NodeList F1;                 ///< List of frequent 1-itemsets
00154 BitmapList TransBuffy;       ///< Buffer of transaction bitmaps
00155 BaseBitmapList NameBuffy;    ///< Buffer of name bitmaps
00156 NodeList NodeBuffy;          ///< Buffer of tree nodess
00157 TreeNode *Root;              ///< The root (the nullset)
00158 TailElement *gTail;          ///< global tail pointer
00159 TailElement *TailBuffy;      ///< Common Buffer for tail elements
00160 Bitmap *NullTrans;           ///< A transaction bitmap filled with ones
00161 int *ItemMap;                ///< For renaming items after sorting by support
00162 int *ReverseItemMap;         ///< For remembering the renaming...
00163 BaseBitmapList MFI;          ///< List of Maximally Frequent Itemsets
00164 HashTable HT;                ///< Hash table of transaction supports
00165 vector <int> SupportCountList;       ///< List that stores support count
00166 BaseBitmap* TempName;        ///< Temporary buffer for one name bitmap
00167 SubtreeEstimate* EstimateBuffy;      ///< Buffer of subtree estimates
00168 int *MFIBySizes;             ///< Buffer for counting MFI by itemset size
00169 int *ItemsetBuffy;           ///< Buffer for writing itemsets to file output
00170 /** @} */
00171 
00172 
00173 /// @defgroup TimingVariables Timing Variables
00174 /// Variables for timing (and instrumenting the code)
00175 /** @{ */
00176 time_t total_start, total_finish;
00177 double total_time;
00178 time_t read_start, read_finish;
00179 double read_time;
00180 time_t algorithm_start, algorithm_finish;
00181 double algorithm_time;
00182 time_t print_start, print_finish;
00183 double print_time;
00184 /** @} */
00185 /** @} */
00186 
00187 /*********************************************************************
00188  Reading the data in and building the item bitmaps
00189 *********************************************************************/
00190 /// @addtogroup InputOutput Input/Output Functions
00191 /// Reading the data in and building the item bitmaps.
00192 /// Outputting the frequent itemsets.
00193 /** @{ */
00194 
00195 /////////////////////////////////////////////////////////////////////
00196 /// Insert pointer to node in order of increasing support
00197 ///
00198 /// @param newNode           item node to add to F1
00199 /////////////////////////////////////////////////////////////////////
00200 void AddToF1(TreeNode *newNode) {
00201     if (F1.empty())
00202         F1.push_back(newNode);
00203     else {
00204         // use insertion sort for increasing support ordering
00205         NodeList::iterator noli = F1.begin();
00206         while (noli != F1.end()) {
00207             if ((*noli)->Trans->_count >= newNode->Trans->_count) {
00208                 F1.insert(noli, newNode);
00209                 return ;
00210             }
00211             noli++;
00212         }
00213 
00214         // Add to end of F1 list
00215         F1.push_back(newNode);
00216     }
00217 }
00218 
00219 /////////////////////////////////////////////////////////////////////
00220 /// Create bitmaps filled randomly with probability p
00221 ///
00222 /// @param P - probability of a bit p being set
00223 /////////////////////////////////////////////////////////////////////
00224 void F1UsingProb(double P) {
00225     int blah = 0;
00226     //cout << "Creating bitmap with prob p=" << P << endl;
00227 
00228     // Create frequent 1-itemsets with probability p
00229     for (int i = 0; i < ItemCount / HorScale; i++) {
00230         // Create random transaction bitmap
00231         Bitmap *trans = new Bitmap(TransCount);
00232         trans->FillRand(P);
00233 
00234         // Add frequent items to list
00235         if (trans->Count(blah) >= MS) {
00236             TreeNode *node = new TreeNode(NULL, trans, 1, -1, i, -1, 0);
00237             for (int r = 0; r < HorScale; r++)
00238                 AddToF1(node);
00239         } else
00240             delete trans;
00241     }
00242 }
00243 
00244 /////////////////////////////////////////////////////////////////////
00245 /// Read transaction data from file and build item bitmaps.
00246 ///     If the data is in ascii form:
00247 ///     [itemid_1] [itemid_2] ... [itemid_n]
00248 ///     If the data is in binary form:
00249 ///     [custid] [transid] [number of items] [itemid_1] [itemid_2] ... [itemid_n]
00250 ///
00251 /// @param filename          name of file to be read in
00252 /// @param isAsciiFile       true for ASCII input, false for binary
00253 /////////////////////////////////////////////////////////////////////
00254 void F1FromFile(char *filename, bool isAsciiFile) {
00255     TransCount = 0;
00256     int MAXitemID = -1;
00257     int *Counters = new int[MAX_NUM_ITEMS];        //to count the support of items
00258     int *F1IndexList = new int[MAX_NUM_ITEMS];     // the id's of F1 items
00259     int *InvF1IndexList = new int[MAX_NUM_ITEMS];
00260     int itemIndex = 0;
00261 
00262     //Initialize counters
00263     for (int ct = 0; ct < MAX_NUM_ITEMS; ++ct) {
00264         Counters[ct] = 0;
00265         F1IndexList[ct] = -1;
00266         InvF1IndexList[ct] = -1;
00267     }
00268 
00269     time(&read_start);
00270     BitmapList Trans;
00271 
00272     int *itemlist = new int[MAX_NUM_ITEMS];
00273     InputData *inData = new InputData(filename, itemlist, isAsciiFile);
00274     if (!inData->isOpen()) {
00275         cerr << "Input file not found!" << endl;
00276         exit(1);
00277     }
00278 
00279     Transaction *newTransaction = inData->getNextTransaction();
00280     while (newTransaction != NULL) {
00281         for (itemIndex = 0; itemIndex < newTransaction->length; itemIndex++) {
00282             // ensure that there are not too many items
00283             if (newTransaction->itemlist[itemIndex] >= MAX_NUM_ITEMS) {
00284                 cerr << "Read item_id=" << newTransaction->itemlist[itemIndex]
00285                 << " which is more than the max item_id allowed (" << MAX_NUM_ITEMS << ")";
00286                 exit(1);
00287             }
00288 
00289             if (newTransaction->itemlist[itemIndex] > MAXitemID)
00290                 MAXitemID = newTransaction->itemlist[itemIndex];
00291 
00292             Counters[newTransaction->itemlist[itemIndex]]++;
00293         }
00294 
00295         TransCount++;
00296         delete newTransaction;
00297         newTransaction = inData->getNextTransaction();
00298     }
00299 
00300     delete newTransaction;
00301     delete inData;
00302 
00303     MS = (int)ceil(MSF * (double)TransCount);
00304     //MSF = MS/(double)TransCount;
00305 
00306     ItemCount = MAXitemID + 1;
00307 
00308     int F1items = 0;
00309     // build the normal bitmaps -- Preallocated memory for the bitmaps
00310     for (itemIndex = 0; itemIndex <= MAXitemID; itemIndex++) {
00311         if (Counters[itemIndex] >= MS) {
00312             F1IndexList[F1items++] = itemIndex;
00313             InvF1IndexList[itemIndex] = F1items - 1;
00314             Bitmap *trans = new Bitmap(TransCount);
00315             trans->_count = Counters[itemIndex];
00316             Trans.push_back(trans);
00317         }
00318     }
00319 
00320     int transIndex = 0;
00321     InputData *inData2 = new InputData(filename, itemlist, isAsciiFile);
00322     newTransaction = inData2->getNextTransaction();
00323     while (newTransaction != NULL) {
00324         for (int itemIndex = 0; itemIndex < newTransaction->length; itemIndex++) {
00325             if (InvF1IndexList[newTransaction->itemlist[itemIndex]] != -1) {
00326                 Trans[InvF1IndexList[newTransaction->itemlist[itemIndex]]]->FillEmptyPosition(transIndex);
00327             }
00328         }
00329 
00330         transIndex++;
00331         delete newTransaction;
00332         newTransaction = inData2->getNextTransaction();
00333     }
00334 
00335     delete newTransaction;
00336     delete inData2;
00337     delete [] itemlist;
00338 
00339     time(&read_finish);
00340     read_time = difftime(read_finish, read_start);
00341     printf("Reading input time:    %.2f seconds.\n", read_time);
00342 
00343     // Create F1
00344     BitmapList::iterator bli = Trans.begin();
00345     itemIndex = 0;
00346 
00347     while (bli != Trans.end()) {
00348         TreeNode *node = new TreeNode(
00349                              NULL,
00350                              (*bli),
00351                              1,
00352                              -1,
00353                              F1IndexList[itemIndex],
00354                              -1,
00355                              0);
00356         AddToF1(node);
00357         bli++;
00358         itemIndex++;
00359     }
00360 
00361     delete [] Counters;
00362     delete [] F1IndexList;
00363     delete [] InvF1IndexList;
00364 }
00365 
00366 /////////////////////////////////////////////////////////////////////
00367 /// To print the MFI data out to an ASCII file
00368 ///     with each entry having the format:
00369 ///     [list of items in MFI entry...] [(support)]
00370 /////////////////////////////////////////////////////////////////////
00371 void PrintMFI() {
00372     // open output file
00373     outFile = new ItemsetOutput(outFilename);
00374     if (!outFile->isOpen()) {
00375         cerr << "Output file not open!" << endl;
00376         exit(1);
00377     }
00378 
00379     if (FullF1size != 0) {
00380         // translate bitmap to list of INTs
00381         int* ITEMS = new int[FullF1size];
00382         for (int i = 0; i < MFISize; i++) {
00383             int j = 0;
00384             for (int cc = 0; cc < FullF1size; cc++) {
00385                 if (MFI[i]->CheckPosition(cc, CountCheckPosition) > 0) {
00386                     ITEMS[j] = ItemMap[cc];
00387                     j++;
00388                 }
00389             }
00390 
00391             outFile->printSet(MFI[i]->_count, ITEMS, SupportCountList[i]);
00392         }
00393         delete [] ITEMS;
00394     } else {
00395         outFile->printSet(0, NULL, TransCount);
00396     }
00397 
00398     delete outFile;
00399 }
00400 
00401 /** @} */
00402 
00403 
00404 /*********************************************************************
00405  Algorithmic components
00406 *********************************************************************/
00407 /// @defgroup AlgorithmicComponents Algorithmic Component Functions
00408 /// Algorithm components (HUTMFI, PEP, etc.)
00409 /** @{ */
00410 
00411 /////////////////////////////////////////////////////////////////////
00412 /// Check for an existing superset of name in the MFI
00413 ///
00414 /// @param location          The node we're at.  Use this to examine only
00415 ///                          the relevant portion of the MFI
00416 /// @return True - if superset found.
00417 ///         False - if no superset.
00418 /////////////////////////////////////////////////////////////////////
00419 bool LMFISuperSet(TreeNode* location) {
00420     return (location->rEnd > location->rBegin);
00421 }
00422 
00423 /////////////////////////////////////////////////////////////////////
00424 /// Output itemset (don't need to save bitmaps for FI)
00425 ///
00426 /// @param C                 the current node
00427 /////////////////////////////////////////////////////////////////////
00428 void AddToFI(TreeNode *C) {
00429     int itemsetIndex = 0;
00430     for (int cc = 0; cc < FullF1size; cc++) {
00431         if (C->Name->CheckPosition(cc, CountCheckPosition) > 0) {
00432             ItemsetBuffy[itemsetIndex] = ItemMap[cc];
00433             itemsetIndex++;
00434         }
00435     }
00436 
00437     MFIBySizes[C->Name->_count]++;
00438     if (C->Name->_count > maxItemsetSize)
00439         maxItemsetSize = C->Name->_count;
00440 
00441     if (outputMFI)
00442         outFile->printSet(C->Name->_count, ItemsetBuffy, C->Trans->_count);
00443 
00444     // Update stat variables
00445     MFIDepth += C->Name->_count;
00446     MFISize++;
00447 }
00448 
00449 /////////////////////////////////////////////////////////////////////
00450 /// Add this node's name bitmap to the MFI list
00451 ///
00452 /// @param C                 the current node
00453 /////////////////////////////////////////////////////////////////////
00454 void AddToMFI(TreeNode *C) {
00455     // copy name bitmap
00456     BaseBitmap *name = new BaseBitmap(*C->Name);
00457 
00458     // add to MFI
00459     MFI.push_back(name);
00460 
00461     // update the end of the relevant
00462     C->rEnd++;
00463 
00464     // Update stat variables
00465     MFIDepth += C->Name->_count;
00466     MFISize++;
00467 
00468     SupportCountList.push_back(C->Trans->_count);
00469     MFIBySizes[name->_count]++;
00470     if (name->_count > maxItemsetSize)
00471         maxItemsetSize = name->_count;
00472 }
00473 
00474 /////////////////////////////////////////////////////////////////////
00475 /// Add this node's name bitmap to the FCI list
00476 ///
00477 /// @param C                 the current node
00478 /////////////////////////////////////////////////////////////////////
00479 void AddToFCI(TreeNode *C) {
00480     // Search HT
00481     HashTable::iterator h = HT.find(C->Trans->_count);
00482 
00483     // If the support of node C is NOT in HashSup
00484     if (h == HT.end()) {
00485         // Add a new list to the HashSup
00486         ItemSet* newList = new ItemSet();
00487         newList->reserve(500);
00488         newList->push_back(MFI.size());
00489         HT.insert(HashTable::value_type(C->Trans->_count, newList));
00490 
00491         // Else add pointer to last item in iName to HT entry
00492     } else {
00493         for (ItemSet::reverse_iterator goli = (*h).second->rbegin();
00494                 goli != (*h).second->rend();
00495                 goli++)
00496             if (MFI[*goli]->Superset(C->Name))
00497                 return ;
00498 
00499         // search the table
00500         (*h).second->push_back(MFI.size());
00501     }
00502 
00503     // copy name bitmap
00504     BaseBitmap *name = new BaseBitmap(*C->Name);
00505 
00506     // add to MFI
00507     MFI.push_back(name);
00508 
00509     // Update stat variables
00510     MFIDepth += C->Name->_count;
00511     MFISize++;
00512 
00513     SupportCountList.push_back(C->Trans->_count);
00514     MFIBySizes[name->_count]++;
00515     if (name->_count > maxItemsetSize)
00516         maxItemsetSize = name->_count;
00517 }
00518 
00519 int SortLMFI(int rBegin, int rEnd, int sortBy) {
00520     int left = rBegin;
00521     int right = rEnd - 1;
00522     while (left <= right) {
00523         while (left < rEnd && !MFI[left]->CheckPosition(sortBy, CountCheckPosition))
00524             left++;
00525         while (right >= rBegin && MFI[right]->CheckPosition(sortBy, CountCheckPosition))
00526             right--;
00527         if (left < right) {
00528             // we are now at a point where MFI[left] is relevant
00529             // and MFI[right] is not since left < right, we swap the two
00530             BaseBitmap* tempBitmap = MFI[left];
00531             MFI[left] = MFI[right];
00532             MFI[right] = tempBitmap;
00533             tempBitmap = NULL;
00534             left++;
00535             right--;
00536         }
00537     }
00538 
00539     // the first relevant one for the next node is left
00540     return left;
00541 }
00542 
00543 /////////////////////////////////////////////////////////////////////
00544 /// Determine whether a HUTMFI is true.
00545 ///     - if HUT is in MFI, then HUT is frequent
00546 ///     and the subtree rooted at this node can be pruned
00547 ///
00548 /// @return True if HUT is in the MFI
00549 /////////////////////////////////////////////////////////////////////
00550 bool CheckHUTMFI(TreeNode *C, int iTAIL) {
00551 
00552     // for each element i in the tail form {head} U {i} and check for a
00553     //     superset in the MFI
00554     int rBegin = C->rBegin;
00555     int rEnd = C->rEnd;
00556     for (; iTAIL < C->tEnd; iTAIL++) {
00557         rBegin = SortLMFI(rBegin, rEnd, F1[gTail[iTAIL].Item]->Prefix);
00558 
00559         if (rEnd <= rBegin)
00560             return false;
00561     }
00562 
00563     return true;
00564 }
00565 
00566 /////////////////////////////////////////////////////////////////////
00567 /// Dynamically reorder the elements in the tail by increasing support
00568 ///    - Expand all children and sort by increasing support
00569 ///    - Remove infrequent children
00570 ///
00571 /// @param C                 current node
00572 /// @param iTAIL             index into tail of current node
00573 /// @param useComp           whether compressed bitmaps should be used
00574 /// @param NoChild           whether C has any frequent children
00575 /// @param AllFreq           whether all children are frequent
00576 /////////////////////////////////////////////////////////////////////
00577 void ReorderTail(
00578     TreeNode *C,
00579     int &iTAIL,
00580     bool useComp,
00581     bool &NoChild,
00582     bool &AllFreq) {
00583 
00584     int tailIndex = 0;
00585     int lol = 0;
00586     // for each tail element
00587     for (lol = iTAIL; lol < C->tEnd; lol++) {
00588         Bitmap *trans = TransBuffy[C->Depth];
00589         int theCount = 0;
00590 
00591         // Compress the bitmaps
00592         if (useComp && (F1[gTail[lol].Item]->compID != C->compID)) {
00593             F1[gTail[lol].Item]->Trans->BuildRelComp(
00594                 *TransBuffy[projectDepth]);
00595             F1[gTail[lol].Item]->compID = C->compID;
00596             CountRebuilds++;
00597         }
00598 
00599         // Use the compressed bitmaps
00600         if (useComp) {
00601             // AND the compressed bitmaps and count the result
00602             trans->AndCompOnly(
00603                 *C->Trans,
00604                 *F1[gTail[lol].Item]->Trans,
00605                 CountSmallAnds);
00606             theCount = trans->SmallCount(CountCounts);
00607 
00608             // use the full bitmaps
00609         } else {
00610             // AND & count the bitmaps
00611             trans->AndOnly(*C->Trans, *F1[gTail[lol].Item]->Trans, CountAnds);
00612             theCount = trans->Count(CountCounts);
00613         }
00614 
00615         // If the results is frequent
00616         if (theCount >= MS) {
00617             // if PEP pruning holds
00618             if (PEPrune && (trans->_count == C->Trans->_count)) {
00619                 // Move tail element from tail to head
00620                 C->Name->Or(*C->Name, *F1[gTail[lol].Item]->Name);
00621                 CountPEPrunes++;
00622 
00623                 // add tail element to reordered tail
00624             } else {
00625                 NoChild = false;
00626 
00627                 // create new tail element
00628                 TailBuffy[tailIndex].Count = theCount;
00629                 TailBuffy[tailIndex].Item = gTail[lol].Item;
00630                 tailIndex++;
00631             }
00632         } else
00633             AllFreq = false;
00634     }
00635 
00636     sort(TailBuffy, TailBuffy + tailIndex);
00637 
00638     // Set the begin and end values of the new tail
00639     iTAIL = C->tEnd;
00640     C->tEnd = iTAIL + tailIndex;
00641     C->tBegin = iTAIL;
00642     int r = 0;
00643 
00644     // Copy new tail into next slots
00645     for (lol = iTAIL; lol < C->tEnd; lol++) {
00646         gTail[lol] = TailBuffy[r];
00647         r++;
00648     }
00649 }
00650 
00651 /////////////////////////////////////////////////////////////////////
00652 /// Simply copy over the tail without expanding any of the children
00653 ///    for pure DFS (no expansion of all children)
00654 ///
00655 /// @param C                 current node
00656 /// @param iTAIL             index into tail of current node
00657 /////////////////////////////////////////////////////////////////////
00658 void NoorderTail(
00659     TreeNode *C,
00660     int &iTAIL) {
00661 
00662     // set begin and end tail pointers
00663     iTAIL = C->tEnd;
00664     C->tEnd = C->tEnd - C->tBegin + C->tEnd;
00665     C->tBegin = iTAIL;
00666     int r = 0;
00667 
00668     // copy over old tail to new tail
00669     for (int lol = iTAIL; lol < C->tEnd; lol++) {
00670         gTail[lol] = gTail[lol - C->tEnd + C->tBegin];
00671         r++;
00672     }
00673 }
00674 
00675 
00676 /////////////////////////////////////////////////////////////////////
00677 /// The main MAFIA algorithm function
00678 ///
00679 /// @param C                 the current node
00680 /// @param HUT               whether this is a HUT check (left most branch)
00681 /// @param FHUT              [output] whether the HUT is frequent
00682 /// @param useComp           if compression has been switched on
00683 /////////////////////////////////////////////////////////////////////
00684 void MAFIA(TreeNode *C, bool HUT, bool &FHUT, bool useComp) {
00685     CountNodes++;
00686     int iTAIL = C->tBegin;   // index into the tail
00687     bool NoChild = true;     // whether the node has any frequent children
00688     bool AllFreq = true;     // whether all the children are frequent
00689     FHUT = false;            // whether this is a FHUT
00690     int beforeCountNodes = CountNodes;
00691     int frequentTailSize = C->tEnd - iTAIL;
00692 
00693     if (iTAIL < C->tEnd) {
00694 
00695         if (C != Root) {
00696             if (Reorder) {
00697                 ReorderTail(C, iTAIL, useComp, NoChild, AllFreq);
00698             } else {
00699                 NoorderTail(C, iTAIL);
00700             }
00701         }
00702 
00703         frequentTailSize = C->tEnd - iTAIL;
00704 
00705         int estimateTail = (int)(frequentTailSize / (double)EstimateDiv);
00706         if (estimateTail > 0 && C->Trans->_count != TransCount) {
00707             double estimateSubTree = EstimateBuffy[estimateTail].Sum / (double)EstimateBuffy[estimateTail].Count;
00708             double support = C->Trans->_count / (double) TransCount;
00709             double factor = 11.597 - 29.914 * (support - .52392) * (support - .52392);
00710             double cost = abs(factor * frequentTailSize / (1 - 1.2 * support));
00711             //double cost = 5 * frequentTailSize / (1 - support);
00712 
00713             // check if relative comp should be performed
00714             if ((!useComp) && (estimateSubTree > cost)) {
00715 
00716                 // build the relative bitmap for source [node bitmap] (all 1's)
00717                 C->Trans->BuildSource();
00718 
00719                 // remember the depth of the FULL bitmap your projecting from
00720                 projectDepth = C->Depth - 1;
00721 
00722                 // increment the ID
00723                 C->compID = MAX_compID;
00724                 MAX_compID++;
00725                 useComp = true;
00726             }
00727         }
00728 
00729         // Candidate generation - extend the Head with the tail elements
00730         // We start from the end of the tail and move backwards
00731         // Therefore the tail is iterated through in increasing support,
00732         // but is stored in decreasing support.
00733         while (iTAIL < C->tEnd) {
00734             // form a one-extension
00735             Bitmap *trans = TransBuffy[C->Depth];
00736             BaseBitmap *name = NameBuffy[C->Depth];
00737             TreeNode *newNode = NodeBuffy[C->Depth];
00738 
00739             // create name for the new node
00740             name->Or(*C->Name, *F1[gTail[iTAIL].Item]->Name);
00741 
00742             // compress the bitmaps if warranted
00743             if (useComp && (F1[gTail[iTAIL].Item]->compID != C->compID)) {
00744                 // build the relative for this node
00745                 F1[gTail[iTAIL].Item]->Trans->BuildRelComp(
00746                     *TransBuffy[projectDepth]);
00747                 F1[gTail[iTAIL].Item]->compID = C->compID;
00748                 CountRebuilds++;
00749             }
00750 
00751             int theCount = 0;
00752 
00753             // use the compressed bitmaps for ANDing and counting
00754             if (useComp) {
00755                 // AND and count small bitmaps
00756                 trans->AndCompOnly(
00757                     *C->Trans,
00758                     *F1[gTail[iTAIL].Item]->Trans,
00759                     CountSmallAnds);
00760 
00761                 if (Reorder)
00762                     trans->_count = gTail[iTAIL].Count;
00763                 else
00764                     theCount = trans->SmallCount(CountCounts);
00765             } else {
00766                 // AND and count the full bitmaps
00767                 trans->AndOnly(
00768                     *C->Trans,
00769                     *F1[gTail[iTAIL].Item]->Trans,
00770                     CountAnds);
00771 
00772                 if (Reorder)
00773                     trans->_count = gTail[iTAIL].Count;
00774                 else
00775                     theCount = trans->Count(CountCounts);
00776             }
00777             if (!Reorder && PEPrune && (theCount == C->Trans->_count)) {
00778                 CountPEPrunes++;
00779                 C->Name->Or(*C->Name, *F1[gTail[iTAIL].Item]->Name);
00780                 iTAIL++;
00781                 continue;
00782             }
00783 
00784             // Determine whether this candidate will be a HUT
00785             // Conceptually the leftmost branch of the tree is a HUT check
00786             if ((iTAIL != C->tBegin) && (C != Root))
00787                 HUT = 0;
00788             else
00789                 HUT = 1;
00790 
00791             if (!AllFreq)
00792                 HUT = 0;
00793 
00794             if (Reorder || (theCount >= MS)) {
00795                 // form the 1-extension node
00796                 newNode->setTreeNode(name,
00797                                      trans,
00798                                      C->Depth + 1,
00799                                      C->compID,
00800                                      F1[gTail[iTAIL].Item]->Prefix,
00801                                      iTAIL + 1,
00802                                      C->tEnd);
00803 
00804                 // setup the LMFI for the next level; it contains all
00805                 // itemsets in LMFI for this level that also include the
00806                 // one we're extending the node with.  We do sort of a
00807                 // quicksort thing to move the relevant itemsets for the
00808                 // next node to the end of the portion of the MFI relevant
00809                 // to this node
00810 
00811                 newNode->rEnd = C->rEnd;
00812                 newNode->rBegin = SortLMFI(C->rBegin, C->rEnd, newNode->Prefix);
00813 
00814                 // Check for HUT in MFI for remaining tail
00815                 if (HUTMFI && newNode->tBegin != newNode->tEnd && !HUT) {
00816                     CountHUTMFI++;
00817 
00818                     if (CheckHUTMFI(newNode, newNode->tBegin)) {
00819                         // stop generation of extensions
00820                         CountHUTMFISuccess++;
00821                         AllFreq = false;
00822                         break;
00823                     }
00824                 }
00825 
00826                 NoChild = false;
00827 
00828                 // recurse down the tree
00829                 MAFIA(newNode, HUT, FHUT, useComp);
00830 
00831                 // Add those discovered from lower levels to the current LMFI
00832                 // LMFI_l = LMFI_l \union LMFI_{l+1}
00833                 // all we need to do is to update the end pointer
00834                 C->rEnd = newNode->rEnd;
00835             } else
00836                 AllFreq = false;
00837 
00838             // if this was a successful HUT check
00839             if (FHUT) {
00840                 // keep going up the tree
00841                 if (HUT) {
00842                     return ;
00843 
00844                     // reached start of HUT, so stop generation of subtree
00845                     // rooted at this node
00846                 } else {
00847                     FHUT = false;
00848                     break;
00849                 }
00850             }
00851 
00852             // Move on the next tail element
00853             iTAIL++;
00854         }
00855     }
00856 
00857     // if this is a FHUT
00858     if (GoFHUT && HUT && AllFreq) {
00859         FHUT = true;
00860         CountFHUT++;
00861     }
00862 
00863     // if this node is childless and not in MFI
00864     if (MethodIsFI)
00865         AddToFI(C);
00866     else if (MethodIsFCI && C != Root)
00867         AddToFCI(C);
00868     else if (NoChild && !LMFISuperSet(C)) {
00869         AddToMFI(C);
00870     }
00871 
00872     int subtreeSize = CountNodes - beforeCountNodes + 1;
00873     int estimateTail = (int)(frequentTailSize / (double)EstimateDiv);
00874     if (estimateTail > 0 && C->Trans->_count != TransCount) {
00875         EstimateBuffy[estimateTail].Count++;
00876         EstimateBuffy[estimateTail].Sum += subtreeSize;
00877     }
00878 }
00879 
00880 
00881 /////////////////////////////////////////////////////////////////////
00882 /// Merge repeated itemsets into one combined itemset
00883 ///    - e.g. if (transaction set of item 4) AND
00884 ///    (transaction set item 5) = (transaction set item 5),
00885 ///    then item 5 is a duplicate of item 4
00886 ///    due to increasing support
00887 /////////////////////////////////////////////////////////////////////
00888 void MergeRepeatedItemsets() {
00889     NodeList::iterator bali = F1.begin();
00890     Bitmap out(*(*bali)->Trans);
00891     int blah = 0;
00892 
00893     // for each frequent 1-itemset
00894     while (bali != F1.end()) {
00895         out.FillOnes();
00896         NodeList::iterator noli = bali;
00897         noli++;
00898 
00899         // search for a copy of the itemset's transaction set
00900         while (noli != F1.end()) {
00901             // stop when count is no longer the same
00902             if ((*bali)->Trans->_count != (*noli)->Trans->_count)
00903                 break;
00904             else {
00905                 // AND itemsets with the same count
00906                 out.AndOnly(*(*bali)->Trans, *(*noli)->Trans, CountAnds);
00907                 out.Count(blah);
00908 
00909                 // check for a duplicate
00910                 if (out._count == (*noli)->Trans->_count) {
00911                     (*bali)->Name->Or(*(*bali)->Name, *(*noli)->Name);
00912                     F1.erase(noli);
00913                 } else
00914                     noli++;
00915             }
00916         }
00917         bali++;
00918     }
00919 }
00920 /** @} */
00921 
00922 
00923 /*********************************************************************
00924  MAIN
00925 *********************************************************************/
00926 /// @defgroup MainFunction Main Function
00927 /// Main function for running the program
00928 /** @{ */
00929 
00930 /////////////////////////////////////////////////////////////////////
00931 // main function for the program
00932 /////////////////////////////////////////////////////////////////////
00933 int main(int argc, char **argv) {
00934     // Check parameters
00935 
00936     if (argc < 5) {
00937         cerr << "Usage: " << argv[0] << " [-mfi/-fci/-fi] [min sup (percent)] " << endl;
00938         cerr << "\t[-ascii/-binary] [input filename] " << endl;
00939         cerr << "\t[output filename (optional)]" << endl;
00940         cerr << "Ex: " << argv[0] << " -mfi .5 -ascii connect4.ascii mfi.txt" << endl;
00941         cerr << "Ex: " << argv[0] << " -mfi .3 -binary chess.binary" << endl;
00942         exit(0);
00943     }
00944 
00945     // time hook
00946     time(&total_start);
00947 
00948     // get the algorithm type
00949     method = argv[1];
00950 
00951     // Minimum support as a fraction
00952     MSF = atof(argv[2]);
00953     
00954     if (strcmp(argv[3], "-ascii") == 0)
00955         F1FromFile(argv[4], true);
00956     else if (strcmp(argv[3], "-binary") == 0)
00957         F1FromFile(argv[4], false);
00958     else {
00959         cerr << "File format must be -ascii or -binary" << endl;
00960         exit(1);
00961     }
00962 
00963     if (argc == 6) {
00964         outputMFI = true;
00965         outFilename = argv[5];
00966     }    
00967 
00968     // Create a null node to begin the DFS tree
00969     NullTrans = new Bitmap(TransCount);
00970     NullTrans->FillOnes();
00971     NullTrans->_count = TransCount;
00972     ItemSet NullList;
00973 
00974     MFIBySizes = new int[MAX_ITEMSET_SIZE];
00975     for (int h = 0; h < MAX_ITEMSET_SIZE; h++) {
00976         MFIBySizes[h] = 0;
00977     }
00978 
00979     // Set size of F1
00980     FullF1size = F1size = F1.size();
00981 
00982     // if F1 is not empty
00983     if (FullF1size != 0) {
00984 
00985         BaseBitmap *NullName = new BaseBitmap(FullF1size);
00986         MFI.reserve(100000);
00987 
00988         int p = 0;
00989         ItemMap = new int[FullF1size];
00990         ItemsetBuffy = new int[FullF1size];
00991 
00992         // Rename items in F1
00993         for (NodeList::iterator nli = F1.begin(); nli != F1.end(); nli++) {
00994             // store old itemid
00995             ItemMap[p] = (*nli)->Prefix;
00996 
00997             // assign new itemid
00998             (*nli)->Prefix = p;
00999 
01000             // assign name bitmaps
01001             (*nli)->Name = new BaseBitmap(FullF1size);
01002             (*nli)->Name->FillEmptyPosition(p);
01003             (*nli)->Name->_count = 1;
01004             p++;
01005         }
01006 
01007         // handle repeated itemsets
01008 
01009         MergeRepeatedItemsets();
01010         F1size = F1.size();
01011 
01012         // Create global tail
01013         maxtail = F1size * (F1size + 1) / 2;
01014         gTail = new TailElement[maxtail];
01015 
01016         // Create buffer for sorting
01017         TailBuffy = new TailElement[F1size];
01018 
01019         // Create buffer for estimating size of each subtree
01020         EstimateSize = (int)ceil(F1size / (double)EstimateDiv);
01021         EstimateBuffy = new SubtreeEstimate[EstimateSize];
01022         for (int estimateIndex = 0; estimateIndex < EstimateSize; estimateIndex++) {
01023             EstimateBuffy[estimateIndex].Count = 1;
01024             EstimateBuffy[estimateIndex].Sum = estimateIndex * EstimateDiv * estimateIndex * EstimateDiv / 2;
01025         }
01026 
01027         // Initialize global tail
01028         int uu;
01029         for (uu = 0; uu < maxtail; uu++) {
01030             gTail[uu].Item = -1;
01031             gTail[uu].Count = 0;
01032         }
01033 
01034         // Fill global tail
01035         for (uu = 0; uu < F1size; uu++) {
01036             gTail[uu].Item = uu;
01037             gTail[uu].Count = F1[uu]->Trans->_count;
01038 
01039             // assign tail index
01040             F1[uu]->tBegin = uu + 1;
01041             if (uu == F1size - 1)
01042                 F1[uu]->tBegin = -1;
01043 
01044             TempName = new BaseBitmap(FullF1size);
01045 
01046             // add a buffer element for each item in F1
01047             BaseBitmap *name = new BaseBitmap(FullF1size);
01048             NameBuffy.push_back(name);
01049             Bitmap *buff = new Bitmap(TransCount);
01050             TransBuffy.push_back(buff);
01051             TreeNode *newNode = new TreeNode();
01052             NodeBuffy.push_back(newNode);
01053         }
01054 
01055         srand(666);
01056         bool FHUT;
01057 
01058         // start algorithm timer
01059         clock_t start, finish;
01060         double duration = -1;
01061         start = clock();
01062 
01063         time(&algorithm_start);
01064 
01065         // create root node and its associated tail
01066         Root = new TreeNode(NullName, NullTrans, 0, 0, -1, 0, F1size);
01067 
01068         //Nothing is in MFI, so nothing is relevant
01069         Root->rBegin = 0;
01070         Root->rEnd = 0;
01071 
01072         // run the appropriate algorithm
01073         if (method.compare("-fci") == 0) {
01074             //cout << "running closure (FCI) algorithm..." << endl;
01075 
01076             GoFHUT = false;      // FHUT flag
01077             HUTMFI = false;      // HUTMFI flag
01078             PEPrune = true;      // PEPrune flag
01079             Reorder = true;      // Reorder flag
01080             MethodIsFCI = true;
01081 
01082             MAFIA(Root, false, FHUT, false);
01083         } else if (method.compare("-mfi") == 0) {
01084             //cout << "running MFI algorithm..." << endl;
01085             GoFHUT = true;      // FHUT flag
01086             HUTMFI = true;      // HUTMFI flag
01087             PEPrune = true;     // PEPrune flag
01088             Reorder = true;     // Reorder flag
01089 
01090             MAFIA(Root, true, FHUT, false);
01091         } else if (method.compare("-fi") == 0) {
01092             if (outputMFI) {
01093                 // open output file
01094                 outFile = new ItemsetOutput(outFilename);
01095                 if (!outFile->isOpen()) {
01096                     cerr << "Output file not open!" << endl;
01097                     exit(1);
01098                 }
01099             }
01100 
01101             //cout << "running FI algorithm..." << endl;
01102             GoFHUT = false;      // FHUT flag
01103             HUTMFI = false;      // HUTMFI flag
01104             PEPrune = false;     // PEPrune flag
01105             Reorder = false;     // Reorder flag
01106             MethodIsFI = true;
01107 
01108             MAFIA(Root, false, FHUT, false);
01109 
01110             if (outputMFI) {
01111                 delete outFile;
01112             }
01113         } else {
01114             cerr << "Invalid algorithm option!" << endl;
01115             exit(0);
01116         }
01117 
01118         finish = clock();
01119         duration = (finish - start) / (double)CLOCKS_PER_SEC;
01120 
01121         //printf( "Algorithm CPU time: %.3f seconds.\n", duration );
01122 
01123         time(&algorithm_finish);
01124         algorithm_time = difftime(algorithm_finish, algorithm_start);
01125         printf("Algorithm time:        %.2f seconds.\n", algorithm_time);
01126     } else {
01127         MFIBySizes[0]++;
01128     }    
01129 
01130     /*
01131     // Print out MFI length distribution
01132     for (int sizeIndex = 0; sizeIndex <= maxItemsetSize; sizeIndex++) {
01133         cout << sizeIndex << " " << MFIBySizes[sizeIndex] << endl;
01134     }
01135     */
01136 
01137     if (outputMFI && !MethodIsFI) {
01138         // PrintMFI to a file
01139         //cout << "Printing out the mfi..." << endl;
01140         time(&print_start);
01141 
01142         PrintMFI();
01143 
01144         time(&print_finish);
01145         print_time = difftime(print_finish, print_start);
01146         printf("Printing output time:  %.2f seconds.\n", print_time);
01147     }
01148 
01149     time(&total_finish);
01150     total_time = difftime(total_finish, total_start);
01151     printf("Total time:            %.2f seconds.\n\n", total_time);
01152 
01153     // output stat data
01154     cout << "MinSup:      " << MS << endl;
01155     cout << "TransCount:  " << TransCount << endl;
01156     cout << "F1size:      " << FullF1size << endl;
01157     cout << "MFISize:     " << MFI.size() << endl;
01158     cout << "MFIAvg:      " << MFIDepth / (double) MFISize << endl << endl;
01159 
01160     #ifdef DEBUG
01161 
01162     cout << "CountNodes: " << CountNodes << endl;
01163     cout << "CountAnds: " << CountAnds << endl;
01164     cout << "CountSmallAnds: " << CountSmallAnds << endl;
01165     cout << "CountRebuilds: " << CountRebuilds << endl;
01166     cout << "CountCounts: " << CountCounts << endl << endl;
01167     cout << "CountFHUT:   " << CountFHUT << endl;
01168     cout << "CountHUTMFI: " << CountHUTMFI << endl;
01169     cout << "CountHUTMFISuccess: " << CountHUTMFISuccess << endl;
01170     cout << "CountCheckPosition:   " << CountCheckPosition << endl;
01171     cout << "CountPEPrunes:        " << CountPEPrunes << endl << endl;
01172 
01173     #endif
01174 
01175     return 0;
01176 }
01177 /** @} */
01178 
01179 
01180 /////////////////////////////////////////////////////////////////////
01181 /// @mainpage MAFIA Code Documentation
01182 /// \image html MafiaLogo.jpg
01183 ///
01184 /// \section ContactSection Contact
01185 /// - Manuel Calimlim (calimlim@cs.cornell.edu)
01186 /// - Johannes Gehrke (johannes@cs.cornell.edu)
01187 ///
01188 /// \section DownloadSection Download
01189 /// - Main MAFIA webpage
01190 ///   http://himalaya-tools.sourceforge.net/Mafia
01191 ///
01192 /// \section LinuxCompilationSection Linux Compilation
01193 /// -# ./configure
01194 /// -# make
01195 ///     - executable is created as 'src/mafia'
01196 ///
01197 /// \section WindowsCompilationSection Windows Compilation
01198 /// -# Use cygwin and follow the Linux instructions
01199 /// or
01200 /// -# Use a Windows Compiler or IDE (such as Visual Studio)
01201 /// to compile the source code.
01202 ///
01203 /// \section DirectoryStructureSection Directory Structure
01204 /// <table border=1 cellspacing=5 cellpadding=5>
01205 /// <tr>
01206 /// <td>admin/</td>
01207 /// <td>Contains config files for compiling.  Should
01208 ///     not be altered.</td>
01209 /// </tr>
01210 /// <tr>
01211 /// <td>src/</td>
01212 /// <td>Contains all of source code for MAFIA</td>
01213 /// </tr>
01214 /// <tr>
01215 /// <td>src/Transaction.h
01216 /// <br>src/Transaction.cpp</td>
01217 /// <td>Class for reading transactions from ASCII datasets</td>
01218 /// </tr>
01219 /// <tr>
01220 /// <td>src/ItemsetOutput.h
01221 /// <br>src/ItemsetOutput.cpp</td>
01222 /// <td>Class for writing itemsets to file output</td>
01223 /// </tr>
01224 /// <tr>
01225 /// <td>src/BaseBitmap.h
01226 /// <br>src/BaseBitmap.cpp</td>
01227 /// <td>Simple bitmap class for name bitmaps</td>
01228 /// </tr>
01229 /// <tr>
01230 /// <td>src/Bitmap.h
01231 /// <br>src/Bitmap.cpp</td>
01232 /// <td>Main bitmap class for transaction bitmaps</td>
01233 /// </tr>
01234 /// <tr>
01235 /// <td>src/Mafia.cpp</td>
01236 /// <td>Main class file with most of the MAFIA code</td>
01237 /// </tr>
01238 /// <tr>
01239 /// <td>src/Tables.h</td>
01240 /// <td>Stores precomputed lookup tables (not included in
01241 ///    documentation due to very large tables)</td>
01242 /// </tr>
01243 /// <tr>
01244 /// <td>src/TreeNode.h</td>
01245 /// <td>Class for representing nodes in the search tree</td>
01246 /// </tr>
01247 /// <tr>
01248 /// <td>INSTALL</td>
01249 /// <td>Generic installation instructions for Linux/Unix</td>
01250 /// </tr>
01251 /// <tr>
01252 /// <td>mafia.{kdevprj,kdevses}</td>
01253 /// <td>KDevelop project files for Linux</td>
01254 /// </tr>
01255 /// <tr>
01256 /// <td>README</td>
01257 /// <td>Pointer this page</td>
01258 /// </tr>
01259 /// </table>
01260 ///
01261 /// \section UsageSection Program Usage
01262 /// <pre>
01263 /// Usage: mafia [-mfi/-fci/-fi] [min sup (percent)]
01264 ///         [-ascii/-binary] [input filename]
01265 ///         [output filename (optional)]
01266 /// Ex: mafia -mfi .5 -ascii connect4.ascii mfi.txt
01267 /// Ex: mafia -mfi .3 -binary chess.binary
01268 /// </pre>
01269 ///
01270 /// \section InputSection File Input
01271 /// Datasets can be in ASCII or binary format.
01272 /// For ASCII files, the file format must be:
01273 /// <pre>
01274 /// [item_id_1] [item_id_2] ... [item_id_n]
01275 /// </pre>
01276 ///
01277 /// Items do not have to be sorted within each transaction.
01278 /// Items are separated by spaces and each transaction
01279 /// should end with a newline, e.g.
01280 /// <pre>
01281 /// 1 4 2
01282 /// 2 8 9 4
01283 /// 2 5
01284 /// </pre>
01285 ///
01286 /// For binary files, the file format must be:
01287 /// <pre>
01288 /// [custid][transid][number of items][itemid_1][itemid_2] ... [itemid_n]
01289 /// </pre>
01290 ///
01291 /// The custid and transid numbers are ignored at this time.
01292 /// Since the file is in binary format, all numbers are read
01293 /// as integers.
01294 ///
01295 /// \section DatasetsSection Datasets
01296 /// Download <a href="http://sourceforge.net/project/showfiles.php?group_id=72667">Datasets-ascii.tar.gz</a>
01297 /// or <a href="http://sourceforge.net/project/showfiles.php?group_id=72667">Datasets-binary.tar.gz</a>
01298 /// for the full set of datasets used for testing:
01299 /// - chess.{ascii,binary}
01300 /// - connect4.{ascii,binary}
01301 /// - mushroom.{ascii,binary}
01302 /// - pumsb.{ascii,binary}
01303 /// - pumsb_star.{ascii,binary}
01304 ///
01305 /// \section OutputSection Program Output
01306 /// The frequent itemsets outputted by the program are
01307 /// in ASCII form with the following format:
01308 /// <pre>
01309 /// [item_id_1] [item_id_2] ... [item_id_n] [(support)]
01310 /// </pre>
01311 /// Ex:
01312 /// <pre>
01313 /// 28 64 42 60 40 29 52 58 (966)
01314 /// 46 64 42 3 25 9 5 48 66 56 34 62 7 36 60 40 29 52 58 (962)
01315 /// 39 36 40 29 52 58 (960)
01316 /// </pre>
01317 /////////////////////////////////////////////////////////////////////
01318 

Generated on Thu Dec 4 15:22:06 2003 for MAFIA by doxygen1.2.18