Main Page | Modules | Namespace List | Data Structures | File List | Data Fields | Globals

FileInput.cpp

Go to the documentation of this file.
00001 /*
00002  
00003 Copyright (c) 2004, Cornell University
00004 All rights reserved.
00005  
00006 Redistribution and use in source and binary forms, with or without
00007 modification, are permitted provided that the following conditions are met:
00008  
00009    - Redistributions of source code must retain the above copyright notice,
00010        this list of conditions and the following disclaimer.
00011    - Redistributions in binary form must reproduce the above copyright
00012        notice, this list of conditions and the following disclaimer in the
00013        documentation and/or other materials provided with the distribution.
00014    - Neither the name of Cornell University nor the names of its
00015        contributors may be used to endorse or promote products derived from
00016        this software without specific prior written permission.
00017  
00018 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
00019 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00020 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
00021 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
00022 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
00023 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
00024 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
00025 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
00026 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
00027 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
00028 THE POSSIBILITY OF SUCH DAMAGE.
00029  
00030 */
00031 
00032 
00033 /////////////////////////////////////////////////////////////////////
00034 /// FileInput.cpp
00035 /////////////////////////////////////////////////////////////////////
00036 //#define ___PREFIXSPAN___
00037 
00038 #include <fstream>
00039 #include <math.h>
00040 #include <stdio.h>
00041 #include "SeqBitmap.h"
00042 #include "Bitmap4.h"
00043 #include "Bitmap8.h"
00044 #include "Bitmap16.h"
00045 #include "Bitmap32.h"
00046 #include "Bitmap64.h"
00047 #include "DatasetInfo.h"
00048 #include "ResizableArray.h"
00049 #include "StringMap.h"
00050 #include <iostream>
00051 
00052 #define FILEERR_64TRANSACTIONS 1
00053 #define FILEERR_NOTFOUNDBINARY 2
00054 #define FILEERR_NOTFOUNDASCII 3
00055 
00056 /////////////////////////////////////////////////////////////////////
00057 /// @addtogroup FileInputGroup File Input Functions
00058 /// It contains all the functions necessary for reading in datasets.
00059 /** @{ */
00060 
00061 /// number of different bitmaps in SeqBitmap
00062 const int NUM_BITMAP = 5;
00063 
00064 /// the size of each customer data in each bitmap
00065 const int BITMAP_LENGTH[5] =
00066     {
00067         4, 8, 16, 32, 64
00068     };
00069 
00070 /// tempAndBitmap, specialBitmap, returnBitmap, SBitmap
00071 const int NUM_BITMAPS_USED = 4;
00072 
00073 /// Maximum length of strings used to represent
00074 /// customers, transactions, and items when -str is used
00075 const int MAX_STRING_SIZE = 256;
00076 
00077 /////////////////////////////////////////////////////////////////////
00078 /// It prints an error related to the data file input
00079 ///
00080 /// @param errorType         the type of file read error encountered
00081 ///
00082 /////////////////////////////////////////////////////////////////////
00083 void PrintFileReadError(int errorType)
00084 {
00085     cerr << "\nInput file error:\n\n";
00086 
00087     switch (errorType)
00088     {
00089     case FILEERR_64TRANSACTIONS:
00090         cerr << "A customer has more than 64 transactions.\n";
00091         break;
00092     case FILEERR_NOTFOUNDBINARY:
00093         cerr << "The input file either does not exist, or is not\n";
00094         cerr << "a valid binary input file (Perhaps try running\n";
00095         cerr << "SPAM with the -ascii flag to see if it is an\n";
00096         cerr << "ASCII input file)\n";
00097         break;
00098     case FILEERR_NOTFOUNDASCII:
00099         cerr << "The input file either does not exist, or is not\n";
00100         cerr << "a valid ascii input file. If the input file was\n";
00101         cerr << "automatically generated using a program like AssocGen,\n";
00102         cerr << "perhaps try running SPAM without the -ascii flag to\n";
00103         cerr << "see if it is a binary input file.\n";
00104         break;
00105     default:
00106         cerr << "Unknown file read error.\n";
00107     }
00108 
00109     cerr << "\nNotes about the input file:\n";
00110     cerr << "The input file should be an ASCII text file containing\n";
00111     cerr << "three integers, separated by spaces, on each line:\n";
00112     cerr << "<Customer ID> <Transaction ID> <Item ID>\n";
00113     cerr << "Customer IDs and Item IDs should be assigned relative to\n";
00114     cerr << "the overall transactional database. Transaction IDs should\n";
00115     cerr << "be assigned relative to the customer they belong to. Each\n";
00116     cerr << "customer can have no more than 64 transactions. Make sure to\n";
00117     cerr << "use the -ascii flag, since the input is ASCII text.\n";
00118 
00119     exit(0);
00120 }
00121 
00122 /////////////////////////////////////////////////////////////////////
00123 /// It resizes the array to have an increased size.
00124 ///
00125 /// @param array             pointer to the array
00126 /// @param oldSize           the size of the array
00127 /// @param newSize           the desired size of the array
00128 /////////////////////////////////////////////////////////////////////
00129 void IncArraySize(int*& array, int oldSize, int newSize)
00130 {
00131     int i;
00132 
00133     // create a new array and copy data to the new one
00134     int *newArray = new int[newSize];
00135     for (i = 0;i < oldSize;i++)
00136         newArray[i] = array[i];
00137     for (i = oldSize;i < newSize;i++)
00138         newArray[i] = 0;
00139 
00140     // deallocate the old array and redirect the pointer to the new one
00141     delete [] array;
00142     array = newArray;
00143 }
00144 
00145 /////////////////////////////////////////////////////////////////////
00146 /// It collects information about a binary data file. It finds the number of
00147 /// customers, the number of items, the number of transactions in the dataset.
00148 /// It also finds the number of transactions each customer has and the number
00149 /// of customers having a particular item in their transactions.
00150 ///
00151 /// Note:
00152 ///  - Memory should not be allocated for custTransCount and itemCustCount
00153 ///    before calling this function.
00154 ///  - Memory have to be deallocated for custTransCount and itemCustCount by
00155 ///    the caller.
00156 ///  - File format: [custID] [transID] [number of item] [itemID1, ...]
00157 ///    [custID] ...
00158 ///  - Assume that transactions of a customer appears together in the file.
00159 ///    It assures that the following case will not happen:
00160 ///   [custID-1]... [custID-1]... [custID-2]... [custID-1]...
00161 ///
00162 /// @param filename          the filename of the data file
00163 /// @param custCount         [output] the number of customers
00164 /// @param itemCount         [output] the number of items
00165 /// @param transCount        [output] the number of transactions
00166 /// @param custTransCount    [output] number of transactions each customer has
00167 /// @param itemCustCount     [output] number of customers having each item
00168 ///                                   in their transactions
00169 /// @param cids [output] customer ids exactly as they appear in the file
00170 /// @param tids   [output] transaction ids exactly as they appear in the file
00171 /// @param iids   [output] item ids exactly as they appear in the file
00172 /// @param transLens [output] transaction lengths exactly as they appear in the file
00173 /// @param overallCount   [output] length of the cids, tids, and iids arrays
00174 /// @param transLensLength [output] length of transLens array
00175 /// @return true - if the reading is successful.
00176 ///         false - if there is an error in the reading process.
00177 /////////////////////////////////////////////////////////////////////
00178 bool CollectBinaryInfo(
00179     char* filename,
00180     int& custCount,
00181     int& itemCount,
00182     int& transCount,
00183     int*& custTransCount,
00184     int*& itemCustCount,
00185     int*& cids,
00186     int*& tids,
00187     int*& iids,
00188     int*& transLens,
00189     int& overallCount,
00190     int& transLensLength)
00191 {
00192 
00193     // --- Variables
00194     // initialize local variables
00195     ResizableArray * cidArr = new ResizableArray(64);
00196     ResizableArray * tidArr = new ResizableArray(64);
00197     ResizableArray * iidArr = new ResizableArray(64);
00198     ResizableArray * transLensArr = new ResizableArray(64);
00199     int custID;                   // current customer ID
00200     int transID;                  // current transaction ID
00201     int numItem;                  // number of items
00202     int *itemlist;                // list of items in the current transaction
00203     ifstream inFile;              // file handler
00204     int custTransSize = 400;      // size of the custTransCount array
00205     int itemCustSize = 400;       // size of the itemCustCount array
00206     bool useStdin;                // Whether or not the input is coming from stdin
00207 
00208     // --- Read File
00209     // open the binary file
00210     if (filename == 0)
00211         useStdin = true;
00212     else
00213         useStdin = false;
00214 
00215     if (!useStdin)
00216     {
00217         inFile.open(filename, ios::binary);
00218         if (!inFile.is_open())
00219         {
00220             PrintFileReadError(FILEERR_NOTFOUNDBINARY);
00221             return false;
00222         }
00223     }
00224 
00225     // initialize output variables
00226     // XXX- do we want these initially set to -1?
00227     custCount = -1;               // # of customers in the dataset (largest ID)
00228     itemCount = -1;               // # of items in the dataset (largest ID)
00229     transCount = 0;               // number of transaction
00230     custTransCount = new int[custTransSize];
00231     itemCustCount = new int[itemCustSize];
00232 
00233     for (int cti = 0; cti < custTransSize; cti++)
00234         custTransCount[cti] = 0;
00235 
00236     for (int ici = 0; ici < itemCustSize; ici++)
00237         itemCustCount[ici] = 0;
00238 
00239     // this array stores the ID of the previous customer we have scanned and
00240     // has a certain item in his/her transactions.
00241     int *itemPrevCustID = new int[itemCustSize];
00242     for (int ipi = 0; ipi < itemCustSize; ipi++)
00243         itemPrevCustID[ipi] = -1;
00244 
00245     // read data
00246     while ((!useStdin && !inFile.eof()) || (useStdin && !cin.eof()))
00247     {
00248 
00249         if (useStdin)
00250         {
00251             // infomation about a transaction
00252             cin.read((char *)&custID, sizeof(int));
00253             cin.read((char *)&transID, sizeof(int));
00254             cin.read((char *)&numItem, sizeof(int));
00255 
00256             itemlist = new int[numItem];
00257 
00258             // read in the items of the transaction
00259             cin.read((char *)itemlist, numItem * sizeof(int));
00260         }
00261         else
00262         {
00263             // infomation about a transaction
00264             inFile.read((char *)&custID, sizeof(int));
00265             inFile.read((char *)&transID, sizeof(int));
00266             inFile.read((char *)&numItem, sizeof(int));
00267 
00268             itemlist = new int[numItem];
00269 
00270             // read in the items of the transaction
00271             inFile.read((char *)itemlist, numItem * sizeof(int));
00272         }
00273 
00274         // just in case we reach the end of the file
00275         if ((!useStdin && inFile.eof()) || (useStdin && cin.eof()))
00276         {
00277             delete [] itemlist;
00278             break;
00279         }
00280 
00281         transLensArr->Add(numItem);
00282         for (int i = 0; i < numItem; i++)
00283         {
00284             // Copy the line of data into our resizable arrays
00285             cidArr->Add(custID);
00286             tidArr->Add(transID);
00287             iidArr->Add(itemlist[i]);
00288         }
00289 
00290 
00291         // -- update the statistcs about customers
00292         if (custID >= custCount)
00293         {
00294             custCount = custID + 1;
00295 
00296             // if custTransCount is not big enough, reallocate memory
00297             if (custCount > custTransSize)
00298             {
00299                 int newSize = (custCount > 2 * custTransSize) ?
00300                               custCount : 2 * custTransSize;
00301                 IncArraySize(custTransCount, custTransSize, newSize);
00302                 custTransSize = newSize;
00303             }
00304         }
00305         custTransCount[custID]++;
00306         transCount++;
00307 
00308 
00309         // -- update the statistics about items
00310         for (int ici = 0; ici < numItem; ici++)
00311         {
00312             if (itemlist[ici] >= itemCount)
00313                 itemCount = itemlist[ici] + 1;
00314         }
00315 
00316         // make sure itemCustCount and prevCust are large enough
00317         if (itemCount >= itemCustSize)
00318         {
00319             int newSize = (itemCount > 2 * itemCustSize) ?
00320                           itemCount : 2 * itemCustSize;
00321             IncArraySize(itemCustCount, itemCustSize, newSize);
00322             IncArraySize(itemPrevCustID, itemCustSize, newSize);
00323             itemCustSize = newSize;
00324         }
00325 
00326         for (int itemIndex = 0; itemIndex < numItem; itemIndex++)
00327         {
00328             // update itemCustCount only if the item is from a diff customer
00329             if (itemPrevCustID[itemlist[itemIndex]] != custID)
00330             {
00331                 itemCustCount[itemlist[itemIndex]]++;
00332                 itemPrevCustID[itemlist[itemIndex]] = custID;
00333             }
00334         }
00335 
00336         delete [] itemlist;
00337     }
00338 
00339     delete [] itemPrevCustID;
00340     if (!useStdin)
00341         inFile.close();
00342 
00343     // Copy the resizable array contents to the arrays containing
00344     // the in-memory cid/tid/iid lists
00345     cidArr->ToArray(cids, overallCount);
00346     tidArr->ToArray(tids, overallCount);
00347     iidArr->ToArray(iids, overallCount);
00348     transLensArr->ToArray(transLens, transLensLength);
00349     delete cidArr;
00350     delete tidArr;
00351     delete iidArr;
00352     delete transLensArr;
00353 
00354     return true;
00355 }
00356 
00357 /////////////////////////////////////////////////////////////////////
00358 /// It collects information about an ASCII data file. It finds the number of
00359 /// customers, the number of items, the number of line in the file. It
00360 /// also finds the number of transactions each customer has and the number of
00361 /// customers having a particular item in their transactions.
00362 ///
00363 /// Note:
00364 ///  - Memory should not be allocated for custTransCount and itemCustCount
00365 ///    before calling this function.
00366 ///  - Memory have to be deallocated for custTransCount and itemCustCount by
00367 ///    the caller.
00368 ///  - File format: [custID] [transID] [itemID]
00369 ///        [custID] [transID] [itemID] ...
00370 ///  - Assume that transactions of a customer appears together in the file.
00371 ///    It assures that the following case will not happen:
00372 ///    [custID-1]... [custID-1]... [custID-2]... [custID-1]...
00373 ///  - Assume that items of a transactions appears together in the file.
00374 ///    It assures that the following case will not happen:
00375 ///   ... [transID-1] ...
00376 ///   ... [transID-2] ...
00377 ///   ... [transID-1] ...
00378 ///
00379 /// @param filename          the filename of the data file
00380 /// @param isStringFile      whether the input file contains integers or the string names
00381 /// @param custStrMap        [output] Maps cust IDs to strings (only used when isStringFile == true)
00382 /// @param transStrMap       [output] Maps trans IDs to strings (only used when isStringFile == true)
00383 /// @param itemStrMap        [output] Maps item IDs to strings (only used when isStringFile == true)
00384 /// @param custCount         [output] the number of customers
00385 /// @param itemCount         [output] the number of items
00386 /// @param lineCount         [output] the number of line in the file
00387 /// @param custTransCount    [output] number of transactions each customer has
00388 /// @param itemCustCount     [output] number of customers having each item in
00389 ///                                   their transactions
00390 /// @param cids [output] customer ids exactly as they appear in the file
00391 /// @param tids   [output] transaction ids exactly as they appear in the file
00392 /// @param iids   [output] item ids exactly as they appear in the file
00393 /// @param overallCount   [output] length of the cids, tids, and iids arrays
00394 ///
00395 /// @return true - if the reading is successful.
00396 ///         false - if there is an error in the reading process
00397 /////////////////////////////////////////////////////////////////////
00398 bool CollectASCIIInfo(
00399     char* filename,
00400     bool isStringFile,
00401     StringMap*& custStrMap,
00402     StringMap*& transStrMap,
00403     StringMap*& itemStrMap,
00404     int& custCount,
00405     int& itemCount,
00406     int& lineCount,
00407     int*& custTransCount,
00408     int*& itemCustCount,
00409     int*& cids,
00410     int*& tids,
00411     int*& iids,
00412     int& overallCount)
00413 {
00414 
00415     // --- Variables
00416     // initialize local variables
00417     ResizableArray * cidArr = new ResizableArray(64);
00418     ResizableArray * tidArr = new ResizableArray(64);
00419     ResizableArray * iidArr = new ResizableArray(64);
00420     int custID;                   // current customer ID
00421     int transID;                  // current transaction ID
00422     int itemID;                   // current item ID
00423     int prevTransID = -1;         // previous transaction ID
00424     ifstream inFile;              // file handler
00425     int custTransSize = 400;      // size of the custTransCount array
00426     int itemCustSize = 400;       // size of the itemCustCount array
00427     int i;                        // counter
00428     bool useStdin;                // whether or not the input will come from stdin
00429     int custStrMapID = 1;
00430     int transStrMapID = 1;
00431     int itemStrMapID = 1;
00432 
00433     // If we are mapping strings to IDs, initialize the StringMaps
00434     if (isStringFile)
00435     {
00436         custStrMap = new StringMap();
00437         transStrMap = new StringMap();
00438         itemStrMap = new StringMap();
00439     }
00440 
00441     // --- Read File
00442     // open the ASCII file; if filename is null then use stdin
00443     if (filename == 0)
00444         useStdin = true;
00445     else
00446         useStdin = false;
00447 
00448     if (!useStdin)
00449     {
00450         inFile.open(filename);
00451         if (!inFile.is_open())
00452         {
00453             PrintFileReadError(FILEERR_NOTFOUNDASCII);
00454             return false;
00455         }
00456     }
00457 
00458     // initialize output variables
00459     custCount = -1;               // # of customers in the dataset (largest ID)
00460     itemCount = -1;               // # of items in the dataset (largest ID)
00461     lineCount = 0;                // number of transaction
00462     custTransCount = new int[custTransSize];
00463     itemCustCount = new int[itemCustSize];
00464     for (i = 0; i < custTransSize; i++)
00465         custTransCount[i] = 0;
00466     for (i = 0; i < itemCustSize; i++)
00467         itemCustCount[i] = 0;
00468 
00469 
00470     // this array stores the ID of the previous customer we have scanned and
00471     // has a certain item in his/her transactions.
00472     int *itemPrevCustID = new int[itemCustSize];
00473     for (i = 0; i < itemCustSize; i++)
00474         itemPrevCustID[i] = -1;
00475 
00476     // read data
00477     while ((!useStdin && !inFile.eof()) || (useStdin && !cin.eof()))
00478     {
00479         // read in the transaction
00480         if (isStringFile)
00481         {
00482             // Read in the 3 strings corresponding to customer name,
00483             // transaction name (will probably be an int but we will map it to a string anyway)
00484             // and item name.
00485             char *custStr = new char[MAX_STRING_SIZE];
00486             char *transStr = new char[MAX_STRING_SIZE];
00487             char *itemStr = new char[MAX_STRING_SIZE];
00488             if (useStdin)
00489             {
00490                 cin.getline(custStr, MAX_STRING_SIZE);
00491                 cin.getline(transStr, MAX_STRING_SIZE);
00492                 cin.getline(itemStr, MAX_STRING_SIZE);
00493             }
00494             else
00495             {
00496                 inFile.getline(custStr, MAX_STRING_SIZE);
00497                 inFile.getline(transStr, MAX_STRING_SIZE);
00498                 inFile.getline(itemStr, MAX_STRING_SIZE);
00499             }
00500 
00501             // If each string was found in our map, reuse its ID
00502             // Otherwise assign it a new ID
00503             const int * custKeyID = custStrMap->GetKey(custStr);
00504             const int * transKeyID = transStrMap->GetKey(transStr);
00505             const int * itemKeyID = itemStrMap->GetKey(itemStr);
00506             if (custKeyID != 0)
00507                 custID = *custKeyID;
00508             else
00509             {
00510                 custID = custStrMapID;
00511                 custStrMap->Add(custID, custStr);
00512                 custStrMapID++;
00513             }
00514             if (transKeyID != 0)
00515                 transID = *transKeyID;
00516             else
00517             {
00518                 transID = transStrMapID;
00519                 transStrMap->Add(transID, transStr);
00520                 transStrMapID++;
00521             }
00522             if (itemKeyID != 0)
00523                 itemID = *itemKeyID;
00524             else
00525             {
00526                 itemID = itemStrMapID;
00527                 itemStrMap->Add(itemID, itemStr);
00528                 itemStrMapID++;
00529             }
00530         }
00531         else
00532         {
00533             if (useStdin)
00534             {
00535                 cin >> custID;
00536                 cin >> transID;
00537                 cin >> itemID;
00538             }
00539             else
00540             {
00541                 inFile >> custID;
00542                 inFile >> transID;
00543                 inFile >> itemID;
00544             }
00545         }
00546 
00547         // Copy the line of data into our resizable arrays
00548         cidArr->Add(custID);
00549         tidArr->Add(transID);
00550         iidArr->Add(itemID);
00551 
00552         // -- update the statistcs about customers
00553         if (custID >= custCount)
00554         {
00555             custCount = custID + 1;
00556 
00557             // make sure custTransCount is big enough
00558             if (custCount > custTransSize)
00559             {
00560                 int newSize = (custCount > 2 * custTransSize) ?
00561                               custCount : 2 * custTransSize;
00562                 IncArraySize(custTransCount, custTransSize, newSize);
00563                 custTransSize = newSize;
00564             }
00565         }
00566 
00567         // increment custTransCount only if it's a different transaction
00568         if (prevTransID != transID)
00569         {
00570             custTransCount[custID]++;
00571             prevTransID = transID;
00572         }
00573         lineCount++;
00574 
00575         // -- update the statistics about items
00576         if (itemID >= itemCount)
00577         {
00578             itemCount = itemID + 1;
00579 
00580             // make sure itemCustCount is large enough
00581             if (itemCount >= itemCustSize)
00582             {
00583                 int newSize = (itemCount > 2 * itemCustSize) ?
00584                               itemCount : 2 * itemCustSize;
00585                 IncArraySize(itemCustCount, itemCustSize, newSize);
00586                 IncArraySize(itemPrevCustID, itemCustSize, newSize);
00587                 itemCustSize = newSize;
00588             }
00589         }
00590 
00591         // update itemCustCount only if the item is from a different customer
00592         if (itemPrevCustID[itemID] != custID)
00593         {
00594             itemCustCount[itemID]++;
00595             itemPrevCustID[itemID] = custID;
00596         }
00597     }
00598 
00599     delete [] itemPrevCustID;
00600     if (!useStdin)
00601         inFile.close();
00602 
00603     // Copy the resizable array contents to the arrays containing
00604     // the in-memory cid/tid/iid lists
00605     cidArr->ToArray(cids, overallCount);
00606     tidArr->ToArray(tids, overallCount);
00607     iidArr->ToArray(iids, overallCount);
00608     delete cidArr;
00609     delete tidArr;
00610     delete iidArr;
00611 
00612     return true;
00613 }
00614 
00615 /////////////////////////////////////////////////////////////////////
00616 /// It reads stores the binary data file.
00617 ///
00618 /// Note:
00619 ///  - File format: [custID] [transID] [number of item] [itemID1, ...]
00620 ///    [custID] ...
00621 ///  - Assume that transactions of a customer appears together in the file.
00622 ///    It assures that the following case will not happen:
00623 ///    [custID-1]... [custID-1]... [custID-2]... [custID-1]...
00624 ///
00625 /// @param filename          the filename of the data file
00626 /// @param cids              the list of customer IDs as it appears in the file
00627 /// @param tids              the list of transaction IDs
00628 /// @param iids              the list of item IDs
00629 /// @param numEntries        the length of the arrays cids, tids, and iids
00630 /// @param transLens         the list of transaction lengths
00631 /// @param transLensLength   the length of the list transLens
00632 /// @param custBitmapMap     map from custID to bitmapID
00633 /// @param custMap           the mapping of custID from external naming
00634 ///                              to internal naming
00635 /// @param itemMap           the mapping of itemID from external naming
00636 ///                              to internal naming
00637 /// @param f1Buff            [output] buffer for frequent-one item sets
00638 ///
00639 /// @return true - if the reading is successful.
00640 ///         false - if there is an error in the reading process
00641 /////////////////////////////////////////////////////////////////////
00642 bool ReadBinary(
00643     char* filename,
00644     int* cids,
00645     int* tids,
00646     int* iids,
00647     int numEntries,
00648     int* transLens,
00649     int transLensLength,
00650     int* custBitmapMap,
00651     int** custMap,
00652     int* itemMap,
00653     SeqBitmap** f1Buff)
00654 {
00655 
00656     // --- Variables
00657     // initialize local variables
00658     int custID;              // current customer ID
00659     int transID;             // current transaction ID
00660     int numItem;             // number of items
00661     int *itemlist;           // list of items in the current transaction
00662     int prevCustID = -1;
00663     int bitmapID = 0;
00664     int index = 0;
00665     ifstream inFile;         // file handler
00666 
00667     // --- Read File
00668     // Always do a second scan for binary files
00669     bool secondScan = false;
00670     int lenIndex = 0;
00671     int scanIndex = 0;
00672 
00673     if (secondScan)
00674     {
00675         if (filename == 0)
00676         {
00677             cout << "Error: cannot read input a second time when -stdin is on";
00678             exit(-1);
00679         }
00680 
00681         // open the binary file
00682         inFile.open(filename, ios::binary);
00683         if (!inFile.is_open())
00684         {
00685             return false;
00686         }
00687     }
00688 
00689     while (     (secondScan && !inFile.eof())
00690                 || (!secondScan && scanIndex < numEntries) )
00691     {
00692         if (secondScan)
00693         {
00694             inFile.read((char *)&custID, sizeof(int));
00695             inFile.read((char *)&transID, sizeof(int));
00696             inFile.read((char *)&numItem, sizeof(int));
00697             itemlist = new int[numItem];
00698 
00699             // read in the items of the transaction
00700             inFile.read((char *)itemlist, numItem * sizeof(int));
00701 
00702             // just in case
00703             if (inFile.eof())
00704                 break;
00705         }
00706         else
00707         {
00708             numItem = transLens[lenIndex];
00709             itemlist = new int[numItem];
00710             custID = cids[scanIndex];
00711             transID = tids[scanIndex];
00712             for (int i = 0; i < numItem; i++)
00713                 itemlist[i] = iids[scanIndex + i];
00714             scanIndex+=numItem;
00715             lenIndex++;
00716         }
00717 
00718         if (custID != prevCustID)
00719         {
00720             prevCustID = custID;
00721             bitmapID = custBitmapMap[custID];
00722             index = custMap[bitmapID][custID] * BITMAP_LENGTH[bitmapID];
00723         }
00724 
00725         // Fill in transaction bit for appropriate bitmaps
00726         for (int j = 0; j < numItem; j++)
00727         {
00728             if (itemMap[itemlist[j]] >= 0)
00729                 f1Buff[itemMap[itemlist[j]]]->
00730                 FillEmptyPosition(bitmapID, index);
00731         }
00732 
00733 
00734         index++;
00735         delete [] itemlist;
00736     }
00737 
00738     if (secondScan)
00739         inFile.close();
00740     return true;
00741 }
00742 
00743 /////////////////////////////////////////////////////////////////////
00744 /// It reads stores the ASCII data file.
00745 ///
00746 /// Note:
00747 ///  - File format: [custID] [transID] [itemID]
00748 ///        [custID] [transID] [itemID]
00749 ///        ...
00750 ///  - Assume that transactions of a customer appears together in the file.
00751 ///    It assures that the following case will not happen:
00752 ///   [custID-1]... [custID-1]... [custID-2]... [custID-1]...
00753 ///  - Assume that items of a transactions appears together in the file.
00754 ///    It assures that the following case will not happen:
00755 ///   ... [transID-1] ...
00756 ///   ... [transID-2] ...
00757 ///   ... [transID-1] ...
00758 ///
00759 /// @param filename          the filename of the data file
00760 /// @param cids              the list of customer IDs as it appears in the file
00761 /// @param tids              the list of transaction IDs
00762 /// @param iids              the list of item IDs
00763 /// @param numEntries        the length of the arrays cids, tids, and iids
00764 /// @param custBitmapMap     map from custID to bitmapID
00765 /// @param custMap           the mapping of custID from external naming
00766 ///                              to internal naming
00767 /// @param itemMap           the mapping of itemID from external naming
00768 ///                              to internal naming
00769 /// @param f1Buff            [output] buffer for frequent-one item sets
00770 ///
00771 /// @return true - if the reading is successful.
00772 ///         false - if there is an error in the reading process
00773 /////////////////////////////////////////////////////////////////////
00774 bool ReadASCII(
00775     char* filename,
00776     int* cids,
00777     int* tids,
00778     int* iids,
00779     int numEntries,
00780     int* custBitmapMap,
00781     int** custMap,
00782     int* itemMap,
00783     SeqBitmap** f1Buff)
00784 {
00785 
00786     // --- Variables
00787     // initialize local variables
00788     int custID;              // current customer ID
00789     int transID;             // current transaction ID
00790     int itemID;              // current item ID
00791     int prevTransID = -1;    // previous transaction ID
00792     int prevCustID = -1;     // previous customer ID
00793     int bitmapID = 0;        // bitmap used to store current customer data
00794     int index = 0;
00795     ifstream inFile;         // file handler
00796 
00797     // --- Read File
00798     bool secondScan = false;
00799     int scanIndex = 0;
00800 
00801     if (secondScan)
00802     {
00803         if (filename == 0)
00804         {
00805             cout << "Error: cannot read input a second time when -stdin is on";
00806             exit(-1);
00807         }
00808         // open the ASCII file
00809         inFile.open(filename);
00810         if (!inFile.is_open())
00811         {
00812             return false;
00813         }
00814     }
00815 
00816     // read and store data
00817     while (   (secondScan && !inFile.eof())
00818               || (!secondScan && scanIndex < numEntries))
00819     {
00820 
00821         if (secondScan)
00822         {
00823             // read in the transaction
00824             inFile >> custID;
00825 
00826             if (inFile.eof())
00827                 break;
00828 
00829             inFile >> transID;
00830             inFile >> itemID;
00831         }
00832         else
00833         {
00834             custID = cids[scanIndex];
00835             transID = tids[scanIndex];
00836             itemID = iids[scanIndex];
00837             scanIndex++;
00838         }
00839 
00840         if (custID != prevCustID)
00841         {
00842             prevCustID = custID;
00843             bitmapID = custBitmapMap[custID];
00844             index = custMap[bitmapID][custID] * BITMAP_LENGTH[bitmapID] - 1;
00845         }
00846 
00847         if (prevTransID != transID)
00848         {
00849             index++;
00850             prevTransID = transID;
00851         }
00852 
00853         // Fill in transaction bit for appropriate bitmaps
00854         if (itemMap[itemID] >= 0)
00855             f1Buff[itemMap[itemID]]->FillEmptyPosition(bitmapID, index);
00856     }
00857 
00858     if (secondScan)
00859         inFile.close();
00860 
00861     return true;
00862 }
00863 
00864 /////////////////////////////////////////////////////////////////////
00865 /// It reads the input file and finds the frequent-1 itemsets.
00866 ///
00867 /// @param isBinaryFile      whether the input file is a binary data file
00868 /// @param isStringFile      whether the input file contains integers or the string names
00869 /// @param filename          the filename of the data file
00870 /// @param minSupPercent     the minimum support percentage
00871 /// @param custStrMap        [output] Maps cust IDs to strings (only used when isStringFile == true)
00872 /// @param transStrMap       [output] Maps trans IDs to strings (only used when isStringFile == true)
00873 /// @param itemStrMap        [output] Maps item IDs to strings (only used when isStringFile == true)
00874 ///
00875 /// @return DatasetInfo - the information gathered from the dataset
00876 /////////////////////////////////////////////////////////////////////
00877 DatasetInfo* ReadDataset(
00878     bool isBinaryFile,
00879     bool isStringFile,
00880     char *filename,
00881     double minSupPercent,
00882     StringMap *&custStrMap,
00883     StringMap *&transStrMap,
00884     StringMap *&itemStrMap)
00885 {
00886 
00887     DatasetInfo* info = new DatasetInfo(); // dataset info
00888 
00889     // for the first pass of the dataset
00890     int tempCustCount = -1;  // # of customers in the dataset (largest ID)
00891     int itemCount = -1;      // # of items in the dataset (largest ID)
00892     int transCount = 0;      // number of transaction
00893     int *custTransCount;     // number of transactions each customer has
00894     int *itemCustCount;      // number of customers having that item in
00895     //     their transactions
00896 
00897     // for post-processing of the info of the dataset
00898     int *bitmapSizes;        // number of customer in each bitmap
00899     //     (4, 8, 16, 32, and 32+)
00900     int *custBitmapMap;      // map customers to bitmapIDs
00901     int **custMap;           // map bitmapID and custID to index
00902 
00903     int noTransCount = 0;    // the number of customers with no transaction
00904     int *itemMap;            // items renaming information
00905 
00906     int numCompression;      // maximum number of compression that we will do
00907 
00908     bool result = false;     // true if successfully read in data
00909     int i, j;                // temp variables
00910 
00911     // ----------------------- First scan of the data ----------------------//
00912     // This is the first scan of the database. In this scan, we gather
00913     // information about the dataset, for example, the number of items, the
00914     // number of customers, and the number of transaction for each customers.
00915     // ---------------------------------------------------------------------//
00916 
00917     // Copy the file's contents into memory so we can avoid a second file scan
00918     int *cids;
00919     int *tids;
00920     int *iids;
00921     int numEntries;
00922     int *transLens;
00923     int transLensLength;
00924 
00925     if (isBinaryFile)
00926     {
00927         result = CollectBinaryInfo(
00928                      filename,
00929                      tempCustCount,
00930                      itemCount,
00931                      transCount,
00932                      custTransCount,
00933                      itemCustCount,
00934                      cids,
00935                      tids,
00936                      iids,
00937                      transLens,
00938                      numEntries,
00939                      transLensLength);
00940     }
00941     else
00942     {
00943         result = CollectASCIIInfo(
00944                      filename,
00945                      isStringFile,
00946                      custStrMap,
00947                      transStrMap,
00948                      itemStrMap,
00949                      tempCustCount,
00950                      itemCount,
00951                      transCount,
00952                      custTransCount,
00953                      itemCustCount,
00954                      cids,
00955                      tids,
00956                      iids,
00957                      numEntries);
00958     }
00959     if (!result)
00960     {
00961         delete info;
00962         return 0;
00963     }
00964 
00965     // ----------------------- First scan of the data ----------------------//
00966     // We process the data so that we can effectively store the data.
00967     // ----------------------- First scan of the data ----------------------//
00968 
00969     // *-- customer info --*
00970     // This section:
00971     // - finds out the max number of transactions and number of customers
00972     //   who have transactions in the dataset.
00973     // - creates a mapping which maps ids of customers who have transations
00974     //   in the dataset to bitmapIDs. Currently, we use five bitmap sizes:
00975     //   4, 8, 16, 32, and 64.
00976     // - finds the size of each bitmaps
00977     // - given bitmapIDs of customers, creates a mapping which maps ids of
00978     //   customers and bitmapIDs to a set of integers, which is the index of
00979     //   the customer in each bitmap.
00980 
00981 
00982     // initialize variables
00983     info->maxCustTrans = 0;
00984     noTransCount = 0;
00985     custBitmapMap = new int[tempCustCount]; // custID --> bitmapID
00986     bitmapSizes = new int[NUM_BITMAP];      // bitmap size
00987     custMap = new int * [NUM_BITMAP];       // bitmapID, custID -->
00988     //     index in bitmapID
00989     for (i = 0; i < NUM_BITMAP; i++)
00990     {
00991         custMap[i] = new int[tempCustCount];
00992         bitmapSizes[i] = 0;
00993     }
00994 
00995     // create the maps
00996     for (i = 0; i < tempCustCount; i++)
00997     {
00998         if (custTransCount[i] > BITMAP_LENGTH[NUM_BITMAP - 1])
00999         {
01000             // Spam does not support customers that have more than 64
01001             // transactions; give the user an error message.
01002             PrintFileReadError(FILEERR_64TRANSACTIONS);
01003         }
01004 
01005         if (custTransCount[i] <= 0)
01006         {
01007             // this customer doesn't have transactions, we won't consider it
01008             custBitmapMap[i] = -1;
01009             noTransCount++;
01010         }
01011         else
01012         {
01013             // find out which bitmap this customer fit in (find the bitmapID)
01014             for (j = 0; j < NUM_BITMAP; j++)
01015                 if (custTransCount[i] <= BITMAP_LENGTH[j])
01016                 {
01017                     custBitmapMap[i] = j;
01018                     break;
01019                 }
01020         }
01021 
01022         // create the mapping from bitmapID & custID to index in the bitmap
01023         // with bitmapID
01024         for (j = 0; j < NUM_BITMAP; j++)
01025             if (custBitmapMap[i] == j)
01026             {
01027                 custMap[j][i] = bitmapSizes[j];
01028                 bitmapSizes[j]++;
01029             }
01030             else
01031                 custMap[j][i] = -1;
01032 
01033         // find the max number of transactions
01034         if (custTransCount[i] > info->maxCustTrans && custTransCount[i] <= 64)
01035             info->maxCustTrans = custTransCount[i];
01036     }
01037 
01038     // actual number of customers
01039     info->custCount = tempCustCount - noTransCount;
01040     info->bitmapSizes = new int[NUM_BITMAP];
01041     for (i = 0; i < NUM_BITMAP; i++)
01042         info->bitmapSizes[i] = bitmapSizes[i];
01043 
01044     // *-- minimum support --*
01045     // user-specified minimum support; if the minSup is not greater than 0,
01046     // we assume it to be 1
01047 
01048 #ifdef ___PREFIXSPAN___
01049 
01050     info->minSup = (int) ceil(minSupPercent * info->custCount);
01051 #else
01052 
01053     info->minSup = (int) floor(minSupPercent * info->custCount + 0.5);
01054 #endif
01055 
01056     info->minSup = (info->minSup == 0) ? 1 : info->minSup;
01057 
01058     // *-- Frequent 1 itemsets --*
01059     // This section:
01060     // - finds the frequent 1 itemset
01061     // - creates a mapping which maps item ids of the frequent 1 itemset
01062     //   to a set of consecutive integers
01063 
01064     info->f1Size = 0;
01065     itemMap = new int[itemCount];
01066 
01067     for (i = 0; i < itemCount; i++)
01068     {
01069         if (itemCustCount[i] >= info->minSup )
01070         {
01071             itemMap[i] = info->f1Size;
01072             info->f1Size++;
01073         }
01074         else
01075         {
01076             itemMap[i] = -1;
01077         }
01078     }
01079 
01080     // return if there is no frequent 1 itemset
01081     if (info->f1Size == 0)
01082         return info;
01083 
01084     numCompression = info->maxCustTrans * info->f1Size;
01085 
01086     // Allocate the SeqBitmap buffer
01087     // -- estimate the max size of the bitmap data buffers
01088     int size64 = bitmapSizes[4];
01089     int size32 = bitmapSizes[3] + size64;
01090     int size16 = bitmapSizes[2] + size32;
01091     int size8 = bitmapSizes[1] + size16;
01092     int size4 = bitmapSizes[0] + size8;
01093 
01094     size4 = Bitmap4::CalcSize(size4);
01095     size8 = Bitmap8::CalcSize(size8);
01096     size16 = Bitmap16::CalcSize(size16);
01097     size32 = Bitmap32::CalcSize(size32);
01098     size64 = Bitmap64::CalcSize(size64);
01099 
01100 
01101     // Size of bitmap * number of levels * number of bitmaps per level
01102     SeqBitmap::MemAlloc(size4 * (info->maxCustTrans * info->f1Size) *
01103                         (NUM_BITMAPS_USED + info->f1BufSize),
01104                         size8 * (info->maxCustTrans * info->f1Size) *
01105                         (NUM_BITMAPS_USED + info->f1BufSize),
01106                         size16 * (info->maxCustTrans * info->f1Size) *
01107                         (NUM_BITMAPS_USED + info->f1BufSize),
01108                         size32 * (info->maxCustTrans * info->f1Size) *
01109                         (NUM_BITMAPS_USED + info->f1BufSize),
01110                         size64 * (info->maxCustTrans * info->f1Size) *
01111                         (NUM_BITMAPS_USED + info->f1BufSize));
01112 
01113 
01114     // -- estimate the max size of the f1 buffers
01115     // size of f1 buffer depends on num of compression that we allow
01116     // for f1Buff and f1NameBuff
01117     info->f1BufSize = info->f1Size * (numCompression + 1) + 10;
01118 
01119     info->f1Buff = new SeqBitmap * [info->f1BufSize];
01120 
01121     for (i = 0; i < info->f1Size; i++)
01122         info->f1Buff[i] = new SeqBitmap(
01123                               bitmapSizes[0],
01124                               bitmapSizes[1],
01125                               bitmapSizes[2],
01126                               bitmapSizes[3],
01127                               bitmapSizes[4]);
01128 
01129     //cout << bitmapSizes[0] << " " << bitmapSizes[1] << " "
01130     // << bitmapSizes[2] << " " << bitmapSizes[3] << " "
01131     // << bitmapSizes[4] << endl;
01132     for (; i < info->f1BufSize; i++)
01133         info->f1Buff[i] = 0;
01134 
01135     info->f1NameBuff = new int[info->f1BufSize];
01136     for (i = 0; i < itemCount; i++)
01137         if (itemMap[i] >= 0)
01138             info->f1NameBuff[itemMap[i]] = i;
01139 
01140     // -- estimate the max size of slist buffers
01141     // Size of sListBuff depends on the max number of transactions
01142     // a customer has. In the worst case, we will only go down "s-tree"
01143     // for info->maxCustTrans number of times, and we only change s-list
01144     // when we go down "s-tree".
01145     info->sListBuff = new int[(info->maxCustTrans * info->f1Size)
01146                               * info->f1Size + 1];
01147 
01148     // -- estimate the max size of ilist buffers
01149     // Size of i-list buff also depends on the max number of transactions
01150     // a customer has. It aslo depends on how many times it goes down the
01151     // "i-tree". In the worst case we will go down the whole "i-tree" for
01152     // each transaction in the sequence.
01153     info->iListBuff = new int[(info->maxCustTrans * info->f1Size )
01154                               * info->f1Size + 1];
01155 
01156     for (i = 0; i < info->f1Size; i++)
01157         info->sListBuff[i] = i;
01158 
01159     // -- estimate the max size of count buffer
01160     info->countBuff = new CountInfo[info->f1Size];
01161 
01162 
01163 
01164 
01165     // ---------------------- Second scan of the data ----------------------//
01166     // This is the second scan of the database. In this scan, we actually
01167     // retrieve the information of each transaction. We will also partition
01168     // the data by customers.
01169     // ---------------------------------------------------------------------//
01170 
01171     if (isBinaryFile)
01172         result = ReadBinary(
01173                      filename,
01174                      cids,
01175                      tids,
01176                      iids,
01177                      numEntries,
01178                      transLens,
01179                      transLensLength,
01180                      custBitmapMap,
01181                      custMap,
01182                      itemMap,
01183                      info->f1Buff);
01184     else
01185         result = ReadASCII(
01186                      filename,
01187                      cids,
01188                      tids,
01189                      iids,
01190                      numEntries,
01191                      custBitmapMap,
01192                      custMap,
01193                      itemMap,
01194                      info->f1Buff);
01195 
01196     if (!result)
01197     {
01198         delete info;
01199         return 0;
01200     }
01201 
01202     // deallocate memory storing file to avoid second scan
01203     delete [] cids;
01204     delete [] tids;
01205     delete [] iids;
01206 
01207     // deallocate memory for counting customer index
01208     delete [] custTransCount;
01209     delete [] itemCustCount;
01210     delete [] custBitmapMap;
01211     for (i = 0; i < NUM_BITMAP; i++)
01212         delete [] custMap[i];
01213     delete [] custMap;
01214     delete [] bitmapSizes;
01215     delete [] itemMap;
01216 
01217     return info;
01218 }
01219 
01220 /** @} */

Generated on Thu Mar 11 12:01:51 2004 for SPAM by doxygen 1.3.4