Path: blob/devel/elmergrid/src/metis-5.1.0/GKlib/itemsets.c
3206 views
/*!1* \file2* \brief Frequent/Closed itemset discovery routines3*4* This file contains the code for finding frequent/closed itemests. These routines5* are implemented using a call-back mechanism to deal with the discovered itemsets.6*7* \date 6/13/20088* \author George Karypis9* \version\verbatim $Id: itemsets.c 11075 2011-11-11 22:31:52Z karypis $ \endverbatim10*/1112#include <GKlib.h>1314/*-------------------------------------------------------------*/15/*! Data structures for use within this module */16/*-------------------------------------------------------------*/17typedef struct {18int minfreq; /* the minimum frequency of a pattern */19int maxfreq; /* the maximum frequency of a pattern */20int minlen; /* the minimum length of the requested pattern */21int maxlen; /* the maximum length of the requested pattern */22int tnitems; /* the initial range of the item space */2324/* the call-back function */25void (*callback)(void *stateptr, int nitems, int *itemids, int ntrans, int *transids);26void *stateptr; /* the user-supplied pointer to pass to the callback */2728/* workspace variables */29int *rmarker;30gk_ikv_t *cand;31} isparams_t;323334/*-------------------------------------------------------------*/35/*! Prototypes for this module */36/*-------------------------------------------------------------*/37void itemsets_find_frequent_itemsets(isparams_t *params, gk_csr_t *mat,38int preflen, int *prefix);39gk_csr_t *itemsets_project_matrix(isparams_t *param, gk_csr_t *mat, int cid);40414243/*************************************************************************/44/*! The entry point of the frequent itemset discovery code */45/*************************************************************************/46void gk_find_frequent_itemsets(int ntrans, ssize_t *tranptr, int *tranind,47int minfreq, int maxfreq, int minlen, int maxlen,48void (*process_itemset)(void *stateptr, int nitems, int *itemids,49int ntrans, int *transids),50void *stateptr)51{52ssize_t i;53gk_csr_t *mat, *pmat;54isparams_t params;55int *pattern;5657/* Create the matrix */58mat = gk_csr_Create();59mat->nrows = ntrans;60mat->ncols = tranind[gk_iargmax(tranptr[ntrans], tranind)]+1;61mat->rowptr = gk_zcopy(ntrans+1, tranptr, gk_zmalloc(ntrans+1, "gk_find_frequent_itemsets: mat.rowptr"));62mat->rowind = gk_icopy(tranptr[ntrans], tranind, gk_imalloc(tranptr[ntrans], "gk_find_frequent_itemsets: mat.rowind"));63mat->colids = gk_iincset(mat->ncols, 0, gk_imalloc(mat->ncols, "gk_find_frequent_itemsets: mat.colids"));6465/* Setup the parameters */66params.minfreq = minfreq;67params.maxfreq = (maxfreq == -1 ? mat->nrows : maxfreq);68params.minlen = minlen;69params.maxlen = (maxlen == -1 ? mat->ncols : maxlen);70params.tnitems = mat->ncols;71params.callback = process_itemset;72params.stateptr = stateptr;73params.rmarker = gk_ismalloc(mat->nrows, 0, "gk_find_frequent_itemsets: rmarker");74params.cand = gk_ikvmalloc(mat->ncols, "gk_find_frequent_itemsets: cand");7576/* Perform the initial projection */77gk_csr_CreateIndex(mat, GK_CSR_COL);78pmat = itemsets_project_matrix(¶ms, mat, -1);79gk_csr_Free(&mat);8081pattern = gk_imalloc(pmat->ncols, "gk_find_frequent_itemsets: pattern");82itemsets_find_frequent_itemsets(¶ms, pmat, 0, pattern);8384gk_csr_Free(&pmat);85gk_free((void **)&pattern, ¶ms.rmarker, ¶ms.cand, LTERM);8687}88899091/*************************************************************************/92/*! The recursive routine for DFS-based frequent pattern discovery */93/*************************************************************************/94void itemsets_find_frequent_itemsets(isparams_t *params, gk_csr_t *mat,95int preflen, int *prefix)96{97ssize_t i;98gk_csr_t *cmat;99100/* Project each frequent column */101for (i=0; i<mat->ncols; i++) {102prefix[preflen] = mat->colids[i];103104if (preflen+1 >= params->minlen)105(*params->callback)(params->stateptr, preflen+1, prefix,106mat->colptr[i+1]-mat->colptr[i], mat->colind+mat->colptr[i]);107108if (preflen+1 < params->maxlen) {109cmat = itemsets_project_matrix(params, mat, i);110itemsets_find_frequent_itemsets(params, cmat, preflen+1, prefix);111gk_csr_Free(&cmat);112}113}114115}116117118/******************************************************************************/119/*! This function projects a matrix w.r.t. to a particular column.120It performs the following steps:121- Determines the length of each column that is remaining122- Sorts the columns in increasing length123- Creates a column-based version of the matrix with the proper124column ordering and renamed rowids.125*/126/*******************************************************************************/127gk_csr_t *itemsets_project_matrix(isparams_t *params, gk_csr_t *mat, int cid)128{129ssize_t i, j, k, ii, pnnz;130int nrows, ncols, pnrows, pncols;131ssize_t *colptr, *pcolptr;132int *colind, *colids, *pcolind, *pcolids, *rmarker;133gk_csr_t *pmat;134gk_ikv_t *cand;135136nrows = mat->nrows;137ncols = mat->ncols;138colptr = mat->colptr;139colind = mat->colind;140colids = mat->colids;141142rmarker = params->rmarker;143cand = params->cand;144145146/* Allocate space for the projected matrix based on what you know thus far */147pmat = gk_csr_Create();148pmat->nrows = pnrows = (cid == -1 ? nrows : colptr[cid+1]-colptr[cid]);149150151/* Mark the rows that will be kept and determine the prowids */152if (cid == -1) { /* Initial projection */153gk_iset(nrows, 1, rmarker);154}155else { /* The other projections */156for (i=colptr[cid]; i<colptr[cid+1]; i++)157rmarker[colind[i]] = 1;158}159160161/* Determine the length of each column that will be left in the projected matrix */162for (pncols=0, pnnz=0, i=cid+1; i<ncols; i++) {163for (k=0, j=colptr[i]; j<colptr[i+1]; j++) {164k += rmarker[colind[j]];165}166if (k >= params->minfreq && k <= params->maxfreq) {167cand[pncols].val = i;168cand[pncols++].key = k;169pnnz += k;170}171}172173/* Sort the columns in increasing order */174gk_ikvsorti(pncols, cand);175176177/* Allocate space for the remaining fields of the projected matrix */178pmat->ncols = pncols;179pmat->colids = pcolids = gk_imalloc(pncols, "itemsets_project_matrix: pcolids");180pmat->colptr = pcolptr = gk_zmalloc(pncols+1, "itemsets_project_matrix: pcolptr");181pmat->colind = pcolind = gk_imalloc(pnnz, "itemsets_project_matrix: pcolind");182183184/* Populate the projected matrix */185pcolptr[0] = 0;186for (pnnz=0, ii=0; ii<pncols; ii++) {187i = cand[ii].val;188for (j=colptr[i]; j<colptr[i+1]; j++) {189if (rmarker[colind[j]])190pcolind[pnnz++] = colind[j];191}192193pcolids[ii] = colids[i];194pcolptr[ii+1] = pnnz;195}196197198/* Reset the rmarker array */199if (cid == -1) { /* Initial projection */200gk_iset(nrows, 0, rmarker);201}202else { /* The other projections */203for (i=colptr[cid]; i<colptr[cid+1]; i++)204rmarker[colind[i]] = 0;205}206207208return pmat;209}210211212