Path: blob/devel/elmergrid/src/metis-5.1.0/GKlib/csr.c
3206 views
/*!1* \file2*3* \brief Various routines with dealing with CSR matrices4*5* \author George Karypis6* \version\verbatim $Id: csr.c 13437 2013-01-11 21:54:10Z karypis $ \endverbatim7*/89#include <GKlib.h>1011#define OMPMINOPS 500001213/*************************************************************************/14/*! Allocate memory for a CSR matrix and initializes it15\returns the allocated matrix. The various fields are set to NULL.16*/17/**************************************************************************/18gk_csr_t *gk_csr_Create()19{20gk_csr_t *mat;2122mat = (gk_csr_t *)gk_malloc(sizeof(gk_csr_t), "gk_csr_Create: mat");2324gk_csr_Init(mat);2526return mat;27}282930/*************************************************************************/31/*! Initializes the matrix32\param mat is the matrix to be initialized.33*/34/*************************************************************************/35void gk_csr_Init(gk_csr_t *mat)36{37memset(mat, 0, sizeof(gk_csr_t));38mat->nrows = mat->ncols = -1;39}404142/*************************************************************************/43/*! Frees all the memory allocated for matrix.44\param mat is the matrix to be freed.45*/46/*************************************************************************/47void gk_csr_Free(gk_csr_t **mat)48{49if (*mat == NULL)50return;51gk_csr_FreeContents(*mat);52gk_free((void **)mat, LTERM);53}545556/*************************************************************************/57/*! Frees only the memory allocated for the matrix's different fields and58sets them to NULL.59\param mat is the matrix whose contents will be freed.60*/61/*************************************************************************/62void gk_csr_FreeContents(gk_csr_t *mat)63{64gk_free((void *)&mat->rowptr, &mat->rowind, &mat->rowval, &mat->rowids,65&mat->colptr, &mat->colind, &mat->colval, &mat->colids,66&mat->rnorms, &mat->cnorms, &mat->rsums, &mat->csums,67&mat->rsizes, &mat->csizes, &mat->rvols, &mat->cvols,68&mat->rwgts, &mat->cwgts,69LTERM);70}717273/*************************************************************************/74/*! Returns a copy of a matrix.75\param mat is the matrix to be duplicated.76\returns the newly created copy of the matrix.77*/78/**************************************************************************/79gk_csr_t *gk_csr_Dup(gk_csr_t *mat)80{81gk_csr_t *nmat;8283nmat = gk_csr_Create();8485nmat->nrows = mat->nrows;86nmat->ncols = mat->ncols;8788/* copy the row structure */89if (mat->rowptr)90nmat->rowptr = gk_zcopy(mat->nrows+1, mat->rowptr,91gk_zmalloc(mat->nrows+1, "gk_csr_Dup: rowptr"));92if (mat->rowids)93nmat->rowids = gk_icopy(mat->nrows, mat->rowids,94gk_imalloc(mat->nrows, "gk_csr_Dup: rowids"));95if (mat->rnorms)96nmat->rnorms = gk_fcopy(mat->nrows, mat->rnorms,97gk_fmalloc(mat->nrows, "gk_csr_Dup: rnorms"));98if (mat->rowind)99nmat->rowind = gk_icopy(mat->rowptr[mat->nrows], mat->rowind,100gk_imalloc(mat->rowptr[mat->nrows], "gk_csr_Dup: rowind"));101if (mat->rowval)102nmat->rowval = gk_fcopy(mat->rowptr[mat->nrows], mat->rowval,103gk_fmalloc(mat->rowptr[mat->nrows], "gk_csr_Dup: rowval"));104105/* copy the col structure */106if (mat->colptr)107nmat->colptr = gk_zcopy(mat->ncols+1, mat->colptr,108gk_zmalloc(mat->ncols+1, "gk_csr_Dup: colptr"));109if (mat->colids)110nmat->colids = gk_icopy(mat->ncols, mat->colids,111gk_imalloc(mat->ncols, "gk_csr_Dup: colids"));112if (mat->cnorms)113nmat->cnorms = gk_fcopy(mat->ncols, mat->cnorms,114gk_fmalloc(mat->ncols, "gk_csr_Dup: cnorms"));115if (mat->colind)116nmat->colind = gk_icopy(mat->colptr[mat->ncols], mat->colind,117gk_imalloc(mat->colptr[mat->ncols], "gk_csr_Dup: colind"));118if (mat->colval)119nmat->colval = gk_fcopy(mat->colptr[mat->ncols], mat->colval,120gk_fmalloc(mat->colptr[mat->ncols], "gk_csr_Dup: colval"));121122return nmat;123}124125126/*************************************************************************/127/*! Returns a submatrix containint a set of consecutive rows.128\param mat is the original matrix.129\param rstart is the starting row.130\param nrows is the number of rows from rstart to extract.131\returns the row structure of the newly created submatrix.132*/133/**************************************************************************/134gk_csr_t *gk_csr_ExtractSubmatrix(gk_csr_t *mat, int rstart, int nrows)135{136ssize_t i;137gk_csr_t *nmat;138139if (rstart+nrows > mat->nrows)140return NULL;141142nmat = gk_csr_Create();143144nmat->nrows = nrows;145nmat->ncols = mat->ncols;146147/* copy the row structure */148if (mat->rowptr)149nmat->rowptr = gk_zcopy(nrows+1, mat->rowptr+rstart,150gk_zmalloc(nrows+1, "gk_csr_ExtractSubmatrix: rowptr"));151for (i=nrows; i>=0; i--)152nmat->rowptr[i] -= nmat->rowptr[0];153ASSERT(nmat->rowptr[0] == 0);154155if (mat->rowids)156nmat->rowids = gk_icopy(nrows, mat->rowids+rstart,157gk_imalloc(nrows, "gk_csr_ExtractSubmatrix: rowids"));158if (mat->rnorms)159nmat->rnorms = gk_fcopy(nrows, mat->rnorms+rstart,160gk_fmalloc(nrows, "gk_csr_ExtractSubmatrix: rnorms"));161162if (mat->rsums)163nmat->rsums = gk_fcopy(nrows, mat->rsums+rstart,164gk_fmalloc(nrows, "gk_csr_ExtractSubmatrix: rsums"));165166ASSERT(nmat->rowptr[nrows] == mat->rowptr[rstart+nrows]-mat->rowptr[rstart]);167if (mat->rowind)168nmat->rowind = gk_icopy(mat->rowptr[rstart+nrows]-mat->rowptr[rstart],169mat->rowind+mat->rowptr[rstart],170gk_imalloc(mat->rowptr[rstart+nrows]-mat->rowptr[rstart],171"gk_csr_ExtractSubmatrix: rowind"));172if (mat->rowval)173nmat->rowval = gk_fcopy(mat->rowptr[rstart+nrows]-mat->rowptr[rstart],174mat->rowval+mat->rowptr[rstart],175gk_fmalloc(mat->rowptr[rstart+nrows]-mat->rowptr[rstart],176"gk_csr_ExtractSubmatrix: rowval"));177178return nmat;179}180181182/*************************************************************************/183/*! Returns a submatrix containing a certain set of rows.184\param mat is the original matrix.185\param nrows is the number of rows to extract.186\param rind is the set of row numbers to extract.187\returns the row structure of the newly created submatrix.188*/189/**************************************************************************/190gk_csr_t *gk_csr_ExtractRows(gk_csr_t *mat, int nrows, int *rind)191{192ssize_t i, ii, j, nnz;193gk_csr_t *nmat;194195nmat = gk_csr_Create();196197nmat->nrows = nrows;198nmat->ncols = mat->ncols;199200for (nnz=0, i=0; i<nrows; i++)201nnz += mat->rowptr[rind[i]+1]-mat->rowptr[rind[i]];202203nmat->rowptr = gk_zmalloc(nmat->nrows+1, "gk_csr_ExtractPartition: rowptr");204nmat->rowind = gk_imalloc(nnz, "gk_csr_ExtractPartition: rowind");205nmat->rowval = gk_fmalloc(nnz, "gk_csr_ExtractPartition: rowval");206207nmat->rowptr[0] = 0;208for (nnz=0, j=0, ii=0; ii<nrows; ii++) {209i = rind[ii];210gk_icopy(mat->rowptr[i+1]-mat->rowptr[i], mat->rowind+mat->rowptr[i], nmat->rowind+nnz);211gk_fcopy(mat->rowptr[i+1]-mat->rowptr[i], mat->rowval+mat->rowptr[i], nmat->rowval+nnz);212nnz += mat->rowptr[i+1]-mat->rowptr[i];213nmat->rowptr[++j] = nnz;214}215ASSERT(j == nmat->nrows);216217return nmat;218}219220221/*************************************************************************/222/*! Returns a submatrix corresponding to a specified partitioning of rows.223\param mat is the original matrix.224\param part is the partitioning vector of the rows.225\param pid is the partition ID that will be extracted.226\returns the row structure of the newly created submatrix.227*/228/**************************************************************************/229gk_csr_t *gk_csr_ExtractPartition(gk_csr_t *mat, int *part, int pid)230{231ssize_t i, j, nnz;232gk_csr_t *nmat;233234nmat = gk_csr_Create();235236nmat->nrows = 0;237nmat->ncols = mat->ncols;238239for (nnz=0, i=0; i<mat->nrows; i++) {240if (part[i] == pid) {241nmat->nrows++;242nnz += mat->rowptr[i+1]-mat->rowptr[i];243}244}245246nmat->rowptr = gk_zmalloc(nmat->nrows+1, "gk_csr_ExtractPartition: rowptr");247nmat->rowind = gk_imalloc(nnz, "gk_csr_ExtractPartition: rowind");248nmat->rowval = gk_fmalloc(nnz, "gk_csr_ExtractPartition: rowval");249250nmat->rowptr[0] = 0;251for (nnz=0, j=0, i=0; i<mat->nrows; i++) {252if (part[i] == pid) {253gk_icopy(mat->rowptr[i+1]-mat->rowptr[i], mat->rowind+mat->rowptr[i], nmat->rowind+nnz);254gk_fcopy(mat->rowptr[i+1]-mat->rowptr[i], mat->rowval+mat->rowptr[i], nmat->rowval+nnz);255nnz += mat->rowptr[i+1]-mat->rowptr[i];256nmat->rowptr[++j] = nnz;257}258}259ASSERT(j == nmat->nrows);260261return nmat;262}263264265/*************************************************************************/266/*! Splits the matrix into multiple sub-matrices based on the provided267color array.268\param mat is the original matrix.269\param color is an array of size equal to the number of non-zeros270in the matrix (row-wise structure). The matrix is split into271as many parts as the number of colors. For meaningfull results,272the colors should be numbered consecutively starting from 0.273\returns an array of matrices for each supplied color number.274*/275/**************************************************************************/276gk_csr_t **gk_csr_Split(gk_csr_t *mat, int *color)277{278ssize_t i, j;279int nrows, ncolors;280ssize_t *rowptr;281int *rowind;282float *rowval;283gk_csr_t **smats;284285nrows = mat->nrows;286rowptr = mat->rowptr;287rowind = mat->rowind;288rowval = mat->rowval;289290ncolors = gk_imax(rowptr[nrows], color)+1;291292smats = (gk_csr_t **)gk_malloc(sizeof(gk_csr_t *)*ncolors, "gk_csr_Split: smats");293for (i=0; i<ncolors; i++) {294smats[i] = gk_csr_Create();295smats[i]->nrows = mat->nrows;296smats[i]->ncols = mat->ncols;297smats[i]->rowptr = gk_zsmalloc(nrows+1, 0, "gk_csr_Split: smats[i]->rowptr");298}299300for (i=0; i<nrows; i++) {301for (j=rowptr[i]; j<rowptr[i+1]; j++)302smats[color[j]]->rowptr[i]++;303}304for (i=0; i<ncolors; i++)305MAKECSR(j, nrows, smats[i]->rowptr);306307for (i=0; i<ncolors; i++) {308smats[i]->rowind = gk_imalloc(smats[i]->rowptr[nrows], "gk_csr_Split: smats[i]->rowind");309smats[i]->rowval = gk_fmalloc(smats[i]->rowptr[nrows], "gk_csr_Split: smats[i]->rowval");310}311312for (i=0; i<nrows; i++) {313for (j=rowptr[i]; j<rowptr[i+1]; j++) {314smats[color[j]]->rowind[smats[color[j]]->rowptr[i]] = rowind[j];315smats[color[j]]->rowval[smats[color[j]]->rowptr[i]] = rowval[j];316smats[color[j]]->rowptr[i]++;317}318}319320for (i=0; i<ncolors; i++)321SHIFTCSR(j, nrows, smats[i]->rowptr);322323return smats;324}325326327/**************************************************************************/328/*! Reads a CSR matrix from the supplied file and stores it the matrix's329forward structure.330\param filename is the file that stores the data.331\param format is either GK_CSR_FMT_METIS, GK_CSR_FMT_CLUTO,332GK_CSR_FMT_CSR, GK_CSR_FMT_BINROW, GK_CSR_FMT_BINCOL333specifying the type of the input format.334The GK_CSR_FMT_CSR does not contain a header335line, whereas the GK_CSR_FMT_BINROW is a binary format written336by gk_csr_Write() using the same format specifier.337\param readvals is either 1 or 0, indicating if the CSR file contains338values or it does not. It only applies when GK_CSR_FMT_CSR is339used.340\param numbering is either 1 or 0, indicating if the numbering of the341indices start from 1 or 0, respectively. If they start from 1,342they are automatically decreamented during input so that they343will start from 0. It only applies when GK_CSR_FMT_CSR is344used.345\returns the matrix that was read.346*/347/**************************************************************************/348gk_csr_t *gk_csr_Read(char *filename, int format, int readvals, int numbering)349{350ssize_t i, k, l;351size_t nfields, nrows, ncols, nnz, fmt, ncon;352size_t lnlen;353ssize_t *rowptr;354int *rowind, ival;355float *rowval=NULL, fval;356int readsizes, readwgts;357char *line=NULL, *head, *tail, fmtstr[256];358FILE *fpin;359gk_csr_t *mat=NULL;360361362if (!gk_fexists(filename))363gk_errexit(SIGERR, "File %s does not exist!\n", filename);364365if (format == GK_CSR_FMT_BINROW) {366mat = gk_csr_Create();367368fpin = gk_fopen(filename, "rb", "gk_csr_Read: fpin");369if (fread(&(mat->nrows), sizeof(int32_t), 1, fpin) != 1)370gk_errexit(SIGERR, "Failed to read the nrows from file %s!\n", filename);371if (fread(&(mat->ncols), sizeof(int32_t), 1, fpin) != 1)372gk_errexit(SIGERR, "Failed to read the ncols from file %s!\n", filename);373mat->rowptr = gk_zmalloc(mat->nrows+1, "gk_csr_Read: rowptr");374if (fread(mat->rowptr, sizeof(ssize_t), mat->nrows+1, fpin) != mat->nrows+1)375gk_errexit(SIGERR, "Failed to read the rowptr from file %s!\n", filename);376mat->rowind = gk_imalloc(mat->rowptr[mat->nrows], "gk_csr_Read: rowind");377if (fread(mat->rowind, sizeof(int32_t), mat->rowptr[mat->nrows], fpin) != mat->rowptr[mat->nrows])378gk_errexit(SIGERR, "Failed to read the rowind from file %s!\n", filename);379if (readvals == 1) {380mat->rowval = gk_fmalloc(mat->rowptr[mat->nrows], "gk_csr_Read: rowval");381if (fread(mat->rowval, sizeof(float), mat->rowptr[mat->nrows], fpin) != mat->rowptr[mat->nrows])382gk_errexit(SIGERR, "Failed to read the rowval from file %s!\n", filename);383}384385gk_fclose(fpin);386return mat;387}388389if (format == GK_CSR_FMT_BINCOL) {390mat = gk_csr_Create();391392fpin = gk_fopen(filename, "rb", "gk_csr_Read: fpin");393if (fread(&(mat->nrows), sizeof(int32_t), 1, fpin) != 1)394gk_errexit(SIGERR, "Failed to read the nrows from file %s!\n", filename);395if (fread(&(mat->ncols), sizeof(int32_t), 1, fpin) != 1)396gk_errexit(SIGERR, "Failed to read the ncols from file %s!\n", filename);397mat->colptr = gk_zmalloc(mat->ncols+1, "gk_csr_Read: colptr");398if (fread(mat->colptr, sizeof(ssize_t), mat->ncols+1, fpin) != mat->ncols+1)399gk_errexit(SIGERR, "Failed to read the colptr from file %s!\n", filename);400mat->colind = gk_imalloc(mat->colptr[mat->ncols], "gk_csr_Read: colind");401if (fread(mat->colind, sizeof(int32_t), mat->colptr[mat->ncols], fpin) != mat->colptr[mat->ncols])402gk_errexit(SIGERR, "Failed to read the colind from file %s!\n", filename);403if (readvals) {404mat->colval = gk_fmalloc(mat->colptr[mat->ncols], "gk_csr_Read: colval");405if (fread(mat->colval, sizeof(float), mat->colptr[mat->ncols], fpin) != mat->colptr[mat->ncols])406gk_errexit(SIGERR, "Failed to read the colval from file %s!\n", filename);407}408409gk_fclose(fpin);410return mat;411}412413414if (format == GK_CSR_FMT_CLUTO) {415fpin = gk_fopen(filename, "r", "gk_csr_Read: fpin");416do {417if (gk_getline(&line, &lnlen, fpin) <= 0)418gk_errexit(SIGERR, "Premature end of input file: file:%s\n", filename);419} while (line[0] == '%');420421if (sscanf(line, "%zu %zu %zu", &nrows, &ncols, &nnz) != 3)422gk_errexit(SIGERR, "Header line must contain 3 integers.\n");423424readsizes = 0;425readwgts = 0;426readvals = 1;427numbering = 1;428}429else if (format == GK_CSR_FMT_METIS) {430fpin = gk_fopen(filename, "r", "gk_csr_Read: fpin");431do {432if (gk_getline(&line, &lnlen, fpin) <= 0)433gk_errexit(SIGERR, "Premature end of input file: file:%s\n", filename);434} while (line[0] == '%');435436fmt = ncon = 0;437nfields = sscanf(line, "%zu %zu %zu %zu", &nrows, &nnz, &fmt, &ncon);438if (nfields < 2)439gk_errexit(SIGERR, "Header line must contain at least 2 integers (#vtxs and #edges).\n");440441ncols = nrows;442nnz *= 2;443444if (fmt > 111)445gk_errexit(SIGERR, "Cannot read this type of file format [fmt=%zu]!\n", fmt);446447sprintf(fmtstr, "%03zu", fmt%1000);448readsizes = (fmtstr[0] == '1');449readwgts = (fmtstr[1] == '1');450readvals = (fmtstr[2] == '1');451numbering = 1;452ncon = (ncon == 0 ? 1 : ncon);453}454else {455readsizes = 0;456readwgts = 0;457458gk_getfilestats(filename, &nrows, &nnz, NULL, NULL);459460if (readvals == 1 && nnz%2 == 1)461gk_errexit(SIGERR, "Error: The number of numbers (%zd %d) in the input file is not even.\n", nnz, readvals);462if (readvals == 1)463nnz = nnz/2;464fpin = gk_fopen(filename, "r", "gk_csr_Read: fpin");465}466467mat = gk_csr_Create();468469mat->nrows = nrows;470471rowptr = mat->rowptr = gk_zmalloc(nrows+1, "gk_csr_Read: rowptr");472rowind = mat->rowind = gk_imalloc(nnz, "gk_csr_Read: rowind");473if (readvals != 2)474rowval = mat->rowval = gk_fsmalloc(nnz, 1.0, "gk_csr_Read: rowval");475476if (readsizes)477mat->rsizes = gk_fsmalloc(nrows, 0.0, "gk_csr_Read: rsizes");478479if (readwgts)480mat->rwgts = gk_fsmalloc(nrows*ncon, 0.0, "gk_csr_Read: rwgts");481482/*----------------------------------------------------------------------483* Read the sparse matrix file484*---------------------------------------------------------------------*/485numbering = (numbering ? - 1 : 0);486for (ncols=0, rowptr[0]=0, k=0, i=0; i<nrows; i++) {487do {488if (gk_getline(&line, &lnlen, fpin) == -1)489gk_errexit(SIGERR, "Premature end of input file: file while reading row %d\n", i);490} while (line[0] == '%');491492head = line;493tail = NULL;494495/* Read vertex sizes */496if (readsizes) {497#ifdef __MSC__498mat->rsizes[i] = (float)strtod(head, &tail);499#else500mat->rsizes[i] = strtof(head, &tail);501#endif502if (tail == head)503gk_errexit(SIGERR, "The line for vertex %zd does not have size information\n", i+1);504if (mat->rsizes[i] < 0)505errexit("The size for vertex %zd must be >= 0\n", i+1);506head = tail;507}508509/* Read vertex weights */510if (readwgts) {511for (l=0; l<ncon; l++) {512#ifdef __MSC__513mat->rwgts[i*ncon+l] = (float)strtod(head, &tail);514#else515mat->rwgts[i*ncon+l] = strtof(head, &tail);516#endif517if (tail == head)518errexit("The line for vertex %zd does not have enough weights "519"for the %d constraints.\n", i+1, ncon);520if (mat->rwgts[i*ncon+l] < 0)521errexit("The weight vertex %zd and constraint %zd must be >= 0\n", i+1, l);522head = tail;523}524}525526527/* Read the rest of the row */528while (1) {529ival = (int)strtol(head, &tail, 0);530if (tail == head)531break;532head = tail;533534if ((rowind[k] = ival + numbering) < 0)535gk_errexit(SIGERR, "Error: Invalid column number %d at row %zd.\n", ival, i);536537ncols = gk_max(rowind[k], ncols);538539if (readvals == 1) {540#ifdef __MSC__541fval = (float)strtod(head, &tail);542#else543fval = strtof(head, &tail);544#endif545if (tail == head)546gk_errexit(SIGERR, "Value could not be found for column! Row:%zd, NNZ:%zd\n", i, k);547head = tail;548549rowval[k] = fval;550}551k++;552}553rowptr[i+1] = k;554}555556if (format == GK_CSR_FMT_METIS) {557ASSERT(ncols+1 == mat->nrows);558mat->ncols = mat->nrows;559}560else {561mat->ncols = ncols+1;562}563564if (k != nnz)565gk_errexit(SIGERR, "gk_csr_Read: Something wrong with the number of nonzeros in "566"the input file. NNZ=%zd, ActualNNZ=%zd.\n", nnz, k);567568gk_fclose(fpin);569570gk_free((void **)&line, LTERM);571572return mat;573}574575576/**************************************************************************/577/*! Writes the row-based structure of a matrix into a file.578\param mat is the matrix to be written,579\param filename is the name of the output file.580\param format is one of: GK_CSR_FMT_CLUTO, GK_CSR_FMT_CSR,581GK_CSR_FMT_BINROW, GK_CSR_FMT_BINCOL.582\param writevals is either 1 or 0 indicating if the values will be583written or not. This is only applicable when GK_CSR_FMT_CSR584is used.585\param numbering is either 1 or 0 indicating if the internal 0-based586numbering will be shifted by one or not during output. This587is only applicable when GK_CSR_FMT_CSR is used.588*/589/**************************************************************************/590void gk_csr_Write(gk_csr_t *mat, char *filename, int format, int writevals, int numbering)591{592ssize_t i, j;593FILE *fpout;594595if (format == GK_CSR_FMT_BINROW) {596if (filename == NULL)597gk_errexit(SIGERR, "The filename parameter cannot be NULL.\n");598fpout = gk_fopen(filename, "wb", "gk_csr_Write: fpout");599600fwrite(&(mat->nrows), sizeof(int32_t), 1, fpout);601fwrite(&(mat->ncols), sizeof(int32_t), 1, fpout);602fwrite(mat->rowptr, sizeof(ssize_t), mat->nrows+1, fpout);603fwrite(mat->rowind, sizeof(int32_t), mat->rowptr[mat->nrows], fpout);604if (writevals)605fwrite(mat->rowval, sizeof(float), mat->rowptr[mat->nrows], fpout);606607gk_fclose(fpout);608return;609}610611if (format == GK_CSR_FMT_BINCOL) {612if (filename == NULL)613gk_errexit(SIGERR, "The filename parameter cannot be NULL.\n");614fpout = gk_fopen(filename, "wb", "gk_csr_Write: fpout");615616fwrite(&(mat->nrows), sizeof(int32_t), 1, fpout);617fwrite(&(mat->ncols), sizeof(int32_t), 1, fpout);618fwrite(mat->colptr, sizeof(ssize_t), mat->ncols+1, fpout);619fwrite(mat->colind, sizeof(int32_t), mat->colptr[mat->ncols], fpout);620if (writevals)621fwrite(mat->colval, sizeof(float), mat->colptr[mat->ncols], fpout);622623gk_fclose(fpout);624return;625}626627if (filename)628fpout = gk_fopen(filename, "w", "gk_csr_Write: fpout");629else630fpout = stdout;631632if (format == GK_CSR_FMT_CLUTO) {633fprintf(fpout, "%d %d %zd\n", mat->nrows, mat->ncols, mat->rowptr[mat->nrows]);634writevals = 1;635numbering = 1;636}637638for (i=0; i<mat->nrows; i++) {639for (j=mat->rowptr[i]; j<mat->rowptr[i+1]; j++) {640fprintf(fpout, " %d", mat->rowind[j]+(numbering ? 1 : 0));641if (writevals)642fprintf(fpout, " %f", mat->rowval[j]);643}644fprintf(fpout, "\n");645}646if (filename)647gk_fclose(fpout);648}649650651/*************************************************************************/652/*! Prunes certain rows/columns of the matrix. The prunning takes place653by analyzing the row structure of the matrix. The prunning takes place654by removing rows/columns but it does not affect the numbering of the655remaining rows/columns.656657\param mat the matrix to be prunned,658\param what indicates if the rows (GK_CSR_ROW) or the columns (GK_CSR_COL)659of the matrix will be prunned,660\param minf is the minimum number of rows (columns) that a column (row) must661be present in order to be kept,662\param maxf is the maximum number of rows (columns) that a column (row) must663be present at in order to be kept.664\returns the prunned matrix consisting only of its row-based structure.665The input matrix is not modified.666*/667/**************************************************************************/668gk_csr_t *gk_csr_Prune(gk_csr_t *mat, int what, int minf, int maxf)669{670ssize_t i, j, nnz;671int nrows, ncols;672ssize_t *rowptr, *nrowptr;673int *rowind, *nrowind, *collen;674float *rowval, *nrowval;675gk_csr_t *nmat;676677nmat = gk_csr_Create();678679nrows = nmat->nrows = mat->nrows;680ncols = nmat->ncols = mat->ncols;681682rowptr = mat->rowptr;683rowind = mat->rowind;684rowval = mat->rowval;685686nrowptr = nmat->rowptr = gk_zmalloc(nrows+1, "gk_csr_Prune: nrowptr");687nrowind = nmat->rowind = gk_imalloc(rowptr[nrows], "gk_csr_Prune: nrowind");688nrowval = nmat->rowval = gk_fmalloc(rowptr[nrows], "gk_csr_Prune: nrowval");689690691switch (what) {692case GK_CSR_COL:693collen = gk_ismalloc(ncols, 0, "gk_csr_Prune: collen");694695for (i=0; i<nrows; i++) {696for (j=rowptr[i]; j<rowptr[i+1]; j++) {697ASSERT(rowind[j] < ncols);698collen[rowind[j]]++;699}700}701for (i=0; i<ncols; i++)702collen[i] = (collen[i] >= minf && collen[i] <= maxf ? 1 : 0);703704nrowptr[0] = 0;705for (nnz=0, i=0; i<nrows; i++) {706for (j=rowptr[i]; j<rowptr[i+1]; j++) {707if (collen[rowind[j]]) {708nrowind[nnz] = rowind[j];709nrowval[nnz] = rowval[j];710nnz++;711}712}713nrowptr[i+1] = nnz;714}715gk_free((void **)&collen, LTERM);716break;717718case GK_CSR_ROW:719nrowptr[0] = 0;720for (nnz=0, i=0; i<nrows; i++) {721if (rowptr[i+1]-rowptr[i] >= minf && rowptr[i+1]-rowptr[i] <= maxf) {722for (j=rowptr[i]; j<rowptr[i+1]; j++, nnz++) {723nrowind[nnz] = rowind[j];724nrowval[nnz] = rowval[j];725}726}727nrowptr[i+1] = nnz;728}729break;730731default:732gk_csr_Free(&nmat);733gk_errexit(SIGERR, "Unknown prunning type of %d\n", what);734return NULL;735}736737return nmat;738}739740741/*************************************************************************/742/*! Eliminates certain entries from the rows/columns of the matrix. The743filtering takes place by keeping only the highest weight entries whose744sum accounts for a certain fraction of the overall weight of the745row/column.746747\param mat the matrix to be prunned,748\param what indicates if the rows (GK_CSR_ROW) or the columns (GK_CSR_COL)749of the matrix will be prunned,750\param norm indicates the norm that will be used to aggregate the weights751and possible values are 1 or 2,752\param fraction is the fraction of the overall norm that will be retained753by the kept entries.754\returns the filtered matrix consisting only of its row-based structure.755The input matrix is not modified.756*/757/**************************************************************************/758gk_csr_t *gk_csr_LowFilter(gk_csr_t *mat, int what, int norm, float fraction)759{760ssize_t i, j, nnz;761int nrows, ncols, ncand, maxlen=0;762ssize_t *rowptr, *colptr, *nrowptr;763int *rowind, *colind, *nrowind;764float *rowval, *colval, *nrowval, rsum, tsum;765gk_csr_t *nmat;766gk_fkv_t *cand;767768nmat = gk_csr_Create();769770nrows = nmat->nrows = mat->nrows;771ncols = nmat->ncols = mat->ncols;772773rowptr = mat->rowptr;774rowind = mat->rowind;775rowval = mat->rowval;776colptr = mat->colptr;777colind = mat->colind;778colval = mat->colval;779780nrowptr = nmat->rowptr = gk_zmalloc(nrows+1, "gk_csr_LowFilter: nrowptr");781nrowind = nmat->rowind = gk_imalloc(rowptr[nrows], "gk_csr_LowFilter: nrowind");782nrowval = nmat->rowval = gk_fmalloc(rowptr[nrows], "gk_csr_LowFilter: nrowval");783784785switch (what) {786case GK_CSR_COL:787if (mat->colptr == NULL)788gk_errexit(SIGERR, "Cannot filter columns when column-based structure has not been created.\n");789790gk_zcopy(nrows+1, rowptr, nrowptr);791792for (i=0; i<ncols; i++)793maxlen = gk_max(maxlen, colptr[i+1]-colptr[i]);794795#pragma omp parallel private(i, j, ncand, rsum, tsum, cand)796{797cand = gk_fkvmalloc(maxlen, "gk_csr_LowFilter: cand");798799#pragma omp for schedule(static)800for (i=0; i<ncols; i++) {801for (tsum=0.0, ncand=0, j=colptr[i]; j<colptr[i+1]; j++, ncand++) {802cand[ncand].val = colind[j];803cand[ncand].key = colval[j];804tsum += (norm == 1 ? colval[j] : colval[j]*colval[j]);805}806gk_fkvsortd(ncand, cand);807808for (rsum=0.0, j=0; j<ncand && rsum<=fraction*tsum; j++) {809rsum += (norm == 1 ? cand[j].key : cand[j].key*cand[j].key);810nrowind[nrowptr[cand[j].val]] = i;811nrowval[nrowptr[cand[j].val]] = cand[j].key;812nrowptr[cand[j].val]++;813}814}815816gk_free((void **)&cand, LTERM);817}818819/* compact the nrowind/nrowval */820for (nnz=0, i=0; i<nrows; i++) {821for (j=rowptr[i]; j<nrowptr[i]; j++, nnz++) {822nrowind[nnz] = nrowind[j];823nrowval[nnz] = nrowval[j];824}825nrowptr[i] = nnz;826}827SHIFTCSR(i, nrows, nrowptr);828829break;830831case GK_CSR_ROW:832if (mat->rowptr == NULL)833gk_errexit(SIGERR, "Cannot filter rows when row-based structure has not been created.\n");834835for (i=0; i<nrows; i++)836maxlen = gk_max(maxlen, rowptr[i+1]-rowptr[i]);837838#pragma omp parallel private(i, j, ncand, rsum, tsum, cand)839{840cand = gk_fkvmalloc(maxlen, "gk_csr_LowFilter: cand");841842#pragma omp for schedule(static)843for (i=0; i<nrows; i++) {844for (tsum=0.0, ncand=0, j=rowptr[i]; j<rowptr[i+1]; j++, ncand++) {845cand[ncand].val = rowind[j];846cand[ncand].key = rowval[j];847tsum += (norm == 1 ? rowval[j] : rowval[j]*rowval[j]);848}849gk_fkvsortd(ncand, cand);850851for (rsum=0.0, j=0; j<ncand && rsum<=fraction*tsum; j++) {852rsum += (norm == 1 ? cand[j].key : cand[j].key*cand[j].key);853nrowind[rowptr[i]+j] = cand[j].val;854nrowval[rowptr[i]+j] = cand[j].key;855}856nrowptr[i+1] = rowptr[i]+j;857}858859gk_free((void **)&cand, LTERM);860}861862/* compact nrowind/nrowval */863nrowptr[0] = nnz = 0;864for (i=0; i<nrows; i++) {865for (j=rowptr[i]; j<nrowptr[i+1]; j++, nnz++) {866nrowind[nnz] = nrowind[j];867nrowval[nnz] = nrowval[j];868}869nrowptr[i+1] = nnz;870}871872break;873874default:875gk_csr_Free(&nmat);876gk_errexit(SIGERR, "Unknown prunning type of %d\n", what);877return NULL;878}879880return nmat;881}882883884/*************************************************************************/885/*! Eliminates certain entries from the rows/columns of the matrix. The886filtering takes place by keeping only the highest weight top-K entries887along each row/column and those entries whose weight is greater than888a specified value.889890\param mat the matrix to be prunned,891\param what indicates if the rows (GK_CSR_ROW) or the columns (GK_CSR_COL)892of the matrix will be prunned,893\param topk is the number of the highest weight entries to keep.894\param keepval is the weight of a term above which will be kept. This895is used to select additional terms past the first topk.896\returns the filtered matrix consisting only of its row-based structure.897The input matrix is not modified.898*/899/**************************************************************************/900gk_csr_t *gk_csr_TopKPlusFilter(gk_csr_t *mat, int what, int topk, float keepval)901{902ssize_t i, j, k, nnz;903int nrows, ncols, ncand;904ssize_t *rowptr, *colptr, *nrowptr;905int *rowind, *colind, *nrowind;906float *rowval, *colval, *nrowval;907gk_csr_t *nmat;908gk_fkv_t *cand;909910nmat = gk_csr_Create();911912nrows = nmat->nrows = mat->nrows;913ncols = nmat->ncols = mat->ncols;914915rowptr = mat->rowptr;916rowind = mat->rowind;917rowval = mat->rowval;918colptr = mat->colptr;919colind = mat->colind;920colval = mat->colval;921922nrowptr = nmat->rowptr = gk_zmalloc(nrows+1, "gk_csr_LowFilter: nrowptr");923nrowind = nmat->rowind = gk_imalloc(rowptr[nrows], "gk_csr_LowFilter: nrowind");924nrowval = nmat->rowval = gk_fmalloc(rowptr[nrows], "gk_csr_LowFilter: nrowval");925926927switch (what) {928case GK_CSR_COL:929if (mat->colptr == NULL)930gk_errexit(SIGERR, "Cannot filter columns when column-based structure has not been created.\n");931932cand = gk_fkvmalloc(nrows, "gk_csr_LowFilter: cand");933934gk_zcopy(nrows+1, rowptr, nrowptr);935for (i=0; i<ncols; i++) {936for (ncand=0, j=colptr[i]; j<colptr[i+1]; j++, ncand++) {937cand[ncand].val = colind[j];938cand[ncand].key = colval[j];939}940gk_fkvsortd(ncand, cand);941942k = gk_min(topk, ncand);943for (j=0; j<k; j++) {944nrowind[nrowptr[cand[j].val]] = i;945nrowval[nrowptr[cand[j].val]] = cand[j].key;946nrowptr[cand[j].val]++;947}948for (; j<ncand; j++) {949if (cand[j].key < keepval)950break;951952nrowind[nrowptr[cand[j].val]] = i;953nrowval[nrowptr[cand[j].val]] = cand[j].key;954nrowptr[cand[j].val]++;955}956}957958/* compact the nrowind/nrowval */959for (nnz=0, i=0; i<nrows; i++) {960for (j=rowptr[i]; j<nrowptr[i]; j++, nnz++) {961nrowind[nnz] = nrowind[j];962nrowval[nnz] = nrowval[j];963}964nrowptr[i] = nnz;965}966SHIFTCSR(i, nrows, nrowptr);967968gk_free((void **)&cand, LTERM);969break;970971case GK_CSR_ROW:972if (mat->rowptr == NULL)973gk_errexit(SIGERR, "Cannot filter rows when row-based structure has not been created.\n");974975cand = gk_fkvmalloc(ncols, "gk_csr_LowFilter: cand");976977nrowptr[0] = 0;978for (nnz=0, i=0; i<nrows; i++) {979for (ncand=0, j=rowptr[i]; j<rowptr[i+1]; j++, ncand++) {980cand[ncand].val = rowind[j];981cand[ncand].key = rowval[j];982}983gk_fkvsortd(ncand, cand);984985k = gk_min(topk, ncand);986for (j=0; j<k; j++, nnz++) {987nrowind[nnz] = cand[j].val;988nrowval[nnz] = cand[j].key;989}990for (; j<ncand; j++, nnz++) {991if (cand[j].key < keepval)992break;993994nrowind[nnz] = cand[j].val;995nrowval[nnz] = cand[j].key;996}997nrowptr[i+1] = nnz;998}9991000gk_free((void **)&cand, LTERM);1001break;10021003default:1004gk_csr_Free(&nmat);1005gk_errexit(SIGERR, "Unknown prunning type of %d\n", what);1006return NULL;1007}10081009return nmat;1010}101110121013/*************************************************************************/1014/*! Eliminates certain entries from the rows/columns of the matrix. The1015filtering takes place by keeping only the terms whose contribution to1016the total length of the document is greater than a user-splied multiple1017over the average.10181019This routine assumes that the vectors are normalized to be unit length.10201021\param mat the matrix to be prunned,1022\param what indicates if the rows (GK_CSR_ROW) or the columns (GK_CSR_COL)1023of the matrix will be prunned,1024\param zscore is the multiplicative factor over the average contribution1025to the length of the document.1026\returns the filtered matrix consisting only of its row-based structure.1027The input matrix is not modified.1028*/1029/**************************************************************************/1030gk_csr_t *gk_csr_ZScoreFilter(gk_csr_t *mat, int what, float zscore)1031{1032ssize_t i, j, nnz;1033int nrows;1034ssize_t *rowptr, *nrowptr;1035int *rowind, *nrowind;1036float *rowval, *nrowval, avgwgt;1037gk_csr_t *nmat;10381039nmat = gk_csr_Create();10401041nmat->nrows = mat->nrows;1042nmat->ncols = mat->ncols;10431044nrows = mat->nrows;1045rowptr = mat->rowptr;1046rowind = mat->rowind;1047rowval = mat->rowval;10481049nrowptr = nmat->rowptr = gk_zmalloc(nrows+1, "gk_csr_ZScoreFilter: nrowptr");1050nrowind = nmat->rowind = gk_imalloc(rowptr[nrows], "gk_csr_ZScoreFilter: nrowind");1051nrowval = nmat->rowval = gk_fmalloc(rowptr[nrows], "gk_csr_ZScoreFilter: nrowval");105210531054switch (what) {1055case GK_CSR_COL:1056gk_errexit(SIGERR, "This has not been implemented yet.\n");1057break;10581059case GK_CSR_ROW:1060if (mat->rowptr == NULL)1061gk_errexit(SIGERR, "Cannot filter rows when row-based structure has not been created.\n");10621063nrowptr[0] = 0;1064for (nnz=0, i=0; i<nrows; i++) {1065avgwgt = zscore/(rowptr[i+1]-rowptr[i]);1066for (j=rowptr[i]; j<rowptr[i+1]; j++) {1067if (rowval[j] > avgwgt) {1068nrowind[nnz] = rowind[j];1069nrowval[nnz] = rowval[j];1070nnz++;1071}1072}1073nrowptr[i+1] = nnz;1074}1075break;10761077default:1078gk_csr_Free(&nmat);1079gk_errexit(SIGERR, "Unknown prunning type of %d\n", what);1080return NULL;1081}10821083return nmat;1084}108510861087/*************************************************************************/1088/*! Compacts the column-space of the matrix by removing empty columns.1089As a result of the compaction, the column numbers are renumbered.1090The compaction operation is done in place and only affects the row-based1091representation of the matrix.1092The new columns are ordered in decreasing frequency.10931094\param mat the matrix whose empty columns will be removed.1095*/1096/**************************************************************************/1097void gk_csr_CompactColumns(gk_csr_t *mat)1098{1099ssize_t i;1100int nrows, ncols, nncols;1101ssize_t *rowptr;1102int *rowind, *colmap;1103gk_ikv_t *clens;11041105nrows = mat->nrows;1106ncols = mat->ncols;1107rowptr = mat->rowptr;1108rowind = mat->rowind;11091110colmap = gk_imalloc(ncols, "gk_csr_CompactColumns: colmap");11111112clens = gk_ikvmalloc(ncols, "gk_csr_CompactColumns: clens");1113for (i=0; i<ncols; i++) {1114clens[i].key = 0;1115clens[i].val = i;1116}11171118for (i=0; i<rowptr[nrows]; i++)1119clens[rowind[i]].key++;1120gk_ikvsortd(ncols, clens);11211122for (nncols=0, i=0; i<ncols; i++) {1123if (clens[i].key > 0)1124colmap[clens[i].val] = nncols++;1125else1126break;1127}11281129for (i=0; i<rowptr[nrows]; i++)1130rowind[i] = colmap[rowind[i]];11311132mat->ncols = nncols;11331134gk_free((void **)&colmap, &clens, LTERM);1135}113611371138/*************************************************************************/1139/*! Sorts the indices in increasing order1140\param mat the matrix itself,1141\param what is either GK_CSR_ROW or GK_CSR_COL indicating which set of1142indices to sort.1143*/1144/**************************************************************************/1145void gk_csr_SortIndices(gk_csr_t *mat, int what)1146{1147int n, nn=0;1148ssize_t *ptr;1149int *ind;1150float *val;11511152switch (what) {1153case GK_CSR_ROW:1154if (!mat->rowptr)1155gk_errexit(SIGERR, "Row-based view of the matrix does not exists.\n");11561157n = mat->nrows;1158ptr = mat->rowptr;1159ind = mat->rowind;1160val = mat->rowval;1161break;11621163case GK_CSR_COL:1164if (!mat->colptr)1165gk_errexit(SIGERR, "Column-based view of the matrix does not exists.\n");11661167n = mat->ncols;1168ptr = mat->colptr;1169ind = mat->colind;1170val = mat->colval;1171break;11721173default:1174gk_errexit(SIGERR, "Invalid index type of %d.\n", what);1175return;1176}11771178#pragma omp parallel if (n > 100)1179{1180ssize_t i, j, k;1181gk_ikv_t *cand;1182float *tval;11831184#pragma omp single1185for (i=0; i<n; i++)1186nn = gk_max(nn, ptr[i+1]-ptr[i]);11871188cand = gk_ikvmalloc(nn, "gk_csr_SortIndices: cand");1189tval = gk_fmalloc(nn, "gk_csr_SortIndices: tval");11901191#pragma omp for schedule(static)1192for (i=0; i<n; i++) {1193for (k=0, j=ptr[i]; j<ptr[i+1]; j++) {1194if (j > ptr[i] && ind[j] < ind[j-1])1195k = 1; /* an inversion */1196cand[j-ptr[i]].val = j-ptr[i];1197cand[j-ptr[i]].key = ind[j];1198tval[j-ptr[i]] = val[j];1199}1200if (k) {1201gk_ikvsorti(ptr[i+1]-ptr[i], cand);1202for (j=ptr[i]; j<ptr[i+1]; j++) {1203ind[j] = cand[j-ptr[i]].key;1204val[j] = tval[cand[j-ptr[i]].val];1205}1206}1207}12081209gk_free((void **)&cand, &tval, LTERM);1210}12111212}121312141215/*************************************************************************/1216/*! Creates a row/column index from the column/row data.1217\param mat the matrix itself,1218\param what is either GK_CSR_ROW or GK_CSR_COL indicating which index1219will be created.1220*/1221/**************************************************************************/1222void gk_csr_CreateIndex(gk_csr_t *mat, int what)1223{1224/* 'f' stands for forward, 'r' stands for reverse */1225ssize_t i, j, k, nf, nr;1226ssize_t *fptr, *rptr;1227int *find, *rind;1228float *fval, *rval;12291230switch (what) {1231case GK_CSR_COL:1232nf = mat->nrows;1233fptr = mat->rowptr;1234find = mat->rowind;1235fval = mat->rowval;12361237if (mat->colptr) gk_free((void **)&mat->colptr, LTERM);1238if (mat->colind) gk_free((void **)&mat->colind, LTERM);1239if (mat->colval) gk_free((void **)&mat->colval, LTERM);12401241nr = mat->ncols;1242rptr = mat->colptr = gk_zsmalloc(nr+1, 0, "gk_csr_CreateIndex: rptr");1243rind = mat->colind = gk_imalloc(fptr[nf], "gk_csr_CreateIndex: rind");1244rval = mat->colval = (fval ? gk_fmalloc(fptr[nf], "gk_csr_CreateIndex: rval") : NULL);1245break;1246case GK_CSR_ROW:1247nf = mat->ncols;1248fptr = mat->colptr;1249find = mat->colind;1250fval = mat->colval;12511252if (mat->rowptr) gk_free((void **)&mat->rowptr, LTERM);1253if (mat->rowind) gk_free((void **)&mat->rowind, LTERM);1254if (mat->rowval) gk_free((void **)&mat->rowval, LTERM);12551256nr = mat->nrows;1257rptr = mat->rowptr = gk_zsmalloc(nr+1, 0, "gk_csr_CreateIndex: rptr");1258rind = mat->rowind = gk_imalloc(fptr[nf], "gk_csr_CreateIndex: rind");1259rval = mat->rowval = (fval ? gk_fmalloc(fptr[nf], "gk_csr_CreateIndex: rval") : NULL);1260break;1261default:1262gk_errexit(SIGERR, "Invalid index type of %d.\n", what);1263return;1264}126512661267for (i=0; i<nf; i++) {1268for (j=fptr[i]; j<fptr[i+1]; j++)1269rptr[find[j]]++;1270}1271MAKECSR(i, nr, rptr);12721273if (rptr[nr] > 6*nr) {1274for (i=0; i<nf; i++) {1275for (j=fptr[i]; j<fptr[i+1]; j++)1276rind[rptr[find[j]]++] = i;1277}1278SHIFTCSR(i, nr, rptr);12791280if (fval) {1281for (i=0; i<nf; i++) {1282for (j=fptr[i]; j<fptr[i+1]; j++)1283rval[rptr[find[j]]++] = fval[j];1284}1285SHIFTCSR(i, nr, rptr);1286}1287}1288else {1289if (fval) {1290for (i=0; i<nf; i++) {1291for (j=fptr[i]; j<fptr[i+1]; j++) {1292k = find[j];1293rind[rptr[k]] = i;1294rval[rptr[k]++] = fval[j];1295}1296}1297}1298else {1299for (i=0; i<nf; i++) {1300for (j=fptr[i]; j<fptr[i+1]; j++)1301rind[rptr[find[j]]++] = i;1302}1303}1304SHIFTCSR(i, nr, rptr);1305}1306}130713081309/*************************************************************************/1310/*! Normalizes the rows/columns of the matrix to be unit1311length.1312\param mat the matrix itself,1313\param what indicates what will be normalized and is obtained by1314specifying GK_CSR_ROW, GK_CSR_COL, GK_CSR_ROW|GK_CSR_COL.1315\param norm indicates what norm is to normalize to, 1: 1-norm, 2: 2-norm1316*/1317/**************************************************************************/1318void gk_csr_Normalize(gk_csr_t *mat, int what, int norm)1319{1320ssize_t i, j;1321int n;1322ssize_t *ptr;1323float *val, sum;13241325if (what&GK_CSR_ROW && mat->rowval) {1326n = mat->nrows;1327ptr = mat->rowptr;1328val = mat->rowval;13291330#pragma omp parallel if (ptr[n] > OMPMINOPS)1331{1332#pragma omp for private(j,sum) schedule(static)1333for (i=0; i<n; i++) {1334for (sum=0.0, j=ptr[i]; j<ptr[i+1]; j++){1335if (norm == 2)1336sum += val[j]*val[j];1337else if (norm == 1)1338sum += val[j]; /* assume val[j] > 0 */1339}1340if (sum > 0) {1341if (norm == 2)1342sum=1.0/sqrt(sum);1343else if (norm == 1)1344sum=1.0/sum;1345for (j=ptr[i]; j<ptr[i+1]; j++)1346val[j] *= sum;13471348}1349}1350}1351}13521353if (what&GK_CSR_COL && mat->colval) {1354n = mat->ncols;1355ptr = mat->colptr;1356val = mat->colval;13571358#pragma omp parallel if (ptr[n] > OMPMINOPS)1359{1360#pragma omp for private(j,sum) schedule(static)1361for (i=0; i<n; i++) {1362for (sum=0.0, j=ptr[i]; j<ptr[i+1]; j++)1363if (norm == 2)1364sum += val[j]*val[j];1365else if (norm == 1)1366sum += val[j];1367if (sum > 0) {1368if (norm == 2)1369sum=1.0/sqrt(sum);1370else if (norm == 1)1371sum=1.0/sum;1372for (j=ptr[i]; j<ptr[i+1]; j++)1373val[j] *= sum;1374}1375}1376}1377}1378}137913801381/*************************************************************************/1382/*! Applies different row scaling methods.1383\param mat the matrix itself,1384\param type indicates the type of row scaling. Possible values are:1385GK_CSR_MAXTF, GK_CSR_SQRT, GK_CSR_LOG, GK_CSR_IDF, GK_CSR_MAXTF2.1386*/1387/**************************************************************************/1388void gk_csr_Scale(gk_csr_t *mat, int type)1389{1390ssize_t i, j;1391int nrows, ncols, nnzcols, bgfreq;1392ssize_t *rowptr;1393int *rowind, *collen;1394float *rowval, *cscale, maxtf;13951396nrows = mat->nrows;1397rowptr = mat->rowptr;1398rowind = mat->rowind;1399rowval = mat->rowval;14001401switch (type) {1402case GK_CSR_MAXTF: /* TF' = .5 + .5*TF/MAX(TF) */1403#pragma omp parallel if (rowptr[nrows] > OMPMINOPS)1404{1405#pragma omp for private(j, maxtf) schedule(static)1406for (i=0; i<nrows; i++) {1407maxtf = fabs(rowval[rowptr[i]]);1408for (j=rowptr[i]; j<rowptr[i+1]; j++)1409maxtf = (maxtf < fabs(rowval[j]) ? fabs(rowval[j]) : maxtf);14101411for (j=rowptr[i]; j<rowptr[i+1]; j++)1412rowval[j] = .5 + .5*rowval[j]/maxtf;1413}1414}1415break;14161417case GK_CSR_MAXTF2: /* TF' = .1 + .9*TF/MAX(TF) */1418#pragma omp parallel if (rowptr[nrows] > OMPMINOPS)1419{1420#pragma omp for private(j, maxtf) schedule(static)1421for (i=0; i<nrows; i++) {1422maxtf = fabs(rowval[rowptr[i]]);1423for (j=rowptr[i]; j<rowptr[i+1]; j++)1424maxtf = (maxtf < fabs(rowval[j]) ? fabs(rowval[j]) : maxtf);14251426for (j=rowptr[i]; j<rowptr[i+1]; j++)1427rowval[j] = .1 + .9*rowval[j]/maxtf;1428}1429}1430break;14311432case GK_CSR_SQRT: /* TF' = .1+SQRT(TF) */1433#pragma omp parallel if (rowptr[nrows] > OMPMINOPS)1434{1435#pragma omp for private(j) schedule(static)1436for (i=0; i<nrows; i++) {1437for (j=rowptr[i]; j<rowptr[i+1]; j++) {1438if (rowval[j] != 0.0)1439rowval[j] = .1+sign(rowval[j], sqrt(fabs(rowval[j])));1440}1441}1442}1443break;14441445case GK_CSR_POW25: /* TF' = .1+POW(TF,.25) */1446#pragma omp parallel if (rowptr[nrows] > OMPMINOPS)1447{1448#pragma omp for private(j) schedule(static)1449for (i=0; i<nrows; i++) {1450for (j=rowptr[i]; j<rowptr[i+1]; j++) {1451if (rowval[j] != 0.0)1452rowval[j] = .1+sign(rowval[j], sqrt(sqrt(fabs(rowval[j]))));1453}1454}1455}1456break;14571458case GK_CSR_POW65: /* TF' = .1+POW(TF,.65) */1459#pragma omp parallel if (rowptr[nrows] > OMPMINOPS)1460{1461#pragma omp for private(j) schedule(static)1462for (i=0; i<nrows; i++) {1463for (j=rowptr[i]; j<rowptr[i+1]; j++) {1464if (rowval[j] != 0.0)1465rowval[j] = .1+sign(rowval[j], powf(fabs(rowval[j]), .65));1466}1467}1468}1469break;14701471case GK_CSR_POW75: /* TF' = .1+POW(TF,.75) */1472#pragma omp parallel if (rowptr[nrows] > OMPMINOPS)1473{1474#pragma omp for private(j) schedule(static)1475for (i=0; i<nrows; i++) {1476for (j=rowptr[i]; j<rowptr[i+1]; j++) {1477if (rowval[j] != 0.0)1478rowval[j] = .1+sign(rowval[j], powf(fabs(rowval[j]), .75));1479}1480}1481}1482break;14831484case GK_CSR_POW85: /* TF' = .1+POW(TF,.85) */1485#pragma omp parallel if (rowptr[nrows] > OMPMINOPS)1486{1487#pragma omp for private(j) schedule(static)1488for (i=0; i<nrows; i++) {1489for (j=rowptr[i]; j<rowptr[i+1]; j++) {1490if (rowval[j] != 0.0)1491rowval[j] = .1+sign(rowval[j], powf(fabs(rowval[j]), .85));1492}1493}1494}1495break;14961497case GK_CSR_LOG: /* TF' = 1+log_2(TF) */1498#pragma omp parallel if (rowptr[nrows] > OMPMINOPS)1499{1500double logscale = 1.0/log(2.0);1501#pragma omp for schedule(static,32)1502for (i=0; i<rowptr[nrows]; i++) {1503if (rowval[i] != 0.0)1504rowval[i] = 1+(rowval[i]>0.0 ? log(rowval[i]) : -log(-rowval[i]))*logscale;1505}1506#ifdef XXX1507#pragma omp for private(j) schedule(static)1508for (i=0; i<nrows; i++) {1509for (j=rowptr[i]; j<rowptr[i+1]; j++) {1510if (rowval[j] != 0.0)1511rowval[j] = 1+(rowval[j]>0.0 ? log(rowval[j]) : -log(-rowval[j]))*logscale;1512//rowval[j] = 1+sign(rowval[j], log(fabs(rowval[j]))*logscale);1513}1514}1515#endif1516}1517break;15181519case GK_CSR_IDF: /* TF' = TF*IDF */1520ncols = mat->ncols;1521cscale = gk_fmalloc(ncols, "gk_csr_Scale: cscale");1522collen = gk_ismalloc(ncols, 0, "gk_csr_Scale: collen");15231524for (i=0; i<nrows; i++) {1525for (j=rowptr[i]; j<rowptr[i+1]; j++)1526collen[rowind[j]]++;1527}15281529#pragma omp parallel if (ncols > OMPMINOPS)1530{1531#pragma omp for schedule(static)1532for (i=0; i<ncols; i++)1533cscale[i] = (collen[i] > 0 ? log(1.0*nrows/collen[i]) : 0.0);1534}15351536#pragma omp parallel if (rowptr[nrows] > OMPMINOPS)1537{1538#pragma omp for private(j) schedule(static)1539for (i=0; i<nrows; i++) {1540for (j=rowptr[i]; j<rowptr[i+1]; j++)1541rowval[j] *= cscale[rowind[j]];1542}1543}15441545gk_free((void **)&cscale, &collen, LTERM);1546break;15471548case GK_CSR_IDF2: /* TF' = TF*IDF */1549ncols = mat->ncols;1550cscale = gk_fmalloc(ncols, "gk_csr_Scale: cscale");1551collen = gk_ismalloc(ncols, 0, "gk_csr_Scale: collen");15521553for (i=0; i<nrows; i++) {1554for (j=rowptr[i]; j<rowptr[i+1]; j++)1555collen[rowind[j]]++;1556}15571558nnzcols = 0;1559#pragma omp parallel if (ncols > OMPMINOPS)1560{1561#pragma omp for schedule(static) reduction(+:nnzcols)1562for (i=0; i<ncols; i++)1563nnzcols += (collen[i] > 0 ? 1 : 0);15641565bgfreq = gk_max(10, (ssize_t)(.5*rowptr[nrows]/nnzcols));1566printf("nnz: %zd, nnzcols: %d, bgfreq: %d\n", rowptr[nrows], nnzcols, bgfreq);15671568#pragma omp for schedule(static)1569for (i=0; i<ncols; i++)1570cscale[i] = (collen[i] > 0 ? log(1.0*(nrows+2*bgfreq)/(bgfreq+collen[i])) : 0.0);1571}15721573#pragma omp parallel if (rowptr[nrows] > OMPMINOPS)1574{1575#pragma omp for private(j) schedule(static)1576for (i=0; i<nrows; i++) {1577for (j=rowptr[i]; j<rowptr[i+1]; j++)1578rowval[j] *= cscale[rowind[j]];1579}1580}15811582gk_free((void **)&cscale, &collen, LTERM);1583break;15841585default:1586gk_errexit(SIGERR, "Unknown scaling type of %d\n", type);1587}15881589}159015911592/*************************************************************************/1593/*! Computes the sums of the rows/columns1594\param mat the matrix itself,1595\param what is either GK_CSR_ROW or GK_CSR_COL indicating which1596sums to compute.1597*/1598/**************************************************************************/1599void gk_csr_ComputeSums(gk_csr_t *mat, int what)1600{1601ssize_t i;1602int n;1603ssize_t *ptr;1604float *val, *sums;16051606switch (what) {1607case GK_CSR_ROW:1608n = mat->nrows;1609ptr = mat->rowptr;1610val = mat->rowval;16111612if (mat->rsums)1613gk_free((void **)&mat->rsums, LTERM);16141615sums = mat->rsums = gk_fsmalloc(n, 0, "gk_csr_ComputeSums: sums");1616break;1617case GK_CSR_COL:1618n = mat->ncols;1619ptr = mat->colptr;1620val = mat->colval;16211622if (mat->csums)1623gk_free((void **)&mat->csums, LTERM);16241625sums = mat->csums = gk_fsmalloc(n, 0, "gk_csr_ComputeSums: sums");1626break;1627default:1628gk_errexit(SIGERR, "Invalid sum type of %d.\n", what);1629return;1630}16311632#pragma omp parallel for if (ptr[n] > OMPMINOPS) schedule(static)1633for (i=0; i<n; i++)1634sums[i] = gk_fsum(ptr[i+1]-ptr[i], val+ptr[i], 1);1635}163616371638/*************************************************************************/1639/*! Computes the squared of the norms of the rows/columns1640\param mat the matrix itself,1641\param what is either GK_CSR_ROW or GK_CSR_COL indicating which1642squared norms to compute.1643*/1644/**************************************************************************/1645void gk_csr_ComputeSquaredNorms(gk_csr_t *mat, int what)1646{1647ssize_t i;1648int n;1649ssize_t *ptr;1650float *val, *norms;16511652switch (what) {1653case GK_CSR_ROW:1654n = mat->nrows;1655ptr = mat->rowptr;1656val = mat->rowval;16571658if (mat->rnorms) gk_free((void **)&mat->rnorms, LTERM);16591660norms = mat->rnorms = gk_fsmalloc(n, 0, "gk_csr_ComputeSums: norms");1661break;1662case GK_CSR_COL:1663n = mat->ncols;1664ptr = mat->colptr;1665val = mat->colval;16661667if (mat->cnorms) gk_free((void **)&mat->cnorms, LTERM);16681669norms = mat->cnorms = gk_fsmalloc(n, 0, "gk_csr_ComputeSums: norms");1670break;1671default:1672gk_errexit(SIGERR, "Invalid norm type of %d.\n", what);1673return;1674}16751676#pragma omp parallel for if (ptr[n] > OMPMINOPS) schedule(static)1677for (i=0; i<n; i++)1678norms[i] = gk_fdot(ptr[i+1]-ptr[i], val+ptr[i], 1, val+ptr[i], 1);1679}168016811682/*************************************************************************/1683/*! Computes the similarity between two rows/columns16841685\param mat the matrix itself. The routine assumes that the indices1686are sorted in increasing order.1687\param i1 is the first row/column,1688\param i2 is the second row/column,1689\param what is either GK_CSR_ROW or GK_CSR_COL indicating the type of1690objects between the similarity will be computed,1691\param simtype is the type of similarity and is one of GK_CSR_COS,1692GK_CSR_JAC, GK_CSR_MIN, GK_CSR_AMIN1693\returns the similarity between the two rows/columns.1694*/1695/**************************************************************************/1696float gk_csr_ComputeSimilarity(gk_csr_t *mat, int i1, int i2, int what, int simtype)1697{1698int nind1, nind2;1699int *ind1, *ind2;1700float *val1, *val2, stat1, stat2, sim;17011702switch (what) {1703case GK_CSR_ROW:1704if (!mat->rowptr)1705gk_errexit(SIGERR, "Row-based view of the matrix does not exists.\n");1706nind1 = mat->rowptr[i1+1]-mat->rowptr[i1];1707nind2 = mat->rowptr[i2+1]-mat->rowptr[i2];1708ind1 = mat->rowind + mat->rowptr[i1];1709ind2 = mat->rowind + mat->rowptr[i2];1710val1 = mat->rowval + mat->rowptr[i1];1711val2 = mat->rowval + mat->rowptr[i2];1712break;17131714case GK_CSR_COL:1715if (!mat->colptr)1716gk_errexit(SIGERR, "Column-based view of the matrix does not exists.\n");1717nind1 = mat->colptr[i1+1]-mat->colptr[i1];1718nind2 = mat->colptr[i2+1]-mat->colptr[i2];1719ind1 = mat->colind + mat->colptr[i1];1720ind2 = mat->colind + mat->colptr[i2];1721val1 = mat->colval + mat->colptr[i1];1722val2 = mat->colval + mat->colptr[i2];1723break;17241725default:1726gk_errexit(SIGERR, "Invalid index type of %d.\n", what);1727return 0.0;1728}172917301731switch (simtype) {1732case GK_CSR_COS:1733case GK_CSR_JAC:1734sim = stat1 = stat2 = 0.0;1735i1 = i2 = 0;1736while (i1<nind1 && i2<nind2) {1737if (i1 == nind1) {1738stat2 += val2[i2]*val2[i2];1739i2++;1740}1741else if (i2 == nind2) {1742stat1 += val1[i1]*val1[i1];1743i1++;1744}1745else if (ind1[i1] < ind2[i2]) {1746stat1 += val1[i1]*val1[i1];1747i1++;1748}1749else if (ind1[i1] > ind2[i2]) {1750stat2 += val2[i2]*val2[i2];1751i2++;1752}1753else {1754sim += val1[i1]*val2[i2];1755stat1 += val1[i1]*val1[i1];1756stat2 += val2[i2]*val2[i2];1757i1++;1758i2++;1759}1760}1761if (simtype == GK_CSR_COS)1762sim = (stat1*stat2 > 0.0 ? sim/sqrt(stat1*stat2) : 0.0);1763else1764sim = (stat1+stat2-sim > 0.0 ? sim/(stat1+stat2-sim) : 0.0);1765break;17661767case GK_CSR_MIN:1768sim = stat1 = stat2 = 0.0;1769i1 = i2 = 0;1770while (i1<nind1 && i2<nind2) {1771if (i1 == nind1) {1772stat2 += val2[i2];1773i2++;1774}1775else if (i2 == nind2) {1776stat1 += val1[i1];1777i1++;1778}1779else if (ind1[i1] < ind2[i2]) {1780stat1 += val1[i1];1781i1++;1782}1783else if (ind1[i1] > ind2[i2]) {1784stat2 += val2[i2];1785i2++;1786}1787else {1788sim += gk_min(val1[i1],val2[i2]);1789stat1 += val1[i1];1790stat2 += val2[i2];1791i1++;1792i2++;1793}1794}1795sim = (stat1+stat2-sim > 0.0 ? sim/(stat1+stat2-sim) : 0.0);17961797break;17981799case GK_CSR_AMIN:1800sim = stat1 = stat2 = 0.0;1801i1 = i2 = 0;1802while (i1<nind1 && i2<nind2) {1803if (i1 == nind1) {1804stat2 += val2[i2];1805i2++;1806}1807else if (i2 == nind2) {1808stat1 += val1[i1];1809i1++;1810}1811else if (ind1[i1] < ind2[i2]) {1812stat1 += val1[i1];1813i1++;1814}1815else if (ind1[i1] > ind2[i2]) {1816stat2 += val2[i2];1817i2++;1818}1819else {1820sim += gk_min(val1[i1],val2[i2]);1821stat1 += val1[i1];1822stat2 += val2[i2];1823i1++;1824i2++;1825}1826}1827sim = (stat1 > 0.0 ? sim/stat1 : 0.0);18281829break;18301831default:1832gk_errexit(SIGERR, "Unknown similarity measure %d\n", simtype);1833return -1;1834}18351836return sim;18371838}183918401841/*************************************************************************/1842/*! Finds the n most similar rows (neighbors) to the query using cosine1843similarity.18441845\param mat the matrix itself1846\param nqterms is the number of columns in the query1847\param qind is the list of query columns1848\param qval is the list of correspodning query weights1849\param simtype is the type of similarity and is one of GK_CSR_COS,1850GK_CSR_JAC, GK_CSR_MIN, GK_CSR_AMIN1851\param nsim is the maximum number of requested most similar rows.1852If -1 is provided, then everything is returned unsorted.1853\param minsim is the minimum similarity of the requested most1854similar rows1855\param hits is the result set. This array should be at least1856of length nsim.1857\param i_marker is an array of size equal to the number of rows1858whose values are initialized to -1. If NULL is provided1859then this array is allocated and freed internally.1860\param i_cand is an array of size equal to the number of rows.1861If NULL is provided then this array is allocated and freed1862internally.1863\returns the number of identified most similar rows, which can be1864smaller than the requested number of nnbrs in those cases1865in which there are no sufficiently many neighbors.1866*/1867/**************************************************************************/1868int gk_csr_GetSimilarRows(gk_csr_t *mat, int nqterms, int *qind,1869float *qval, int simtype, int nsim, float minsim, gk_fkv_t *hits,1870int *i_marker, gk_fkv_t *i_cand)1871{1872ssize_t i, ii, j, k;1873int nrows, ncols, ncand;1874ssize_t *colptr;1875int *colind, *marker;1876float *colval, *rnorms, mynorm, *rsums, mysum;1877gk_fkv_t *cand;18781879if (nqterms == 0)1880return 0;18811882nrows = mat->nrows;1883ncols = mat->ncols;1884colptr = mat->colptr;1885colind = mat->colind;1886colval = mat->colval;18871888marker = (i_marker ? i_marker : gk_ismalloc(nrows, -1, "gk_csr_SimilarRows: marker"));1889cand = (i_cand ? i_cand : gk_fkvmalloc(nrows, "gk_csr_SimilarRows: cand"));18901891switch (simtype) {1892case GK_CSR_COS:1893for (ncand=0, ii=0; ii<nqterms; ii++) {1894i = qind[ii];1895if (i < ncols) {1896for (j=colptr[i]; j<colptr[i+1]; j++) {1897k = colind[j];1898if (marker[k] == -1) {1899cand[ncand].val = k;1900cand[ncand].key = 0;1901marker[k] = ncand++;1902}1903cand[marker[k]].key += colval[j]*qval[ii];1904}1905}1906}1907break;19081909case GK_CSR_JAC:1910for (ncand=0, ii=0; ii<nqterms; ii++) {1911i = qind[ii];1912if (i < ncols) {1913for (j=colptr[i]; j<colptr[i+1]; j++) {1914k = colind[j];1915if (marker[k] == -1) {1916cand[ncand].val = k;1917cand[ncand].key = 0;1918marker[k] = ncand++;1919}1920cand[marker[k]].key += colval[j]*qval[ii];1921}1922}1923}19241925rnorms = mat->rnorms;1926mynorm = gk_fdot(nqterms, qval, 1, qval, 1);19271928for (i=0; i<ncand; i++)1929cand[i].key = cand[i].key/(rnorms[cand[i].val]+mynorm-cand[i].key);1930break;19311932case GK_CSR_MIN:1933for (ncand=0, ii=0; ii<nqterms; ii++) {1934i = qind[ii];1935if (i < ncols) {1936for (j=colptr[i]; j<colptr[i+1]; j++) {1937k = colind[j];1938if (marker[k] == -1) {1939cand[ncand].val = k;1940cand[ncand].key = 0;1941marker[k] = ncand++;1942}1943cand[marker[k]].key += gk_min(colval[j], qval[ii]);1944}1945}1946}19471948rsums = mat->rsums;1949mysum = gk_fsum(nqterms, qval, 1);19501951for (i=0; i<ncand; i++)1952cand[i].key = cand[i].key/(rsums[cand[i].val]+mysum-cand[i].key);1953break;19541955/* Assymetric MIN similarity */1956case GK_CSR_AMIN:1957for (ncand=0, ii=0; ii<nqterms; ii++) {1958i = qind[ii];1959if (i < ncols) {1960for (j=colptr[i]; j<colptr[i+1]; j++) {1961k = colind[j];1962if (marker[k] == -1) {1963cand[ncand].val = k;1964cand[ncand].key = 0;1965marker[k] = ncand++;1966}1967cand[marker[k]].key += gk_min(colval[j], qval[ii]);1968}1969}1970}19711972mysum = gk_fsum(nqterms, qval, 1);19731974for (i=0; i<ncand; i++)1975cand[i].key = cand[i].key/mysum;1976break;19771978default:1979gk_errexit(SIGERR, "Unknown similarity measure %d\n", simtype);1980return -1;1981}19821983/* go and prune the hits that are bellow minsim */1984for (j=0, i=0; i<ncand; i++) {1985marker[cand[i].val] = -1;1986if (cand[i].key >= minsim)1987cand[j++] = cand[i];1988}1989ncand = j;19901991if (nsim == -1 || nsim >= ncand) {1992nsim = ncand;1993}1994else {1995nsim = gk_min(nsim, ncand);1996gk_dfkvkselect(ncand, nsim, cand);1997gk_fkvsortd(nsim, cand);1998}19992000gk_fkvcopy(nsim, cand, hits);20012002if (i_marker == NULL)2003gk_free((void **)&marker, LTERM);2004if (i_cand == NULL)2005gk_free((void **)&cand, LTERM);20062007return nsim;2008}2009201020112012