CoCalc -- csr.c

GitHub Repository: ElmerCSC/elmerfem
Path: blob/devel/elmergrid/src/metis-5.1.0/GKlib/csr.c
³²⁰⁶ views
1
/*!
2
 * \file 
3
 *
4
 * \brief Various routines with dealing with CSR matrices
5
 *
6
 * \author George Karypis
7
 * \version\verbatim $Id: csr.c 13437 2013-01-11 21:54:10Z karypis $ \endverbatim
8
 */
9

10
#include <GKlib.h>
11

12
#define OMPMINOPS       50000
13

14
/*************************************************************************/
15
/*! Allocate memory for a CSR matrix and initializes it 
16
    \returns the allocated matrix. The various fields are set to NULL.
17
*/
18
/**************************************************************************/
19
gk_csr_t *gk_csr_Create()
20
{
21
  gk_csr_t *mat;
22

23
  mat = (gk_csr_t *)gk_malloc(sizeof(gk_csr_t), "gk_csr_Create: mat");
24

25
  gk_csr_Init(mat);
26

27
  return mat;
28
}
29

30

31
/*************************************************************************/
32
/*! Initializes the matrix 
33
    \param mat is the matrix to be initialized.
34
*/
35
/*************************************************************************/
36
void gk_csr_Init(gk_csr_t *mat)
37
{
38
  memset(mat, 0, sizeof(gk_csr_t));
39
  mat->nrows = mat->ncols = -1;
40
}
41

42

43
/*************************************************************************/
44
/*! Frees all the memory allocated for matrix.
45
    \param mat is the matrix to be freed.
46
*/
47
/*************************************************************************/
48
void gk_csr_Free(gk_csr_t **mat)
49
{
50
  if (*mat == NULL)
51
    return;
52
  gk_csr_FreeContents(*mat);
53
  gk_free((void **)mat, LTERM);
54
}
55

56

57
/*************************************************************************/
58
/*! Frees only the memory allocated for the matrix's different fields and
59
    sets them to NULL.
60
    \param mat is the matrix whose contents will be freed.
61
*/    
62
/*************************************************************************/
63
void gk_csr_FreeContents(gk_csr_t *mat)
64
{
65
  gk_free((void *)&mat->rowptr, &mat->rowind, &mat->rowval, &mat->rowids,
66
          &mat->colptr, &mat->colind, &mat->colval, &mat->colids, 
67
          &mat->rnorms, &mat->cnorms, &mat->rsums, &mat->csums, 
68
          &mat->rsizes, &mat->csizes, &mat->rvols, &mat->cvols, 
69
          &mat->rwgts, &mat->cwgts, 
70
          LTERM);
71
}
72

73

74
/*************************************************************************/
75
/*! Returns a copy of a matrix.
76
    \param mat is the matrix to be duplicated.
77
    \returns the newly created copy of the matrix.
78
*/
79
/**************************************************************************/
80
gk_csr_t *gk_csr_Dup(gk_csr_t *mat)
81
{
82
  gk_csr_t *nmat;
83

84
  nmat = gk_csr_Create();
85

86
  nmat->nrows  = mat->nrows;
87
  nmat->ncols  = mat->ncols;
88

89
  /* copy the row structure */
90
  if (mat->rowptr)
91
    nmat->rowptr = gk_zcopy(mat->nrows+1, mat->rowptr, 
92
                            gk_zmalloc(mat->nrows+1, "gk_csr_Dup: rowptr"));
93
  if (mat->rowids)
94
    nmat->rowids = gk_icopy(mat->nrows, mat->rowids, 
95
                            gk_imalloc(mat->nrows, "gk_csr_Dup: rowids"));
96
  if (mat->rnorms)
97
    nmat->rnorms = gk_fcopy(mat->nrows, mat->rnorms, 
98
                            gk_fmalloc(mat->nrows, "gk_csr_Dup: rnorms"));
99
  if (mat->rowind)
100
    nmat->rowind = gk_icopy(mat->rowptr[mat->nrows], mat->rowind, 
101
                            gk_imalloc(mat->rowptr[mat->nrows], "gk_csr_Dup: rowind"));
102
  if (mat->rowval)
103
    nmat->rowval = gk_fcopy(mat->rowptr[mat->nrows], mat->rowval, 
104
                            gk_fmalloc(mat->rowptr[mat->nrows], "gk_csr_Dup: rowval"));
105

106
  /* copy the col structure */
107
  if (mat->colptr)
108
    nmat->colptr = gk_zcopy(mat->ncols+1, mat->colptr, 
109
                            gk_zmalloc(mat->ncols+1, "gk_csr_Dup: colptr"));
110
  if (mat->colids)
111
    nmat->colids = gk_icopy(mat->ncols, mat->colids, 
112
                            gk_imalloc(mat->ncols, "gk_csr_Dup: colids"));
113
  if (mat->cnorms)
114
    nmat->cnorms = gk_fcopy(mat->ncols, mat->cnorms, 
115
                            gk_fmalloc(mat->ncols, "gk_csr_Dup: cnorms"));
116
  if (mat->colind)
117
    nmat->colind = gk_icopy(mat->colptr[mat->ncols], mat->colind, 
118
                            gk_imalloc(mat->colptr[mat->ncols], "gk_csr_Dup: colind"));
119
  if (mat->colval)
120
    nmat->colval = gk_fcopy(mat->colptr[mat->ncols], mat->colval, 
121
                            gk_fmalloc(mat->colptr[mat->ncols], "gk_csr_Dup: colval"));
122

123
  return nmat;
124
}
125

126

127
/*************************************************************************/
128
/*! Returns a submatrix containint a set of consecutive rows.
129
    \param mat is the original matrix.
130
    \param rstart is the starting row.
131
    \param nrows is the number of rows from rstart to extract.
132
    \returns the row structure of the newly created submatrix.
133
*/
134
/**************************************************************************/
135
gk_csr_t *gk_csr_ExtractSubmatrix(gk_csr_t *mat, int rstart, int nrows)
136
{
137
  ssize_t i;
138
  gk_csr_t *nmat;
139

140
  if (rstart+nrows > mat->nrows)
141
    return NULL;
142

143
  nmat = gk_csr_Create();
144

145
  nmat->nrows  = nrows;
146
  nmat->ncols  = mat->ncols;
147

148
  /* copy the row structure */
149
  if (mat->rowptr)
150
    nmat->rowptr = gk_zcopy(nrows+1, mat->rowptr+rstart, 
151
                              gk_zmalloc(nrows+1, "gk_csr_ExtractSubmatrix: rowptr"));
152
  for (i=nrows; i>=0; i--)
153
    nmat->rowptr[i] -= nmat->rowptr[0];
154
  ASSERT(nmat->rowptr[0] == 0);
155

156
  if (mat->rowids)
157
    nmat->rowids = gk_icopy(nrows, mat->rowids+rstart, 
158
                            gk_imalloc(nrows, "gk_csr_ExtractSubmatrix: rowids"));
159
  if (mat->rnorms)
160
    nmat->rnorms = gk_fcopy(nrows, mat->rnorms+rstart, 
161
                            gk_fmalloc(nrows, "gk_csr_ExtractSubmatrix: rnorms"));
162

163
  if (mat->rsums)
164
    nmat->rsums = gk_fcopy(nrows, mat->rsums+rstart, 
165
                            gk_fmalloc(nrows, "gk_csr_ExtractSubmatrix: rsums"));
166

167
  ASSERT(nmat->rowptr[nrows] == mat->rowptr[rstart+nrows]-mat->rowptr[rstart]);
168
  if (mat->rowind)
169
    nmat->rowind = gk_icopy(mat->rowptr[rstart+nrows]-mat->rowptr[rstart], 
170
                            mat->rowind+mat->rowptr[rstart], 
171
                            gk_imalloc(mat->rowptr[rstart+nrows]-mat->rowptr[rstart],
172
                                       "gk_csr_ExtractSubmatrix: rowind"));
173
  if (mat->rowval)
174
    nmat->rowval = gk_fcopy(mat->rowptr[rstart+nrows]-mat->rowptr[rstart], 
175
                            mat->rowval+mat->rowptr[rstart], 
176
                            gk_fmalloc(mat->rowptr[rstart+nrows]-mat->rowptr[rstart],
177
                                       "gk_csr_ExtractSubmatrix: rowval"));
178

179
  return nmat;
180
}
181

182

183
/*************************************************************************/
184
/*! Returns a submatrix containing a certain set of rows.
185
    \param mat is the original matrix.
186
    \param nrows is the number of rows to extract.
187
    \param rind is the set of row numbers to extract.
188
    \returns the row structure of the newly created submatrix.
189
*/
190
/**************************************************************************/
191
gk_csr_t *gk_csr_ExtractRows(gk_csr_t *mat, int nrows, int *rind)
192
{
193
  ssize_t i, ii, j, nnz;
194
  gk_csr_t *nmat;
195

196
  nmat = gk_csr_Create();
197

198
  nmat->nrows = nrows;
199
  nmat->ncols = mat->ncols;
200

201
  for (nnz=0, i=0; i<nrows; i++)  
202
    nnz += mat->rowptr[rind[i]+1]-mat->rowptr[rind[i]];
203

204
  nmat->rowptr = gk_zmalloc(nmat->nrows+1, "gk_csr_ExtractPartition: rowptr");
205
  nmat->rowind = gk_imalloc(nnz, "gk_csr_ExtractPartition: rowind");
206
  nmat->rowval = gk_fmalloc(nnz, "gk_csr_ExtractPartition: rowval");
207

208
  nmat->rowptr[0] = 0;
209
  for (nnz=0, j=0, ii=0; ii<nrows; ii++) {
210
    i = rind[ii];
211
    gk_icopy(mat->rowptr[i+1]-mat->rowptr[i], mat->rowind+mat->rowptr[i], nmat->rowind+nnz);
212
    gk_fcopy(mat->rowptr[i+1]-mat->rowptr[i], mat->rowval+mat->rowptr[i], nmat->rowval+nnz);
213
    nnz += mat->rowptr[i+1]-mat->rowptr[i];
214
    nmat->rowptr[++j] = nnz;
215
  }
216
  ASSERT(j == nmat->nrows);
217

218
  return nmat;
219
}
220

221

222
/*************************************************************************/
223
/*! Returns a submatrix corresponding to a specified partitioning of rows.
224
    \param mat is the original matrix.
225
    \param part is the partitioning vector of the rows.
226
    \param pid is the partition ID that will be extracted.
227
    \returns the row structure of the newly created submatrix.
228
*/
229
/**************************************************************************/
230
gk_csr_t *gk_csr_ExtractPartition(gk_csr_t *mat, int *part, int pid)
231
{
232
  ssize_t i, j, nnz;
233
  gk_csr_t *nmat;
234

235
  nmat = gk_csr_Create();
236

237
  nmat->nrows = 0;
238
  nmat->ncols = mat->ncols;
239

240
  for (nnz=0, i=0; i<mat->nrows; i++) {
241
    if (part[i] == pid) {
242
      nmat->nrows++;
243
      nnz += mat->rowptr[i+1]-mat->rowptr[i];
244
    }
245
  }
246

247
  nmat->rowptr = gk_zmalloc(nmat->nrows+1, "gk_csr_ExtractPartition: rowptr");
248
  nmat->rowind = gk_imalloc(nnz, "gk_csr_ExtractPartition: rowind");
249
  nmat->rowval = gk_fmalloc(nnz, "gk_csr_ExtractPartition: rowval");
250

251
  nmat->rowptr[0] = 0;
252
  for (nnz=0, j=0, i=0; i<mat->nrows; i++) {
253
    if (part[i] == pid) {
254
      gk_icopy(mat->rowptr[i+1]-mat->rowptr[i], mat->rowind+mat->rowptr[i], nmat->rowind+nnz);
255
      gk_fcopy(mat->rowptr[i+1]-mat->rowptr[i], mat->rowval+mat->rowptr[i], nmat->rowval+nnz);
256
      nnz += mat->rowptr[i+1]-mat->rowptr[i];
257
      nmat->rowptr[++j] = nnz;
258
    }
259
  }
260
  ASSERT(j == nmat->nrows);
261

262
  return nmat;
263
}
264

265

266
/*************************************************************************/
267
/*! Splits the matrix into multiple sub-matrices based on the provided
268
    color array.
269
    \param mat is the original matrix.
270
    \param color is an array of size equal to the number of non-zeros
271
           in the matrix (row-wise structure). The matrix is split into
272
           as many parts as the number of colors. For meaningfull results,
273
           the colors should be numbered consecutively starting from 0.
274
    \returns an array of matrices for each supplied color number.
275
*/
276
/**************************************************************************/
277
gk_csr_t **gk_csr_Split(gk_csr_t *mat, int *color)
278
{
279
  ssize_t i, j;
280
  int nrows, ncolors;
281
  ssize_t *rowptr;
282
  int *rowind;
283
  float *rowval;
284
  gk_csr_t **smats;
285

286
  nrows  = mat->nrows;
287
  rowptr = mat->rowptr;
288
  rowind = mat->rowind;
289
  rowval = mat->rowval;
290

291
  ncolors = gk_imax(rowptr[nrows], color)+1;
292

293
  smats = (gk_csr_t **)gk_malloc(sizeof(gk_csr_t *)*ncolors, "gk_csr_Split: smats");
294
  for (i=0; i<ncolors; i++) {
295
    smats[i] = gk_csr_Create();
296
    smats[i]->nrows  = mat->nrows;
297
    smats[i]->ncols  = mat->ncols;
298
    smats[i]->rowptr = gk_zsmalloc(nrows+1, 0, "gk_csr_Split: smats[i]->rowptr"); 
299
  }
300

301
  for (i=0; i<nrows; i++) {
302
    for (j=rowptr[i]; j<rowptr[i+1]; j++) 
303
      smats[color[j]]->rowptr[i]++;
304
  }
305
  for (i=0; i<ncolors; i++) 
306
    MAKECSR(j, nrows, smats[i]->rowptr);
307

308
  for (i=0; i<ncolors; i++) {
309
    smats[i]->rowind = gk_imalloc(smats[i]->rowptr[nrows], "gk_csr_Split: smats[i]->rowind"); 
310
    smats[i]->rowval = gk_fmalloc(smats[i]->rowptr[nrows], "gk_csr_Split: smats[i]->rowval"); 
311
  }
312

313
  for (i=0; i<nrows; i++) {
314
    for (j=rowptr[i]; j<rowptr[i+1]; j++) {
315
      smats[color[j]]->rowind[smats[color[j]]->rowptr[i]] = rowind[j];
316
      smats[color[j]]->rowval[smats[color[j]]->rowptr[i]] = rowval[j];
317
      smats[color[j]]->rowptr[i]++;
318
    }
319
  }
320

321
  for (i=0; i<ncolors; i++) 
322
    SHIFTCSR(j, nrows, smats[i]->rowptr);
323

324
  return smats;
325
}
326

327

328
/**************************************************************************/
329
/*! Reads a CSR matrix from the supplied file and stores it the matrix's 
330
    forward structure.
331
    \param filename is the file that stores the data.
332
    \param format is either GK_CSR_FMT_METIS, GK_CSR_FMT_CLUTO, 
333
           GK_CSR_FMT_CSR, GK_CSR_FMT_BINROW, GK_CSR_FMT_BINCOL 
334
           specifying the type of the input format. 
335
           The GK_CSR_FMT_CSR does not contain a header
336
           line, whereas the GK_CSR_FMT_BINROW is a binary format written 
337
           by gk_csr_Write() using the same format specifier.
338
    \param readvals is either 1 or 0, indicating if the CSR file contains
339
           values or it does not. It only applies when GK_CSR_FMT_CSR is
340
           used.
341
    \param numbering is either 1 or 0, indicating if the numbering of the 
342
           indices start from 1 or 0, respectively. If they start from 1, 
343
           they are automatically decreamented during input so that they
344
           will start from 0. It only applies when GK_CSR_FMT_CSR is
345
           used.
346
    \returns the matrix that was read.
347
*/
348
/**************************************************************************/
349
gk_csr_t *gk_csr_Read(char *filename, int format, int readvals, int numbering)
350
{
351
  ssize_t i, k, l;
352
  size_t nfields, nrows, ncols, nnz, fmt, ncon;
353
  size_t lnlen;
354
  ssize_t *rowptr;
355
  int *rowind, ival;
356
  float *rowval=NULL, fval;
357
  int readsizes, readwgts;
358
  char *line=NULL, *head, *tail, fmtstr[256];
359
  FILE *fpin;
360
  gk_csr_t *mat=NULL;
361

362

363
  if (!gk_fexists(filename)) 
364
    gk_errexit(SIGERR, "File %s does not exist!\n", filename);
365

366
  if (format == GK_CSR_FMT_BINROW) {
367
    mat = gk_csr_Create();
368

369
    fpin = gk_fopen(filename, "rb", "gk_csr_Read: fpin");
370
    if (fread(&(mat->nrows), sizeof(int32_t), 1, fpin) != 1)
371
      gk_errexit(SIGERR, "Failed to read the nrows from file %s!\n", filename);
372
    if (fread(&(mat->ncols), sizeof(int32_t), 1, fpin) != 1)
373
      gk_errexit(SIGERR, "Failed to read the ncols from file %s!\n", filename);
374
    mat->rowptr = gk_zmalloc(mat->nrows+1, "gk_csr_Read: rowptr");
375
    if (fread(mat->rowptr, sizeof(ssize_t), mat->nrows+1, fpin) != mat->nrows+1)
376
      gk_errexit(SIGERR, "Failed to read the rowptr from file %s!\n", filename);
377
    mat->rowind = gk_imalloc(mat->rowptr[mat->nrows], "gk_csr_Read: rowind");
378
    if (fread(mat->rowind, sizeof(int32_t), mat->rowptr[mat->nrows], fpin) != mat->rowptr[mat->nrows])
379
      gk_errexit(SIGERR, "Failed to read the rowind from file %s!\n", filename);
380
    if (readvals == 1) {
381
      mat->rowval = gk_fmalloc(mat->rowptr[mat->nrows], "gk_csr_Read: rowval");
382
      if (fread(mat->rowval, sizeof(float), mat->rowptr[mat->nrows], fpin) != mat->rowptr[mat->nrows])
383
        gk_errexit(SIGERR, "Failed to read the rowval from file %s!\n", filename);
384
    }
385

386
    gk_fclose(fpin);
387
    return mat;
388
  }
389

390
  if (format == GK_CSR_FMT_BINCOL) {
391
    mat = gk_csr_Create();
392

393
    fpin = gk_fopen(filename, "rb", "gk_csr_Read: fpin");
394
    if (fread(&(mat->nrows), sizeof(int32_t), 1, fpin) != 1)
395
      gk_errexit(SIGERR, "Failed to read the nrows from file %s!\n", filename);
396
    if (fread(&(mat->ncols), sizeof(int32_t), 1, fpin) != 1)
397
      gk_errexit(SIGERR, "Failed to read the ncols from file %s!\n", filename);
398
    mat->colptr = gk_zmalloc(mat->ncols+1, "gk_csr_Read: colptr");
399
    if (fread(mat->colptr, sizeof(ssize_t), mat->ncols+1, fpin) != mat->ncols+1)
400
      gk_errexit(SIGERR, "Failed to read the colptr from file %s!\n", filename);
401
    mat->colind = gk_imalloc(mat->colptr[mat->ncols], "gk_csr_Read: colind");
402
    if (fread(mat->colind, sizeof(int32_t), mat->colptr[mat->ncols], fpin) != mat->colptr[mat->ncols])
403
      gk_errexit(SIGERR, "Failed to read the colind from file %s!\n", filename);
404
    if (readvals) {
405
      mat->colval = gk_fmalloc(mat->colptr[mat->ncols], "gk_csr_Read: colval");
406
      if (fread(mat->colval, sizeof(float), mat->colptr[mat->ncols], fpin) != mat->colptr[mat->ncols])
407
        gk_errexit(SIGERR, "Failed to read the colval from file %s!\n", filename);
408
    }
409

410
    gk_fclose(fpin);
411
    return mat;
412
  }
413

414

415
  if (format == GK_CSR_FMT_CLUTO) {
416
    fpin = gk_fopen(filename, "r", "gk_csr_Read: fpin");
417
    do {
418
      if (gk_getline(&line, &lnlen, fpin) <= 0)
419
        gk_errexit(SIGERR, "Premature end of input file: file:%s\n", filename);
420
    } while (line[0] == '%');
421

422
    if (sscanf(line, "%zu %zu %zu", &nrows, &ncols, &nnz) != 3)
423
      gk_errexit(SIGERR, "Header line must contain 3 integers.\n");
424

425
    readsizes = 0;
426
    readwgts  = 0;
427
    readvals  = 1;
428
    numbering = 1;
429
  }
430
  else if (format == GK_CSR_FMT_METIS) {
431
    fpin = gk_fopen(filename, "r", "gk_csr_Read: fpin");
432
    do {
433
      if (gk_getline(&line, &lnlen, fpin) <= 0)
434
        gk_errexit(SIGERR, "Premature end of input file: file:%s\n", filename);
435
    } while (line[0] == '%');
436

437
    fmt = ncon = 0;
438
    nfields = sscanf(line, "%zu %zu %zu %zu", &nrows, &nnz, &fmt, &ncon);
439
    if (nfields < 2)
440
      gk_errexit(SIGERR, "Header line must contain at least 2 integers (#vtxs and #edges).\n");
441

442
    ncols = nrows;
443
    nnz *= 2;
444

445
    if (fmt > 111)
446
      gk_errexit(SIGERR, "Cannot read this type of file format [fmt=%zu]!\n", fmt);
447

448
    sprintf(fmtstr, "%03zu", fmt%1000);
449
    readsizes = (fmtstr[0] == '1');
450
    readwgts  = (fmtstr[1] == '1');
451
    readvals  = (fmtstr[2] == '1');
452
    numbering = 1;
453
    ncon      = (ncon == 0 ? 1 : ncon);
454
  }
455
  else {
456
    readsizes = 0;
457
    readwgts  = 0;
458

459
    gk_getfilestats(filename, &nrows, &nnz, NULL, NULL);
460

461
    if (readvals == 1 && nnz%2 == 1)
462
      gk_errexit(SIGERR, "Error: The number of numbers (%zd %d) in the input file is not even.\n", nnz, readvals);
463
    if (readvals == 1)
464
      nnz = nnz/2;
465
    fpin = gk_fopen(filename, "r", "gk_csr_Read: fpin");
466
  }
467

468
  mat = gk_csr_Create();
469

470
  mat->nrows = nrows;
471

472
  rowptr = mat->rowptr = gk_zmalloc(nrows+1, "gk_csr_Read: rowptr");
473
  rowind = mat->rowind = gk_imalloc(nnz, "gk_csr_Read: rowind");
474
  if (readvals != 2)
475
    rowval = mat->rowval = gk_fsmalloc(nnz, 1.0, "gk_csr_Read: rowval");
476

477
  if (readsizes)
478
    mat->rsizes = gk_fsmalloc(nrows, 0.0, "gk_csr_Read: rsizes");
479

480
  if (readwgts)
481
    mat->rwgts = gk_fsmalloc(nrows*ncon, 0.0, "gk_csr_Read: rwgts");
482

483
  /*----------------------------------------------------------------------
484
   * Read the sparse matrix file
485
   *---------------------------------------------------------------------*/
486
  numbering = (numbering ? - 1 : 0);
487
  for (ncols=0, rowptr[0]=0, k=0, i=0; i<nrows; i++) {
488
    do {
489
      if (gk_getline(&line, &lnlen, fpin) == -1)
490
        gk_errexit(SIGERR, "Premature end of input file: file while reading row %d\n", i);
491
    } while (line[0] == '%');
492

493
    head = line;
494
    tail = NULL;
495

496
    /* Read vertex sizes */
497
    if (readsizes) {
498
#ifdef __MSC__
499
      mat->rsizes[i] = (float)strtod(head, &tail);
500
#else
501
      mat->rsizes[i] = strtof(head, &tail);
502
#endif
503
      if (tail == head)
504
        gk_errexit(SIGERR, "The line for vertex %zd does not have size information\n", i+1);
505
      if (mat->rsizes[i] < 0)
506
        errexit("The size for vertex %zd must be >= 0\n", i+1);
507
      head = tail;
508
    }
509

510
    /* Read vertex weights */
511
    if (readwgts) {
512
      for (l=0; l<ncon; l++) {
513
#ifdef __MSC__
514
        mat->rwgts[i*ncon+l] = (float)strtod(head, &tail);
515
#else
516
        mat->rwgts[i*ncon+l] = strtof(head, &tail);
517
#endif
518
        if (tail == head)
519
          errexit("The line for vertex %zd does not have enough weights "
520
                  "for the %d constraints.\n", i+1, ncon);
521
        if (mat->rwgts[i*ncon+l] < 0)
522
          errexit("The weight vertex %zd and constraint %zd must be >= 0\n", i+1, l);
523
        head = tail;
524
      }
525
    }
526

527
   
528
    /* Read the rest of the row */
529
    while (1) {
530
      ival = (int)strtol(head, &tail, 0);
531
      if (tail == head) 
532
        break;
533
      head = tail;
534
      
535
      if ((rowind[k] = ival + numbering) < 0)
536
        gk_errexit(SIGERR, "Error: Invalid column number %d at row %zd.\n", ival, i);
537

538
      ncols = gk_max(rowind[k], ncols);
539

540
      if (readvals == 1) {
541
#ifdef __MSC__
542
        fval = (float)strtod(head, &tail);
543
#else
544
	fval = strtof(head, &tail);
545
#endif
546
        if (tail == head)
547
          gk_errexit(SIGERR, "Value could not be found for column! Row:%zd, NNZ:%zd\n", i, k);
548
        head = tail;
549

550
        rowval[k] = fval;
551
      }
552
      k++;
553
    }
554
    rowptr[i+1] = k;
555
  }
556

557
  if (format == GK_CSR_FMT_METIS) {
558
    ASSERT(ncols+1 == mat->nrows);
559
    mat->ncols = mat->nrows;
560
  }
561
  else {
562
    mat->ncols = ncols+1;
563
  }
564

565
  if (k != nnz)
566
    gk_errexit(SIGERR, "gk_csr_Read: Something wrong with the number of nonzeros in "
567
                       "the input file. NNZ=%zd, ActualNNZ=%zd.\n", nnz, k);
568

569
  gk_fclose(fpin);
570

571
  gk_free((void **)&line, LTERM);
572

573
  return mat;
574
}
575

576

577
/**************************************************************************/
578
/*! Writes the row-based structure of a matrix into a file.
579
    \param mat is the matrix to be written,
580
    \param filename is the name of the output file.
581
    \param format is one of: GK_CSR_FMT_CLUTO, GK_CSR_FMT_CSR, 
582
           GK_CSR_FMT_BINROW, GK_CSR_FMT_BINCOL.
583
    \param writevals is either 1 or 0 indicating if the values will be 
584
           written or not. This is only applicable when GK_CSR_FMT_CSR
585
           is used.
586
    \param numbering is either 1 or 0 indicating if the internal 0-based 
587
           numbering will be shifted by one or not during output. This 
588
           is only applicable when GK_CSR_FMT_CSR is used.
589
*/
590
/**************************************************************************/
591
void gk_csr_Write(gk_csr_t *mat, char *filename, int format, int writevals, int numbering)
592
{
593
  ssize_t i, j;
594
  FILE *fpout;
595

596
  if (format == GK_CSR_FMT_BINROW) {
597
    if (filename == NULL)
598
      gk_errexit(SIGERR, "The filename parameter cannot be NULL.\n");
599
    fpout = gk_fopen(filename, "wb", "gk_csr_Write: fpout");
600

601
    fwrite(&(mat->nrows), sizeof(int32_t), 1, fpout); 
602
    fwrite(&(mat->ncols), sizeof(int32_t), 1, fpout); 
603
    fwrite(mat->rowptr, sizeof(ssize_t), mat->nrows+1, fpout); 
604
    fwrite(mat->rowind, sizeof(int32_t), mat->rowptr[mat->nrows], fpout); 
605
    if (writevals)
606
      fwrite(mat->rowval, sizeof(float), mat->rowptr[mat->nrows], fpout); 
607

608
    gk_fclose(fpout);
609
    return;
610
  }
611

612
  if (format == GK_CSR_FMT_BINCOL) {
613
    if (filename == NULL)
614
      gk_errexit(SIGERR, "The filename parameter cannot be NULL.\n");
615
    fpout = gk_fopen(filename, "wb", "gk_csr_Write: fpout");
616

617
    fwrite(&(mat->nrows), sizeof(int32_t), 1, fpout); 
618
    fwrite(&(mat->ncols), sizeof(int32_t), 1, fpout); 
619
    fwrite(mat->colptr, sizeof(ssize_t), mat->ncols+1, fpout); 
620
    fwrite(mat->colind, sizeof(int32_t), mat->colptr[mat->ncols], fpout); 
621
    if (writevals) 
622
      fwrite(mat->colval, sizeof(float), mat->colptr[mat->ncols], fpout); 
623

624
    gk_fclose(fpout);
625
    return;
626
  }
627

628
  if (filename)
629
    fpout = gk_fopen(filename, "w", "gk_csr_Write: fpout");
630
  else
631
    fpout = stdout; 
632

633
  if (format == GK_CSR_FMT_CLUTO) {
634
    fprintf(fpout, "%d %d %zd\n", mat->nrows, mat->ncols, mat->rowptr[mat->nrows]);
635
    writevals = 1;
636
    numbering = 1;
637
  }
638

639
  for (i=0; i<mat->nrows; i++) {
640
    for (j=mat->rowptr[i]; j<mat->rowptr[i+1]; j++) {
641
      fprintf(fpout, " %d", mat->rowind[j]+(numbering ? 1 : 0));
642
      if (writevals) 
643
        fprintf(fpout, " %f", mat->rowval[j]);
644
    }
645
    fprintf(fpout, "\n");
646
  }
647
  if (filename)
648
    gk_fclose(fpout);
649
}
650

651

652
/*************************************************************************/
653
/*! Prunes certain rows/columns of the matrix. The prunning takes place 
654
    by analyzing the row structure of the matrix. The prunning takes place
655
    by removing rows/columns but it does not affect the numbering of the
656
    remaining rows/columns.
657
   
658
    \param mat the matrix to be prunned,
659
    \param what indicates if the rows (GK_CSR_ROW) or the columns (GK_CSR_COL)
660
           of the matrix will be prunned,
661
    \param minf is the minimum number of rows (columns) that a column (row) must
662
           be present in order to be kept,
663
    \param maxf is the maximum number of rows (columns) that a column (row) must
664
          be present at in order to be kept.
665
    \returns the prunned matrix consisting only of its row-based structure. 
666
          The input matrix is not modified. 
667
*/
668
/**************************************************************************/
669
gk_csr_t *gk_csr_Prune(gk_csr_t *mat, int what, int minf, int maxf)
670
{
671
  ssize_t i, j, nnz;
672
  int nrows, ncols;
673
  ssize_t *rowptr, *nrowptr;
674
  int *rowind, *nrowind, *collen;
675
  float *rowval, *nrowval;
676
  gk_csr_t *nmat;
677

678
  nmat = gk_csr_Create();
679
  
680
  nrows = nmat->nrows = mat->nrows;
681
  ncols = nmat->ncols = mat->ncols;
682

683
  rowptr = mat->rowptr;
684
  rowind = mat->rowind;
685
  rowval = mat->rowval;
686

687
  nrowptr = nmat->rowptr = gk_zmalloc(nrows+1, "gk_csr_Prune: nrowptr");
688
  nrowind = nmat->rowind = gk_imalloc(rowptr[nrows], "gk_csr_Prune: nrowind");
689
  nrowval = nmat->rowval = gk_fmalloc(rowptr[nrows], "gk_csr_Prune: nrowval");
690

691

692
  switch (what) {
693
    case GK_CSR_COL:
694
      collen = gk_ismalloc(ncols, 0, "gk_csr_Prune: collen");
695

696
      for (i=0; i<nrows; i++) {
697
        for (j=rowptr[i]; j<rowptr[i+1]; j++) {
698
          ASSERT(rowind[j] < ncols);
699
          collen[rowind[j]]++;
700
        }
701
      }
702
      for (i=0; i<ncols; i++)
703
        collen[i] = (collen[i] >= minf && collen[i] <= maxf ? 1 : 0);
704

705
      nrowptr[0] = 0;
706
      for (nnz=0, i=0; i<nrows; i++) {
707
        for (j=rowptr[i]; j<rowptr[i+1]; j++) {
708
          if (collen[rowind[j]]) {
709
            nrowind[nnz] = rowind[j];
710
            nrowval[nnz] = rowval[j];
711
            nnz++;
712
          }
713
        }
714
        nrowptr[i+1] = nnz;
715
      }
716
      gk_free((void **)&collen, LTERM);
717
      break;
718

719
    case GK_CSR_ROW:
720
      nrowptr[0] = 0;
721
      for (nnz=0, i=0; i<nrows; i++) {
722
        if (rowptr[i+1]-rowptr[i] >= minf && rowptr[i+1]-rowptr[i] <= maxf) {
723
          for (j=rowptr[i]; j<rowptr[i+1]; j++, nnz++) {
724
            nrowind[nnz] = rowind[j];
725
            nrowval[nnz] = rowval[j];
726
          }
727
        }
728
        nrowptr[i+1] = nnz;
729
      }
730
      break;
731

732
    default:
733
      gk_csr_Free(&nmat);
734
      gk_errexit(SIGERR, "Unknown prunning type of %d\n", what);
735
      return NULL;
736
  }
737

738
  return nmat;
739
}
740

741

742
/*************************************************************************/
743
/*! Eliminates certain entries from the rows/columns of the matrix. The 
744
    filtering takes place by keeping only the highest weight entries whose
745
    sum accounts for a certain fraction of the overall weight of the 
746
    row/column.
747
   
748
    \param mat the matrix to be prunned,
749
    \param what indicates if the rows (GK_CSR_ROW) or the columns (GK_CSR_COL)
750
           of the matrix will be prunned,
751
    \param norm indicates the norm that will be used to aggregate the weights
752
           and possible values are 1 or 2,
753
    \param fraction is the fraction of the overall norm that will be retained
754
           by the kept entries.
755
    \returns the filtered matrix consisting only of its row-based structure. 
756
           The input matrix is not modified. 
757
*/
758
/**************************************************************************/
759
gk_csr_t *gk_csr_LowFilter(gk_csr_t *mat, int what, int norm, float fraction)
760
{
761
  ssize_t i, j, nnz;
762
  int nrows, ncols, ncand, maxlen=0;
763
  ssize_t *rowptr, *colptr, *nrowptr;
764
  int *rowind, *colind, *nrowind;
765
  float *rowval, *colval, *nrowval, rsum, tsum;
766
  gk_csr_t *nmat;
767
  gk_fkv_t *cand;
768

769
  nmat = gk_csr_Create();
770
  
771
  nrows = nmat->nrows = mat->nrows;
772
  ncols = nmat->ncols = mat->ncols;
773

774
  rowptr = mat->rowptr;
775
  rowind = mat->rowind;
776
  rowval = mat->rowval;
777
  colptr = mat->colptr;
778
  colind = mat->colind;
779
  colval = mat->colval;
780

781
  nrowptr = nmat->rowptr = gk_zmalloc(nrows+1, "gk_csr_LowFilter: nrowptr");
782
  nrowind = nmat->rowind = gk_imalloc(rowptr[nrows], "gk_csr_LowFilter: nrowind");
783
  nrowval = nmat->rowval = gk_fmalloc(rowptr[nrows], "gk_csr_LowFilter: nrowval");
784

785

786
  switch (what) {
787
    case GK_CSR_COL:
788
      if (mat->colptr == NULL) 
789
        gk_errexit(SIGERR, "Cannot filter columns when column-based structure has not been created.\n");
790

791
      gk_zcopy(nrows+1, rowptr, nrowptr);
792

793
      for (i=0; i<ncols; i++) 
794
        maxlen = gk_max(maxlen, colptr[i+1]-colptr[i]);
795

796
      #pragma omp parallel private(i, j, ncand, rsum, tsum, cand)
797
      {
798
        cand = gk_fkvmalloc(maxlen, "gk_csr_LowFilter: cand");
799

800
        #pragma omp for schedule(static)
801
        for (i=0; i<ncols; i++) {
802
          for (tsum=0.0, ncand=0, j=colptr[i]; j<colptr[i+1]; j++, ncand++) {
803
            cand[ncand].val = colind[j];
804
            cand[ncand].key = colval[j];
805
            tsum += (norm == 1 ? colval[j] : colval[j]*colval[j]);
806
          }
807
          gk_fkvsortd(ncand, cand);
808

809
          for (rsum=0.0, j=0; j<ncand && rsum<=fraction*tsum; j++) {
810
            rsum += (norm == 1 ? cand[j].key : cand[j].key*cand[j].key);
811
            nrowind[nrowptr[cand[j].val]] = i;
812
            nrowval[nrowptr[cand[j].val]] = cand[j].key;
813
            nrowptr[cand[j].val]++;
814
          }
815
        }
816

817
        gk_free((void **)&cand, LTERM);
818
      }
819

820
      /* compact the nrowind/nrowval */
821
      for (nnz=0, i=0; i<nrows; i++) {
822
        for (j=rowptr[i]; j<nrowptr[i]; j++, nnz++) {
823
          nrowind[nnz] = nrowind[j];
824
          nrowval[nnz] = nrowval[j];
825
        }
826
        nrowptr[i] = nnz;
827
      }
828
      SHIFTCSR(i, nrows, nrowptr);
829

830
      break;
831

832
    case GK_CSR_ROW:
833
      if (mat->rowptr == NULL) 
834
        gk_errexit(SIGERR, "Cannot filter rows when row-based structure has not been created.\n");
835

836
      for (i=0; i<nrows; i++) 
837
        maxlen = gk_max(maxlen, rowptr[i+1]-rowptr[i]);
838

839
      #pragma omp parallel private(i, j, ncand, rsum, tsum, cand)
840
      {
841
        cand = gk_fkvmalloc(maxlen, "gk_csr_LowFilter: cand");
842

843
        #pragma omp for schedule(static)
844
        for (i=0; i<nrows; i++) {
845
          for (tsum=0.0, ncand=0, j=rowptr[i]; j<rowptr[i+1]; j++, ncand++) {
846
            cand[ncand].val = rowind[j];
847
            cand[ncand].key = rowval[j];
848
            tsum += (norm == 1 ? rowval[j] : rowval[j]*rowval[j]);
849
          }
850
          gk_fkvsortd(ncand, cand);
851

852
          for (rsum=0.0, j=0; j<ncand && rsum<=fraction*tsum; j++) {
853
            rsum += (norm == 1 ? cand[j].key : cand[j].key*cand[j].key);
854
            nrowind[rowptr[i]+j] = cand[j].val;
855
            nrowval[rowptr[i]+j] = cand[j].key;
856
          }
857
          nrowptr[i+1] = rowptr[i]+j;
858
        }
859

860
        gk_free((void **)&cand, LTERM);
861
      }
862

863
      /* compact nrowind/nrowval */
864
      nrowptr[0] = nnz = 0;
865
      for (i=0; i<nrows; i++) {
866
        for (j=rowptr[i]; j<nrowptr[i+1]; j++, nnz++) {
867
          nrowind[nnz] = nrowind[j];
868
          nrowval[nnz] = nrowval[j];
869
        }
870
        nrowptr[i+1] = nnz;
871
      }
872

873
      break;
874

875
    default:
876
      gk_csr_Free(&nmat);
877
      gk_errexit(SIGERR, "Unknown prunning type of %d\n", what);
878
      return NULL;
879
  }
880

881
  return nmat;
882
}
883

884

885
/*************************************************************************/
886
/*! Eliminates certain entries from the rows/columns of the matrix. The 
887
    filtering takes place by keeping only the highest weight top-K entries 
888
    along each row/column and those entries whose weight is greater than
889
    a specified value.
890
   
891
    \param mat the matrix to be prunned,
892
    \param what indicates if the rows (GK_CSR_ROW) or the columns (GK_CSR_COL)
893
           of the matrix will be prunned,
894
    \param topk is the number of the highest weight entries to keep.
895
    \param keepval is the weight of a term above which will be kept. This
896
           is used to select additional terms past the first topk.
897
    \returns the filtered matrix consisting only of its row-based structure. 
898
           The input matrix is not modified. 
899
*/
900
/**************************************************************************/
901
gk_csr_t *gk_csr_TopKPlusFilter(gk_csr_t *mat, int what, int topk, float keepval)
902
{
903
  ssize_t i, j, k, nnz;
904
  int nrows, ncols, ncand;
905
  ssize_t *rowptr, *colptr, *nrowptr;
906
  int *rowind, *colind, *nrowind;
907
  float *rowval, *colval, *nrowval;
908
  gk_csr_t *nmat;
909
  gk_fkv_t *cand;
910

911
  nmat = gk_csr_Create();
912
  
913
  nrows = nmat->nrows = mat->nrows;
914
  ncols = nmat->ncols = mat->ncols;
915

916
  rowptr = mat->rowptr;
917
  rowind = mat->rowind;
918
  rowval = mat->rowval;
919
  colptr = mat->colptr;
920
  colind = mat->colind;
921
  colval = mat->colval;
922

923
  nrowptr = nmat->rowptr = gk_zmalloc(nrows+1, "gk_csr_LowFilter: nrowptr");
924
  nrowind = nmat->rowind = gk_imalloc(rowptr[nrows], "gk_csr_LowFilter: nrowind");
925
  nrowval = nmat->rowval = gk_fmalloc(rowptr[nrows], "gk_csr_LowFilter: nrowval");
926

927

928
  switch (what) {
929
    case GK_CSR_COL:
930
      if (mat->colptr == NULL) 
931
        gk_errexit(SIGERR, "Cannot filter columns when column-based structure has not been created.\n");
932

933
      cand = gk_fkvmalloc(nrows, "gk_csr_LowFilter: cand");
934

935
      gk_zcopy(nrows+1, rowptr, nrowptr);
936
      for (i=0; i<ncols; i++) {
937
        for (ncand=0, j=colptr[i]; j<colptr[i+1]; j++, ncand++) {
938
          cand[ncand].val = colind[j];
939
          cand[ncand].key = colval[j];
940
        }
941
        gk_fkvsortd(ncand, cand);
942

943
        k = gk_min(topk, ncand);
944
        for (j=0; j<k; j++) {
945
          nrowind[nrowptr[cand[j].val]] = i;
946
          nrowval[nrowptr[cand[j].val]] = cand[j].key;
947
          nrowptr[cand[j].val]++;
948
        }
949
        for (; j<ncand; j++) {
950
          if (cand[j].key < keepval) 
951
            break;
952

953
          nrowind[nrowptr[cand[j].val]] = i;
954
          nrowval[nrowptr[cand[j].val]] = cand[j].key;
955
          nrowptr[cand[j].val]++;
956
        }
957
      }
958

959
      /* compact the nrowind/nrowval */
960
      for (nnz=0, i=0; i<nrows; i++) {
961
        for (j=rowptr[i]; j<nrowptr[i]; j++, nnz++) {
962
          nrowind[nnz] = nrowind[j];
963
          nrowval[nnz] = nrowval[j];
964
        }
965
        nrowptr[i] = nnz;
966
      }
967
      SHIFTCSR(i, nrows, nrowptr);
968

969
      gk_free((void **)&cand, LTERM);
970
      break;
971

972
    case GK_CSR_ROW:
973
      if (mat->rowptr == NULL) 
974
        gk_errexit(SIGERR, "Cannot filter rows when row-based structure has not been created.\n");
975

976
      cand = gk_fkvmalloc(ncols, "gk_csr_LowFilter: cand");
977

978
      nrowptr[0] = 0;
979
      for (nnz=0, i=0; i<nrows; i++) {
980
        for (ncand=0, j=rowptr[i]; j<rowptr[i+1]; j++, ncand++) {
981
          cand[ncand].val = rowind[j];
982
          cand[ncand].key = rowval[j];
983
        }
984
        gk_fkvsortd(ncand, cand);
985

986
        k = gk_min(topk, ncand);
987
        for (j=0; j<k; j++, nnz++) {
988
          nrowind[nnz] = cand[j].val;
989
          nrowval[nnz] = cand[j].key;
990
        }
991
        for (; j<ncand; j++, nnz++) {
992
          if (cand[j].key < keepval) 
993
            break;
994

995
          nrowind[nnz] = cand[j].val;
996
          nrowval[nnz] = cand[j].key;
997
        }
998
        nrowptr[i+1] = nnz;
999
      }
1000

1001
      gk_free((void **)&cand, LTERM);
1002
      break;
1003

1004
    default:
1005
      gk_csr_Free(&nmat);
1006
      gk_errexit(SIGERR, "Unknown prunning type of %d\n", what);
1007
      return NULL;
1008
  }
1009

1010
  return nmat;
1011
}
1012

1013

1014
/*************************************************************************/
1015
/*! Eliminates certain entries from the rows/columns of the matrix. The 
1016
    filtering takes place by keeping only the terms whose contribution to
1017
    the total length of the document is greater than a user-splied multiple
1018
    over the average.
1019

1020
    This routine assumes that the vectors are normalized to be unit length.
1021
   
1022
    \param mat the matrix to be prunned,
1023
    \param what indicates if the rows (GK_CSR_ROW) or the columns (GK_CSR_COL)
1024
           of the matrix will be prunned,
1025
    \param zscore is the multiplicative factor over the average contribution 
1026
           to the length of the document.
1027
    \returns the filtered matrix consisting only of its row-based structure. 
1028
           The input matrix is not modified. 
1029
*/
1030
/**************************************************************************/
1031
gk_csr_t *gk_csr_ZScoreFilter(gk_csr_t *mat, int what, float zscore)
1032
{
1033
  ssize_t i, j, nnz;
1034
  int nrows;
1035
  ssize_t *rowptr, *nrowptr;
1036
  int *rowind, *nrowind;
1037
  float *rowval, *nrowval, avgwgt;
1038
  gk_csr_t *nmat;
1039

1040
  nmat = gk_csr_Create();
1041
  
1042
  nmat->nrows = mat->nrows;
1043
  nmat->ncols = mat->ncols;
1044

1045
  nrows  = mat->nrows; 
1046
  rowptr = mat->rowptr;
1047
  rowind = mat->rowind;
1048
  rowval = mat->rowval;
1049

1050
  nrowptr = nmat->rowptr = gk_zmalloc(nrows+1, "gk_csr_ZScoreFilter: nrowptr");
1051
  nrowind = nmat->rowind = gk_imalloc(rowptr[nrows], "gk_csr_ZScoreFilter: nrowind");
1052
  nrowval = nmat->rowval = gk_fmalloc(rowptr[nrows], "gk_csr_ZScoreFilter: nrowval");
1053

1054

1055
  switch (what) {
1056
    case GK_CSR_COL:
1057
      gk_errexit(SIGERR, "This has not been implemented yet.\n");
1058
      break;
1059

1060
    case GK_CSR_ROW:
1061
      if (mat->rowptr == NULL) 
1062
        gk_errexit(SIGERR, "Cannot filter rows when row-based structure has not been created.\n");
1063

1064
      nrowptr[0] = 0;
1065
      for (nnz=0, i=0; i<nrows; i++) {
1066
        avgwgt = zscore/(rowptr[i+1]-rowptr[i]);
1067
        for (j=rowptr[i]; j<rowptr[i+1]; j++) {
1068
          if (rowval[j] > avgwgt) {
1069
            nrowind[nnz] = rowind[j];
1070
            nrowval[nnz] = rowval[j];
1071
            nnz++;
1072
          }
1073
        }
1074
        nrowptr[i+1] = nnz;
1075
      }
1076
      break;
1077

1078
    default:
1079
      gk_csr_Free(&nmat);
1080
      gk_errexit(SIGERR, "Unknown prunning type of %d\n", what);
1081
      return NULL;
1082
  }
1083

1084
  return nmat;
1085
}
1086

1087

1088
/*************************************************************************/
1089
/*! Compacts the column-space of the matrix by removing empty columns.
1090
    As a result of the compaction, the column numbers are renumbered. 
1091
    The compaction operation is done in place and only affects the row-based
1092
    representation of the matrix.
1093
    The new columns are ordered in decreasing frequency.
1094
   
1095
    \param mat the matrix whose empty columns will be removed.
1096
*/
1097
/**************************************************************************/
1098
void gk_csr_CompactColumns(gk_csr_t *mat)
1099
{
1100
  ssize_t i;
1101
  int nrows, ncols, nncols;
1102
  ssize_t *rowptr;
1103
  int *rowind, *colmap;
1104
  gk_ikv_t *clens;
1105

1106
  nrows  = mat->nrows;
1107
  ncols  = mat->ncols;
1108
  rowptr = mat->rowptr;
1109
  rowind = mat->rowind;
1110

1111
  colmap = gk_imalloc(ncols, "gk_csr_CompactColumns: colmap");
1112

1113
  clens = gk_ikvmalloc(ncols, "gk_csr_CompactColumns: clens");
1114
  for (i=0; i<ncols; i++) {
1115
    clens[i].key = 0;
1116
    clens[i].val = i;
1117
  }
1118

1119
  for (i=0; i<rowptr[nrows]; i++) 
1120
    clens[rowind[i]].key++;
1121
  gk_ikvsortd(ncols, clens);
1122

1123
  for (nncols=0, i=0; i<ncols; i++) {
1124
    if (clens[i].key > 0) 
1125
      colmap[clens[i].val] = nncols++;
1126
    else
1127
      break;
1128
  }
1129

1130
  for (i=0; i<rowptr[nrows]; i++) 
1131
    rowind[i] = colmap[rowind[i]];
1132

1133
  mat->ncols = nncols;
1134

1135
  gk_free((void **)&colmap, &clens, LTERM);
1136
}
1137

1138

1139
/*************************************************************************/
1140
/*! Sorts the indices in increasing order
1141
    \param mat the matrix itself,
1142
    \param what is either GK_CSR_ROW or GK_CSR_COL indicating which set of
1143
           indices to sort.
1144
*/
1145
/**************************************************************************/
1146
void gk_csr_SortIndices(gk_csr_t *mat, int what)
1147
{
1148
  int n, nn=0;
1149
  ssize_t *ptr;
1150
  int *ind;
1151
  float *val;
1152

1153
  switch (what) {
1154
    case GK_CSR_ROW:
1155
      if (!mat->rowptr)
1156
        gk_errexit(SIGERR, "Row-based view of the matrix does not exists.\n");
1157

1158
      n   = mat->nrows;
1159
      ptr = mat->rowptr;
1160
      ind = mat->rowind;
1161
      val = mat->rowval;
1162
      break;
1163

1164
    case GK_CSR_COL:
1165
      if (!mat->colptr)
1166
        gk_errexit(SIGERR, "Column-based view of the matrix does not exists.\n");
1167

1168
      n   = mat->ncols;
1169
      ptr = mat->colptr;
1170
      ind = mat->colind;
1171
      val = mat->colval;
1172
      break;
1173

1174
    default:
1175
      gk_errexit(SIGERR, "Invalid index type of %d.\n", what);
1176
      return;
1177
  }
1178

1179
  #pragma omp parallel if (n > 100)
1180
  {
1181
    ssize_t i, j, k;
1182
    gk_ikv_t *cand;
1183
    float *tval;
1184

1185
    #pragma omp single
1186
    for (i=0; i<n; i++) 
1187
      nn = gk_max(nn, ptr[i+1]-ptr[i]);
1188
  
1189
    cand = gk_ikvmalloc(nn, "gk_csr_SortIndices: cand");
1190
    tval = gk_fmalloc(nn, "gk_csr_SortIndices: tval");
1191
  
1192
    #pragma omp for schedule(static)
1193
    for (i=0; i<n; i++) {
1194
      for (k=0, j=ptr[i]; j<ptr[i+1]; j++) {
1195
        if (j > ptr[i] && ind[j] < ind[j-1])
1196
          k = 1; /* an inversion */
1197
        cand[j-ptr[i]].val = j-ptr[i];
1198
        cand[j-ptr[i]].key = ind[j];
1199
        tval[j-ptr[i]]     = val[j];
1200
      }
1201
      if (k) {
1202
        gk_ikvsorti(ptr[i+1]-ptr[i], cand);
1203
        for (j=ptr[i]; j<ptr[i+1]; j++) {
1204
          ind[j] = cand[j-ptr[i]].key;
1205
          val[j] = tval[cand[j-ptr[i]].val];
1206
        }
1207
      }
1208
    }
1209

1210
    gk_free((void **)&cand, &tval, LTERM);
1211
  }
1212

1213
}
1214

1215

1216
/*************************************************************************/
1217
/*! Creates a row/column index from the column/row data.
1218
    \param mat the matrix itself,
1219
    \param what is either GK_CSR_ROW or GK_CSR_COL indicating which index
1220
           will be created.
1221
*/
1222
/**************************************************************************/
1223
void gk_csr_CreateIndex(gk_csr_t *mat, int what)
1224
{
1225
  /* 'f' stands for forward, 'r' stands for reverse */
1226
  ssize_t i, j, k, nf, nr;
1227
  ssize_t *fptr, *rptr;
1228
  int *find, *rind;
1229
  float *fval, *rval;
1230

1231
  switch (what) {
1232
    case GK_CSR_COL:
1233
      nf   = mat->nrows;
1234
      fptr = mat->rowptr;
1235
      find = mat->rowind;
1236
      fval = mat->rowval;
1237

1238
      if (mat->colptr) gk_free((void **)&mat->colptr, LTERM);
1239
      if (mat->colind) gk_free((void **)&mat->colind, LTERM);
1240
      if (mat->colval) gk_free((void **)&mat->colval, LTERM);
1241

1242
      nr   = mat->ncols;
1243
      rptr = mat->colptr = gk_zsmalloc(nr+1, 0, "gk_csr_CreateIndex: rptr");
1244
      rind = mat->colind = gk_imalloc(fptr[nf], "gk_csr_CreateIndex: rind");
1245
      rval = mat->colval = (fval ? gk_fmalloc(fptr[nf], "gk_csr_CreateIndex: rval") : NULL);
1246
      break;
1247
    case GK_CSR_ROW:
1248
      nf   = mat->ncols;
1249
      fptr = mat->colptr;
1250
      find = mat->colind;
1251
      fval = mat->colval;
1252

1253
      if (mat->rowptr) gk_free((void **)&mat->rowptr, LTERM);
1254
      if (mat->rowind) gk_free((void **)&mat->rowind, LTERM);
1255
      if (mat->rowval) gk_free((void **)&mat->rowval, LTERM);
1256

1257
      nr   = mat->nrows;
1258
      rptr = mat->rowptr = gk_zsmalloc(nr+1, 0, "gk_csr_CreateIndex: rptr");
1259
      rind = mat->rowind = gk_imalloc(fptr[nf], "gk_csr_CreateIndex: rind");
1260
      rval = mat->rowval = (fval ? gk_fmalloc(fptr[nf], "gk_csr_CreateIndex: rval") : NULL);
1261
      break;
1262
    default:
1263
      gk_errexit(SIGERR, "Invalid index type of %d.\n", what);
1264
      return;
1265
  }
1266

1267

1268
  for (i=0; i<nf; i++) {
1269
    for (j=fptr[i]; j<fptr[i+1]; j++)
1270
      rptr[find[j]]++;
1271
  }
1272
  MAKECSR(i, nr, rptr);
1273
  
1274
  if (rptr[nr] > 6*nr) {
1275
    for (i=0; i<nf; i++) {
1276
      for (j=fptr[i]; j<fptr[i+1]; j++) 
1277
        rind[rptr[find[j]]++] = i;
1278
    }
1279
    SHIFTCSR(i, nr, rptr);
1280

1281
    if (fval) {
1282
      for (i=0; i<nf; i++) {
1283
        for (j=fptr[i]; j<fptr[i+1]; j++) 
1284
          rval[rptr[find[j]]++] = fval[j];
1285
      }
1286
      SHIFTCSR(i, nr, rptr);
1287
    }
1288
  }
1289
  else {
1290
    if (fval) {
1291
      for (i=0; i<nf; i++) {
1292
        for (j=fptr[i]; j<fptr[i+1]; j++) {
1293
          k = find[j];
1294
          rind[rptr[k]]   = i;
1295
          rval[rptr[k]++] = fval[j];
1296
        }
1297
      }
1298
    }
1299
    else {
1300
      for (i=0; i<nf; i++) {
1301
        for (j=fptr[i]; j<fptr[i+1]; j++) 
1302
          rind[rptr[find[j]]++] = i;
1303
      }
1304
    }
1305
    SHIFTCSR(i, nr, rptr);
1306
  }
1307
}
1308

1309

1310
/*************************************************************************/
1311
/*! Normalizes the rows/columns of the matrix to be unit 
1312
    length.
1313
    \param mat the matrix itself,
1314
    \param what indicates what will be normalized and is obtained by
1315
           specifying GK_CSR_ROW, GK_CSR_COL, GK_CSR_ROW|GK_CSR_COL. 
1316
    \param norm indicates what norm is to normalize to, 1: 1-norm, 2: 2-norm
1317
*/
1318
/**************************************************************************/
1319
void gk_csr_Normalize(gk_csr_t *mat, int what, int norm)
1320
{
1321
  ssize_t i, j;
1322
  int n;
1323
  ssize_t *ptr;
1324
  float *val, sum;
1325

1326
  if (what&GK_CSR_ROW && mat->rowval) {
1327
    n   = mat->nrows;
1328
    ptr = mat->rowptr;
1329
    val = mat->rowval;
1330

1331
    #pragma omp parallel if (ptr[n] > OMPMINOPS) 
1332
    {
1333
      #pragma omp for private(j,sum) schedule(static)
1334
      for (i=0; i<n; i++) {
1335
        for (sum=0.0, j=ptr[i]; j<ptr[i+1]; j++){
1336
  	if (norm == 2)
1337
  	  sum += val[j]*val[j];
1338
  	else if (norm == 1)
1339
  	  sum += val[j]; /* assume val[j] > 0 */ 
1340
        }
1341
        if (sum > 0) {
1342
  	if (norm == 2)
1343
  	  sum=1.0/sqrt(sum); 
1344
  	else if (norm == 1)
1345
  	  sum=1.0/sum; 
1346
          for (j=ptr[i]; j<ptr[i+1]; j++)
1347
            val[j] *= sum;
1348
  	
1349
        }
1350
      }
1351
    }
1352
  }
1353

1354
  if (what&GK_CSR_COL && mat->colval) {
1355
    n   = mat->ncols;
1356
    ptr = mat->colptr;
1357
    val = mat->colval;
1358

1359
    #pragma omp parallel if (ptr[n] > OMPMINOPS)
1360
    {
1361
    #pragma omp for private(j,sum) schedule(static)
1362
      for (i=0; i<n; i++) {
1363
        for (sum=0.0, j=ptr[i]; j<ptr[i+1]; j++)
1364
  	if (norm == 2)
1365
  	  sum += val[j]*val[j];
1366
  	else if (norm == 1)
1367
  	  sum += val[j]; 
1368
        if (sum > 0) {
1369
  	if (norm == 2)
1370
  	  sum=1.0/sqrt(sum); 
1371
  	else if (norm == 1)
1372
  	  sum=1.0/sum; 
1373
          for (j=ptr[i]; j<ptr[i+1]; j++)
1374
            val[j] *= sum;
1375
        }
1376
      }
1377
    }
1378
  }
1379
}
1380

1381

1382
/*************************************************************************/
1383
/*! Applies different row scaling methods.
1384
    \param mat the matrix itself,
1385
    \param type indicates the type of row scaling. Possible values are:
1386
           GK_CSR_MAXTF, GK_CSR_SQRT, GK_CSR_LOG, GK_CSR_IDF, GK_CSR_MAXTF2.
1387
*/
1388
/**************************************************************************/
1389
void gk_csr_Scale(gk_csr_t *mat, int type)
1390
{
1391
  ssize_t i, j;
1392
  int nrows, ncols, nnzcols, bgfreq;
1393
  ssize_t *rowptr;
1394
  int *rowind, *collen;
1395
  float *rowval, *cscale, maxtf;
1396

1397
  nrows  = mat->nrows;
1398
  rowptr = mat->rowptr;
1399
  rowind = mat->rowind;
1400
  rowval = mat->rowval;
1401

1402
  switch (type) {
1403
    case GK_CSR_MAXTF: /* TF' = .5 + .5*TF/MAX(TF) */
1404
      #pragma omp parallel if (rowptr[nrows] > OMPMINOPS)
1405
      {
1406
        #pragma omp for private(j, maxtf) schedule(static)
1407
        for (i=0; i<nrows; i++) {
1408
          maxtf = fabs(rowval[rowptr[i]]);
1409
          for (j=rowptr[i]; j<rowptr[i+1]; j++) 
1410
            maxtf = (maxtf < fabs(rowval[j]) ? fabs(rowval[j]) : maxtf);
1411
  
1412
          for (j=rowptr[i]; j<rowptr[i+1]; j++)
1413
            rowval[j] = .5 + .5*rowval[j]/maxtf;
1414
        }
1415
      }
1416
      break;
1417

1418
    case GK_CSR_MAXTF2: /* TF' = .1 + .9*TF/MAX(TF) */
1419
      #pragma omp parallel if (rowptr[nrows] > OMPMINOPS)
1420
      {
1421
        #pragma omp for private(j, maxtf) schedule(static)
1422
        for (i=0; i<nrows; i++) {
1423
          maxtf = fabs(rowval[rowptr[i]]);
1424
          for (j=rowptr[i]; j<rowptr[i+1]; j++) 
1425
            maxtf = (maxtf < fabs(rowval[j]) ? fabs(rowval[j]) : maxtf);
1426
  
1427
          for (j=rowptr[i]; j<rowptr[i+1]; j++)
1428
            rowval[j] = .1 + .9*rowval[j]/maxtf;
1429
        }
1430
      }
1431
      break;
1432

1433
    case GK_CSR_SQRT: /* TF' = .1+SQRT(TF) */
1434
      #pragma omp parallel if (rowptr[nrows] > OMPMINOPS)
1435
      {
1436
        #pragma omp for private(j) schedule(static)
1437
        for (i=0; i<nrows; i++) {
1438
          for (j=rowptr[i]; j<rowptr[i+1]; j++) { 
1439
            if (rowval[j] != 0.0)
1440
              rowval[j] = .1+sign(rowval[j], sqrt(fabs(rowval[j])));
1441
          }
1442
        }
1443
      }
1444
      break;
1445

1446
    case GK_CSR_POW25: /* TF' = .1+POW(TF,.25) */
1447
      #pragma omp parallel if (rowptr[nrows] > OMPMINOPS)
1448
      {
1449
        #pragma omp for private(j) schedule(static)
1450
        for (i=0; i<nrows; i++) {
1451
          for (j=rowptr[i]; j<rowptr[i+1]; j++) { 
1452
            if (rowval[j] != 0.0)
1453
              rowval[j] = .1+sign(rowval[j], sqrt(sqrt(fabs(rowval[j]))));
1454
          }
1455
        }
1456
      }
1457
      break;
1458

1459
    case GK_CSR_POW65: /* TF' = .1+POW(TF,.65) */
1460
      #pragma omp parallel if (rowptr[nrows] > OMPMINOPS)
1461
      {
1462
        #pragma omp for private(j) schedule(static)
1463
        for (i=0; i<nrows; i++) {
1464
          for (j=rowptr[i]; j<rowptr[i+1]; j++) { 
1465
            if (rowval[j] != 0.0)
1466
              rowval[j] = .1+sign(rowval[j], powf(fabs(rowval[j]), .65));
1467
          }
1468
        }
1469
      }
1470
      break;
1471

1472
    case GK_CSR_POW75: /* TF' = .1+POW(TF,.75) */
1473
      #pragma omp parallel if (rowptr[nrows] > OMPMINOPS)
1474
      {
1475
        #pragma omp for private(j) schedule(static)
1476
        for (i=0; i<nrows; i++) {
1477
          for (j=rowptr[i]; j<rowptr[i+1]; j++) { 
1478
            if (rowval[j] != 0.0)
1479
              rowval[j] = .1+sign(rowval[j], powf(fabs(rowval[j]), .75));
1480
          }
1481
        }
1482
      }
1483
      break;
1484

1485
    case GK_CSR_POW85: /* TF' = .1+POW(TF,.85) */
1486
      #pragma omp parallel if (rowptr[nrows] > OMPMINOPS)
1487
      {
1488
        #pragma omp for private(j) schedule(static)
1489
        for (i=0; i<nrows; i++) {
1490
          for (j=rowptr[i]; j<rowptr[i+1]; j++) { 
1491
            if (rowval[j] != 0.0)
1492
              rowval[j] = .1+sign(rowval[j], powf(fabs(rowval[j]), .85));
1493
          }
1494
        }
1495
      }
1496
      break;
1497

1498
    case GK_CSR_LOG: /* TF' = 1+log_2(TF) */
1499
      #pragma omp parallel if (rowptr[nrows] > OMPMINOPS)
1500
      {
1501
        double logscale = 1.0/log(2.0);
1502
        #pragma omp for schedule(static,32)
1503
        for (i=0; i<rowptr[nrows]; i++) {
1504
          if (rowval[i] != 0.0)
1505
            rowval[i] = 1+(rowval[i]>0.0 ? log(rowval[i]) : -log(-rowval[i]))*logscale;
1506
        }
1507
#ifdef XXX
1508
        #pragma omp for private(j) schedule(static)
1509
        for (i=0; i<nrows; i++) {
1510
          for (j=rowptr[i]; j<rowptr[i+1]; j++) { 
1511
            if (rowval[j] != 0.0)
1512
              rowval[j] = 1+(rowval[j]>0.0 ? log(rowval[j]) : -log(-rowval[j]))*logscale;
1513
              //rowval[j] = 1+sign(rowval[j], log(fabs(rowval[j]))*logscale);
1514
          }
1515
        }
1516
#endif
1517
      }
1518
      break;
1519

1520
    case GK_CSR_IDF: /* TF' = TF*IDF */
1521
      ncols  = mat->ncols;
1522
      cscale = gk_fmalloc(ncols, "gk_csr_Scale: cscale");
1523
      collen = gk_ismalloc(ncols, 0, "gk_csr_Scale: collen");
1524

1525
      for (i=0; i<nrows; i++) {
1526
        for (j=rowptr[i]; j<rowptr[i+1]; j++)
1527
          collen[rowind[j]]++;
1528
      }
1529

1530
      #pragma omp parallel if (ncols > OMPMINOPS) 
1531
      {
1532
        #pragma omp for schedule(static)
1533
        for (i=0; i<ncols; i++)
1534
          cscale[i] = (collen[i] > 0 ? log(1.0*nrows/collen[i]) : 0.0);
1535
      }
1536

1537
      #pragma omp parallel if (rowptr[nrows] > OMPMINOPS) 
1538
      {
1539
        #pragma omp for private(j) schedule(static)
1540
        for (i=0; i<nrows; i++) {
1541
          for (j=rowptr[i]; j<rowptr[i+1]; j++)
1542
            rowval[j] *= cscale[rowind[j]];
1543
        }
1544
      }
1545

1546
      gk_free((void **)&cscale, &collen, LTERM);
1547
      break;
1548

1549
    case GK_CSR_IDF2: /* TF' = TF*IDF */
1550
      ncols  = mat->ncols;
1551
      cscale = gk_fmalloc(ncols, "gk_csr_Scale: cscale");
1552
      collen = gk_ismalloc(ncols, 0, "gk_csr_Scale: collen");
1553

1554
      for (i=0; i<nrows; i++) {
1555
        for (j=rowptr[i]; j<rowptr[i+1]; j++)
1556
          collen[rowind[j]]++;
1557
      }
1558

1559
      nnzcols = 0;
1560
      #pragma omp parallel if (ncols > OMPMINOPS) 
1561
      {
1562
        #pragma omp for schedule(static) reduction(+:nnzcols)
1563
        for (i=0; i<ncols; i++)
1564
          nnzcols += (collen[i] > 0 ? 1 : 0);
1565

1566
        bgfreq = gk_max(10, (ssize_t)(.5*rowptr[nrows]/nnzcols));
1567
        printf("nnz: %zd, nnzcols: %d, bgfreq: %d\n", rowptr[nrows], nnzcols, bgfreq);
1568

1569
        #pragma omp for schedule(static)
1570
        for (i=0; i<ncols; i++)
1571
          cscale[i] = (collen[i] > 0 ? log(1.0*(nrows+2*bgfreq)/(bgfreq+collen[i])) : 0.0);
1572
      }
1573

1574
      #pragma omp parallel if (rowptr[nrows] > OMPMINOPS) 
1575
      {
1576
        #pragma omp for private(j) schedule(static)
1577
        for (i=0; i<nrows; i++) {
1578
          for (j=rowptr[i]; j<rowptr[i+1]; j++)
1579
            rowval[j] *= cscale[rowind[j]];
1580
        }
1581
      }
1582

1583
      gk_free((void **)&cscale, &collen, LTERM);
1584
      break;
1585

1586
    default:
1587
      gk_errexit(SIGERR, "Unknown scaling type of %d\n", type);
1588
  }
1589

1590
}
1591

1592

1593
/*************************************************************************/
1594
/*! Computes the sums of the rows/columns
1595
    \param mat the matrix itself,
1596
    \param what is either GK_CSR_ROW or GK_CSR_COL indicating which 
1597
           sums to compute.
1598
*/
1599
/**************************************************************************/
1600
void gk_csr_ComputeSums(gk_csr_t *mat, int what)
1601
{
1602
  ssize_t i;
1603
  int n;
1604
  ssize_t *ptr;
1605
  float *val, *sums;
1606

1607
  switch (what) {
1608
    case GK_CSR_ROW:
1609
      n   = mat->nrows;
1610
      ptr = mat->rowptr;
1611
      val = mat->rowval;
1612

1613
      if (mat->rsums) 
1614
        gk_free((void **)&mat->rsums, LTERM);
1615

1616
      sums = mat->rsums = gk_fsmalloc(n, 0, "gk_csr_ComputeSums: sums");
1617
      break;
1618
    case GK_CSR_COL:
1619
      n   = mat->ncols;
1620
      ptr = mat->colptr;
1621
      val = mat->colval;
1622

1623
      if (mat->csums) 
1624
        gk_free((void **)&mat->csums, LTERM);
1625

1626
      sums = mat->csums = gk_fsmalloc(n, 0, "gk_csr_ComputeSums: sums");
1627
      break;
1628
    default:
1629
      gk_errexit(SIGERR, "Invalid sum type of %d.\n", what);
1630
      return;
1631
  }
1632

1633
  #pragma omp parallel for if (ptr[n] > OMPMINOPS) schedule(static)
1634
  for (i=0; i<n; i++) 
1635
    sums[i] = gk_fsum(ptr[i+1]-ptr[i], val+ptr[i], 1);
1636
}
1637

1638

1639
/*************************************************************************/
1640
/*! Computes the squared of the norms of the rows/columns
1641
    \param mat the matrix itself,
1642
    \param what is either GK_CSR_ROW or GK_CSR_COL indicating which 
1643
           squared norms to compute.
1644
*/
1645
/**************************************************************************/
1646
void gk_csr_ComputeSquaredNorms(gk_csr_t *mat, int what)
1647
{
1648
  ssize_t i;
1649
  int n;
1650
  ssize_t *ptr;
1651
  float *val, *norms;
1652

1653
  switch (what) {
1654
    case GK_CSR_ROW:
1655
      n   = mat->nrows;
1656
      ptr = mat->rowptr;
1657
      val = mat->rowval;
1658

1659
      if (mat->rnorms) gk_free((void **)&mat->rnorms, LTERM);
1660

1661
      norms = mat->rnorms = gk_fsmalloc(n, 0, "gk_csr_ComputeSums: norms");
1662
      break;
1663
    case GK_CSR_COL:
1664
      n   = mat->ncols;
1665
      ptr = mat->colptr;
1666
      val = mat->colval;
1667

1668
      if (mat->cnorms) gk_free((void **)&mat->cnorms, LTERM);
1669

1670
      norms = mat->cnorms = gk_fsmalloc(n, 0, "gk_csr_ComputeSums: norms");
1671
      break;
1672
    default:
1673
      gk_errexit(SIGERR, "Invalid norm type of %d.\n", what);
1674
      return;
1675
  }
1676

1677
  #pragma omp parallel for if (ptr[n] > OMPMINOPS) schedule(static)
1678
  for (i=0; i<n; i++) 
1679
    norms[i] = gk_fdot(ptr[i+1]-ptr[i], val+ptr[i], 1, val+ptr[i], 1);
1680
}
1681

1682

1683
/*************************************************************************/
1684
/*! Computes the similarity between two rows/columns
1685

1686
    \param mat the matrix itself. The routine assumes that the indices
1687
           are sorted in increasing order.
1688
    \param i1 is the first row/column,
1689
    \param i2 is the second row/column,
1690
    \param what is either GK_CSR_ROW or GK_CSR_COL indicating the type of
1691
           objects between the similarity will be computed,
1692
    \param simtype is the type of similarity and is one of GK_CSR_COS,
1693
           GK_CSR_JAC, GK_CSR_MIN, GK_CSR_AMIN
1694
    \returns the similarity between the two rows/columns.
1695
*/
1696
/**************************************************************************/
1697
float gk_csr_ComputeSimilarity(gk_csr_t *mat, int i1, int i2, int what, int simtype)
1698
{
1699
  int nind1, nind2;
1700
  int *ind1, *ind2;
1701
  float *val1, *val2, stat1, stat2, sim;
1702

1703
  switch (what) {
1704
    case GK_CSR_ROW:
1705
      if (!mat->rowptr)
1706
        gk_errexit(SIGERR, "Row-based view of the matrix does not exists.\n");
1707
      nind1 = mat->rowptr[i1+1]-mat->rowptr[i1];
1708
      nind2 = mat->rowptr[i2+1]-mat->rowptr[i2];
1709
      ind1  = mat->rowind + mat->rowptr[i1];
1710
      ind2  = mat->rowind + mat->rowptr[i2];
1711
      val1  = mat->rowval + mat->rowptr[i1];
1712
      val2  = mat->rowval + mat->rowptr[i2];
1713
      break;
1714

1715
    case GK_CSR_COL:
1716
      if (!mat->colptr)
1717
        gk_errexit(SIGERR, "Column-based view of the matrix does not exists.\n");
1718
      nind1 = mat->colptr[i1+1]-mat->colptr[i1];
1719
      nind2 = mat->colptr[i2+1]-mat->colptr[i2];
1720
      ind1  = mat->colind + mat->colptr[i1];
1721
      ind2  = mat->colind + mat->colptr[i2];
1722
      val1  = mat->colval + mat->colptr[i1];
1723
      val2  = mat->colval + mat->colptr[i2];
1724
      break;
1725

1726
    default:
1727
      gk_errexit(SIGERR, "Invalid index type of %d.\n", what);
1728
      return 0.0;
1729
  }
1730

1731

1732
  switch (simtype) {
1733
    case GK_CSR_COS:
1734
    case GK_CSR_JAC:
1735
      sim = stat1 = stat2 = 0.0;
1736
      i1 = i2 = 0;
1737
      while (i1<nind1 && i2<nind2) {
1738
        if (i1 == nind1) {
1739
          stat2 += val2[i2]*val2[i2];
1740
          i2++;
1741
        }
1742
        else if (i2 == nind2) {
1743
          stat1 += val1[i1]*val1[i1];
1744
          i1++;
1745
        }
1746
        else if (ind1[i1] < ind2[i2]) {
1747
          stat1 += val1[i1]*val1[i1];
1748
          i1++;
1749
        }
1750
        else if (ind1[i1] > ind2[i2]) {
1751
          stat2 += val2[i2]*val2[i2];
1752
          i2++;
1753
        }
1754
        else {
1755
          sim   += val1[i1]*val2[i2];
1756
          stat1 += val1[i1]*val1[i1];
1757
          stat2 += val2[i2]*val2[i2];
1758
          i1++;
1759
          i2++;
1760
        }
1761
      }
1762
      if (simtype == GK_CSR_COS)
1763
        sim = (stat1*stat2 > 0.0 ? sim/sqrt(stat1*stat2) : 0.0);
1764
      else 
1765
        sim = (stat1+stat2-sim > 0.0 ? sim/(stat1+stat2-sim) : 0.0);
1766
      break;
1767

1768
    case GK_CSR_MIN:
1769
      sim = stat1 = stat2 = 0.0;
1770
      i1 = i2 = 0;
1771
      while (i1<nind1 && i2<nind2) {
1772
        if (i1 == nind1) {
1773
          stat2 += val2[i2];
1774
          i2++;
1775
        }
1776
        else if (i2 == nind2) {
1777
          stat1 += val1[i1];
1778
          i1++;
1779
        }
1780
        else if (ind1[i1] < ind2[i2]) {
1781
          stat1 += val1[i1];
1782
          i1++;
1783
        }
1784
        else if (ind1[i1] > ind2[i2]) {
1785
          stat2 += val2[i2];
1786
          i2++;
1787
        }
1788
        else {
1789
          sim   += gk_min(val1[i1],val2[i2]);
1790
          stat1 += val1[i1];
1791
          stat2 += val2[i2];
1792
          i1++;
1793
          i2++;
1794
        }
1795
      }
1796
      sim = (stat1+stat2-sim > 0.0 ? sim/(stat1+stat2-sim) : 0.0);
1797

1798
      break;
1799

1800
    case GK_CSR_AMIN:
1801
      sim = stat1 = stat2 = 0.0;
1802
      i1 = i2 = 0;
1803
      while (i1<nind1 && i2<nind2) {
1804
        if (i1 == nind1) {
1805
          stat2 += val2[i2];
1806
          i2++;
1807
        }
1808
        else if (i2 == nind2) {
1809
          stat1 += val1[i1];
1810
          i1++;
1811
        }
1812
        else if (ind1[i1] < ind2[i2]) {
1813
          stat1 += val1[i1];
1814
          i1++;
1815
        }
1816
        else if (ind1[i1] > ind2[i2]) {
1817
          stat2 += val2[i2];
1818
          i2++;
1819
        }
1820
        else {
1821
          sim   += gk_min(val1[i1],val2[i2]);
1822
          stat1 += val1[i1];
1823
          stat2 += val2[i2];
1824
          i1++;
1825
          i2++;
1826
        }
1827
      }
1828
      sim = (stat1 > 0.0 ? sim/stat1 : 0.0);
1829

1830
      break;
1831

1832
    default:
1833
      gk_errexit(SIGERR, "Unknown similarity measure %d\n", simtype);
1834
      return -1;
1835
  }
1836

1837
  return sim;
1838

1839
}
1840

1841

1842
/*************************************************************************/
1843
/*! Finds the n most similar rows (neighbors) to the query using cosine
1844
    similarity.
1845

1846
    \param mat the matrix itself
1847
    \param nqterms is the number of columns in the query
1848
    \param qind is the list of query columns
1849
    \param qval is the list of correspodning query weights
1850
    \param simtype is the type of similarity and is one of GK_CSR_COS,
1851
           GK_CSR_JAC, GK_CSR_MIN, GK_CSR_AMIN
1852
    \param nsim is the maximum number of requested most similar rows.
1853
           If -1 is provided, then everything is returned unsorted.
1854
    \param minsim is the minimum similarity of the requested most 
1855
           similar rows
1856
    \param hits is the result set. This array should be at least
1857
           of length nsim.
1858
    \param i_marker is an array of size equal to the number of rows
1859
           whose values are initialized to -1. If NULL is provided
1860
           then this array is allocated and freed internally.
1861
    \param i_cand is an array of size equal to the number of rows.
1862
           If NULL is provided then this array is allocated and freed 
1863
           internally.
1864
    \returns the number of identified most similar rows, which can be
1865
             smaller than the requested number of nnbrs in those cases
1866
             in which there are no sufficiently many neighbors.
1867
*/
1868
/**************************************************************************/
1869
int gk_csr_GetSimilarRows(gk_csr_t *mat, int nqterms, int *qind, 
1870
        float *qval, int simtype, int nsim, float minsim, gk_fkv_t *hits, 
1871
        int *i_marker, gk_fkv_t *i_cand)
1872
{
1873
  ssize_t i, ii, j, k;
1874
  int nrows, ncols, ncand;
1875
  ssize_t *colptr;
1876
  int *colind, *marker;
1877
  float *colval, *rnorms, mynorm, *rsums, mysum;
1878
  gk_fkv_t *cand;
1879

1880
  if (nqterms == 0)
1881
    return 0;
1882

1883
  nrows  = mat->nrows;
1884
  ncols  = mat->ncols;
1885
  colptr = mat->colptr;
1886
  colind = mat->colind;
1887
  colval = mat->colval;
1888

1889
  marker = (i_marker ? i_marker : gk_ismalloc(nrows, -1, "gk_csr_SimilarRows: marker"));
1890
  cand   = (i_cand   ? i_cand   : gk_fkvmalloc(nrows, "gk_csr_SimilarRows: cand"));
1891

1892
  switch (simtype) {
1893
    case GK_CSR_COS:
1894
      for (ncand=0, ii=0; ii<nqterms; ii++) {
1895
        i = qind[ii];
1896
        if (i < ncols) {
1897
          for (j=colptr[i]; j<colptr[i+1]; j++) {
1898
            k = colind[j];
1899
            if (marker[k] == -1) {
1900
              cand[ncand].val = k;
1901
              cand[ncand].key = 0;
1902
              marker[k]       = ncand++;
1903
            }
1904
            cand[marker[k]].key += colval[j]*qval[ii];
1905
          }
1906
        }
1907
      }
1908
      break;
1909

1910
    case GK_CSR_JAC:
1911
      for (ncand=0, ii=0; ii<nqterms; ii++) {
1912
        i = qind[ii];
1913
        if (i < ncols) {
1914
          for (j=colptr[i]; j<colptr[i+1]; j++) {
1915
            k = colind[j];
1916
            if (marker[k] == -1) {
1917
              cand[ncand].val = k;
1918
              cand[ncand].key = 0;
1919
              marker[k]       = ncand++;
1920
            }
1921
            cand[marker[k]].key += colval[j]*qval[ii];
1922
          }
1923
        }
1924
      }
1925

1926
      rnorms = mat->rnorms;
1927
      mynorm = gk_fdot(nqterms, qval, 1, qval, 1);
1928

1929
      for (i=0; i<ncand; i++)
1930
        cand[i].key = cand[i].key/(rnorms[cand[i].val]+mynorm-cand[i].key);
1931
      break;
1932

1933
    case GK_CSR_MIN:
1934
      for (ncand=0, ii=0; ii<nqterms; ii++) {
1935
        i = qind[ii];
1936
        if (i < ncols) {
1937
          for (j=colptr[i]; j<colptr[i+1]; j++) {
1938
            k = colind[j];
1939
            if (marker[k] == -1) {
1940
              cand[ncand].val = k;
1941
              cand[ncand].key = 0;
1942
              marker[k]       = ncand++;
1943
            }
1944
            cand[marker[k]].key += gk_min(colval[j], qval[ii]);
1945
          }
1946
        }
1947
      }
1948

1949
      rsums = mat->rsums;
1950
      mysum = gk_fsum(nqterms, qval, 1);
1951

1952
      for (i=0; i<ncand; i++)
1953
        cand[i].key = cand[i].key/(rsums[cand[i].val]+mysum-cand[i].key);
1954
      break;
1955

1956
    /* Assymetric MIN  similarity */
1957
    case GK_CSR_AMIN:
1958
      for (ncand=0, ii=0; ii<nqterms; ii++) {
1959
        i = qind[ii];
1960
        if (i < ncols) {
1961
          for (j=colptr[i]; j<colptr[i+1]; j++) {
1962
            k = colind[j];
1963
            if (marker[k] == -1) {
1964
              cand[ncand].val = k;
1965
              cand[ncand].key = 0;
1966
              marker[k]       = ncand++;
1967
            }
1968
            cand[marker[k]].key += gk_min(colval[j], qval[ii]);
1969
          }
1970
        }
1971
      }
1972

1973
      mysum = gk_fsum(nqterms, qval, 1);
1974

1975
      for (i=0; i<ncand; i++)
1976
        cand[i].key = cand[i].key/mysum;
1977
      break;
1978

1979
    default:
1980
      gk_errexit(SIGERR, "Unknown similarity measure %d\n", simtype);
1981
      return -1;
1982
  }
1983

1984
  /* go and prune the hits that are bellow minsim */
1985
  for (j=0, i=0; i<ncand; i++) {
1986
    marker[cand[i].val] = -1;
1987
    if (cand[i].key >= minsim) 
1988
      cand[j++] = cand[i];
1989
  }
1990
  ncand = j;
1991

1992
  if (nsim == -1 || nsim >= ncand) {
1993
    nsim = ncand;
1994
  }
1995
  else {
1996
    nsim = gk_min(nsim, ncand);
1997
    gk_dfkvkselect(ncand, nsim, cand);
1998
    gk_fkvsortd(nsim, cand);
1999
  }
2000

2001
  gk_fkvcopy(nsim, cand, hits);
2002

2003
  if (i_marker == NULL)
2004
    gk_free((void **)&marker, LTERM);
2005
  if (i_cand == NULL)
2006
    gk_free((void **)&cand, LTERM);
2007

2008
  return nsim;
2009
}
2010

2011

2012
Product

Resources

Company