Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
ElmerCSC
GitHub Repository: ElmerCSC/elmerfem
Path: blob/devel/elmergrid/src/metis-5.1.0/GKlib/csr.c
3206 views
1
/*!
2
* \file
3
*
4
* \brief Various routines with dealing with CSR matrices
5
*
6
* \author George Karypis
7
* \version\verbatim $Id: csr.c 13437 2013-01-11 21:54:10Z karypis $ \endverbatim
8
*/
9
10
#include <GKlib.h>
11
12
#define OMPMINOPS 50000
13
14
/*************************************************************************/
15
/*! Allocate memory for a CSR matrix and initializes it
16
\returns the allocated matrix. The various fields are set to NULL.
17
*/
18
/**************************************************************************/
19
gk_csr_t *gk_csr_Create()
20
{
21
gk_csr_t *mat;
22
23
mat = (gk_csr_t *)gk_malloc(sizeof(gk_csr_t), "gk_csr_Create: mat");
24
25
gk_csr_Init(mat);
26
27
return mat;
28
}
29
30
31
/*************************************************************************/
32
/*! Initializes the matrix
33
\param mat is the matrix to be initialized.
34
*/
35
/*************************************************************************/
36
void gk_csr_Init(gk_csr_t *mat)
37
{
38
memset(mat, 0, sizeof(gk_csr_t));
39
mat->nrows = mat->ncols = -1;
40
}
41
42
43
/*************************************************************************/
44
/*! Frees all the memory allocated for matrix.
45
\param mat is the matrix to be freed.
46
*/
47
/*************************************************************************/
48
void gk_csr_Free(gk_csr_t **mat)
49
{
50
if (*mat == NULL)
51
return;
52
gk_csr_FreeContents(*mat);
53
gk_free((void **)mat, LTERM);
54
}
55
56
57
/*************************************************************************/
58
/*! Frees only the memory allocated for the matrix's different fields and
59
sets them to NULL.
60
\param mat is the matrix whose contents will be freed.
61
*/
62
/*************************************************************************/
63
void gk_csr_FreeContents(gk_csr_t *mat)
64
{
65
gk_free((void *)&mat->rowptr, &mat->rowind, &mat->rowval, &mat->rowids,
66
&mat->colptr, &mat->colind, &mat->colval, &mat->colids,
67
&mat->rnorms, &mat->cnorms, &mat->rsums, &mat->csums,
68
&mat->rsizes, &mat->csizes, &mat->rvols, &mat->cvols,
69
&mat->rwgts, &mat->cwgts,
70
LTERM);
71
}
72
73
74
/*************************************************************************/
75
/*! Returns a copy of a matrix.
76
\param mat is the matrix to be duplicated.
77
\returns the newly created copy of the matrix.
78
*/
79
/**************************************************************************/
80
gk_csr_t *gk_csr_Dup(gk_csr_t *mat)
81
{
82
gk_csr_t *nmat;
83
84
nmat = gk_csr_Create();
85
86
nmat->nrows = mat->nrows;
87
nmat->ncols = mat->ncols;
88
89
/* copy the row structure */
90
if (mat->rowptr)
91
nmat->rowptr = gk_zcopy(mat->nrows+1, mat->rowptr,
92
gk_zmalloc(mat->nrows+1, "gk_csr_Dup: rowptr"));
93
if (mat->rowids)
94
nmat->rowids = gk_icopy(mat->nrows, mat->rowids,
95
gk_imalloc(mat->nrows, "gk_csr_Dup: rowids"));
96
if (mat->rnorms)
97
nmat->rnorms = gk_fcopy(mat->nrows, mat->rnorms,
98
gk_fmalloc(mat->nrows, "gk_csr_Dup: rnorms"));
99
if (mat->rowind)
100
nmat->rowind = gk_icopy(mat->rowptr[mat->nrows], mat->rowind,
101
gk_imalloc(mat->rowptr[mat->nrows], "gk_csr_Dup: rowind"));
102
if (mat->rowval)
103
nmat->rowval = gk_fcopy(mat->rowptr[mat->nrows], mat->rowval,
104
gk_fmalloc(mat->rowptr[mat->nrows], "gk_csr_Dup: rowval"));
105
106
/* copy the col structure */
107
if (mat->colptr)
108
nmat->colptr = gk_zcopy(mat->ncols+1, mat->colptr,
109
gk_zmalloc(mat->ncols+1, "gk_csr_Dup: colptr"));
110
if (mat->colids)
111
nmat->colids = gk_icopy(mat->ncols, mat->colids,
112
gk_imalloc(mat->ncols, "gk_csr_Dup: colids"));
113
if (mat->cnorms)
114
nmat->cnorms = gk_fcopy(mat->ncols, mat->cnorms,
115
gk_fmalloc(mat->ncols, "gk_csr_Dup: cnorms"));
116
if (mat->colind)
117
nmat->colind = gk_icopy(mat->colptr[mat->ncols], mat->colind,
118
gk_imalloc(mat->colptr[mat->ncols], "gk_csr_Dup: colind"));
119
if (mat->colval)
120
nmat->colval = gk_fcopy(mat->colptr[mat->ncols], mat->colval,
121
gk_fmalloc(mat->colptr[mat->ncols], "gk_csr_Dup: colval"));
122
123
return nmat;
124
}
125
126
127
/*************************************************************************/
128
/*! Returns a submatrix containint a set of consecutive rows.
129
\param mat is the original matrix.
130
\param rstart is the starting row.
131
\param nrows is the number of rows from rstart to extract.
132
\returns the row structure of the newly created submatrix.
133
*/
134
/**************************************************************************/
135
gk_csr_t *gk_csr_ExtractSubmatrix(gk_csr_t *mat, int rstart, int nrows)
136
{
137
ssize_t i;
138
gk_csr_t *nmat;
139
140
if (rstart+nrows > mat->nrows)
141
return NULL;
142
143
nmat = gk_csr_Create();
144
145
nmat->nrows = nrows;
146
nmat->ncols = mat->ncols;
147
148
/* copy the row structure */
149
if (mat->rowptr)
150
nmat->rowptr = gk_zcopy(nrows+1, mat->rowptr+rstart,
151
gk_zmalloc(nrows+1, "gk_csr_ExtractSubmatrix: rowptr"));
152
for (i=nrows; i>=0; i--)
153
nmat->rowptr[i] -= nmat->rowptr[0];
154
ASSERT(nmat->rowptr[0] == 0);
155
156
if (mat->rowids)
157
nmat->rowids = gk_icopy(nrows, mat->rowids+rstart,
158
gk_imalloc(nrows, "gk_csr_ExtractSubmatrix: rowids"));
159
if (mat->rnorms)
160
nmat->rnorms = gk_fcopy(nrows, mat->rnorms+rstart,
161
gk_fmalloc(nrows, "gk_csr_ExtractSubmatrix: rnorms"));
162
163
if (mat->rsums)
164
nmat->rsums = gk_fcopy(nrows, mat->rsums+rstart,
165
gk_fmalloc(nrows, "gk_csr_ExtractSubmatrix: rsums"));
166
167
ASSERT(nmat->rowptr[nrows] == mat->rowptr[rstart+nrows]-mat->rowptr[rstart]);
168
if (mat->rowind)
169
nmat->rowind = gk_icopy(mat->rowptr[rstart+nrows]-mat->rowptr[rstart],
170
mat->rowind+mat->rowptr[rstart],
171
gk_imalloc(mat->rowptr[rstart+nrows]-mat->rowptr[rstart],
172
"gk_csr_ExtractSubmatrix: rowind"));
173
if (mat->rowval)
174
nmat->rowval = gk_fcopy(mat->rowptr[rstart+nrows]-mat->rowptr[rstart],
175
mat->rowval+mat->rowptr[rstart],
176
gk_fmalloc(mat->rowptr[rstart+nrows]-mat->rowptr[rstart],
177
"gk_csr_ExtractSubmatrix: rowval"));
178
179
return nmat;
180
}
181
182
183
/*************************************************************************/
184
/*! Returns a submatrix containing a certain set of rows.
185
\param mat is the original matrix.
186
\param nrows is the number of rows to extract.
187
\param rind is the set of row numbers to extract.
188
\returns the row structure of the newly created submatrix.
189
*/
190
/**************************************************************************/
191
gk_csr_t *gk_csr_ExtractRows(gk_csr_t *mat, int nrows, int *rind)
192
{
193
ssize_t i, ii, j, nnz;
194
gk_csr_t *nmat;
195
196
nmat = gk_csr_Create();
197
198
nmat->nrows = nrows;
199
nmat->ncols = mat->ncols;
200
201
for (nnz=0, i=0; i<nrows; i++)
202
nnz += mat->rowptr[rind[i]+1]-mat->rowptr[rind[i]];
203
204
nmat->rowptr = gk_zmalloc(nmat->nrows+1, "gk_csr_ExtractPartition: rowptr");
205
nmat->rowind = gk_imalloc(nnz, "gk_csr_ExtractPartition: rowind");
206
nmat->rowval = gk_fmalloc(nnz, "gk_csr_ExtractPartition: rowval");
207
208
nmat->rowptr[0] = 0;
209
for (nnz=0, j=0, ii=0; ii<nrows; ii++) {
210
i = rind[ii];
211
gk_icopy(mat->rowptr[i+1]-mat->rowptr[i], mat->rowind+mat->rowptr[i], nmat->rowind+nnz);
212
gk_fcopy(mat->rowptr[i+1]-mat->rowptr[i], mat->rowval+mat->rowptr[i], nmat->rowval+nnz);
213
nnz += mat->rowptr[i+1]-mat->rowptr[i];
214
nmat->rowptr[++j] = nnz;
215
}
216
ASSERT(j == nmat->nrows);
217
218
return nmat;
219
}
220
221
222
/*************************************************************************/
223
/*! Returns a submatrix corresponding to a specified partitioning of rows.
224
\param mat is the original matrix.
225
\param part is the partitioning vector of the rows.
226
\param pid is the partition ID that will be extracted.
227
\returns the row structure of the newly created submatrix.
228
*/
229
/**************************************************************************/
230
gk_csr_t *gk_csr_ExtractPartition(gk_csr_t *mat, int *part, int pid)
231
{
232
ssize_t i, j, nnz;
233
gk_csr_t *nmat;
234
235
nmat = gk_csr_Create();
236
237
nmat->nrows = 0;
238
nmat->ncols = mat->ncols;
239
240
for (nnz=0, i=0; i<mat->nrows; i++) {
241
if (part[i] == pid) {
242
nmat->nrows++;
243
nnz += mat->rowptr[i+1]-mat->rowptr[i];
244
}
245
}
246
247
nmat->rowptr = gk_zmalloc(nmat->nrows+1, "gk_csr_ExtractPartition: rowptr");
248
nmat->rowind = gk_imalloc(nnz, "gk_csr_ExtractPartition: rowind");
249
nmat->rowval = gk_fmalloc(nnz, "gk_csr_ExtractPartition: rowval");
250
251
nmat->rowptr[0] = 0;
252
for (nnz=0, j=0, i=0; i<mat->nrows; i++) {
253
if (part[i] == pid) {
254
gk_icopy(mat->rowptr[i+1]-mat->rowptr[i], mat->rowind+mat->rowptr[i], nmat->rowind+nnz);
255
gk_fcopy(mat->rowptr[i+1]-mat->rowptr[i], mat->rowval+mat->rowptr[i], nmat->rowval+nnz);
256
nnz += mat->rowptr[i+1]-mat->rowptr[i];
257
nmat->rowptr[++j] = nnz;
258
}
259
}
260
ASSERT(j == nmat->nrows);
261
262
return nmat;
263
}
264
265
266
/*************************************************************************/
267
/*! Splits the matrix into multiple sub-matrices based on the provided
268
color array.
269
\param mat is the original matrix.
270
\param color is an array of size equal to the number of non-zeros
271
in the matrix (row-wise structure). The matrix is split into
272
as many parts as the number of colors. For meaningfull results,
273
the colors should be numbered consecutively starting from 0.
274
\returns an array of matrices for each supplied color number.
275
*/
276
/**************************************************************************/
277
gk_csr_t **gk_csr_Split(gk_csr_t *mat, int *color)
278
{
279
ssize_t i, j;
280
int nrows, ncolors;
281
ssize_t *rowptr;
282
int *rowind;
283
float *rowval;
284
gk_csr_t **smats;
285
286
nrows = mat->nrows;
287
rowptr = mat->rowptr;
288
rowind = mat->rowind;
289
rowval = mat->rowval;
290
291
ncolors = gk_imax(rowptr[nrows], color)+1;
292
293
smats = (gk_csr_t **)gk_malloc(sizeof(gk_csr_t *)*ncolors, "gk_csr_Split: smats");
294
for (i=0; i<ncolors; i++) {
295
smats[i] = gk_csr_Create();
296
smats[i]->nrows = mat->nrows;
297
smats[i]->ncols = mat->ncols;
298
smats[i]->rowptr = gk_zsmalloc(nrows+1, 0, "gk_csr_Split: smats[i]->rowptr");
299
}
300
301
for (i=0; i<nrows; i++) {
302
for (j=rowptr[i]; j<rowptr[i+1]; j++)
303
smats[color[j]]->rowptr[i]++;
304
}
305
for (i=0; i<ncolors; i++)
306
MAKECSR(j, nrows, smats[i]->rowptr);
307
308
for (i=0; i<ncolors; i++) {
309
smats[i]->rowind = gk_imalloc(smats[i]->rowptr[nrows], "gk_csr_Split: smats[i]->rowind");
310
smats[i]->rowval = gk_fmalloc(smats[i]->rowptr[nrows], "gk_csr_Split: smats[i]->rowval");
311
}
312
313
for (i=0; i<nrows; i++) {
314
for (j=rowptr[i]; j<rowptr[i+1]; j++) {
315
smats[color[j]]->rowind[smats[color[j]]->rowptr[i]] = rowind[j];
316
smats[color[j]]->rowval[smats[color[j]]->rowptr[i]] = rowval[j];
317
smats[color[j]]->rowptr[i]++;
318
}
319
}
320
321
for (i=0; i<ncolors; i++)
322
SHIFTCSR(j, nrows, smats[i]->rowptr);
323
324
return smats;
325
}
326
327
328
/**************************************************************************/
329
/*! Reads a CSR matrix from the supplied file and stores it the matrix's
330
forward structure.
331
\param filename is the file that stores the data.
332
\param format is either GK_CSR_FMT_METIS, GK_CSR_FMT_CLUTO,
333
GK_CSR_FMT_CSR, GK_CSR_FMT_BINROW, GK_CSR_FMT_BINCOL
334
specifying the type of the input format.
335
The GK_CSR_FMT_CSR does not contain a header
336
line, whereas the GK_CSR_FMT_BINROW is a binary format written
337
by gk_csr_Write() using the same format specifier.
338
\param readvals is either 1 or 0, indicating if the CSR file contains
339
values or it does not. It only applies when GK_CSR_FMT_CSR is
340
used.
341
\param numbering is either 1 or 0, indicating if the numbering of the
342
indices start from 1 or 0, respectively. If they start from 1,
343
they are automatically decreamented during input so that they
344
will start from 0. It only applies when GK_CSR_FMT_CSR is
345
used.
346
\returns the matrix that was read.
347
*/
348
/**************************************************************************/
349
gk_csr_t *gk_csr_Read(char *filename, int format, int readvals, int numbering)
350
{
351
ssize_t i, k, l;
352
size_t nfields, nrows, ncols, nnz, fmt, ncon;
353
size_t lnlen;
354
ssize_t *rowptr;
355
int *rowind, ival;
356
float *rowval=NULL, fval;
357
int readsizes, readwgts;
358
char *line=NULL, *head, *tail, fmtstr[256];
359
FILE *fpin;
360
gk_csr_t *mat=NULL;
361
362
363
if (!gk_fexists(filename))
364
gk_errexit(SIGERR, "File %s does not exist!\n", filename);
365
366
if (format == GK_CSR_FMT_BINROW) {
367
mat = gk_csr_Create();
368
369
fpin = gk_fopen(filename, "rb", "gk_csr_Read: fpin");
370
if (fread(&(mat->nrows), sizeof(int32_t), 1, fpin) != 1)
371
gk_errexit(SIGERR, "Failed to read the nrows from file %s!\n", filename);
372
if (fread(&(mat->ncols), sizeof(int32_t), 1, fpin) != 1)
373
gk_errexit(SIGERR, "Failed to read the ncols from file %s!\n", filename);
374
mat->rowptr = gk_zmalloc(mat->nrows+1, "gk_csr_Read: rowptr");
375
if (fread(mat->rowptr, sizeof(ssize_t), mat->nrows+1, fpin) != mat->nrows+1)
376
gk_errexit(SIGERR, "Failed to read the rowptr from file %s!\n", filename);
377
mat->rowind = gk_imalloc(mat->rowptr[mat->nrows], "gk_csr_Read: rowind");
378
if (fread(mat->rowind, sizeof(int32_t), mat->rowptr[mat->nrows], fpin) != mat->rowptr[mat->nrows])
379
gk_errexit(SIGERR, "Failed to read the rowind from file %s!\n", filename);
380
if (readvals == 1) {
381
mat->rowval = gk_fmalloc(mat->rowptr[mat->nrows], "gk_csr_Read: rowval");
382
if (fread(mat->rowval, sizeof(float), mat->rowptr[mat->nrows], fpin) != mat->rowptr[mat->nrows])
383
gk_errexit(SIGERR, "Failed to read the rowval from file %s!\n", filename);
384
}
385
386
gk_fclose(fpin);
387
return mat;
388
}
389
390
if (format == GK_CSR_FMT_BINCOL) {
391
mat = gk_csr_Create();
392
393
fpin = gk_fopen(filename, "rb", "gk_csr_Read: fpin");
394
if (fread(&(mat->nrows), sizeof(int32_t), 1, fpin) != 1)
395
gk_errexit(SIGERR, "Failed to read the nrows from file %s!\n", filename);
396
if (fread(&(mat->ncols), sizeof(int32_t), 1, fpin) != 1)
397
gk_errexit(SIGERR, "Failed to read the ncols from file %s!\n", filename);
398
mat->colptr = gk_zmalloc(mat->ncols+1, "gk_csr_Read: colptr");
399
if (fread(mat->colptr, sizeof(ssize_t), mat->ncols+1, fpin) != mat->ncols+1)
400
gk_errexit(SIGERR, "Failed to read the colptr from file %s!\n", filename);
401
mat->colind = gk_imalloc(mat->colptr[mat->ncols], "gk_csr_Read: colind");
402
if (fread(mat->colind, sizeof(int32_t), mat->colptr[mat->ncols], fpin) != mat->colptr[mat->ncols])
403
gk_errexit(SIGERR, "Failed to read the colind from file %s!\n", filename);
404
if (readvals) {
405
mat->colval = gk_fmalloc(mat->colptr[mat->ncols], "gk_csr_Read: colval");
406
if (fread(mat->colval, sizeof(float), mat->colptr[mat->ncols], fpin) != mat->colptr[mat->ncols])
407
gk_errexit(SIGERR, "Failed to read the colval from file %s!\n", filename);
408
}
409
410
gk_fclose(fpin);
411
return mat;
412
}
413
414
415
if (format == GK_CSR_FMT_CLUTO) {
416
fpin = gk_fopen(filename, "r", "gk_csr_Read: fpin");
417
do {
418
if (gk_getline(&line, &lnlen, fpin) <= 0)
419
gk_errexit(SIGERR, "Premature end of input file: file:%s\n", filename);
420
} while (line[0] == '%');
421
422
if (sscanf(line, "%zu %zu %zu", &nrows, &ncols, &nnz) != 3)
423
gk_errexit(SIGERR, "Header line must contain 3 integers.\n");
424
425
readsizes = 0;
426
readwgts = 0;
427
readvals = 1;
428
numbering = 1;
429
}
430
else if (format == GK_CSR_FMT_METIS) {
431
fpin = gk_fopen(filename, "r", "gk_csr_Read: fpin");
432
do {
433
if (gk_getline(&line, &lnlen, fpin) <= 0)
434
gk_errexit(SIGERR, "Premature end of input file: file:%s\n", filename);
435
} while (line[0] == '%');
436
437
fmt = ncon = 0;
438
nfields = sscanf(line, "%zu %zu %zu %zu", &nrows, &nnz, &fmt, &ncon);
439
if (nfields < 2)
440
gk_errexit(SIGERR, "Header line must contain at least 2 integers (#vtxs and #edges).\n");
441
442
ncols = nrows;
443
nnz *= 2;
444
445
if (fmt > 111)
446
gk_errexit(SIGERR, "Cannot read this type of file format [fmt=%zu]!\n", fmt);
447
448
sprintf(fmtstr, "%03zu", fmt%1000);
449
readsizes = (fmtstr[0] == '1');
450
readwgts = (fmtstr[1] == '1');
451
readvals = (fmtstr[2] == '1');
452
numbering = 1;
453
ncon = (ncon == 0 ? 1 : ncon);
454
}
455
else {
456
readsizes = 0;
457
readwgts = 0;
458
459
gk_getfilestats(filename, &nrows, &nnz, NULL, NULL);
460
461
if (readvals == 1 && nnz%2 == 1)
462
gk_errexit(SIGERR, "Error: The number of numbers (%zd %d) in the input file is not even.\n", nnz, readvals);
463
if (readvals == 1)
464
nnz = nnz/2;
465
fpin = gk_fopen(filename, "r", "gk_csr_Read: fpin");
466
}
467
468
mat = gk_csr_Create();
469
470
mat->nrows = nrows;
471
472
rowptr = mat->rowptr = gk_zmalloc(nrows+1, "gk_csr_Read: rowptr");
473
rowind = mat->rowind = gk_imalloc(nnz, "gk_csr_Read: rowind");
474
if (readvals != 2)
475
rowval = mat->rowval = gk_fsmalloc(nnz, 1.0, "gk_csr_Read: rowval");
476
477
if (readsizes)
478
mat->rsizes = gk_fsmalloc(nrows, 0.0, "gk_csr_Read: rsizes");
479
480
if (readwgts)
481
mat->rwgts = gk_fsmalloc(nrows*ncon, 0.0, "gk_csr_Read: rwgts");
482
483
/*----------------------------------------------------------------------
484
* Read the sparse matrix file
485
*---------------------------------------------------------------------*/
486
numbering = (numbering ? - 1 : 0);
487
for (ncols=0, rowptr[0]=0, k=0, i=0; i<nrows; i++) {
488
do {
489
if (gk_getline(&line, &lnlen, fpin) == -1)
490
gk_errexit(SIGERR, "Premature end of input file: file while reading row %d\n", i);
491
} while (line[0] == '%');
492
493
head = line;
494
tail = NULL;
495
496
/* Read vertex sizes */
497
if (readsizes) {
498
#ifdef __MSC__
499
mat->rsizes[i] = (float)strtod(head, &tail);
500
#else
501
mat->rsizes[i] = strtof(head, &tail);
502
#endif
503
if (tail == head)
504
gk_errexit(SIGERR, "The line for vertex %zd does not have size information\n", i+1);
505
if (mat->rsizes[i] < 0)
506
errexit("The size for vertex %zd must be >= 0\n", i+1);
507
head = tail;
508
}
509
510
/* Read vertex weights */
511
if (readwgts) {
512
for (l=0; l<ncon; l++) {
513
#ifdef __MSC__
514
mat->rwgts[i*ncon+l] = (float)strtod(head, &tail);
515
#else
516
mat->rwgts[i*ncon+l] = strtof(head, &tail);
517
#endif
518
if (tail == head)
519
errexit("The line for vertex %zd does not have enough weights "
520
"for the %d constraints.\n", i+1, ncon);
521
if (mat->rwgts[i*ncon+l] < 0)
522
errexit("The weight vertex %zd and constraint %zd must be >= 0\n", i+1, l);
523
head = tail;
524
}
525
}
526
527
528
/* Read the rest of the row */
529
while (1) {
530
ival = (int)strtol(head, &tail, 0);
531
if (tail == head)
532
break;
533
head = tail;
534
535
if ((rowind[k] = ival + numbering) < 0)
536
gk_errexit(SIGERR, "Error: Invalid column number %d at row %zd.\n", ival, i);
537
538
ncols = gk_max(rowind[k], ncols);
539
540
if (readvals == 1) {
541
#ifdef __MSC__
542
fval = (float)strtod(head, &tail);
543
#else
544
fval = strtof(head, &tail);
545
#endif
546
if (tail == head)
547
gk_errexit(SIGERR, "Value could not be found for column! Row:%zd, NNZ:%zd\n", i, k);
548
head = tail;
549
550
rowval[k] = fval;
551
}
552
k++;
553
}
554
rowptr[i+1] = k;
555
}
556
557
if (format == GK_CSR_FMT_METIS) {
558
ASSERT(ncols+1 == mat->nrows);
559
mat->ncols = mat->nrows;
560
}
561
else {
562
mat->ncols = ncols+1;
563
}
564
565
if (k != nnz)
566
gk_errexit(SIGERR, "gk_csr_Read: Something wrong with the number of nonzeros in "
567
"the input file. NNZ=%zd, ActualNNZ=%zd.\n", nnz, k);
568
569
gk_fclose(fpin);
570
571
gk_free((void **)&line, LTERM);
572
573
return mat;
574
}
575
576
577
/**************************************************************************/
578
/*! Writes the row-based structure of a matrix into a file.
579
\param mat is the matrix to be written,
580
\param filename is the name of the output file.
581
\param format is one of: GK_CSR_FMT_CLUTO, GK_CSR_FMT_CSR,
582
GK_CSR_FMT_BINROW, GK_CSR_FMT_BINCOL.
583
\param writevals is either 1 or 0 indicating if the values will be
584
written or not. This is only applicable when GK_CSR_FMT_CSR
585
is used.
586
\param numbering is either 1 or 0 indicating if the internal 0-based
587
numbering will be shifted by one or not during output. This
588
is only applicable when GK_CSR_FMT_CSR is used.
589
*/
590
/**************************************************************************/
591
void gk_csr_Write(gk_csr_t *mat, char *filename, int format, int writevals, int numbering)
592
{
593
ssize_t i, j;
594
FILE *fpout;
595
596
if (format == GK_CSR_FMT_BINROW) {
597
if (filename == NULL)
598
gk_errexit(SIGERR, "The filename parameter cannot be NULL.\n");
599
fpout = gk_fopen(filename, "wb", "gk_csr_Write: fpout");
600
601
fwrite(&(mat->nrows), sizeof(int32_t), 1, fpout);
602
fwrite(&(mat->ncols), sizeof(int32_t), 1, fpout);
603
fwrite(mat->rowptr, sizeof(ssize_t), mat->nrows+1, fpout);
604
fwrite(mat->rowind, sizeof(int32_t), mat->rowptr[mat->nrows], fpout);
605
if (writevals)
606
fwrite(mat->rowval, sizeof(float), mat->rowptr[mat->nrows], fpout);
607
608
gk_fclose(fpout);
609
return;
610
}
611
612
if (format == GK_CSR_FMT_BINCOL) {
613
if (filename == NULL)
614
gk_errexit(SIGERR, "The filename parameter cannot be NULL.\n");
615
fpout = gk_fopen(filename, "wb", "gk_csr_Write: fpout");
616
617
fwrite(&(mat->nrows), sizeof(int32_t), 1, fpout);
618
fwrite(&(mat->ncols), sizeof(int32_t), 1, fpout);
619
fwrite(mat->colptr, sizeof(ssize_t), mat->ncols+1, fpout);
620
fwrite(mat->colind, sizeof(int32_t), mat->colptr[mat->ncols], fpout);
621
if (writevals)
622
fwrite(mat->colval, sizeof(float), mat->colptr[mat->ncols], fpout);
623
624
gk_fclose(fpout);
625
return;
626
}
627
628
if (filename)
629
fpout = gk_fopen(filename, "w", "gk_csr_Write: fpout");
630
else
631
fpout = stdout;
632
633
if (format == GK_CSR_FMT_CLUTO) {
634
fprintf(fpout, "%d %d %zd\n", mat->nrows, mat->ncols, mat->rowptr[mat->nrows]);
635
writevals = 1;
636
numbering = 1;
637
}
638
639
for (i=0; i<mat->nrows; i++) {
640
for (j=mat->rowptr[i]; j<mat->rowptr[i+1]; j++) {
641
fprintf(fpout, " %d", mat->rowind[j]+(numbering ? 1 : 0));
642
if (writevals)
643
fprintf(fpout, " %f", mat->rowval[j]);
644
}
645
fprintf(fpout, "\n");
646
}
647
if (filename)
648
gk_fclose(fpout);
649
}
650
651
652
/*************************************************************************/
653
/*! Prunes certain rows/columns of the matrix. The prunning takes place
654
by analyzing the row structure of the matrix. The prunning takes place
655
by removing rows/columns but it does not affect the numbering of the
656
remaining rows/columns.
657
658
\param mat the matrix to be prunned,
659
\param what indicates if the rows (GK_CSR_ROW) or the columns (GK_CSR_COL)
660
of the matrix will be prunned,
661
\param minf is the minimum number of rows (columns) that a column (row) must
662
be present in order to be kept,
663
\param maxf is the maximum number of rows (columns) that a column (row) must
664
be present at in order to be kept.
665
\returns the prunned matrix consisting only of its row-based structure.
666
The input matrix is not modified.
667
*/
668
/**************************************************************************/
669
gk_csr_t *gk_csr_Prune(gk_csr_t *mat, int what, int minf, int maxf)
670
{
671
ssize_t i, j, nnz;
672
int nrows, ncols;
673
ssize_t *rowptr, *nrowptr;
674
int *rowind, *nrowind, *collen;
675
float *rowval, *nrowval;
676
gk_csr_t *nmat;
677
678
nmat = gk_csr_Create();
679
680
nrows = nmat->nrows = mat->nrows;
681
ncols = nmat->ncols = mat->ncols;
682
683
rowptr = mat->rowptr;
684
rowind = mat->rowind;
685
rowval = mat->rowval;
686
687
nrowptr = nmat->rowptr = gk_zmalloc(nrows+1, "gk_csr_Prune: nrowptr");
688
nrowind = nmat->rowind = gk_imalloc(rowptr[nrows], "gk_csr_Prune: nrowind");
689
nrowval = nmat->rowval = gk_fmalloc(rowptr[nrows], "gk_csr_Prune: nrowval");
690
691
692
switch (what) {
693
case GK_CSR_COL:
694
collen = gk_ismalloc(ncols, 0, "gk_csr_Prune: collen");
695
696
for (i=0; i<nrows; i++) {
697
for (j=rowptr[i]; j<rowptr[i+1]; j++) {
698
ASSERT(rowind[j] < ncols);
699
collen[rowind[j]]++;
700
}
701
}
702
for (i=0; i<ncols; i++)
703
collen[i] = (collen[i] >= minf && collen[i] <= maxf ? 1 : 0);
704
705
nrowptr[0] = 0;
706
for (nnz=0, i=0; i<nrows; i++) {
707
for (j=rowptr[i]; j<rowptr[i+1]; j++) {
708
if (collen[rowind[j]]) {
709
nrowind[nnz] = rowind[j];
710
nrowval[nnz] = rowval[j];
711
nnz++;
712
}
713
}
714
nrowptr[i+1] = nnz;
715
}
716
gk_free((void **)&collen, LTERM);
717
break;
718
719
case GK_CSR_ROW:
720
nrowptr[0] = 0;
721
for (nnz=0, i=0; i<nrows; i++) {
722
if (rowptr[i+1]-rowptr[i] >= minf && rowptr[i+1]-rowptr[i] <= maxf) {
723
for (j=rowptr[i]; j<rowptr[i+1]; j++, nnz++) {
724
nrowind[nnz] = rowind[j];
725
nrowval[nnz] = rowval[j];
726
}
727
}
728
nrowptr[i+1] = nnz;
729
}
730
break;
731
732
default:
733
gk_csr_Free(&nmat);
734
gk_errexit(SIGERR, "Unknown prunning type of %d\n", what);
735
return NULL;
736
}
737
738
return nmat;
739
}
740
741
742
/*************************************************************************/
743
/*! Eliminates certain entries from the rows/columns of the matrix. The
744
filtering takes place by keeping only the highest weight entries whose
745
sum accounts for a certain fraction of the overall weight of the
746
row/column.
747
748
\param mat the matrix to be prunned,
749
\param what indicates if the rows (GK_CSR_ROW) or the columns (GK_CSR_COL)
750
of the matrix will be prunned,
751
\param norm indicates the norm that will be used to aggregate the weights
752
and possible values are 1 or 2,
753
\param fraction is the fraction of the overall norm that will be retained
754
by the kept entries.
755
\returns the filtered matrix consisting only of its row-based structure.
756
The input matrix is not modified.
757
*/
758
/**************************************************************************/
759
gk_csr_t *gk_csr_LowFilter(gk_csr_t *mat, int what, int norm, float fraction)
760
{
761
ssize_t i, j, nnz;
762
int nrows, ncols, ncand, maxlen=0;
763
ssize_t *rowptr, *colptr, *nrowptr;
764
int *rowind, *colind, *nrowind;
765
float *rowval, *colval, *nrowval, rsum, tsum;
766
gk_csr_t *nmat;
767
gk_fkv_t *cand;
768
769
nmat = gk_csr_Create();
770
771
nrows = nmat->nrows = mat->nrows;
772
ncols = nmat->ncols = mat->ncols;
773
774
rowptr = mat->rowptr;
775
rowind = mat->rowind;
776
rowval = mat->rowval;
777
colptr = mat->colptr;
778
colind = mat->colind;
779
colval = mat->colval;
780
781
nrowptr = nmat->rowptr = gk_zmalloc(nrows+1, "gk_csr_LowFilter: nrowptr");
782
nrowind = nmat->rowind = gk_imalloc(rowptr[nrows], "gk_csr_LowFilter: nrowind");
783
nrowval = nmat->rowval = gk_fmalloc(rowptr[nrows], "gk_csr_LowFilter: nrowval");
784
785
786
switch (what) {
787
case GK_CSR_COL:
788
if (mat->colptr == NULL)
789
gk_errexit(SIGERR, "Cannot filter columns when column-based structure has not been created.\n");
790
791
gk_zcopy(nrows+1, rowptr, nrowptr);
792
793
for (i=0; i<ncols; i++)
794
maxlen = gk_max(maxlen, colptr[i+1]-colptr[i]);
795
796
#pragma omp parallel private(i, j, ncand, rsum, tsum, cand)
797
{
798
cand = gk_fkvmalloc(maxlen, "gk_csr_LowFilter: cand");
799
800
#pragma omp for schedule(static)
801
for (i=0; i<ncols; i++) {
802
for (tsum=0.0, ncand=0, j=colptr[i]; j<colptr[i+1]; j++, ncand++) {
803
cand[ncand].val = colind[j];
804
cand[ncand].key = colval[j];
805
tsum += (norm == 1 ? colval[j] : colval[j]*colval[j]);
806
}
807
gk_fkvsortd(ncand, cand);
808
809
for (rsum=0.0, j=0; j<ncand && rsum<=fraction*tsum; j++) {
810
rsum += (norm == 1 ? cand[j].key : cand[j].key*cand[j].key);
811
nrowind[nrowptr[cand[j].val]] = i;
812
nrowval[nrowptr[cand[j].val]] = cand[j].key;
813
nrowptr[cand[j].val]++;
814
}
815
}
816
817
gk_free((void **)&cand, LTERM);
818
}
819
820
/* compact the nrowind/nrowval */
821
for (nnz=0, i=0; i<nrows; i++) {
822
for (j=rowptr[i]; j<nrowptr[i]; j++, nnz++) {
823
nrowind[nnz] = nrowind[j];
824
nrowval[nnz] = nrowval[j];
825
}
826
nrowptr[i] = nnz;
827
}
828
SHIFTCSR(i, nrows, nrowptr);
829
830
break;
831
832
case GK_CSR_ROW:
833
if (mat->rowptr == NULL)
834
gk_errexit(SIGERR, "Cannot filter rows when row-based structure has not been created.\n");
835
836
for (i=0; i<nrows; i++)
837
maxlen = gk_max(maxlen, rowptr[i+1]-rowptr[i]);
838
839
#pragma omp parallel private(i, j, ncand, rsum, tsum, cand)
840
{
841
cand = gk_fkvmalloc(maxlen, "gk_csr_LowFilter: cand");
842
843
#pragma omp for schedule(static)
844
for (i=0; i<nrows; i++) {
845
for (tsum=0.0, ncand=0, j=rowptr[i]; j<rowptr[i+1]; j++, ncand++) {
846
cand[ncand].val = rowind[j];
847
cand[ncand].key = rowval[j];
848
tsum += (norm == 1 ? rowval[j] : rowval[j]*rowval[j]);
849
}
850
gk_fkvsortd(ncand, cand);
851
852
for (rsum=0.0, j=0; j<ncand && rsum<=fraction*tsum; j++) {
853
rsum += (norm == 1 ? cand[j].key : cand[j].key*cand[j].key);
854
nrowind[rowptr[i]+j] = cand[j].val;
855
nrowval[rowptr[i]+j] = cand[j].key;
856
}
857
nrowptr[i+1] = rowptr[i]+j;
858
}
859
860
gk_free((void **)&cand, LTERM);
861
}
862
863
/* compact nrowind/nrowval */
864
nrowptr[0] = nnz = 0;
865
for (i=0; i<nrows; i++) {
866
for (j=rowptr[i]; j<nrowptr[i+1]; j++, nnz++) {
867
nrowind[nnz] = nrowind[j];
868
nrowval[nnz] = nrowval[j];
869
}
870
nrowptr[i+1] = nnz;
871
}
872
873
break;
874
875
default:
876
gk_csr_Free(&nmat);
877
gk_errexit(SIGERR, "Unknown prunning type of %d\n", what);
878
return NULL;
879
}
880
881
return nmat;
882
}
883
884
885
/*************************************************************************/
886
/*! Eliminates certain entries from the rows/columns of the matrix. The
887
filtering takes place by keeping only the highest weight top-K entries
888
along each row/column and those entries whose weight is greater than
889
a specified value.
890
891
\param mat the matrix to be prunned,
892
\param what indicates if the rows (GK_CSR_ROW) or the columns (GK_CSR_COL)
893
of the matrix will be prunned,
894
\param topk is the number of the highest weight entries to keep.
895
\param keepval is the weight of a term above which will be kept. This
896
is used to select additional terms past the first topk.
897
\returns the filtered matrix consisting only of its row-based structure.
898
The input matrix is not modified.
899
*/
900
/**************************************************************************/
901
gk_csr_t *gk_csr_TopKPlusFilter(gk_csr_t *mat, int what, int topk, float keepval)
902
{
903
ssize_t i, j, k, nnz;
904
int nrows, ncols, ncand;
905
ssize_t *rowptr, *colptr, *nrowptr;
906
int *rowind, *colind, *nrowind;
907
float *rowval, *colval, *nrowval;
908
gk_csr_t *nmat;
909
gk_fkv_t *cand;
910
911
nmat = gk_csr_Create();
912
913
nrows = nmat->nrows = mat->nrows;
914
ncols = nmat->ncols = mat->ncols;
915
916
rowptr = mat->rowptr;
917
rowind = mat->rowind;
918
rowval = mat->rowval;
919
colptr = mat->colptr;
920
colind = mat->colind;
921
colval = mat->colval;
922
923
nrowptr = nmat->rowptr = gk_zmalloc(nrows+1, "gk_csr_LowFilter: nrowptr");
924
nrowind = nmat->rowind = gk_imalloc(rowptr[nrows], "gk_csr_LowFilter: nrowind");
925
nrowval = nmat->rowval = gk_fmalloc(rowptr[nrows], "gk_csr_LowFilter: nrowval");
926
927
928
switch (what) {
929
case GK_CSR_COL:
930
if (mat->colptr == NULL)
931
gk_errexit(SIGERR, "Cannot filter columns when column-based structure has not been created.\n");
932
933
cand = gk_fkvmalloc(nrows, "gk_csr_LowFilter: cand");
934
935
gk_zcopy(nrows+1, rowptr, nrowptr);
936
for (i=0; i<ncols; i++) {
937
for (ncand=0, j=colptr[i]; j<colptr[i+1]; j++, ncand++) {
938
cand[ncand].val = colind[j];
939
cand[ncand].key = colval[j];
940
}
941
gk_fkvsortd(ncand, cand);
942
943
k = gk_min(topk, ncand);
944
for (j=0; j<k; j++) {
945
nrowind[nrowptr[cand[j].val]] = i;
946
nrowval[nrowptr[cand[j].val]] = cand[j].key;
947
nrowptr[cand[j].val]++;
948
}
949
for (; j<ncand; j++) {
950
if (cand[j].key < keepval)
951
break;
952
953
nrowind[nrowptr[cand[j].val]] = i;
954
nrowval[nrowptr[cand[j].val]] = cand[j].key;
955
nrowptr[cand[j].val]++;
956
}
957
}
958
959
/* compact the nrowind/nrowval */
960
for (nnz=0, i=0; i<nrows; i++) {
961
for (j=rowptr[i]; j<nrowptr[i]; j++, nnz++) {
962
nrowind[nnz] = nrowind[j];
963
nrowval[nnz] = nrowval[j];
964
}
965
nrowptr[i] = nnz;
966
}
967
SHIFTCSR(i, nrows, nrowptr);
968
969
gk_free((void **)&cand, LTERM);
970
break;
971
972
case GK_CSR_ROW:
973
if (mat->rowptr == NULL)
974
gk_errexit(SIGERR, "Cannot filter rows when row-based structure has not been created.\n");
975
976
cand = gk_fkvmalloc(ncols, "gk_csr_LowFilter: cand");
977
978
nrowptr[0] = 0;
979
for (nnz=0, i=0; i<nrows; i++) {
980
for (ncand=0, j=rowptr[i]; j<rowptr[i+1]; j++, ncand++) {
981
cand[ncand].val = rowind[j];
982
cand[ncand].key = rowval[j];
983
}
984
gk_fkvsortd(ncand, cand);
985
986
k = gk_min(topk, ncand);
987
for (j=0; j<k; j++, nnz++) {
988
nrowind[nnz] = cand[j].val;
989
nrowval[nnz] = cand[j].key;
990
}
991
for (; j<ncand; j++, nnz++) {
992
if (cand[j].key < keepval)
993
break;
994
995
nrowind[nnz] = cand[j].val;
996
nrowval[nnz] = cand[j].key;
997
}
998
nrowptr[i+1] = nnz;
999
}
1000
1001
gk_free((void **)&cand, LTERM);
1002
break;
1003
1004
default:
1005
gk_csr_Free(&nmat);
1006
gk_errexit(SIGERR, "Unknown prunning type of %d\n", what);
1007
return NULL;
1008
}
1009
1010
return nmat;
1011
}
1012
1013
1014
/*************************************************************************/
1015
/*! Eliminates certain entries from the rows/columns of the matrix. The
1016
filtering takes place by keeping only the terms whose contribution to
1017
the total length of the document is greater than a user-splied multiple
1018
over the average.
1019
1020
This routine assumes that the vectors are normalized to be unit length.
1021
1022
\param mat the matrix to be prunned,
1023
\param what indicates if the rows (GK_CSR_ROW) or the columns (GK_CSR_COL)
1024
of the matrix will be prunned,
1025
\param zscore is the multiplicative factor over the average contribution
1026
to the length of the document.
1027
\returns the filtered matrix consisting only of its row-based structure.
1028
The input matrix is not modified.
1029
*/
1030
/**************************************************************************/
1031
gk_csr_t *gk_csr_ZScoreFilter(gk_csr_t *mat, int what, float zscore)
1032
{
1033
ssize_t i, j, nnz;
1034
int nrows;
1035
ssize_t *rowptr, *nrowptr;
1036
int *rowind, *nrowind;
1037
float *rowval, *nrowval, avgwgt;
1038
gk_csr_t *nmat;
1039
1040
nmat = gk_csr_Create();
1041
1042
nmat->nrows = mat->nrows;
1043
nmat->ncols = mat->ncols;
1044
1045
nrows = mat->nrows;
1046
rowptr = mat->rowptr;
1047
rowind = mat->rowind;
1048
rowval = mat->rowval;
1049
1050
nrowptr = nmat->rowptr = gk_zmalloc(nrows+1, "gk_csr_ZScoreFilter: nrowptr");
1051
nrowind = nmat->rowind = gk_imalloc(rowptr[nrows], "gk_csr_ZScoreFilter: nrowind");
1052
nrowval = nmat->rowval = gk_fmalloc(rowptr[nrows], "gk_csr_ZScoreFilter: nrowval");
1053
1054
1055
switch (what) {
1056
case GK_CSR_COL:
1057
gk_errexit(SIGERR, "This has not been implemented yet.\n");
1058
break;
1059
1060
case GK_CSR_ROW:
1061
if (mat->rowptr == NULL)
1062
gk_errexit(SIGERR, "Cannot filter rows when row-based structure has not been created.\n");
1063
1064
nrowptr[0] = 0;
1065
for (nnz=0, i=0; i<nrows; i++) {
1066
avgwgt = zscore/(rowptr[i+1]-rowptr[i]);
1067
for (j=rowptr[i]; j<rowptr[i+1]; j++) {
1068
if (rowval[j] > avgwgt) {
1069
nrowind[nnz] = rowind[j];
1070
nrowval[nnz] = rowval[j];
1071
nnz++;
1072
}
1073
}
1074
nrowptr[i+1] = nnz;
1075
}
1076
break;
1077
1078
default:
1079
gk_csr_Free(&nmat);
1080
gk_errexit(SIGERR, "Unknown prunning type of %d\n", what);
1081
return NULL;
1082
}
1083
1084
return nmat;
1085
}
1086
1087
1088
/*************************************************************************/
1089
/*! Compacts the column-space of the matrix by removing empty columns.
1090
As a result of the compaction, the column numbers are renumbered.
1091
The compaction operation is done in place and only affects the row-based
1092
representation of the matrix.
1093
The new columns are ordered in decreasing frequency.
1094
1095
\param mat the matrix whose empty columns will be removed.
1096
*/
1097
/**************************************************************************/
1098
void gk_csr_CompactColumns(gk_csr_t *mat)
1099
{
1100
ssize_t i;
1101
int nrows, ncols, nncols;
1102
ssize_t *rowptr;
1103
int *rowind, *colmap;
1104
gk_ikv_t *clens;
1105
1106
nrows = mat->nrows;
1107
ncols = mat->ncols;
1108
rowptr = mat->rowptr;
1109
rowind = mat->rowind;
1110
1111
colmap = gk_imalloc(ncols, "gk_csr_CompactColumns: colmap");
1112
1113
clens = gk_ikvmalloc(ncols, "gk_csr_CompactColumns: clens");
1114
for (i=0; i<ncols; i++) {
1115
clens[i].key = 0;
1116
clens[i].val = i;
1117
}
1118
1119
for (i=0; i<rowptr[nrows]; i++)
1120
clens[rowind[i]].key++;
1121
gk_ikvsortd(ncols, clens);
1122
1123
for (nncols=0, i=0; i<ncols; i++) {
1124
if (clens[i].key > 0)
1125
colmap[clens[i].val] = nncols++;
1126
else
1127
break;
1128
}
1129
1130
for (i=0; i<rowptr[nrows]; i++)
1131
rowind[i] = colmap[rowind[i]];
1132
1133
mat->ncols = nncols;
1134
1135
gk_free((void **)&colmap, &clens, LTERM);
1136
}
1137
1138
1139
/*************************************************************************/
1140
/*! Sorts the indices in increasing order
1141
\param mat the matrix itself,
1142
\param what is either GK_CSR_ROW or GK_CSR_COL indicating which set of
1143
indices to sort.
1144
*/
1145
/**************************************************************************/
1146
void gk_csr_SortIndices(gk_csr_t *mat, int what)
1147
{
1148
int n, nn=0;
1149
ssize_t *ptr;
1150
int *ind;
1151
float *val;
1152
1153
switch (what) {
1154
case GK_CSR_ROW:
1155
if (!mat->rowptr)
1156
gk_errexit(SIGERR, "Row-based view of the matrix does not exists.\n");
1157
1158
n = mat->nrows;
1159
ptr = mat->rowptr;
1160
ind = mat->rowind;
1161
val = mat->rowval;
1162
break;
1163
1164
case GK_CSR_COL:
1165
if (!mat->colptr)
1166
gk_errexit(SIGERR, "Column-based view of the matrix does not exists.\n");
1167
1168
n = mat->ncols;
1169
ptr = mat->colptr;
1170
ind = mat->colind;
1171
val = mat->colval;
1172
break;
1173
1174
default:
1175
gk_errexit(SIGERR, "Invalid index type of %d.\n", what);
1176
return;
1177
}
1178
1179
#pragma omp parallel if (n > 100)
1180
{
1181
ssize_t i, j, k;
1182
gk_ikv_t *cand;
1183
float *tval;
1184
1185
#pragma omp single
1186
for (i=0; i<n; i++)
1187
nn = gk_max(nn, ptr[i+1]-ptr[i]);
1188
1189
cand = gk_ikvmalloc(nn, "gk_csr_SortIndices: cand");
1190
tval = gk_fmalloc(nn, "gk_csr_SortIndices: tval");
1191
1192
#pragma omp for schedule(static)
1193
for (i=0; i<n; i++) {
1194
for (k=0, j=ptr[i]; j<ptr[i+1]; j++) {
1195
if (j > ptr[i] && ind[j] < ind[j-1])
1196
k = 1; /* an inversion */
1197
cand[j-ptr[i]].val = j-ptr[i];
1198
cand[j-ptr[i]].key = ind[j];
1199
tval[j-ptr[i]] = val[j];
1200
}
1201
if (k) {
1202
gk_ikvsorti(ptr[i+1]-ptr[i], cand);
1203
for (j=ptr[i]; j<ptr[i+1]; j++) {
1204
ind[j] = cand[j-ptr[i]].key;
1205
val[j] = tval[cand[j-ptr[i]].val];
1206
}
1207
}
1208
}
1209
1210
gk_free((void **)&cand, &tval, LTERM);
1211
}
1212
1213
}
1214
1215
1216
/*************************************************************************/
1217
/*! Creates a row/column index from the column/row data.
1218
\param mat the matrix itself,
1219
\param what is either GK_CSR_ROW or GK_CSR_COL indicating which index
1220
will be created.
1221
*/
1222
/**************************************************************************/
1223
void gk_csr_CreateIndex(gk_csr_t *mat, int what)
1224
{
1225
/* 'f' stands for forward, 'r' stands for reverse */
1226
ssize_t i, j, k, nf, nr;
1227
ssize_t *fptr, *rptr;
1228
int *find, *rind;
1229
float *fval, *rval;
1230
1231
switch (what) {
1232
case GK_CSR_COL:
1233
nf = mat->nrows;
1234
fptr = mat->rowptr;
1235
find = mat->rowind;
1236
fval = mat->rowval;
1237
1238
if (mat->colptr) gk_free((void **)&mat->colptr, LTERM);
1239
if (mat->colind) gk_free((void **)&mat->colind, LTERM);
1240
if (mat->colval) gk_free((void **)&mat->colval, LTERM);
1241
1242
nr = mat->ncols;
1243
rptr = mat->colptr = gk_zsmalloc(nr+1, 0, "gk_csr_CreateIndex: rptr");
1244
rind = mat->colind = gk_imalloc(fptr[nf], "gk_csr_CreateIndex: rind");
1245
rval = mat->colval = (fval ? gk_fmalloc(fptr[nf], "gk_csr_CreateIndex: rval") : NULL);
1246
break;
1247
case GK_CSR_ROW:
1248
nf = mat->ncols;
1249
fptr = mat->colptr;
1250
find = mat->colind;
1251
fval = mat->colval;
1252
1253
if (mat->rowptr) gk_free((void **)&mat->rowptr, LTERM);
1254
if (mat->rowind) gk_free((void **)&mat->rowind, LTERM);
1255
if (mat->rowval) gk_free((void **)&mat->rowval, LTERM);
1256
1257
nr = mat->nrows;
1258
rptr = mat->rowptr = gk_zsmalloc(nr+1, 0, "gk_csr_CreateIndex: rptr");
1259
rind = mat->rowind = gk_imalloc(fptr[nf], "gk_csr_CreateIndex: rind");
1260
rval = mat->rowval = (fval ? gk_fmalloc(fptr[nf], "gk_csr_CreateIndex: rval") : NULL);
1261
break;
1262
default:
1263
gk_errexit(SIGERR, "Invalid index type of %d.\n", what);
1264
return;
1265
}
1266
1267
1268
for (i=0; i<nf; i++) {
1269
for (j=fptr[i]; j<fptr[i+1]; j++)
1270
rptr[find[j]]++;
1271
}
1272
MAKECSR(i, nr, rptr);
1273
1274
if (rptr[nr] > 6*nr) {
1275
for (i=0; i<nf; i++) {
1276
for (j=fptr[i]; j<fptr[i+1]; j++)
1277
rind[rptr[find[j]]++] = i;
1278
}
1279
SHIFTCSR(i, nr, rptr);
1280
1281
if (fval) {
1282
for (i=0; i<nf; i++) {
1283
for (j=fptr[i]; j<fptr[i+1]; j++)
1284
rval[rptr[find[j]]++] = fval[j];
1285
}
1286
SHIFTCSR(i, nr, rptr);
1287
}
1288
}
1289
else {
1290
if (fval) {
1291
for (i=0; i<nf; i++) {
1292
for (j=fptr[i]; j<fptr[i+1]; j++) {
1293
k = find[j];
1294
rind[rptr[k]] = i;
1295
rval[rptr[k]++] = fval[j];
1296
}
1297
}
1298
}
1299
else {
1300
for (i=0; i<nf; i++) {
1301
for (j=fptr[i]; j<fptr[i+1]; j++)
1302
rind[rptr[find[j]]++] = i;
1303
}
1304
}
1305
SHIFTCSR(i, nr, rptr);
1306
}
1307
}
1308
1309
1310
/*************************************************************************/
1311
/*! Normalizes the rows/columns of the matrix to be unit
1312
length.
1313
\param mat the matrix itself,
1314
\param what indicates what will be normalized and is obtained by
1315
specifying GK_CSR_ROW, GK_CSR_COL, GK_CSR_ROW|GK_CSR_COL.
1316
\param norm indicates what norm is to normalize to, 1: 1-norm, 2: 2-norm
1317
*/
1318
/**************************************************************************/
1319
void gk_csr_Normalize(gk_csr_t *mat, int what, int norm)
1320
{
1321
ssize_t i, j;
1322
int n;
1323
ssize_t *ptr;
1324
float *val, sum;
1325
1326
if (what&GK_CSR_ROW && mat->rowval) {
1327
n = mat->nrows;
1328
ptr = mat->rowptr;
1329
val = mat->rowval;
1330
1331
#pragma omp parallel if (ptr[n] > OMPMINOPS)
1332
{
1333
#pragma omp for private(j,sum) schedule(static)
1334
for (i=0; i<n; i++) {
1335
for (sum=0.0, j=ptr[i]; j<ptr[i+1]; j++){
1336
if (norm == 2)
1337
sum += val[j]*val[j];
1338
else if (norm == 1)
1339
sum += val[j]; /* assume val[j] > 0 */
1340
}
1341
if (sum > 0) {
1342
if (norm == 2)
1343
sum=1.0/sqrt(sum);
1344
else if (norm == 1)
1345
sum=1.0/sum;
1346
for (j=ptr[i]; j<ptr[i+1]; j++)
1347
val[j] *= sum;
1348
1349
}
1350
}
1351
}
1352
}
1353
1354
if (what&GK_CSR_COL && mat->colval) {
1355
n = mat->ncols;
1356
ptr = mat->colptr;
1357
val = mat->colval;
1358
1359
#pragma omp parallel if (ptr[n] > OMPMINOPS)
1360
{
1361
#pragma omp for private(j,sum) schedule(static)
1362
for (i=0; i<n; i++) {
1363
for (sum=0.0, j=ptr[i]; j<ptr[i+1]; j++)
1364
if (norm == 2)
1365
sum += val[j]*val[j];
1366
else if (norm == 1)
1367
sum += val[j];
1368
if (sum > 0) {
1369
if (norm == 2)
1370
sum=1.0/sqrt(sum);
1371
else if (norm == 1)
1372
sum=1.0/sum;
1373
for (j=ptr[i]; j<ptr[i+1]; j++)
1374
val[j] *= sum;
1375
}
1376
}
1377
}
1378
}
1379
}
1380
1381
1382
/*************************************************************************/
1383
/*! Applies different row scaling methods.
1384
\param mat the matrix itself,
1385
\param type indicates the type of row scaling. Possible values are:
1386
GK_CSR_MAXTF, GK_CSR_SQRT, GK_CSR_LOG, GK_CSR_IDF, GK_CSR_MAXTF2.
1387
*/
1388
/**************************************************************************/
1389
void gk_csr_Scale(gk_csr_t *mat, int type)
1390
{
1391
ssize_t i, j;
1392
int nrows, ncols, nnzcols, bgfreq;
1393
ssize_t *rowptr;
1394
int *rowind, *collen;
1395
float *rowval, *cscale, maxtf;
1396
1397
nrows = mat->nrows;
1398
rowptr = mat->rowptr;
1399
rowind = mat->rowind;
1400
rowval = mat->rowval;
1401
1402
switch (type) {
1403
case GK_CSR_MAXTF: /* TF' = .5 + .5*TF/MAX(TF) */
1404
#pragma omp parallel if (rowptr[nrows] > OMPMINOPS)
1405
{
1406
#pragma omp for private(j, maxtf) schedule(static)
1407
for (i=0; i<nrows; i++) {
1408
maxtf = fabs(rowval[rowptr[i]]);
1409
for (j=rowptr[i]; j<rowptr[i+1]; j++)
1410
maxtf = (maxtf < fabs(rowval[j]) ? fabs(rowval[j]) : maxtf);
1411
1412
for (j=rowptr[i]; j<rowptr[i+1]; j++)
1413
rowval[j] = .5 + .5*rowval[j]/maxtf;
1414
}
1415
}
1416
break;
1417
1418
case GK_CSR_MAXTF2: /* TF' = .1 + .9*TF/MAX(TF) */
1419
#pragma omp parallel if (rowptr[nrows] > OMPMINOPS)
1420
{
1421
#pragma omp for private(j, maxtf) schedule(static)
1422
for (i=0; i<nrows; i++) {
1423
maxtf = fabs(rowval[rowptr[i]]);
1424
for (j=rowptr[i]; j<rowptr[i+1]; j++)
1425
maxtf = (maxtf < fabs(rowval[j]) ? fabs(rowval[j]) : maxtf);
1426
1427
for (j=rowptr[i]; j<rowptr[i+1]; j++)
1428
rowval[j] = .1 + .9*rowval[j]/maxtf;
1429
}
1430
}
1431
break;
1432
1433
case GK_CSR_SQRT: /* TF' = .1+SQRT(TF) */
1434
#pragma omp parallel if (rowptr[nrows] > OMPMINOPS)
1435
{
1436
#pragma omp for private(j) schedule(static)
1437
for (i=0; i<nrows; i++) {
1438
for (j=rowptr[i]; j<rowptr[i+1]; j++) {
1439
if (rowval[j] != 0.0)
1440
rowval[j] = .1+sign(rowval[j], sqrt(fabs(rowval[j])));
1441
}
1442
}
1443
}
1444
break;
1445
1446
case GK_CSR_POW25: /* TF' = .1+POW(TF,.25) */
1447
#pragma omp parallel if (rowptr[nrows] > OMPMINOPS)
1448
{
1449
#pragma omp for private(j) schedule(static)
1450
for (i=0; i<nrows; i++) {
1451
for (j=rowptr[i]; j<rowptr[i+1]; j++) {
1452
if (rowval[j] != 0.0)
1453
rowval[j] = .1+sign(rowval[j], sqrt(sqrt(fabs(rowval[j]))));
1454
}
1455
}
1456
}
1457
break;
1458
1459
case GK_CSR_POW65: /* TF' = .1+POW(TF,.65) */
1460
#pragma omp parallel if (rowptr[nrows] > OMPMINOPS)
1461
{
1462
#pragma omp for private(j) schedule(static)
1463
for (i=0; i<nrows; i++) {
1464
for (j=rowptr[i]; j<rowptr[i+1]; j++) {
1465
if (rowval[j] != 0.0)
1466
rowval[j] = .1+sign(rowval[j], powf(fabs(rowval[j]), .65));
1467
}
1468
}
1469
}
1470
break;
1471
1472
case GK_CSR_POW75: /* TF' = .1+POW(TF,.75) */
1473
#pragma omp parallel if (rowptr[nrows] > OMPMINOPS)
1474
{
1475
#pragma omp for private(j) schedule(static)
1476
for (i=0; i<nrows; i++) {
1477
for (j=rowptr[i]; j<rowptr[i+1]; j++) {
1478
if (rowval[j] != 0.0)
1479
rowval[j] = .1+sign(rowval[j], powf(fabs(rowval[j]), .75));
1480
}
1481
}
1482
}
1483
break;
1484
1485
case GK_CSR_POW85: /* TF' = .1+POW(TF,.85) */
1486
#pragma omp parallel if (rowptr[nrows] > OMPMINOPS)
1487
{
1488
#pragma omp for private(j) schedule(static)
1489
for (i=0; i<nrows; i++) {
1490
for (j=rowptr[i]; j<rowptr[i+1]; j++) {
1491
if (rowval[j] != 0.0)
1492
rowval[j] = .1+sign(rowval[j], powf(fabs(rowval[j]), .85));
1493
}
1494
}
1495
}
1496
break;
1497
1498
case GK_CSR_LOG: /* TF' = 1+log_2(TF) */
1499
#pragma omp parallel if (rowptr[nrows] > OMPMINOPS)
1500
{
1501
double logscale = 1.0/log(2.0);
1502
#pragma omp for schedule(static,32)
1503
for (i=0; i<rowptr[nrows]; i++) {
1504
if (rowval[i] != 0.0)
1505
rowval[i] = 1+(rowval[i]>0.0 ? log(rowval[i]) : -log(-rowval[i]))*logscale;
1506
}
1507
#ifdef XXX
1508
#pragma omp for private(j) schedule(static)
1509
for (i=0; i<nrows; i++) {
1510
for (j=rowptr[i]; j<rowptr[i+1]; j++) {
1511
if (rowval[j] != 0.0)
1512
rowval[j] = 1+(rowval[j]>0.0 ? log(rowval[j]) : -log(-rowval[j]))*logscale;
1513
//rowval[j] = 1+sign(rowval[j], log(fabs(rowval[j]))*logscale);
1514
}
1515
}
1516
#endif
1517
}
1518
break;
1519
1520
case GK_CSR_IDF: /* TF' = TF*IDF */
1521
ncols = mat->ncols;
1522
cscale = gk_fmalloc(ncols, "gk_csr_Scale: cscale");
1523
collen = gk_ismalloc(ncols, 0, "gk_csr_Scale: collen");
1524
1525
for (i=0; i<nrows; i++) {
1526
for (j=rowptr[i]; j<rowptr[i+1]; j++)
1527
collen[rowind[j]]++;
1528
}
1529
1530
#pragma omp parallel if (ncols > OMPMINOPS)
1531
{
1532
#pragma omp for schedule(static)
1533
for (i=0; i<ncols; i++)
1534
cscale[i] = (collen[i] > 0 ? log(1.0*nrows/collen[i]) : 0.0);
1535
}
1536
1537
#pragma omp parallel if (rowptr[nrows] > OMPMINOPS)
1538
{
1539
#pragma omp for private(j) schedule(static)
1540
for (i=0; i<nrows; i++) {
1541
for (j=rowptr[i]; j<rowptr[i+1]; j++)
1542
rowval[j] *= cscale[rowind[j]];
1543
}
1544
}
1545
1546
gk_free((void **)&cscale, &collen, LTERM);
1547
break;
1548
1549
case GK_CSR_IDF2: /* TF' = TF*IDF */
1550
ncols = mat->ncols;
1551
cscale = gk_fmalloc(ncols, "gk_csr_Scale: cscale");
1552
collen = gk_ismalloc(ncols, 0, "gk_csr_Scale: collen");
1553
1554
for (i=0; i<nrows; i++) {
1555
for (j=rowptr[i]; j<rowptr[i+1]; j++)
1556
collen[rowind[j]]++;
1557
}
1558
1559
nnzcols = 0;
1560
#pragma omp parallel if (ncols > OMPMINOPS)
1561
{
1562
#pragma omp for schedule(static) reduction(+:nnzcols)
1563
for (i=0; i<ncols; i++)
1564
nnzcols += (collen[i] > 0 ? 1 : 0);
1565
1566
bgfreq = gk_max(10, (ssize_t)(.5*rowptr[nrows]/nnzcols));
1567
printf("nnz: %zd, nnzcols: %d, bgfreq: %d\n", rowptr[nrows], nnzcols, bgfreq);
1568
1569
#pragma omp for schedule(static)
1570
for (i=0; i<ncols; i++)
1571
cscale[i] = (collen[i] > 0 ? log(1.0*(nrows+2*bgfreq)/(bgfreq+collen[i])) : 0.0);
1572
}
1573
1574
#pragma omp parallel if (rowptr[nrows] > OMPMINOPS)
1575
{
1576
#pragma omp for private(j) schedule(static)
1577
for (i=0; i<nrows; i++) {
1578
for (j=rowptr[i]; j<rowptr[i+1]; j++)
1579
rowval[j] *= cscale[rowind[j]];
1580
}
1581
}
1582
1583
gk_free((void **)&cscale, &collen, LTERM);
1584
break;
1585
1586
default:
1587
gk_errexit(SIGERR, "Unknown scaling type of %d\n", type);
1588
}
1589
1590
}
1591
1592
1593
/*************************************************************************/
1594
/*! Computes the sums of the rows/columns
1595
\param mat the matrix itself,
1596
\param what is either GK_CSR_ROW or GK_CSR_COL indicating which
1597
sums to compute.
1598
*/
1599
/**************************************************************************/
1600
void gk_csr_ComputeSums(gk_csr_t *mat, int what)
1601
{
1602
ssize_t i;
1603
int n;
1604
ssize_t *ptr;
1605
float *val, *sums;
1606
1607
switch (what) {
1608
case GK_CSR_ROW:
1609
n = mat->nrows;
1610
ptr = mat->rowptr;
1611
val = mat->rowval;
1612
1613
if (mat->rsums)
1614
gk_free((void **)&mat->rsums, LTERM);
1615
1616
sums = mat->rsums = gk_fsmalloc(n, 0, "gk_csr_ComputeSums: sums");
1617
break;
1618
case GK_CSR_COL:
1619
n = mat->ncols;
1620
ptr = mat->colptr;
1621
val = mat->colval;
1622
1623
if (mat->csums)
1624
gk_free((void **)&mat->csums, LTERM);
1625
1626
sums = mat->csums = gk_fsmalloc(n, 0, "gk_csr_ComputeSums: sums");
1627
break;
1628
default:
1629
gk_errexit(SIGERR, "Invalid sum type of %d.\n", what);
1630
return;
1631
}
1632
1633
#pragma omp parallel for if (ptr[n] > OMPMINOPS) schedule(static)
1634
for (i=0; i<n; i++)
1635
sums[i] = gk_fsum(ptr[i+1]-ptr[i], val+ptr[i], 1);
1636
}
1637
1638
1639
/*************************************************************************/
1640
/*! Computes the squared of the norms of the rows/columns
1641
\param mat the matrix itself,
1642
\param what is either GK_CSR_ROW or GK_CSR_COL indicating which
1643
squared norms to compute.
1644
*/
1645
/**************************************************************************/
1646
void gk_csr_ComputeSquaredNorms(gk_csr_t *mat, int what)
1647
{
1648
ssize_t i;
1649
int n;
1650
ssize_t *ptr;
1651
float *val, *norms;
1652
1653
switch (what) {
1654
case GK_CSR_ROW:
1655
n = mat->nrows;
1656
ptr = mat->rowptr;
1657
val = mat->rowval;
1658
1659
if (mat->rnorms) gk_free((void **)&mat->rnorms, LTERM);
1660
1661
norms = mat->rnorms = gk_fsmalloc(n, 0, "gk_csr_ComputeSums: norms");
1662
break;
1663
case GK_CSR_COL:
1664
n = mat->ncols;
1665
ptr = mat->colptr;
1666
val = mat->colval;
1667
1668
if (mat->cnorms) gk_free((void **)&mat->cnorms, LTERM);
1669
1670
norms = mat->cnorms = gk_fsmalloc(n, 0, "gk_csr_ComputeSums: norms");
1671
break;
1672
default:
1673
gk_errexit(SIGERR, "Invalid norm type of %d.\n", what);
1674
return;
1675
}
1676
1677
#pragma omp parallel for if (ptr[n] > OMPMINOPS) schedule(static)
1678
for (i=0; i<n; i++)
1679
norms[i] = gk_fdot(ptr[i+1]-ptr[i], val+ptr[i], 1, val+ptr[i], 1);
1680
}
1681
1682
1683
/*************************************************************************/
1684
/*! Computes the similarity between two rows/columns
1685
1686
\param mat the matrix itself. The routine assumes that the indices
1687
are sorted in increasing order.
1688
\param i1 is the first row/column,
1689
\param i2 is the second row/column,
1690
\param what is either GK_CSR_ROW or GK_CSR_COL indicating the type of
1691
objects between the similarity will be computed,
1692
\param simtype is the type of similarity and is one of GK_CSR_COS,
1693
GK_CSR_JAC, GK_CSR_MIN, GK_CSR_AMIN
1694
\returns the similarity between the two rows/columns.
1695
*/
1696
/**************************************************************************/
1697
float gk_csr_ComputeSimilarity(gk_csr_t *mat, int i1, int i2, int what, int simtype)
1698
{
1699
int nind1, nind2;
1700
int *ind1, *ind2;
1701
float *val1, *val2, stat1, stat2, sim;
1702
1703
switch (what) {
1704
case GK_CSR_ROW:
1705
if (!mat->rowptr)
1706
gk_errexit(SIGERR, "Row-based view of the matrix does not exists.\n");
1707
nind1 = mat->rowptr[i1+1]-mat->rowptr[i1];
1708
nind2 = mat->rowptr[i2+1]-mat->rowptr[i2];
1709
ind1 = mat->rowind + mat->rowptr[i1];
1710
ind2 = mat->rowind + mat->rowptr[i2];
1711
val1 = mat->rowval + mat->rowptr[i1];
1712
val2 = mat->rowval + mat->rowptr[i2];
1713
break;
1714
1715
case GK_CSR_COL:
1716
if (!mat->colptr)
1717
gk_errexit(SIGERR, "Column-based view of the matrix does not exists.\n");
1718
nind1 = mat->colptr[i1+1]-mat->colptr[i1];
1719
nind2 = mat->colptr[i2+1]-mat->colptr[i2];
1720
ind1 = mat->colind + mat->colptr[i1];
1721
ind2 = mat->colind + mat->colptr[i2];
1722
val1 = mat->colval + mat->colptr[i1];
1723
val2 = mat->colval + mat->colptr[i2];
1724
break;
1725
1726
default:
1727
gk_errexit(SIGERR, "Invalid index type of %d.\n", what);
1728
return 0.0;
1729
}
1730
1731
1732
switch (simtype) {
1733
case GK_CSR_COS:
1734
case GK_CSR_JAC:
1735
sim = stat1 = stat2 = 0.0;
1736
i1 = i2 = 0;
1737
while (i1<nind1 && i2<nind2) {
1738
if (i1 == nind1) {
1739
stat2 += val2[i2]*val2[i2];
1740
i2++;
1741
}
1742
else if (i2 == nind2) {
1743
stat1 += val1[i1]*val1[i1];
1744
i1++;
1745
}
1746
else if (ind1[i1] < ind2[i2]) {
1747
stat1 += val1[i1]*val1[i1];
1748
i1++;
1749
}
1750
else if (ind1[i1] > ind2[i2]) {
1751
stat2 += val2[i2]*val2[i2];
1752
i2++;
1753
}
1754
else {
1755
sim += val1[i1]*val2[i2];
1756
stat1 += val1[i1]*val1[i1];
1757
stat2 += val2[i2]*val2[i2];
1758
i1++;
1759
i2++;
1760
}
1761
}
1762
if (simtype == GK_CSR_COS)
1763
sim = (stat1*stat2 > 0.0 ? sim/sqrt(stat1*stat2) : 0.0);
1764
else
1765
sim = (stat1+stat2-sim > 0.0 ? sim/(stat1+stat2-sim) : 0.0);
1766
break;
1767
1768
case GK_CSR_MIN:
1769
sim = stat1 = stat2 = 0.0;
1770
i1 = i2 = 0;
1771
while (i1<nind1 && i2<nind2) {
1772
if (i1 == nind1) {
1773
stat2 += val2[i2];
1774
i2++;
1775
}
1776
else if (i2 == nind2) {
1777
stat1 += val1[i1];
1778
i1++;
1779
}
1780
else if (ind1[i1] < ind2[i2]) {
1781
stat1 += val1[i1];
1782
i1++;
1783
}
1784
else if (ind1[i1] > ind2[i2]) {
1785
stat2 += val2[i2];
1786
i2++;
1787
}
1788
else {
1789
sim += gk_min(val1[i1],val2[i2]);
1790
stat1 += val1[i1];
1791
stat2 += val2[i2];
1792
i1++;
1793
i2++;
1794
}
1795
}
1796
sim = (stat1+stat2-sim > 0.0 ? sim/(stat1+stat2-sim) : 0.0);
1797
1798
break;
1799
1800
case GK_CSR_AMIN:
1801
sim = stat1 = stat2 = 0.0;
1802
i1 = i2 = 0;
1803
while (i1<nind1 && i2<nind2) {
1804
if (i1 == nind1) {
1805
stat2 += val2[i2];
1806
i2++;
1807
}
1808
else if (i2 == nind2) {
1809
stat1 += val1[i1];
1810
i1++;
1811
}
1812
else if (ind1[i1] < ind2[i2]) {
1813
stat1 += val1[i1];
1814
i1++;
1815
}
1816
else if (ind1[i1] > ind2[i2]) {
1817
stat2 += val2[i2];
1818
i2++;
1819
}
1820
else {
1821
sim += gk_min(val1[i1],val2[i2]);
1822
stat1 += val1[i1];
1823
stat2 += val2[i2];
1824
i1++;
1825
i2++;
1826
}
1827
}
1828
sim = (stat1 > 0.0 ? sim/stat1 : 0.0);
1829
1830
break;
1831
1832
default:
1833
gk_errexit(SIGERR, "Unknown similarity measure %d\n", simtype);
1834
return -1;
1835
}
1836
1837
return sim;
1838
1839
}
1840
1841
1842
/*************************************************************************/
1843
/*! Finds the n most similar rows (neighbors) to the query using cosine
1844
similarity.
1845
1846
\param mat the matrix itself
1847
\param nqterms is the number of columns in the query
1848
\param qind is the list of query columns
1849
\param qval is the list of correspodning query weights
1850
\param simtype is the type of similarity and is one of GK_CSR_COS,
1851
GK_CSR_JAC, GK_CSR_MIN, GK_CSR_AMIN
1852
\param nsim is the maximum number of requested most similar rows.
1853
If -1 is provided, then everything is returned unsorted.
1854
\param minsim is the minimum similarity of the requested most
1855
similar rows
1856
\param hits is the result set. This array should be at least
1857
of length nsim.
1858
\param i_marker is an array of size equal to the number of rows
1859
whose values are initialized to -1. If NULL is provided
1860
then this array is allocated and freed internally.
1861
\param i_cand is an array of size equal to the number of rows.
1862
If NULL is provided then this array is allocated and freed
1863
internally.
1864
\returns the number of identified most similar rows, which can be
1865
smaller than the requested number of nnbrs in those cases
1866
in which there are no sufficiently many neighbors.
1867
*/
1868
/**************************************************************************/
1869
int gk_csr_GetSimilarRows(gk_csr_t *mat, int nqterms, int *qind,
1870
float *qval, int simtype, int nsim, float minsim, gk_fkv_t *hits,
1871
int *i_marker, gk_fkv_t *i_cand)
1872
{
1873
ssize_t i, ii, j, k;
1874
int nrows, ncols, ncand;
1875
ssize_t *colptr;
1876
int *colind, *marker;
1877
float *colval, *rnorms, mynorm, *rsums, mysum;
1878
gk_fkv_t *cand;
1879
1880
if (nqterms == 0)
1881
return 0;
1882
1883
nrows = mat->nrows;
1884
ncols = mat->ncols;
1885
colptr = mat->colptr;
1886
colind = mat->colind;
1887
colval = mat->colval;
1888
1889
marker = (i_marker ? i_marker : gk_ismalloc(nrows, -1, "gk_csr_SimilarRows: marker"));
1890
cand = (i_cand ? i_cand : gk_fkvmalloc(nrows, "gk_csr_SimilarRows: cand"));
1891
1892
switch (simtype) {
1893
case GK_CSR_COS:
1894
for (ncand=0, ii=0; ii<nqterms; ii++) {
1895
i = qind[ii];
1896
if (i < ncols) {
1897
for (j=colptr[i]; j<colptr[i+1]; j++) {
1898
k = colind[j];
1899
if (marker[k] == -1) {
1900
cand[ncand].val = k;
1901
cand[ncand].key = 0;
1902
marker[k] = ncand++;
1903
}
1904
cand[marker[k]].key += colval[j]*qval[ii];
1905
}
1906
}
1907
}
1908
break;
1909
1910
case GK_CSR_JAC:
1911
for (ncand=0, ii=0; ii<nqterms; ii++) {
1912
i = qind[ii];
1913
if (i < ncols) {
1914
for (j=colptr[i]; j<colptr[i+1]; j++) {
1915
k = colind[j];
1916
if (marker[k] == -1) {
1917
cand[ncand].val = k;
1918
cand[ncand].key = 0;
1919
marker[k] = ncand++;
1920
}
1921
cand[marker[k]].key += colval[j]*qval[ii];
1922
}
1923
}
1924
}
1925
1926
rnorms = mat->rnorms;
1927
mynorm = gk_fdot(nqterms, qval, 1, qval, 1);
1928
1929
for (i=0; i<ncand; i++)
1930
cand[i].key = cand[i].key/(rnorms[cand[i].val]+mynorm-cand[i].key);
1931
break;
1932
1933
case GK_CSR_MIN:
1934
for (ncand=0, ii=0; ii<nqterms; ii++) {
1935
i = qind[ii];
1936
if (i < ncols) {
1937
for (j=colptr[i]; j<colptr[i+1]; j++) {
1938
k = colind[j];
1939
if (marker[k] == -1) {
1940
cand[ncand].val = k;
1941
cand[ncand].key = 0;
1942
marker[k] = ncand++;
1943
}
1944
cand[marker[k]].key += gk_min(colval[j], qval[ii]);
1945
}
1946
}
1947
}
1948
1949
rsums = mat->rsums;
1950
mysum = gk_fsum(nqterms, qval, 1);
1951
1952
for (i=0; i<ncand; i++)
1953
cand[i].key = cand[i].key/(rsums[cand[i].val]+mysum-cand[i].key);
1954
break;
1955
1956
/* Assymetric MIN similarity */
1957
case GK_CSR_AMIN:
1958
for (ncand=0, ii=0; ii<nqterms; ii++) {
1959
i = qind[ii];
1960
if (i < ncols) {
1961
for (j=colptr[i]; j<colptr[i+1]; j++) {
1962
k = colind[j];
1963
if (marker[k] == -1) {
1964
cand[ncand].val = k;
1965
cand[ncand].key = 0;
1966
marker[k] = ncand++;
1967
}
1968
cand[marker[k]].key += gk_min(colval[j], qval[ii]);
1969
}
1970
}
1971
}
1972
1973
mysum = gk_fsum(nqterms, qval, 1);
1974
1975
for (i=0; i<ncand; i++)
1976
cand[i].key = cand[i].key/mysum;
1977
break;
1978
1979
default:
1980
gk_errexit(SIGERR, "Unknown similarity measure %d\n", simtype);
1981
return -1;
1982
}
1983
1984
/* go and prune the hits that are bellow minsim */
1985
for (j=0, i=0; i<ncand; i++) {
1986
marker[cand[i].val] = -1;
1987
if (cand[i].key >= minsim)
1988
cand[j++] = cand[i];
1989
}
1990
ncand = j;
1991
1992
if (nsim == -1 || nsim >= ncand) {
1993
nsim = ncand;
1994
}
1995
else {
1996
nsim = gk_min(nsim, ncand);
1997
gk_dfkvkselect(ncand, nsim, cand);
1998
gk_fkvsortd(nsim, cand);
1999
}
2000
2001
gk_fkvcopy(nsim, cand, hits);
2002
2003
if (i_marker == NULL)
2004
gk_free((void **)&marker, LTERM);
2005
if (i_cand == NULL)
2006
gk_free((void **)&cand, LTERM);
2007
2008
return nsim;
2009
}
2010
2011
2012