CoCalc -- text_similarity.R

GitHub Repository: ethen8181/machine-learning
Path: blob/master/clustering_old/text_similarity/text_similarity.R
²⁶²⁰ views
1
# Comparing text similarity 
2

3
# -----------------------------------------------------------------------------------------------------
4
# Documentation code 
5
library(dplyr)
6
library(proxy)
7
library(data.table)
8
setwd("/Users/ethen/machine-learning/text_similarity/data")
9

10
doc <- lapply( list.files(), readLines )
11

12
# remove punctuation mark and convert to lower cases
13
# and extra white space as a single blank
14
doc1 <- lapply( doc, function(x)
15
{
16
	text <- gsub( "[[:punct:]]", "", x ) %>% tolower()
17
	text <- gsub( "\\s+", " ", text ) %>% str_trim()	
18
	word <- strsplit( text, " " ) %>% unlist()
19
	return(word)
20
})
21

22

23
# ------------------------------------------
24
# test code, not implemented
25
doc1 <- lapply( doc, function(x)
26
{
27
 	text <- gsub( "[[:punct:]]|\\s+", "", x ) %>% tolower()
28
	return(text)
29
})
30

31
# letter (character) shingling
32
Shingling <- function( document, k )
33
{
34
	shingles <- character( length = nchar(document) - k + 1 )	
35

36
	for( i in 1:nchar(document) )
37
	{
38
		if( i + k - 1 > nchar(document) )
39
			break
40
		shingles[i] <- substring( document, i, i + k - 1 )
41
	}
42
	return( unique(shingles) )	
43
}
44

45

46
# ---------------------------------------------------------------------------------
47
#                  Shingling 
48
# ---------------------------------------------------------------------------------
49

50
# word shingling
51
Shingling <- function( document, k )
52
{
53
	shingles <- character( length = ( length(document) - k + 1 ) )
54

55
	for( i in 1:( length(document) - k + 1 ) )
56
		shingles[i] <- paste( document[ i:( i + k - 1 ) ], collapse = " " )
57

58
	return( unique(shingles) )
59
}
60

61
doc1 <- lapply( doc1, function(x)
62
{
63
	Shingling( x, k = 3 )
64
})
65
doc1[[1]]
66

67
# ---------------------------------------------------------------------------------
68
#                 Jaccard Similarity  
69
# ---------------------------------------------------------------------------------
70

71
# unique sets on shingles across all documents
72
doc_dict <- unlist(doc1) %>% unique()
73

74
# convert to boolean matrices, where 
75
# rows    = elements of the universal set (every possible combinations across all documents )
76
# columns = one column per document
77
# thus the matrix has one in row i and column j if and only if document j contains the term i 
78
M <- lapply( doc1, function( set, dict )
79
{
80
	as.integer( dict %in% set )
81
}, dict = doc_dict ) %>% data.frame() 
82

83
# set the names for both rows and columns
84
setnames( M, paste( "doc", 1:length(doc), sep = "_" ) )
85
rownames(M) <- doc_dict
86
M
87

88
# How similar is two given document, jaccard similarity 
89
JaccardSimilarity <- function( x, y )
90
{
91
	non_zero <- which( x | y )
92
	set_intersect <- sum( x[non_zero] & y[non_zero] )
93
	set_union <- length(non_zero)
94
	return( set_intersect / set_union ) 
95
}
96

97
# create a new entry in the registry
98
pr_DB$set_entry( FUN = JaccardSimilarity, names = c("JaccardSimilarity") )
99

100
# distance matrix 
101
d1 <- dist( t(M), method = "JaccardSimilarity" )
102

103
# delete entry
104
pr_DB$delete_entry( "JaccardSimilarity" )
105
d1
106
doc
107

108

109
# ---------------------------------------------------------------------------------
110
#                 MinHash   
111
# ---------------------------------------------------------------------------------
112

113
# the # of shingle sets are large, use minhash to convert large sets into short signatures
114
# while still preserving similarity
115

116
# random permutation
117
# number of hash functions (signature number )
118
signature_num <- 4
119

120
# prime number
121
prime <- 17
122
set.seed(12345)
123
coeff_a <- sample( nrow(M), signature_num )
124
coeff_b <- sample( nrow(M), signature_num )
125

126
# check that it does permute 
127
permute <- lapply( 1:signature_num, function(s)
128
{
129
	hash <- numeric( length = length(nrow(M)) )
130
	for( i in 1:nrow(M) )
131
		hash[i] <- ( coeff_a[s] * i + coeff_b[s] ) %% prime
132
	
133
	return(hash)
134
})
135
# # convert to data frame 
136
permute_df <- structure( permute, names = paste0( "hash_", 1:length(permute) ) ) %>%
137
              data.frame()
138

139

140
# -----------------------------------------------------------------------------
141
# bind with the original characteristic matrix, using the first two sig 
142
M1 <- cbind( M, permute_df[1:2] )
143
rownames(M1) <- 1:nrow(M1)
144
M1
145

146
# calculate signatures 
147

148
# obtain the non zero rows' index
149
non_zero_rows <- lapply( 1:ncol(M), function(j)
150
{
151
	return( which( M[ , j ] != 0 ) )
152
})
153

154
# initialize signature matrix
155
SM <- matrix( data = NA, nrow = signature_num, ncol = ncol(M) )
156

157
# for each column (document)
158
for( i in 1:ncol(M) )
159
{
160
	# for each hash function (signature)'s value 
161
	for( s in 1:signature_num )
162
		SM[ s, i ] <- min( permute_df[ , s ][ non_zero_rows[[i]] ] )
163
}
164
colnames(SM) <- paste( "doc", 1:length(doc), sep = "_" )
165
rownames(SM) <- paste( "minhash", 1:signature_num, sep = "_" )
166
SM	
167

168
# signature similarity 
169
SigSimilarity <- function( x, y ) mean( x == y )
170

171
pr_DB$set_entry( FUN = SigSimilarity, names = c("SigSimilarity") )
172
d2 <- dist( t(SM), method = "SigSimilarity" )
173
pr_DB$delete_entry( "SigSimilarity" )
174
d2
175
as.matrix(d2)
176

177
# ---------------------------------------------------------------------------------
178
#                 Locality Sensitive Hashing    
179
# ---------------------------------------------------------------------------------
180

181
# number of bands and rows
182
bands <- 2
183
rows <- nrow(SM) / bands
184

185
data.frame(SM) %>% 
186
mutate( band = rep( 1:bands, each = rows ) ) %>%
187
select( band, everything() )
188

189

190
# work on this later
191
# evaluation and a bigger example 
192

193
1 - ( 1 - (.5)^rows )^bands
194

195

196
# http://okomestudio.net/biboroku/?p=2065#id3005927054
197

198

199

200

201
Product

Resources

Company