Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
ethen8181
GitHub Repository: ethen8181/machine-learning
Path: blob/master/clustering_old/text_similarity/text_similarity.R
2620 views
1
# Comparing text similarity
2
3
# -----------------------------------------------------------------------------------------------------
4
# Documentation code
5
library(dplyr)
6
library(proxy)
7
library(data.table)
8
setwd("/Users/ethen/machine-learning/text_similarity/data")
9
10
doc <- lapply( list.files(), readLines )
11
12
# remove punctuation mark and convert to lower cases
13
# and extra white space as a single blank
14
doc1 <- lapply( doc, function(x)
15
{
16
text <- gsub( "[[:punct:]]", "", x ) %>% tolower()
17
text <- gsub( "\\s+", " ", text ) %>% str_trim()
18
word <- strsplit( text, " " ) %>% unlist()
19
return(word)
20
})
21
22
23
# ------------------------------------------
24
# test code, not implemented
25
doc1 <- lapply( doc, function(x)
26
{
27
text <- gsub( "[[:punct:]]|\\s+", "", x ) %>% tolower()
28
return(text)
29
})
30
31
# letter (character) shingling
32
Shingling <- function( document, k )
33
{
34
shingles <- character( length = nchar(document) - k + 1 )
35
36
for( i in 1:nchar(document) )
37
{
38
if( i + k - 1 > nchar(document) )
39
break
40
shingles[i] <- substring( document, i, i + k - 1 )
41
}
42
return( unique(shingles) )
43
}
44
45
46
# ---------------------------------------------------------------------------------
47
# Shingling
48
# ---------------------------------------------------------------------------------
49
50
# word shingling
51
Shingling <- function( document, k )
52
{
53
shingles <- character( length = ( length(document) - k + 1 ) )
54
55
for( i in 1:( length(document) - k + 1 ) )
56
shingles[i] <- paste( document[ i:( i + k - 1 ) ], collapse = " " )
57
58
return( unique(shingles) )
59
}
60
61
doc1 <- lapply( doc1, function(x)
62
{
63
Shingling( x, k = 3 )
64
})
65
doc1[[1]]
66
67
# ---------------------------------------------------------------------------------
68
# Jaccard Similarity
69
# ---------------------------------------------------------------------------------
70
71
# unique sets on shingles across all documents
72
doc_dict <- unlist(doc1) %>% unique()
73
74
# convert to boolean matrices, where
75
# rows = elements of the universal set (every possible combinations across all documents )
76
# columns = one column per document
77
# thus the matrix has one in row i and column j if and only if document j contains the term i
78
M <- lapply( doc1, function( set, dict )
79
{
80
as.integer( dict %in% set )
81
}, dict = doc_dict ) %>% data.frame()
82
83
# set the names for both rows and columns
84
setnames( M, paste( "doc", 1:length(doc), sep = "_" ) )
85
rownames(M) <- doc_dict
86
M
87
88
# How similar is two given document, jaccard similarity
89
JaccardSimilarity <- function( x, y )
90
{
91
non_zero <- which( x | y )
92
set_intersect <- sum( x[non_zero] & y[non_zero] )
93
set_union <- length(non_zero)
94
return( set_intersect / set_union )
95
}
96
97
# create a new entry in the registry
98
pr_DB$set_entry( FUN = JaccardSimilarity, names = c("JaccardSimilarity") )
99
100
# distance matrix
101
d1 <- dist( t(M), method = "JaccardSimilarity" )
102
103
# delete entry
104
pr_DB$delete_entry( "JaccardSimilarity" )
105
d1
106
doc
107
108
109
# ---------------------------------------------------------------------------------
110
# MinHash
111
# ---------------------------------------------------------------------------------
112
113
# the # of shingle sets are large, use minhash to convert large sets into short signatures
114
# while still preserving similarity
115
116
# random permutation
117
# number of hash functions (signature number )
118
signature_num <- 4
119
120
# prime number
121
prime <- 17
122
set.seed(12345)
123
coeff_a <- sample( nrow(M), signature_num )
124
coeff_b <- sample( nrow(M), signature_num )
125
126
# check that it does permute
127
permute <- lapply( 1:signature_num, function(s)
128
{
129
hash <- numeric( length = length(nrow(M)) )
130
for( i in 1:nrow(M) )
131
hash[i] <- ( coeff_a[s] * i + coeff_b[s] ) %% prime
132
133
return(hash)
134
})
135
# # convert to data frame
136
permute_df <- structure( permute, names = paste0( "hash_", 1:length(permute) ) ) %>%
137
data.frame()
138
139
140
# -----------------------------------------------------------------------------
141
# bind with the original characteristic matrix, using the first two sig
142
M1 <- cbind( M, permute_df[1:2] )
143
rownames(M1) <- 1:nrow(M1)
144
M1
145
146
# calculate signatures
147
148
# obtain the non zero rows' index
149
non_zero_rows <- lapply( 1:ncol(M), function(j)
150
{
151
return( which( M[ , j ] != 0 ) )
152
})
153
154
# initialize signature matrix
155
SM <- matrix( data = NA, nrow = signature_num, ncol = ncol(M) )
156
157
# for each column (document)
158
for( i in 1:ncol(M) )
159
{
160
# for each hash function (signature)'s value
161
for( s in 1:signature_num )
162
SM[ s, i ] <- min( permute_df[ , s ][ non_zero_rows[[i]] ] )
163
}
164
colnames(SM) <- paste( "doc", 1:length(doc), sep = "_" )
165
rownames(SM) <- paste( "minhash", 1:signature_num, sep = "_" )
166
SM
167
168
# signature similarity
169
SigSimilarity <- function( x, y ) mean( x == y )
170
171
pr_DB$set_entry( FUN = SigSimilarity, names = c("SigSimilarity") )
172
d2 <- dist( t(SM), method = "SigSimilarity" )
173
pr_DB$delete_entry( "SigSimilarity" )
174
d2
175
as.matrix(d2)
176
177
# ---------------------------------------------------------------------------------
178
# Locality Sensitive Hashing
179
# ---------------------------------------------------------------------------------
180
181
# number of bands and rows
182
bands <- 2
183
rows <- nrow(SM) / bands
184
185
data.frame(SM) %>%
186
mutate( band = rep( 1:bands, each = rows ) ) %>%
187
select( band, everything() )
188
189
190
# work on this later
191
# evaluation and a bigger example
192
193
1 - ( 1 - (.5)^rows )^bands
194
195
196
# http://okomestudio.net/biboroku/?p=2065#id3005927054
197
198
199
200
201