Path: blob/master/clustering_old/text_similarity/text_similarity.R
2620 views
# Comparing text similarity12# -----------------------------------------------------------------------------------------------------3# Documentation code4library(dplyr)5library(proxy)6library(data.table)7setwd("/Users/ethen/machine-learning/text_similarity/data")89doc <- lapply( list.files(), readLines )1011# remove punctuation mark and convert to lower cases12# and extra white space as a single blank13doc1 <- lapply( doc, function(x)14{15text <- gsub( "[[:punct:]]", "", x ) %>% tolower()16text <- gsub( "\\s+", " ", text ) %>% str_trim()17word <- strsplit( text, " " ) %>% unlist()18return(word)19})202122# ------------------------------------------23# test code, not implemented24doc1 <- lapply( doc, function(x)25{26text <- gsub( "[[:punct:]]|\\s+", "", x ) %>% tolower()27return(text)28})2930# letter (character) shingling31Shingling <- function( document, k )32{33shingles <- character( length = nchar(document) - k + 1 )3435for( i in 1:nchar(document) )36{37if( i + k - 1 > nchar(document) )38break39shingles[i] <- substring( document, i, i + k - 1 )40}41return( unique(shingles) )42}434445# ---------------------------------------------------------------------------------46# Shingling47# ---------------------------------------------------------------------------------4849# word shingling50Shingling <- function( document, k )51{52shingles <- character( length = ( length(document) - k + 1 ) )5354for( i in 1:( length(document) - k + 1 ) )55shingles[i] <- paste( document[ i:( i + k - 1 ) ], collapse = " " )5657return( unique(shingles) )58}5960doc1 <- lapply( doc1, function(x)61{62Shingling( x, k = 3 )63})64doc1[[1]]6566# ---------------------------------------------------------------------------------67# Jaccard Similarity68# ---------------------------------------------------------------------------------6970# unique sets on shingles across all documents71doc_dict <- unlist(doc1) %>% unique()7273# convert to boolean matrices, where74# rows = elements of the universal set (every possible combinations across all documents )75# columns = one column per document76# thus the matrix has one in row i and column j if and only if document j contains the term i77M <- lapply( doc1, function( set, dict )78{79as.integer( dict %in% set )80}, dict = doc_dict ) %>% data.frame()8182# set the names for both rows and columns83setnames( M, paste( "doc", 1:length(doc), sep = "_" ) )84rownames(M) <- doc_dict85M8687# How similar is two given document, jaccard similarity88JaccardSimilarity <- function( x, y )89{90non_zero <- which( x | y )91set_intersect <- sum( x[non_zero] & y[non_zero] )92set_union <- length(non_zero)93return( set_intersect / set_union )94}9596# create a new entry in the registry97pr_DB$set_entry( FUN = JaccardSimilarity, names = c("JaccardSimilarity") )9899# distance matrix100d1 <- dist( t(M), method = "JaccardSimilarity" )101102# delete entry103pr_DB$delete_entry( "JaccardSimilarity" )104d1105doc106107108# ---------------------------------------------------------------------------------109# MinHash110# ---------------------------------------------------------------------------------111112# the # of shingle sets are large, use minhash to convert large sets into short signatures113# while still preserving similarity114115# random permutation116# number of hash functions (signature number )117signature_num <- 4118119# prime number120prime <- 17121set.seed(12345)122coeff_a <- sample( nrow(M), signature_num )123coeff_b <- sample( nrow(M), signature_num )124125# check that it does permute126permute <- lapply( 1:signature_num, function(s)127{128hash <- numeric( length = length(nrow(M)) )129for( i in 1:nrow(M) )130hash[i] <- ( coeff_a[s] * i + coeff_b[s] ) %% prime131132return(hash)133})134# # convert to data frame135permute_df <- structure( permute, names = paste0( "hash_", 1:length(permute) ) ) %>%136data.frame()137138139# -----------------------------------------------------------------------------140# bind with the original characteristic matrix, using the first two sig141M1 <- cbind( M, permute_df[1:2] )142rownames(M1) <- 1:nrow(M1)143M1144145# calculate signatures146147# obtain the non zero rows' index148non_zero_rows <- lapply( 1:ncol(M), function(j)149{150return( which( M[ , j ] != 0 ) )151})152153# initialize signature matrix154SM <- matrix( data = NA, nrow = signature_num, ncol = ncol(M) )155156# for each column (document)157for( i in 1:ncol(M) )158{159# for each hash function (signature)'s value160for( s in 1:signature_num )161SM[ s, i ] <- min( permute_df[ , s ][ non_zero_rows[[i]] ] )162}163colnames(SM) <- paste( "doc", 1:length(doc), sep = "_" )164rownames(SM) <- paste( "minhash", 1:signature_num, sep = "_" )165SM166167# signature similarity168SigSimilarity <- function( x, y ) mean( x == y )169170pr_DB$set_entry( FUN = SigSimilarity, names = c("SigSimilarity") )171d2 <- dist( t(SM), method = "SigSimilarity" )172pr_DB$delete_entry( "SigSimilarity" )173d2174as.matrix(d2)175176# ---------------------------------------------------------------------------------177# Locality Sensitive Hashing178# ---------------------------------------------------------------------------------179180# number of bands and rows181bands <- 2182rows <- nrow(SM) / bands183184data.frame(SM) %>%185mutate( band = rep( 1:bands, each = rows ) ) %>%186select( band, everything() )187188189# work on this later190# evaluation and a bigger example1911921 - ( 1 - (.5)^rows )^bands193194195# http://okomestudio.net/biboroku/?p=2065#id3005927054196197198199200201