# tf_idf.R
# Source: GitHub repository ethen8181/machine-learning
# Path: clustering_old/tf_idf/tf_idf.R
1
library(tm)
2
library(proxy)
3
library(dplyr)
4
5
# toy corpus: four short documents used throughout the tf-idf walkthrough
doc <- c(
    "The sky is blue.",
    "The sun is bright today.",
    "The sun in the sky is bright.",
    "We can see the shining sun, the bright sun."
)
7
8
# -----------------------------------------------------------------------------------
# TF-IDF
# -----------------------------------------------------------------------------------

# build the corpus; the control list strips punctuation, removes English
# stop words (see stopwords("english")) and lowercases the text
doc_corpus <- Corpus( VectorSource(doc) )
control_list <- list( removePunctuation = TRUE, stopwords = TRUE, tolower = TRUE )
tdm <- TermDocumentMatrix( doc_corpus, control = control_list )
# inspect(tdm) to view the raw counts

# term-frequency matrix: terms in rows, documents in columns
tf <- as.matrix(tdm)

# inverse document frequency:
# log( #documents / (1 + #documents that contain the term) )
idf <- log( ncol(tf) / ( 1 + rowSums( tf != 0 ) ) )
print(idf)

# place the idf weights on the diagonal of a square matrix
idf <- diag(idf)
print(idf)

# tf-idf = t(tf) %*% idf; crossprod() is the faster equivalent
# (remember the original tf matrix has to be transposed).
# diag() drops the dimnames, so restore the term names on the columns.
tf_idf <- crossprod( tf, idf )
colnames(tf_idf) <- rownames(tf)
print(tf_idf)

# normalize each document (row) to unit euclidean length
print( tf_idf / sqrt( rowSums( tf_idf^2 ) ) )
37
38
39
# -----------------------------------------------------------------------------------
# Text Clustering
# -----------------------------------------------------------------------------------

# worked cosine-similarity example on two 2-d vectors
a <- c( 3, 4 )
b <- c( 5, 6 )

# cosine = dot(a, b) / (|a| * |b|); store the two pieces separately first
l <- list(
    numerator   = sum( a * b ),
    denominator = sqrt( sum( a^2 ) ) * sqrt( sum( b^2 ) )
)

# report both the cosine value and the angle in degrees
# (acos() returns radians; * 180 / pi converts to degrees)
cosine_value <- l$numerator / l$denominator
list( cosine = cosine_value, degree = acos(cosine_value) * 180 / pi )
51
52
53
# news data
# NOTE(review): read via an absolute path instead of setwd() — changing the
# working directory is a global side effect scripts should avoid, and nothing
# later in the file relies on the changed directory.
data_dir <- "/Users/ethen/machine-learning/tf_idf"
news <- read.csv( file.path( data_dir, "news.csv" ), stringsAsFactors = FALSE )
56
57
# [TFIDF] :
# @vector = pass in a character vector of documents
# Returns the document-term tf-idf matrix: one row per document, one column
# per term, with the terms as column names.
TFIDF <- function( vector )
{
    # term-frequency matrix (terms in rows, documents in columns);
    # preprocessing mirrors the walkthrough above: strip punctuation,
    # drop English stop words, lowercase
    news_corpus <- Corpus( VectorSource(vector) )
    control_list <- list( removePunctuation = TRUE, stopwords = TRUE, tolower = TRUE )
    tf <- TermDocumentMatrix( news_corpus, control = control_list ) %>% as.matrix()

    # idf weights on the diagonal: log( #docs / (1 + #docs containing term) )
    idf <- log( ncol(tf) / ( 1 + rowSums( tf != 0 ) ) ) %>% diag()

    # crossprod(tf, idf) == t(tf) %*% idf, but faster; diag() drops the
    # dimnames, so restore the term names on the columns — the inline demo
    # above does the same, and without this the returned matrix has no
    # column labels
    tf_idf <- crossprod( tf, idf )
    colnames(tf_idf) <- rownames(tf)
    return(tf_idf)
}
71
72
# tf-idf matrix computed from the news titles
news_tf_idf <- TFIDF( news$title )
74
75
76
# [Cosine] :
# @x, @y = numeric vectors of equal length
# Returns the angle between x and y in degrees (0 = same direction,
# 90 = orthogonal); used as a pair-wise distance for clustering.
Cosine <- function( x, y )
{
    similarity <- sum( x * y ) / ( sqrt( sum( y^2 ) ) * sqrt( sum( x^2 ) ) )

    # floating-point rounding can push the ratio slightly outside [-1, 1]
    # (e.g. for identical vectors), which would make acos() return NaN;
    # clamp before converting
    similarity <- max( -1, min( 1, similarity ) )

    # given the cosine value, use acos to convert back to degrees:
    # acos returns the radian, multiply it by 180 and divide by pi
    return( acos(similarity) * 180 / pi )
}
86
87
# calculate the pair-wise distance matrix with the custom Cosine function:
# temporarily register it in proxy's method database so dist() can find it
pr_DB$set_entry( FUN = Cosine, names = c("Cosine") )
d1 <- dist( news_tf_idf, method = "Cosine" )
pr_DB$delete_entry( "Cosine" )

# equivalent to the built in "cosine" distance
# d1 <- dist( news_tf_idf, method = "cosine" )

# hierarchical clustering (Ward's criterion) on the distance matrix
cluster1 <- hclust( d1, method = "ward.D" )
plot(cluster1)

# draw the 17-cluster cut on the dendrogram and extract the memberships
rect.hclust( cluster1, k = 17 )
groups1 <- cutree( cluster1, k = 17 )
# table(groups1)

# peek at the titles in a few of the resulting clusters
news$title[ groups1 == 2 ]
news$title[ groups1 == 7 ]
news$title[ groups1 == 17 ]
105
106
# -----------------------------------------------------------------------------------
# topic model compare results

library(topicmodels)

# re-cut the hierarchical clustering into 8 groups so it can be compared
# against an 8-topic LDA model
rect.hclust( cluster1, k = 8 )
groups2 <- cutree( cluster1, k = 8 )

# titles per cluster; seq_along() instead of 1:length(), which misbehaves
# on zero-length input
lapply( seq_along( unique(groups2) ), function(i) news$title[ groups2 == i ] )
115
116
# [LDACaculation] :
# @vector = character vector of documents
# @k      = number of topics to fit (default 8, matching the 8-group
#           hierarchical-clustering cut above)
# @seed   = random seed for the Gibbs sampler, for reproducibility
# Fits an LDA topic model with Gibbs sampling and returns the fitted model.
LDACaculation <- function( vector, k = 8, seed = 1234 )
{
    # same preprocessing as TFIDF, but a document-term matrix this time
    news_corpus <- Corpus( VectorSource(vector) )
    control_list <- list( removePunctuation = TRUE, stopwords = TRUE, tolower = TRUE )
    dtm <- DocumentTermMatrix( news_corpus, control = control_list )

    # Gibbs chain settings: discard the first 1000 draws (burnin), then
    # run 1000 more iterations keeping every 100th (thin)
    lda <- LDA( dtm, k = k, method = "Gibbs",
                control = list( seed = seed,
                                burnin = 1000,
                                thin = 100,
                                iter = 1000 ) )
    return(lda)
}
128
129
# fit the 8-topic LDA model on the news titles
lda <- LDACaculation(news$title)

# most likely topic per document, and the resulting topic sizes;
# topics(lda) is deterministic, so hoist it out of the lapply below
doc_topics <- topics(lda)
doc_topics
table(doc_topics)

# titles grouped by their assigned topic (seq_along over 1:length)
lapply( seq_along( unique(doc_topics) ), function(i) news$title[ doc_topics == i ] )


# top 6 terms for each topic
terms( lda, 6 )

# per-document topic probabilities (gamma) and the Dirichlet prior alpha
lda@gamma
lda@alpha
# posterior() returns the components $terms and $topics; the document-topic
# distribution lives in $topics — there is no $documents component, so the
# original posterior(lda)$documents silently returned NULL
posterior(lda)$topics

# highest posterior topic probability per document: a rough measure of how
# confidently each document is assigned to its topic
best_topics <- data.frame( best = apply( posterior(lda)$topics, 1, max ) )

library(ggplot2)
ggplot( best_topics, aes(best) ) +
    geom_histogram()