CoCalc -- tf

GitHub Repository: ethen8181/machine-learning
Path: blob/master/clustering_old/tf_idf/tf_idf.R
²⁶¹⁷ views
1
library(tm)
2
library(proxy)
3
library(dplyr)
4

5
doc <- c( "The sky is blue.", "The sun is bright today.",
6
		  "The sun in the sky is bright.", "We can see the shining sun, the bright sun." )
7

8
# -----------------------------------------------------------------------------------
9
#                                    TF-IDF
10
# -----------------------------------------------------------------------------------
11

12
# create corpus
13
# stop words list 
14
# stopwords("english")
15
doc_corpus <- Corpus( VectorSource( doc ) )
16
control_list <- list( removePunctuation = TRUE, stopwords = TRUE, tolower = TRUE )
17
tdm <- TermDocumentMatrix( doc_corpus, control = control_list )
18
# inspect(tdm_train)
19

20
# tf
21
tf <- as.matrix(tdm)
22

23
# idf
24
( idf <- log( ncol(tf) / ( 1 + rowSums( tf != 0 ) ) ) )
25

26
# diagonal matrix
27
( idf <- diag(idf) )
28

29
# remember to transpose the original tf matrix
30
# equivalent to t(tf) %*% idf, but crossprod is faster 
31
tf_idf <- crossprod( tf, idf )
32
colnames(tf_idf) <- rownames(tf)
33
tf_idf
34

35
# normalize 
36
tf_idf / sqrt( rowSums( tf_idf^2 ) )
37

38

39
# -----------------------------------------------------------------------------------
40
#                                    Text Clustering
41
# -----------------------------------------------------------------------------------
42

43
# cosine example 
44
a <- c( 3, 4 )
45
b <- c( 5, 6 )
46

47
# print cos and degree 
48
l <- list( numerator = sum( a * b ), denominator = sqrt( sum( a^2 ) ) * sqrt( sum( b^2 ) ) )
49
list( cosine = l$numerator / l$denominator, 
50
      degree = acos( l$numerator / l$denominator ) * 180 / pi )
51

52

53
# news data
54
setwd("/Users/ethen/machine-learning/tf_idf")
55
news <- read.csv( "news.csv", stringsAsFactors = FALSE )
56

57
# [TFIDF] :
58
# @vector = pass in a vector of documents  
59
TFIDF <- function( vector )
60
{
61
	# tf 
62
	news_corpus  <- Corpus( VectorSource(vector) )
63
	control_list <- list( removePunctuation = TRUE, stopwords = TRUE, tolower = TRUE )
64
	tf <- TermDocumentMatrix( news_corpus, control = control_list ) %>% as.matrix()
65

66
	# idf
67
	idf <- log( ncol(tf) / ( 1 + rowSums( tf != 0 ) ) ) %>% diag()
68

69
	return( crossprod( tf, idf ) )
70
}
71

72
# tf-idf matrix using news' title 
73
news_tf_idf <- TFIDF(news$title)
74

75

76
# [Cosine] :
77
# distance between two vectors 
78
Cosine <- function( x, y )
79
{
80
	similarity <- sum( x * y ) / ( sqrt( sum( y^2 ) ) * sqrt( sum( x^2 ) ) )
81

82
	# given the cosine value, use acos to convert back to degrees
83
	# acos returns the radian, multiply it by 180 and divide by pi to obtain degrees
84
	return( acos(similarity) * 180 / pi )
85
}
86

87
# calculate pair-wise distance matrix 
88
pr_DB$set_entry( FUN = Cosine, names = c("Cosine") )
89
d1 <- dist( news_tf_idf, method = "Cosine" )
90
pr_DB$delete_entry( "Cosine" )
91

92
# equivalent to the built in "cosine" distance 
93
# d1 <- dist( news_tf_idf, method = "cosine" )
94

95
# heirachical clustering 
96
cluster1 <- hclust( d1, method = "ward.D" )
97
plot(cluster1)
98
rect.hclust( cluster1, 17 )
99
groups1 <- cutree( cluster1, 17 )
100
# table(groups1)
101

102
news$title[ groups1 == 2 ]
103
news$title[ groups1 == 7 ]
104
news$title[ groups1 == 17 ]
105

106
# -----------------------------------------------------------------------------------
107
# topic model compare results 
108

109
library(topicmodels)
110

111
rect.hclust( cluster1, 8 )
112
groups2 <- cutree( cluster1, 8 )
113

114
lapply( 1:length( unique(groups2) ), function(i) news$title[ groups2 == i ] )
115

116
LDACaculation <- function(vector)
117
{
118
	news_corpus  <- Corpus( VectorSource(vector) )
119
	control_list <- list( removePunctuation = TRUE, stopwords = TRUE, tolower = TRUE )
120
	dtm <- DocumentTermMatrix( news_corpus, control = control_list )
121
	lda <- LDA( dtm, k = 8, method = "Gibbs", 
122
	   		    control = list( seed = 1234, 
123
	   		    				burnin = 1000, 
124
	   		    				thin = 100, 
125
	   		    				iter = 1000 ) )
126
	return(lda)	
127
}
128

129
lda <- LDACaculation(news$title)
130

131

132
topics(lda)
133
table( topics(lda) )
134
lapply( 1:length( unique( topics(lda) ) ), function(i) news$title[ topics(lda) == i ] )
135

136

137
terms( lda, 6 )
138

139
lda@gamma
140
lda@alpha
141
posterior(lda)$documents
142

143
best_topics <- data.frame( best = apply( posterior(lda)$topics, 1, max ) )
144

145
library(ggplot2)
146
ggplot( best_topics, aes( best ) ) + 
147
geom_histogram()
148

149

150
Product

Resources

Company