Path: blob/master/clustering_old/topic_model/LDA.R
# Latent Dirichlet Allocation
# conditioned on a Dirichlet distribution
# for two classes = binomial distribution
# for K classes  = multinomial distribution
# the Dirichlet distribution allows us to model
# the random selection from a multinomial distribution with K classes
# For the symmetric distribution, a high alpha value means that each document is
# likely to contain a mixture of most of the topics, and not any single topic specifically

# ----------------------------------------------------------------------------------------
# Prepare Example
# ----------------------------------------------------------------------------------------

# toy example
rawdocs <- c(
    "eat turkey on turkey day holiday",
    "i like to eat cake on holiday",
    "turkey trot race on thanksgiving holiday",
    "snail race the turtle",
    "time travel space race",
    "movie on thanksgiving",
    "movie at air and space museum is cool movie",
    "aspiring movie star"
)
docs <- strsplit( rawdocs, split = " " )

# unique words
vocab <- unique( unlist(docs) )

# replace words in documents with wordIDs
for( i in 1:length(docs) )
    docs[[i]] <- match( docs[[i]], vocab )

# number of topics
K <- 2

# initialize count matrices
# @wt : word-topic matrix
wt <- matrix( 0, K, length(vocab) )
colnames(wt) <- vocab

# @ta : topic assignment list
ta <- lapply( docs, function(x) rep( 0, length(x) ) )
names(ta) <- paste0( "doc", 1:length(docs) )

# @dt : counts correspond to the number of words assigned to each topic for each document
dt <- matrix( 0, length(docs), K )

set.seed(1234)
for( d in 1:length(docs) )
{
    # randomly assign a topic to each word w
    for( w in 1:length( docs[[d]] ) )
    {
        ta[[d]][w] <- sample( 1:K, 1 )

        # extract the topic index and word id, then update the corresponding cell
        # in the word-topic count matrix
        ti <- ta[[d]][w]
        wi <- docs[[d]][w]
        wt[ ti, wi ] <- wt[ ti, wi ] + 1
    }

    # count words in document d assigned to each topic t
    for( t in 1:K )
        dt[ d, t ] <- sum( ta[[d]] == t )
}

# the count of each word being assigned to each topic;
# topic assignment list
print(ta)
print(wt)
print(dt)


# ----------------------------------------------------------------------------------------
# Gibbs sampling one iteration
# ----------------------------------------------------------------------------------------

# hyperparameters
alpha <- 1
eta   <- 1

# initial topic assigned to the first word of the first document
# and its corresponding word id
t0  <- ta[[1]][1]
wid <- docs[[1]][1]

# z_-i means that we do not include token w in the word-topic and document-topic
# count matrices when sampling a topic for token w;
# only the topic assignments of all the other tokens in document 1 are kept
dt[ 1, t0 ]   <- dt[ 1, t0 ] - 1
wt[ t0, wid ] <- wt[ t0, wid ] - 1

# calculate the left and right side of the sampling equation
left  <- ( wt[ , wid ] + eta ) / ( rowSums(wt) + length(vocab) * eta )
right <- ( dt[ 1, ] + alpha ) / ( sum( dt[ 1, ] ) + K * alpha )

# draw a new topic for the first word in the first document
t1 <- sample( 1:K, 1, prob = left * right )
t1

# refresh dt and wt with the newly assigned topic
ta[[1]][1]    <- t1
dt[ 1, t1 ]   <- dt[ 1, t1 ] + 1
wt[ t1, wid ] <- wt[ t1, wid ] + 1
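

# ----------------------------------------------------------------------------------------
# Full collapsed Gibbs sampler (illustrative sketch)
# ----------------------------------------------------------------------------------------

# The single-token update above is what a full sampler repeats for every token of every
# document over many iterations. The LDA1() function sourced in the next section lives in
# a separate file and is not shown here, so the function below is only a minimal,
# self-contained sketch of such a sampler; the name gibbsLDA and its return value are
# illustrative and are NOT the actual LDA1() implementation.
gibbsLDA <- function( docs, vocab, K, alpha, eta, iterations )
{
    # count matrices and random topic initialization, as in the Prepare Example section
    wt <- matrix( 0, K, length(vocab) )
    colnames(wt) <- vocab
    dt <- matrix( 0, length(docs), K )
    ta <- lapply( docs, function(x) sample( 1:K, length(x), replace = TRUE ) )

    for( d in 1:length(docs) )
    {
        for( w in 1:length( docs[[d]] ) )
            wt[ ta[[d]][w], docs[[d]][w] ] <- wt[ ta[[d]][w], docs[[d]][w] ] + 1

        for( t in 1:K )
            dt[ d, t ] <- sum( ta[[d]] == t )
    }

    for( i in 1:iterations )
    {
        for( d in 1:length(docs) )
        {
            for( w in 1:length( docs[[d]] ) )
            {
                # drop the current token from the counts (the z_-i step)
                t0  <- ta[[d]][w]
                wid <- docs[[d]][w]
                dt[ d, t0 ]   <- dt[ d, t0 ] - 1
                wt[ t0, wid ] <- wt[ t0, wid ] - 1

                # same conditional as the single-token update above
                left  <- ( wt[ , wid ] + eta ) / ( rowSums(wt) + length(vocab) * eta )
                right <- ( dt[ d, ] + alpha ) / ( sum( dt[ d, ] ) + K * alpha )
                t1 <- sample( 1:K, 1, prob = left * right )

                # add the token back in under its (possibly new) topic
                ta[[d]][w]    <- t1
                dt[ d, t1 ]   <- dt[ d, t1 ] + 1
                wt[ t1, wid ] <- wt[ t1, wid ] + 1
            }
        }
    }
    list( wt = wt, dt = dt, ta = ta )
}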


# ----------------------------------------------------------------------------------------
# Gibbs sampling ; topicmodels library
# ----------------------------------------------------------------------------------------

# define parameters
K <- 2
alpha <- 1
eta <- .001
iterations <- 1000

source("/Users/ethen/machine-learning/lda_1/lda_1_functions.R")
set.seed(4321)
lda1 <- LDA1( docs = docs, vocab = vocab,
              K = K, alpha = alpha, eta = eta, iterations = iterations )


# posterior probability
# topic probability of every word
phi <- ( lda1$wt + eta ) / ( rowSums(lda1$wt) + length(vocab) * eta )

# topic probability of every document
theta <- ( lda1$dt + alpha ) / ( rowSums(lda1$dt) + K * alpha )

# topic assigned to each document, the one with the highest probability
topic <- apply( theta, 1, which.max )

# most probable words under each topic:
# sort the probabilities and keep the user-specified number of terms
Terms <- function( phi, n )
{
    term <- matrix( 0, n, K )
    for( p in 1:nrow(phi) )
        term[ , p ] <- names( sort( phi[ p, ], decreasing = TRUE )[1:n] )

    return(term)
}
term <- Terms( phi = phi, n = 3 )

list( original_text = rawdocs[ topic == 1 ], words = term[ , 1 ] )
list( original_text = rawdocs[ topic == 2 ], words = term[ , 2 ] )


# compare with the topicmodels package
library(tm)
library(topicmodels)

# @burnin : number of omitted Gibbs iterations at the beginning
# @thin   : number of omitted in-between Gibbs iterations
docs1 <- Corpus( VectorSource(rawdocs) )
dtm   <- DocumentTermMatrix(docs1)
lda   <- LDA( dtm, k = 2, method = "Gibbs",
              control = list( seed = 1234, burnin = 500, thin = 100, iter = 4000 ) )

list( original_text = rawdocs[ topics(lda) == 1 ], words = terms( lda, 3 )[ , 1 ] )
list( original_text = rawdocs[ topics(lda) == 2 ], words = terms( lda, 3 )[ , 2 ] )

# ----------------------------------------------------------------------------------------
# Reference
# ----------------------------------------------------------------------------------------

# why tagging matters
# http://cyber.law.harvard.edu/wg_home/uploads/507/07-WhyTaggingMatters.pdf

# math notations
# https://www.cl.cam.ac.uk/teaching/1213/L101/clark_lectures/lect7.pdf

# hyperparameters explanation
# http://stats.stackexchange.com/questions/37405/natural-interpretation-for-lda-hyperparameters/37444#37444

# Reimplementation R code
# http://brooksandrew.github.io/simpleblog/articles/latent-dirichlet-allocation-under-the-hood/
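

# ----------------------------------------------------------------------------------------
# Appendix : inspecting the topicmodels posterior (optional sketch)
# ----------------------------------------------------------------------------------------

# The hand-computed phi and theta above correspond to the term and topic probability
# matrices that topicmodels exposes through posterior(); the short check below assumes
# the lda object fitted in the comparison section is still in the workspace
post <- posterior(lda)

# topic probability of every word, analogous to phi
round( post$terms, 2 )

# topic probability of every document, analogous to theta
round( post$topics, 2 )

# most probable topic per document, should agree with topics(lda)
apply( post$topics, 1, which.max )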