# GitHub repository : ethen8181/machine-learning
# path : clustering_old/topic_model/LDA.R
# Latent Dirichlet Allocation
# topic proportions are conditioned on a dirichlet distribution:
# for two classes the outcome follows a binomial distribution,
# for K classes the outcome follows a multinomial distribution
# the dirichlet distribution allows us to model
# the random selection from a multinomial distribution with K classes
# For a symmetric dirichlet, a high alpha value means that each document is
# likely to contain a mixture of most of the topics, and not any single topic specifically
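
# a minimal illustrative sketch of the point above (rdirichlet_sym is just a
# helper defined here, not used elsewhere in this script) : a symmetric
# dirichlet( alpha, ..., alpha ) draw can be built from K independent
# gamma( alpha, 1 ) draws normalized to sum to 1
rdirichlet_sym <- function( n, K, alpha )
{
    g <- matrix( rgamma( n * K, shape = alpha ), n, K )
    g / rowSums(g)
}
round( rdirichlet_sym( 3, K = 5, alpha = 0.1 ), 2 )  # small alpha : mass concentrates on one topic
round( rdirichlet_sym( 3, K = 5, alpha = 10 ), 2 )   # large alpha : close to an even mixture
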
# ----------------------------------------------------------------------------------------
# Prepare Example
# ----------------------------------------------------------------------------------------

# toy example
rawdocs <- c(
    "eat turkey on turkey day holiday",
    "i like to eat cake on holiday",
    "turkey trot race on thanksgiving holiday",
    "snail race the turtle",
    "time travel space race",
    "movie on thanksgiving",
    "movie at air and space museum is cool movie",
    "aspiring movie star"
)
docs <- strsplit( rawdocs, split = " " )

# unique words
vocab <- unique( unlist(docs) )

# replace words in documents with wordIDs
for( i in 1:length(docs) )
    docs[[i]] <- match( docs[[i]], vocab )

# number of topics
K <- 2

# initialize count matrices
# @wt : word-topic matrix, counts of each word assigned to each topic
wt <- matrix( 0, K, length(vocab) )
colnames(wt) <- vocab

# @ta : topic assignment list, one topic label per token in each document
ta <- lapply( docs, function(x) rep( 0, length(x) ) )
names(ta) <- paste0( "doc", 1:length(docs) )

# @dt : document-topic matrix, the number of words assigned to each topic for each document
dt <- matrix( 0, length(docs), K )

set.seed(1234)
for( d in 1:length(docs) )
{
    # randomly assign a topic to each word w in document d
    for( w in 1:length( docs[[d]] ) )
    {
        ta[[d]][w] <- sample( 1:K, 1 )

        # extract the topic index and word id, then update the corresponding cell
        # in the word-topic count matrix
        ti <- ta[[d]][w]
        wi <- docs[[d]][w]
        wt[ ti, wi ] <- wt[ ti, wi ] + 1
    }

    # count words in document d assigned to each topic t
    for( t in 1:K )
        dt[ d, t ] <- sum( ta[[d]] == t )
}

# after random initialization : the topic assignment list,
# the word-topic counts and the document-topic counts
print(ta)
print(wt)
print(dt)

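
# an illustrative sanity check (not part of the original workflow) : both count
# matrices track the same tokens, so their totals should equal the corpus size
stopifnot( sum(wt) == sum(dt), sum(dt) == length( unlist(docs) ) )
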
# ----------------------------------------------------------------------------------------
# Gibbs sampling one iteration
# ----------------------------------------------------------------------------------------

# hyperparameters
alpha <- 1
eta <- 1

# initial topic assigned to the first word of the first document
# and its corresponding word id
t0 <- ta[[1]][1]
wid <- docs[[1]][1]

# z_-i means that we exclude the current token w from the word-topic and
# document-topic count matrices when sampling a topic for token w,
# keeping only the topic assignments of all the other tokens
dt[ 1, t0 ] <- dt[ 1, t0 ] - 1
wt[ t0, wid ] <- wt[ t0, wid ] - 1

# the collapsed gibbs full conditional for a token is proportional to two factors,
# P( word | topic ) * P( topic | document ), i.e. with W = length(vocab) :
#   ( wt[ t, wid ] + eta ) / ( rowSums(wt)[t] + W * eta ) *
#   ( dt[ d, t ] + alpha ) / ( sum( dt[ d, ] ) + K * alpha )
# @left  : the word-topic factor, @right : the document-topic factor
left <- ( wt[ , wid ] + eta ) / ( rowSums(wt) + length(vocab) * eta )
right <- ( dt[ 1, ] + alpha ) / ( sum( dt[ 1, ] ) + K * alpha )

# draw a new topic for the first word in the first document
t1 <- sample( 1:K, 1, prob = left * right )
t1

# refresh dt and wt with the newly assigned topic
ta[[1]][1] <- t1
dt[ 1, t1 ] <- dt[ 1, t1 ] + 1
wt[ t1, wid ] <- wt[ t1, wid ] + 1

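
# a minimal sketch of how the single-token update above extends to one full
# collapsed gibbs sweep over every token in every document; the LDA1() call in
# the next section (sourced from lda_1_functions.R) presumably runs this kind of
# loop for many iterations, so this block is only an illustration
for( d in 1:length(docs) )
{
    for( w in 1:length( docs[[d]] ) )
    {
        t0  <- ta[[d]][w]
        wid <- docs[[d]][w]

        # remove the current token's assignment from the counts
        dt[ d, t0 ]   <- dt[ d, t0 ] - 1
        wt[ t0, wid ] <- wt[ t0, wid ] - 1

        # full conditional and a fresh draw
        left  <- ( wt[ , wid ] + eta ) / ( rowSums(wt) + length(vocab) * eta )
        right <- ( dt[ d, ] + alpha ) / ( sum( dt[ d, ] ) + K * alpha )
        t1    <- sample( 1:K, 1, prob = left * right )

        # add the token back under its (possibly new) topic
        ta[[d]][w]    <- t1
        dt[ d, t1 ]   <- dt[ d, t1 ] + 1
        wt[ t1, wid ] <- wt[ t1, wid ] + 1
    }
}
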
# ----------------------------------------------------------------------------------------
# Gibbs sampling ; topicmodels library
# ----------------------------------------------------------------------------------------

# define parameters
K <- 2
alpha <- 1
eta <- .001
iterations <- 1000

source("/Users/ethen/machine-learning/lda_1/lda_1_functions.R")
set.seed(4321)
lda1 <- LDA1( docs = docs, vocab = vocab,
              K = K, alpha = alpha, eta = eta, iterations = iterations )

# posterior probability
# @phi : probability of every word under each topic (topic-word distribution)
phi <- ( lda1$wt + eta ) / ( rowSums(lda1$wt) + length(vocab) * eta )

# @theta : probability of every topic within each document (document-topic distribution)
theta <- ( lda1$dt + alpha ) / ( rowSums(lda1$dt) + K * alpha )

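# an illustrative check (not in the original script) : phi and theta are
# row-normalized, so each topic's word distribution and each document's
# topic distribution should sum to 1
stopifnot( all( abs( rowSums(phi)   - 1 ) < 1e-8 ),
           all( abs( rowSums(theta) - 1 ) < 1e-8 ) )
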
# topic assigned to each document, the one with the highest probability
topic <- apply( theta, 1, which.max )

# top words under each topic :
# sort each topic's word probabilities and keep the user-specified number of terms
Terms <- function( phi, n )
{
    term <- matrix( 0, n, K )
    for( p in 1:nrow(phi) )
        term[ , p ] <- names( sort( phi[ p, ], decreasing = TRUE )[1:n] )

    return(term)
}
term <- Terms( phi = phi, n = 3 )

list( original_text = rawdocs[ topic == 1 ], words = term[ , 1 ] )
list( original_text = rawdocs[ topic == 2 ], words = term[ , 2 ] )


# compare with the topicmodels package
library(tm)
library(topicmodels)

# @burnin : number of omitted Gibbs iterations at the beginning
# @thin   : number of omitted in-between Gibbs iterations
docs1 <- Corpus( VectorSource(rawdocs) )
dtm <- DocumentTermMatrix(docs1)
lda <- LDA( dtm, k = 2, method = "Gibbs",
            control = list( seed = 1234, burnin = 500, thin = 100, iter = 4000 ) )

list( original_text = rawdocs[ topics(lda) == 1 ], words = terms( lda, 3 )[ , 1 ] )
list( original_text = rawdocs[ topics(lda) == 2 ], words = terms( lda, 3 )[ , 2 ] )

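
# a small illustrative peek at the fitted topicmodels object : posterior()
# returns $terms (topic-word probabilities) and $topics (document-topic
# probabilities), the analogues of phi and theta above
post <- posterior(lda)
round( post$terms[ , 1:5 ], 2 )
round( post$topics, 2 )
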
# ----------------------------------------------------------------------------------------
# Reference
# ----------------------------------------------------------------------------------------

# why tagging matters
# http://cyber.law.harvard.edu/wg_home/uploads/507/07-WhyTaggingMatters.pdf

# math notations
# https://www.cl.cam.ac.uk/teaching/1213/L101/clark_lectures/lect7.pdf

# hyperparameters explanation
# http://stats.stackexchange.com/questions/37405/natural-interpretation-for-lda-hyperparameters/37444#37444

# Reimplementation R code
# http://brooksandrew.github.io/simpleblog/articles/latent-dirichlet-allocation-under-the-hood/