Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
ethen8181
GitHub Repository: ethen8181/machine-learning
Path: blob/master/clustering_old/topic_model/LDA_functions.R
2579 views
1
# Latent Dirichlet Allocation using collapsed Gibbs sampling.
#
# @docs : list of documents, each a vector of token ids (indices into vocab)
# @vocab : character vector of unique tokens across the document collection
# @K : number of topic groups
# @alpha : Dirichlet prior parameter for the document-topic distribution
# @eta : Dirichlet prior parameter for the word-topic distribution
# @iterations : number of Gibbs sampling passes used to train the model
# returns : list of the "unnormalized" count matrices
#   1. wt : word-topic matrix, K x length(vocab)
#   2. dt : document-topic matrix, length(docs) x K
LDA1 <- function( docs, vocab, K, alpha, eta, iterations )
{
    # initialize count matrices
    # @wt : word-topic count matrix (topics in rows, vocabulary in columns)
    wt <- matrix( 0, K, length(vocab) )
    colnames(wt) <- vocab

    # @ta : topic assignment list, one slot per token of each document
    ta <- lapply( docs, function(x) rep( 0, length(x) ) )
    names(ta) <- paste0( "doc", seq_along(docs) )

    # @dt : counts of the number of words assigned to each topic per document
    dt <- matrix( 0, length(docs), K )

    for( d in seq_along(docs) )
    {
        # randomly assign a topic to every word w
        # (seq_along / seq_len guard against zero-length documents, where
        #  1:length(x) would incorrectly iterate over c(1, 0))
        for( w in seq_along( docs[[d]] ) )
        {
            ta[[d]][w] <- sample( K, 1 )

            # extract the topic index, word id and update the corresponding cell
            # in the word-topic count matrix
            ti <- ta[[d]][w]
            wi <- docs[[d]][w]
            wt[ ti, wi ] <- wt[ ti, wi ] + 1
        }

        # count words in document d assigned to each topic t;
        # tabulate is the vectorized equivalent of summing ta[[d]] == t per topic
        dt[ d, ] <- tabulate( ta[[d]], nbins = K )
    }

    # for each pass through the corpus
    for( i in seq_len(iterations) )
    {
        # for each document
        for( d in seq_along(docs) )
        {
            # for each word
            for( w in seq_along( docs[[d]] ) )
            {
                t0 <- ta[[d]][w]
                wid <- docs[[d]][w]

                # remove the current token's assignment from both count
                # matrices before sampling its new topic (collapsed Gibbs)
                dt[ d, t0 ] <- dt[ d, t0 ] - 1
                wt[ t0, wid ] <- wt[ t0, wid ] - 1

                # full conditional: P(topic | rest) is proportional to
                # P(word | topic) * P(topic | document), each smoothed by its prior
                left <- ( wt[ , wid ] + eta ) / ( rowSums(wt) + length(vocab) * eta )
                right <- ( dt[ d, ] + alpha ) / ( sum( dt[ d, ] ) + K * alpha )

                t1 <- sample( K, 1, prob = left * right )

                # update the topic assignment list with the newly sampled topic
                # for token w, and re-increment the word-topic and
                # document-topic count matrices with that topic
                ta[[d]][w] <- t1
                dt[ d, t1 ] <- dt[ d, t1 ] + 1
                wt[ t1, wid ] <- wt[ t1, wid ] + 1

                # examine when topic assignments change
                # if( t0 != t1 )
                #     print( paste0( "doc:", d, " token:", w, " topic:", t0, "=>", t1 ) )
            }
        }
    }

    list( wt = wt, dt = dt )
}
82
83
84