Path: blob/master/clustering_old/topic_model/LDA_functions.R
2579 views
# Latent Dirichlet Allocation trained with collapsed Gibbs sampling.
#
# @docs       : list of documents, each a vector of token ids
#               (integer indices into the columns of the word-topic matrix)
# @vocab      : character vector of the unique tokens across the collection
# @K          : number of topics
# @alpha      : Dirichlet prior on the document-topic distribution
# @eta        : Dirichlet prior on the topic-word distribution
# @iterations : number of Gibbs sampling passes over the corpus
#
# returns : a list of the "unnormalized" count matrices
#   1. wt : K x length(vocab) word-topic count matrix
#   2. dt : length(docs) x K document-topic count matrix

LDA1 <- function( docs, vocab, K, alpha, eta, iterations )
{
  n_docs <- length(docs)

  # @wt : word-topic count matrix (topics in rows, vocabulary in columns)
  wt <- matrix( 0, K, length(vocab) )
  colnames(wt) <- vocab

  # @ta : per-document topic assignment for every token
  ta <- lapply( docs, function(x) rep( 0, length(x) ) )
  names(ta) <- paste0( "doc", seq_len(n_docs) )

  # @dt : number of tokens assigned to each topic within each document
  dt <- matrix( 0, n_docs, K )

  # ---- initialization: assign every token a uniformly random topic --------
  for( d in seq_len(n_docs) )
  {
    for( w in seq_along( docs[[d]] ) )
    {
      ta[[d]][w] <- sample( 1:K, 1 )

      # update the word-topic count for the sampled (topic, word) pair
      ti <- ta[[d]][w]
      wi <- docs[[d]][w]
      wt[ ti, wi ] <- wt[ ti, wi ] + 1
    }

    # count tokens of document d assigned to each topic in one call,
    # replacing a K-iteration sum( ta[[d]] == t ) loop
    dt[ d, ] <- tabulate( ta[[d]], nbins = K )
  }

  # loop-invariant smoothing denominto hoisted out of the sampler
  v_eta   <- length(vocab) * eta
  k_alpha <- K * alpha

  # ---- Gibbs sampling: each pass resamples every token's topic ------------
  for( i in seq_len(iterations) )
  {
    for( d in seq_len(n_docs) )
    {
      for( w in seq_along( docs[[d]] ) )
      {
        t0  <- ta[[d]][w]
        wid <- docs[[d]][w]

        # remove the current token from the counts ("collapsed" update)
        dt[ d, t0 ]   <- dt[ d, t0 ] - 1
        wt[ t0, wid ] <- wt[ t0, wid ] - 1

        # P(topic | token) ∝ P(token | topic) * P(topic | document)
        left  <- ( wt[ , wid ] + eta )  / ( rowSums(wt) + v_eta )
        right <- ( dt[ d, ] + alpha )   / ( sum( dt[ d, ] ) + k_alpha )

        t1 <- sample( 1:K, 1, prob = left * right )

        # record the newly sampled topic for token w and re-increment
        # the word-topic and document-topic count matrices
        ta[[d]][w]    <- t1
        dt[ d, t1 ]   <- dt[ d, t1 ] + 1
        wt[ t1, wid ] <- wt[ t1, wid ] + 1

        # examine when topic assignments change
        # if( t0 != t1 )
        #   print( paste0( "doc:", d, " token:", w, " topic:", t0, "=>", t1 ) )
      }
    }
  }

  list( wt = wt, dt = dt )
}