Path: blob/master/clustering_old/topic_model/LDA_functions.R
2579 views
# Latent Dirichlet Allocation trained with collapsed Gibbs sampling.
#
# @docs       : list of documents, each a vector of token ids
#               (integer indices into the columns of the word-topic matrix)
# @vocab      : character vector of the unique tokens across the collection
# @K          : number of topics
# @alpha      : Dirichlet prior on the document-topic distribution
# @eta        : Dirichlet prior on the topic-word distribution
# @iterations : number of Gibbs sampling passes over the corpus
#
# returns : a list of the "unnormalized" count matrices
#   1. wt : K x length(vocab) word-topic count matrix
#   2. dt : length(docs) x K document-topic count matrix

LDA1 <- function( docs, vocab, K, alpha, eta, iterations )
{
  n_docs <- length(docs)

  # @wt : word-topic count matrix (topics in rows, vocabulary in columns)
  wt <- matrix( 0, K, length(vocab) )
  colnames(wt) <- vocab

  # @ta : per-document topic assignment for every token
  ta <- lapply( docs, function(x) rep( 0, length(x) ) )
  names(ta) <- paste0( "doc", seq_len(n_docs) )

  # @dt : number of tokens assigned to each topic within each document
  dt <- matrix( 0, n_docs, K )

  # ---- initialization: assign every token a uniformly random topic --------
  for( d in seq_len(n_docs) )
  {
    for( w in seq_along( docs[[d]] ) )
    {
      ta[[d]][w] <- sample( 1:K, 1 )

      # update the word-topic count for the sampled (topic, word) pair
      ti <- ta[[d]][w]
      wi <- docs[[d]][w]
      wt[ ti, wi ] <- wt[ ti, wi ] + 1
    }

    # count tokens of document d assigned to each topic in one call,
    # replacing a K-iteration sum( ta[[d]] == t ) loop
    dt[ d, ] <- tabulate( ta[[d]], nbins = K )
  }

  # loop-invariant smoothing denominto hoisted out of the sampler
  v_eta   <- length(vocab) * eta
  k_alpha <- K * alpha

  # ---- Gibbs sampling: each pass resamples every token's topic ------------
  for( i in seq_len(iterations) )
  {
    for( d in seq_len(n_docs) )
    {
      for( w in seq_along( docs[[d]] ) )
      {
        t0  <- ta[[d]][w]
        wid <- docs[[d]][w]

        # remove the current token from the counts ("collapsed" update)
        dt[ d, t0 ]   <- dt[ d, t0 ] - 1
        wt[ t0, wid ] <- wt[ t0, wid ] - 1

        # P(topic | token) ∝ P(token | topic) * P(topic | document)
        left  <- ( wt[ , wid ] + eta )  / ( rowSums(wt) + v_eta )
        right <- ( dt[ d, ] + alpha )   / ( sum( dt[ d, ] ) + k_alpha )

        t1 <- sample( 1:K, 1, prob = left * right )

        # record the newly sampled topic for token w and re-increment
        # the word-topic and document-topic count matrices
        ta[[d]][w]    <- t1
        dt[ d, t1 ]   <- dt[ d, t1 ] + 1
        wt[ t1, wid ] <- wt[ t1, wid ] + 1

        # examine when topic assignments change
        # if( t0 != t1 )
        #   print( paste0( "doc:", d, " token:", w, " topic:", t0, "=>", t1 ) )
      }
    }
  }

  list( wt = wt, dt = dt )
}