Path: blob/master/clustering_old/topic_model/LDA.R
# Latent Dirichlet Allocation
# conditioned on a Dirichlet distribution
# for two classes = binomial distribution
# for K classes  = multinomial distribution
# the Dirichlet distribution allows us to model
# the random selection from a multinomial distribution with K classes
# For the symmetric distribution, a high alpha value means that each document is
# likely to contain a mixture of most of the topics, and not any single topic specifically

# ----------------------------------------------------------------------------------------
# Prepare Example
# ----------------------------------------------------------------------------------------

# toy example
rawdocs <- c(
    "eat turkey on turkey day holiday",
    "i like to eat cake on holiday",
    "turkey trot race on thanksgiving holiday",
    "snail race the turtle",
    "time travel space race",
    "movie on thanksgiving",
    "movie at air and space museum is cool movie",
    "aspiring movie star"
)
docs <- strsplit( rawdocs, split = " " )

# unique words
vocab <- unique( unlist(docs) )

# replace words in documents with wordIDs
for( i in 1:length(docs) )
    docs[[i]] <- match( docs[[i]], vocab )

# number of topics
K <- 2

# initialize count matrices
# @wt : word-topic matrix
wt <- matrix( 0, K, length(vocab) )
colnames(wt) <- vocab

# @ta : topic assignment list
ta <- lapply( docs, function(x) rep( 0, length(x) ) )
names(ta) <- paste0( "doc", 1:length(docs) )

# @dt : counts correspond to the number of words assigned to each topic for each document
dt <- matrix( 0, length(docs), K )

set.seed(1234)
for( d in 1:length(docs) )
{
    # randomly assign a topic to each word w
    for( w in 1:length( docs[[d]] ) )
    {
        ta[[d]][w] <- sample( 1:K, 1 )

        # extract the topic index and word id, then update the corresponding cell
        # in the word-topic count matrix
        ti <- ta[[d]][w]
        wi <- docs[[d]][w]
        wt[ ti, wi ] <- wt[ ti, wi ] + 1
    }

    # count words in document d assigned to each topic t
    for( t in 1:K )
        dt[ d, t ] <- sum( ta[[d]] == t )
}

# the count of each word being assigned to each topic;
# topic assignment list
print(ta)
print(wt)
print(dt)


# ----------------------------------------------------------------------------------------
# Gibbs sampling one iteration
# ----------------------------------------------------------------------------------------

# hyperparameters
alpha <- 1
eta   <- 1

# initial topic assigned to the first word of the first document
# and its corresponding word id
t0  <- ta[[1]][1]
wid <- docs[[1]][1]

# z_-i means that we do not include token w in the word-topic and document-topic
# count matrices when sampling a topic for token w;
# only the topic assignments of all the other tokens in document 1 are kept
dt[ 1, t0 ]   <- dt[ 1, t0 ] - 1
wt[ t0, wid ] <- wt[ t0, wid ] - 1

# calculate the left and right side of the sampling equation
left  <- ( wt[ , wid ] + eta ) / ( rowSums(wt) + length(vocab) * eta )
right <- ( dt[ 1, ] + alpha ) / ( sum( dt[ 1, ] ) + K * alpha )

# draw a new topic for the first word in the first document
t1 <- sample( 1:K, 1, prob = left * right )
t1

# refresh dt and wt with the newly assigned topic
ta[[1]][1]    <- t1
dt[ 1, t1 ]   <- dt[ 1, t1 ] + 1
wt[ t1, wid ] <- wt[ t1, wid ] + 1
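

# ----------------------------------------------------------------------------------------
# Full collapsed Gibbs sampler (illustrative sketch)
# ----------------------------------------------------------------------------------------

# The single-token update above is what a full sampler repeats for every token of every
# document over many iterations. The LDA1() function sourced in the next section lives in
# a separate file and is not shown here, so the function below is only a minimal,
# self-contained sketch of such a sampler; the name gibbsLDA and its return value are
# illustrative and are NOT the actual LDA1() implementation.
gibbsLDA <- function( docs, vocab, K, alpha, eta, iterations )
{
    # count matrices and random topic initialization, as in the Prepare Example section
    wt <- matrix( 0, K, length(vocab) )
    colnames(wt) <- vocab
    dt <- matrix( 0, length(docs), K )
    ta <- lapply( docs, function(x) sample( 1:K, length(x), replace = TRUE ) )

    for( d in 1:length(docs) )
    {
        for( w in 1:length( docs[[d]] ) )
            wt[ ta[[d]][w], docs[[d]][w] ] <- wt[ ta[[d]][w], docs[[d]][w] ] + 1

        for( t in 1:K )
            dt[ d, t ] <- sum( ta[[d]] == t )
    }

    for( i in 1:iterations )
    {
        for( d in 1:length(docs) )
        {
            for( w in 1:length( docs[[d]] ) )
            {
                # drop the current token from the counts (the z_-i step)
                t0  <- ta[[d]][w]
                wid <- docs[[d]][w]
                dt[ d, t0 ]   <- dt[ d, t0 ] - 1
                wt[ t0, wid ] <- wt[ t0, wid ] - 1

                # same conditional as the single-token update above
                left  <- ( wt[ , wid ] + eta ) / ( rowSums(wt) + length(vocab) * eta )
                right <- ( dt[ d, ] + alpha ) / ( sum( dt[ d, ] ) + K * alpha )
                t1 <- sample( 1:K, 1, prob = left * right )

                # add the token back in under its (possibly new) topic
                ta[[d]][w]    <- t1
                dt[ d, t1 ]   <- dt[ d, t1 ] + 1
                wt[ t1, wid ] <- wt[ t1, wid ] + 1
            }
        }
    }
    list( wt = wt, dt = dt, ta = ta )
}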


# ----------------------------------------------------------------------------------------
# Gibbs sampling ; topicmodels library
# ----------------------------------------------------------------------------------------

# define parameters
K <- 2
alpha <- 1
eta <- .001
iterations <- 1000

source("/Users/ethen/machine-learning/lda_1/lda_1_functions.R")
set.seed(4321)
lda1 <- LDA1( docs = docs, vocab = vocab,
              K = K, alpha = alpha, eta = eta, iterations = iterations )


# posterior probability
# topic probability of every word
phi <- ( lda1$wt + eta ) / ( rowSums(lda1$wt) + length(vocab) * eta )

# topic probability of every document
theta <- ( lda1$dt + alpha ) / ( rowSums(lda1$dt) + K * alpha )

# topic assigned to each document, the one with the highest probability
topic <- apply( theta, 1, which.max )

# most probable words under each topic:
# sort the probabilities and keep the user-specified number of terms
Terms <- function( phi, n )
{
    term <- matrix( 0, n, K )
    for( p in 1:nrow(phi) )
        term[ , p ] <- names( sort( phi[ p, ], decreasing = TRUE )[1:n] )

    return(term)
}
term <- Terms( phi = phi, n = 3 )

list( original_text = rawdocs[ topic == 1 ], words = term[ , 1 ] )
list( original_text = rawdocs[ topic == 2 ], words = term[ , 2 ] )


# compare with the topicmodels package
library(tm)
library(topicmodels)

# @burnin : number of omitted Gibbs iterations at the beginning
# @thin   : number of omitted in-between Gibbs iterations
docs1 <- Corpus( VectorSource(rawdocs) )
dtm   <- DocumentTermMatrix(docs1)
lda   <- LDA( dtm, k = 2, method = "Gibbs",
              control = list( seed = 1234, burnin = 500, thin = 100, iter = 4000 ) )

list( original_text = rawdocs[ topics(lda) == 1 ], words = terms( lda, 3 )[ , 1 ] )
list( original_text = rawdocs[ topics(lda) == 2 ], words = terms( lda, 3 )[ , 2 ] )

# ----------------------------------------------------------------------------------------
# Reference
# ----------------------------------------------------------------------------------------

# why tagging matters
# http://cyber.law.harvard.edu/wg_home/uploads/507/07-WhyTaggingMatters.pdf

# math notations
# https://www.cl.cam.ac.uk/teaching/1213/L101/clark_lectures/lect7.pdf

# hyperparameters explanation
# http://stats.stackexchange.com/questions/37405/natural-interpretation-for-lda-hyperparameters/37444#37444

# Reimplementation R code
# http://brooksandrew.github.io/simpleblog/articles/latent-dirichlet-allocation-under-the-hood/
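

# ----------------------------------------------------------------------------------------
# Appendix : inspecting the topicmodels posterior (optional sketch)
# ----------------------------------------------------------------------------------------

# The hand-computed phi and theta above correspond to the term and topic probability
# matrices that topicmodels exposes through posterior(); the short check below assumes
# the lda object fitted in the comparison section is still in the workspace
post <- posterior(lda)

# topic probability of every word, analogous to phi
round( post$terms, 2 )

# topic probability of every document, analogous to theta
round( post$topics, 2 )

# most probable topic per document, should agree with topics(lda)
apply( post$topics, 1, which.max )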