# Path: blob/master/clustering_old/tf_idf/tf_idf.R
# 2617 views
library(tm)
library(proxy)
library(dplyr)
library(topicmodels)
library(ggplot2)

doc <- c(
  "The sky is blue.",
  "The sun is bright today.",
  "The sun in the sky is bright.",
  "We can see the shining sun, the bright sun."
)

# -----------------------------------------------------------------------------------
# TF-IDF
# -----------------------------------------------------------------------------------

# create corpus
# stop words list
# stopwords("english")
doc_corpus <- Corpus(VectorSource(doc))
control_list <- list(removePunctuation = TRUE, stopwords = TRUE, tolower = TRUE)
tdm <- TermDocumentMatrix(doc_corpus, control = control_list)
# inspect(tdm)

# tf: the term-document matrix as a plain matrix (rows are used as terms,
# columns as documents below)
tf <- as.matrix(tdm)

# idf: log( number of documents / ( 1 + number of documents containing the term ) )
(idf <- log(ncol(tf) / (1 + rowSums(tf != 0))))

# diagonal matrix
# nrow is given explicitly because diag() reads a bare length-one vector as a
# matrix size rather than as a diagonal value
(idf <- diag(idf, nrow = length(idf)))

# remember to transpose the original tf matrix
# equivalent to t(tf) %*% idf, but crossprod is faster
tf_idf <- crossprod(tf, idf)
colnames(tf_idf) <- rownames(tf)
tf_idf

# normalize each document (row) to unit Euclidean length
tf_idf / sqrt(rowSums(tf_idf^2))


# -----------------------------------------------------------------------------------
# Text Clustering
# -----------------------------------------------------------------------------------

# cosine example
a <- c(3, 4)
b <- c(5, 6)

# print cos and degree
l <- list(
  numerator = sum(a * b),
  denominator = sqrt(sum(a^2)) * sqrt(sum(b^2))
)
list(
  cosine = l$numerator / l$denominator,
  degree = acos(l$numerator / l$denominator) * 180 / pi
)


# news data
# read through an explicit path instead of setwd(), so the session's working
# directory is left untouched
news_dir <- "/Users/ethen/machine-learning/tf_idf"
news <- read.csv(file.path(news_dir, "news.csv"), stringsAsFactors = FALSE)

# [TFIDF] :
# builds the (unnormalized) tf-idf matrix for a set of documents
# @vector = pass in a vector of documents
# returns a matrix with one row per document and one column per term; the
# columns are named after the terms
TFIDF <- function(vector) {
  # tf
  news_corpus <- Corpus(VectorSource(vector))
  control_list <- list(removePunctuation = TRUE, stopwords = TRUE, tolower = TRUE)
  tf <- as.matrix(TermDocumentMatrix(news_corpus, control = control_list))

  # idf
  idf <- log(ncol(tf) / (1 + rowSums(tf != 0)))

  # nrow is given explicitly: with a single term, diag(idf) would treat the
  # value as a matrix size instead of a 1 x 1 diagonal
  tf_idf <- crossprod(tf, diag(idf, nrow = length(idf)))
  colnames(tf_idf) <- rownames(tf)
  tf_idf
}

# tf-idf matrix using news' title
news_tf_idf <- TFIDF(news$title)


# [Cosine] :
# distance between two vectors, expressed as the angle between them in degrees
# @x, @y = numeric vectors of the same length
# an all-zero vector has no direction, so the result is NaN in that case
Cosine <- function(x, y) {
  similarity <- sum(x * y) / (sqrt(sum(y^2)) * sqrt(sum(x^2)))

  # rounding error can push the ratio marginally outside [-1, 1] for
  # (near-)parallel vectors, which would make acos() return NaN
  similarity <- min(1, max(-1, similarity))

  # given the cosine value, use acos to convert back to degrees
  # acos returns the radian, multiply it by 180 and divide by pi to obtain degrees
  acos(similarity) * 180 / pi
}

# calculate pair-wise distance matrix
# the custom measure is registered only for this call; `finally` removes it even
# when dist() fails, so the script can be re-run without a stale registry entry
pr_DB$set_entry(FUN = Cosine, names = c("Cosine"))
d1 <- tryCatch(
  dist(news_tf_idf, method = "Cosine"),
  finally = pr_DB$delete_entry("Cosine")
)

# proxy also ships a built-in "cosine" measure
# NOTE(review): the built-in yields a dissimilarity derived from the cosine
# similarity rather than an angle in degrees, so values differ from d1 - confirm
# d1 <- dist( news_tf_idf, method = "cosine" )

# hierarchical clustering
cluster1 <- hclust(d1, method = "ward.D")
plot(cluster1)
rect.hclust(cluster1, k = 17)
groups1 <- cutree(cluster1, k = 17)
# table(groups1)

news$title[groups1 == 2]
news$title[groups1 == 7]
news$title[groups1 == 17]

# -----------------------------------------------------------------------------------
# topic model compare results

rect.hclust(cluster1, k = 8)
groups2 <- cutree(cluster1, k = 8)

# titles grouped by cluster id
split(news$title, groups2)

# [LDACaculation] :
# fits an LDA topic model with Gibbs sampling on a set of documents
# @vector = pass in a vector of documents
# @k = number of topics, defaults to 8
# returns the fitted model object produced by LDA()
LDACaculation <- function(vector, k = 8) {
  news_corpus <- Corpus(VectorSource(vector))
  control_list <- list(removePunctuation = TRUE, stopwords = TRUE, tolower = TRUE)
  dtm <- DocumentTermMatrix(news_corpus, control = control_list)

  # fixed seed so that repeated runs give the same sampling result
  LDA(
    dtm,
    k = k,
    method = "Gibbs",
    control = list(
      seed = 1234,
      burnin = 1000,
      thin = 100,
      iter = 1000
    )
  )
}

lda <- LDACaculation(news$title)


# most likely topic of every document
doc_topics <- topics(lda)
doc_topics
table(doc_topics)

# titles grouped by their most likely topic; grouping on the topic ids that
# actually occur keeps the labels right even if some topic received no document
split(news$title, doc_topics)


terms(lda, 6)

lda@gamma
lda@alpha

# posterior() returns a list with the components `terms` and `topics`; there is
# no `documents` component, so the per-document topic distribution is shown
doc_topic_probs <- posterior(lda)$topics
doc_topic_probs

# highest topic probability of each document
best_topics <- data.frame(best = apply(doc_topic_probs, 1, max))

ggplot(best_topics, aes(best)) +
  geom_histogram()