CoCalc -- natural_language

GitHub Repository: debakarr/machinelearning
Path: blob/master/Part 7 - Natural Language Processing/natural_language_processing.R
¹⁰⁰² views

1
# Natural Language Processing
2

3
# Importing the dataset
# read.delim reads the tab-delimited reviews file. quote = "" switches quote
# handling off so quote characters inside a review are read as ordinary text,
# and stringsAsFactors = FALSE keeps text columns as character vectors.
dataset_original <- read.delim(
  file = "Restaurant_Reviews.tsv",
  quote = "",
  stringsAsFactors = FALSE
)
5

6
# Cleaning the texts
# install.packages('tm')
# install.packages('SnowballC')
library(tm)
library(SnowballC)

# Build a corpus with one document per entry of the Review column.
corpus <- VCorpus(VectorSource(dataset_original$Review))

# Lower-case the text. tolower is a plain function, so it is wrapped in
# content_transformer() before being handed to tm_map.
corpus <- tm_map(corpus, content_transformer(tolower))

# Strip digits, then punctuation.
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removePunctuation)

# Remove the words returned by stopwords() called with its default arguments.
corpus <- tm_map(corpus, removeWords, stopwords())

# Reduce the remaining words to their stems.
corpus <- tm_map(corpus, stemDocument)

# Collapse the extra whitespace left behind by the removals above.
corpus <- tm_map(corpus, stripWhitespace)
18

19
# Creating the Bag of Words model
# Build the document-term matrix from the cleaned corpus and immediately prune
# it with removeSparseTerms at a sparsity threshold of 0.999.
dtm <- removeSparseTerms(DocumentTermMatrix(corpus), 0.999)

# Convert to a dense data frame (one column per retained term) and append the
# Liked column taken from the original reviews table.
dataset <- as.data.frame(as.matrix(dtm))
dataset[["Liked"]] <- dataset_original$Liked
24

25
# No second import is needed here: the steps below must run on the
# bag-of-words `dataset` (one column per term plus `Liked`) built above.
# Reading Social_Network_Ads.csv into `dataset` at this point would discard
# that data frame and the `Liked` column the following steps depend on.
28

29
# Encoding the target feature as factor
# Replace the Liked column with a two-level factor whose levels are 0 and 1.
dataset[["Liked"]] <- factor(dataset$Liked, levels = c(0, 1))
31

32
# Splitting the dataset into the Training set and Test set
# install.packages('caTools')
library(caTools)

# Fixed seed so the random split is the same on every run.
set.seed(123)

# Logical flag per row: TRUE marks the rows selected by sample.split with
# SplitRatio = 0.8, which become the training set.
split <- sample.split(dataset$Liked, SplitRatio = 0.8)

# Index rows directly rather than with subset(): subset() evaluates its
# condition inside the data frame first, so a term column that happened to be
# called "split" would be used in place of the split vector. which() keeps
# subset()'s behaviour of skipping rows where the flag is NA.
training_set <- dataset[which(split), , drop = FALSE]
test_set <- dataset[which(!split), , drop = FALSE]
39

40
# Fitting Random Forest Classification to the Training set
# install.packages('randomForest')
library(randomForest)

# Every column except the target `Liked` is a predictor. Selecting by name
# instead of by a fixed column position keeps the call correct however many
# term columns the data frame contains.
classifier <- randomForest(
  x = training_set[names(training_set) != "Liked"],
  y = training_set$Liked,
  ntree = 10
)
46

47
# Predicting the Test set results
# Pass only the predictor columns: everything except the target `Liked`,
# selected by name so the call does not depend on the number of term columns.
y_pred <- predict(classifier, newdata = test_set[names(test_set) != "Liked"])
49

50
# Making the Confusion Matrix
# Rows are the actual `Liked` values of the test set, columns the predictions.
# The target is referenced by name rather than by a fixed column position.
cm <- table(test_set$Liked, y_pred)
52

Product

Resources

Company