Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
hackassin
GitHub Repository: hackassin/Coursera-Machine-Learning
Path: blob/master/Week 7/Programming Assignment - 6/ex6/ex6_spam.m
863 views
1
%% Machine Learning Online Class
2
% Exercise 6 | Spam Classification with SVMs
3
%
4
% Instructions
5
% ------------
6
%
7
% This file contains code that helps you get started on the
8
% exercise. You will need to complete the following functions:
9
%
10
% gaussianKernel.m
11
% dataset3Params.m
12
% processEmail.m
13
% emailFeatures.m
14
%
15
% For this exercise, you will not need to change any code in this file,
16
% or any other files other than those mentioned above.
17
%
18
19
%% Initialization
20
clear ; close all; clc
21
22
%% ==================== Part 1: Email Preprocessing ====================
23
% To use an SVM to classify emails into Spam vs. Non-Spam, you first need
24
% to convert each email into a vector of features. In this part, you will
25
% implement the preprocessing steps for each email. You should
26
% complete the code in processEmail.m to produce a word indices vector
27
% for a given email.
28
29
fprintf('\nPreprocessing sample email (emailSample1.txt)\n');
30
31
% Extract Features
32
file_contents = readFile('emailSample1.txt');
33
word_indices = processEmail(file_contents);
34
35
% Print Stats
36
fprintf('Word Indices: \n');
37
fprintf(' %d', word_indices);
38
fprintf('\n\n');
39
40
fprintf('Program paused. Press enter to continue.\n');
41
pause;
42
43
%% ==================== Part 2: Feature Extraction ====================
44
% Now, you will convert each email into a vector of features in R^n.
45
% You should complete the code in emailFeatures.m to produce a feature
46
% vector for a given email.
47
48
fprintf('\nExtracting features from sample email (emailSample1.txt)\n');
49
50
% Extract Features
51
file_contents = readFile('emailSample1.txt');
52
word_indices = processEmail(file_contents);
53
features = emailFeatures(word_indices);
54
55
% Print Stats
56
fprintf('Length of feature vector: %d\n', length(features));
57
fprintf('Number of non-zero entries: %d\n', sum(features > 0));
58
59
fprintf('Program paused. Press enter to continue.\n');
60
pause;
61
62
%% =========== Part 3: Train Linear SVM for Spam Classification ========
63
% In this section, you will train a linear classifier to determine if an
64
% email is Spam or Not-Spam.
65
66
% Load the Spam Email dataset
67
% You will have X, y in your environment
68
load('spamTrain.mat');
69
70
fprintf('\nTraining Linear SVM (Spam Classification)\n')
71
fprintf('(this may take 1 to 2 minutes) ...\n')
72
73
C = 0.1;
74
model = svmTrain(X, y, C, @linearKernel);
75
76
p = svmPredict(model, X);
77
78
fprintf('Training Accuracy: %f\n', mean(double(p == y)) * 100);
79
80
%% =================== Part 4: Test Spam Classification ================
81
% After training the classifier, we can evaluate it on a test set. We have
82
% included a test set in spamTest.mat
83
84
% Load the test dataset
85
% You will have Xtest, ytest in your environment
86
load('spamTest.mat');
87
88
fprintf('\nEvaluating the trained Linear SVM on a test set ...\n')
89
90
p = svmPredict(model, Xtest);
91
92
fprintf('Test Accuracy: %f\n', mean(double(p == ytest)) * 100);
93
% Announce the pause before blocking, as every other section of this script
% does; otherwise the script appears to hang after printing the test accuracy.
fprintf('Program paused. Press enter to continue.\n');
pause;
94
95
96
%% ================= Part 5: Top Predictors of Spam ====================
97
% Since the model we are training is a linear SVM, we can inspect the
98
% weights learned by the model to understand better how it is determining
99
% whether an email is spam or not. The following code finds the words with
100
% the highest weights in the classifier. Informally, the classifier
101
% 'thinks' that these words are the most likely indicators of spam.
102
%
103
104
% Sort the weights and obtain the vocabulary list
105
[weight, idx] = sort(model.w, 'descend');
106
vocabList = getVocabList();
107
108
fprintf('\nTop predictors of spam: \n');
109
for i = 1:15
110
fprintf(' %-15s (%f) \n', vocabList{idx(i)}, weight(i));
111
end
112
113
fprintf('\n\n');
114
fprintf('\nProgram paused. Press enter to continue.\n');
115
pause;
116
117
%% =================== Part 6: Try Your Own Emails =====================
118
% Now that you've trained the spam classifier, you can use it on your own
119
% emails! In the starter code, we have included spamSample1.txt,
120
% spamSample2.txt, emailSample1.txt and emailSample2.txt as examples.
121
% The following code reads in one of these emails and then uses your
122
% learned SVM classifier to determine whether the email is Spam or
123
% Not Spam.
124
125
% Set the file to be read in (change this to spamSample2.txt,
126
% emailSample1.txt or emailSample2.txt to see different predictions on
127
% different email types). Try your own emails as well!
128
filename = 'spamSample1.txt';
129
130
% Read and predict
131
file_contents = readFile(filename);
132
word_indices = processEmail(file_contents);
133
x = emailFeatures(word_indices);
134
p = svmPredict(model, x);
135
136
fprintf('\nProcessed %s\n\nSpam Classification: %d\n', filename, p);
137
fprintf('(1 indicates spam, 0 indicates not spam)\n\n');
138
139
140