CoCalc -- processEmail.m

GitHub Repository: hackassin/Coursera-Machine-Learning
Path: blob/master/Week 7/Programming Assignment - 6/ex6/processEmail.m
⁸⁶³ views
1
function word_indices = processEmail(email_contents)
%PROCESSEMAIL Preprocesses the body of an email and returns a list of
%word indices.
%   word_indices = PROCESSEMAIL(email_contents) normalizes the body of an
%   email (lower-casing, stripping HTML tags, replacing numbers, URLs,
%   email addresses and dollar signs with placeholder tokens), splits it
%   into stemmed word tokens, and returns a column vector with the index
%   in the vocabulary list of every token that appears in the vocabulary.
%   Tokens are returned in the order they occur in the email; tokens that
%   are not in the vocabulary are skipped.
%
%   Side effect: the processed (stemmed) tokens are echoed to the screen,
%   wrapped so that output lines stay within roughly 78 characters.
%
%   Depends on getVocabList() and porterStemmer(), which are defined
%   outside this file.

% Load Vocabulary
% NOTE(review): assumed to be a cell array of strings, since it is indexed
% as vocabList{idx} -- confirm against getVocabList.
vocabList = getVocabList();

% Init return value
word_indices = [];

% ========================== Preprocess Email ===========================

% Find the Headers ( \n\n and remove )
% Uncomment the following lines if you are working with raw emails with the
% full headers

% hdrstart = strfind(email_contents, ([char(10) char(10)]));
% email_contents = email_contents(hdrstart(1):end);

% Lower case
email_contents = lower(email_contents);

% Strip all HTML
% Looks for any expression that starts with <, ends with > and does not
% contain any < or > in between, and replaces it with a space
email_contents = regexprep(email_contents, '<[^<>]+>', ' ');

% Handle Numbers
% Look for one or more characters between 0-9
email_contents = regexprep(email_contents, '[0-9]+', 'number');

% Handle URLS
% Look for strings starting with http:// or https://
email_contents = regexprep(email_contents, ...
                           '(http|https)://[^\s]*', 'httpaddr');

% Handle Email Addresses
% Look for strings with @ in the middle
email_contents = regexprep(email_contents, '[^\s]+@[^\s]+', 'emailaddr');

% Handle $ sign
email_contents = regexprep(email_contents, '[$]+', 'dollar');


% ========================== Tokenize Email ===========================

% Output the email to screen as well
fprintf('\n==== Processed Email ====\n\n');

% Number of characters already printed on the current output line
l = 0;

while ~isempty(email_contents)

    % Tokenize and also get rid of any punctuation
    [str, email_contents] = ...
       strtok(email_contents, ...
              [' @$/#.-:&*+=[]?!(){},''">_<;%' char(10) char(13)]);

    % Remove any non alphanumeric characters
    str = regexprep(str, '[^a-zA-Z0-9]', '');

    % Stem the word
    % (the porterStemmer sometimes has issues, so we use a try catch block).
    % The catch body is kept on its own lines: an identifier on the same
    % line as "catch" can be parsed as the name of the error variable.
    try
        str = porterStemmer(strtrim(str));
    catch
        str = '';
        continue;
    end

    % Skip empty tokens (nothing left after cleaning/stemming)
    if length(str) < 1
       continue;
    end

    % Look up the word in the dictionary and add its index to word_indices
    % if found. strcmp against a cell array yields one logical per entry;
    % only the first match is taken (vocabulary entries are presumably
    % unique -- verify against getVocabList).
    match = find(strcmp(str, vocabList), 1);
    if ~isempty(match)
        word_indices = [word_indices; match];
    end

    % Print to screen, ensuring that the output lines are not too long
    if (l + length(str) + 1) > 78
        fprintf('\n');
        l = 0;
    end
    fprintf('%s ', str);
    l = l + length(str) + 1;

end

% Print footer
fprintf('\n\n=========================\n');

end
126

127
Product

Resources

Company