CoCalc -- spellcorrection.py

GitHub Repository: Aniket025/Medical-Prescription-OCR
Path: blob/master/Model-3/Project/spellcorrection.py
⁴²⁷ views
1
import re, random
2
import spacy
3
import time
4

5
# Wall-clock timestamp captured when the module is imported.
start_time = time.time()

# Shared spaCy pipeline; spacy_tokenize() below uses its tokenizer.
# NOTE(review): the 'en' shortcut name is, as far as we know, only resolvable
# on older spaCy releases -- confirm against the installed version.
nlp = spacy.load('en')

# Switch this flag if you're impatient.
to_sample = False
9

10
def spacy_tokenize(text):
    """Tokenize *text* with the module-level spaCy tokenizer.

    Returns a list holding the ``.text`` of every token the tokenizer yields,
    in order.
    """
    tokens = []
    for tok in nlp.tokenizer(text):
        tokens.append(tok.text)
    return tokens
12

13
def dameraulevenshtein(seq1, seq2):
    """Return the edit distance between two sequences.

    Insertions, deletions and substitutions each cost 1, and so does swapping
    two adjacent items.  Only the current row and the two rows before it are
    kept instead of a full matrix.

    Each row carries one extra slot at the end.  Python's negative indexing
    makes ``row[-1]`` hit that slot, so it doubles as the "column before the
    first" boundary value and no special-casing of ``j == 0`` is needed.
    """
    width = len(seq2)
    prev_row = None
    # Row for the empty prefix of seq1: distances 1..width, boundary slot 0.
    cur_row = list(range(1, width + 1)) + [0]

    for i, left in enumerate(seq1):
        prev2_row = prev_row
        prev_row = cur_row
        # Fresh row; the boundary slot holds the cost of deleting i + 1 items.
        cur_row = [0] * width + [i + 1]

        for j, right in enumerate(seq2):
            differs = left != right
            best = min(
                prev_row[j] + 1,           # deletion
                cur_row[j - 1] + 1,        # insertion
                prev_row[j - 1] + differs, # substitution (free on a match)
            )
            # Adjacent transposition: the two most recent items are swapped.
            if (i > 0 and j > 0 and differs
                    and left == seq2[j - 1] and seq1[i - 1] == right):
                best = min(best, prev2_row[j - 2] + 1)
            cur_row[j] = best

    return cur_row[width - 1]
28

29

30
class SymSpell:
    """Spelling corrector built on a precomputed index of character deletions.

    Every corpus word is stored together with all strings that can be derived
    from it by deleting up to ``max_edit_distance`` characters.  A lookup
    derives the same kind of deletions from the query and matches them
    against that index.

    ``self.dictionary`` maps a term to a tuple ``(words, frequency)``:
    ``words`` lists the corpus words that reduce to the term through
    deletions, and ``frequency`` is how many times the term itself was seen in
    the corpus (0 for entries that exist only as deletions).
    """

    def __init__(self, max_edit_distance=3, verbose=0):
        """Create an empty index.

        ``verbose`` controls what ``get_suggestions`` returns:
        0 -> only the top suggestion, 1 -> every suggestion at the smallest
        distance found, 2 or more -> every suggestion within
        ``max_edit_distance``.
        """
        self.max_edit_distance = max_edit_distance
        self.verbose = verbose
        self.dictionary = {}
        self.longest_word_length = 0

    def get_deletes_list(self, w):
        """Given a word, derive strings with up to max_edit_distance
        characters deleted.

        Returns the distinct derived strings in the order they are first
        produced (one deletion first, then two, ...).
        """
        deletes = []
        # Set mirror of ``deletes`` so duplicate checks are O(1).
        seen = set()
        frontier = [w]
        for _ in range(self.max_edit_distance):
            next_frontier = []
            for word in frontier:
                if len(word) > 1:
                    for c in range(len(word)):  # character index
                        word_minus_c = word[:c] + word[c + 1:]
                        if word_minus_c not in seen:
                            seen.add(word_minus_c)
                            deletes.append(word_minus_c)
                            next_frontier.append(word_minus_c)
            frontier = next_frontier

        return deletes

    def create_dictionary_entry(self, w):
        """Add a word and its derived deletions to the dictionary.

        Returns True when ``w`` is seen as a real corpus word for the first
        time, False when only its frequency was incremented.
        """
        if w in self.dictionary:
            words, frequency = self.dictionary[w]
            self.dictionary[w] = (words, frequency + 1)
        else:
            self.dictionary[w] = ([], 1)
            self.longest_word_length = max(self.longest_word_length, len(w))

        # Deletions are indexed only once, on the word's first occurrence.
        if self.dictionary[w][1] != 1:
            return False

        for item in self.get_deletes_list(w):
            if item in self.dictionary:
                self.dictionary[item][0].append(w)
            else:
                self.dictionary[item] = ([w], 0)
        return True

    def _index_lines(self, lines, token_pattern):
        """Index every ``token_pattern`` match found in the lower-cased
        ``lines``, print summary statistics and return the dictionary."""
        total_word_count = 0
        unique_word_count = 0

        for line in lines:
            for word in re.findall(token_pattern, line.lower()):
                total_word_count += 1
                if self.create_dictionary_entry(word):
                    unique_word_count += 1

        print("total words processed: %i" % total_word_count)
        print("total unique words in corpus: %i" % unique_word_count)
        print("total items in dictionary (corpus words and deletions): %i" % len(self.dictionary))
        print("  edit distance for deletions: %i" % self.max_edit_distance)
        print("  length of longest word in corpus: %i" % self.longest_word_length)
        return self.dictionary

    def create_dictionary_from_arr(self, arr, token_pattern=r'[a-z]+'):
        """Build the dictionary from an iterable of text lines.

        Each line is lower-cased and split with ``token_pattern``.  Prints
        summary statistics and returns ``self.dictionary``.
        """
        return self._index_lines(arr, token_pattern)

    def create_dictionary(self, fname):
        """Build the dictionary from the text file ``fname``.

        Words are runs of ``[a-z]`` in the lower-cased lines.  Prints summary
        statistics and returns ``self.dictionary``.
        """
        with open(fname) as file:
            return self._index_lines(file, r'[a-z]+')

    def get_suggestions(self, string, silent=False):
        """Return suggested corrections for a potentially misspelled word.

        Each suggestion is a ``(word, (frequency, distance))`` tuple.
        Suggestions are ordered by distance ascending, then frequency
        descending.

        Returns:
            * ``verbose == 0``: the single best suggestion tuple, or ``[]``
              when there is none.
            * otherwise: the list of suggestion tuples (possibly empty).

        Unless ``silent`` is true, diagnostic messages are printed.
        """
        if (len(string) - self.longest_word_length) > self.max_edit_distance:
            if not silent:
                print("no items in dictionary within maximum edit distance")
            return []

        # Below verbosity 2 the search stops as soon as it cannot improve on
        # the best distance found so far.
        exhaustive = self.verbose >= 2

        suggest_dict = {}
        min_suggest_len = float('inf')

        # FIFO of the query and its deletions; ``head`` is the read position,
        # so popping does not copy the list.
        queue = [string]
        head = 0
        queued_deletes = set()  # deletions of string already enqueued

        while head < len(queue):
            q_item = queue[head]
            head += 1
            # Number of characters removed from the query to obtain q_item.
            deleted = len(string) - len(q_item)

            # early exit
            if not exhaustive and suggest_dict and deleted > min_suggest_len:
                break

            # process queue item
            if (q_item in self.dictionary) and (q_item not in suggest_dict):
                candidates, frequency = self.dictionary[q_item]
                if frequency > 0:
                    # q_item is itself a corpus word.
                    suggest_dict[q_item] = (frequency, deleted)
                    # early exit: the query is spelled correctly
                    if not exhaustive and deleted == 0:
                        break
                    elif deleted < min_suggest_len:
                        min_suggest_len = deleted

                # Corpus words that reduce to q_item through deletions.
                for sc_item in candidates:
                    if sc_item not in suggest_dict:
                        item_dist = dameraulevenshtein(sc_item, string)
                        if not exhaustive and item_dist > min_suggest_len:
                            pass
                        elif item_dist <= self.max_edit_distance:
                            suggest_dict[sc_item] = (self.dictionary[sc_item][1], item_dist)
                            if item_dist < min_suggest_len:
                                min_suggest_len = item_dist
                        if not exhaustive:
                            # Keep only suggestions at the best distance so far.
                            suggest_dict = {k: v for k, v in suggest_dict.items()
                                            if v[1] <= min_suggest_len}

            # Derive further deletions from q_item, unless they could no
            # longer beat the best distance or would exceed the edit budget.
            if not exhaustive and deleted > min_suggest_len:
                continue
            if deleted < self.max_edit_distance and len(q_item) > 1:
                for c in range(len(q_item)):  # character index
                    word_minus_c = q_item[:c] + q_item[c + 1:]
                    if word_minus_c not in queued_deletes:
                        queue.append(word_minus_c)
                        queued_deletes.add(word_minus_c)

        if not silent and self.verbose != 0:
            print("number of possible corrections: %i" % len(suggest_dict))
            print("  edit distance for deletions: %i" % self.max_edit_distance)

        outlist = sorted(suggest_dict.items(), key=lambda x: (x[1][1], -x[1][0]))

        if self.verbose == 0:
            # No match: return the same empty list as the early exit above
            # instead of raising IndexError.
            return outlist[0] if outlist else []
        return outlist

    def best_word(self, s, silent=False):
        """Return the first element of ``get_suggestions(s)``, or None.

        With ``verbose == 0`` that element is the suggested word itself; with
        any other verbosity it is the best ``(word, (frequency, distance))``
        tuple.  None is returned when there is no suggestion or when ``s`` is
        not something ``get_suggestions`` can process (e.g. ``None``).
        """
        try:
            suggestions = self.get_suggestions(s, silent)
        except TypeError:
            # Best-effort contract: unusable input yields no suggestion.
            return None
        if not suggestions:
            return None
        return suggestions[0]
192

193
def spell_corrector(word_list, words_d):
    """Return the words of ``word_list`` joined by spaces, with unknown
    words replaced by their best suggestion.

    A word found in ``words_d`` is kept as is.  Any other word is looked up
    through the module-level ``ss`` corrector; if that yields nothing, the
    word is left out of the result.
    """
    corrected = []
    for word in word_list:
        if word in words_d:
            corrected.append(word)
            continue
        suggestion = ss.best_word(word, silent=True)
        if suggestion is not None:
            corrected.append(suggestion)

    return " ".join(corrected)
204

205
Product

Resources

Company