Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Aniket025
GitHub Repository: Aniket025/Medical-Prescription-OCR
Path: blob/master/Model-3/Project/spellcorrection.py
427 views
1
import random
import re
import time
from collections import deque

import spacy
4
5
# Wall-clock reference captured when the module is imported.
# NOTE(review): not read anywhere in the visible part of this file; presumably
# used by a caller to report elapsed time.
start_time = time.time()

# spaCy pipeline; the visible code only uses its tokenizer (see spacy_tokenize).
try:
    nlp = spacy.load('en')
except OSError:
    # spaCy 3 removed shortcut names such as 'en'; fall back to the explicit
    # package name of the small English model.
    nlp = spacy.load('en_core_web_sm')

to_sample = False  # if you're impatient switch this flag
9
10
def spacy_tokenize(text):
    """Split *text* with the module-level spaCy tokenizer.

    Returns a list holding the ``text`` attribute of every token produced by
    ``nlp.tokenizer``.
    """
    tokens = nlp.tokenizer(text)
    return [tok.text for tok in tokens]
12
13
def dameraulevenshtein(seq1, seq2):
    """Return the edit distance between two sequences.

    Insertions, deletions, substitutions and swaps of two adjacent items each
    cost 1. Only three rows of the distance matrix are kept at any time.

    Each row carries one extra trailing cell. Index ``-1`` therefore reads the
    value for "nothing consumed from seq2 yet", which avoids special-casing
    the first column.
    """
    width = len(seq2)
    prev_row = None
    cur_row = list(range(1, width + 1)) + [0]
    for i, left in enumerate(seq1):
        # Rotate the rows; the new row's trailing cell is the cost of
        # deleting the first i + 1 items of seq1.
        prev2_row, prev_row, cur_row = prev_row, cur_row, [0] * width + [i + 1]
        for j, right in enumerate(seq2):
            differs = left != right
            cost = min(
                prev_row[j] + 1,            # deletion
                cur_row[j - 1] + 1,         # insertion
                prev_row[j - 1] + differs,  # substitution (free on a match)
            )
            # Transposition of two adjacent items.
            if (i > 0 and j > 0 and left == seq2[j - 1]
                    and seq1[i - 1] == right and differs):
                cost = min(cost, prev2_row[j - 2] + 1)
            cur_row[j] = cost
    return cur_row[width - 1]
28
29
30
class SymSpell:
    """Spelling corrector built on an index of character deletions.

    Every corpus word is stored under itself and under each string obtained by
    deleting up to ``max_edit_distance`` characters from it. A lookup applies
    the same deletions to the query and follows the index to candidate words,
    which are ranked by edit distance and then by corpus frequency.

    Attributes:
        max_edit_distance: maximum number of deletions indexed per word and
            maximum edit distance accepted for a suggestion.
        verbose: 0 makes ``get_suggestions`` return only the best candidate;
            any other value returns the ranked list. Values below 2 also
            enable early exits and pruning of candidates that are worse than
            the best distance found so far.
        dictionary: maps a string to ``(words, count)``. ``words`` lists the
            corpus words that reduce to this string by deletions; ``count`` is
            how often the string itself occurred in the corpus (0 when it is
            only a deletion).
        longest_word_length: length of the longest corpus word seen.
    """

    def __init__(self, max_edit_distance=3, verbose=0):
        self.max_edit_distance = max_edit_distance
        self.verbose = verbose
        self.dictionary = {}
        self.longest_word_length = 0

    def get_deletes_list(self, w):
        """Return the strings derived from *w* by deleting characters.

        Up to ``max_edit_distance`` characters are removed. Each derived
        string appears once, in order of first discovery. Strings of length 1
        are not shortened further, so the empty string is never produced.
        """
        deletes = []
        # All strings produced in one round have the same length, so a single
        # set covers both "already reported" and "already queued".
        seen = set()
        frontier = [w]
        for _ in range(self.max_edit_distance):
            next_frontier = []
            for word in frontier:
                if len(word) <= 1:
                    continue
                for c in range(len(word)):  # index of the character to drop
                    shorter = word[:c] + word[c + 1:]
                    if shorter not in seen:
                        seen.add(shorter)
                        deletes.append(shorter)
                        next_frontier.append(shorter)
            frontier = next_frontier
        return deletes

    def create_dictionary_entry(self, w):
        """Count one occurrence of *w* and index its deletions on first sight.

        Returns True when *w* was seen as a corpus word for the first time,
        False otherwise.
        """
        entry = self.dictionary.get(w)
        if entry is None:
            count = 1
            self.dictionary[w] = ([], count)
            self.longest_word_length = max(self.longest_word_length, len(w))
        else:
            # The key may already exist purely as a deletion (count 0).
            count = entry[1] + 1
            self.dictionary[w] = (entry[0], count)

        if count != 1:
            return False

        # First occurrence as a real word: link every deletion back to it.
        for item in self.get_deletes_list(w):
            self.dictionary.setdefault(item, ([], 0))[0].append(w)
        return True

    def _index_lines(self, lines, token_pattern):
        """Add every *token_pattern* match of the lowercased lines; print stats."""
        tokenizer = re.compile(token_pattern)
        total_word_count = 0
        unique_word_count = 0
        for line in lines:
            for word in tokenizer.findall(line.lower()):
                total_word_count += 1
                if self.create_dictionary_entry(word):
                    unique_word_count += 1

        print("total words processed: %i" % total_word_count)
        print("total unique words in corpus: %i" % unique_word_count)
        print("total items in dictionary (corpus words and deletions): %i" % len(self.dictionary))
        print(" edit distance for deletions: %i" % self.max_edit_distance)
        print(" length of longest word in corpus: %i" % self.longest_word_length)
        return self.dictionary

    def create_dictionary_from_arr(self, arr, token_pattern=r'[a-z]+'):
        """Build the dictionary from an iterable of text lines.

        Each line is lowercased and tokenised with *token_pattern*. Summary
        statistics are printed and the dictionary is returned.
        """
        return self._index_lines(arr, token_pattern)

    def create_dictionary(self, fname, token_pattern=r'[a-z]+'):
        """Build the dictionary from the text file *fname*.

        Each line is lowercased and tokenised with *token_pattern*. Summary
        statistics are printed and the dictionary is returned.
        """
        with open(fname) as file:
            return self._index_lines(file, token_pattern)

    def _ranked_suggestions(self, string, silent):
        """Return every candidate for *string* as ``(word, (count, distance))``.

        The list is sorted by ascending distance, then descending count. It is
        empty when nothing is found.
        """
        if len(string) - self.longest_word_length > self.max_edit_distance:
            if not silent:
                print("no items in dictionary within maximum edit distance")
            return []

        prune = self.verbose < 2
        suggest_dict = {}
        min_suggest_len = float('inf')

        # Breadth-first walk over the deletions of the query string.
        queue = deque([string])
        queued = set()  # deletions that were already put on the queue

        while queue:
            q_item = queue.popleft()
            removed = len(string) - len(q_item)

            # Early exit: deeper deletions cannot beat what was already found.
            if prune and suggest_dict and removed > min_suggest_len:
                break

            if q_item in self.dictionary and q_item not in suggest_dict:
                linked_words, count = self.dictionary[q_item]
                if count > 0:
                    # The deletion itself is a corpus word.
                    suggest_dict[q_item] = (count, removed)
                    if prune and removed == 0:
                        # The query is a known word; nothing can rank higher.
                        break
                    elif removed < min_suggest_len:
                        min_suggest_len = removed

                for sc_item in linked_words:
                    if sc_item in suggest_dict:
                        continue
                    item_dist = dameraulevenshtein(sc_item, string)
                    too_far = prune and item_dist > min_suggest_len
                    if not too_far and item_dist <= self.max_edit_distance:
                        suggest_dict[sc_item] = (self.dictionary[sc_item][1], item_dist)
                        if item_dist < min_suggest_len:
                            min_suggest_len = item_dist
                    if prune:
                        # Drop candidates that are now worse than the best.
                        suggest_dict = {k: v for k, v in suggest_dict.items()
                                        if v[1] <= min_suggest_len}

            if prune and removed > min_suggest_len:
                continue
            if removed < self.max_edit_distance and len(q_item) > 1:
                for c in range(len(q_item)):  # index of the character to drop
                    word_minus_c = q_item[:c] + q_item[c + 1:]
                    if word_minus_c not in queued:
                        queued.add(word_minus_c)
                        queue.append(word_minus_c)

        if not silent and self.verbose != 0:
            print("number of possible corrections: %i" % len(suggest_dict))
            print(" edit distance for deletions: %i" % self.max_edit_distance)

        return sorted(suggest_dict.items(), key=lambda item: (item[1][1], -item[1][0]))

    def get_suggestions(self, string, silent=False):
        """Return suggested corrections for a possibly misspelled word.

        With ``verbose == 0`` the result is the single best
        ``(word, (count, distance))`` tuple. Otherwise it is the full ranked
        list of such tuples. An empty list is returned whenever there is no
        suggestion, regardless of ``verbose``. Unless *silent* is true,
        diagnostic messages may be printed.
        """
        ranked = self._ranked_suggestions(string, silent)
        if self.verbose == 0:
            # Previously an empty result raised IndexError here.
            return ranked[0] if ranked else []
        return ranked

    def best_word(self, s, silent=False):
        """Return the best correction for *s* as a string, or None.

        None is returned when there is no suggestion or *s* is not a string.
        The result is the word itself for every ``verbose`` setting.
        """
        if not isinstance(s, str):
            # Best effort: the previous catch-all handler also yielded None
            # for input that cannot be looked up.
            return None
        ranked = self._ranked_suggestions(s, silent)
        return ranked[0][0] if ranked else None
192
193
def spell_corrector(word_list, words_d, corrector=None):
    """Join *word_list* into one string, correcting words not in *words_d*.

    Args:
        word_list: iterable of word strings.
        words_d: container of known words; only membership tests are used.
        corrector: object with a ``best_word(word, silent=True)`` method.
            When omitted, the module-level ``ss`` instance is used, which the
            caller must have created beforehand.

    Returns:
        The resulting words joined by single spaces. Known words are kept
        unchanged and unknown words are replaced by their best suggestion.
        Unknown words with no suggestion are omitted.
    """
    result_list = []
    for word in word_list:
        if word in words_d:
            result_list.append(word)
            continue
        if corrector is None:
            # Resolved lazily so the global is only required when a word
            # actually needs correcting.
            corrector = ss
        suggestion = corrector.best_word(word, silent=True)
        if suggestion is not None:
            result_list.append(suggestion)

    return " ".join(result_list)
204
205