# GitHub Repository: Aniket025/Medical-Prescription-OCR
# Path: blob/master/Model-5/spell.py
import re
import time

max_edit_distance = 3
verbose = 0
# 0: top suggestion
# 1: all suggestions of smallest edit distance
# 2: all suggestions <= max_edit_distance (slower, no early termination)

longest_word_length = 0

def get_deletes_list(w):
    '''given a word, derive strings with up to max_edit_distance characters
    deleted'''
    deletes = []
    queue = [w]
    for d in range(max_edit_distance):
        temp_queue = []
        for word in queue:
            if len(word)>1:
                for c in range(len(word)):  # character index
                    word_minus_c = word[:c] + word[c+1:]
                    if word_minus_c not in deletes:
                        deletes.append(word_minus_c)
                    if word_minus_c not in temp_queue:
                        temp_queue.append(word_minus_c)
        queue = temp_queue

    return deletes
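
# Illustrative example (with the module default max_edit_distance = 3):
#   get_deletes_list("abc") -> ['bc', 'ac', 'ab', 'c', 'b', 'a']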

def create_dictionary_entry(dictionary, w):
    '''add word and its derived deletions to dictionary'''
    # dictionary entries are in the form: (list of suggested corrections,
    # frequency of word in corpus)
    global longest_word_length
    new_real_word_added = False
    if w in dictionary:
        # word already seen: increment its count in the corpus
        dictionary[w] = (dictionary[w][0], dictionary[w][1] + 1)
    else:
        dictionary[w] = ([], 1)
        longest_word_length = max(longest_word_length, len(w))

    if dictionary[w][1]==1:
        # first appearance of the word in the corpus
        # n.b. the word may already be in the dictionary as a derived word
        # (a character deleted from a real word), but the corpus frequency
        # counter is not incremented in those cases
        new_real_word_added = True
        deletes = get_deletes_list(w)
        for item in deletes:
            if item in dictionary:
                # add (correct) word to delete's suggested correction list
                dictionary[item][0].append(w)
            else:
                # note frequency of word in corpus is not incremented
                dictionary[item] = ([w], 0)

    return new_real_word_added
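
# Illustrative structure (entries created when the word "the" is added to an
# empty dictionary):
#   dictionary["the"] == ([], 1)        # real corpus word, frequency 1
#   dictionary["th"]  == (["the"], 0)   # derived delete: suggests "the", frequency 0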

def create_dictionary(dictionary, fname):
    '''build the dictionary from a plain-text corpus file'''
    total_word_count = 0
    unique_word_count = 0

    with open(fname) as file:
        print("Creating dictionary...")
        for line in file:
            # separate words by non-alphabetical characters
            words = re.findall('[a-z]+', line.lower())
            for word in words:
                total_word_count += 1
                if create_dictionary_entry(dictionary, word):
                    unique_word_count += 1

    return dictionary
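
# Illustrative usage (the corpus file is the one loaded in __main__ below):
#   dictionary = create_dictionary({}, "./total_medicine_data.txt")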

def dameraulevenshtein(seq1, seq2):
    """Calculate the Damerau-Levenshtein distance between sequences.
    Source: http://mwh.geek.nz/2009/04/26/python-damerau-levenshtein-distance/

    This distance is the number of additions, deletions, substitutions,
    and transpositions needed to transform the first sequence into the
    second. Although generally used with strings, any sequences of
    comparable objects will work.
    Transpositions are exchanges of *consecutive* characters; all other
    operations are self-explanatory.
    This implementation is O(N*M) time and O(M) space, for N and M the
    lengths of the two sequences.
    >>> dameraulevenshtein('ba', 'abc')
    2
    >>> dameraulevenshtein('fee', 'deed')
    2
    It works with arbitrary sequences too:
    >>> dameraulevenshtein('abcd', ['b', 'a', 'c', 'd', 'e'])
    2
    """
    # codesnippet:D0DE4716-B6E6-4161-9219-2903BF8F547F
    # Conceptually, this is based on a (len(seq1) + 1) * (len(seq2) + 1) matrix.
    # However, only the current and two previous rows are needed at once,
    # so we only store those.
    oneago = None
    thisrow = list(range(1, len(seq2) + 1)) + [0]
    for x in range(len(seq1)):
        # Python lists wrap around for negative indices, so put the
        # leftmost column at the *end* of the list. This matches with
        # the zero-indexed strings and saves extra calculation.
        twoago, oneago, thisrow = oneago, thisrow, [0] * len(seq2) + [x + 1]
        for y in range(len(seq2)):
            delcost = oneago[y] + 1
            addcost = thisrow[y - 1] + 1
            subcost = oneago[y - 1] + (seq1[x] != seq2[y])
            thisrow[y] = min(delcost, addcost, subcost)
            # This block deals with transpositions
            if (x > 0 and y > 0 and seq1[x] == seq2[y - 1]
                    and seq1[x - 1] == seq2[y] and seq1[x] != seq2[y]):
                thisrow[y] = min(thisrow[y], twoago[y - 2] + 1)
    return thisrow[len(seq2) - 1]
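
# Illustrative: an exchange of two adjacent characters counts as a single edit,
# e.g. dameraulevenshtein('ab', 'ba') == 1.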

def get_suggestions(dictionary, string, silent=False):
    '''return a list of suggested corrections for a potentially misspelled
    word'''
    if (len(string) - longest_word_length) > max_edit_distance:
        if not silent:
            print("no items in dictionary within maximum edit distance")
        return []

    global verbose
    suggest_dict = {}
    min_suggest_len = float('inf')

    queue = [string]
    q_dictionary = {}  # items other than string that we've checked

    while len(queue)>0:
        q_item = queue[0]  # pop
        queue = queue[1:]

        # early exit
        if ((verbose<2) and (len(suggest_dict)>0) and
                ((len(string)-len(q_item))>min_suggest_len)):
            break

        # process queue item
        if (q_item in dictionary) and (q_item not in suggest_dict):
            if (dictionary[q_item][1]>0):
                # word is in dictionary, and is a word from the corpus, and
                # not already in suggestion list so add to suggestion
                # dictionary, indexed by the word with value (frequency in
                # corpus, edit distance)
                # note q_items that are not the input string are shorter
                # than input string since only deletes are added (unless
                # manual dictionary corrections are added)
                assert len(string)>=len(q_item)
                suggest_dict[q_item] = (dictionary[q_item][1],
                                        len(string) - len(q_item))
                # early exit
                if ((verbose<2) and (len(string)==len(q_item))):
                    break
                elif (len(string) - len(q_item)) < min_suggest_len:
                    min_suggest_len = len(string) - len(q_item)

            # the suggested corrections for q_item as stored in
            # dictionary (whether or not q_item itself is a valid word
            # or merely a delete) can be valid corrections
            for sc_item in dictionary[q_item][0]:
                if sc_item not in suggest_dict:

                    # compute edit distance
                    # suggested items should always be longer
                    # (unless manual corrections are added)
                    assert len(sc_item)>len(q_item)

                    # q_items that are not input should be shorter
                    # than original string
                    # (unless manual corrections added)
                    assert len(q_item)<=len(string)

                    if len(q_item)==len(string):
                        assert q_item==string
                        item_dist = len(sc_item) - len(q_item)

                    # item in suggestions list should not be the same as
                    # the string itself
                    assert sc_item!=string

                    # calculate edit distance using, for example,
                    # Damerau-Levenshtein distance
                    item_dist = dameraulevenshtein(sc_item, string)

                    # do not add words with greater edit distance if
                    # verbose setting not on
                    if ((verbose<2) and (item_dist>min_suggest_len)):
                        pass
                    elif item_dist<=max_edit_distance:
                        assert sc_item in dictionary  # should already be in dictionary if in suggestion list
                        suggest_dict[sc_item] = (dictionary[sc_item][1], item_dist)
                        if item_dist < min_suggest_len:
                            min_suggest_len = item_dist

                    # depending on order words are processed, some words
                    # with different edit distances may be entered into
                    # suggestions; trim suggestion dictionary if verbose
                    # setting not on
                    if verbose<2:
                        suggest_dict = {k: v for k, v in suggest_dict.items()
                                        if v[1]<=min_suggest_len}

        # now generate deletes (e.g. a substring of string or of a delete)
        # from the queue item
        # as additional items to check -- add to end of queue
        assert len(string)>=len(q_item)

        # do not add words with greater edit distance if verbose setting
        # is not on
        if ((verbose<2) and ((len(string)-len(q_item))>min_suggest_len)):
            pass
        elif (len(string)-len(q_item))<max_edit_distance and len(q_item)>1:
            for c in range(len(q_item)):  # character index
                word_minus_c = q_item[:c] + q_item[c+1:]
                if word_minus_c not in q_dictionary:
                    queue.append(word_minus_c)
                    # arbitrary value, just to record that we checked this item
                    q_dictionary[word_minus_c] = None

    # queue is now empty: convert suggestions in dictionary to
    # list for output
    if not silent and verbose!=0:
        print("number of possible corrections: %i" % len(suggest_dict))
        print("  edit distance for deletions: %i" % max_edit_distance)

    # output option 1
    # sort results by ascending order of edit distance and descending
    # order of frequency
    # and return list of suggested word corrections only:
    # return sorted(suggest_dict, key = lambda x:
    #               (suggest_dict[x][1], -suggest_dict[x][0]))

    # output option 2
    # return list of suggestions with (correction,
    # (frequency in corpus, edit distance)):
    as_list = list(suggest_dict.items())
    # each item is (term, (freq, dist)); sort by distance ascending,
    # then frequency descending
    outlist = sorted(as_list, key=lambda item: (item[1][1], -item[1][0]))

    if verbose==0:
        if len(outlist)==0:
            return []
        return outlist[0]
    else:
        return outlist

'''
Option 1:
['file', 'five', 'fire', 'fine', ...]

Option 2:
[('file', (5, 0)),
 ('five', (67, 1)),
 ('fire', (54, 1)),
 ('fine', (17, 1))...]
'''
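
# Illustrative call (the frequencies are hypothetical and follow the example
# output above):
#   get_suggestions(dictionary, "file")
#   # with verbose = 0 -> ('file', (5, 0))
#   # with verbose = 2 -> the full sorted list shown in Option 2 above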

def best_word(dictionary, s, silent=False):
    '''return the single best correction for s, or s itself if none is found'''
    try:
        return get_suggestions(dictionary, s, silent)[0]
    except IndexError:
        return s
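
# Illustrative (with verbose = 0 and the hypothetical frequencies above):
#   best_word(dictionary, "file") -> 'file'
# If no suggestion is found, the input string is returned unchanged.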

def correct_document(dictionary, fname, outfname, printlist=True):
    '''correct an entire document, writing the result to outfname'''
    with open(outfname, 'w') as outfile:
        with open(fname) as file:
            doc_word_count = 0
            corrected_word_count = 0
            unknown_word_count = 0

            for i, line in enumerate(file):
                doc_words = re.findall("[a-z]+|[^a-z]+", line.lower())
                for doc_word in doc_words:
                    doc_word_count += 1
                    if doc_word[0].isalpha():
                        suggestion = get_suggestions(dictionary, doc_word)
                        if len(suggestion)==0:
                            # no correction found: keep the word as-is
                            outfile.write(doc_word)
                            unknown_word_count += 1
                        else:
                            outfile.write(suggestion[0])
                            if suggestion[0]!=doc_word:
                                corrected_word_count += 1
                    else:
                        # non-alphabetical run (spaces, digits, punctuation)
                        outfile.write(doc_word)
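
# Illustrative: the regex "[a-z]+|[^a-z]+" splits a lowercased line into
# alternating word and non-word runs, so spacing, digits, and punctuation are
# written through unchanged:
#   re.findall("[a-z]+|[^a-z]+", "take 2 pils daily")
#   -> ['take', ' 2 ', 'pils', ' ', 'daily']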

## main
def spellcheck(dictionary, word_in):
    correct_document(dictionary, word_in, "output.txt")

if __name__ == "__main__":
    print("Please wait...")
    time.sleep(2)
    start_time = time.time()
    dictionary = {}
    create_dictionary(dictionary, "./total_medicine_data.txt")
    run_time = time.time() - start_time
    print('-----')
    print('%.2f seconds to run' % run_time)
    print('-----')

    print(" ")
    print("Word correction")
    print("---------------")

    while True:
        word_in = input('Enter your file path (or enter to exit): ')
        if len(word_in)==0:
            print("goodbye")
            break
        start_time = time.time()
        # corrected text is written to output.txt
        correct_document(dictionary, word_in, "output.txt")
        run_time = time.time() - start_time
        print('-----')
        print('%.5f seconds to run' % run_time)
        print('-----')