CoCalc -- spell.py

GitHub Repository: Aniket025/Medical-Prescription-OCR
Path: blob/master/Model-5/spell.py
⁴²⁷ views
1
import re
2
import time
3

4
max_edit_distance = 3
5
verbose = 0
6
# 0: top suggestion
7
# 1: all suggestions of smallest edit distance
8
# 2: all suggestions <= max_edit_distance (slower, no early termination)
9

10
longest_word_length = 0
11

12
def get_deletes_list(w):
13
	'''given a word, derive strings with up to max_edit_distance characters
14
	   deleted'''
15
	deletes = []
16
	queue = [w]
17
	for d in range(max_edit_distance):
18
		temp_queue = []
19
		for word in queue:
20
			if len(word)>1:
21
				for c in range(len(word)):  # character index
22
					word_minus_c = word[:c] + word[c+1:]
23
					if word_minus_c not in deletes:
24
						deletes.append(word_minus_c)
25
					if word_minus_c not in temp_queue:
26
						temp_queue.append(word_minus_c)
27
		queue = temp_queue
28

29
	return deletes
30

31
def create_dictionary_entry(dictionary, w):
32
	'''add word and its derived deletions to dictionary'''
33
	# check if word is already in dictionary
34
	# dictionary entries are in the form: (list of suggested corrections,
35
	# frequency of word in corpus)
36
	global longest_word_length
37
	new_real_word_added = False
38
	if w in dictionary:
39
		# increment count of word in corpus
40
		dictionary[w] = (dictionary[w][0], dictionary[w][1] + 1)
41
	else:
42
		dictionary[w] = ([], 1)
43
		longest_word_length = max(longest_word_length, len(w))
44

45
	if dictionary[w][1]==1:
46
		# first appearance of word in corpus
47
		# n.b. word may already be in dictionary as a derived word
48
		# (deleting character from a real word)
49
		# but counter of frequency of word in corpus is not incremented
50
		# in those cases)
51
		new_real_word_added = True
52
		deletes = get_deletes_list(w)
53
		for item in deletes:
54
			if item in dictionary:
55
				# add (correct) word to delete's suggested correction list
56
				dictionary[item][0].append(w)
57
			else:
58
				# note frequency of word in corpus is not incremented
59
				dictionary[item] = ([w], 0)
60

61
	return new_real_word_added
62

63
def create_dictionary(dictionary, fname):
64

65
	total_word_count = 0
66
	unique_word_count = 0
67

68
	with open(fname) as file:
69
		print "Creating dictionary..."
70
		for line in file:
71
			# separate by words by non-alphabetical characters
72
			words = re.findall('[a-z]+', line.lower())
73
			for word in words:
74
				total_word_count += 1
75
				if create_dictionary_entry(dictionary, word):
76
					unique_word_count += 1
77

78
	return dictionary
79

80
def dameraulevenshtein(seq1, seq2):
81
	"""Calculate the Damerau-Levenshtein distance between sequences.
82
	This method has not been modified from the original.
83
	Source: http://mwh.geek.nz/2009/04/26/python-damerau-levenshtein-distance/
84

85
	This distance is the number of additions, deletions, substitutions,
86
	and transpositions needed to transform the first sequence into the
87
	second. Although generally used with strings, any sequences of
88
	comparable objects will work.
89
	Transpositions are exchanges of *consecutive* characters; all other
90
	operations are self-explanatory.
91
	This implementation is O(N*M) time and O(M) space, for N and M the
92
	lengths of the two sequences.
93
	>>> dameraulevenshtein('ba', 'abc')
94
	2
95
	>>> dameraulevenshtein('fee', 'deed')
96
	2
97
	It works with arbitrary sequences too:
98
	>>> dameraulevenshtein('abcd', ['b', 'a', 'c', 'd', 'e'])
99
	2
100
	"""
101
	# codesnippet:D0DE4716-B6E6-4161-9219-2903BF8F547F
102
	# Conceptually, this is based on a len(seq1) + 1 * len(seq2) + 1 matrix.
103
	# However, only the current and two previous rows are needed at once,
104
	# so we only store those.
105
	oneago = None
106
	thisrow = range(1, len(seq2) + 1) + [0]
107
	for x in xrange(len(seq1)):
108
		# Python lists wrap around for negative indices, so put the
109
		# leftmost column at the *end* of the list. This matches with
110
		# the zero-indexed strings and saves extra calculation.
111
		twoago, oneago, thisrow = oneago, thisrow, [0] * len(seq2) + [x + 1]
112
		for y in xrange(len(seq2)):
113
			delcost = oneago[y] + 1
114
			addcost = thisrow[y - 1] + 1
115
			subcost = oneago[y - 1] + (seq1[x] != seq2[y])
116
			thisrow[y] = min(delcost, addcost, subcost)
117
			# This block deals with transpositions
118
			if (x > 0 and y > 0 and seq1[x] == seq2[y - 1]
119
				and seq1[x-1] == seq2[y] and seq1[x] != seq2[y]):
120
				thisrow[y] = min(thisrow[y], twoago[y - 2] + 1)
121
	return thisrow[len(seq2) - 1]
122

123
def get_suggestions(dictionary, string, silent=False):
124
	'''return list of suggested corrections for potentially incorrectly
125
	   spelled word'''
126
	if (len(string) - longest_word_length) > max_edit_distance:
127
		if not silent:
128
			print "no items in dictionary within maximum edit distance"
129
		return []
130

131
	global verbose
132
	suggest_dict = {}
133
	min_suggest_len = float('inf')
134

135
	queue = [string]
136
	q_dictionary = {}  # items other than string that we've checked
137

138
	while len(queue)>0:
139
		q_item = queue[0]  # pop
140
		queue = queue[1:]
141

142
		# early exit
143
		if ((verbose<2) and (len(suggest_dict)>0) and
144
			  ((len(string)-len(q_item))>min_suggest_len)):
145
			break
146

147
		# process queue item
148
		if (q_item in dictionary) and (q_item not in suggest_dict):
149
			if (dictionary[q_item][1]>0):
150
			# word is in dictionary, and is a word from the corpus, and
151
			# not already in suggestion list so add to suggestion
152
			# dictionary, indexed by the word with value (frequency in
153
			# corpus, edit distance)
154
			# note q_items that are not the input string are shorter
155
			# than input string since only deletes are added (unless
156
			# manual dictionary corrections are added)
157
				assert len(string)>=len(q_item)
158
				suggest_dict[q_item] = (dictionary[q_item][1],
159
										len(string) - len(q_item))
160
				# early exit
161
				if ((verbose<2) and (len(string)==len(q_item))):
162
					break
163
				elif (len(string) - len(q_item)) < min_suggest_len:
164
					min_suggest_len = len(string) - len(q_item)
165

166
			# the suggested corrections for q_item as stored in
167
			# dictionary (whether or not q_item itself is a valid word
168
			# or merely a delete) can be valid corrections
169
			for sc_item in dictionary[q_item][0]:
170
				if (sc_item not in suggest_dict):
171

172
					# compute edit distance
173
					# suggested items should always be longer
174
					# (unless manual corrections are added)
175
					assert len(sc_item)>len(q_item)
176

177
					# q_items that are not input should be shorter
178
					# than original string
179
					# (unless manual corrections added)
180
					assert len(q_item)<=len(string)
181

182
					if len(q_item)==len(string):
183
						assert q_item==string
184
						item_dist = len(sc_item) - len(q_item)
185

186
					# item in suggestions list should not be the same as
187
					# the string itself
188
					assert sc_item!=string
189

190
					# calculate edit distance using, for example,
191
					# Damerau-Levenshtein distance
192
					item_dist = dameraulevenshtein(sc_item, string)
193

194
					# do not add words with greater edit distance if
195
					# verbose setting not on
196
					if ((verbose<2) and (item_dist>min_suggest_len)):
197
						pass
198
					elif item_dist<=max_edit_distance:
199
						assert sc_item in dictionary  # should already be in dictionary if in suggestion list
200
						suggest_dict[sc_item] = (dictionary[sc_item][1], item_dist)
201
						if item_dist < min_suggest_len:
202
							min_suggest_len = item_dist
203

204
					# depending on order words are processed, some words
205
					# with different edit distances may be entered into
206
					# suggestions; trim suggestion dictionary if verbose
207
					# setting not on
208
					if verbose<2:
209
						suggest_dict = {k:v for k, v in suggest_dict.items() if v[1]<=min_suggest_len}
210

211
		# now generate deletes (e.g. a substring of string or of a delete)
212
		# from the queue item
213
		# as additional items to check -- add to end of queue
214
		assert len(string)>=len(q_item)
215

216
		# do not add words with greater edit distance if verbose setting
217
		# is not on
218
		if ((verbose<2) and ((len(string)-len(q_item))>min_suggest_len)):
219
			pass
220
		elif (len(string)-len(q_item))<max_edit_distance and len(q_item)>1:
221
			for c in range(len(q_item)): # character index
222
				word_minus_c = q_item[:c] + q_item[c+1:]
223
				if word_minus_c not in q_dictionary:
224
					queue.append(word_minus_c)
225
					q_dictionary[word_minus_c] = None  # arbitrary value, just to identify we checked this
226

227
	# queue is now empty: convert suggestions in dictionary to
228
	# list for output
229
	if not silent and verbose!=0:
230
		print "number of possible corrections: %i" %len(suggest_dict)
231
		print "  edit distance for deletions: %i" % max_edit_distance
232

233
	# output option 1
234
	# sort results by ascending order of edit distance and descending
235
	# order of frequency
236
	#	 and return list of suggested word corrections only:
237
	# return sorted(suggest_dict, key = lambda x:
238
	#			   (suggest_dict[x][1], -suggest_dict[x][0]))
239

240
	# output option 2
241
	# return list of suggestions with (correction,
242
	#								  (frequency in corpus, edit distance)):
243
	as_list = suggest_dict.items()
244
	outlist = sorted(as_list, key=lambda(term, (freq, dist)): (dist, -freq))
245

246
	if verbose==0:
247
		if len(outlist)==0:
248
			return []
249
		return outlist[0]
250
	else:
251
		return outlist
252

253
	'''
254
	Option 1:
255
	['file', 'five', 'fire', 'fine', ...]
256

257
	Option 2:
258
	[('file', (5, 0)),
259
	 ('five', (67, 1)),
260
	 ('fire', (54, 1)),
261
	 ('fine', (17, 1))...]
262
	'''
263

264
def best_word(dictionary, s, silent=False):
265
	try:
266
		return get_suggestions(dictionary, s, silent)[0]
267
	except:
268
		return s
269

270
def correct_document(dictionary, fname, outfname, printlist=True):
271
	# correct an entire document
272
	with open(outfname,'w') as outfile:
273
		with open(fname) as file:
274
			doc_word_count = 0
275
			corrected_word_count = 0
276
			unknown_word_count = 0
277

278
			for i, line in enumerate(file):
279
				doc_words = re.findall("[a-z]+|[^a-z]+",line.lower())
280
				for doc_word in doc_words:
281
					doc_word_count += 1
282
					if doc_word[0].isalpha():
283
						suggestion = get_suggestions(dictionary, doc_word)
284
						if len(suggestion)==0:
285
							outfile.write(doc_word)
286
							unknown_word_count += 1
287
						else:
288
							outfile.write(suggestion[0])
289
							if suggestion[0]!=doc_word:
290
								corrected_word_count += 1
291
					else:
292
						outfile.write(doc_word)
293

294
## main
295
def spellcheck(dictionary, word_in):
296
	correct_document(dictionary, word_in, "output.txt")
297

298
if __name__ == "__main__":
299
	print "Please wait..."
300
	time.sleep(2)
301
	start_time = time.time()
302
	dictionary = {}
303
	create_dictionary(dictionary,"./total_medicine_data.txt")
304
	run_time = time.time() - start_time
305
	print '-----'
306
	print '%.2f seconds to run' % run_time
307
	print '-----'
308

309
	print " "
310
	print "Word correction"
311
	print "---------------"
312

313

314
	while True:
315
		word_in = raw_input('Enter your file path (or enter to exit): ')
316
		if len(word_in)==0:
317
			print "goodbye"
318
			break
319
		start_time = time.time()
320
		print correct_document(dictionary, word_in, "output.txt")
321
		run_time = time.time() - start_time
322
		print '-----'
323
		print '%.5f seconds to run' % run_time
324
		print '-----'
325
	
326

327
Product

Resources

Company