Path: blob/master/Model-3/Project/spellcorrection.py
427 views
import re, random1import spacy2import time34start_time = time.time()5nlp = spacy.load('en');67to_sample = False # if you're impatient switch this flag89def spacy_tokenize(text):10return [token.text for token in nlp.tokenizer(text)]1112def dameraulevenshtein(seq1, seq2):13oneago = None14thisrow = list(range(1, len(seq2) + 1)) + [0]15for x in range(len(seq1)):16twoago, oneago, thisrow = (oneago, thisrow, [0] * len(seq2) + [x + 1])17for y in range(len(seq2)):18delcost = oneago[y] + 119addcost = thisrow[y - 1] + 120subcost = oneago[y - 1] + (seq1[x] != seq2[y])21thisrow[y] = min(delcost, addcost, subcost)22# This block deals with transpositions23if (x > 0 and y > 0 and seq1[x] == seq2[y - 1]24and seq1[x - 1] == seq2[y] and seq1[x] != seq2[y]):25thisrow[y] = min(thisrow[y], twoago[y - 2] + 1)26return thisrow[len(seq2) - 1]272829class SymSpell:30def __init__(self, max_edit_distance=3, verbose=0):31self.max_edit_distance = max_edit_distance32self.verbose = verbose33self.dictionary = {}34self.longest_word_length = 03536def get_deletes_list(self, w):37"""given a word, derive strings with up to max_edit_distance characters38deleted"""3940deletes = []41queue = [w]42for d in range(self.max_edit_distance):43temp_queue = []44for word in queue:45if len(word) > 1:46for c in range(len(word)): # character index47word_minus_c = word[:c] + word[c + 1:]48if word_minus_c not in deletes:49deletes.append(word_minus_c)50if word_minus_c not in temp_queue:51temp_queue.append(word_minus_c)52queue = temp_queue5354return deletes5556def create_dictionary_entry(self, w):57'''add word and its derived deletions to dictionary'''58new_real_word_added = False59if w in self.dictionary:60self.dictionary[w] = (self.dictionary[w][0], self.dictionary[w][1] + 1)61else:62self.dictionary[w] = ([], 1)63self.longest_word_length = max(self.longest_word_length, len(w))6465if self.dictionary[w][1] == 1:66new_real_word_added = True67deletes = self.get_deletes_list(w)68for item in deletes:69if item in self.dictionary:70self.dictionary[item][0].append(w)71else:72self.dictionary[item] = ([w], 0)7374return new_real_word_added7576def create_dictionary_from_arr(self, arr, token_pattern=r'[a-z]+'):77total_word_count = 078unique_word_count = 07980for line in arr:81words = re.findall(token_pattern, line.lower())82for word in words:83total_word_count += 184if self.create_dictionary_entry(word):85unique_word_count += 18687print("total words processed: %i" % total_word_count)88print("total unique words in corpus: %i" % unique_word_count)89print("total items in dictionary (corpus words and deletions): %i" % len(self.dictionary))90print(" edit distance for deletions: %i" % self.max_edit_distance)91print(" length of longest word in corpus: %i" % self.longest_word_length)92return self.dictionary9394def create_dictionary(self, fname):95total_word_count = 096unique_word_count = 09798with open(fname) as file:99for line in file:100words = re.findall('[a-z]+', line.lower())101for word in words:102total_word_count += 1103if self.create_dictionary_entry(word):104unique_word_count += 1105106print("total words processed: %i" % total_word_count)107print("total unique words in corpus: %i" % unique_word_count)108print("total items in dictionary (corpus words and deletions): %i" % len(self.dictionary))109print(" edit distance for deletions: %i" % self.max_edit_distance)110print(" length of longest word in corpus: %i" % self.longest_word_length)111return self.dictionary112113def get_suggestions(self, string, silent=False):114"""return list of suggested corrections for potentially incorrectly115spelled word"""116if (len(string) - self.longest_word_length) > self.max_edit_distance:117if not silent:118print("no items in dictionary within maximum edit distance")119return []120121suggest_dict = {}122min_suggest_len = float('inf')123124queue = [string]125q_dictionary = {} # items other than string that we've checked126127while len(queue) > 0:128q_item = queue[0] # pop129queue = queue[1:]130131# early exit132if ((self.verbose < 2) and (len(suggest_dict) > 0) and133((len(string) - len(q_item)) > min_suggest_len)):134break135136# process queue item137if (q_item in self.dictionary) and (q_item not in suggest_dict):138if self.dictionary[q_item][1] > 0:139assert len(string) >= len(q_item)140suggest_dict[q_item] = (self.dictionary[q_item][1],141len(string) - len(q_item))142# early exit143if (self.verbose < 2) and (len(string) == len(q_item)):144break145elif (len(string) - len(q_item)) < min_suggest_len:146min_suggest_len = len(string) - len(q_item)147for sc_item in self.dictionary[q_item][0]:148if sc_item not in suggest_dict:149assert len(sc_item) > len(q_item)150assert len(q_item) <= len(string)151if len(q_item) == len(string):152assert q_item == string153item_dist = len(sc_item) - len(q_item)154assert sc_item != string155item_dist = dameraulevenshtein(sc_item, string)156if (self.verbose < 2) and (item_dist > min_suggest_len):157pass158elif item_dist <= self.max_edit_distance:159assert sc_item in self.dictionary160suggest_dict[sc_item] = (self.dictionary[sc_item][1], item_dist)161if item_dist < min_suggest_len:162min_suggest_len = item_dist163if self.verbose < 2:164suggest_dict = {k: v for k, v in suggest_dict.items() if v[1] <= min_suggest_len}165assert len(string) >= len(q_item)166if (self.verbose < 2) and ((len(string) - len(q_item)) > min_suggest_len):167pass168elif (len(string) - len(q_item)) < self.max_edit_distance and len(q_item) > 1:169for c in range(len(q_item)): # character index170word_minus_c = q_item[:c] + q_item[c + 1:]171if word_minus_c not in q_dictionary:172queue.append(word_minus_c)173q_dictionary[word_minus_c] = None174if not silent and self.verbose != 0:175print("number of possible corrections: %i" % len(suggest_dict))176print(" edit distance for deletions: %i" % self.max_edit_distance)177178as_list = suggest_dict.items()179outlist = sorted(as_list, key=lambda x: (x[1][1], -x[1][0]))180181if self.verbose == 0:182return outlist[0]183else:184return outlist185186def best_word(self, s, silent=False):187try:188return self.get_suggestions(s, silent)[0]189except:190return None191192def spell_corrector(word_list, words_d) :193result_list = []194for word in word_list:195if word not in words_d:196suggestion = ss.best_word(word, silent=True)197if suggestion is not None:198result_list.append(suggestion)199else:200result_list.append(word)201202return " ".join(result_list)203204205