Path: blob/master/Model-3/Project/main.py
427 views
# Demo pipeline: preprocess an image, send it to the Google Cloud Vision OCR
# wrapper, dump the OCR response to disk, then run a SymSpell-based
# spell-correction demo on a fixed sample sentence.
#
# Usage: python main.py <image-path>
import json
import os
import random
import sys
import time

import cv2
import matplotlib.pyplot as plt

import GCVocr
import preprocess
import spellcorrection

# NOTE(review): the original script used Preprocess, GoogleCloudVisionOCR,
# SymSpell, spacy_tokenize and spell_corrector as bare names without importing
# them. They are assumed to live in the project modules imported above
# (preprocess, GCVocr and spellcorrection respectively) -- confirm.

# Intermediate file holding the binarized image that is sent to the OCR wrapper.
OCR_INPUT_PATH = "temp.jpg"
BAD_WORDS_PATH = './bad-words.csv'
ENGLISH_WORDS_PATH = './english_words_479k.txt'

# The original read an undefined ``to_sample`` flag; it defaults to off here so
# the full English word list is used unless a caller opts in.
TO_SAMPLE = False
SAMPLE_SIZE = 100


def _show_image(image):
    """Display *image* in a matplotlib window.

    ``plt.show`` does not draw image data, so the image is drawn with
    ``plt.imshow`` first.
    """
    # cmap only affects 2-D (single channel) arrays; colour arrays ignore it.
    # NOTE(review): if these arrays are OpenCV BGR images, colours will appear
    # channel-swapped on screen -- confirm what Preprocess.read_image returns.
    plt.imshow(image, cmap='gray')
    plt.show()


def _load_word_list(path):
    """Return every line of the text file at *path*, whitespace-stripped."""
    with open(path) as fh:
        return [line.strip() for line in fh]


def _run_ocr(image_path):
    """Request OCR for *image_path* and dump each response as JSON.

    Prints the bounding polygon and text of the first text annotation of
    each response.

    Returns:
        True when the request succeeded, False when the HTTP status was not
        200 or the JSON payload carried an ``error`` entry (the raw response
        text is printed in that case).
    """
    ocr = GCVocr.GoogleCloudVisionOCR(image_path)
    response = ocr.request_ocr()
    payload = response.json() if response.status_code == 200 else None
    if payload is None or payload.get('error'):
        print(response.text)
        return False

    # Only one image is submitted, so the responses map onto this one name.
    image_filenames = [image_path]
    os.makedirs(ocr.RESULTS_DIR, exist_ok=True)

    for imgname, resp in zip(image_filenames, payload['responses']):
        jpath = os.path.join(ocr.RESULTS_DIR, os.path.basename(imgname) + '.json')
        datatxt = json.dumps(resp, indent=2)
        with open(jpath, 'w') as f:
            f.write(datatxt)
        print("Wrote", len(datatxt), "bytes to", jpath)

        print("---------------------------------------------")
        annotations = resp.get('textAnnotations')
        if not annotations:
            # Guard the [0] lookup below: a response without any text
            # annotation would otherwise raise KeyError/IndexError.
            print(" No text detected.")
            continue
        t = annotations[0]
        print(" Bounding Polygon:")
        print(t['boundingPoly'])
        print(" Text:")
        print(t['description'])
    return True


def _run_spell_check_demo(to_sample=TO_SAMPLE):
    """Build the SymSpell dictionary and correct a fixed sample sentence.

    Args:
        to_sample: when true, keep only a random sample of at most
            ``SAMPLE_SIZE`` English words (plus the words of the sample
            sentence) to shorten dictionary creation.
    """
    ss = spellcorrection.SymSpell(max_edit_distance=2)
    bad_words = _load_word_list(BAD_WORDS_PATH)

    # fetch english words dictionary
    eng_words = _load_word_list(ENGLISH_WORDS_PATH)
    print(eng_words[:5])
    print(bad_words[:5])

    print('Total english words: {}'.format(len(eng_words)))
    print('Total bad words: {}'.format(len(bad_words)))

    print('create symspell dict...')

    if to_sample:
        # Clamp the sample size: random.sample raises ValueError when asked
        # for more items than the population holds.
        sample_idxs = random.sample(
            range(len(eng_words)), min(SAMPLE_SIZE, len(eng_words))
        )
        # make sure our sample misspell is in there
        eng_words = (
            [eng_words[i] for i in sorted(sample_idxs)]
            + 'to infinity and beyond'.split()
        )

    all_words_list = list(set(bad_words + eng_words))
    # The return value was bound to an unused local in the original; discard it.
    ss.create_dictionary_from_arr(all_words_list, token_pattern=r'.+')

    # create a dictionary of rightly spelled words for lookup
    words_dict = dict.fromkeys(all_words_list, 0)

    sample_text = 'to infifity and byond'
    tokens = spellcorrection.spacy_tokenize(sample_text)

    print('run spell checker...')
    print()
    print('original text: ' + sample_text)
    print()
    correct_text = spellcorrection.spell_corrector(tokens, words_dict)
    print('corrected text: ' + correct_text)


def main(argv=None):
    """Run the full demo for the image path given on the command line.

    Args:
        argv: argument vector; defaults to ``sys.argv``. ``argv[1]`` is the
            path of the image to process.

    Returns:
        Process exit code: 0 on success, 1 when the OCR request failed,
        2 when no image path was supplied.
    """
    args = sys.argv if argv is None else argv
    if len(args) < 2:
        print("usage: {} <image-path>".format(args[0] if args else "main.py"),
              file=sys.stderr)
        return 2

    start_time = time.time()
    filename = args[1]

    pp = preprocess.Preprocess(filename)
    image = pp.read_image()
    crop_image = pp.crop_image(image)
    gray_image = pp.binarization(crop_image)

    for img in (image, crop_image, gray_image):
        _show_image(img)

    cv2.imwrite(OCR_INPUT_PATH, gray_image)

    # The original used a bare module-level `return` here (a SyntaxError) to
    # stop the script when OCR fails; returning from main() does that legally.
    if not _run_ocr(OCR_INPUT_PATH):
        return 1

    # NOTE(review): the spell checker runs on a hard-coded sample sentence,
    # not on the OCR output, exactly as in the original script.
    _run_spell_check_demo()

    print('Done.')
    print("--- %s seconds ---" % (time.time() - start_time))
    return 0


if __name__ == '__main__':
    sys.exit(main())