# Source: GitHub repository Aniket025/Medical-Prescription-OCR
# Path: blob/master/Model-3/Project/main.py
import json
import os
import random
import sys
import time

import cv2
import matplotlib.pyplot as plt

import GCVocr
import preprocess
import spellcorrection

# Intermediate file the binarized image is written to before being sent to OCR.
TEMP_IMAGE = "temp.jpg"
# Word lists used to build the spell-checker dictionary.
BAD_WORDS_PATH = './bad-words.csv'
ENGLISH_WORDS_PATH = './english_words_479k.txt'
# Deliberately misspelled text used to exercise the spell checker.
SAMPLE_TEXT = 'to infifity and byond'
# When True, only a random sample of the English word list is used so the
# dictionary builds quickly.
TO_SAMPLE = False


def load_word_list(path):
    """Return the lines of the text file at *path* with whitespace stripped.

    Args:
        path: Path of a text file holding one word per line.

    Returns:
        A list with one stripped string per line of the file.
    """
    with open(path) as handle:
        return [line.strip() for line in handle]


def show_images(*titled_images):
    """Display each ``(title, image)`` pair in its own matplotlib figure.

    ``plt.show()`` only opens the figure windows; the image has to be drawn
    with ``plt.imshow()`` first.
    """
    for title, image in titled_images:
        plt.figure()
        # cmap is ignored by matplotlib for RGB(A) data, so it only affects
        # single-channel images.
        # NOTE(review): if read_image() returns OpenCV BGR data the colours
        # of the colour figures will look swapped -- confirm and convert.
        plt.imshow(image, cmap="gray")
        plt.title(title)
    plt.show()


def preprocess_image(filename, output_path=TEMP_IMAGE):
    """Read, crop and binarize *filename*, then write the result to disk.

    Args:
        filename: Path of the input image.
        output_path: Where the binarized image is written.

    Returns:
        The path the binarized image was written to.

    Raises:
        IOError: If OpenCV reports that the image could not be written.
    """
    # NOTE(review): assumes the `preprocess` module exposes a `Preprocess`
    # class; the original referenced the bare name without importing it.
    pp = preprocess.Preprocess(filename)
    image = pp.read_image()
    crop_image = pp.crop_image(image)
    gray_image = pp.binarization(crop_image)

    show_images(
        ("original", image),
        ("cropped", crop_image),
        ("binarized", gray_image),
    )

    # cv2.imwrite signals failure through its return value, not an exception.
    if not cv2.imwrite(output_path, gray_image):
        raise IOError("could not write image to {}".format(output_path))
    return output_path


def run_ocr(image_path):
    """Send *image_path* to the OCR service and save/print the response.

    Each entry of the JSON response is written to ``<RESULTS_DIR>/<image
    basename>.json`` and the first text annotation, if any, is printed.

    Args:
        image_path: Path of the image submitted for OCR.

    Returns:
        True if the request succeeded, False if the service reported an
        error (in which case the raw response text is printed).
    """
    # NOTE(review): assumes the `GCVocr` module exposes
    # `GoogleCloudVisionOCR`; the original used the bare, unimported name.
    ocr = GCVocr.GoogleCloudVisionOCR(image_path)
    response = ocr.request_ocr()
    if response.status_code != 200 or response.json().get('error'):
        print(response.text)
        return False

    # One image was submitted, so the responses line up with this list.
    image_filenames = [image_path]
    os.makedirs(ocr.RESULTS_DIR, exist_ok=True)

    for imgname, resp in zip(image_filenames, response.json()['responses']):
        jpath = os.path.join(ocr.RESULTS_DIR,
                             os.path.basename(imgname) + '.json')
        datatxt = json.dumps(resp, indent=2)
        with open(jpath, 'w') as f:
            f.write(datatxt)
        print("Wrote", len(datatxt), "bytes to", jpath)

        print("---------------------------------------------")
        # The key is absent when no text was recognised in the image.
        annotations = resp.get('textAnnotations')
        if not annotations:
            print("No text detected in", imgname)
            continue
        t = annotations[0]
        print(" Bounding Polygon:")
        print(t['boundingPoly'])
        print(" Text:")
        print(t['description'])
    return True


def run_spell_check(sample_text=SAMPLE_TEXT, to_sample=TO_SAMPLE):
    """Build the spelling dictionary and correct *sample_text*.

    Args:
        sample_text: Text to tokenize and spell-correct.
        to_sample: If True, use only a random sample of up to 100 English
            words (plus the words needed for the sample text).

    Returns:
        The corrected text returned by the spell corrector.
    """
    # NOTE(review): assumes `spellcorrection` exposes `SymSpell`,
    # `spacy_tokenize` and `spell_corrector`; the original used bare names.
    ss = spellcorrection.SymSpell(max_edit_distance=2)
    bad_words = load_word_list(BAD_WORDS_PATH)

    # fetch english words dictionary
    eng_words = load_word_list(ENGLISH_WORDS_PATH)
    print(eng_words[:5])
    print(bad_words[:5])

    print('Total english words: {}'.format(len(eng_words)))
    print('Total bad words: {}'.format(len(bad_words)))

    print('create symspell dict...')

    if to_sample:
        # sampling from list for kernel runtime; cap the sample size so a
        # short word list does not make random.sample() raise.
        sample_size = min(100, len(eng_words))
        sample_idxs = random.sample(range(len(eng_words)), sample_size)
        # make sure our sample misspell is in there
        eng_words = ([eng_words[i] for i in sorted(sample_idxs)]
                     + 'to infinity and beyond'.split())

    all_words_list = list(set(bad_words + eng_words))
    ss.create_dictionary_from_arr(all_words_list, token_pattern=r'.+')

    # create a dictionary of rightly spelled words for lookup
    words_dict = {k: 0 for k in all_words_list}

    tokens = spellcorrection.spacy_tokenize(sample_text)

    print('run spell checker...')
    print()
    print('original text: ' + sample_text)
    print()
    # NOTE(review): spell_corrector() is given only the tokens and the
    # lookup dict -- verify how it obtains the SymSpell instance built above.
    correct_text = spellcorrection.spell_corrector(tokens, words_dict)
    print('corrected text: ' + correct_text)
    return correct_text


def main(argv=None):
    """Run preprocessing, OCR and the spell-check demo for one image.

    Args:
        argv: Argument vector in ``sys.argv`` form (program name first,
            image path second). Defaults to ``sys.argv``.

    Returns:
        Process exit code: 0 on success, 1 if the OCR request failed,
        2 if no image path was supplied.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        prog = argv[0] if argv else 'main.py'
        print('usage: {} IMAGE_FILE'.format(prog), file=sys.stderr)
        return 2

    start_time = time.time()
    filename = argv[1]

    preprocess_image(filename, TEMP_IMAGE)
    if not run_ocr(TEMP_IMAGE):
        return 1

    # NOTE(review): the spell checker runs on a fixed sample sentence, as in
    # the original script, not on the OCR output -- confirm this is intended.
    run_spell_check()

    print('Done.')
    print("--- %s seconds ---" % (time.time() - start_time))
    return 0


if __name__ == '__main__':
    sys.exit(main())