CoCalc -- main.py

GitHub Repository: Aniket025/Medical-Prescription-OCR
Path: blob/master/Model-3/Project/main.py
⁴²⁷ views
1
import preprocess
2
import GCVocr
3
import spellcorrection
4

5
import cv2
6
import matplotlib.pyplot as plt
7

8
# Intermediate file holding the binarized image that is sent to the OCR service.
TEMP_IMAGE_PATH = "temp.jpg"


def load_word_list(path):
    """Return every line of the text file at *path*, stripped of whitespace.

    Blank lines are kept as empty strings, so the result has one entry per
    line in the file.
    """
    with open(path) as fh:
        return [line.strip() for line in fh]


def show_image(image):
    """Draw *image* with matplotlib and show the figure.

    ``pyplot.show`` does not accept image data, so the image has to be drawn
    with ``imshow`` first.  ``cmap='gray'`` only affects single-channel
    images; matplotlib ignores it for RGB(A) input.
    """
    # NOTE(review): if the image comes from cv2 it is presumably in BGR
    # order, so colours may look swapped on screen -- confirm against
    # Preprocess.read_image.
    plt.imshow(image, cmap='gray')
    plt.show()


def run_spell_check_demo(to_sample=False):
    """Build the SymSpell dictionary and spell-correct a fixed sample text.

    The word lists are read from ``./bad-words.csv`` and
    ``./english_words_479k.txt`` relative to the current working directory.

    Args:
        to_sample: when true, only a random sample of (at most) 100 English
            words is used, plus the words of the expected correction, to
            keep the runtime short.
    """
    import random

    # NOTE(review): SymSpell, spacy_tokenize and spell_corrector are assumed
    # to be defined in the imported ``spellcorrection`` module -- confirm.
    ss = spellcorrection.SymSpell(max_edit_distance=2)
    bad_words = load_word_list('./bad-words.csv')

    # fetch english words dictionary
    eng_words = load_word_list('./english_words_479k.txt')
    print(eng_words[:5])
    print(bad_words[:5])

    print('Total english words: {}'.format(len(eng_words)))
    print('Total bad words: {}'.format(len(bad_words)))

    print('create symspell dict...')

    if to_sample:
        # sampling from list for kernel runtime; never ask for more items
        # than the list holds, which would make random.sample raise.
        sample_size = min(100, len(eng_words))
        sample_idxs = random.sample(range(len(eng_words)), sample_size)
        # make sure our sample misspell is in there
        eng_words = ([eng_words[i] for i in sorted(sample_idxs)]
                     + 'to infinity and beyond'.split())

    all_words_list = list(set(bad_words + eng_words))
    ss.create_dictionary_from_arr(all_words_list, token_pattern=r'.+')

    # create a dictionary of rightly spelled words for lookup
    words_dict = {k: 0 for k in all_words_list}

    sample_text = 'to infifity and byond'
    tokens = spellcorrection.spacy_tokenize(sample_text)

    print('run spell checker...')
    print()
    print('original text: ' + sample_text)
    print()
    # NOTE(review): ``ss`` is not passed to spell_corrector; presumably the
    # function finds the SymSpell instance some other way -- verify.
    correct_text = spellcorrection.spell_corrector(tokens, words_dict)
    print('corrected text: ' + correct_text)

    print('Done.')


def main(argv=None):
    """Preprocess a prescription image, OCR it and run the spell-check demo.

    Args:
        argv: command-line argument list; defaults to ``sys.argv``.
            ``argv[1]`` is the path of the image to process.

    Returns:
        Process exit status: 0 on success, 1 when the OCR request failed,
        2 when the image path argument is missing.
    """
    import json
    import os
    import sys
    import time
    from os.path import basename, join

    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        print("usage: {} IMAGE_FILE".format(argv[0] if argv else "main.py"),
              file=sys.stderr)
        return 2
    filename = argv[1]
    start_time = time.time()

    # NOTE(review): Preprocess and GoogleCloudVisionOCR are assumed to live
    # in the imported ``preprocess`` and ``GCVocr`` modules -- confirm.
    PP = preprocess.Preprocess(filename)
    image = PP.read_image()
    crop_image = PP.crop_image(image)
    gray_image = PP.binarization(crop_image)

    show_image(image)
    show_image(crop_image)
    show_image(gray_image)

    cv2.imwrite(TEMP_IMAGE_PATH, gray_image)

    OCR = GCVocr.GoogleCloudVisionOCR(TEMP_IMAGE_PATH)
    response = OCR.request_ocr()
    if response.status_code != 200 or response.json().get('error'):
        print(response.text)
        return 1

    # Only one image is submitted, so result files are named after it.
    image_filenames = [TEMP_IMAGE_PATH]
    os.makedirs(OCR.RESULTS_DIR, exist_ok=True)

    for imgname, resp in zip(image_filenames, response.json()['responses']):
        jpath = join(OCR.RESULTS_DIR, basename(imgname) + '.json')
        datatxt = json.dumps(resp, indent=2)
        with open(jpath, 'w') as f:
            print("Wrote", len(datatxt), "bytes to", jpath)
            f.write(datatxt)

        print("---------------------------------------------")
        annotations = resp.get('textAnnotations')
        if not annotations:
            # No 'textAnnotations' entry: nothing to print for this image.
            print("    No text detected.")
            continue
        t = annotations[0]
        print("    Bounding Polygon:")
        print(t['boundingPoly'])
        print("    Text:")
        print(t['description'])

        run_spell_check_demo(to_sample=False)
        print("--- %s seconds ---" % (time.time() - start_time))

    return 0


if __name__ == '__main__':
    raise SystemExit(main())
84

85
Product

Resources

Company