CoCalc -- datahelpers.py

GitHub Repository: Aniket025/Medical-Prescription-OCR
Path: blob/master/Model-3/ocr/datahelpers.py
⁴²⁶ views
1
# -*- coding: utf-8 -*-
2
"""
3
Helper functions for loading and creating datasets
4
"""
5
import numpy as np
6
import glob
7
import simplejson
8
import cv2
9
import unidecode
10
from .helpers import implt
11
from .normalization import letterNorm
12
from .viz import printProgressBar
13

14

15
# Alphabet recognised by the models. Position 0 is the empty string,
# followed by ASCII upper/lower case and then the accented Czech letters.
CHARS = (
    ['']
    + list('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
    + list('abcdefghijklmnopqrstuvwxyz')
    + list('ÁÉÍÓÚÝ')
    + list('áéíóúý')
    + list('ČčĎďĚěŇňŘřŠšŤťŮůŽž')
)

# Lookup tables between a character and its position in CHARS.
idxs = list(range(len(CHARS)))
idx_to_chars = dict(enumerate(CHARS))
chars_to_idx = {char: position for position, char in enumerate(CHARS)}
28

29
def char2idx(c, sequence=False):
    """
    Translate a character into its index in CHARS
    Args:
        c: character to look up (KeyError if it is not in CHARS)
        sequence: if truthy, the returned index is shifted up by one
                  (presumably to keep 0 free for sequence models - confirm)
    Returns:
        integer index of the character
    """
    shift = 1 if sequence else 0
    return chars_to_idx[c] + shift
33

34
def idx2char(idx, sequence=False):
    """
    Translate an index back into its character from CHARS
    Args:
        idx: index to look up (KeyError if it is out of range)
        sequence: if truthy, idx is shifted down by one before the lookup,
                  undoing the shift applied by char2idx(..., sequence=True)
    Returns:
        the character stored at that position
    """
    position = idx - 1 if sequence else idx
    return idx_to_chars[position]
38
    
39

40
def loadWordsData(dataloc='data/words/', loadGaplines=True, debug=False):
    """
    Load word images with corresponding labels and gaplines (if loadGaplines == True)
    Args:
        dataloc: image folder location - a single path or a list/tuple of paths;
                 an empty string means the current directory
        loadGaplines: whether or not to load the gapline position files
                      (JSON stored next to each image, same name with 'txt' extension)
        debug: print/plot the last loaded example
    Returns:
        (images, labels (, gaplines))
        images: object array of grayscale images
        labels: array of strings - the part of each file name before the first '_'
        gaplines: object array of arrays parsed from the gapline files
    """
    print("Loading words...")
    # Treat a single location like a one-element list so both cases
    # share the same code path.
    locations = dataloc if isinstance(dataloc, (list, tuple)) else [dataloc]

    imglist = []
    tmpLabels = []
    for loc in locations:
        # Make sure the folder ends with a separator; an empty string is kept
        # as-is so that it refers to the current directory.
        if loc and not loc.endswith('/'):
            loc += '/'
        found = glob.glob(loc + '*.jpg')
        imglist += found
        # File names look like <label>_<anything>.jpg
        tmpLabels += [name[len(loc):].split("_")[0] for name in found]

    labels = np.array(tmpLabels)
    images = np.empty(len(imglist), dtype=object)

    # Load grayscaled images (flag 0 = grayscale)
    for i, img in enumerate(imglist):
        images[i] = cv2.imread(img, 0)

    # Load gaplines (lines separating letters) from txt files
    if loadGaplines:
        gaplines = np.empty(len(imglist), dtype=object)
        for i, name in enumerate(imglist):
            # 'xyz.jpg' -> 'xyz.txt'
            with open(name[:-3] + 'txt', 'r') as fp:
                gaplines[i] = np.array(simplejson.load(fp))

    print("-> Number of words:", len(labels))

    # Print one of the images (last one), if there is any
    if debug and len(images) > 0:
        implt(images[-1], 'gray', 'Example')
        print("Word:", labels[-1])
        if loadGaplines:
            print("Gaplines:", gaplines[-1])

    if loadGaplines:
        return (images, labels, gaplines)
    return (images, labels)
95

96

97
def words2chars(images, labels, gaplines, lang='cz'):
    """
    Transform word images with gaplines into individual chars
    Args:
        images: sequence of word images (2D arrays)
        labels: sequence of word strings, one per image
        gaplines: per word, the x positions separating the letters;
                  n positions produce n - 1 letter images
        lang: 'cz' keeps the characters as they are, any other value
              transliterates them with unidecode before the index lookup
    Returns:
        (imgs, newLabels) - object array of letter images and a list of
        their indices in CHARS; both always have the same length
    """
    crops = []
    newLabels = []

    # All words are cut to the height of the first image
    # (assumes the word images share one height - confirm against the data).
    height = images[0].shape[0] if len(images) > 0 else 0

    for i, gaps in enumerate(gaplines):
        for pos in range(len(gaps) - 1):
            crops.append(images[i][0:height, gaps[pos]:gaps[pos+1]])
            char = labels[i][pos]
            if lang != 'cz':
                char = unidecode.unidecode(char)
            newLabels.append(char2idx(char))

    # Size the output from what was actually cut, so that images and labels
    # stay aligned even when a word has fewer gaplines than letters.
    # Filled element by element: np.array() would merge equally shaped
    # crops into a single multi-dimensional array.
    imgs = np.empty(len(crops), dtype=object)
    for k, crop in enumerate(crops):
        imgs[k] = crop

    print("Loaded chars from words:", len(crops))
    return imgs, newLabels
119

120

121
def loadCharsData(charloc='data/charclas/', wordloc='data/words/', lang='cz'):
    """
    Load chars images with corresponding labels
    Args:
        charloc: char images FOLDER LOCATION; it must contain a <lang>/
                 subfolder with one folder per character ('' disables it)
        wordloc: word images with gaplines FOLDER LOCATION ('' disables it)
        lang: 'en' uses the empty label plus the 52 ASCII letters,
              anything else uses the full CHARS alphabet
    Returns:
        (images, labels)
        images: float array of shape (n, 4096) - flattened 64x64 letters
        labels: array of n indices into CHARS
    """
    print("Loading chars...")
    # Zero-row float seed: keeps the (n, 4096) float result even when
    # nothing is loaded, and lets everything be concatenated once at the end.
    chunks = [np.zeros((0, 4096))]
    labels = []

    if charloc != '':
        if not charloc.endswith('/'):
            charloc += '/'
        # Get subfolders with chars
        dirlist = glob.glob(charloc + lang + "/*/")
        dirlist.sort()

        # The English set is the leading part of CHARS:
        # '' + 26 upper case + 26 lower case letters = 53 entries.
        chars = CHARS[:53] if lang == 'en' else CHARS

        # Each folder is identified by the last character of its name;
        # a folder ending with '0' stands for the empty label.
        found = [d[-2] if d[-2] != '0' else '' for d in dirlist]
        assert found == chars, "char folders do not match the expected alphabet"

        # For every label load images and create corresponding labels
        # cv2.imread(img, 0) - for loading images in grayscale
        # Images are scaled to 64x64 = 4096 px
        for i, chardir in enumerate(dirlist):
            imglist = glob.glob(chardir + '*.jpg')
            imgs = np.array([letterNorm(cv2.imread(img, 0)) for img in imglist])
            chunks.append(imgs.reshape(len(imgs), 4096))
            labels.extend([i] * len(imgs))

    if wordloc != '':
        wordImgs, words, gaplines = loadWordsData(wordloc)
        charImgs, charLabels = words2chars(wordImgs, words, gaplines, lang)

        labels.extend(charLabels)
        total = len(charImgs)
        for i, charImg in enumerate(charImgs):
            printProgressBar(i, total)
            chunks.append(letterNorm(charImg).reshape(1, 4096))

    images = np.concatenate(chunks)
    labels = np.array(labels)

    print("-> Number of chars:", len(labels))
    return (images, labels)
170

171

172
def loadGapData(loc='data/gapdet/large/', slider=(60, 120), seq=False, flatten=True):
    """
    Load gap data from location with corresponding labels
    Args:
        loc: location of folder with words separated into gap data
             images have to be named as label_timestamp.jpg, label is 0 or 1
        slider: dimensions (height, width) of the output images; widths above
                120 are clamped to 120 and narrower ones are cut symmetrically
                out of the loaded images (the code assumes 120 px wide sources)
        seq: Store images from one word as a sequence
        flatten: Flatten the output images (only used when seq is True,
                 without seq the images are always flattened)
    Returns:
        (images, labels)
        seq=False: float array (n, height * width) and array of n labels
        seq=True: two object arrays with one entry per word folder, each
                  entry ordered by timestamp; folders without images keep None
    """
    print('Loading gap data...')
    if loc and not loc.endswith('/'):
        loc += '/'
    dirlist = glob.glob(loc + "*/")
    dirlist.sort()

    # TODO Implement for higher dimmensions
    # Clamp into local variables - slider may be a tuple and must not be
    # modified for the caller anyway.
    height, width = slider[0], min(slider[1], 120)

    # Split the surplus columns between both sides; for an odd surplus the
    # extra column is removed on the right. None means "no cut" in a slice.
    margin = 120 - width
    left = margin // 2
    right = margin - left
    cut_s = left if left > 0 else None
    cut_e = -right if right > 0 else None

    if seq:
        images = np.empty(len(dirlist), dtype=object)
        labels = np.empty(len(dirlist), dtype=object)

        for i, worddir in enumerate(dirlist):
            imglist = glob.glob(worddir + '*.jpg')
            if not imglist:
                # Empty directory - entry stays None
                continue
            # Order the frames of one word by the timestamp in the file name
            imglist.sort(key=lambda x: int(x[len(worddir):].split("_")[1][:-4]))
            crops = [cv2.imread(img, 0)[:, cut_s:cut_e] for img in imglist]
            if flatten:
                crops = [crop.flatten() for crop in crops]
            images[i] = np.array(crops)
            labels[i] = np.array([int(name[len(worddir):].split("_")[0])
                                  for name in imglist])

    else:
        # Zero-row float seed keeps the float result type and shape even
        # when no image is found; everything is concatenated once.
        chunks = [np.zeros((0, height * width))]
        labels = []

        for gapdir in dirlist:
            imglist = glob.glob(gapdir + '*.jpg')
            if imglist:
                imgs = np.array([cv2.imread(img, 0)[:, cut_s:cut_e] for img in imglist])
                chunks.append(imgs.reshape(len(imgs), height * width))
                # Label is the first character of the file name
                labels.extend(int(img[len(gapdir)]) for img in imglist)

        images = np.concatenate(chunks)
        labels = np.array(labels)

    if seq:
        print("-> Number of words / gaps and letters:",
              len(labels), '/', sum(len(l) for l in labels if l is not None))
    else:
        print("-> Number of gaps and letters:", len(labels))
    return (images, labels)
230

231

232
def correspondingShuffle(a):
    """
    Apply one common random permutation to several numpy arrays, so that
    elements sharing a position before the shuffle still share one after it
    Args:
        a: list of numpy arrays of equal length; the list is updated in place
    Returns:
        The same list a, now holding the shuffled arrays
    """
    assert all([len(arr) == len(a[0]) for arr in a])
    order = np.random.permutation(len(a[0]))
    for pos, arr in enumerate(a):
        a[pos] = arr[order]
    return a
246

247

248
def sequences_to_sparse(sequences):
    """
    Create a sparse representation of sequences.
    Args:
        sequences: a list of lists of type dtype where each element is a sequence
    Returns:
        A tuple with (indices, values, shape)
        indices: int64 array (total, 2) of (sequence number, position) pairs
        values: int32 array (total,) with all elements in order
        shape: int64 array [number of sequences, length of the longest one]
    """
    indices = []
    values = []
    max_len = 0

    for n, seq in enumerate(sequences):
        seq_len = len(seq)
        indices.extend((n, step) for step in range(seq_len))
        values.extend(seq)
        max_len = max(max_len, seq_len)

    # reshape keeps the two-column layout even when there are no elements,
    # where asarray alone would give a 1-D empty array.
    indices = np.asarray(indices, dtype=np.int64).reshape(-1, 2)
    values = np.asarray(values, dtype=np.int32)
    # The longest length is tracked in the loop instead of taking max() of
    # the indices, which fails when there are no elements at all.
    shape = np.asarray([len(sequences), max_len], dtype=np.int64)

    return indices, values, shape
268
Product

Resources

Company