Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Aniket025
GitHub Repository: Aniket025/Medical-Prescription-OCR
Path: blob/master/Model-3/ocr/datahelpers.py
426 views
1
# -*- coding: utf-8 -*-
2
"""
3
Helper functions for loading and creating datasets
4
"""
5
import numpy as np
6
import glob
7
import simplejson
8
import cv2
9
import unidecode
10
from .helpers import implt
11
from .normalization import letterNorm
12
from .viz import printProgressBar
13
14
15
# Alphabet used for character labels. Position 0 holds the empty string,
# followed by A-Z, a-z and then the accented Czech letters.
CHARS = [
    '',
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
    'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z',
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
    'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
    'Á', 'É', 'Í', 'Ó', 'Ú', 'Ý',
    'á', 'é', 'í', 'ó', 'ú', 'ý',
    'Č', 'č', 'Ď', 'ď', 'Ě', 'ě', 'Ň', 'ň', 'Ř', 'ř',
    'Š', 'š', 'Ť', 'ť', 'Ů', 'ů', 'Ž', 'ž',
]

# Every valid index into CHARS, plus lookup tables in both directions
idxs = list(range(len(CHARS)))
idx_to_chars = dict(enumerate(CHARS))
chars_to_idx = {char: idx for idx, char in enumerate(CHARS)}
28
29
def char2idx(c, sequence=False):
    """
    Translate a character into its index in CHARS
    Args:
        c: character to look up (KeyError if it is not in chars_to_idx)
        sequence: when True the returned index is shifted up by one
    Returns:
        Integer index of the character
    """
    shift = 1 if sequence else 0
    return chars_to_idx[c] + shift
33
34
def idx2char(idx, sequence=False):
    """
    Translate an index back into its character
    Args:
        idx: index to look up (KeyError if it is not in idx_to_chars)
        sequence: when True the index is shifted down by one before the lookup
    Returns:
        Character stored for the (possibly shifted) index
    """
    key = idx - 1 if sequence else idx
    return idx_to_chars[key]
38
39
40
def loadWordsData(dataloc='data/words/', loadGaplines=True, debug=False):
    """
    Load word images with corresponding labels and gaplines (if loadGaplines == True)

    Every ``*.jpg`` file directly inside a location is read as a grayscale
    image. Its label is the part of the file name before the first
    underscore. Gaplines are parsed as JSON from the ``.txt`` file that
    shares the image's name.

    Args:
        dataloc: image folder location - a single path or a list/tuple of paths
        loadGaplines: whether or not to load gapline position files
        debug: for printing example image
    Returns:
        (images, labels (, gaplines))
    """
    print("Loading words...")
    # Treat a single location as a one-element collection so that both input
    # forms share the same code path (tuples are accepted as well as lists).
    locations = dataloc if isinstance(dataloc, (list, tuple)) else [dataloc]

    imglist = []
    tmpLabels = []
    for loc in locations:
        if loc[-1] != '/':
            loc += '/'
        found = glob.glob(loc + '*.jpg')
        imglist += found
        # Strip the folder prefix, then keep the text before the first "_"
        tmpLabels += [name[len(loc):].split("_")[0] for name in found]

    labels = np.array(tmpLabels)
    images = np.empty(len(imglist), dtype=object)

    # Load grayscaled images
    for i, img in enumerate(imglist):
        images[i] = cv2.imread(img, 0)

    # Load gaplines (lines separating letters) from txt files
    gaplines = None
    if loadGaplines:
        gaplines = np.empty(len(imglist), dtype=object)
        for i, name in enumerate(imglist):
            # "<name>.jpg" -> "<name>.txt"
            with open(name[:-3] + 'txt', 'r') as fp:
                gaplines[i] = np.array(simplejson.load(fp))

    # Check the same length of labels and images
    if loadGaplines:
        assert len(labels) == len(images) == len(gaplines)
    else:
        assert len(labels) == len(images)
    print("-> Number of words:", len(labels))

    # Print one of the images (last one); nothing to show for an empty dataset
    if debug and len(images) > 0:
        implt(images[-1], 'gray', 'Example')
        print("Word:", labels[-1])
        if loadGaplines:
            print("Gaplines:", gaplines[-1])

    if loadGaplines:
        return (images, labels, gaplines)
    return (images, labels)
95
96
97
def words2chars(images, labels, gaplines, lang='cz'):
    """
    Transform word images with gaplines into individual chars

    Each pair of neighbouring gapline positions of a word delimits one
    character. The character image is the column range between the two
    positions, and its label is the character at the same position in the
    word label.

    Args:
        images: word images
        labels: word labels (one string per image)
        gaplines: gapline positions for every word
        lang: 'cz' keeps the characters as they are, any other value
              passes them through unidecode before the index lookup
    Returns:
        (imgs, newLabels) - object array of char images and list of char indices
    """
    # Total number of chars
    length = sum(len(l) for l in labels)

    imgs = np.empty(length, dtype=object)
    newLabels = []

    # The height of the first image is used as the row limit for every crop.
    # With no images at all there is nothing to crop, so any value works.
    height = images[0].shape[0] if len(images) > 0 else 0

    idx = 0
    for i, gaps in enumerate(gaplines):
        for pos in range(len(gaps) - 1):
            imgs[idx] = images[i][0:height, gaps[pos]:gaps[pos+1]]
            if lang == 'cz':
                newLabels.append(char2idx(labels[i][pos]))
            else:
                newLabels.append(char2idx(unidecode.unidecode(labels[i][pos])))
            idx += 1

    print("Loaded chars from words:", length)
    return imgs, newLabels
119
120
121
def loadCharsData(charloc='data/charclas/', wordloc='data/words/', lang='cz'):
    """
    Load chars images with corresponding labels

    Char images are read from ``<charloc>/<lang>/<subfolder>/*.jpg``. The
    sorted subfolders must line up with the alphabet: the last character of
    each folder name is the char it holds, with '0' standing for the empty
    char. Chars cut out of the word images in wordloc are appended after them.

    Args:
        charloc: char images FOLDER LOCATION ('' skips this source)
        wordloc: word images with gaplines FOLDER LOCATION ('' skips this source)
        lang: 'en' uses the first 53 entries of CHARS, any other value the
              whole of CHARS
    Returns:
        (images, labels) - images has one flattened 4096-value row per char
    """
    print("Loading chars...")
    # The all-zero seed row fixes the (n, 4096) shape and the float dtype even
    # when nothing gets loaded; it is dropped again before returning.
    # Rows are collected here and concatenated once at the end.
    chunks = [np.zeros((1, 4096))]
    labels = []

    if charloc != '':
        if charloc[-1] != '/':
            charloc += '/'
        # Get subfolders with chars
        dirlist = glob.glob(charloc + lang + "/*/")
        dirlist.sort()

        # English alphabet = '' + A-Z + a-z, i.e. the first 53 entries of CHARS
        chars = CHARS[:53] if lang == 'en' else CHARS

        # d[-2] is the last character of the folder name (d ends with a separator)
        assert [d[-2] if d[-2] != '0' else '' for d in dirlist] == chars

        # For every label load images and create corresponding labels
        # cv2.imread(img, 0) - for loading images in grayscale
        # Images are scaled to 64x64 = 4096 px
        for i in range(len(chars)):
            imglist = glob.glob(dirlist[i] + '*.jpg')
            imgs = np.array([letterNorm(cv2.imread(img, 0)) for img in imglist])
            chunks.append(imgs.reshape(len(imgs), 4096))
            labels.extend([i] * len(imgs))

    if wordloc != '':
        imgs, words, gaplines = loadWordsData(wordloc)
        imgs, chars = words2chars(imgs, words, gaplines, lang)

        labels.extend(chars)
        for i in range(len(imgs)):
            printProgressBar(i, len(imgs))
            chunks.append(letterNorm(imgs[i]).reshape(1, 4096))

    images = np.concatenate(chunks)[1:]
    labels = np.array(labels)

    print("-> Number of chars:", len(labels))
    return (images, labels)
170
171
172
def loadGapData(loc='data/gapdet/large/', slider=(60, 120), seq=False, flatten=True):
    """
    Load gap data from location with corresponding labels

    Args:
        loc: location of folder with words separated into gap data
             images have to be named as label_timestamp.jpg, label is 0 or 1
        slider: dimensions of output images as (height, width); a width above
                120 is treated as 120
        seq: Store images from one word as a sequence
        flatten: Flatten the output images (only used when seq is True)
    Returns:
        (images, labels)
        With seq=True both are object arrays with one entry per subfolder;
        the entry of a subfolder without images stays None.
    """
    print('Loading gap data...')
    loc += '/' if loc[-1] != '/' else ''
    dirlist = glob.glob(loc + "*/")
    dirlist.sort()

    # TODO Implement for higher dimensions
    # Work on local copies: slider may be an immutable tuple and must not be
    # modified for the caller either.
    height = slider[0]
    width = min(slider[1], 120)

    # Columns to drop so that a 120 px wide image shrinks to `width`; an odd
    # margin removes the extra column on the right-hand side.
    margin = 120 - width
    cut_s = margin // 2 if margin > 0 else None
    cut_e = -(margin - margin // 2) if margin > 0 else None

    if seq:
        images = np.empty(len(dirlist), dtype=object)
        labels = np.empty(len(dirlist), dtype=object)

        for i, worddir in enumerate(dirlist):
            imglist = glob.glob(worddir + '*.jpg')
            if not imglist:
                # Empty directory: leave the None placeholder in place
                continue
            # Order the slices of one word by the number after the underscore
            imglist.sort(key=lambda x: int(x[len(worddir):].split("_")[1][:-4]))
            crops = [cv2.imread(img, 0)[:, cut_s:cut_e] for img in imglist]
            if flatten:
                crops = [crop.flatten() for crop in crops]
            images[i] = np.array(crops)
            labels[i] = np.array([int(name[len(worddir):].split("_")[0])
                                  for name in imglist])

    else:
        # The seed row fixes shape and dtype of the result; dropped below.
        chunks = [np.zeros((1, height * width))]
        labels = []

        for worddir in dirlist:
            imglist = glob.glob(worddir + '*.jpg')
            if imglist:
                imgs = np.array([cv2.imread(img, 0)[:, cut_s:cut_e] for img in imglist])
                chunks.append(imgs.reshape(len(imgs), height * width))
                # Label is the first character of the file name
                labels.extend([int(img[len(worddir)]) for img in imglist])

        images = np.concatenate(chunks)[1:]
        labels = np.array(labels)

    if seq:
        print("-> Number of words / gaps and letters:",
              len(labels), '/', sum(len(l) for l in labels if l is not None))
    else:
        print("-> Number of gaps and letters:", len(labels))
    return (images, labels)
230
231
232
def correspondingShuffle(a):
    """
    Apply one random permutation to every array in a, so that
    elements sharing an index before the shuffle still share one after it
    Args:
        a: array of same length numpy arrays (modified in place)
    Returns:
        The same container a, now holding the shuffled arrays
    """
    expected = len(a[0])
    assert all([len(arr) == expected for arr in a])
    order = np.random.permutation(expected)
    for pos, arr in enumerate(a):
        a[pos] = arr[order]
    return a
246
247
248
def sequences_to_sparse(sequences):
    """
    Create a sparse representation of sequences.
    Args:
        sequences: a list of lists of type dtype where each element is a sequence
    Returns:
        A tuple with (indices, values, shape)
        indices: int64 array of (sequence number, position) pairs, shape (n, 2)
        values: int32 array with the n sequence elements in the same order
        shape: int64 array [number of sequences, length of the longest sequence]
    """
    indices = []
    values = []

    for n, seq in enumerate(sequences):
        indices.extend(zip([n] * len(seq), range(len(seq))))
        values.extend(seq)

    # reshape keeps the (n, 2) layout even when there are no elements at all
    indices = np.asarray(indices, dtype=np.int64).reshape(-1, 2)
    values = np.asarray(values, dtype=np.int32)
    # Largest position index + 1 is the longest sequence length (0 if none)
    longest = max((len(seq) for seq in sequences), default=0)
    shape = np.asarray([len(sequences), longest], dtype=np.int64)

    return indices, values, shape
268