Path: blob/master/Model-3/ocr/datahelpers.py
426 views
# -*- coding: utf-8 -*-
"""
Helper functions for loading and creating datasets
"""
import numpy as np
import glob
import simplejson
import cv2
import unidecode
from .helpers import implt
from .normalization import letterNorm
from .viz import printProgressBar


# Index 0 is the empty string, followed by A-Z, a-z and the Czech accented
# letters (the list is in code-point order, which matches sorted folder names).
CHARS = ['', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I',
         'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S',
         'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c',
         'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
         'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w',
         'x', 'y', 'z', 'Á', 'É', 'Í', 'Ó', 'Ú', 'Ý', 'á',
         'é', 'í', 'ó', 'ú', 'ý', 'Č', 'č', 'Ď', 'ď', 'Ě',
         'ě', 'Ň', 'ň', 'Ř', 'ř', 'Š', 'š', 'Ť', 'ť', 'Ů',
         'ů', 'Ž', 'ž']

# Alphabets checked by loadCharsData.  These names were referenced there but
# never defined, so the function failed with NameError.
# NOTE(review): the English set is assumed to be the unaccented prefix of
# CHARS ('' + A-Z + a-z) -- confirm against the data/charclas/en folders.
CHARS_CZ = CHARS
CHARS_EN = CHARS[:53]

# Length of one flattened, normalised letter image (64 x 64 px).
_CHAR_PIXELS = 64 * 64
# Width (px) the stored gap-detection images are assumed to have.
_GAP_IMG_WIDTH = 120

idxs = list(range(len(CHARS)))
idx_to_chars = dict(zip(idxs, CHARS))
chars_to_idx = dict(zip(CHARS, idxs))


def char2idx(c, sequence=False):
    """
    Return the index of character c in CHARS.
    Args:
        c: single character contained in CHARS
        sequence: if True, the returned index is shifted up by one
    Returns:
        Integer index; raises KeyError for characters outside CHARS
    """
    if sequence:
        return chars_to_idx[c] + 1
    return chars_to_idx[c]


def idx2char(idx, sequence=False):
    """
    Return the character stored at index idx (inverse of char2idx).
    Args:
        idx: integer index
        sequence: if True, idx is treated as shifted up by one
    Returns:
        Character from CHARS; raises KeyError for unknown indexes
    """
    if sequence:
        return idx_to_chars[idx-1]
    return idx_to_chars[idx]


def loadWordsData(dataloc='data/words/', loadGaplines=True, debug=False):
    """
    Load word images with corresponding labels and gaplines (if loadGaplines == True)
    Args:
        dataloc: image folder location - can be a list of multiple locations
        loadGaplines: whether or not to load the gapline position files
        debug: for printing an example image
    Returns:
        (images, labels (, gaplines))
    """
    print("Loading words...")
    locations = dataloc if isinstance(dataloc, (list, tuple)) else [dataloc]

    imglist = []
    tmpLabels = []
    for loc in locations:
        # Make sure the folder ends with a separator; an empty string is left
        # alone (it used to raise IndexError) and means the current directory.
        if loc and not loc.endswith('/'):
            loc += '/'
        found = glob.glob(loc + '*.jpg')
        imglist += found
        # The label is the part of the file name before the first underscore
        tmpLabels += [name[len(loc):].split("_")[0] for name in found]

    labels = np.array(tmpLabels)

    # Load grayscaled images; an object array is used because the images
    # do not have to share one shape
    images = np.empty(len(imglist), dtype=object)
    for i, img in enumerate(imglist):
        images[i] = cv2.imread(img, 0)

    # Load gaplines (lines separating letters) from txt files which share
    # the image name, only with a 'txt' extension
    gaplines = None
    if loadGaplines:
        gaplines = np.empty(len(imglist), dtype=object)
        for i, name in enumerate(imglist):
            with open(name[:-3] + 'txt', 'r') as fp:
                gaplines[i] = np.array(simplejson.load(fp))

    print("-> Number of words:", len(labels))

    # Print one of the images (last one); skipped when nothing was loaded
    if debug and len(images) > 0:
        implt(images[-1], 'gray', 'Example')
        print("Word:", labels[-1])
        if loadGaplines:
            print("Gaplines:", gaplines[-1])

    if loadGaplines:
        return (images, labels, gaplines)
    return (images, labels)


def words2chars(images, labels, gaplines, lang='cz'):
    """
    Transform word images with gaplines into individual chars
    Args:
        images: word images (2D arrays)
        labels: word labels, one string per image
        gaplines: for every word the x-positions separating its letters
        lang: 'cz' keeps the letters as they are, any other value
              transliterates every letter with unidecode first
    Returns:
        (imgs, newLabels) - object array of letter images and list of
        the corresponding CHARS indexes
    """
    # Total number of chars
    length = sum(len(l) for l in labels)

    imgs = np.empty(length, dtype=object)
    newLabels = []

    # All letters are cut using the height of the first image; nothing is
    # cut when there are no images at all
    height = images[0].shape[0] if len(images) > 0 else 0

    if lang == 'cz':
        def toLabel(c):
            return char2idx(c)
    else:
        def toLabel(c):
            return char2idx(unidecode.unidecode(c))

    idx = 0
    for i, gaps in enumerate(gaplines):
        # Each pair of neighbouring gaplines bounds one letter
        for pos in range(len(gaps) - 1):
            imgs[idx] = images[i][0:height, gaps[pos]:gaps[pos+1]]
            newLabels.append(toLabel(labels[i][pos]))
            idx += 1

    print("Loaded chars from words:", length)
    return imgs, newLabels


def loadCharsData(charloc='data/charclas/', wordloc='data/words/', lang='cz'):
    """
    Load chars images with corresponding labels
    Args:
        charloc: char images FOLDER LOCATION ('' skips this source)
        wordloc: word images with gaplines FOLDER LOCATION ('' skips this source)
        lang: name of the charloc subfolder, 'en' selects CHARS_EN,
              anything else CHARS_CZ
    Returns:
        (images, labels) - images is an array with one flattened
        64x64 image (4096 values) per row
    """
    print("Loading chars...")
    # All parts are joined by a single concatenate at the end (the repeated
    # concatenation in a loop was quadratic).  The empty float array keeps
    # the floating-point dtype the result always had.
    parts = [np.zeros((0, _CHAR_PIXELS))]
    labels = []

    if charloc != '':
        # Get subfolders with chars
        dirlist = sorted(glob.glob(charloc + lang + "/*/"))

        chars = CHARS_EN if lang == 'en' else CHARS_CZ

        # Folder names are the chars themselves, folder '0' stands for ''
        assert [d[-2] if d[-2] != '0' else '' for d in dirlist] == chars, \
            "Char folders do not match the expected alphabet"

        # For every label load images and create corresponding labels
        # cv2.imread(img, 0) - for loading images in grayscale
        # Images are scaled to 64x64 = 4096 px
        for i, folder in enumerate(dirlist):
            imglist = glob.glob(folder + '*.jpg')
            imgs = np.array([letterNorm(cv2.imread(img, 0)) for img in imglist])
            parts.append(imgs.reshape(len(imgs), _CHAR_PIXELS))
            labels.extend([i] * len(imgs))

    if wordloc != '':
        imgs, words, gaplines = loadWordsData(wordloc)
        imgs, chars = words2chars(imgs, words, gaplines, lang)

        labels.extend(chars)
        for i in range(len(imgs)):
            printProgressBar(i, len(imgs))
            parts.append(letterNorm(imgs[i]).reshape(1, _CHAR_PIXELS))

    images = np.concatenate(parts)
    labels = np.array(labels)

    print("-> Number of chars:", len(labels))
    return (images, labels)


def loadGapData(loc='data/gapdet/large/', slider=(60, 120), seq=False, flatten=True):
    """
    Load gap data from location with corresponding labels
    Args:
        loc: location of folder with words separated into gap data
             images have to be named as label_timestamp.jpg, label is 0 or 1
        slider: dimensions of output images (height, width); the width
                is limited to 120
        seq: Store images from one word as a sequence
        flatten: Flatten the output images (used only if seq == True,
                 otherwise the images are always flattened)
    Returns:
        (images, labels)
    """
    print('Loading gap data...')
    if loc and not loc.endswith('/'):
        loc += '/'
    dirlist = sorted(glob.glob(loc + "*/"))

    # TODO Implement for higher dimensions
    # The width is clamped into a local; assigning into `slider` failed
    # with TypeError because the default value is a tuple.
    height = slider[0]
    width = min(slider[1], _GAP_IMG_WIDTH)

    # Cut the same margin from both sides; for an odd margin the extra
    # column is removed on the right
    margin = _GAP_IMG_WIDTH - width
    if margin > 0:
        cut_s = margin // 2
        cut_e = cut_s + width
    else:
        cut_s = cut_e = None

    if seq:
        images = np.empty(len(dirlist), dtype=object)
        labels = np.empty(len(dirlist), dtype=object)

        for i, folder in enumerate(dirlist):
            imglist = glob.glob(folder + '*.jpg')
            # TODO Entries of empty directories stay None
            if not imglist:
                continue
            # Order the images of one word by the timestamp in their names
            # (the sorted list used to be computed and then ignored)
            imglist.sort(key=lambda x: int(x[len(folder):].split("_")[1][:-4]))
            crops = [cv2.imread(img, 0)[:, cut_s:cut_e] for img in imglist]
            if flatten:
                crops = [crop.flatten() for crop in crops]
            images[i] = np.array(crops)
            labels[i] = np.array([int(name[len(folder):].split("_")[0])
                                  for name in imglist])

    else:
        parts = [np.zeros((0, height * width))]
        labels = []

        for folder in dirlist:
            imglist = glob.glob(folder + '*.jpg')
            if imglist:
                imgs = np.array([cv2.imread(img, 0)[:, cut_s:cut_e] for img in imglist])
                parts.append(imgs.reshape(len(imgs), height * width))
                # The label is the first character of the file name
                labels.extend(int(img[len(folder)]) for img in imglist)

        images = np.concatenate(parts)
        labels = np.array(labels)

    if seq:
        # Words from empty directories (None entries) are not counted
        print("-> Number of words / gaps and letters:",
              len(labels), '/', sum(len(l) for l in labels if l is not None))
    else:
        print("-> Number of gaps and letters:", len(labels))
    return (images, labels)


def correspondingShuffle(a):
    """
    Shuffle array of numpy arrays such that
    each pair a[x][i] and a[y][i] remains the same
    Args:
        a: array of same length numpy arrays (modified in place)
    Returns:
        Array a with shuffled numpy arrays
    """
    # Nothing to shuffle (a[0] used to raise IndexError here)
    if len(a) == 0:
        return a

    assert all(len(a[0]) == len(arr) for arr in a)
    # One permutation shared by all arrays keeps the elements aligned
    p = np.random.permutation(len(a[0]))
    for i in range(len(a)):
        a[i] = a[i][p]
    return a


def sequences_to_sparse(sequences):
    """
    Create a sparse representation of sequences.
    Args:
        sequences: a list of lists of type dtype where each element is a sequence
    Returns:
        A tuple with (indices, values, shape)
    """
    indices = []
    values = []

    for n, seq in enumerate(sequences):
        indices.extend(zip([n]*len(seq), range(len(seq))))
        values.extend(seq)

    # reshape keeps the (N, 2) layout even when there are no values at all
    indices = np.asarray(indices, dtype=np.int64).reshape(-1, 2)
    values = np.asarray(values, dtype=np.int32)
    # The dense width is the length of the longest sequence (0 if there is
    # none); taking the max of empty indices used to raise an error
    longest = max((len(seq) for seq in sequences), default=0)
    shape = np.asarray([len(sequences), longest], dtype=np.int64)

    return indices, values, shape