Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Aniket025
GitHub Repository: Aniket025/Medical-Prescription-OCR
Path: blob/master/Model-1/WordClassifier-CTC.ipynb
427 views
Kernel: Python 3

Connectionist temporal classification - Word Classification

Implemented in Tensorflow

TODO

Blank labels? Indexes? Add proper accuracy. Add border to word images.
import numpy as np import pandas as pd import cv2 import matplotlib as plt import tensorflow as tf import tensorflow.contrib.seq2seq as seq2seq from tensorflow.python.layers import core as layers_core import time import math import unidecode from ocr.datahelpers import loadWordsData, correspondingShuffle, char2idx, sequences_to_sparse from ocr.helpers import extendImg from ocr.mlhelpers import TrainingPlot from ocr.tfhelpers import Graph, create_cell from ocr.imgtransform import coordinates_remap from ocr.normalization import imageNorm %matplotlib notebook # Increase size of images plt.rcParams['figure.figsize'] = (9.0, 5.0) tf.reset_default_graph() sess = tf.InteractiveSession() print('Tensorflow', tf.__version__)
Tensorflow 1.4.0

Loading images

# Dataset language; for 'en' the labels are transliterated to plain ASCII.
LANG = 'en'

# Load the word images together with their text labels
# (loadGaplines=False: gap-line data is not requested).
images, labels = loadWordsData(['data/words2/'], loadGaplines=False)

if LANG == 'en':
    # Replace every label in place by its unidecode (ASCII) transliteration.
    for position, word in enumerate(labels):
        labels[position] = unidecode.unidecode(word)
Loading words... -> Number of words: 5069

Settings

# ---------------------------------------------------------------------------
# Vocabulary
# ---------------------------------------------------------------------------
char_size = 52                      # Number of different characters
PAD = 0                             # Padding
vocab_size = char_size + 2          # Characters + <PAD> and <EOS>

# ---------------------------------------------------------------------------
# Data augmentation / preprocessing
# ---------------------------------------------------------------------------
num_new_images = 2                  # Number of new images per image
fac_alpha = 2.0                     # Factors for image preprocessing
fac_sigma = 0.08

# ---------------------------------------------------------------------------
# Sliding window and bucketing
# ---------------------------------------------------------------------------
num_buckets = 5
slider_size = (60, 60)
step_size = 10
N_INPUT = slider_size[0] * slider_size[1]   # Length of one flattened window

# ---------------------------------------------------------------------------
# Network
# ---------------------------------------------------------------------------
layers = 2
residual_layers = 1                 # HAVE TO be smaller than layers
units = 256
num_hidden = 2 * units              # Forward + backward RNN outputs concatenated

# ---------------------------------------------------------------------------
# Optimisation
# ---------------------------------------------------------------------------
learning_rate = 1e-4                # 1e-4
max_gradient_norm = 5.0             # For gradient clipping
dropout = 0.4
train_per = 0.8                     # Percentage of training data

# ---------------------------------------------------------------------------
# Training schedule (all values are counted in batches)
# ---------------------------------------------------------------------------
TRAIN_STEPS = 100000                # Number of training steps!
TEST_ITER = 150
LOSS_ITER = 50
SAVE_ITER = 2000
BATCH_SIZE = 25
EPOCH = 2000                        # Number of batches in epoch - not accurate

# Checkpoint prefix; depends on the dataset language.
save_location = 'models/word-clas/{}/CTC/Classifier2'.format(LANG)

Dataset

# Shuffle images and labels together so the later split is random.
images, labels = correspondingShuffle([images, labels])

# Add a constant black border of half the slider width to the left and right
# of every word image (top and bottom are left untouched).
side_border = slider_size[1] // 2
for position, word_img in enumerate(images):
    images[position] = cv2.copyMakeBorder(
        word_img,
        0, 0, side_border, side_border,
        cv2.BORDER_CONSTANT,
        value=[0, 0, 0])

# Encode every label as a list of character indexes.
labels_idx = np.empty(len(labels), dtype=object)
for position, word in enumerate(labels):
    labels_idx[position] = [char2idx(char) for char in word]  # char2idx(c, True)-2

# Split the data into a train part and a test part.
div = int(train_per * len(images))

trainImages, testImages = images[:div], images[div:]
trainLabels_idx, testLabels_idx = labels_idx[:div], labels_idx[div:]

print("Training images:", div)
print("Testing images:", len(images) - div)
Training images: 4055 Testing images: 1014
# Augment the training set only, so train and test images are never mixed.
# Every original image is followed by `num_new_images` remapped copies that
# share the label of the original.
copies_per_image = num_new_images + 1
augmented_count = len(trainImages) * copies_per_image

trainImagesFinal = np.empty(augmented_count, dtype=object)
trainLabelsFinal_idx = np.empty(augmented_count, dtype=object)

for source_pos, source_img in enumerate(trainImages):
    first_slot = source_pos * copies_per_image

    # Slot 0 of each group holds the untouched original.
    trainImagesFinal[first_slot] = source_img
    trainLabelsFinal_idx[first_slot] = trainLabels_idx[source_pos]

    # Remaining slots hold the transformed variants.
    for offset in range(1, copies_per_image):
        trainImagesFinal[first_slot + offset] = coordinates_remap(
            source_img, fac_alpha, fac_sigma)
        trainLabelsFinal_idx[first_slot + offset] = trainLabels_idx[source_pos]

print("Transformed train images", len(trainImagesFinal))
Transformed train images 12165
class BucketDataIterator():
    """Bucketed batch iterator feeding the model with sliding-window sequences.

    Every image is cut into a sequence of flattened windows that are
    ``slider[1]`` pixels wide and taken every ``slider_step`` pixels.  The
    samples are sorted by sequence length and split into ``num_buckets``
    buckets, so one batch only contains sequences of similar length.
    """

    def __init__(self, images, targets, num_buckets=5, slider=(60, 30),
                 slider_step=2, train=True):
        """Prepare the window sequences and distribute them into buckets.

        Args:
            images: indexable collection of image arrays.  It is MODIFIED IN
                PLACE: every image is replaced by its width-extended version.
            targets: label sequences, one per image.
            num_buckets: number of length buckets.
            slider: (height, width) of the sliding window; only the width
                ``slider[1]`` is used in this class.
            slider_step: horizontal distance between two windows.
            train: when True, ``next_feed`` applies dropout.
        """
        self.train = train
        self.slider = slider

        # First PADDING of images to slider size: -(-a // b) == ceil(a/b)
        # NOTE(review): the minimum width 60 is hard-coded and does not follow
        # slider[1]; a window wider than 60 could give empty sequences.
        for i in range(len(images)):
            height, width = images[i].shape[0], images[i].shape[1]
            padded_width = max(-(-width // slider_step) * slider_step, 60)
            images[i] = extendImg(images[i], (height, padded_width))

        # Number of windows taken from each image.
        # NOTE(review): (w + 1 - slider)//step looks one lower than the number
        # of windows that would fit; kept as is because changing it alters the
        # sequence lengths fed to the model - confirm.
        in_length = [(image.shape[1] + 1 - slider[1]) // slider_step
                     for image in images]

        # Split images to sequence of vectors
        imgseq = np.empty(len(images), dtype=object)
        for i, img in enumerate(images):
            imgseq[i] = [
                img[:, loc * slider_step: loc * slider_step + slider[1]].flatten()
                for loc in range(in_length[i])]

        # Create pandas dataFrame and sort it by images width (length)
        self.dataFrame = pd.DataFrame({
            'in_length': in_length,
            'images': imgseq,
            'targets': targets
        }).sort_values('in_length').reset_index(drop=True)

        bsize = len(images) // num_buckets
        self.num_buckets = num_buckets

        # Create buckets by slicing parts by indexes; the last bucket also
        # receives the remainder of the division.
        self.buckets = []
        for bucket in range(num_buckets - 1):
            self.buckets.append(
                self.dataFrame.iloc[bucket * bsize: (bucket + 1) * bsize])
        self.buckets.append(self.dataFrame.iloc[(num_buckets - 1) * bsize:])

        self.buckets_size = [len(bucket) for bucket in self.buckets]

        # cursor[i] will be the cursor for the ith bucket
        self.cursor = np.array([0] * num_buckets)
        self.bucket_order = np.random.permutation(num_buckets)
        self.bucket_cursor = 0
        self.shuffle()
        print("Iterator created.")

    def shuffle(self, idx=None):
        """Shuffle bucket ``idx`` (or every bucket when ``idx`` is None).

        The cursor of each shuffled bucket is reset to 0.
        """
        for i in [idx] if idx is not None else range(self.num_buckets):
            self.buckets[i] = self.buckets[i].sample(frac=1).reset_index(drop=True)
            self.cursor[i] = 0

    def next_batch(self, batch_size):
        """Create the next batch of (at most) ``batch_size`` samples.

        Buckets are visited in a random order that is re-drawn after every
        full round.  A bucket is reshuffled (cursor reset) when it cannot
        provide a whole batch any more.

        Returns:
            A 3-tuple ``(input_seq, targets, in_length)``:
            input_seq - float32 array, zero padded, with axes swapped to
                (longest sequence in batch, batch, N_INPUT);
            targets - result of ``sequences_to_sparse`` on the batch targets;
            in_length - array with the sequence length of every sample.
        """
        i_bucket = self.bucket_order[self.bucket_cursor]

        # Increment cursor and shuffle in case of new round
        self.bucket_cursor = (self.bucket_cursor + 1) % self.num_buckets
        if self.bucket_cursor == 0:
            self.bucket_order = np.random.permutation(self.num_buckets)

        if self.cursor[i_bucket] + batch_size > self.buckets_size[i_bucket]:
            self.shuffle(i_bucket)

        # Handle too big batch sizes
        if batch_size > self.buckets_size[i_bucket]:
            batch_size = self.buckets_size[i_bucket]

        start = self.cursor[i_bucket]
        res = self.buckets[i_bucket].iloc[start: start + batch_size]
        self.cursor[i_bucket] += batch_size

        lengths = res['in_length'].values

        # PAD input sequence and output
        input_max = max(res['in_length'])
        input_seq = np.zeros((batch_size, input_max, N_INPUT), dtype=np.float32)
        for i, img in enumerate(res['images']):
            try:
                input_seq[i][:lengths[i]] = img
            except ValueError:
                # Shape mismatch between the windows and the padded buffer:
                # report the sample and leave its row zero-filled.  Only
                # ValueError is caught so that KeyboardInterrupt (used by the
                # training loop to save the model) is never swallowed here.
                print(i, ":", lengths[i])
                print(img)

        # Batch-major -> time-major
        input_seq = input_seq.swapaxes(0, 1)

        targets = sequences_to_sparse(res['targets'].values)
        return input_seq, targets, lengths

    def next_feed(self, size):
        """Create a feed dict for the model from the next batch.

        Dropout is only active (keep_prob < 1) for a training iterator.
        """
        (inputs_, targets_, inputs_length_) = self.next_batch(size)
        return {
            inputs: inputs_,
            inputs_length: inputs_length_,
            targets: targets_,
            keep_prob: (1.0 - dropout) if self.train else 1.0
        }
# Build the feeding iterators for the RNN.
# Run this only once: the iterator modifies the image arrays it receives.
train_iterator = BucketDataIterator(
    images=trainImagesFinal,
    targets=trainLabelsFinal_idx,
    num_buckets=num_buckets,
    slider=slider_size,
    slider_step=step_size,
    train=True)

test_iterator = BucketDataIterator(
    images=testImages,
    targets=testLabels_idx,
    num_buckets=num_buckets,
    slider=slider_size,
    slider_step=step_size,
    train=False)
Iterator created. Iterator created.

Placeholders

# Input placeholders
# N_INPUT -> size of the vector representing one window of an image
# Inputs are time major: (max_seq_length, batch_size, vec_size)
inputs = tf.placeholder(
    dtype=tf.float32,
    shape=(None, None, N_INPUT),
    name='inputs')

# Sequence length of every sample in the batch
inputs_length = tf.placeholder(
    dtype=tf.int32,
    shape=(None,),
    name='inputs_length')

# Sparse labels: required for training, not required for testing and application
targets = tf.sparse_placeholder(dtype=tf.int32, name='targets')

# Dropout keep probability
keep_prob = tf.placeholder(dtype=tf.float32, name='keep_prob')

Graph

# Forward and backward RNN cells are built with identical settings;
# dropout is controlled at run time through the keep_prob placeholder.
_cell_options = dict(is_dropout=True, keep_prob=keep_prob)

enc_cell_fw = create_cell(units, layers, residual_layers, **_cell_options)
enc_cell_bw = create_cell(units, layers, residual_layers, **_cell_options)

CNN

### CNN ###
SCALE = 0.01  # L2 regularisation scale (0.1 was tried before)


# --- Helpers for creating variables and layers ------------------------------

def weights(name, shape):
    """Create a Xavier-initialised, L2-regularised variable called `name`."""
    xavier = tf.contrib.layers.xavier_initializer()
    l2_penalty = tf.contrib.layers.l2_regularizer(scale=SCALE)
    return tf.get_variable(name,
                           shape=shape,
                           initializer=xavier,
                           regularizer=l2_penalty)


def bias(const, shape, name=None):
    """Create a variable of the given shape filled with `const`."""
    initial_value = tf.constant(const, shape=shape)
    return tf.Variable(initial_value, name=name)


def conv2d(x, W, name=None):
    """2D convolution, stride 1, 'SAME' padding."""
    return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME', name=name)


def conv2d2(x, W, name=None):
    """2D convolution, stride 2 in both spatial dimensions, 'SAME' padding."""
    return tf.nn.conv2d(x, W, strides=[1, 2, 2, 1], padding='SAME', name=name)


def max_pool_2x2(x, name=None):
    """2x2 max pooling with stride 2, 'SAME' padding."""
    return tf.nn.max_pool(x,
                          ksize=[1, 2, 2, 1],
                          strides=[1, 2, 2, 1],
                          padding='SAME',
                          name=name)


def inception2d(x, in_channels, filter_count, var_dict):
    """Inception-style block: 1x1, 3x3 and 5x5 convolutions plus pooling.

    The four branches are concatenated along the channel axis and passed
    through ReLU.  `in_channels` and `filter_count` are not read here; the
    actual sizes come from the variables stored in `var_dict`.
    """
    # Convolution branches, in order: 1x1, 3x3, 5x5
    branches = [conv2d(x, var_dict[size + '_filter']) + var_dict[size + '_bias']
                for size in ('one', 'three', 'five')]
    # Pooling branch: 3x3 MAX pooling with stride 1 (keeps the spatial size)
    branches.append(tf.nn.max_pool(x,
                                   ksize=[1, 3, 3, 1],
                                   strides=[1, 1, 1, 1],
                                   padding='SAME'))
    # Concat in the 4th dim to stack
    stacked = tf.concat(branches, axis=3)
    return tf.nn.relu(stacked)


def _inception_variables(suffix, in_channels, filter_count):
    """Create filter and bias variables for one inception block.

    Variables are created in the order filter, bias for the 1x1, 3x3 and 5x5
    branch; filters are named '<size>_filter<suffix>', biases are unnamed.
    """
    variables = {}
    for size, kernel in (('one', 1), ('three', 3), ('five', 5)):
        variables[size + '_filter'] = weights(
            size + '_filter' + suffix,
            shape=[kernel, kernel, in_channels, filter_count])
        variables[size + '_bias'] = bias(0.2, shape=[filter_count])
    return variables


# --- Variables --------------------------------------------------------------

in_channels1 = 4
filter_count1 = 12
# Output channels of the first inception block:
# three conv branches of filter_count1 plus the pooled input (3*12 + 4)
in_channels2 = 40
filter_count2 = 20

W_conv1 = weights('W_conv1', shape=[16, 16, 1, 4])
b_conv1 = bias(0.1, shape=[4], name='b_conv1')

# NOTE(review): these two variables are created under the names 'W_fc2' and
# 'b_fc2' and are not used by CNN() below - confirm whether they are needed.
W_fc1 = weights('W_fc2', shape=[8*8*100, char_size])
b_fc1 = bias(0.1, shape=[char_size], name='b_fc2')

var_dict1 = _inception_variables('1', in_channels1, filter_count1)
var_dict2 = _inception_variables('2', in_channels2, filter_count2)


# --- Network ----------------------------------------------------------------

def CNN(x):
    """Extract a feature map from one slider window.

    The input is standardised, reshaped to a single-image batch of
    slider_size with one channel and sent through: strided convolution,
    max pool, two inception blocks and a final max pool.
    """
    standardized = tf.image.per_image_standardization(x)
    img = tf.reshape(standardized, [1, slider_size[0], slider_size[1], 1])

    # 1. Layer - Convolution (stride 2) + ReLU
    h_conv1 = tf.nn.relu(conv2d2(img, W_conv1) + b_conv1, name='h_conv1')
    # 2. Layer - Max Pool
    h_pool1 = max_pool_2x2(h_conv1, name='h_pool1')
    # 3. Inception
    incept1 = inception2d(h_pool1, in_channels1, filter_count1, var_dict1)
    # 4. Inception
    incept2 = inception2d(incept1, in_channels2, filter_count2, var_dict2)
    # 5. Layer - Max Pool
    return max_pool_2x2(incept2)
# --- CNN over every input window --------------------------------------------

def _window_features(window):
    """Send one flattened window through the CNN and flatten the features."""
    window_img = tf.reshape(window, [slider_size[0], slider_size[1], 1])
    return tf.reshape(CNN(window_img), [-1])


def _slice_features(windows):
    """Apply the CNN to every window along the first axis of `windows`."""
    return tf.map_fn(_window_features, windows)


# Outer map walks over the first axis of `inputs`, inner map over the second.
processed_inputs = tf.map_fn(_slice_features, inputs, dtype=tf.float32)

# --- Bidirectional RNN ------------------------------------------------------
# Forward and backward outputs are returned separately ...
bi_outputs, _ = tf.nn.bidirectional_dynamic_rnn(
    cell_fw=enc_cell_fw,
    cell_bw=enc_cell_bw,
    inputs=processed_inputs,
    sequence_length=inputs_length,
    dtype=tf.float32,
    time_major=True)
# ... and joined along the feature axis.
con_outputs = tf.concat(bi_outputs, -1)

# --- Projection to the vocabulary -------------------------------------------
max_timesteps, batch_size, _ = tf.unstack(tf.shape(inputs))

W = weights('W', shape=[num_hidden, vocab_size])
b = bias(0., shape=[vocab_size], name='b')

# Flatten the first two axes, apply one shared linear layer, then restore
# the (time, batch, vocab) layout.
outputs = tf.reshape(con_outputs, [-1, num_hidden])
flat_logits = tf.matmul(outputs, W) + b
logits = tf.reshape(flat_logits, [-1, batch_size, vocab_size])

# --- Decoding ---------------------------------------------------------------
# Or tf.nn.ctc_greedy_decoder (should be faster, but beam_width=1)
decoded, log_prob = tf.nn.ctc_beam_search_decoder(logits, inputs_length)
word_prediction = tf.sparse_tensor_to_dense(decoded[0], name='word_prediction')
# Smoke test of the graph: initialise the variables, decode one small test
# batch and show the shape of the dense prediction.
init_op = tf.global_variables_initializer()
sess.run(init_op)

fd = test_iterator.next_feed(5)
sample_prediction = word_prediction.eval(feed_dict=fd)
print(sample_prediction.shape)
(5, 9)

Optimizer

# --- Loss -------------------------------------------------------------------
ctc_loss = tf.nn.ctc_loss(labels=targets,
                          inputs=logits,
                          sequence_length=inputs_length)

# L2 penalties registered by the regularised variables of the graph
regularization = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)

mean_ctc_loss = tf.reduce_mean(ctc_loss)
loss = tf.identity(mean_ctc_loss + sum(regularization), name='loss')

### Optimization
# NOTE(review): max_gradient_norm is defined in the settings but no gradient
# clipping is applied here - confirm whether that is intended.
# optimizer = tf.train.MomentumOptimizer(learning_rate, 0.9)
optimizer = tf.train.AdamOptimizer(learning_rate)
train_step = optimizer.minimize(loss, name='train_step')

# Dense version of the labels, used for printing expected words
test_targets = tf.sparse_tensor_to_dense(targets)

# TODO CHANGE TO ACCURACY
# It is label error rate not accuracy: mean edit distance between the
# decoded sequences and the targets (lower is better).
decoded_int32 = tf.cast(decoded[0], tf.int32)
accuracy = tf.reduce_mean(tf.edit_distance(decoded_int32, targets))

Accuracy + Padding

# pred_length = decoded[0].get_shape().as_list()[1] # targets_length = targets.get_shape().as_list()[1] # pad_lenght = tf.maximum(pred_length, targets_length) # pred_pad = tf.pad( # word_prediction, # [[0, 0], # [0, pad_lenght - pred_length]], # constant_values=PAD, # mode='CONSTANT') # targets_pad = tf.pad( # test_targets, # [[0, 0], # [0, pad_lenght - targets_lenght]], # constant_values=PAD, # mode='CONSTANT') # acc_weights = tf.cast(tf.divide(pred_pad, pred_pad), tf.float32) # # acc_weights = tf.sequence_mask( # # tf.subtract(final_seq_lengths, 1), # word_inputs_length, try max(targets, inputs) # # word_pad_lenght, # # dtype=tf.float32) # correct_prediction = tf.equal(pred_pad, targets_pad) # accuracy = (tf.reduce_sum(tf.cast(correct_prediction, tf.float32) * acc_weights) \ # / tf.reduce_sum(acc_weights))

Training

# Training loop.
#
# Every step trains on one batch.  At fixed intervals it additionally
#   - plots the training loss            (every LOSS_ITER steps),
#   - plots the test/train error metric  (every TEST_ITER steps),
#   - writes a checkpoint                (every SAVE_ITER steps),
#   - prints loss and two sample predictions of a test batch
#                                        (every EPOCH steps).
# The model is saved both when training is interrupted with Ctrl-C and when
# it runs to completion.
sess.run(tf.global_variables_initializer())
saver = tf.train.Saver()

# Create plot for live stats plotting
trainPlot = TrainingPlot(TRAIN_STEPS, TEST_ITER, LOSS_ITER)

try:
    for i_batch in range(TRAIN_STEPS):
        fd = train_iterator.next_feed(BATCH_SIZE)
        train_step.run(fd)

        if i_batch % LOSS_ITER == 0:
            # Plotting loss
            tmpLoss = loss.eval(fd)
            trainPlot.updateCost(tmpLoss, i_batch // LOSS_ITER)

        if i_batch % TEST_ITER == 0:
            # Plotting accuracy
            fd_test = test_iterator.next_feed(BATCH_SIZE)
            accTest = accuracy.eval(fd_test)
            # Evaluate the training batch with dropout switched off, so the
            # train and test curves are measured under the same conditions.
            fd_train_eval = dict(fd)
            fd_train_eval[keep_prob] = 1.0
            accTrain = accuracy.eval(fd_train_eval)
            trainPlot.updateAcc(accTest, accTrain, i_batch // TEST_ITER)

        if i_batch % SAVE_ITER == 0:
            saver.save(sess, save_location)

        if i_batch % EPOCH == 0:
            fd_test = test_iterator.next_feed(BATCH_SIZE)
            print('batch %r - loss: %r' % (i_batch, sess.run(loss, fd_test)))
            predict_, target_ = sess.run([word_prediction, test_targets], fd_test)
            # Show only the first two samples of the batch
            for inp, pred in zip(target_[:2], predict_[:2]):
                print(' expected > {}'.format(inp))
                print(' predicted > {}'.format(pred))
            print()

except KeyboardInterrupt:
    saver.save(sess, save_location)
    print('Training interrupted, model saved.')
else:
    # Without this, the steps after the last SAVE_ITER checkpoint would be
    # lost when training finishes normally.
    saver.save(sess, save_location)
    print('Training finished, model saved.')
<IPython.core.display.Javascript object>
batch 0 - loss: 14.03011 expected > [44 31 27 38 38 51] predicted > [] expected > [10 0 0 0 0 0] predicted > [] batch 2000 - loss: 7.7040024 expected > [ 3 34 31 31 44 0 0] predicted > [ 1 46 31 44 0 0] expected > [30 35 46 31 0 0 0] predicted > [30 35 27 46 31 0] batch 4000 - loss: 4.9426394 expected > [29 0 0 0 0] predicted > [29 35 0 0] expected > [27 33 31 0 0] predicted > [45 38 0 0] batch 6000 - loss: 4.2646828 expected > [15 47 46 0 0 0 0 0] predicted > [15 27 46 0 0 0 0] expected > [26 31 39 0 0 0 0 0] predicted > [ 6 31 40 0 0 0 0] batch 8000 - loss: 4.3730812 expected > [42 27 44 37 0] predicted > [42 27 45 37 0] expected > [34 35 45 0 0] predicted > [34 35 45 0 0] batch 10000 - loss: 5.6516752 expected > [38 41 48 31 0 0 0 0] predicted > [38 41 48 31 0 0 0 0] expected > [42 41 39 41 29 35 0 0] predicted > [42 41 48 41 35 0 0 0] batch 12000 - loss: 3.5471888 expected > [10 35 46 37 27 0] predicted > [36 35 46 37 27 0] expected > [52 0 0 0 0 0] predicted > [52 0 0 0 0 0] batch 14000 - loss: 3.1041458 expected > [48 51 38 31 46 0] predicted > [48 51 31 46 0] expected > [36 35 0 0 0 0] predicted > [36 35 0 0 0] batch 16000 - loss: 3.7438998 expected > [48 29 31 44 27 0 0 0] predicted > [48 29 31 44 27 0 0 0] expected > [48 45 35 39 40 41 47 46] predicted > [48 40 27 40 41 27 40 46] batch 18000 - loss: 2.9463203 expected > [29 34 48 35 38 31 0 0 0] predicted > [29 37 48 35 34 31 0 0] expected > [45 37 27 38 27 0 0 0 0] predicted > [45 37 27 38 27 0 0 0] batch 20000 - loss: 4.4252157 expected > [52 41 41 0 0 0 0 0 0] predicted > [29 48 41 0 0 0 0 0 0 0] expected > [40 35 29 35 0 0 0 0 0] predicted > [40 35 29 35 0 0 0 0 0 0] batch 22000 - loss: 3.3735261 expected > [38 35 37 31 0 0] predicted > [38 35 37 31 0 0 0] expected > [45 51 29 31 37 0] predicted > [45 51 29 31 46 27 0] batch 24000 - loss: 4.7486191 expected > [19 27 30 38 51 0 0] predicted > [19 27 29 38 51 0] expected > [52 30 27 0 0 0 0] predicted > [52 30 27 0 0 0] batch 26000 - loss: 3.6218534 
expected > [49 34 35 38 31 0 0 0] predicted > [49 34 35 38 31 0 0 0 0] expected > [29 35 44 29 38 31 0 0] predicted > [29 35 44 27 38 31 0 0 0] batch 28000 - loss: 2.8629522 expected > [7 0 0 0] predicted > [7 0 0 0] expected > [42 27 37 0] predicted > [42 27 37 0] batch 30000 - loss: 3.5273075 expected > [41 38 41 48 41 0 0 0] predicted > [30 27 48 41 0 0 0] expected > [16 47 29 34 0 0 0 0] predicted > [16 47 29 34 0 0 0] batch 32000 - loss: 5.6210623 expected > [ 7 35 44 38 0 0 0] predicted > [ 7 35 45 46 0 0] expected > [31 44 27 0 0 0 0] predicted > [31 44 27 0 0 0] batch 34000 - loss: 4.0684881 expected > [ 5 35 46 34 31 44 0 0] predicted > [ 5 35 46 34 31 44 0 0] expected > [19 46 35 38 38 0 0 0] predicted > [19 46 35 38 0 0 0 0] batch 36000 - loss: 2.9283721 expected > [32 27 46 0 0] predicted > [28 46 0 0 0 0] expected > [45 35 42 0 0] predicted > [45 35 36 0 0 0] batch 38000 - loss: 5.4859548 expected > [49 41 44 38 30 0 0 0] predicted > [49 41 44 38 30 0 0 0] expected > [35 40 0 0 0 0 0 0] predicted > [35 40 0 0 0 0 0 0] batch 40000 - loss: 5.6999946 expected > [22 35 46 44 0 0 0 0 0] predicted > [22 35 46 44 0 0 0 0 0] expected > [52 37 41 47 39 27 36 35 0] predicted > [31 34 41 47 44 40 33 36 35] Training interrupted, model saved.
# Print expected and predicted label indexes for the first two samples
# of five test batches.
for _round in range(5):
    fd_test = test_iterator.next_feed(BATCH_SIZE)
    predict_, target_ = sess.run([word_prediction, test_targets], fd_test)

    for expected_row, predicted_row in zip(target_[:2], predict_[:2]):
        print(' expected > {}'.format(expected_row))
        print(' predicted > {}'.format(predicted_row))
    print()
expected > [10 47 45 46 0 0 0] predicted > [10 47 45 46 0 0 0] expected > [ 3 34 27 46 0 0 0] predicted > [15 38 27 46 0 0 0] expected > [38 35 28 44 27 44 51 0 0 0] predicted > [28 35 46 44 47 44 51 0] expected > [18 41 41 46 0 0 0 0 0 0] predicted > [18 41 46 0 0 0 0 0] expected > [52 48 27 40 0 0 0 0 0 0] predicted > [52 41 27 40 29 0 0 0 0 0] expected > [45 46 27 48 28 27 0 0 0 0] predicted > [45 46 27 48 28 27 0 0 0 0] expected > [45 38 51 0 0] predicted > [35 38 51 0 0 0] expected > [27 28 51 0 0] predicted > [27 28 41 51 0 0] expected > [15 30 31 35 27 0 0] predicted > [ 5 30 31 44 35 27] expected > [ 4 47 44 27 0 0 0] predicted > [ 4 47 44 27 0 0]