From ebc67c8e5e0d06bb38ba30a6d5212053965022c2 Mon Sep 17 00:00:00 2001
From: Vladislav Isenbaev
Date: Sat, 4 Aug 2018 22:43:26 +0000
Subject: [PATCH] Clean up iam_lines dataset from &quot;

---
Note (not part of the commit message): cleanup() decodes every integer
label row to text through self.mapping (ids missing from the mapping are
dropped), replaces the literal HTML entity &quot; with a plain double
quote, pads the result with '_' up to output_shape[0] characters, and
re-encodes it through self.inverse_mapping. It is applied to both y_train
and y_test as they are read from the HDF5 file.

 lab5/text_recognizer/datasets/iam_lines.py | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/lab5/text_recognizer/datasets/iam_lines.py b/lab5/text_recognizer/datasets/iam_lines.py
index b3d0626..f1d6656 100644
--- a/lab5/text_recognizer/datasets/iam_lines.py
+++ b/lab5/text_recognizer/datasets/iam_lines.py
@@ -11,6 +11,8 @@
 from text_recognizer.datasets.base import Dataset
 from text_recognizer.datasets.emnist import EmnistDataset
 
+import numpy as np
+
 
 DATA_DIRNAME = pathlib.Path(__file__).parents[2].resolve() / 'data'
 PROCESSED_DATA_DIRNAME = DATA_DIRNAME / 'processed' / 'iam_lines'
@@ -39,6 +41,15 @@ def __init__(self):
         self.input_shape = (28, 952)
         self.output_shape = (97, self.num_classes)
 
+    def cleanup(self, labels):
+        labels_clean = []
+        for label in labels:
+            s = ''.join([self.mapping.get(i, '') for i in label])
+            t = s.replace('&quot;', '"')
+            t = t + '_' * (self.output_shape[0] - len(t))
+            labels_clean.append([self.inverse_mapping[c] for c in t])
+        return np.array(labels_clean)
+
     def load_or_generate_data(self):
         if not PROCESSED_DATA_FILENAME.exists():
             PROCESSED_DATA_DIRNAME.mkdir(parents=True, exist_ok=True)
@@ -46,9 +57,9 @@ def load_or_generate_data(self):
             urlretrieve(PROCESSED_DATA_URL, PROCESSED_DATA_FILENAME)
         with h5py.File(PROCESSED_DATA_FILENAME, 'r') as f:
             self.x_train = f['x_train'][:]
-            self.y_train_int = f['y_train'][:]
+            self.y_train_int = self.cleanup(f['y_train'][:])
             self.x_test = f['x_test'][:]
-            self.y_test_int = f['y_test'][:]
+            self.y_test_int = self.cleanup(f['y_test'][:])
 
     @cachedproperty
     def y_train(self):
@@ -72,4 +83,3 @@ def __repr__(self):
     dataset = IamLinesDataset()
     dataset.load_or_generate_data()
     print(dataset)
-