From 9dc7eab1f7b42c7d6f9baec17c7baf03e745f9ee Mon Sep 17 00:00:00 2001 From: Hikaru Ikuta Date: Tue, 6 Oct 2020 06:34:23 +0900 Subject: [PATCH] Add an order-preserving output option for the `Parser.get_annotation` function --- README.md | 42 ++++++++++ manga109api/manga109api.py | 152 ++++++++++++++++++------------------- requirements.txt | 1 - tests/test_data_type.py | 40 +++++++++- tests/test_func.py | 38 ---------- 5 files changed, 154 insertions(+), 119 deletions(-) diff --git a/README.md b/README.md index 1808286..d9af45f 100644 --- a/README.md +++ b/README.md @@ -127,6 +127,48 @@ pprint(annotation["page"][3]) # the data of the 4th page of "ARMS" # '@xmin': 1155, # '@ymax': 686, # '@ymin': 595} ... ]} + +# (4) Preserve the raw tag ordering in the output annotation data +annotation_ordered = p.get_annotation(book="ARMS", separate_by_tag=False) + +# In the raw XML in the Manga109 dataset, the bounding box data in the +# `page` tag is not sorted by its annotation type, and each bounding +# box type appears in an arbitrary order. When the `separate_by_tag=False` +# option is set, the output will preserve the ordering of each +# bounding box tag in the raw XML data, mainly for data editing purposes. +# Note that the ordering of the bounding box tags does not carry any +# useful information about the contents of the data. + +# Caution: Due to the aforementioned feature, the format of the output +# dictionary will differ slightly compared to when the option is not set. 
+ +# Here is an example output of the ordered data: +pprint(annotation_ordered["page"][3]) # the data of the 4th page of "ARMS" +# Output (dict): +# {'@height': 1170, +# '@index': 3, +# '@width': 1654, +# 'contents': [{'#text': 'キャーッ', +# '@id': '00000005', +# '@xmax': 685, +# '@xmin': 601, +# '@ymax': 402, +# '@ymin': 291, +# 'type': 'text'}, +# {'@character': '00000003', +# '@id': '00000006', +# '@xmax': 1352, +# '@xmin': 1229, +# '@ymax': 875, +# '@ymin': 709, +# 'type': 'body'}, +# {'#text': 'はやく逃げないとまきぞえくっちゃう', +# '@id': '00000007', +# '@xmax': 1239, +# '@xmin': 1155, +# '@ymax': 686, +# '@ymin': 595, +# 'type': 'text'}, ... ]} ``` diff --git a/manga109api/manga109api.py b/manga109api/manga109api.py index 0ed554c..a6f022c 100644 --- a/manga109api/manga109api.py +++ b/manga109api/manga109api.py @@ -1,9 +1,11 @@ import pathlib import json -import xmltodict +import xml.etree.ElementTree as ET class Parser(object): + annotation_tags = ["frame", "face", "body", "text"] + def __init__(self, root_dir): """ Manga109 annotation parser @@ -17,22 +19,84 @@ def __init__(self, root_dir): with (self.root_dir / "books.txt").open("rt", encoding='utf-8') as f: self.books = [line.rstrip() for line in f] - def get_annotation(self, book, annotation_type="annotations"): + + def get_annotation(self, book, annotation_type="annotations", separate_by_tag=True): """ - Given a book title, return annotation in the form of dict. + Given a book title, return its annotations as a dict. Args: - book (str): A title of a book. Should be in self.books. + book (str): The title of the book to get the annotations of. + The title must be contained in the list `self.books`. + annotation_type (str) default `"annotations"` : The directory to load the xml data from. + separate_by_tag (bool) default `True` : When set to `True`, each annotation data type + ("frame", "face", "body", "text") will be stored in a different list in the output + dictionary. 
When set to `False`, all of the annotation data will be stored in a + single list in the output dictionary. In the latter case, the data in the list will + appear in the same order as in the original XML file. Returns: - annotation (dict): Annotation data consists of dict. + annotation (dict): The annotation data """ assert book in self.books - with (self.root_dir / annotation_type / (book + ".xml")).open("rt", encoding= 'utf-8') as f: - annotation = xmltodict.parse(f.read()) - annotation = json.loads(json.dumps(annotation)) # OrderedDict -> dict - annotation = _format_annotation(annotation) - _convert_str_to_int_recursively(annotation) # str -> int, for some attributes + + def int_literals_to_int(t): + """ + Convert integer literal strings to integers, + if the stringified result of the integer expression + matches the original string. + The following keys will be affected with this function: + '@index', '@width', '@height', '@xmax', '@ymax', '@xmin', '@ymin' + """ + try: + if str(t) == str(int(t)): + return int(t) # Example case: t == "42" + else: + return t # Example case: t == "00001234" + except ValueError as e: + return t # Example case: t == "some text" or t == "000012ab" + + def formatted_dict(d): + """ + - Prepends an "@" in front of each key of a given dict. + - Also applies `int_literals_to_int` to each value of the given dict. 
+ Example: + input: {"index": "5", "title": "a"} + output: {"@index": 5, "@title": "a"} + """ + return dict([("@"+k, int_literals_to_int(v)) for k, v in d.items()]) + + with (self.root_dir / annotation_type / (book + ".xml")).open("rt", encoding='utf-8') as f: + xml = ET.parse(f).getroot() + annotation = {"title" : xml.attrib["title"]} + + characters = [] + for t in xml.find("characters"): + characters.append(formatted_dict(t.attrib)) + annotation["character"] = characters + + pages = [] + for page_xml in xml.find("pages"): + page = formatted_dict(page_xml.attrib) + + if separate_by_tag: + for annotation_tag in self.annotation_tags: + page[annotation_tag] = [] + else: + page["contents"] = [] + + for bb_xml in page_xml: + d = formatted_dict(bb_xml.attrib) + if bb_xml.text is not None: + d["#text"] = bb_xml.text + d["type"] = bb_xml.tag + + if separate_by_tag: + page[bb_xml.tag].append(d) + else: + page["contents"].append(d) + + pages.append(page) + annotation["page"] = pages return annotation def img_path(self, book, index): @@ -49,71 +113,3 @@ def img_path(self, book, index): assert book in self.books assert isinstance(index, int) return str((self.root_dir / "images" / book / (str(index).zfill(3) + ".jpg")).resolve()) - -def _format_annotation(annotation): - """ - Given annotation data, convert to an easily accessible dict. - For example, dict['book']['characters']['character'] -> dict['character'] - - Args: - annotation (dict): Annotation data. Root key is 'book'. - - Returns: - annotation (dict): Annotation data. Root keys are 'title', 'character' and 'page'. 
- """ - - title = annotation['book']['@title'] - try: - character = annotation['book']['characters']['character'] - except: - character = None - page = annotation['book']['pages']['page'] - - if not isinstance(character, list): - character = [character] - if not isinstance(page, list): - page = [page] - _format_page_dict_style(page) - - return { - 'title': title, - 'character': character, - 'page': page - } - - -def _format_page_dict_style(page): - """ - Format page annotation data. Make page data have the same key, and align the style of dict. - For example, - in: [{'body': [123], 'face': 123, 'frame': []}] - out: [{'body': [123], 'face': [123], 'frame': [], 'text': []}] - - Args: - page (dict): Annotation data for all pages including info such as frame, text, etc. - """ - types = ['body', 'face', 'frame', 'text'] - for i, p in enumerate(page): - for t in set(types) - set(p.keys()): - page[i][t] = [] - for t in types: - if not isinstance(p[t], list): - page[i][t] = [page[i][t]] - - -def _convert_str_to_int_recursively(annotation): - """ - Given annotation data (nested list or dict), convert some attributes from string to integer. - For example, [{'@xmax': '234', 'id': '0007a8be'}] -> [{'@xmax': 234, 'id': '0007a8be'}] - - Args: - annotation (list or dict): Annotation date that consists of list or dict. Can be deeply nested. 
- """ - if isinstance(annotation, dict): - for k, v in annotation.items(): - if k in ['@index', '@width', '@height', '@xmax', '@ymax', '@xmin', '@ymin']: - annotation[k] = int(v) - _convert_str_to_int_recursively(v) - elif isinstance(annotation, list): - for v in annotation: - _convert_str_to_int_recursively(v) diff --git a/requirements.txt b/requirements.txt index a0d4bb7..e69de29 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +0,0 @@ -xmltodict diff --git a/tests/test_data_type.py b/tests/test_data_type.py index e424644..4a1f0b3 100644 --- a/tests/test_data_type.py +++ b/tests/test_data_type.py @@ -6,7 +6,7 @@ def test_data_type(): p = manga109api.Parser(root_dir=manga109_root_dir) for book in p.books: - annotation = p.get_annotation(book=book) + annotation = p.get_annotation(book=book, separate_by_tag=False) # title assert isinstance(annotation["title"], str) @@ -23,7 +23,42 @@ def test_data_type(): assert isinstance(page["@index"], int) assert isinstance(page["@width"], int) assert isinstance(page["@height"], int) - + + assert isinstance(page["contents"], list) + for obj in page["contents"]: + assert isinstance(obj["@id"], str) + assert isinstance(obj["@xmin"], int) + assert isinstance(obj["@xmax"], int) + assert isinstance(obj["@ymin"], int) + assert isinstance(obj["@ymax"], int) + assert isinstance(obj["type"], str) + + if obj["type"] == "text": + assert isinstance(obj["#text"], str) + +def test_data_type_separated(): + manga109_root_dir = "tests/data_dummy/" + p = manga109api.Parser(root_dir=manga109_root_dir) + + for book in p.books: + annotation = p.get_annotation(book=book, separate_by_tag=True) + + # title + assert isinstance(annotation["title"], str) + + # character + assert isinstance(annotation["character"], list) + for character in annotation["character"]: + assert isinstance(character["@id"], str) + assert isinstance(character["@name"], str) + + # page + assert isinstance(annotation["page"], list) + for page in annotation["page"]: + 
assert isinstance(page["@index"], int) + assert isinstance(page["@width"], int) + assert isinstance(page["@height"], int) + for obj_type in {"body", "face", "frame", "text"}: assert isinstance(page[obj_type], list) for obj in page[obj_type]: @@ -32,6 +67,7 @@ def test_data_type(): assert isinstance(obj["@xmax"], int) assert isinstance(obj["@ymin"], int) assert isinstance(obj["@ymax"], int) + assert obj["type"] == obj_type if obj_type == "text": assert isinstance(obj["#text"], str) diff --git a/tests/test_func.py b/tests/test_func.py index 3eb76e3..3ef8176 100644 --- a/tests/test_func.py +++ b/tests/test_func.py @@ -9,41 +9,3 @@ def test_img_path(): img1 = Path(p.img_path(book="TitleA", index=0)).absolute() img2 = Path("tests/data_dummy/images/TitleA/000.jpg").absolute() assert(img1 == img2) - - -def test_format_annotation(): - annotation = { - 'book': { - '@title': 'AAA', - 'characters': {'character': [ - {'id': '123', 'name': 'yyy'} - ]}, - 'pages': {'page': [ - {'index': 234, 'width': 345, 'height': 456, - 'frame': [{'id': '567', 'xmin': 11, 'ymin': 22, 'xmax': 33, 'ymax': 44}]} - ]} - } - } - gt = { - 'title': 'AAA', - 'character': [{'id': '123', 'name': 'yyy'}], - 'page': [ - {'index': 234, 'width': 345, 'height': 456, 'face': [], 'body': [], 'text': [], - 'frame': [{'id': '567', 'xmin': 11, 'ymin': 22, 'xmax': 33, 'ymax': 44}]} - ] - } - - ret = manga109api.manga109api._format_annotation(annotation) - assert(gt == ret) - -def test_format_page_dict_style(): - page = [{'body': [123], 'face': 123, 'frame': []}] - gt = [{'body': [123], 'face': [123], 'frame': [], 'text': []}] - manga109api.manga109api._format_page_dict_style(page) - assert(gt == page) - -def test_convert_str_to_int_recursively(): - annotation = [{'@xmax': '234', 'id': '0007a8be'}] - gt = [{'@xmax': 234, 'id': '0007a8be'}] - manga109api.manga109api._convert_str_to_int_recursively(annotation) - assert(gt == annotation)