From 9dc7eab1f7b42c7d6f9baec17c7baf03e745f9ee Mon Sep 17 00:00:00 2001
From: Hikaru Ikuta <woodrush924@gmail.com>
Date: Tue, 6 Oct 2020 06:34:23 +0900
Subject: [PATCH] Add an order-preserving output option for the
 `Parser.get_annotation` function

---
 README.md                  |  42 ++++++++++
 manga109api/manga109api.py | 152 ++++++++++++++++++-------------------
 requirements.txt           |   1 -
 tests/test_data_type.py    |  40 +++++++++-
 tests/test_func.py         |  38 ----------
 5 files changed, 154 insertions(+), 119 deletions(-)

diff --git a/README.md b/README.md
index 1808286..d9af45f 100644
--- a/README.md
+++ b/README.md
@@ -127,6 +127,48 @@ pprint(annotation["page"][3])  # the data of the 4th page of "ARMS"
 #            '@xmin': 1155,
 #            '@ymax': 686,
 #            '@ymin': 595} ... ]}
+
+# (4) Preserve the raw tag ordering in the output annotation data
+annotation_ordered = p.get_annotation(book="ARMS", separate_by_tag=False)
+
+# In the raw XML in the Manga109 dataset, the bounding box data in the
+# `page` tag is not sorted by its annotation type, and each bounding
+# box type appears in an arbitrary order. When the `separate_by_tag=False`
+# option is set, the output will preserve the ordering of each
+# bounding box tag in the raw XML data, mainly for data editing purposes.
+# Note that the ordering of the bounding box tags does not carry any
+# useful information about the contents of the data.
+
+# Caution: Due to the aforementioned feature, the format of the output
+# dictionary will differ slightly compared to when the option is not set.
+
+# Here is an example output of the ordered data:
+pprint(annotation_ordered["page"][3])  # the data of the 4th page of "ARMS"
+# Output (dict):
+# {'@height': 1170,
+#  '@index': 3,
+#  '@width': 1654,
+#  'contents': [{'#text': 'キャーッ',
+#                '@id': '00000005',
+#                '@xmax': 685,
+#                '@xmin': 601,
+#                '@ymax': 402,
+#                '@ymin': 291,
+#                'type': 'text'},
+#               {'@character': '00000003',
+#                '@id': '00000006',
+#                '@xmax': 1352,
+#                '@xmin': 1229,
+#                '@ymax': 875,
+#                '@ymin': 709,
+#                'type': 'body'},
+#               {'#text': 'はやく逃げないとまきぞえくっちゃう',
+#                '@id': '00000007',
+#                '@xmax': 1239,
+#                '@xmin': 1155,
+#                '@ymax': 686,
+#                '@ymin': 595,
+#                'type': 'text'}, ... ]}
 ```
 
 
diff --git a/manga109api/manga109api.py b/manga109api/manga109api.py
index 0ed554c..a6f022c 100644
--- a/manga109api/manga109api.py
+++ b/manga109api/manga109api.py
@@ -1,9 +1,11 @@
 import pathlib
 import json
-import xmltodict
+import xml.etree.ElementTree as ET
 
 
 class Parser(object):
+    annotation_tags = ["frame", "face", "body", "text"]
+
     def __init__(self, root_dir):
         """
         Manga109 annotation parser
@@ -17,22 +19,84 @@ def __init__(self, root_dir):
         with (self.root_dir / "books.txt").open("rt", encoding='utf-8') as f:
             self.books = [line.rstrip() for line in f]
 
-    def get_annotation(self, book, annotation_type="annotations"):
+
+    def get_annotation(self, book, annotation_type="annotations", separate_by_tag=True):
         """
-        Given a book title, return annotation in the form of dict.
+        Given a book title, return its annotations as a dict.
 
         Args:
-            book (str): A title of a book. Should be in self.books.
+            book (str): The title of the book to get the annotations of.
+                The title must be contained in the list `self.books`.
+            annotation_type (str) default `"annotations"` : The directory to load the xml data from.
+            separate_by_tag (bool) default `True` : When set to `True`, each annotation data type
+                ("frame", "face", "body", "text") will be stored in a different list in the output
+                dictionary. When set to `False`, all of the annotation data will be stored in a
+                single list in the output dictionary. In the latter case, the data in the list will
+                appear in the same order as in the original XML file.
 
         Returns:
-            annotation (dict): Annotation data consists of dict.
+            annotation (dict): The annotation data
         """
         assert book in self.books
-        with (self.root_dir / annotation_type / (book + ".xml")).open("rt", encoding= 'utf-8') as f:
-            annotation = xmltodict.parse(f.read())
-        annotation = json.loads(json.dumps(annotation))  # OrderedDict -> dict
-        annotation = _format_annotation(annotation)
-        _convert_str_to_int_recursively(annotation)  # str -> int, for some attributes
+
+        def int_literals_to_int(t):
+            """
+            Convert integer literal strings to integers,
+            if the stringified result of the integer expression
+            matches the original string.
+            The following keys will be affected by this function:
+            '@index', '@width', '@height', '@xmax', '@ymax', '@xmin', '@ymin'
+            """
+            try:
+                if str(t) == str(int(t)):
+                    return int(t)          # Example case: t == "42"
+                else:
+                    return t               # Example case: t == "00001234"
+            except ValueError as e:
+                return t                   # Example case: t == "some text" or t == "000012ab"
+
+        def formatted_dict(d):
+            """
+            - Prepends an "@" in front of each key of a given dict.
+            - Also applies `int_literals_to_int` to each value of the given dict.
+            Example:
+                input:  {"index": "5", "title": "a"}
+                output: {"@index": 5,  "@title": "a"}
+            """
+            return dict([("@"+k, int_literals_to_int(v)) for k, v in d.items()])
+
+        with (self.root_dir / annotation_type / (book + ".xml")).open("rt", encoding='utf-8') as f:
+            xml = ET.parse(f).getroot()
+        annotation = {"title" : xml.attrib["title"]}
+
+        characters = []
+        for t in xml.find("characters"):
+            characters.append(formatted_dict(t.attrib))
+        annotation["character"] = characters
+
+        pages = []
+        for page_xml in xml.find("pages"):
+            page = formatted_dict(page_xml.attrib)
+
+            if separate_by_tag:
+                for annotation_tag in self.annotation_tags:
+                    page[annotation_tag] = []
+            else:
+                page["contents"] = []
+
+            for bb_xml in page_xml:
+                d = formatted_dict(bb_xml.attrib)
+                if bb_xml.text is not None:
+                    d["#text"] = bb_xml.text
+                d["type"] = bb_xml.tag
+
+                if separate_by_tag:
+                    page[bb_xml.tag].append(d)
+                else:
+                    page["contents"].append(d)
+
+            pages.append(page)
+        annotation["page"] = pages
         return annotation
 
     def img_path(self, book, index):
@@ -49,71 +113,3 @@ def img_path(self, book, index):
         assert book in self.books
         assert isinstance(index, int)
         return str((self.root_dir / "images" / book / (str(index).zfill(3) + ".jpg")).resolve())
-
-def _format_annotation(annotation):
-    """
-    Given annotation data, convert to an easily accessible dict.
-    For example, dict['book']['characters']['character'] -> dict['character']
-
-    Args:
-        annotation (dict): Annotation data. Root key is 'book'.
-
-    Returns:
-        annotation (dict): Annotation data. Root keys are 'title', 'character' and 'page'.
-    """
-
-    title = annotation['book']['@title']
-    try:
-        character = annotation['book']['characters']['character']
-    except:
-        character = None
-    page = annotation['book']['pages']['page']
-
-    if not isinstance(character, list):
-        character = [character]
-    if not isinstance(page, list):
-        page = [page]
-    _format_page_dict_style(page)
-
-    return {
-        'title': title,
-        'character': character,
-        'page': page
-    }
-
-
-def _format_page_dict_style(page):
-    """
-    Format page annotation data. Make page data have the same key, and align the style of dict.
-    For example,
-      in:  [{'body': [123], 'face': 123, 'frame': []}]
-      out: [{'body': [123], 'face': [123], 'frame': [], 'text': []}]
-
-    Args:
-        page (dict): Annotation data for all pages including info such as frame, text, etc.
-    """
-    types = ['body', 'face', 'frame', 'text']
-    for i, p in enumerate(page):
-        for t in set(types) - set(p.keys()):
-            page[i][t] = []
-        for t in types:
-            if not isinstance(p[t], list):
-                page[i][t] = [page[i][t]]
-
-
-def _convert_str_to_int_recursively(annotation):
-    """
-    Given annotation data (nested list or dict), convert some attributes from string to integer.
-    For example, [{'@xmax': '234', 'id': '0007a8be'}] -> [{'@xmax': 234, 'id': '0007a8be'}]
-
-    Args:
-        annotation  (list or dict): Annotation date that consists of list or dict. Can be deeply nested.
-    """
-    if isinstance(annotation, dict):
-        for k, v in annotation.items():
-            if k in ['@index', '@width', '@height', '@xmax', '@ymax', '@xmin', '@ymin']:
-                annotation[k] = int(v)
-            _convert_str_to_int_recursively(v)
-    elif isinstance(annotation, list):
-        for v in annotation:
-            _convert_str_to_int_recursively(v)
diff --git a/requirements.txt b/requirements.txt
index a0d4bb7..e69de29 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1 +0,0 @@
-xmltodict
diff --git a/tests/test_data_type.py b/tests/test_data_type.py
index e424644..4a1f0b3 100644
--- a/tests/test_data_type.py
+++ b/tests/test_data_type.py
@@ -6,7 +6,7 @@ def test_data_type():
     p = manga109api.Parser(root_dir=manga109_root_dir)
 
     for book in p.books:
-        annotation = p.get_annotation(book=book)
+        annotation = p.get_annotation(book=book, separate_by_tag=False)
 
         # title
         assert isinstance(annotation["title"], str)
@@ -23,7 +23,42 @@ def test_data_type():
             assert isinstance(page["@index"], int)
             assert isinstance(page["@width"], int)
             assert isinstance(page["@height"], int)
-            
+
+            assert isinstance(page["contents"], list)
+            for obj in page["contents"]:
+                assert isinstance(obj["@id"], str)
+                assert isinstance(obj["@xmin"], int)
+                assert isinstance(obj["@xmax"], int)
+                assert isinstance(obj["@ymin"], int)
+                assert isinstance(obj["@ymax"], int)
+                assert isinstance(obj["type"], str)
+
+                if obj["type"] == "text":
+                    assert isinstance(obj["#text"], str)
+
+def test_data_type_separated():
+    manga109_root_dir = "tests/data_dummy/"
+    p = manga109api.Parser(root_dir=manga109_root_dir)
+
+    for book in p.books:
+        annotation = p.get_annotation(book=book, separate_by_tag=True)
+
+        # title
+        assert isinstance(annotation["title"], str)
+
+        # character
+        assert isinstance(annotation["character"], list)
+        for character in annotation["character"]:
+            assert isinstance(character["@id"], str)
+            assert isinstance(character["@name"], str)
+
+        # page
+        assert isinstance(annotation["page"], list)
+        for page in annotation["page"]:
+            assert isinstance(page["@index"], int)
+            assert isinstance(page["@width"], int)
+            assert isinstance(page["@height"], int)
+
             for obj_type in {"body", "face", "frame", "text"}:
                 assert isinstance(page[obj_type], list)
                 for obj in page[obj_type]:
@@ -32,6 +67,7 @@ def test_data_type():
                     assert isinstance(obj["@xmax"], int)
                     assert isinstance(obj["@ymin"], int)
                     assert isinstance(obj["@ymax"], int)
+                    assert obj["type"] == obj_type
 
                     if obj_type == "text":
                         assert isinstance(obj["#text"], str)
diff --git a/tests/test_func.py b/tests/test_func.py
index 3eb76e3..3ef8176 100644
--- a/tests/test_func.py
+++ b/tests/test_func.py
@@ -9,41 +9,3 @@ def test_img_path():
     img1 = Path(p.img_path(book="TitleA", index=0)).absolute()
     img2 = Path("tests/data_dummy/images/TitleA/000.jpg").absolute()
     assert(img1 == img2)
-
-
-def test_format_annotation():
-    annotation = {
-        'book': {
-            '@title': 'AAA',
-            'characters': {'character': [
-                {'id': '123', 'name': 'yyy'}
-            ]},
-            'pages': {'page': [
-                {'index': 234, 'width': 345, 'height': 456,
-                 'frame': [{'id': '567', 'xmin': 11, 'ymin': 22, 'xmax': 33, 'ymax': 44}]}
-            ]}
-        }
-    }
-    gt = {
-        'title': 'AAA',
-        'character': [{'id': '123', 'name': 'yyy'}],
-        'page': [
-            {'index': 234, 'width': 345, 'height': 456, 'face': [], 'body': [], 'text': [],
-             'frame': [{'id': '567', 'xmin': 11, 'ymin': 22, 'xmax': 33, 'ymax': 44}]}
-        ]
-    }
-
-    ret = manga109api.manga109api._format_annotation(annotation)
-    assert(gt == ret)
-
-def test_format_page_dict_style():
-    page = [{'body': [123], 'face': 123, 'frame': []}]
-    gt = [{'body': [123], 'face': [123], 'frame': [], 'text': []}]
-    manga109api.manga109api._format_page_dict_style(page)
-    assert(gt == page)
-
-def test_convert_str_to_int_recursively():
-    annotation = [{'@xmax': '234', 'id': '0007a8be'}]
-    gt = [{'@xmax': 234, 'id': '0007a8be'}]
-    manga109api.manga109api._convert_str_to_int_recursively(annotation)
-    assert(gt == annotation)