Merge pull request #21 from manga109/for_custom_tag

Add try/except handling so annotations with custom tags can be parsed
manga109 · Oct 14, 2020 · bf6a5ce
2 parents 0427aad + 11bc24b
commit bf6a5ce
Show file tree

Hide file tree

Showing 4 changed files with 83 additions and 25 deletions.
diff --git a/manga109api/manga109api.py b/manga109api/manga109api.py
@@ -18,7 +18,6 @@ def __init__(self, root_dir):
         with (self.root_dir / "books.txt").open("rt", encoding='utf-8') as f:
             self.books = [line.rstrip() for line in f]
 
-
     def get_annotation(self, book, annotation_type="annotations", separate_by_tag=True):
         """
         Given a book title, return its annotations as a dict.
@@ -62,15 +61,18 @@ def formatted_dict(d):
                 input:  {"index": "5", "title": "a"}
                 output: {"@index": 5,  "@title": "a"}
             """
-            return dict([("@"+k, int_literals_to_int(v)) for k, v in d.items()])
+            return dict([("@" + k, int_literals_to_int(v)) for k, v in d.items()])
 
         with (self.root_dir / annotation_type / (book + ".xml")).open("rt", encoding='utf-8') as f:
             xml = ET.parse(f).getroot()
-        annotation = {"title" : xml.attrib["title"]}
+        annotation = {"title": xml.attrib["title"]}
 
         characters = []
-        for t in xml.find("characters"):
-            characters.append(formatted_dict(t.attrib))
+        try:
+            for t in xml.find("characters"):
+                characters.append(formatted_dict(t.attrib))
+        except:
+            pass
         annotation["character"] = characters
 
         pages = []
@@ -90,7 +92,10 @@ def formatted_dict(d):
                 d["type"] = bb_xml.tag
 
                 if separate_by_tag:
-                    page[bb_xml.tag].append(d)
+                    try:
+                        page[bb_xml.tag].append(d)
+                    except:
+                        page[bb_xml.tag] = [d]
                 else:
                     page["contents"].append(d)
 

diff --git a/tests/data_dummy/annotations/TitleC_with_custom_tag.xml b/tests/data_dummy/annotations/TitleC_with_custom_tag.xml
@@ -0,0 +1,20 @@
+<book title="TitleC">
+  <characters>
+    <character id="00000017" name="山田太郎"/>
+    <character id="00000018" name="田中花子"/>
+  </characters>
+  <pages>
+    <page index="0" width="1654" height="1170">
+      <body id="00000018" xmin="848" ymin="236" xmax="985" ymax="614" character="00000017"/>
+      <!-- w/ character -->
+      <custom_tag id="00000019" attr_num="0" attr_str="aaa" attr_mix="123a" attr_unk='111' character="00000017"/>
+      <custom_tag id="00000020" attr_num="12" attr_str="bbb" attr_mix="456b" attr_unk='asdf' character="00000017"/>
+      <custom_tag2 id="00000021" attr_num="345" attr_str="ccc" attr_mix="7cd" attr_unk='1a' attr_unk2='2b' character="00000018">dummy_text</custom_tag2>
+      <custom_tag2 id="00000022" attr_num="6789" attr_str="ddd" attr_mix="8ef" attr_unk='b' attr_unk2='33' character="00000018">dummy_text2</custom_tag2>
+      <!-- w/o character -->
+      <custom_tag3 id="00000023" attr_num="234" attr_str="xxx" attr_mix="12fsd3a" attr_unk='13411'/>
+    </page>
+    <page index="1" width="1654" height="1170"/>
+    <page index="2" width="1654" height="1170"/>
+  </pages>
+</book>
diff --git a/tests/data_dummy/books.txt b/tests/data_dummy/books.txt
@@ -1,2 +1,3 @@
 TitleA
 TitleB
+TitleC_with_custom_tag
diff --git a/tests/test_data_type.py b/tests/test_data_type.py
@@ -26,15 +26,31 @@ def test_data_type():
 
             assert isinstance(page["contents"], list)
             for obj in page["contents"]:
-                assert isinstance(obj["@id"], str)
-                assert isinstance(obj["@xmin"], int)
-                assert isinstance(obj["@xmax"], int)
-                assert isinstance(obj["@ymin"], int)
-                assert isinstance(obj["@ymax"], int)
-                assert isinstance(obj["type"], str)
+                if obj["type"] in {"body", "face", "frame", "text"}:
+                    assert isinstance(obj["@id"], str)
+                    assert isinstance(obj["@xmin"], int)
+                    assert isinstance(obj["@xmax"], int)
+                    assert isinstance(obj["@ymin"], int)
+                    assert isinstance(obj["@ymax"], int)
+                    assert isinstance(obj["type"], str)
+
+                    if obj["type"] == "text":
+                        assert isinstance(obj["#text"], str)
+
+                # custom tag test
+                else:
+                    assert isinstance(obj["@id"], str)
+                    assert isinstance(obj["@attr_num"], int)
+                    assert isinstance(obj["@attr_str"], str)
+                    assert isinstance(obj["@attr_mix"], str)
+                    assert isinstance(obj["type"], str)
+
+                    for key in (obj.keys() - {"@id", "@attr_num", "@attr_str", "@attr_mix", "type"}):
+                        assert isinstance(obj[key], (int, str))
+
+                    if "#text" in obj.keys():
+                        assert isinstance(obj["#text"], str)
 
-                if obj["type"] == "text":
-                    assert isinstance(obj["#text"], str)
 
 def test_data_type_separated():
     manga109_root_dir = "tests/data_dummy/"
@@ -59,15 +75,31 @@ def test_data_type_separated():
             assert isinstance(page["@width"], int)
             assert isinstance(page["@height"], int)
 
-            for obj_type in {"body", "face", "frame", "text"}:
-                assert isinstance(page[obj_type], list)
-                for obj in page[obj_type]:
-                    assert isinstance(obj["@id"], str)
-                    assert isinstance(obj["@xmin"], int)
-                    assert isinstance(obj["@xmax"], int)
-                    assert isinstance(obj["@ymin"], int)
-                    assert isinstance(obj["@ymax"], int)
-                    assert obj["type"] == obj_type
+            for obj_type in page.keys():
+                if obj_type in {"body", "face", "frame", "text"}:
+                    assert isinstance(page[obj_type], list)
+                    for obj in page[obj_type]:
+                        assert isinstance(obj["@id"], str)
+                        assert isinstance(obj["@xmin"], int)
+                        assert isinstance(obj["@xmax"], int)
+                        assert isinstance(obj["@ymin"], int)
+                        assert isinstance(obj["@ymax"], int)
+                        assert obj["type"] == obj_type
 
-                    if obj_type == "text":
-                        assert isinstance(obj["#text"], str)
+                        if obj_type == "text":
+                            assert isinstance(obj["#text"], str)
+
+                # custom tag test
+                elif obj_type not in {"@index", "@width", "@height"}:
+                    for obj in page[obj_type]:
+                        assert isinstance(obj["@id"], str)
+                        assert isinstance(obj["@attr_num"], int)
+                        assert isinstance(obj["@attr_str"], str)
+                        assert isinstance(obj["@attr_mix"], str)
+                        assert obj["type"] == obj_type
+
+                        for key in (obj.keys() - {"@id", "@attr_num", "@attr_str", "@attr_mix", "type"}):
+                            assert isinstance(obj[key], (int, str))
+
+                        if "#text" in obj.keys():
+                            assert isinstance(obj["#text"], str)