Skip to content

Commit

Permalink
Merge pull request #17 from manga109/add-ordered-output-func
Browse files Browse the repository at this point in the history
Add a tag-order-preserving option for `Parser.get_annotation`
  • Loading branch information
matsui528 authored Oct 6, 2020
2 parents 5dcace5 + 9dc7eab commit 65dc258
Show file tree
Hide file tree
Showing 5 changed files with 154 additions and 119 deletions.
42 changes: 42 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,48 @@ pprint(annotation["page"][3]) # the data of the 4th page of "ARMS"
# '@xmin': 1155,
# '@ymax': 686,
# '@ymin': 595} ... ]}

# (4) Preserve the raw tag ordering in the output annotation data
annotation_ordered = p.get_annotation(book="ARMS", separate_by_tag=False)

# In the raw XML in the Manga109 dataset, the bounding box data in the
# `page` tag is not sorted by its annotation type, and each bounding
# box type appears in an arbitrary order. When the `separate_by_tag=False`
# option is set, the output will preserve the ordering of each
# bounding box tag in the raw XML data, mainly for data editing purposes.
# Note that the ordering of the bounding box tags does not carry any
# useful information about the contents of the data.

# Caution: Due to the aforementioned feature, the format of the output
# dictionary will differ slightly compared to when the option is not set.

# Here is an example output of the ordered data:
pprint(annotation_ordered["page"][3]) # the data of the 4th page of "ARMS"
# Output (dict):
# {'@height': 1170,
# '@index': 3,
# '@width': 1654,
# 'contents': [{'#text': 'キャーッ',
# '@id': '00000005',
# '@xmax': 685,
# '@xmin': 601,
# '@ymax': 402,
# '@ymin': 291,
# 'type': 'text'},
# {'@character': '00000003',
# '@id': '00000006',
# '@xmax': 1352,
# '@xmin': 1229,
# '@ymax': 875,
# '@ymin': 709,
# 'type': 'body'},
# {'#text': 'はやく逃げないとまきぞえくっちゃう',
# '@id': '00000007',
# '@xmax': 1239,
# '@xmin': 1155,
# '@ymax': 686,
# '@ymin': 595,
# 'type': 'text'}, ... ]}
```


Expand Down
152 changes: 74 additions & 78 deletions manga109api/manga109api.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
import pathlib
import json
import xmltodict
import xml.etree.ElementTree as ET


class Parser(object):
annotation_tags = ["frame", "face", "body", "text"]

def __init__(self, root_dir):
"""
Manga109 annotation parser
Expand All @@ -17,22 +19,84 @@ def __init__(self, root_dir):
with (self.root_dir / "books.txt").open("rt", encoding='utf-8') as f:
self.books = [line.rstrip() for line in f]

def get_annotation(self, book, annotation_type="annotations", separate_by_tag=True):
    """
    Given a book title, return its annotations as a dict.

    Args:
        book (str): The title of the book to get the annotations of.
            The title must be contained in the list `self.books`.
        annotation_type (str) default `"annotations"` : The directory to load the xml data from.
        separate_by_tag (bool) default `True` : When set to `True`, each annotation data type
            ("frame", "face", "body", "text") will be stored in a different list in the output
            dictionary. When set to `False`, all of the annotation data will be stored in a
            single list (under the key "contents") in the output dictionary. In the latter case,
            the data in the list will appear in the same order as in the original XML file.

    Returns:
        annotation (dict): The annotation data. Root keys are 'title', 'character' and 'page'.
            A book without a `characters` or `pages` element yields an empty list for
            the corresponding key.

    Raises:
        AssertionError: If `book` is not contained in `self.books`.
    """
    # Kept as an assert (not a raise) so callers that catch AssertionError keep working.
    assert book in self.books

    def int_literals_to_int(t):
        """
        Convert integer literal strings to integers,
        if the stringified result of the integer expression
        matches the original string.
        The following keys will be affected with this function:
        '@index', '@width', '@height', '@xmax', '@ymax', '@xmin', '@ymin'
        """
        try:
            as_int = int(t)
        except ValueError:
            return t  # Example case: t == "some text" or t == "000012ab"
        if str(t) == str(as_int):
            return as_int  # Example case: t == "42"
        return t  # Example case: t == "00001234" (zero-padded ids stay strings)

    def formatted_dict(d):
        """
        - Prepends an "@" in front of each key of a given dict.
        - Also applies `int_literals_to_int` to each value of the given dict.
        Example:
            input:  {"index": "5", "title": "a"}
            output: {"@index": 5, "@title": "a"}
        """
        return {"@" + k: int_literals_to_int(v) for k, v in d.items()}

    def children_of(parent, tag):
        """
        Return the child elements of the `tag` element directly below `parent`,
        or an empty list when `parent` has no such element.
        """
        container = parent.find(tag)
        # `find` returns None for a missing element. Compare with `is None`:
        # an Element without children is falsy, so a truth test would be wrong.
        return [] if container is None else list(container)

    xml_path = self.root_dir / annotation_type / (book + ".xml")
    with xml_path.open("rt", encoding='utf-8') as f:
        root = ET.parse(f).getroot()

    annotation = {"title": root.attrib["title"]}
    annotation["character"] = [
        formatted_dict(character_xml.attrib)
        for character_xml in children_of(root, "characters")
    ]

    pages = []
    for page_xml in children_of(root, "pages"):
        page = formatted_dict(page_xml.attrib)

        # Pre-create the containers so that every page exposes the same keys,
        # even when it has no annotation of a given type.
        if separate_by_tag:
            for annotation_tag in self.annotation_tags:
                page[annotation_tag] = []
        else:
            page["contents"] = []

        for bb_xml in page_xml:
            d = formatted_dict(bb_xml.attrib)
            if bb_xml.text is not None:
                d["#text"] = bb_xml.text
            d["type"] = bb_xml.tag

            if separate_by_tag:
                # setdefault: a tag outside `annotation_tags` gets its own list
                # instead of raising KeyError.
                page.setdefault(bb_xml.tag, []).append(d)
            else:
                page["contents"].append(d)

        pages.append(page)
    annotation["page"] = pages
    return annotation

def img_path(self, book, index):
Expand All @@ -49,71 +113,3 @@ def img_path(self, book, index):
assert book in self.books
assert isinstance(index, int)
return str((self.root_dir / "images" / book / (str(index).zfill(3) + ".jpg")).resolve())

def _format_annotation(annotation):
    """
    Given annotation data, convert to an easily accessible dict.
    For example, dict['book']['characters']['character'] -> dict['character']

    Args:
        annotation (dict): Annotation data. Root key is 'book'.

    Returns:
        annotation (dict): Annotation data. Root keys are 'title', 'character' and 'page'.
            'character' and 'page' are always lists; 'character' is an empty list
            when the book declares no characters.

    Raises:
        KeyError: If 'book', '@title', 'pages' or 'page' is missing.
    """
    book = annotation['book']
    title = book['@title']

    try:
        character = book['characters']['character']
    except (KeyError, TypeError):
        # KeyError: no 'characters' / 'character' entry at all.
        # TypeError: 'characters' maps to None, so it cannot be subscripted.
        character = None

    if character is None:
        # No characters: expose an empty list rather than `[None]`, so callers
        # can iterate and index each entry safely.
        character = []
    elif not isinstance(character, list):
        # A single entry is not wrapped in a list by the parser; normalize it.
        character = [character]

    page = book['pages']['page']
    if not isinstance(page, list):
        page = [page]
    # Normalizes every page dict in place.
    _format_page_dict_style(page)

    return {
        'title': title,
        'character': character,
        'page': page
    }


def _format_page_dict_style(page):
"""
Format page annotation data. Make page data have the same key, and align the style of dict.
For example,
in: [{'body': [123], 'face': 123, 'frame': []}]
out: [{'body': [123], 'face': [123], 'frame': [], 'text': []}]
Args:
page (dict): Annotation data for all pages including info such as frame, text, etc.
"""
types = ['body', 'face', 'frame', 'text']
for i, p in enumerate(page):
for t in set(types) - set(p.keys()):
page[i][t] = []
for t in types:
if not isinstance(p[t], list):
page[i][t] = [page[i][t]]


def _convert_str_to_int_recursively(annotation):
"""
Given annotation data (nested list or dict), convert some attributes from string to integer.
For example, [{'@xmax': '234', 'id': '0007a8be'}] -> [{'@xmax': 234, 'id': '0007a8be'}]
Args:
annotation (list or dict): Annotation date that consists of list or dict. Can be deeply nested.
"""
if isinstance(annotation, dict):
for k, v in annotation.items():
if k in ['@index', '@width', '@height', '@xmax', '@ymax', '@xmin', '@ymin']:
annotation[k] = int(v)
_convert_str_to_int_recursively(v)
elif isinstance(annotation, list):
for v in annotation:
_convert_str_to_int_recursively(v)
1 change: 0 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1 +0,0 @@
xmltodict
40 changes: 38 additions & 2 deletions tests/test_data_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ def test_data_type():
p = manga109api.Parser(root_dir=manga109_root_dir)

for book in p.books:
annotation = p.get_annotation(book=book)
annotation = p.get_annotation(book=book, separate_by_tag=False)

# title
assert isinstance(annotation["title"], str)
Expand All @@ -23,7 +23,42 @@ def test_data_type():
assert isinstance(page["@index"], int)
assert isinstance(page["@width"], int)
assert isinstance(page["@height"], int)


assert isinstance(page["contents"], list)
for obj in page["contents"]:
assert isinstance(obj["@id"], str)
assert isinstance(obj["@xmin"], int)
assert isinstance(obj["@xmax"], int)
assert isinstance(obj["@ymin"], int)
assert isinstance(obj["@ymax"], int)
assert isinstance(obj["type"], str)

if obj["type"] == "text":
assert isinstance(obj["#text"], str)

def test_data_type_separated():
manga109_root_dir = "tests/data_dummy/"
p = manga109api.Parser(root_dir=manga109_root_dir)

for book in p.books:
annotation = p.get_annotation(book=book, separate_by_tag=True)

# title
assert isinstance(annotation["title"], str)

# character
assert isinstance(annotation["character"], list)
for character in annotation["character"]:
assert isinstance(character["@id"], str)
assert isinstance(character["@name"], str)

# page
assert isinstance(annotation["page"], list)
for page in annotation["page"]:
assert isinstance(page["@index"], int)
assert isinstance(page["@width"], int)
assert isinstance(page["@height"], int)

for obj_type in {"body", "face", "frame", "text"}:
assert isinstance(page[obj_type], list)
for obj in page[obj_type]:
Expand All @@ -32,6 +67,7 @@ def test_data_type():
assert isinstance(obj["@xmax"], int)
assert isinstance(obj["@ymin"], int)
assert isinstance(obj["@ymax"], int)
assert obj["type"] == obj_type

if obj_type == "text":
assert isinstance(obj["#text"], str)
38 changes: 0 additions & 38 deletions tests/test_func.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,41 +9,3 @@ def test_img_path():
img1 = Path(p.img_path(book="TitleA", index=0)).absolute()
img2 = Path("tests/data_dummy/images/TitleA/000.jpg").absolute()
assert(img1 == img2)


def test_format_annotation():
    """`_format_annotation` flattens the raw book dict and fills in missing page keys."""
    frame = {'id': '567', 'xmin': 11, 'ymin': 22, 'xmax': 33, 'ymax': 44}
    raw = {
        'book': {
            '@title': 'AAA',
            'characters': {'character': [{'id': '123', 'name': 'yyy'}]},
            'pages': {'page': [
                {'index': 234, 'width': 345, 'height': 456, 'frame': [dict(frame)]},
            ]},
        },
    }
    expected = {
        'title': 'AAA',
        'character': [{'id': '123', 'name': 'yyy'}],
        'page': [
            {'index': 234, 'width': 345, 'height': 456,
             'face': [], 'body': [], 'text': [], 'frame': [dict(frame)]},
        ],
    }

    result = manga109api.manga109api._format_annotation(raw)
    assert expected == result

def test_format_page_dict_style():
    """`_format_page_dict_style` adds missing keys and wraps lone values in lists, in place."""
    pages = [{'body': [123], 'face': 123, 'frame': []}]
    expected = [{'body': [123], 'face': [123], 'frame': [], 'text': []}]

    manga109api.manga109api._format_page_dict_style(pages)

    assert expected == pages

def test_convert_str_to_int_recursively():
    """Coordinate attributes become ints in place; other attributes stay strings."""
    data = [{'@xmax': '234', 'id': '0007a8be'}]
    expected = [{'@xmax': 234, 'id': '0007a8be'}]

    manga109api.manga109api._convert_str_to_int_recursively(data)

    assert expected == data

0 comments on commit 65dc258

Please sign in to comment.