Skip to content

Commit b1a293c

Browse files
authored
[WIP] Add extraction of table info into a CSV (#41)
* add extraction of table info into a CSV * format * remove unused import
1 parent 6cbcff2 commit b1a293c

File tree

4 files changed

+61
-4
lines changed

4 files changed

+61
-4
lines changed

src/pubget/_coordinates.py

+2
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,8 @@ def _extract_coordinates_from_article_dir(
9999
f"in article pmcid {pmcid}"
100100
)
101101
continue
102+
if coordinates.empty:
103+
continue
102104
coordinates["pmcid"] = pmcid
103105
coordinates["table_id"] = table_info["table_id"]
104106
coordinates["table_label"] = table_info["table_label"]

src/pubget/_data_extraction.py

+2
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
from pubget._coordinates import CoordinateExtractor
2727
from pubget._links import LinkExtractor, neurovault_id_extractors
2828
from pubget._metadata import MetadataExtractor
29+
from pubget._table_info import TableInfoExtractor
2930
from pubget._text import TextExtractor
3031
from pubget._typing import (
3132
ArgparseActions,
@@ -208,6 +209,7 @@ def _get_data_extractors() -> List[Extractor]:
208209
MetadataExtractor(),
209210
AuthorsExtractor(),
210211
TextExtractor(),
212+
TableInfoExtractor(),
211213
CoordinateExtractor(),
212214
CoordinateSpaceExtractor(),
213215
LinkExtractor(),

src/pubget/_table_info.py

+51
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
"""Extracting table info from article directories."""
2+
import json
3+
from pathlib import Path
4+
from typing import Dict
5+
6+
import pandas as pd
7+
from lxml import etree
8+
9+
from pubget import _utils
10+
from pubget._typing import Extractor, Records
11+
12+
_TABLE_FIELDS = (
13+
"pmcid",
14+
"table_id",
15+
"table_label",
16+
"table_caption",
17+
"table_foot",
18+
"n_header_rows",
19+
"table_data_file",
20+
)
21+
22+
23+
class TableInfoExtractor(Extractor):
    """Read table info JSON files so they can be assembled in a single CSV."""

    # Output columns (and their order) for the assembled CSV.
    fields = _TABLE_FIELDS
    name = "tables"

    def extract(
        self,
        article: etree.ElementTree,
        article_dir: Path,
        previous_extractors_output: Dict[str, Records],
    ) -> pd.DataFrame:
        """Collect one row per table info JSON file in `article_dir`.

        Returns a DataFrame with the columns in `self.fields`; it is empty
        (but has the right columns) when the article has no tables.
        """
        # Only the article directory is used; the parsed XML and earlier
        # extractors' output are not needed here.
        del article, previous_extractors_output
        pmcid = _utils.get_pmcid_from_article_dir(article_dir)
        # Data file paths are stored relative to this ancestor directory;
        # NOTE(review): parents[2] presumably points at the extraction
        # root above <bucket>/<article_dir> — confirm against the layout.
        root_dir = article_dir.parents[2]
        rows = []
        for info_json in _utils.get_table_info_files_from_article_dir(
            article_dir
        ):
            info = json.loads(info_json.read_text("UTF-8"))
            info["pmcid"] = pmcid
            # The info JSON names its data file; resolve it next to the
            # JSON file, then store it relative to the root directory.
            data_file = info_json.with_name(info["table_data_file"])
            info["table_data_file"] = str(data_file.relative_to(root_dir))
            rows.append(info)
        if not rows:
            return pd.DataFrame(columns=list(self.fields))
        return pd.DataFrame(rows)[list(self.fields)]

src/pubget/_utils.py

+6-4
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
import shutil
1111
from datetime import datetime
1212
from pathlib import Path
13-
from typing import Any, Dict, Generator, Optional, Tuple, Union
13+
from typing import Any, Dict, Generator, List, Optional, Tuple, Union
1414

1515
import pandas as pd
1616
from lxml import etree
@@ -160,13 +160,15 @@ def read_article_table(
160160
return table_info, table_data
161161

162162

163+
def get_table_info_files_from_article_dir(article_dir: Path) -> List[Path]:
    """Return the sorted `table_*_info.json` files of an article directory."""
    tables_dir = article_dir / "tables"
    return sorted(tables_dir.glob("table_*_info.json"))
165+
166+
163167
def get_tables_from_article_dir(
    article_dir: Path,
) -> Generator[Tuple[Dict[str, Any], pd.DataFrame], None, None]:
    """Load information and data for all tables belonging to an article.

    Yields one `(table_info, table_data)` pair per table info JSON file,
    in sorted file order.
    """
    info_files = get_table_info_files_from_article_dir(article_dir)
    for info_json in info_files:
        yield read_article_table(info_json)
171173

172174

0 commit comments

Comments
 (0)