Skip to content

Commit

Permalink
add more manual exceptions
Browse files Browse the repository at this point in the history
  • Loading branch information
rgiessmann committed May 19, 2024
1 parent bc74bb6 commit b1d30d1
Showing 1 changed file with 19 additions and 5 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,11 @@
"71TAN/JOH",
"91HOR/UEH",
"76SCH/KRI",
"99TEW/SCH"
]
## A manual exclusion list should no longer be necessary now that the
## duplicate_table column has been introduced, so we use an empty list instead:
MANUALLY_EXCLUDED_DUPLICATES = []
## Drop every row whose reference appears in the manual exclusion list
## (this removes nothing while the list is empty).
test_df = test_df[~test_df.reference.isin(MANUALLY_EXCLUDED_DUPLICATES)]
## Sanity check: no row whose "entry nr" is "duplicate" or "error" may have a missing id.
## On failure, the assertion message carries the offending rows (marked rows with a NaN id).
assert sum(test_df[((test_df["entry nr"]=="duplicate") | (test_df["entry nr"]=="error"))].id.isna())==0, ("Duplicate or error found for an empty-ID row", test_df[(((test_df["entry nr"]=="duplicate") | (test_df["entry nr"]=="error"))) & test_df.id.isna()])

Expand Down Expand Up @@ -64,8 +68,8 @@
## each table must be internally consistent (exactly one reference and one EC per table)
for which, g in df.groupby(["part","page","col l/r","table from top"]):
#print((which,g))
if which == (3, 1091, 1, 1):
continue
#if which == (3, 1091, 1, 1):
# continue

assert len(g.reference.unique())==1, (which, print(g.to_string()))
assert len(g.EC.unique()) == 1, (which, print(g.to_string()))
Expand All @@ -88,8 +92,15 @@
(3, 1041, 1),
(3, 1076, 2),
(7, 1360, 2),
(7, 1369, 2),
(7, 1373, 1),

]
## A manual exclusion list should no longer be necessary now that the duplicate_table column has been introduced, so we use an empty list:
MANUALLY_EXCLUDED_COLUMNS = []
## However, some tables do not exist in the PDF but are found in randr, so we still have to exclude these columns:
MANUALLY_EXCLUDED_COLUMNS = [
(6, 948, 2),
(6, 949, 1),
]
if which in MANUALLY_EXCLUDED_COLUMNS:
continue
Expand Down Expand Up @@ -144,7 +155,7 @@
"ionic_strength",
"p_h",
"p_mg",
#"K_prime",
"K_prime",
]
for s in SHOULD_BE_THE_SAME:
entries_where_both_are_nans = leftjoined[ leftjoined[f"{s}_x"].isna() & leftjoined[f"{s}_y"].isna() ]
Expand All @@ -155,7 +166,8 @@
assert (tmp[f"{s}_x"] == tmp[f"{s}_y"]).all(), (s, print(tmp[~(tmp[f"{s}_x"] == tmp[f"{s}_y"])][["id",f"{s}_x",f"{s}_y"]].to_string()))

## Did someone add a new row without an id when they should have corrected an existing row that already has an id?
merged = pandas.merge(online, noor, on=[ "reference",
merged = pandas.merge(online, noor, on=[
"reference",
"temperature",
"ionic_strength",
"p_h",
Expand Down Expand Up @@ -226,6 +238,8 @@
"https://w3id.org/related-to/doi.org/10.5281/zenodo.3978439/files/TECRDB.csv#entry82",
"https://w3id.org/related-to/doi.org/10.5281/zenodo.3978439/files/TECRDB.csv#entry806",
"https://w3id.org/related-to/doi.org/10.5281/zenodo.3978439/files/TECRDB.csv#entry805",
"https://w3id.org/related-to/doi.org/10.5281/zenodo.3978439/files/TECRDB.csv#entry3654",
"https://w3id.org/related-to/doi.org/10.5281/zenodo.3978439/files/TECRDB.csv#entry2277",
]
potential_errors = potential_errors[ ~potential_errors.id_y.isin(MANUALLY_EXCLUDED) ]
if len(potential_errors) > 0:
Expand Down

0 comments on commit b1d30d1

Please sign in to comment.