From b1d30d1c4b203e814f7b26d24cfc16d70337a7fe Mon Sep 17 00:00:00 2001
From: Robert Giessmann <rgiessmann@gmail.com>
Date: Sun, 19 May 2024 19:11:44 +0200
Subject: [PATCH] add more manual exceptions

---
 ...on-quality-check-and-extract-table-data.py | 24 +++++++++++++++----
 1 file changed, 19 insertions(+), 5 deletions(-)

diff --git a/materials/tecrdb recuration/2024-01-06-opentecr-recuration-quality-check-and-extract-table-data.py b/materials/tecrdb recuration/2024-01-06-opentecr-recuration-quality-check-and-extract-table-data.py
index d20dc2f..28499a8 100644
--- a/materials/tecrdb recuration/2024-01-06-opentecr-recuration-quality-check-and-extract-table-data.py	
+++ b/materials/tecrdb recuration/2024-01-06-opentecr-recuration-quality-check-and-extract-table-data.py	
@@ -34,7 +34,11 @@
         "71TAN/JOH",
         "91HOR/UEH",
         "76SCH/KRI",
+        "99TEW/SCH"
     ]
+    ## The list above should no longer be necessary since the duplicate_table column was introduced, so we override it with an empty list:
+    MANUALLY_EXCLUDED_DUPLICATES = []
+    ##
     test_df = test_df[~test_df.reference.isin(MANUALLY_EXCLUDED_DUPLICATES)]
     assert sum(test_df[((test_df["entry nr"]=="duplicate") | (test_df["entry nr"]=="error"))].id.isna())==0, ("Duplicate or error found for an empty-ID row", test_df[(((test_df["entry nr"]=="duplicate") | (test_df["entry nr"]=="error"))) & test_df.id.isna()])
 
@@ -64,8 +68,8 @@
     ## tables intact in themselves
     for which, g in df.groupby(["part","page","col l/r","table from top"]):
         #print((which,g))
-        if which == (3, 1091, 1, 1):
-            continue
+        #if which == (3, 1091, 1, 1):
+        #    continue
             
         assert len(g.reference.unique())==1, (which, print(g.to_string()))
         assert len(g.EC.unique()) == 1, (which, print(g.to_string()))
@@ -88,8 +92,15 @@
             (3, 1041, 1),
             (3, 1076, 2),
             (7, 1360, 2),
+            (7, 1369, 2),
             (7, 1373, 1),
-
+        ]
+        ## The list above should no longer be necessary since the duplicate_table column was introduced, so we reset it to an empty list:
+        MANUALLY_EXCLUDED_COLUMNS = []
+        ## However, some tables do not exist in the PDF but are found in randr; these still have to be excluded manually, so we use:
+        MANUALLY_EXCLUDED_COLUMNS = [
+            (6, 948, 2),
+            (6, 949, 1),
         ]
         if which in MANUALLY_EXCLUDED_COLUMNS:
             continue
@@ -144,7 +155,7 @@
     "ionic_strength",
     "p_h",
     "p_mg",
-    #"K_prime",
+    "K_prime",
 ]
 for s in SHOULD_BE_THE_SAME:
     entries_where_both_are_nans = leftjoined[ leftjoined[f"{s}_x"].isna() & leftjoined[f"{s}_y"].isna() ]
@@ -155,7 +166,8 @@
         assert (tmp[f"{s}_x"] == tmp[f"{s}_y"]).all(), (s, print(tmp[~(tmp[f"{s}_x"] == tmp[f"{s}_y"])][["id",f"{s}_x",f"{s}_y"]].to_string()))
 
 ## did someone add a new row without id, where they should have corrected a row with id?
-merged = pandas.merge(online, noor, on=[    "reference",
+merged = pandas.merge(online, noor, on=[
+    "reference",
     "temperature",
     "ionic_strength",
     "p_h",
@@ -226,6 +238,8 @@
     "https://w3id.org/related-to/doi.org/10.5281/zenodo.3978439/files/TECRDB.csv#entry82",
     "https://w3id.org/related-to/doi.org/10.5281/zenodo.3978439/files/TECRDB.csv#entry806",
     "https://w3id.org/related-to/doi.org/10.5281/zenodo.3978439/files/TECRDB.csv#entry805",
+    "https://w3id.org/related-to/doi.org/10.5281/zenodo.3978439/files/TECRDB.csv#entry3654",
+    "https://w3id.org/related-to/doi.org/10.5281/zenodo.3978439/files/TECRDB.csv#entry2277",
 ]
 potential_errors = potential_errors[ ~potential_errors.id_y.isin(MANUALLY_EXCLUDED) ]
 if len(potential_errors) > 0: