Skip to content
This repository was archived by the owner on May 17, 2024. It is now read-only.

Commit bc0f757

Browse files
Add coltype-aware matching and warning
1 parent 64eb985 commit bc0f757

File tree

3 files changed

+39
-24
lines changed

3 files changed

+39
-24
lines changed

data_diff/hashdiff_tables.py

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77

88
from runtype import dataclass
99

10-
from sqeleton.abcs import ColType_UUID, NumericType, PrecisionType, StringType, Boolean
10+
from sqeleton.abcs import ColType_UUID, NumericType, PrecisionType, StringType, Boolean, JSONType
1111

1212
from .info_tree import InfoTree
1313
from .utils import safezip, diffs_are_equiv_jsons
@@ -24,10 +24,7 @@
2424
logger = logging.getLogger("hashdiff_tables")
2525

2626

27-
def diff_sets(a: list, b: list, has_json_cols: bool = None) -> Iterator:
28-
# check unless the only item is the key. TODO: pass a boolean to know whether the schema has json columns or not
29-
has_json_cols = len(a[0]) > 1
30-
27+
def diff_sets(a: list, b: list, json_cols: dict = None) -> Iterator:
3128
sa = set(a)
3229
sb = set(b)
3330

@@ -41,9 +38,17 @@ def diff_sets(a: list, b: list, has_json_cols: bool = None) -> Iterator:
4138
if row not in sa:
4239
d[row[0]].append(("+", row))
4340

41+
warned_diff_cols = set()
4442
for _k, v in sorted(d.items(), key=lambda i: i[0]):
45-
if has_json_cols and diffs_are_equiv_jsons(v):
46-
continue # don't count this as a diff, maybe do and send a warning, maybe parametrized ??
43+
if json_cols:
44+
parsed_match, overriden_diff_cols = diffs_are_equiv_jsons(v, json_cols)
45+
if parsed_match:
46+
to_warn = overriden_diff_cols - warned_diff_cols
47+
for w in to_warn:
48+
logger.warning(f"Equivalent JSON objects with different string representations detected "
49+
f"in column '{w}'. These cases are NOT reported as differences.")
50+
warned_diff_cols.add(w)
51+
continue
4752
yield from v
4853

4954

@@ -199,7 +204,9 @@ def _bisect_and_diff_segments(
199204
# This saves time, as bisection speed is limited by ping and query performance.
200205
if max_rows < self.bisection_threshold or max_space_size < self.bisection_factor * 2:
201206
rows1, rows2 = self._threaded_call("get_values", [table1, table2])
202-
diff = list(diff_sets(rows1, rows2))
207+
json_cols = {i: colname for i, colname in enumerate(table1.extra_columns)
208+
if isinstance(table1._schema[colname], JSONType)}
209+
diff = list(diff_sets(rows1, rows2, json_cols))
203210

204211
info_tree.info.set_diff(diff)
205212
info_tree.info.rowcounts = {1: len(rows1), 2: len(rows2)}

data_diff/utils.py

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -75,20 +75,25 @@ def get_timestamp(_match):
7575
return re.sub("%t", get_timestamp, name)
7676

7777

78-
def _jsons_equal(a, b):
78+
def _jsons_equiv(a: str, b: str):
7979
try:
8080
return json.loads(a) == json.loads(b)
8181
except (ValueError, TypeError, json.decoder.JSONDecodeError): # not valid jsons
8282
return False
8383

8484

85-
def diffs_are_equiv_jsons(v):
86-
if (len(v) != 2) or ({v[0][0], v[1][0]} != {'+', '-'}): # ignore rows that are missing in one of the tables
85+
def diffs_are_equiv_jsons(diff: list, json_cols: dict):
    """Decide whether a 2-row diff pair differs only in JSON columns whose
    string representations parse to equivalent objects.

    diff: list of ("+"/"-", row) entries for one key; row[0] is the key,
          row[1:] are the extra columns.
    json_cols: {index-within-extra-columns: column name} for JSONType columns.

    Returns a tuple (match, overriden_diff_cols):
      match -- True when every column pair is equal as-is, or is a JSON
               column whose two representations parse to equal objects.
      overriden_diff_cols -- names of JSON columns whose textual difference
               was overridden by JSON equivalence.
    """
    # Only a matched '-'/'+' pair can be JSON-equivalent; a row present in
    # just one table is always a real diff.
    if (len(diff) != 2) or ({diff[0][0], diff[1][0]} != {'+', '-'}):
        # Bug fix: return a 2-tuple like the success path below, so the
        # caller's unpacking (`parsed_match, cols = ...`) cannot raise
        # "cannot unpack non-iterable bool".
        return False, set()
    match = True
    overriden_diff_cols = set()
    # row[1][0] is the key, so extra columns start at row[1][1:].
    for i, (col_a, col_b) in enumerate(safezip(diff[0][1][1:], diff[1][1][1:])):
        # We only attempt to parse columns of JSONType, but we still need
        # to check that non-json columns match exactly.
        match = col_a == col_b
        if not match and (i in json_cols):
            if _jsons_equiv(col_a, col_b):
                overriden_diff_cols.add(json_cols[i])
                match = True
        if not match:
            break
    return match, overriden_diff_cols

tests/test_database_types.py

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -569,7 +569,7 @@ def expand_params(testcase_func, param_num, param):
569569
return name
570570

571571

572-
def _insert_to_table(conn, table_path, values, type):
572+
def _insert_to_table(conn, table_path, values, coltype):
573573
tbl = table(table_path)
574574

575575
current_n_rows = conn.query(tbl.count(), int)
@@ -578,31 +578,34 @@ def _insert_to_table(conn, table_path, values, type):
578578
return
579579
elif current_n_rows > 0:
580580
conn.query(drop_table(table_name))
581-
_create_table_with_indexes(conn, table_path, type)
581+
_create_table_with_indexes(conn, table_path, coltype)
582582

583583
# if BENCHMARK and N_SAMPLES > 10_000:
584584
# description = f"{conn.name}: {table}"
585585
# values = rich.progress.track(values, total=N_SAMPLES, description=description)
586586

587-
if type == "boolean":
587+
if coltype == "boolean":
588588
values = [(i, bool(sample)) for i, sample in values]
589-
elif re.search(r"(time zone|tz)", type):
589+
elif re.search(r"(time zone|tz)", coltype):
590590
values = [(i, sample.replace(tzinfo=timezone.utc)) for i, sample in values]
591591

592592
if isinstance(conn, db.Clickhouse):
593-
if type.startswith("DateTime64"):
593+
if coltype.startswith("DateTime64"):
594594
values = [(i, f"{sample.replace(tzinfo=None)}") for i, sample in values]
595595

596-
elif type == "DateTime":
596+
elif coltype == "DateTime":
597597
# Clickhouse's DateTime does not allow to store micro/milli/nano seconds
598598
values = [(i, str(sample)[:19]) for i, sample in values]
599599

600-
elif type.startswith("Decimal("):
601-
precision = int(type[8:].rstrip(")").split(",")[1])
600+
elif coltype.startswith("Decimal("):
601+
precision = int(coltype[8:].rstrip(")").split(",")[1])
602602
values = [(i, round(sample, precision)) for i, sample in values]
603-
elif isinstance(conn, db.BigQuery) and type == "datetime":
603+
elif isinstance(conn, db.BigQuery) and coltype == "datetime":
604604
values = [(i, Code(f"cast(timestamp '{sample}' as datetime)")) for i, sample in values]
605605

606+
if isinstance(conn, db.Redshift) and coltype == "json":
607+
values = [(i, Code(f"JSON_PARSE('{sample}')")) for i, sample in values]
608+
606609
insert_rows_in_batches(conn, tbl, values, columns=["id", "col"])
607610
conn.query(commit)
608611

0 commit comments

Comments
 (0)