Skip to content
This repository was archived by the owner on May 17, 2024. It is now read-only.

Commit c290356

Browse files
[WIP] Compare jsons as dicts, add json type to tests
1 parent 2d624d6 commit c290356

File tree

3 files changed

+50
-2
lines changed

3 files changed

+50
-2
lines changed

data_diff/hashdiff_tables.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from sqeleton.abcs import ColType_UUID, NumericType, PrecisionType, StringType, Boolean
1111

1212
from .info_tree import InfoTree
13-
from .utils import safezip
13+
from .utils import safezip, diffs_are_equiv_jsons
1414
from .thread_utils import ThreadedYielder
1515
from .table_segment import TableSegment
1616

@@ -24,7 +24,10 @@
2424
logger = logging.getLogger("hashdiff_tables")
2525

2626

27-
def diff_sets(a: set, b: set) -> Iterator:
27+
def diff_sets(a: list, b: list, has_json_cols: bool = None) -> Iterator:
28+
# check unless the only item is the key. TODO: pass a boolean to know whether the schema has json columns or not
29+
has_json_cols = len(a[0]) > 1
30+
2831
sa = set(a)
2932
sb = set(b)
3033

@@ -39,6 +42,8 @@ def diff_sets(a: set, b: set) -> Iterator:
3942
d[row[0]].append(("+", row))
4043

4144
for _k, v in sorted(d.items(), key=lambda i: i[0]):
45+
if has_json_cols and diffs_are_equiv_jsons(v):
46+
continue # don't count this as a diff, maybe do and send a warning, maybe parametrized ??
4247
yield from v
4348

4449

data_diff/utils.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import operator
66
import threading
77
from datetime import datetime
8+
import json
89

910

1011
def safezip(*args):
@@ -72,3 +73,22 @@ def get_timestamp(_match):
7273
return datetime.now().isoformat("_", "seconds").replace(":", "_")
7374

7475
return re.sub("%t", get_timestamp, name)
76+
77+
78+
def _jsons_equal(a, b):
79+
try:
80+
return json.loads(a) == json.loads(b)
81+
except (ValueError, TypeError, json.decoder.JSONDecodeError): # not valid jsons
82+
return False
83+
84+
85+
def diffs_are_equiv_jsons(v):
    """Tell whether a pair of diff entries differs only in JSON formatting.

    ``v`` is the list of ``(sign, row)`` entries collected for one key.
    The result is True only when there is exactly one '+' entry and one '-'
    entry and every column after the first is either equal or parses to
    equal JSON on both sides.
    """
    # Ignore rows that are missing in one of the tables.
    if len(v) != 2:
        return False
    entry_a = v[0]
    entry_b = v[1]
    if {entry_a[0], entry_b[0]} != {'+', '-'}:
        return False

    # Check all extra columns (everything after the leading key column).
    # TODO: would be more efficient if we pass the indices of json cols to only compare those
    return all(
        (left == right) or _jsons_equal(left, right)
        for left, right in safezip(entry_a[1][1:], entry_b[1][1:])
    )

tests/test_database_types.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,10 @@ def init_conns():
7474
"boolean": [
7575
"boolean",
7676
],
77+
"json": [
78+
"json",
79+
"jsonb"
80+
]
7781
},
7882
db.MySQL: {
7983
# https://dev.mysql.com/doc/refman/8.0/en/integer-types.html
@@ -199,6 +203,9 @@ def init_conns():
199203
"boolean": [
200204
"boolean",
201205
],
206+
"json": [
207+
"super",
208+
]
202209
},
203210
db.Oracle: {
204211
"int": [
@@ -469,12 +476,28 @@ def __iter__(self):
469476
return (uuid.uuid1(i) for i in range(self.max))
470477

471478

479+
class JsonFaker:
    """Iterable of hand-written JSON sample strings, capped at ``max`` items."""

    # Hand-written JSON documents served as sample values.
    MANUAL_FAKES = [
        '{"keyText": "text", "keyInt": 3, "keyFloat": 5.4445, "keyBoolean": true}',
    ]

    def __init__(self, max):
        # Upper bound on how many of the manual samples are exposed.
        self.max = max

    def __len__(self):
        """Return the number of samples iteration yields: the smaller of the cap and the pool size."""
        available = len(self.MANUAL_FAKES)
        return available if available < self.max else self.max

    def __iter__(self):
        """Iterate over the first ``max`` manual samples."""
        selected = self.MANUAL_FAKES[: self.max]
        return iter(selected)
492+
493+
472494
# Maps each type-category name to the faker instance that supplies its sample values.
TYPE_SAMPLES = {
    "int": IntFaker(N_SAMPLES),
    "datetime": DateTimeFaker(N_SAMPLES),
    "float": FloatFaker(N_SAMPLES),
    "uuid": UUID_Faker(N_SAMPLES),
    "boolean": BooleanFaker(N_SAMPLES),
    "json": JsonFaker(N_SAMPLES),
}
479502

480503

0 commit comments

Comments
 (0)