Skip to content

Commit 148dda1

Browse files
authored
Destinations Bigquery+Snowflake: Do not dedup raw table (#31520)
Co-authored-by: edgao <[email protected]>
1 parent 534ccb1 commit 148dda1

File tree

29 files changed

+292
-260
lines changed

29 files changed

+292
-260
lines changed

airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseSqlGeneratorIntegrationTest.java

+10-11
Original file line numberDiff line numberDiff line change
@@ -659,13 +659,11 @@ public void incrementalDedupNoCursor() throws Exception {
659659
final List<JsonNode> actualRawRecords = dumpRawTableRecords(streamId);
660660
final List<JsonNode> actualFinalRecords = dumpFinalTableRecords(streamId, "");
661661
verifyRecordCounts(
662-
1,
662+
2,
663663
actualRawRecords,
664664
1,
665665
actualFinalRecords);
666-
assertAll(
667-
() -> assertEquals("bar", actualRawRecords.get(0).get("_airbyte_data").get("string").asText()),
668-
() -> assertEquals("bar", actualFinalRecords.get(0).get(generator.buildColumnId("string").name()).asText()));
666+
assertEquals("bar", actualFinalRecords.get(0).get(generator.buildColumnId("string").name()).asText());
669667
}
670668

671669
@Test
@@ -796,10 +794,9 @@ public void cdcComplexUpdate() throws Exception {
796794
destinationHandler.execute(sql);
797795

798796
verifyRecordCounts(
799-
// We keep the newest raw record per PK
800-
7,
797+
11,
801798
dumpRawTableRecords(streamId),
802-
5,
799+
6,
803800
dumpFinalTableRecords(streamId, ""));
804801
}
805802

@@ -824,11 +821,12 @@ public void testCdcOrdering_updateAfterDelete() throws Exception {
824821
streamId,
825822
BaseTypingDedupingTest.readRecords("sqlgenerator/cdcordering_updateafterdelete_inputrecords.jsonl"));
826823

827-
final String sql = generator.updateTable(cdcIncrementalDedupStream, "", Optional.empty());
824+
final Optional<Instant> minTimestampForSync = destinationHandler.getMinTimestampForSync(cdcIncrementalAppendStream.id());
825+
final String sql = generator.updateTable(cdcIncrementalDedupStream, "", minTimestampForSync);
828826
destinationHandler.execute(sql);
829827

830828
verifyRecordCounts(
831-
1,
829+
2,
832830
dumpRawTableRecords(streamId),
833831
0,
834832
dumpFinalTableRecords(streamId, ""));
@@ -861,11 +859,12 @@ public void testCdcOrdering_insertAfterDelete() throws Exception {
861859
"",
862860
BaseTypingDedupingTest.readRecords("sqlgenerator/cdcordering_insertafterdelete_inputrecords_final.jsonl"));
863861

864-
final String sql = generator.updateTable(cdcIncrementalDedupStream, "", Optional.empty());
862+
final Optional<Instant> minTimestampForSync = destinationHandler.getMinTimestampForSync(cdcIncrementalAppendStream.id());
863+
final String sql = generator.updateTable(cdcIncrementalDedupStream, "", minTimestampForSync);
865864
destinationHandler.execute(sql);
866865

867866
verifyRecordCounts(
868-
1,
867+
2,
869868
dumpRawTableRecords(streamId),
870869
1,
871870
dumpFinalTableRecords(streamId, ""));

airbyte-integrations/bases/base-typing-deduping-test/src/main/java/io/airbyte/integrations/base/destination/typing_deduping/BaseTypingDedupingTest.java

+26-25
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44

55
package io.airbyte.integrations.base.destination.typing_deduping;
66

7+
import static org.junit.jupiter.api.Assertions.assertAll;
8+
79
import com.fasterxml.jackson.databind.JsonNode;
810
import com.fasterxml.jackson.databind.node.ObjectNode;
911
import com.google.common.collect.ImmutableMap;
@@ -226,7 +228,7 @@ public void fullRefreshOverwrite() throws Exception {
226228

227229
runSync(catalog, messages1);
228230

229-
final List<JsonNode> expectedRawRecords1 = readRecords("dat/sync1_expectedrecords_nondedup_raw.jsonl");
231+
final List<JsonNode> expectedRawRecords1 = readRecords("dat/sync1_expectedrecords_raw.jsonl");
230232
final List<JsonNode> expectedFinalRecords1 = readRecords("dat/sync1_expectedrecords_nondedup_final.jsonl");
231233
verifySyncResult(expectedRawRecords1, expectedFinalRecords1);
232234

@@ -261,7 +263,7 @@ public void fullRefreshAppend() throws Exception {
261263

262264
runSync(catalog, messages1);
263265

264-
final List<JsonNode> expectedRawRecords1 = readRecords("dat/sync1_expectedrecords_nondedup_raw.jsonl");
266+
final List<JsonNode> expectedRawRecords1 = readRecords("dat/sync1_expectedrecords_raw.jsonl");
265267
final List<JsonNode> expectedFinalRecords1 = readRecords("dat/sync1_expectedrecords_nondedup_final.jsonl");
266268
verifySyncResult(expectedRawRecords1, expectedFinalRecords1);
267269

@@ -270,7 +272,7 @@ public void fullRefreshAppend() throws Exception {
270272

271273
runSync(catalog, messages2);
272274

273-
final List<JsonNode> expectedRawRecords2 = readRecords("dat/sync2_expectedrecords_fullrefresh_append_raw.jsonl");
275+
final List<JsonNode> expectedRawRecords2 = readRecords("dat/sync2_expectedrecords_raw.jsonl");
274276
final List<JsonNode> expectedFinalRecords2 = readRecords("dat/sync2_expectedrecords_fullrefresh_append_final.jsonl");
275277
verifySyncResult(expectedRawRecords2, expectedFinalRecords2);
276278
}
@@ -300,7 +302,7 @@ public void incrementalAppend() throws Exception {
300302

301303
runSync(catalog, messages1);
302304

303-
final List<JsonNode> expectedRawRecords1 = readRecords("dat/sync1_expectedrecords_nondedup_raw.jsonl");
305+
final List<JsonNode> expectedRawRecords1 = readRecords("dat/sync1_expectedrecords_raw.jsonl");
304306
final List<JsonNode> expectedFinalRecords1 = readRecords("dat/sync1_expectedrecords_nondedup_final.jsonl");
305307
verifySyncResult(expectedRawRecords1, expectedFinalRecords1);
306308

@@ -309,7 +311,7 @@ public void incrementalAppend() throws Exception {
309311

310312
runSync(catalog, messages2);
311313

312-
final List<JsonNode> expectedRawRecords2 = readRecords("dat/sync2_expectedrecords_fullrefresh_append_raw.jsonl");
314+
final List<JsonNode> expectedRawRecords2 = readRecords("dat/sync2_expectedrecords_raw.jsonl");
313315
final List<JsonNode> expectedFinalRecords2 = readRecords("dat/sync2_expectedrecords_fullrefresh_append_final.jsonl");
314316
verifySyncResult(expectedRawRecords2, expectedFinalRecords2);
315317
}
@@ -337,7 +339,7 @@ public void incrementalDedup() throws Exception {
337339

338340
runSync(catalog, messages1);
339341

340-
final List<JsonNode> expectedRawRecords1 = readRecords("dat/sync1_expectedrecords_dedup_raw.jsonl");
342+
final List<JsonNode> expectedRawRecords1 = readRecords("dat/sync1_expectedrecords_raw.jsonl");
341343
final List<JsonNode> expectedFinalRecords1 = readRecords("dat/sync1_expectedrecords_dedup_final.jsonl");
342344
verifySyncResult(expectedRawRecords1, expectedFinalRecords1);
343345

@@ -346,7 +348,7 @@ public void incrementalDedup() throws Exception {
346348

347349
runSync(catalog, messages2);
348350

349-
final List<JsonNode> expectedRawRecords2 = readRecords("dat/sync2_expectedrecords_incremental_dedup_raw.jsonl");
351+
final List<JsonNode> expectedRawRecords2 = readRecords("dat/sync2_expectedrecords_raw.jsonl");
350352
final List<JsonNode> expectedFinalRecords2 = readRecords("dat/sync2_expectedrecords_incremental_dedup_final.jsonl");
351353
verifySyncResult(expectedRawRecords2, expectedFinalRecords2);
352354
}
@@ -372,7 +374,7 @@ public void incrementalDedupDefaultNamespace() throws Exception {
372374

373375
runSync(catalog, messages1);
374376

375-
final List<JsonNode> expectedRawRecords1 = readRecords("dat/sync1_expectedrecords_dedup_raw.jsonl");
377+
final List<JsonNode> expectedRawRecords1 = readRecords("dat/sync1_expectedrecords_raw.jsonl");
376378
final List<JsonNode> expectedFinalRecords1 = readRecords("dat/sync1_expectedrecords_dedup_final.jsonl");
377379
verifySyncResult(expectedRawRecords1, expectedFinalRecords1, null, streamName);
378380

@@ -381,7 +383,7 @@ public void incrementalDedupDefaultNamespace() throws Exception {
381383

382384
runSync(catalog, messages2);
383385

384-
final List<JsonNode> expectedRawRecords2 = readRecords("dat/sync2_expectedrecords_incremental_dedup_raw.jsonl");
386+
final List<JsonNode> expectedRawRecords2 = readRecords("dat/sync2_expectedrecords_raw.jsonl");
385387
final List<JsonNode> expectedFinalRecords2 = readRecords("dat/sync2_expectedrecords_incremental_dedup_final.jsonl");
386388
verifySyncResult(expectedRawRecords2, expectedFinalRecords2, null, streamName);
387389
}
@@ -424,7 +426,7 @@ public void testIncrementalSyncDropOneColumn() throws Exception {
424426

425427
runSync(catalog, messages1);
426428

427-
final List<JsonNode> expectedRawRecords1 = readRecords("dat/sync1_expectedrecords_nondedup_raw.jsonl");
429+
final List<JsonNode> expectedRawRecords1 = readRecords("dat/sync1_expectedrecords_raw.jsonl");
428430
final List<JsonNode> expectedFinalRecords1 = readRecords("dat/sync1_expectedrecords_nondedup_final.jsonl");
429431
verifySyncResult(expectedRawRecords1, expectedFinalRecords1);
430432

@@ -437,7 +439,7 @@ public void testIncrementalSyncDropOneColumn() throws Exception {
437439
runSync(catalog, messages2);
438440

439441
// The raw data is unaffected by the schema, but the final table should not have a `name` column.
440-
final List<JsonNode> expectedRawRecords2 = readRecords("dat/sync2_expectedrecords_fullrefresh_append_raw.jsonl");
442+
final List<JsonNode> expectedRawRecords2 = readRecords("dat/sync2_expectedrecords_raw.jsonl");
441443
final List<JsonNode> expectedFinalRecords2 = readRecords("dat/sync2_expectedrecords_fullrefresh_append_final.jsonl").stream()
442444
.peek(record -> ((ObjectNode) record).remove(getSqlGenerator().buildColumnId("name").name()))
443445
.toList();
@@ -500,12 +502,12 @@ public void incrementalDedupIdenticalName() throws Exception {
500502
runSync(catalog, messages1);
501503

502504
verifySyncResult(
503-
readRecords("dat/sync1_expectedrecords_dedup_raw.jsonl"),
505+
readRecords("dat/sync1_expectedrecords_raw.jsonl"),
504506
readRecords("dat/sync1_expectedrecords_dedup_final.jsonl"),
505507
namespace1,
506508
streamName);
507509
verifySyncResult(
508-
readRecords("dat/sync1_expectedrecords_dedup_raw2.jsonl"),
510+
readRecords("dat/sync1_expectedrecords_raw2.jsonl"),
509511
readRecords("dat/sync1_expectedrecords_dedup_final2.jsonl"),
510512
namespace2,
511513
streamName);
@@ -518,12 +520,12 @@ public void incrementalDedupIdenticalName() throws Exception {
518520
runSync(catalog, messages2);
519521

520522
verifySyncResult(
521-
readRecords("dat/sync2_expectedrecords_incremental_dedup_raw.jsonl"),
523+
readRecords("dat/sync2_expectedrecords_raw.jsonl"),
522524
readRecords("dat/sync2_expectedrecords_incremental_dedup_final.jsonl"),
523525
namespace1,
524526
streamName);
525527
verifySyncResult(
526-
readRecords("dat/sync2_expectedrecords_incremental_dedup_raw2.jsonl"),
528+
readRecords("dat/sync2_expectedrecords_raw2.jsonl"),
527529
readRecords("dat/sync2_expectedrecords_incremental_dedup_final2.jsonl"),
528530
namespace2,
529531
streamName);
@@ -585,16 +587,15 @@ public void identicalNameSimultaneousSync() throws Exception {
585587
// And this will dump sync2's entire stdout to our stdout
586588
endSync(sync2);
587589

588-
verifySyncResult(
589-
readRecords("dat/sync1_expectedrecords_dedup_raw.jsonl"),
590-
readRecords("dat/sync1_expectedrecords_dedup_final.jsonl"),
591-
namespace1,
592-
streamName);
593-
verifySyncResult(
594-
readRecords("dat/sync1_expectedrecords_dedup_raw2.jsonl"),
595-
readRecords("dat/sync1_expectedrecords_dedup_final2.jsonl"),
596-
namespace2,
597-
streamName);
590+
// For simplicity, don't verify the raw table. Assume that if the final table is correct, then
591+
// the raw data is correct. This is generally a safe assumption.
592+
assertAll(
593+
() -> DIFFER.diffFinalTableRecords(
594+
readRecords("dat/sync1_expectedrecords_dedup_final.jsonl"),
595+
dumpFinalTableRecords(namespace1, streamName)),
596+
() -> DIFFER.diffFinalTableRecords(
597+
readRecords("dat/sync1_expectedrecords_dedup_final2.jsonl"),
598+
dumpFinalTableRecords(namespace2, streamName)));
598599
}
599600

600601
@Test

airbyte-integrations/bases/base-typing-deduping-test/src/main/resources/sqlgenerator/cdcupdate_inputrecords_raw.jsonl

+1-1
Original file line numberDiff line numberDiff line change
@@ -12,5 +12,5 @@
1212
{"_airbyte_raw_id": "4d8674a5-eb6e-41ca-a310-69c64c88d101", "_airbyte_extracted_at": "2023-01-01T00:00:00Z", "_airbyte_data": {"id1": 0, "id2": 100, "updated_at": "2023-01-01T05:00:00Z", "_ab_cdc_deleted_at": null, "string": "zombie_returned"}}
1313
// CDC generally outputs an explicit null for deleted_at, but verify that we can also handle the case where deleted_at is unset.
1414
{"_airbyte_raw_id": "f0b59e49-8c74-4101-9f14-cb4d1193fd5a", "_airbyte_extracted_at": "2023-01-01T00:00:00Z", "_airbyte_data": {"id1": 4, "id2": 100, "updated_at": "2023-01-01T06:00:00Z", "string": "charlie"}}
15-
// Verify that we can handle weird values in deleted_at
15+
// Invalid values in _ab_cdc_deleted_at result in the record NOT being deleted. This behavior is up for debate, but it's an extreme edge case so not a high priority.
1616
{"_airbyte_raw_id": "d4e1d989-c115-403c-9e68-5d320e6376bb", "_airbyte_extracted_at": "2023-01-01T00:00:00Z", "_airbyte_data": {"id1": 5, "id2": 100, "updated_at": "2023-01-01T07:00:00Z", "_ab_cdc_deleted_at": {}, "string": "david1"}}

airbyte-integrations/connectors/destination-bigquery/metadata.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ data:
22
connectorSubtype: database
33
connectorType: destination
44
definitionId: 22f6c74f-5699-40ff-833c-4a879ea40133
5-
dockerImageTag: 2.1.6
5+
dockerImageTag: 2.2.0
66
dockerRepository: airbyte/destination-bigquery
77
githubIssueLabel: destination-bigquery
88
icon: bigquery.svg

0 commit comments

Comments
 (0)