From 0d60f9a5003e744d1088f9a75abe063985c612de Mon Sep 17 00:00:00 2001 From: James Johnston Date: Tue, 25 Mar 2025 01:19:23 -0700 Subject: [PATCH] equality test can now compare columns with different names Each entry of the compare_columns argument can now optionally be a list of two column names. This allows the column names to be different between the base model and the other model when comparing two columns. The body of the macro is also refactored to make it more DRY, and to stop intertwining the different steps and capabilities. 1. First, we gather a set of numeric column names that we need to round. This is referenced later. 2. Next, we build two lists of column names that we want to compare: one list per model. If compare_columns was given, we build the lists from that. Otherwise, read column names from the first model, filter the excluded column names, and use the result for both models, as we did before this commit. 3. Now that we have lists of column names, we can build comma-separated lists for each model. At this point, we apply the number rounding expression if the column name is found in the set from step 1. Note this refactoring also cleans up some weird inconsistencies that resulted from duplication of logic. For example, suppose a model's column name was uppercase, and your database is case-sensitive. If you did not specify a "precision" argument, then you needed to provide an uppercase "compare_columns" argument. However, if you _did_ specify a "precision" argument, then the "compare_columns" argument was _NOT_ case-sensitive. It's pretty unexpected to have the case sensitivity of one argument be dependent on a seemingly-unrelated argument. 
--- README.md | 12 ++ ...test_equality_different_column_names_a.csv | 4 + ...test_equality_different_column_names_b.csv | 4 + .../models/generic_tests/schema.yml | 22 +++ macros/generic_tests/equality.sql | 130 +++++++++++------- 5 files changed, 123 insertions(+), 49 deletions(-) create mode 100644 integration_tests/data/schema_tests/data_test_equality_different_column_names_a.csv create mode 100644 integration_tests/data/schema_tests/data_test_equality_different_column_names_b.csv diff --git a/README.md b/README.md index 378c5930..c603705e 100644 --- a/README.md +++ b/README.md @@ -145,6 +145,18 @@ models: compare_model: ref('other_table_name') exclude_columns: - third_column + + # if the columns to be compared have different names, you can match them up like this + - name: model_name_different_names + tests: + - dbt_utils.equality: + compare_model: ref('other_table_name') + compare_columns: + - first_column + # This will compare `model_name_different_names.second_column_in_model` + # and `other_table_name.second_column_in_other_table` + - [second_column_in_model, second_column_in_other_table] + precision: 4 ``` ### expression_is_true ([source](macros/generic_tests/expression_is_true.sql)) diff --git a/integration_tests/data/schema_tests/data_test_equality_different_column_names_a.csv b/integration_tests/data/schema_tests/data_test_equality_different_column_names_a.csv new file mode 100644 index 00000000..756f55a0 --- /dev/null +++ b/integration_tests/data/schema_tests/data_test_equality_different_column_names_a.csv @@ -0,0 +1,4 @@ +col_a,tbl_a_col_b,tbl_a_col_float +1,1,1.100005 +1,2,1.200005 +2,3,1.300005 diff --git a/integration_tests/data/schema_tests/data_test_equality_different_column_names_b.csv b/integration_tests/data/schema_tests/data_test_equality_different_column_names_b.csv new file mode 100644 index 00000000..25b72e9f --- /dev/null +++ b/integration_tests/data/schema_tests/data_test_equality_different_column_names_b.csv @@ -0,0 +1,4 @@ 
+tbl_b_col_b,col_a,tbl_b_col_float +1,1,1.100006 +2,1,1.200007 +3,2,1.300008 diff --git a/integration_tests/models/generic_tests/schema.yml b/integration_tests/models/generic_tests/schema.yml index b12e3c7f..9f6a4779 100644 --- a/integration_tests/models/generic_tests/schema.yml +++ b/integration_tests/models/generic_tests/schema.yml @@ -164,6 +164,28 @@ seeds: exclude_columns: - col_c + - name: data_test_equality_different_column_names_a + data_tests: + - dbt_utils.equality: + compare_model: ref('data_test_equality_different_column_names_b') + compare_columns: + - col_a + - [tbl_a_col_b, tbl_b_col_b] + - dbt_utils.equality: + compare_model: ref('data_test_equality_different_column_names_b') + compare_columns: + - col_a + - [tbl_a_col_float, tbl_b_col_float] + precision: 4 + - dbt_utils.equality: + compare_model: ref('data_test_equality_different_column_names_b') + compare_columns: + - col_a + - [tbl_a_col_float, tbl_b_col_float] + precision: 8 + error_if: "<1" #sneaky way to ensure that the test is returning failing rows + warn_if: "<0" + - name: data_test_equality_floats_a data_tests: # test precision only diff --git a/macros/generic_tests/equality.sql b/macros/generic_tests/equality.sql index d7d7197c..66271fef 100644 --- a/macros/generic_tests/equality.sql +++ b/macros/generic_tests/equality.sql @@ -63,42 +63,15 @@ {%- endif -%} {% if compare_columns_set != compare_model_columns_set %} - {{ exceptions.raise_compiler_error(compare_model ~" has less columns than " ~ model ~ ", please ensure they have the same columns or use the `compare_columns` or `exclude_columns` arguments to subset them.") }} + {{ exceptions.raise_compiler_error(compare_model ~" has different columns than " ~ model ~ ", please ensure they have the same columns or use the `compare_columns` or `exclude_columns` arguments to subset them.") }} {% endif %} {% endif %} -{%- if not precision -%} - {%- if not compare_columns -%} - {# - You cannot get the columns in an ephemeral model (due to not 
existing in the information schema), - so if the user does not provide an explicit list of columns we must error in the case it is ephemeral - #} - {%- do dbt_utils._is_ephemeral(model, 'test_equality') -%} - {%- set compare_columns = adapter.get_columns_in_relation(model)-%} - - {%- if exclude_columns -%} - {#-- Lower case ignore columns for easier comparison --#} - {%- set exclude_columns = exclude_columns | map("lower") | list %} - - {# Filter out the excluded columns #} - {%- set include_columns = [] %} - {%- for column in compare_columns -%} - {%- if column.name | lower not in exclude_columns -%} - {% do include_columns.append(column) %} - {%- endif %} - {%- endfor %} - - {%- set compare_columns = include_columns | map(attribute='quoted') %} - {%- else -%} {# Compare columns provided #} - {%- set compare_columns = compare_columns | map(attribute='quoted') %} - {%- endif -%} - {%- endif -%} - - {% set compare_cols_csv = compare_columns | join(', ') %} - -{% else %} {# Precision required #} +{# If testing with precision, then find out which columns in the main input model are numeric #} +{%- set numeric_columns = {} -%} +{%- if precision -%} {#- If rounding is required, we need to get the types, so it cannot be ephemeral even if they provide column names -#} @@ -107,23 +80,82 @@ {% set columns_list = [] %} {%- for col in columns -%} - {%- if ( - (col.name|lower in compare_columns|map('lower') or not compare_columns) and - (col.name|lower not in exclude_columns|map('lower') or not exclude_columns) - ) -%} - {# Databricks double type is not picked up by any number type checks in dbt #} - {%- if col.is_float() or col.is_numeric() or col.data_type == 'double' -%} - {# Cast is required due to postgres not having round for a double precision number #} - {%- do columns_list.append('round(cast(' ~ col.quoted ~ ' as ' ~ dbt.type_numeric() ~ '),' ~ precision ~ ') as ' ~ col.quoted) -%} - {%- else -%} {# Non-numeric type #} - {%- do columns_list.append(col.quoted) -%} - 
{%- endif -%} - {% endif %} + {# Databricks double type is not picked up by any number type checks in dbt #} + {%- if col.is_float() or col.is_numeric() or col.data_type == 'double' -%} + {#- Lower case the column name for easier case-insensitive comparison -#} + {%- do numeric_columns.update({col.name|lower: true}) -%} + {#- Also include the quoted version, since we may see it as well. -#} + {%- do numeric_columns.update({col.quoted|lower: true}) -%} + {%- endif -%} {%- endfor -%} +{%- endif -%} - {% set compare_cols_csv = columns_list | join(', ') %} +{# If compare_columns is provided, sort any given arrays into lists of columns for each model #} +{%- if compare_columns -%} + {%- set compare_columns__model = [] %} + {%- set compare_columns__compare_model = [] %} + + {%- for column in compare_columns -%} + {%- if column is string -%} + {# A simple string was given. Assume the same column name in both models. #} + {%- do compare_columns__model.append(column) -%} + {%- do compare_columns__compare_model.append(column) -%} + {%- elif column is iterable and column | length == 2 -%} + {%- do compare_columns__model.append(column[0]) -%} + {%- do compare_columns__compare_model.append(column[1]) -%} + {%- else -%} + {{ exceptions.raise_compiler_error("compare_columns must be a string or a list of 2 strings") }} + {%- endif -%} + {%- endfor -%} +{%- else -%} + {# + You cannot get the columns in an ephemeral model (due to not existing in the information schema), + so if the user does not provide an explicit list of columns we must error in the case it is ephemeral + #} + {%- do dbt_utils._is_ephemeral(model, 'test_equality') -%} + {%- set model_columns = adapter.get_columns_in_relation(model)-%} -{% endif %} + {%- if exclude_columns -%} + {#-- Lower case ignore columns for easier comparison --#} + {%- set exclude_columns = exclude_columns | map("lower") | list %} + {%- endif -%} + + {# Filter out the excluded columns #} + {%- set include_columns = [] %} + {%- for column in 
model_columns -%} + {%- if (not exclude_columns) or (column.name | lower not in exclude_columns) -%} + {% do include_columns.append(column) %} + {%- endif %} + {%- endfor %} + + {# Assume same column names in the comparison model, since no alternates were given using compare_columns. #} + {%- set compare_columns__model = include_columns | map(attribute='quoted') | list %} + {%- set compare_columns__compare_model = compare_columns__model %} +{%- endif -%} + +{# Build comma-delimited lists of column names for each input model. Round numeric types as needed. #} +{%- set compare_columns_csv = [] -%} +{%- set numeric_column_indexes_in_first_model = [] -%} +{%- for this_model_compare_columns in [compare_columns__model, compare_columns__compare_model] -%} + {%- set columns_list = [] %} + {%- set is_first_model = loop.first -%} + + {%- for this_compare_column in this_model_compare_columns -%} + {# NOTE: We assume any numeric columns in the first model are also numeric in the second model #} + {%- if (is_first_model and this_compare_column|lower in numeric_columns) or (loop.index0 in numeric_column_indexes_in_first_model) -%} + {# Cast is required due to postgres not having round for a double precision number #} + {%- do columns_list.append('round(cast(' ~ this_compare_column ~ ' as ' ~ dbt.type_numeric() ~ '),' ~ precision ~ ') as ' ~ this_compare_column) -%} + + {%- if is_first_model -%} + {%- do numeric_column_indexes_in_first_model.append(loop.index0) -%} + {%- endif -%} + {%- else -%} {# Non-numeric type #} + {%- do columns_list.append(this_compare_column) -%} + {%- endif -%} + {%- endfor -%} + + {%- do compare_columns_csv.append(columns_list | join(', ')) -%} +{%- endfor -%} with a as ( @@ -139,17 +171,17 @@ b as ( a_minus_b as ( - select {{compare_cols_csv}} from a + select {{compare_columns_csv[0]}} from a {{ dbt.except() }} - select {{compare_cols_csv}} from b + select {{compare_columns_csv[1]}} from b ), b_minus_a as ( - select {{compare_cols_csv}} from b + select 
{{compare_columns_csv[1]}} from b {{ dbt.except() }} - select {{compare_cols_csv}} from a + select {{compare_columns_csv[0]}} from a ),