diff --git a/README.md b/README.md index 378c5930..c603705e 100644 --- a/README.md +++ b/README.md @@ -145,6 +145,18 @@ models: compare_model: ref('other_table_name') exclude_columns: - third_column + + # if the columns to be compared have different names, you can match them up like this + - name: model_name_different_names + tests: + - dbt_utils.equality: + compare_model: ref('other_table_name') + compare_columns: + - first_column + # This will compare `model_name_different_names.second_column_in_model` + # and `other_table_name.second_column_in_other_table` + - [second_column_in_model, second_column_in_other_table] + precision: 4 ``` ### expression_is_true ([source](macros/generic_tests/expression_is_true.sql)) diff --git a/integration_tests/data/schema_tests/data_test_equality_different_column_names_a.csv b/integration_tests/data/schema_tests/data_test_equality_different_column_names_a.csv new file mode 100644 index 00000000..756f55a0 --- /dev/null +++ b/integration_tests/data/schema_tests/data_test_equality_different_column_names_a.csv @@ -0,0 +1,4 @@ +col_a,tbl_a_col_b,tbl_a_col_float +1,1,1.100005 +1,2,1.200005 +2,3,1.300005 diff --git a/integration_tests/data/schema_tests/data_test_equality_different_column_names_b.csv b/integration_tests/data/schema_tests/data_test_equality_different_column_names_b.csv new file mode 100644 index 00000000..25b72e9f --- /dev/null +++ b/integration_tests/data/schema_tests/data_test_equality_different_column_names_b.csv @@ -0,0 +1,4 @@ +tbl_b_col_b,col_a,tbl_b_col_float +1,1,1.100006 +2,1,1.200007 +3,2,1.300008 diff --git a/integration_tests/models/generic_tests/schema.yml b/integration_tests/models/generic_tests/schema.yml index b12e3c7f..9f6a4779 100644 --- a/integration_tests/models/generic_tests/schema.yml +++ b/integration_tests/models/generic_tests/schema.yml @@ -164,6 +164,28 @@ seeds: exclude_columns: - col_c + - name: data_test_equality_different_column_names_a + data_tests: + - dbt_utils.equality: + compare_model: ref('data_test_equality_different_column_names_b') + compare_columns: + - col_a + - [tbl_a_col_b, tbl_b_col_b] + - dbt_utils.equality: + compare_model: ref('data_test_equality_different_column_names_b') + compare_columns: + - col_a + - [tbl_a_col_float, tbl_b_col_float] + precision: 4 + - dbt_utils.equality: + compare_model: ref('data_test_equality_different_column_names_b') + compare_columns: + - col_a + - [tbl_a_col_float, tbl_b_col_float] + precision: 8 + error_if: "<1" #sneaky way to ensure that the test is returning failing rows + warn_if: "<0" + - name: data_test_equality_floats_a data_tests: # test precision only diff --git a/macros/generic_tests/equality.sql b/macros/generic_tests/equality.sql index d7d7197c..66271fef 100644 --- a/macros/generic_tests/equality.sql +++ b/macros/generic_tests/equality.sql @@ -63,42 +63,15 @@ {%- endif -%} {% if compare_columns_set != compare_model_columns_set %} - {{ exceptions.raise_compiler_error(compare_model ~" has less columns than " ~ model ~ ", please ensure they have the same columns or use the `compare_columns` or `exclude_columns` arguments to subset them.") }} + {{ exceptions.raise_compiler_error(compare_model ~" has different columns than " ~ model ~ ", please ensure they have the same columns or use the `compare_columns` or `exclude_columns` arguments to subset them.") }} {% endif %} {% endif %} -{%- if not precision -%} - {%- if not compare_columns -%} - {# - You cannot get the columns in an ephemeral model (due to not existing in the information schema), - so if the user does not provide an explicit list of columns we must error in the case it is ephemeral - #} - {%- do dbt_utils._is_ephemeral(model, 'test_equality') -%} - {%- set compare_columns = adapter.get_columns_in_relation(model)-%} - - {%- if exclude_columns -%} - {#-- Lower case ignore columns for easier comparison --#} - {%- set exclude_columns = exclude_columns | map("lower") | list %} - - {# Filter out the excluded columns #} - {%- set include_columns = [] %} - {%- for column in compare_columns -%} - {%- if column.name | lower not in exclude_columns -%} - {% do include_columns.append(column) %} - {%- endif %} - {%- endfor %} - - {%- set compare_columns = include_columns | map(attribute='quoted') %} - {%- else -%} {# Compare columns provided #} - {%- set compare_columns = compare_columns | map(attribute='quoted') %} - {%- endif -%} - {%- endif -%} - - {% set compare_cols_csv = compare_columns | join(', ') %} - -{% else %} {# Precision required #} +{# If testing with precision, then find out which columns in the main input model are numeric #} +{%- set numeric_columns = {} -%} +{%- if precision -%} {#- If rounding is required, we need to get the types, so it cannot be ephemeral even if they provide column names -#} @@ -107,23 +80,82 @@ {% set columns_list = [] %} {%- for col in columns -%} - {%- if ( - (col.name|lower in compare_columns|map('lower') or not compare_columns) and - (col.name|lower not in exclude_columns|map('lower') or not exclude_columns) - ) -%} - {# Databricks double type is not picked up by any number type checks in dbt #} - {%- if col.is_float() or col.is_numeric() or col.data_type == 'double' -%} - {# Cast is required due to postgres not having round for a double precision number #} - {%- do columns_list.append('round(cast(' ~ col.quoted ~ ' as ' ~ dbt.type_numeric() ~ '),' ~ precision ~ ') as ' ~ col.quoted) -%} - {%- else -%} {# Non-numeric type #} - {%- do columns_list.append(col.quoted) -%} - {%- endif -%} - {% endif %} + {# Databricks double type is not picked up by any number type checks in dbt #} + {%- if col.is_float() or col.is_numeric() or col.data_type == 'double' -%} + {#- Lower case the column name for easier case-insensitive comparison -#} + {%- do numeric_columns.update({col.name|lower: true}) -%} + {#- Also include the quoted version, since we may see it as well. -#} + {%- do numeric_columns.update({col.quoted|lower: true}) -%} + {%- endif -%} {%- endfor -%} +{%- endif -%} - {% set compare_cols_csv = columns_list | join(', ') %} +{# If compare_columns is provided, sort any given arrays into lists of columns for each model #} +{%- if compare_columns -%} + {%- set compare_columns__model = [] %} + {%- set compare_columns__compare_model = [] %} + + {%- for column in compare_columns -%} + {%- if column is string -%} + {# A simple string was given. Assume the same column name in both models. #} + {%- do compare_columns__model.append(column) -%} + {%- do compare_columns__compare_model.append(column) -%} + {%- elif column is iterable and column | length == 2 -%} + {%- do compare_columns__model.append(column[0]) -%} + {%- do compare_columns__compare_model.append(column[1]) -%} + {%- else -%} + {{ exceptions.raise_compiler_error("compare_columns must be a string or a list of 2 strings") }} + {%- endif -%} + {%- endfor -%} +{%- else -%} + {# + You cannot get the columns in an ephemeral model (due to not existing in the information schema), + so if the user does not provide an explicit list of columns we must error in the case it is ephemeral + #} + {%- do dbt_utils._is_ephemeral(model, 'test_equality') -%} + {%- set model_columns = adapter.get_columns_in_relation(model)-%} -{% endif %} + {%- if exclude_columns -%} + {#-- Lower case ignore columns for easier comparison --#} + {%- set exclude_columns = exclude_columns | map("lower") | list %} + {%- endif -%} + + {# Filter out the excluded columns #} + {%- set include_columns = [] %} + {%- for column in model_columns -%} + {%- if (not exclude_columns) or (column.name | lower not in exclude_columns) -%} + {% do include_columns.append(column) %} + {%- endif %} + {%- endfor %} + + {# Assume same column names in the comparison model, since no alternates were given using compare_columns. #} + {%- set compare_columns__model = include_columns | map(attribute='quoted') | list %} + {%- set compare_columns__compare_model = compare_columns__model %} +{%- endif -%} + +{# Build comma-delimited lists of column names for each input model. Round numeric types as needed. #} +{%- set compare_columns_csv = [] -%} +{%- set numeric_column_indexes_in_first_model = [] -%} +{%- for this_model_compare_columns in [compare_columns__model, compare_columns__compare_model] -%} + {%- set columns_list = [] %} + {%- set is_first_model = loop.first -%} + + {%- for this_compare_column in this_model_compare_columns -%} + {# NOTE: We assume any numeric columns in the first model are also numeric in the second model #} + {%- if (is_first_model and this_compare_column|lower in numeric_columns) or (loop.index0 in numeric_column_indexes_in_first_model) -%} + {# Cast is required due to postgres not having round for a double precision number #} + {%- do columns_list.append('round(cast(' ~ this_compare_column ~ ' as ' ~ dbt.type_numeric() ~ '),' ~ precision ~ ') as ' ~ this_compare_column) -%} + + {%- if is_first_model -%} + {%- do numeric_column_indexes_in_first_model.append(loop.index0) -%} + {%- endif -%} + {%- else -%} {# Non-numeric type #} + {%- do columns_list.append(this_compare_column) -%} + {%- endif -%} + {%- endfor -%} + + {%- do compare_columns_csv.append(columns_list | join(', ')) -%} +{%- endfor -%} with a as ( @@ -139,17 +171,17 @@ b as ( a_minus_b as ( - select {{compare_cols_csv}} from a + select {{compare_columns_csv[0]}} from a {{ dbt.except() }} - select {{compare_cols_csv}} from b + select {{compare_columns_csv[1]}} from b ), b_minus_a as ( - select {{compare_cols_csv}} from b + select {{compare_columns_csv[1]}} from b {{ dbt.except() }} - select {{compare_cols_csv}} from a + select {{compare_columns_csv[0]}} from a ),