diff --git a/README.md b/README.md index 378c5930..3c066630 100644 --- a/README.md +++ b/README.md @@ -613,6 +613,27 @@ models: where: "num_orders > 0" ``` +### functional_dependency ([source](macros/generic_tests/functional_dependency.sql)) + +This test confirms that a particular column is *functionally dependent* on one or more other columns. That is, for each distinct combination of those other columns, there should be no more than one distinct value in the tested column. `NULL` values in the tested column are ignored when counting distinct values. + +This test is often useful for denormalized source data, where logical relationships between fields are implicitly expected but don't always hold because of manual entry errors or merges from different systems. Broken functional dependencies often surface as duplicates and other anomalies downstream. + +*Common misunderstanding*: Functional dependency is *not* uniqueness. Functional dependency checks that there is at most one distinct value (in each group), but allows that value to appear many times. Uniqueness allows many distinct values, but checks that each value appears only once. 
+ +**Usage:** + +```yaml +models: + - name: orders + columns: + - name: customer_name + tests: + - dbt_utils.functional_dependency: + depends_on: + - customer_id +``` + ---- ### Grouping in tests diff --git a/integration_tests/data/schema_tests/data_test_functional_dependency.csv b/integration_tests/data/schema_tests/data_test_functional_dependency.csv new file mode 100644 index 00000000..1b431244 --- /dev/null +++ b/integration_tests/data/schema_tests/data_test_functional_dependency.csv @@ -0,0 +1,6 @@ +order_id,customer_id,customer_name +1001,1,Ash +1002,2,Brock +1003,2,Brock +1004,3,Ash +1005,4, diff --git a/integration_tests/data/schema_tests/schema.yml b/integration_tests/data/schema_tests/schema.yml index 7b57a911..be49c844 100644 --- a/integration_tests/data/schema_tests/schema.yml +++ b/integration_tests/data/schema_tests/schema.yml @@ -19,3 +19,14 @@ seeds: - dbt_utils.sequential_values: interval: 1 datepart: 'hour' + + + - name: data_test_functional_dependency + columns: + - name: order_id + - name: customer_id + - name: customer_name + data_tests: + - dbt_utils.functional_dependency: + depends_on: + - customer_id
diff --git a/macros/generic_tests/functional_dependency.sql b/macros/generic_tests/functional_dependency.sql
new file mode 100644
index 00000000..1bb06374
--- /dev/null
+++ b/macros/generic_tests/functional_dependency.sql
@@ -0,0 +1,56 @@
+{#
+    Generic test: asserts that `column_name` is functionally dependent on the
+    column(s) in `depends_on`, i.e. each distinct combination of `depends_on`
+    values maps to at most one distinct non-null value of `column_name`.
+    Returns one row per violating combination.
+
+    Arguments:
+        model: relation under test (supplied by dbt).
+        column_name: the dependent column (supplied by dbt for column-level tests).
+        depends_on: list of determinant column names, or a single name as a string.
+        quote_columns: when true, each `depends_on` column is passed through
+            `adapter.quote`. Defaults to False.
+#}
+{% test functional_dependency(model, column_name, depends_on, quote_columns=False) %}
+  {{ return(adapter.dispatch('test_functional_dependency', 'dbt_utils')(model, column_name, depends_on, quote_columns)) }}
+{% endtest %}
+
+
+{% macro default__test_functional_dependency(model, column_name, depends_on, quote_columns=False) %}
+
+{#- Accept a bare string as shorthand for a one-column list; joining a string
+    directly would split it into single characters. -#}
+{%- set determinant_columns = [depends_on] if depends_on is string else depends_on -%}
+
+{#- Fail at compile time instead of emitting `select  from ... group by`. -#}
+{%- if not determinant_columns -%}
+    {{ exceptions.raise_compiler_error(
+        "`depends_on` argument for functional_dependency test must name at least one column"
+    ) }}
+{%- endif -%}
+
+{#- `quote_columns` is interpreted by truthiness, so no separate "invalid value"
+    branch is reachable. `do` is used because `append` returns None. -#}
+{%- set column_list = [] -%}
+{%- for column in determinant_columns -%}
+    {%- do column_list.append(adapter.quote(column) if quote_columns else column) -%}
+{%- endfor -%}
+
+{%- set columns_csv = column_list | join(', ') %}
+
+{# count(distinct ...) skips NULLs, so a NULL never counts as a conflicting value. #}
+with validation_errors as (
+
+    select {{ columns_csv }}
+    from {{ model }}
+    group by {{ columns_csv }}
+    having count(distinct {{ column_name }}) > 1
+
+)
+
+select *
+from validation_errors
+
+
+{% endmacro %}