feat: lakeformation tags for columns support (dbt-labs#185)

svdimchenko · web-flow · commit faa37ff3767c · 2023-03-27T09:55:05.000+02:00
diff --git a/README.md b/README.md
@@ -60,19 +60,19 @@ stored login info. You can configure the AWS profile name to use via `aws_profil
 
 A dbt profile can be configured to run against AWS Athena using the following configuration:
 
-| Option           | Description                                                                    | Required?   | Example               |
-|------------------|--------------------------------------------------------------------------------|-------------|-----------------------|
-| s3_staging_dir   | S3 location to store Athena query results and metadata                         | Required    | `s3://bucket/dbt/`    |
-| s3_data_dir      | Prefix for storing tables, if different from the connection's `s3_staging_dir` | Optional    | `s3://bucket2/dbt/`   |
-| s3_data_naming   | How to generate table paths in `s3_data_dir`                                   | Optional    | `schema_table_unique` |
-| region_name      | AWS region of your Athena instance                                             | Required    | `eu-west-1`           |
-| schema           | Specify the schema (Athena database) to build models into (lowercase **only**) | Required    | `dbt`                 |
-| database         | Specify the database (Data catalog) to build models into (lowercase **only**)  | Required    | `awsdatacatalog`      |
-| poll_interval    | Interval in seconds to use for polling the status of query results in Athena   | Optional    | `5`                   |
-| aws_profile_name | Profile to use from your AWS shared credentials file.                          | Optional    | `my-profile`          |
-| work_group       | Identifier of Athena workgroup                                                 | Optional    | `my-custom-workgroup` |
-| num_retries      | Number of times to retry a failing query                                       | Optional    | `3`                   |
-| lf_tags          | Default lf tags to apply to any database created by dbt                        | Optional    | `{"origin": "dbt", "team": "analytics"}`|
+| Option           | Description                                                                    | Required? | Example                                  |
+|------------------|--------------------------------------------------------------------------------|-----------|------------------------------------------|
+| s3_staging_dir   | S3 location to store Athena query results and metadata                         | Required  | `s3://bucket/dbt/`                       |
+| s3_data_dir      | Prefix for storing tables, if different from the connection's `s3_staging_dir` | Optional  | `s3://bucket2/dbt/`                      |
+| s3_data_naming   | How to generate table paths in `s3_data_dir`                                   | Optional  | `schema_table_unique`                    |
+| region_name      | AWS region of your Athena instance                                             | Required  | `eu-west-1`                              |
+| schema           | Specify the schema (Athena database) to build models into (lowercase **only**) | Required  | `dbt`                                    |
+| database         | Specify the database (Data catalog) to build models into (lowercase **only**)  | Required  | `awsdatacatalog`                         |
+| poll_interval    | Interval in seconds to use for polling the status of query results in Athena   | Optional  | `5`                                      |
+| aws_profile_name | Profile to use from your AWS shared credentials file.                          | Optional  | `my-profile`                             |
+| work_group       | Identifier of Athena workgroup                                                 | Optional  | `my-custom-workgroup`                    |
+| num_retries      | Number of times to retry a failing query                                       | Optional  | `3`                                      |
+| lf_tags          | Default lf tags to apply to any database created by dbt                        | Optional  | `{"origin": "dbt", "team": "analytics"}` |
 
 **Example profiles.yml entry:**
 ```yaml
@@ -125,6 +125,9 @@ _Additional information_
 * `lf_tags` (`default=none`)
   * lf tags to associate with the table
   * format: `{"tag1": "value1", "tag2": "value2"}`
+* `lf_tags_columns` (`default=none`)
+  * lf tags to associate with the table columns
+  * format: `{"tag1": {"value1": ["column1", "column2"]}}`
 
 #### Table location
 
diff --git a/dbt/adapters/athena/impl.py b/dbt/adapters/athena/impl.py
@@ -56,59 +56,84 @@ def convert_number_type(cls, agate_table: agate.Table, col_idx: int) -> str:
     def convert_datetime_type(cls, agate_table: agate.Table, col_idx: int) -> str:
         return "timestamp"
 
+    @classmethod
+    def parse_lf_response(
+        cls,
+        response: Dict[str, Any],
+        database: str,
+        table: Optional[str],
+        columns: Optional[List[str]],
+        lf_tags: Dict[str, str],
+    ) -> str:
+        failures = response.get("Failures", [])
+        tbl_appendix = f".{table}" if table else ""
+        columns_appendix = f" for columns {columns}" if columns else ""
+        msg_appendix = tbl_appendix + columns_appendix
+        if failures:
+            base_msg = f"Failed to add LF tags: {lf_tags} to {database}" + msg_appendix
+            for failure in failures:
+                tag = failure.get("LFTag", {}).get("TagKey")
+                error = failure.get("Error", {}).get("ErrorMessage")
+                logger.error(f"Failed to set {tag} for {database}" + msg_appendix + f" - {error}")
+            raise DbtRuntimeError(base_msg)
+        return f"Added LF tags: {lf_tags} to {database}" + msg_appendix
+
+    @classmethod
+    def lf_tags_columns_is_valid(cls, lf_tags_columns: Dict[str, Dict[str, List[str]]]) -> Optional[bool]:
+        if not lf_tags_columns:
+            return False
+        for tag_key, tag_config in lf_tags_columns.items():
+            if isinstance(tag_config, Dict):
+                for tag_value, columns in tag_config.items():
+                    if not isinstance(columns, List):
+                        raise DbtRuntimeError(f"Not a list: {columns}. " + "Expected format: ['c1', 'c2']")
+            else:
+                raise DbtRuntimeError(f"Not a dict: {tag_config}. " + "Expected format: {'tag_value': ['c1', 'c2']}")
+        return True
+
     # TODO: Add more lf-tag unit tests when moto supports lakeformation
     # moto issue: https://github.com/getmoto/moto/issues/5964
     @available
-    def add_lf_tags(self, database: str, table: str = None, lf_tags: Dict[str, str] = None):
+    def add_lf_tags(
+        self,
+        database: str,
+        table: str = None,
+        lf_tags: Optional[Dict[str, str]] = None,
+        lf_tags_columns: Optional[Dict[str, Dict[str, List[str]]]] = None,
+    ):
         conn = self.connections.get_thread_connection()
         client = conn.handle
 
         lf_tags = lf_tags or conn.credentials.lf_tags
-        if not lf_tags:
-            logger.debug("No LF tags configured")
-            return
-
-        resource = {
-            "Database": {"Name": database},
-        }
 
-        if table:
-            resource = {
-                "Table": {
-                    "DatabaseName": database,
-                    "Name": table,
-                }
-            }
-
-        with boto3_client_lock:
-            lf_client = client.session.client(
-                "lakeformation", region_name=client.region_name, config=get_boto3_config()
-            )
+        if not lf_tags and not lf_tags_columns:
+            logger.debug("No LF tags configured")
+        else:
+            with boto3_client_lock:
+                lf_client = client.session.client(
+                    "lakeformation", region_name=client.region_name, config=get_boto3_config()
+                )
 
-        response = lf_client.add_lf_tags_to_resource(
-            Resource=resource,
-            LFTags=[
-                {
-                    "TagKey": key,
-                    "TagValues": [
-                        value,
-                    ],
-                }
-                for key, value in lf_tags.items()
-            ],
-        )
+            if lf_tags:
+                resource = {"Database": {"Name": database}}
+                if table:
+                    resource = {"Table": {"DatabaseName": database, "Name": table}}
 
-        failures = response.get("Failures", [])
-        tbl_appendix = f".{table}" if table else ""
-        if failures:
-            base_msg = f"Failed to add LF tags: {lf_tags} to {database}" + tbl_appendix
-            for failure in failures:
-                tag = failure.get("LFTag", {}).get("TagKey")
-                error = failure.get("Error", {}).get("ErrorMessage")
-                logger.error(f"Failed to set {tag} for {database}" + tbl_appendix + f" - {error}")
-            raise DbtRuntimeError(base_msg)
-        else:
-            logger.debug(f"Added LF tags: {lf_tags} to {database}" + tbl_appendix)
+                response = lf_client.add_lf_tags_to_resource(
+                    Resource=resource, LFTags=[{"TagKey": key, "TagValues": [value]} for key, value in lf_tags.items()]
+                )
+                logger.debug(self.parse_lf_response(response, database, table, None, lf_tags))
+
+            if self.lf_tags_columns_is_valid(lf_tags_columns):
+                for tag_key, tag_config in lf_tags_columns.items():
+                    for tag_value, columns in tag_config.items():
+                        response = lf_client.add_lf_tags_to_resource(
+                            Resource={
+                                "TableWithColumns": {"DatabaseName": database, "Name": table, "ColumnNames": columns}
+                            },
+                            LFTags=[{"TagKey": tag_key, "TagValues": [tag_value]}],
+                        )
+                        logger.debug(self.parse_lf_response(response, database, table, columns, {tag_key: tag_value}))
 
     @available
     def get_work_group_output_location(self) -> Optional[str]:
diff --git a/dbt/include/athena/macros/materializations/models/incremental/incremental.sql b/dbt/include/athena/macros/materializations/models/incremental/incremental.sql
@@ -6,6 +6,7 @@
   {% set on_schema_change = incremental_validate_on_schema_change(config.get('on_schema_change'), default='ignore') %}
 
   {% set lf_tags = config.get('lf_tags', default=none) %}
+  {% set lf_tags_columns = config.get('lf_tags_columns', default=none) %}
   {% set partitioned_by = config.get('partitioned_by', default=none) %}
   {% set target_relation = this.incorporate(type='table') %}
   {% set existing_relation = load_relation(this) %}
@@ -84,8 +85,8 @@
 
   {{ run_hooks(post_hooks, inside_transaction=False) }}
 
-  {% if lf_tags is not none %}
-    {{ adapter.add_lf_tags(target_relation.schema, target_relation.identifier, lf_tags) }}
+  {% if lf_tags is not none or lf_tags_columns is not none %}
+    {{ adapter.add_lf_tags(target_relation.schema, target_relation.identifier, lf_tags, lf_tags_columns) }}
   {% endif %}
 
   {{ return({'relations': [target_relation]}) }}
diff --git a/dbt/include/athena/macros/materializations/models/table/table.sql b/dbt/include/athena/macros/materializations/models/table/table.sql
@@ -2,6 +2,7 @@
   {%- set identifier = model['alias'] -%}
 
   {%- set lf_tags = config.get('lf_tags', default=none) -%}
+  {%- set lf_tags_columns = config.get('lf_tags_columns', default=none) -%}
   {%- set table_type = config.get('table_type', default='hive') | lower -%}
   {%- set old_relation = adapter.get_relation(database=database, schema=schema, identifier=identifier) -%}
   {%- set target_relation = api.Relation.create(identifier=identifier,
@@ -27,8 +28,8 @@
 
   {{ run_hooks(post_hooks) }}
 
-  {% if lf_tags is not none %}
-    {{ adapter.add_lf_tags(target_relation.schema, identifier, lf_tags) }}
+  {% if lf_tags is not none or lf_tags_columns is not none %}
+    {{ adapter.add_lf_tags(target_relation.schema, identifier, lf_tags, lf_tags_columns) }}
   {% endif %}
 
   {% do persist_docs(target_relation, model) %}
diff --git a/dbt/include/athena/macros/materializations/models/view/create_or_replace_view.sql b/dbt/include/athena/macros/materializations/models/view/create_or_replace_view.sql
@@ -2,6 +2,7 @@
   {%- set identifier = model['alias'] -%}
 
   {%- set lf_tags = config.get('lf_tags', default=none) -%}
+  {%- set lf_tags_columns = config.get('lf_tags_columns', default=none) -%}
   {%- set old_relation = adapter.get_relation(database=database, schema=schema, identifier=identifier) -%}
   {%- set exists_as_view = (old_relation is not none and old_relation.is_view) -%}
   {%- set target_relation = api.Relation.create(
@@ -29,8 +30,8 @@
     {{ create_view_as(target_relation, sql) }}
   {%- endcall %}
 
-  {% if lf_tags is not none %}
-    {{ adapter.add_lf_tags(target_relation.schema, identifier, lf_tags) }}
+  {% if lf_tags is not none or lf_tags_columns is not none %}
+    {{ adapter.add_lf_tags(target_relation.schema, identifier, lf_tags, lf_tags_columns) }}
   {% endif %}
 
   {{ run_hooks(post_hooks, inside_transaction=True) }}
diff --git a/dbt/include/athena/macros/materializations/seeds/helpers.sql b/dbt/include/athena/macros/materializations/seeds/helpers.sql
@@ -11,6 +11,7 @@
   {%- set identifier = model['alias'] -%}
 
   {%- set lf_tags = config.get('lf_tags', default=none) -%}
+  {%- set lf_tags_columns = config.get('lf_tags_columns', default=none) -%}
   {%- set column_override = config.get('column_types', {}) -%}
   {%- set quote_seed_column = config.get('quote_columns', None) -%}
   {%- set s3_data_dir = config.get('s3_data_dir', default=target.s3_data_dir) -%}
@@ -35,8 +36,8 @@
     {{ sql }}
   {%- endcall %}
 
-  {% if lf_tags is not none %}
-    {{ adapter.add_lf_tags(model.schema, identifier, lf_tags) }}
+  {% if lf_tags is not none or lf_tags_columns is not none %}
+    {{ adapter.add_lf_tags(model.schema, identifier, lf_tags, lf_tags_columns) }}
   {% endif %}
 
   {{ return(sql) }}
diff --git a/pyproject.toml b/pyproject.toml
@@ -12,7 +12,7 @@ include = '\.pyi?$'
 [tool.flake8]
 files = '.*\.py'
 max-line-length = 120
-exclude = ['.git', '.eggs', '__pycache__', 'venv']
+exclude = ['.git', '.eggs', '__pycache__', 'venv', '.venv']
 ignore = [
     # space before : (needed for how black formats slicing)
     'E203',
diff --git a/tests/unit/test_adapter.py b/tests/unit/test_adapter.py
@@ -793,6 +793,80 @@ def test_get_columns_in_relation(self):
             Column("dt", "date"),
         ]
 
+    @pytest.mark.parametrize(
+        "response,database,table,columns,lf_tags,expected",
+        [
+            pytest.param(
+                {
+                    "Failures": [
+                        {
+                            "LFTag": {"CatalogId": "test_catalog", "TagKey": "test_key", "TagValues": ["test_values"]},
+                            "Error": {"ErrorCode": "test_code", "ErrorMessage": "test_err_msg"},
+                        }
+                    ]
+                },
+                "test_database",
+                "test_table",
+                ["column1", "column2"],
+                {"tag_key": "tag_value"},
+                None,
+                id="lf_tag error",
+                marks=pytest.mark.xfail,
+            ),
+            pytest.param(
+                {"Failures": []},
+                "test_database",
+                None,
+                None,
+                {"tag_key": "tag_value"},
+                "Added LF tags: {'tag_key': 'tag_value'} to test_database",
+                id="lf_tag database",
+            ),
+            pytest.param(
+                {"Failures": []},
+                "test_db",
+                "test_table",
+                None,
+                {"tag_key": "tag_value"},
+                "Added LF tags: {'tag_key': 'tag_value'} to test_db.test_table",
+                id="lf_tag database and table",
+            ),
+            pytest.param(
+                {"Failures": []},
+                "test_db",
+                "test_table",
+                ["column1", "column2"],
+                {"tag_key": "tag_value"},
+                "Added LF tags: {'tag_key': 'tag_value'} to test_db.test_table for columns ['column1', 'column2']",
+                id="lf_tag database table and columns",
+            ),
+        ],
+    )
+    def test_parse_lf_response(self, response, database, table, columns, lf_tags, expected):
+        assert self.adapter.parse_lf_response(response, database, table, columns, lf_tags) == expected
+
+    @pytest.mark.parametrize(
+        "lf_tags_columns,expected",
+        [
+            pytest.param({"tag_key": {"tag_value": ["col1, col2"]}}, True, id="valid lf_tags_columns"),
+            pytest.param(None, False, id="empty lf_tags_columns"),
+            pytest.param(
+                {"tag_key": "tag_value"},
+                None,
+                id="lf_tags_columns tag config is not a dict",
+                marks=pytest.mark.xfail(raises=DbtRuntimeError),
+            ),
+            pytest.param(
+                {"tag_key": {"tag_value": "col1"}},
+                None,
+                id="lf_tags_columns columns config is not a list",
+                marks=pytest.mark.xfail(raises=DbtRuntimeError),
+            ),
+        ],
+    )
+    def test_lf_tags_columns_is_valid(self, lf_tags_columns, expected):
+        assert self.adapter.lf_tags_columns_is_valid(lf_tags_columns) == expected
+
 
 class TestAthenaFilterCatalog:
     def test__catalog_filter_table(self):