Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,4 @@ logs/
*__pycache__/
archive/
dbt_packages/
.DS_Store
2 changes: 2 additions & 0 deletions dbt_project.yml
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ quoting:
schema: false
identifier: false
models:
vars:
hash: SHA
+transient: false
raw_vault:
stages:
Expand Down
8 changes: 8 additions & 0 deletions domain_dictionary.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
{
"CUSTOMER": {
"short_name": "CUST",
"description": "this is the customer"
},
"PRODUCT": "PRDCT",
"TRANSACTION": "TRNSCTN"
}
90 changes: 90 additions & 0 deletions generate_raw_vault/app/export_document.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
from pathlib import Path

from generate_raw_vault.app.find_metadata_files import load_metadata_file
from generate_raw_vault.app.find_metadata_files import load_template
from generate_raw_vault.app.metadata_handler import Metadata

class ExportDocument(Metadata):
    """Render one source's metadata into a documentation file.

    The instance holds a metadata mapping (read with ``.get``) and a template
    object exposing ``safe_substitute``. The ``get_*`` methods read individual
    metadata fields; ``safe_substitute`` feeds them to the template and
    ``create_document`` writes the result to disk.

    ``get_versioned_source_name``, ``get_unit_of_work`` and ``get_source_name``
    are not defined here; they are presumably inherited from ``Metadata``.
    """

    def __init__(self, metadata, template):
        """Store the metadata mapping and the template to fill in.

        Args:
            metadata: Mapping of metadata fields; only ``.get`` is used here.
            template: Object providing ``safe_substitute(**kwargs)``
                (presumably a ``string.Template`` -- confirm with the loader).
        """
        # NOTE(review): Metadata.__init__ is not invoked; confirm the base
        # class needs no state beyond ``self.metadata``.
        self.metadata = metadata
        self.template = template

    def get_versioned_source_name_desc(self):
        """Return the lower-cased versioned source name suffixed with ``_desc``."""
        return f"{self.get_versioned_source_name().lower()}_desc"

    def get_table_description(self):
        """Return the ``table_description`` metadata value, or None if absent."""
        return self.metadata.get("table_description")

    def get_freshness(self):
        """Return the ``freshness`` metadata value, or None if absent."""
        return self.metadata.get("freshness")

    def get_format(self):
        """Return the ``format`` metadata value, or None if absent."""
        return self.metadata.get("format")

    def get_filetype(self):
        """Return the ``filetype`` metadata value, or None if absent."""
        return self.metadata.get("filetype")

    def get_source_location(self):
        """Return the ``source_location`` metadata value, or None if absent."""
        return self.metadata.get("source_location")

    def get_database_location(self):
        """Return the ``database_location`` metadata value, or None if absent."""
        return self.metadata.get("database_location")

    def get_access_roles(self):
        """Return the access roles as a comma-separated list of quoted names.

        For example ``["a", "b"]`` becomes ``'"a", "b"'``. A missing or empty
        ``access_roles`` entry yields an empty string instead of raising, and
        a single string is treated as one role rather than being split into
        characters.
        """
        roles = self.metadata.get("access_roles") or []
        if isinstance(roles, str):
            roles = [roles]
        return ", ".join(f'"{role}"' for role in roles)

    def get_access_requests(self):
        """Return the ``access_requests`` metadata value, or None if absent."""
        return self.metadata.get("access_requests")

    def get_version(self):
        """Return the ``version`` metadata value, or None if absent."""
        return self.metadata.get("version")

    def get_quality(self):
        """Return the ``quality`` metadata value, or None if absent."""
        return self.metadata.get("quality")

    def safe_substitute(self):
        """Fill the template with this source's metadata and return the result.

        Delegates to ``self.template.safe_substitute``, passing one keyword
        per placeholder. Fields missing from the metadata are passed as None.
        """
        return self.template.safe_substitute(
            versioned_source_name_desc=self.get_versioned_source_name_desc(),
            versioned_source_name=self.get_versioned_source_name(),
            table_description=self.get_table_description(),
            unit_of_work=self.get_unit_of_work(),
            source_name=self.get_source_name(),
            version=self.get_version(),
            freshness=self.get_freshness(),
            format=self.get_format(),
            filetype=self.get_filetype(),
            source_location=self.get_source_location(),
            database_location=self.get_database_location(),
            access_roles=self.get_access_roles(),
            access_requests=self.get_access_requests(),
            quality=self.get_quality(),
        )

    def create_document(self, file_path):
        """Write the rendered document to ``file_path``.

        Args:
            file_path: Destination path, resolved relative to the current
                working directory. An existing file is overwritten.
        """
        rendered = self.safe_substitute()
        target = Path(f"./{file_path}")
        # Create the destination folder on demand so a fresh checkout does
        # not fail before the first document is written.
        target.parent.mkdir(parents=True, exist_ok=True)
        # Explicit encoding keeps the output identical across platforms.
        with open(target, "w", encoding="utf-8") as doc:
            doc.write(rendered)

if __name__ == "__main__":
    # Derive the metadata input and the markdown output from one name so the
    # generated document always describes the source it is named after.
    # Previously v0_1_1 metadata was written into customers_v0_1_0.md.
    # NOTE(review): assumes source_metadata/customers_v0_1_0.json exists.
    versioned_source = "customers_v0_1_0"
    template = load_template("generate_raw_vault/app/templates/documentation.md")
    metadata_file = load_metadata_file(f"source_metadata/{versioned_source}.json")
    doc_exporter = ExportDocument(metadata_file, template)
    doc_exporter.create_document(f"models/source_descriptions/{versioned_source}.md")
6 changes: 4 additions & 2 deletions generate_raw_vault/app/export_effsat_vault_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
find_json_metadata,
)
from generate_raw_vault.app.metadata_handler import Metadata
from generate_raw_vault.app.model_creation import write_model_files
from string import Template


Expand Down Expand Up @@ -34,8 +35,9 @@ def export_all_effsat_files(metadata_file_dirs):
substitution_values.update({"source": source})
substitution_values.update({"link_key": link_key})
substitutions = create_effsat_substitutions(substitution_values)
create_effsat_model_files(
substitutions, link_template, substitution_values["file_name"]
formatted_effsat_name = substitution_values["file_name"].lower()
write_model_files(
substitutions, link_template, "effsat", formatted_effsat_name
)


Expand Down
1 change: 1 addition & 0 deletions generate_raw_vault/app/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ def main(metadata_file_dirs):
"./models/raw_vault/hubs",
"./models/raw_vault/links",
"./models/raw_vault/sats",
"./models/raw_vault/effsats",
"./models/raw_vault/stages",
]
for directory in directories_to_create:
Expand Down
34 changes: 34 additions & 0 deletions generate_raw_vault/app/templates/documentation.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
{% docs $versioned_source_name_desc %}

# $versioned_source_name
$table_description

The original business process that generated the data is: $unit_of_work

## Source
- Name: $source_name
- Version: $version
- System: CRM_SYSTEM
- Freshness: $freshness
- Format: $format
- Filetype: $filetype

## Accessibility
- Raw source location: $source_location
- Database location: $database_location
- Database role access: $access_roles

### Access requests
$access_requests

## Quality
$quality
{% enddocs %}

{% docs $versioned_source_name_col_1 %}
$versioned_source_name_col_1_desc
{% enddocs %}

{% docs $versioned_source_name_col_2 %}
$versioned_source_name_col_2_desc
{% enddocs %}
48 changes: 48 additions & 0 deletions models/overview.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
{% docs __overview__ %}
# Integrated Data Model

This project facilitates data discovery through a dictionary and lineage. Each source and model is fully described and includes attributes such as:
- Description
- Role Access
- Maintainer
- Source system
- Business process
- Storage (lake)
- Driving key
- Required field
- Uniqueness

- Account access / database access
- Tools to access
- Slack request or form, approve, give role. How to get status of access request.
- How often we get the data
- Typical use
- Known issues
- Ongoing work
- table schema evolution
- PII
- Data structure, csv json
- glossary, domain specific language, abbreviations
- model last updated
- information on aliased columns
- mapping tables e.g. for ID
- governance
- global to a role for hubs and links; more granular RBAC for each satellite to take precedence over global role;
## Navigation
You can use the Project and Database navigation tabs on the left side of the window to explore the models in your project.

### Project Tab
The Project tab mirrors the directory structure of the dbt project. In this tab, you can see all of the models defined in your dbt project, as well as models imported from dbt packages.

### Database Tab
The Database tab also exposes your models, but in a format that looks more like a database explorer. This view shows relations (tables and views) grouped into database schemas. Note that ephemeral models are not shown in this interface, as they do not exist in the database.

### Graph Exploration
You can click the blue icon on the bottom-right corner of the page to view the lineage graph of your models.

On model pages, you'll see the immediate parents and children of the model you're exploring. By clicking the Expand button at the top-right of this lineage pane, you'll be able to see all of the models that are used to build, or are built from, the model you're exploring.

Once expanded, you'll be able to use the --select and --exclude model selection syntax to filter the models in the graph. For more information on model selection, check out the dbt docs.

Note that you can also right-click on models to interactively filter and explore the graph.
{% enddocs %}
Empty file.
5 changes: 4 additions & 1 deletion models/schema.yml → models/raw_vault/schema.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,15 @@ sources:
schema: PUBLIC
tables:
- name: CUSTOMERS_V0_1_0
description: '{{ doc("customers_v0_1_0_desc") }}'
columns:
- name: CUSTOMER_ID
description: "CUSTOMERS_V1 Business key 11111"
description: '{{ doc("customers_v0_1_0_col_customer_id") }}'
tests:
- unique
- not_null
- name: AGE
description: '{{ doc("customers_v0_1_0_col_age") }}'
- name: CUSTOMERS_V0_1_1
columns:
- name: CUSTOMER_ID
Expand Down
34 changes: 34 additions & 0 deletions models/source_descriptions/customers_v0_1_0.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
{% docs customers_v0_1_0_desc %}

# CUSTOMERS_V0_1_0
This table contains data describing a customer.

The original business process that generated the data is: CUST_STATS

## Source
- Name: CUSTOMERS
- Version: 0.1.0
- System: CRM_SYSTEM
- Freshness: Updated daily at 5am
- Format: Flat
- Filetype: .csv

## Accessibility
- Raw source location: s3://data-lake-dev/customers
- Database location: https://instance.eu-west-1.snowflakecomputing.com/console
- Database role access: "data_analyst", "data_scientist"

### Access requests
Contact the Data squad for Snowflake and table access.

## Quality
This file often has null primary keys
{% enddocs %}

{% docs customers_v0_1_0_col_customer_id %}
This is the driving key, a unique identifier to represent a customer.
{% enddocs %}

{% docs customers_v0_1_0_col_age %}
The customer's age in years
{% enddocs %}
7 changes: 7 additions & 0 deletions seeds/customers_v0_1_0.csv
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
CUSTOMER_ID,AGE,RECORD_SOURCE,LOAD_DATETIME
C1,7,CUSTOMERS_V3,2021-10-17 14:38:02.229 +0100
C1,7,CUSTOMERS_V3,2021-10-16 14:38:02.229 +0100
C1,7,CUSTOMERS_V2,2021-10-16 14:38:02.229 +0100
C1,7,CUSTOMERS_V1,2021-10-15 14:38:02.229 +0100
C2,4,CUSTOMERS_V1,2021-10-15 14:38:02.229 +0100
C3,8,CUSTOMERS_V1,2021-10-15 14:38:02.229 +0100
Expand Down Expand Up @@ -136,3 +139,7 @@ C134,6,CUSTOMERS_V1,2021-10-15 14:38:02.229 +0100
C135,2,CUSTOMERS_V1,2021-10-15 14:38:02.229 +0100
C136,10,CUSTOMERS_V1,2021-10-15 14:38:02.229 +0100
C137,9,CUSTOMERS_V1,2021-10-15 14:38:02.229 +0100
C138,9,CUSTOMERS_V1,2021-10-15 14:38:02.229 +0100
C139,9,CUSTOMERS_V1,2021-10-15 16:38:02.229 +0100
C139,10,CUSTOMERS_V1,2021-10-15 17:38:02.229 +0100
C139,11,CUSTOMERS_V1,2021-10-15 18:38:02.229 +0100
Loading