From 36cc4e48079da9eee5f289b83cadaf85c6b8ec8a Mon Sep 17 00:00:00 2001 From: Damon McCullough Date: Sun, 10 May 2026 22:24:43 -0400 Subject: [PATCH 01/12] improve S3 build comparison notebook --- .../marimo/lifecycle/build_qa/s3_compare.py | 287 +++++++++++++++--- 1 file changed, 252 insertions(+), 35 deletions(-) diff --git a/notebooks/marimo/lifecycle/build_qa/s3_compare.py b/notebooks/marimo/lifecycle/build_qa/s3_compare.py index 7cbdd76147..2fc81352ad 100644 --- a/notebooks/marimo/lifecycle/build_qa/s3_compare.py +++ b/notebooks/marimo/lifecycle/build_qa/s3_compare.py @@ -1,6 +1,6 @@ import marimo -__generated_with = "0.23.1" +__generated_with = "0.23.3" app = marimo.App(width="full") with app.setup: @@ -112,6 +112,31 @@ def _(df_a, df_b, path_a_input, path_b_input): return (comparison,) +@app.cell(hide_code=True) +def _(comparison): + _only_a = comparison[~comparison["in_b"]].shape[0] + _only_b = comparison[~comparison["in_a"]].shape[0] + _shared = comparison[comparison["in_a"] & comparison["in_b"]] + _sizes_equal = ( + (_shared["size_a"] == _shared["size_b"]).all() if len(_shared) else True + ) + _equal = _only_a == 0 and _only_b == 0 and _sizes_equal + if _equal: + _msg = "**Directory equal:** ✅ Yes — same files and sizes." + elif _only_a == 0 and _only_b == 0: + _msg = "**Directory equal:** ❌ No — same file set but sizes differ for some files." + else: + _msg = ( + f"**Directory equal:** ❌ No — {_only_a} file(s) only in A, {_only_b} file(s) only in B" + + ("." 
if _sizes_equal else ", and sizes differ.") + ) + mo.callout( + mo.md(_msg), + kind="success" if _equal else "danger", + ) + return + + @app.cell(hide_code=True) def _(): mo.md(""" @@ -170,7 +195,7 @@ def _(comparison): @app.cell(hide_code=True) def _(): mo.md(""" - ## Table Comparison + ## File Comparison """) return @@ -180,17 +205,17 @@ def _(comparison): _files_in_both = sorted( comparison[comparison["in_a"] & comparison["in_b"]]["filename"].tolist() ) - table_selector = mo.ui.dropdown( + file_selector = mo.ui.dropdown( options=_files_in_both, label="Select a file to compare", searchable=True, ) - mo.vstack([table_selector], align="center") - return (table_selector,) + mo.vstack([file_selector], align="center") + return (file_selector,) @app.cell(hide_code=True) -def _(bucket_input, path_a_input, path_b_input, table_selector): +def _(bucket_input, file_selector, path_a_input, path_b_input): from io import BytesIO def _load_file(prefix: str, filename: str) -> pd.DataFrame: @@ -207,43 +232,161 @@ def _load_file(prefix: str, filename: str) -> pd.DataFrame: else: raise ValueError(f"Unsupported file extension: .{ext}") - if table_selector.value: - with mo.status.spinner(title=f"Loading {table_selector.value}…"): - tbl_a = _load_file(path_a_input.value, table_selector.value) - tbl_b = _load_file(path_b_input.value, table_selector.value) + if file_selector.value: + with mo.status.spinner(title=f"Loading {file_selector.value}…"): + file_df_a = _load_file(path_a_input.value, file_selector.value) + file_df_b = _load_file(path_b_input.value, file_selector.value) else: - tbl_a = tbl_b = None - return tbl_a, tbl_b + file_df_a = file_df_b = None + return file_df_a, file_df_b + + +@app.cell(hide_code=True) +def _(file_df_a, file_df_b): + mo.stop(file_df_a is None or file_df_b is None) + _equal = file_df_a.reset_index(drop=True).equals(file_df_b.reset_index(drop=True)) + mo.callout( + mo.md( + "**File equal:** ✅ Yes — contents are identical." 
+ if _equal + else "**File equal:** ❌ No — contents differ." + ), + kind="success" if _equal else "danger", + ) + return @app.cell(hide_code=True) -def _(table_selector, tbl_a, tbl_b): +def _(file_df_a, file_df_b): mo.stop( - tbl_a is None or tbl_b is None, + file_df_a is None or file_df_b is None, + ) + row_focus = mo.ui.number( + label="Jump to row", + value=-1, + step=1, + start=-1, + stop=max(len(file_df_a), len(file_df_b)) - 1, + ) + context_rows = mo.ui.number( + label="Context rows (\u00b1)", + value=0, + step=1, + start=0, + stop=50, + ) + show_diff_rows = mo.ui.checkbox(label="Show diff rows only") + mo.hstack( + [ + row_focus, + context_rows, + show_diff_rows, + mo.md("_Set \u2018Jump to row\u2019 to \u22121 to show all rows._"), + ], + gap=2, + align="center", + ) + return context_rows, row_focus, show_diff_rows + + +@app.cell(hide_code=True) +def _(file_df_a, file_df_b): + mo.stop(file_df_a is None or file_df_b is None) + + _di_shared = [c for c in file_df_a.columns if c in file_df_b.columns] + _di_a = file_df_a[_di_shared].reset_index(drop=True) + _di_b = file_df_b[_di_shared].reset_index(drop=True) + _di_min = min(len(_di_a), len(_di_b)) + diff_row_indices: set[int] = set() + for _di_col in _di_shared: + _ca = _di_a.iloc[:_di_min][_di_col] + _cb = _di_b.iloc[:_di_min][_di_col] + _ch = ~((_ca == _cb) | (_ca.isna() & _cb.isna())) + diff_row_indices.update(int(i) for i in _ch[_ch].index) + return (diff_row_indices,) + + +@app.cell(hide_code=True) +def _( + context_rows, + diff_row_indices: set[int], + file_df_a, + file_df_b, + file_selector, + row_focus, + show_diff_rows, +): + mo.stop( + file_df_a is None or file_df_b is None, mo.md("_Select a file above to load and compare._"), ) - _cols_only_a = sorted(set(tbl_a.columns) - set(tbl_b.columns)) - _cols_only_b = sorted(set(tbl_b.columns) - set(tbl_a.columns)) + _cols_only_a = sorted(set(file_df_a.columns) - set(file_df_b.columns)) + _cols_only_b = sorted(set(file_df_b.columns) - set(file_df_a.columns)) 
_col_status = ( "Column sets match." if not _cols_only_a and not _cols_only_b - else f"Only in A: `{'`, `'.join(_cols_only_a) or '—'}` · Only in B: `{'`, `'.join(_cols_only_b) or '—'}`" + else ( + ("Only in A: " + ", ".join(_cols_only_a) if _cols_only_a else "") + + ( + " \u00b7 Only in B: " + ", ".join(_cols_only_b) + if _cols_only_b + else "" + ) + ) + ) + + def _with_row_num(df): + out = df.reset_index(drop=True) + out.insert(0, "row", range(len(out))) + return out + + def _maybe_focus(df_with_row): + r = row_focus.value + ctx_n = context_rows.value + if show_diff_rows.value and diff_row_indices: + rows_to_show: set[int] = set() + max_row = len(df_with_row) - 1 + for dr in diff_row_indices: + lo = max(0, dr - ctx_n) + hi = min(max_row, dr + ctx_n) + rows_to_show.update(range(lo, hi + 1)) + return df_with_row[df_with_row["row"].isin(rows_to_show)].reset_index( + drop=True + ) + if r < 0: + return df_with_row + lo = max(0, r - ctx_n) + hi = min(len(df_with_row) - 1, r + ctx_n) + return df_with_row.iloc[lo : hi + 1].reset_index(drop=True) + + _summary = ( + f"| | A | B |\n" + f"|---|---|---|\n" + f"| Rows | {len(file_df_a):,} | {len(file_df_b):,} |\n" + f"| Columns | {len(file_df_a.columns)} | {len(file_df_b.columns)} |\n" + f"\n**Columns:** {_col_status}" ) + mo.vstack( [ - mo.md(f""" - | | A | B | - |---|---|---| - | Rows | {len(tbl_a):,} | {len(tbl_b):,} | - | Columns | {len(tbl_a.columns)} | {len(tbl_b.columns)} | - - **Columns:** {_col_status} - """), - mo.ui.tabs( - { - f"A — {table_selector.value}": mo.ui.table(tbl_a, selection=None), - f"B — {table_selector.value}": mo.ui.table(tbl_b, selection=None), - } + mo.md(_summary), + mo.vstack( + [ + mo.md(f"**A \u2014 {file_selector.value}**"), + mo.ui.table( + _maybe_focus(_with_row_num(file_df_a)), + selection=None, + ), + ] + ), + mo.vstack( + [ + mo.md(f"**B \u2014 {file_selector.value}**"), + mo.ui.table( + _maybe_focus(_with_row_num(file_df_b)), + selection=None, + ), + ] ), ] ) @@ -251,12 +394,58 @@ def 
_(table_selector, tbl_a, tbl_b): @app.cell(hide_code=True) -def _(): - _df = mo.sql( - """ - select * from - """ +def _(file_df_a, file_df_b): + mo.stop( + file_df_a is None or file_df_b is None, + mo.md("_Select a file above to load and compare._"), ) + + _shared_cols = [c for c in file_df_a.columns if c in file_df_b.columns] + _a = file_df_a[_shared_cols].reset_index(drop=True) + _b = file_df_b[_shared_cols].reset_index(drop=True) + _min_len = min(len(_a), len(_b)) + _a_cmp = _a.iloc[:_min_len] + _b_cmp = _b.iloc[:_min_len] + + _records = [] + for col in _shared_cols: + _col_a = _a_cmp[col] + _col_b = _b_cmp[col] + _changed = ~((_col_a == _col_b) | (_col_a.isna() & _col_b.isna())) + for idx in _changed[_changed].index: + _records.append( + { + "row": int(idx), + "column": col, + "value_a": _col_a.at[idx], + "value_b": _col_b.at[idx], + } + ) + + _row_count_note = ( + f"\n\n> \u26a0\ufe0f Row counts differ (A: {len(_a):,}, B: {len(_b):,}) \u2014 only first {_min_len:,} rows compared." + if len(_a) != len(_b) + else "" + ) + + if not _records and len(_a) == len(_b): + _diff_out = mo.md("_No differences found in shared columns._") + elif not _records: + _diff_out = mo.md( + f"_Shared columns match across first {_min_len:,} rows.{_row_count_note}_" + ) + else: + _n_diff_rows = len({r["row"] for r in _records}) + _n_diff_cols = len({r["column"] for r in _records}) + _diff_out = mo.vstack( + [ + mo.md( + f"**{_n_diff_rows:,} row(s)** with differences across **{_n_diff_cols:,}** column(s).{_row_count_note}" + ), + mo.ui.table(pd.DataFrame(_records), selection=None), + ] + ) + _diff_out return @@ -330,6 +519,34 @@ def _load_shapefile(prefix: str, filename: str) -> gpd.GeoDataFrame: return geo_gdf_a, geo_gdf_b +@app.cell(hide_code=True) +def _(geo_gdf_a, geo_gdf_b): + _geom_col = geo_gdf_a.geometry.name + _attr_cols = [c for c in geo_gdf_a.columns if c != _geom_col] + _attrs_equal = ( + geo_gdf_a[_attr_cols] + .reset_index(drop=True) + 
.equals(geo_gdf_b[_attr_cols].reset_index(drop=True)) + ) + _geom_equal = geo_gdf_a.geometry.reset_index(drop=True).equals( + geo_gdf_b.geometry.reset_index(drop=True) + ) + _equal = _attrs_equal and _geom_equal + mo.callout( + mo.md( + "**GeoDataFrame equal:** ✅ Yes — features and geometries match." + if _equal + else "**GeoDataFrame equal:** ❌ No — " + + ("" if _attrs_equal else "attributes differ") + + (" and " if not _attrs_equal and not _geom_equal else "") + + ("" if _geom_equal else "geometries differ") + + "." + ), + kind="success" if _equal else "danger", + ) + return + + @app.cell(hide_code=True) def _(geo_gdf_a, geo_selector, path_a_input): _geom_col = geo_gdf_a.geometry.name From 65ca5f810c798aeb3d4053fd3ba7f31588a56cf9 Mon Sep 17 00:00:00 2001 From: Damon McCullough Date: Fri, 8 May 2026 11:45:11 -0400 Subject: [PATCH 02/12] start facdb dbt project --- products/facilities/dbt_project.yml | 21 +++++++++++++++++++++ products/facilities/packages.yml | 5 +++++ products/facilities/profiles.yml | 11 +++++++++++ 3 files changed, 37 insertions(+) create mode 100644 products/facilities/dbt_project.yml create mode 100644 products/facilities/packages.yml create mode 100644 products/facilities/profiles.yml diff --git a/products/facilities/dbt_project.yml b/products/facilities/dbt_project.yml new file mode 100644 index 0000000000..ee3a20e87c --- /dev/null +++ b/products/facilities/dbt_project.yml @@ -0,0 +1,21 @@ +name: facilities + +profile: dcp-de-postgres + +model-paths: [ "models" ] + +tests: + +store_failures: true + schema: "_tests" + +models: + facilities: + staging: + +materialized: view + intermediate: + +materialized: view + product: + +materialized: table + +flags: + fail-fast: true diff --git a/products/facilities/packages.yml b/products/facilities/packages.yml new file mode 100644 index 0000000000..fa915ec5e4 --- /dev/null +++ b/products/facilities/packages.yml @@ -0,0 +1,5 @@ +packages: +- package: dbt-labs/dbt_utils + version: 1.3.2 +- package: 
metaplane/dbt_expectations + version: 0.10.9 diff --git a/products/facilities/profiles.yml b/products/facilities/profiles.yml new file mode 100644 index 0000000000..6712525864 --- /dev/null +++ b/products/facilities/profiles.yml @@ -0,0 +1,11 @@ +dcp-de-postgres: + target: dev + outputs: + dev: + type: postgres + host: "{{ env_var('BUILD_ENGINE_HOST') }}" + user: "{{ env_var('BUILD_ENGINE_USER') }}" + password: "{{ env_var('BUILD_ENGINE_PASSWORD') }}" + port: "{{ env_var('BUILD_ENGINE_PORT') | as_number }}" + dbname: "{{ env_var('BUILD_ENGINE_DB') }}" + schema: "{{ env_var('BUILD_ENGINE_SCHEMA') }}" From d1aba0de8a18237a548aed983952fd74721fc24c Mon Sep 17 00:00:00 2001 From: Damon McCullough Date: Fri, 8 May 2026 11:57:19 -0400 Subject: [PATCH 03/12] drop unused cli cases --- products/facilities/facdb.sh | 2 -- 1 file changed, 2 deletions(-) diff --git a/products/facilities/facdb.sh b/products/facilities/facdb.sh index 433953fae1..957a53cc78 100755 --- a/products/facilities/facdb.sh +++ b/products/facilities/facdb.sh @@ -8,8 +8,6 @@ max_bg_procs 5 case $1 in - init) init ;; upload) python3 -m dcpy.connectors.edm.publishing upload -p db-facilities -a public-read ;; export) ./facdb/bash/export.sh ;; - *) facdb_execute $@ ;; esac From 189a1a39a945c7a4300a5b21634ac61fd2638d5f Mon Sep 17 00:00:00 2001 From: Damon McCullough Date: Fri, 8 May 2026 12:00:26 -0400 Subject: [PATCH 04/12] simplify facdb upload --- .github/workflows/facilities_build.yml | 2 +- products/facilities/facdb.sh | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/facilities_build.yml b/.github/workflows/facilities_build.yml index bbfb9652a4..a53d15e07a 100644 --- a/.github/workflows/facilities_build.yml +++ b/.github/workflows/facilities_build.yml @@ -86,4 +86,4 @@ jobs: run: ./facdb.sh export - name: Upload Artifacts - run: ./facdb.sh upload + run: python3 -m dcpy.connectors.edm.publishing upload --product db-facilities --acl public-read diff --git 
a/products/facilities/facdb.sh b/products/facilities/facdb.sh index 957a53cc78..c40677778b 100755 --- a/products/facilities/facdb.sh +++ b/products/facilities/facdb.sh @@ -8,6 +8,5 @@ max_bg_procs 5 case $1 in - upload) python3 -m dcpy.connectors.edm.publishing upload -p db-facilities -a public-read ;; export) ./facdb/bash/export.sh ;; esac From 6d5eea1060654177e1e9755c2ed9413235a30164 Mon Sep 17 00:00:00 2001 From: Damon McCullough Date: Fri, 8 May 2026 12:09:45 -0400 Subject: [PATCH 05/12] move facdb export from bash to python CLI --- .github/workflows/facilities_build.yml | 2 +- products/facilities/facdb.sh | 12 --- products/facilities/facdb/cli.py | 108 +++++++++++++++++++++++++ 3 files changed, 109 insertions(+), 13 deletions(-) delete mode 100755 products/facilities/facdb.sh diff --git a/.github/workflows/facilities_build.yml b/.github/workflows/facilities_build.yml index a53d15e07a..0955b8fecf 100644 --- a/.github/workflows/facilities_build.yml +++ b/.github/workflows/facilities_build.yml @@ -83,7 +83,7 @@ jobs: run: python -m facdb.cli reformat_facdb - name: Export facdb - run: ./facdb.sh export + run: python -m facdb.cli export - name: Upload Artifacts run: python3 -m dcpy.connectors.edm.publishing upload --product db-facilities --acl public-read diff --git a/products/facilities/facdb.sh b/products/facilities/facdb.sh deleted file mode 100755 index c40677778b..0000000000 --- a/products/facilities/facdb.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash - -FILE_DIR=$(dirname "$(readlink -f "$0")") - -source $FILE_DIR/../../bash/utils.sh -set_error_traps -max_bg_procs 5 - - -case $1 in - export) ./facdb/bash/export.sh ;; -esac diff --git a/products/facilities/facdb/cli.py b/products/facilities/facdb/cli.py index ff746571f8..cfda944b66 100644 --- a/products/facilities/facdb/cli.py +++ b/products/facilities/facdb/cli.py @@ -1,4 +1,9 @@ +import concurrent.futures import importlib +import shutil +import subprocess +import zipfile +from pathlib import Path import 
typer @@ -28,6 +33,84 @@ def _autocomplete_dataset_name(incomplete: str) -> list: return completion +_CSV_EXPORTS = [ + ("facdb_export_csv", "facilities"), + ("qc_operator", "qc_operator"), + ("qc_oversight", "qc_oversight"), + ("qc_classification", "qc_classification"), + ("qc_captype", "qc_captype"), + ("qc_mapped", "qc_mapped"), + ("qc_diff", "qc_diff"), + ("qc_recordcounts", "qc_recordcounts"), + ("qc_subgrpbins", "qc_subgrpbins"), +] + + +def _export_csv( + pg: postgres.PostgresClient, table: str, filename: str, output_dir: Path +) -> None: + pg.export_to_csv(table, output_dir / f"{filename}.csv") + + +def _export_shp(pg: postgres.PostgresClient, output_dir: Path) -> None: + shp_dir = output_dir / "facilities_shp" + shp_dir.mkdir(exist_ok=True) + subprocess.check_call( + [ + "ogr2ogr", + "-progress", + "-f", + "ESRI Shapefile", + str(shp_dir / "facilities.shp"), + f"PG:{BUILD_ENGINE}", + f"{pg.schema}.facdb_export", + "-nlt", + "POINT", + "-t_srs", + "EPSG:2263", + ] + ) + with zipfile.ZipFile( + output_dir / "facilities.shp.zip", "w", zipfile.ZIP_DEFLATED, compresslevel=9 + ) as zf: + for f in shp_dir.iterdir(): + zf.write(f, f.name) + shutil.rmtree(shp_dir) + + +def _export_fgdb(pg: postgres.PostgresClient, output_dir: Path) -> None: + gdb_outer = output_dir / "facilities_fgdb" + gdb_outer.mkdir(exist_ok=True) + gdb_path = gdb_outer / "facilities.gdb" + subprocess.check_call( + [ + "ogr2ogr", + "-progress", + "-f", + "OpenFileGDB", + str(gdb_path), + f"PG:{BUILD_ENGINE}", + f"{pg.schema}.facdb_export", + "-mapFieldType", + "Integer64=Real", + "-lco", + "GEOMETRY_NAME=Shape", + "-nln", + "facdb", + "-nlt", + "POINT", + "-t_srs", + "EPSG:2263", + ] + ) + with zipfile.ZipFile( + output_dir / "facilities.gdb.zip", "w", zipfile.ZIP_DEFLATED + ) as zf: + for f in gdb_path.rglob("*"): + zf.write(f, f.relative_to(gdb_outer)) + shutil.rmtree(gdb_outer) + + @app.command("init") def _cli_init(): """ @@ -120,5 +203,30 @@ def _cli_reformat_facdb(): ) 
+@app.command("export") +def _cli_export(): + """Export facdb tables and geospatial files to the output directory.""" + output_dir = Path("output") + output_dir.mkdir(exist_ok=True) + + shutil.copy("facdb/metadata.yml", output_dir / "metadata.yml") + shutil.copy("build_metadata.json", output_dir / "build_metadata.json") + shutil.copy("source_data_versions.csv", output_dir / "source_data_versions.csv") + + pg = postgres.PostgresClient() + + with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor: + futures = [ + executor.submit(_export_csv, pg, table, filename, output_dir) + for table, filename in _CSV_EXPORTS + ] + futures.append(executor.submit(_export_shp, pg, output_dir)) + futures.append(executor.submit(_export_fgdb, pg, output_dir)) + for future in concurrent.futures.as_completed(futures): + future.result() + + typer.echo(typer.style("SUCCESS: export complete", fg=typer.colors.GREEN)) + + if __name__ == "__main__": app() From 98eea903878876c5301f1a136c4668604269ad29 Mon Sep 17 00:00:00 2001 From: Damon McCullough Date: Fri, 8 May 2026 12:18:00 -0400 Subject: [PATCH 06/12] test FacDB models --- .github/workflows/facilities_build.yml | 10 ++++++++++ .../facilities/models/product/_product.yml | 20 +++++++++++++++++++ .../models/product/facdb_export.sql | 1 + .../tests/generic/test_many_to_one.sql | 12 +++++++++++ 4 files changed, 43 insertions(+) create mode 100644 products/facilities/models/product/_product.yml create mode 100644 products/facilities/models/product/facdb_export.sql create mode 100644 products/facilities/tests/generic/test_many_to_one.sql diff --git a/.github/workflows/facilities_build.yml b/.github/workflows/facilities_build.yml index 0955b8fecf..3e3f2d4538 100644 --- a/.github/workflows/facilities_build.yml +++ b/.github/workflows/facilities_build.yml @@ -48,6 +48,10 @@ jobs: AWS_SECRET_ACCESS_KEY: "op://Data Engineering/DO_keys/AWS_SECRET_ACCESS_KEY" AWS_ACCESS_KEY_ID: "op://Data Engineering/DO_keys/AWS_ACCESS_KEY_ID" 
BUILD_ENGINE_SERVER: "op://Data Engineering/EDM_DATA/server_url" + BUILD_ENGINE_HOST: "op://Data Engineering/EDM_DATA/server" + BUILD_ENGINE_USER: "op://Data Engineering/EDM_DATA/username" + BUILD_ENGINE_PASSWORD: "op://Data Engineering/EDM_DATA/password" + BUILD_ENGINE_PORT: "op://Data Engineering/EDM_DATA/port" - name: Run container setup working-directory: ./ @@ -82,6 +86,12 @@ jobs: - name: Modify facdb table run: python -m facdb.cli reformat_facdb + - name: Test build tables + run: | + dbt deps + dbt debug + dbt test + - name: Export facdb run: python -m facdb.cli export diff --git a/products/facilities/models/product/_product.yml b/products/facilities/models/product/_product.yml new file mode 100644 index 0000000000..b2b4cfaaf3 --- /dev/null +++ b/products/facilities/models/product/_product.yml @@ -0,0 +1,20 @@ +models: + - name: facdb_export + columns: + - name: '"UID"' + tests: [ not_null, unique ] + - name: '"DATASOURCE"' + tests: [ not_null ] + - name: '"FACNAME"' + tests: + - not_null: + config: + severity: error + error_if: ">10" + - name: '"FACTYPE"' + tests: + - not_null + - many_to_one: + parent_column: '"FACSUBGRP"' + - name: '"FACSUBGRP"' + tests: [ not_null ] diff --git a/products/facilities/models/product/facdb_export.sql b/products/facilities/models/product/facdb_export.sql new file mode 100644 index 0000000000..452633ee61 --- /dev/null +++ b/products/facilities/models/product/facdb_export.sql @@ -0,0 +1 @@ +-- Placeholder SQL file for facdb_export model. This file is required for dbt to recognize the model, but the actual SQL logic is implemented in the facdb_export.py script. 
diff --git a/products/facilities/tests/generic/test_many_to_one.sql b/products/facilities/tests/generic/test_many_to_one.sql new file mode 100644 index 0000000000..0489ae00f7 --- /dev/null +++ b/products/facilities/tests/generic/test_many_to_one.sql @@ -0,0 +1,12 @@ +{% test many_to_one(model, column_name, parent_column) %} +-- Asserts that each value of `column_name` (the "many" side) is associated with +-- exactly one value of `parent_column` (the "one" side). +-- Fails if any `column_name` value maps to more than one `parent_column` value, +-- which would indicate a many-to-many relationship. +select + {{ column_name }}, + count(distinct {{ parent_column }}) as parent_count +from {{ model }} +group by {{ column_name }} +having count(distinct {{ parent_column }}) > 1 +{% endtest %} From 9c2886a5f734eccb1f00918724280000148812be Mon Sep 17 00:00:00 2001 From: Damon McCullough Date: Mon, 11 May 2026 12:49:29 -0400 Subject: [PATCH 07/12] ignore sqlfluff rule we'd rather be able to be explicit in case statements --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 0bb1747fa4..ab2a8545b7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,6 +9,7 @@ exclude_rules = [ "references.qualification", "structure.unused_cte", "structure.column_order", + "structure.else_null", "structure.subquery", "references.keywords", "references.consistent", From b2c07e450942f684466fc799c261d6fc2564cf53 Mon Sep 17 00:00:00 2001 From: Damon McCullough Date: Sun, 10 May 2026 21:53:43 -0400 Subject: [PATCH 08/12] map factype values for doe_universalprek --- products/facilities/facdb/sql/pipelines/doe_universalprek.sql | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/products/facilities/facdb/sql/pipelines/doe_universalprek.sql b/products/facilities/facdb/sql/pipelines/doe_universalprek.sql index 244e2932c9..bb1c87791f 100644 --- a/products/facilities/facdb/sql/pipelines/doe_universalprek.sql +++ 
b/products/facilities/facdb/sql/pipelines/doe_universalprek.sql @@ -15,8 +15,9 @@ SELECT (CASE WHEN site_type = 'DOE' OR site_type = 'Public School' THEN 'DOE Universal Pre-K' WHEN site_type = 'CHARTER' OR site_type = 'Charter' THEN 'DOE Universal Pre-K - Charter' - WHEN site_type = 'NYCEEC' OR site_type = 'CBO' THEN 'Early Education Program' + WHEN site_type = 'NYCEEC' OR site_type = 'CBO' OR site_type = 'LYFE' THEN 'Early Education Program' WHEN site_type = 'PKC' THEN 'Pre-K Center' + ELSE NULL END) AS factype, 'DOE Universal Pre-Kindergarten' AS facsubgrp, (CASE From 4f8bcc7a883666aa4096994656d65c77f51f14e5 Mon Sep 17 00:00:00 2001 From: Damon McCullough Date: Sun, 10 May 2026 22:06:36 -0400 Subject: [PATCH 09/12] map factype values for foodbankny_foodbanks --- .../facilities/facdb/sql/pipelines/foodbankny_foodbanks.sql | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/products/facilities/facdb/sql/pipelines/foodbankny_foodbanks.sql b/products/facilities/facdb/sql/pipelines/foodbankny_foodbanks.sql index 7d40671518..f97a9386f3 100644 --- a/products/facilities/facdb/sql/pipelines/foodbankny_foodbanks.sql +++ b/products/facilities/facdb/sql/pipelines/foodbankny_foodbanks.sql @@ -14,8 +14,9 @@ SELECT NULL AS bin, NULL AS bbl, (CASE - WHEN program_type ~* 'pantry' THEN 'Food Pantry' - WHEN program_type ~* 'Soup Kitchen' THEN 'Soup Kitchen' + WHEN program_type ~* 'pantry' OR program_type = 'MP' THEN 'Food Pantry' + WHEN program_type ~* 'Soup Kitchen' OR program_type IN ('SOUP KITCH', 'SKM') THEN 'Soup Kitchen' + ELSE NULL END) AS factype, 'Soup Kitchens and Food Pantries' AS facsubgrp, agency AS opname, From 1ba51ffd46c955f64945361fac461c4f83f4bc48 Mon Sep 17 00:00:00 2001 From: Damon McCullough Date: Sun, 10 May 2026 22:40:09 -0400 Subject: [PATCH 10/12] map factype values for dycd_service_sites --- .../sql/pipelines/dycd_service_sites.sql | 41 +++++++++++++++++-- 1 file changed, 38 insertions(+), 3 deletions(-) diff --git 
a/products/facilities/facdb/sql/pipelines/dycd_service_sites.sql b/products/facilities/facdb/sql/pipelines/dycd_service_sites.sql index 41561e5889..dd1c093ac2 100644 --- a/products/facilities/facdb/sql/pipelines/dycd_service_sites.sql +++ b/products/facilities/facdb/sql/pipelines/dycd_service_sites.sql @@ -13,7 +13,28 @@ SELECT NULL AS borocode, bin, bbl, - service_category AS factype, + (CASE + WHEN service_category IS NOT NULL THEN service_category + WHEN program_type ~* 'Transitional Independent Living \(TIL\)' + THEN 'Transitional Independent Living' + WHEN program_type ~* 'Transitional Independent Living \(HYA\)' + THEN 'Transitional Independent Living' + WHEN program_type ~* 'Adult Literacy Pilot Project' + THEN 'Adult Literacy' + WHEN program_type ~* 'Services for Immigrants' + THEN 'Immigrant Services' + WHEN program_type ~* 'Immigrant Workers' + THEN 'Immigrant Workers' + WHEN program_type ~* 'Crisis Shelters' + THEN 'Crisis Shelters' + WHEN program_type ~* 'Victims of Domestic Violence and Trafficking' + THEN 'VICTIM SERVICES, DOMESTIC VIOLENCE' + WHEN program_type ~* 'Legal Services For Immigrant Youth' + THEN 'Legal Services for Immigrant Youth' + WHEN program_type ~* 'COMPASS Horizon' + THEN 'COMPASS' + ELSE NULL + END) AS factype, (CASE WHEN program_type ~* 'Beacon' @@ -22,9 +43,23 @@ SELECT OR program_type ~* 'Teen Action Program' OR program_type ~* 'After-School Programs' THEN 'After-School Programs' - WHEN service_category ~* 'Immigrant Support Services' + WHEN program_type ~* 'COMPASS Horizon' + THEN 'After-School Programs' + WHEN + program_type ~* 'Services for Immigrants' + OR program_type ~* 'Immigrant Workers' + OR program_type ~* 'Legal Services For Immigrant Youth' THEN 'Immigrant Services' - ELSE 'Youth Centers, Literacy Programs, and Job Training Services' + WHEN program_type ~* 'Victims of Domestic Violence and Trafficking' + THEN 'LEGAL AND INTERVENTION SERVICES' + WHEN service_category IS NOT NULL + THEN 'Youth Centers, Literacy 
Programs, and Job Training Services' + WHEN + program_type ~* 'Transitional Independent Living' + OR program_type ~* 'Adult Literacy Pilot Project' + OR program_type ~* 'Crisis Shelters' + THEN 'Youth Centers, Literacy Programs, and Job Training Services' + ELSE NULL END) AS facsubgrp, provider AS opname, NULL AS opabbrev, From ed522c8371c59aeb8afbb3276de119de16f966b1 Mon Sep 17 00:00:00 2001 From: Damon McCullough Date: Mon, 11 May 2026 13:06:35 -0400 Subject: [PATCH 11/12] more explicit factype and facsubgrp mappings --- .../sql/pipelines/dycd_service_sites.sql | 36 ++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/products/facilities/facdb/sql/pipelines/dycd_service_sites.sql b/products/facilities/facdb/sql/pipelines/dycd_service_sites.sql index dd1c093ac2..e494505c18 100644 --- a/products/facilities/facdb/sql/pipelines/dycd_service_sites.sql +++ b/products/facilities/facdb/sql/pipelines/dycd_service_sites.sql @@ -14,13 +14,33 @@ SELECT bin, bbl, (CASE + WHEN service_category ILIKE '%AFTERSCHOOL%' AND program_type ~* 'COMPASS Middle School' + THEN 'COMPASS' + WHEN service_category ILIKE '%AFTERSCHOOL%' AND program_type ~* 'COMPASS Elementary' + THEN 'COMPASS ELEMENTARY' + WHEN service_category ILIKE '%AFTERSCHOOL%' AND program_type ~* 'COMPASS Explore' + THEN 'COMPASS EXPLORE' + WHEN service_category ILIKE '%AFTERSCHOOL%' AND program_type ~* 'COMPASS High School' + THEN 'COMPASS HIGH' + WHEN service_category ILIKE '%AFTERSCHOOL%' AND program_type ~* 'COMPASS SONYC Pilot' + THEN 'COMPASS' + WHEN service_category ILIKE '%AFTERSCHOOL%' AND program_type ~* 'Beacon' + THEN 'BEACON' + WHEN service_category ILIKE '%AFTERSCHOOL%' AND program_type ~* 'Cornerstone' + THEN 'CORNERSTONE' + WHEN service_category ILIKE '%AFTERSCHOOL%' AND program_type ~* 'High School' + THEN 'HIGH SCHOOL AFTERSCHOOL PROGRAMS' + WHEN service_category ILIKE '%AFTERSCHOOL%' AND program_type ~* 'Learn and Earn' + THEN 'LEARN AND EARN' + WHEN service_category ILIKE 
'%AFTERSCHOOL%' AND program_type ~* 'Adolescent Literacy' + THEN 'ADOLESCENT LITERACY' WHEN service_category IS NOT NULL THEN service_category WHEN program_type ~* 'Transitional Independent Living \(TIL\)' THEN 'Transitional Independent Living' WHEN program_type ~* 'Transitional Independent Living \(HYA\)' THEN 'Transitional Independent Living' WHEN program_type ~* 'Adult Literacy Pilot Project' - THEN 'Adult Literacy' + THEN 'Adult Literacy Pilot Program' WHEN program_type ~* 'Services for Immigrants' THEN 'Immigrant Services' WHEN program_type ~* 'Immigrant Workers' @@ -36,6 +56,20 @@ SELECT ELSE NULL END) AS factype, (CASE + WHEN + service_category ILIKE '%AFTERSCHOOL%' AND ( + program_type ~* 'COMPASS' + OR program_type ~* 'Beacon' + OR program_type ~* 'Cornerstone' + OR program_type ~* 'High School' + ) + THEN 'After-School Programs' + WHEN + service_category ILIKE '%AFTERSCHOOL%' AND ( + program_type ~* 'Learn and Earn' + OR program_type ~* 'Adolescent Literacy' + ) + THEN 'Youth Centers, Literacy Programs, and Job Training Services' WHEN program_type ~* 'Beacon' OR program_type ~* 'High-School Aged Youth' From 3f20e579f41e540476a12b021b4df79c3a1a1b13 Mon Sep 17 00:00:00 2001 From: Damon McCullough Date: Mon, 11 May 2026 14:45:05 -0400 Subject: [PATCH 12/12] more explicit factype and facsubgrp mappings --- products/facilities/facdb/sql/pipelines/dcp_colp.sql | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/products/facilities/facdb/sql/pipelines/dcp_colp.sql b/products/facilities/facdb/sql/pipelines/dcp_colp.sql index 9d7f88d2f7..489ef9bb57 100644 --- a/products/facilities/facdb/sql/pipelines/dcp_colp.sql +++ b/products/facilities/facdb/sql/pipelines/dcp_colp.sql @@ -36,6 +36,10 @@ WITH _dcp_colp_tmp AS ( WHEN parcelname ~* 'PRECINCT' AND usecode = '0500' THEN 'Police Station' + WHEN agency LIKE '%OCME%' AND usetype LIKE '%TESTING LABORATORY%' THEN 'Medical Testing Laboratory' + WHEN agency LIKE '%DEP%' AND usetype LIKE '%TESTING LABORATORY%' THEN 
'Environmental Testing Laboratory' + WHEN agency LIKE '%CUNY%' AND usetype LIKE '%RESIDENTIAL%' THEN 'University Residential Structure' + WHEN agency LIKE '%HHC%' AND usetype LIKE '%RESIDENTIAL%' THEN 'Hospital Residential Structure' ELSE initcap(replace(usetype, 'OTHER ', '')) END ) AS factype, @@ -202,6 +206,9 @@ WITH _dcp_colp_tmp AS ( WHEN usetype LIKE '%EMERGENCY MEDICAL%' THEN 'Other Emergency Services' WHEN usetype LIKE '%FIREHOUSE%' THEN 'Fire Services' WHEN usetype LIKE '%POLICE STATION%' THEN 'Police Services' + WHEN + usetype LIKE '%PUBLIC SAFETY%' AND usecode = '0500' AND parcelname ~* 'PRECINCT' + THEN 'Police Services' WHEN usetype LIKE '%PUBLIC SAFETY%' THEN 'Other Public Safety' WHEN agency LIKE '%OCME%' THEN 'Forensics' -- Education, Children, Youth WHEN usetype LIKE '%UNIVERSITY%' THEN 'Colleges or Universities'