From 082891b3efd451ecb7e0f620a9afb21fc2a35e65 Mon Sep 17 00:00:00 2001
From: Fred Trotter
Date: Sun, 27 Jul 2025 23:12:42 -0400
Subject: [PATCH 01/24] Document the purpose of the repo

---
 README.md | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 16aafa0..7ed8e0d 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,11 @@
-# vrdc_vtin_to_npi
-Calculate VTINs for a VRDC PUF
+# VRDC Python Projects
+
+This is a meta-project folder that will contain VRDC projects that create new data files inside the VRDC environment.
+
+## Purpose and Approach
+
+Some of these data files will become public resources on (probably) data.cms.gov. Others will contain private information that will require that the data remain in the CMS VRDC environment.
+
+The projects should generally be implemented using [plainerflow](https://github.com/DSACMS/ndh_plainerflow) so that they will be portable between SQL-based systems.
+
+It is possible that some of these systems will need to be executed in SAS ProcSQL or FedSQL, or Snowflake notebooks, but the default environment will be Databricks notebooks.

From a39a5bce1a5225a4bd7b296ba90e968f25ffe89c Mon Sep 17 00:00:00 2001
From: Fred Trotter
Date: Wed, 10 Sep 2025 23:16:51 -0400
Subject: [PATCH 02/24] Update README.md

---
 README.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 7ed8e0d..7c5af05 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
-# VRDC Python Projects
+# VRDC/IDR Projects
 
-This is a meta-project folder that will contain VRDC projects that create new data files inside the VRDC environment.
+This is a meta-project folder that will contain VRDC/IDR projects that create new data files inside the VRDC/IDR environment.
 
 ## Purpose and Approach
 
@@ -8,4 +8,4 @@ Some of these data files will become public resources on (probably) data.cms.gov
 
 The projects should generally be implemented using [plainerflow](https://github.com/DSACMS/ndh_plainerflow) so that they will be portable between SQL-based systems.
 
-It is possible that some of these systems will need to be executed in SAS ProcSQL or FedSQL, or Snowflake notebooks, but the default environment will be Databricks notebooks.
+It is possible that some of these systems will need to be executed in SAS ProcSQL or FedSQL, or Snowflake notebooks, but the default environment will be Databricks notebooks for VRDC and Snowflake workbooks for IDR.

From ce65211fb179d550908d30c5e0ad2c05f63aa344 Mon Sep 17 00:00:00 2001
From: Fred Trotter
Date: Wed, 10 Sep 2025 23:21:47 -0400
Subject: [PATCH 03/24] initial

---
 idr/pecos_practice_location.sql | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)
 create mode 100644 idr/pecos_practice_location.sql

diff --git a/idr/pecos_practice_location.sql b/idr/pecos_practice_location.sql
new file mode 100644
index 0000000..c0d6a84
--- /dev/null
+++ b/idr/pecos_practice_location.sql
@@ -0,0 +1,23 @@
+-- This pulls out the latest PECOS addresses, by NPI, where the transaction effective date is after 2022
+-- YEAR(IDR_TRANS_OBSLT_TS) = 9999 filters out all not-current records.
+SELECT
+    PRVDR_NPI_NUM,
+    ADR_TYPE_DESC,
+    LINE_1_ADR,
+    LINE_2_ADR,
+    GEO_USPS_STATE_CD,
+    ZIP5_CD,
+    ZIP4_CD,
+    CNTRY_CD,
+    PHNE_NUM,
+    PHNE_EXTNSN_NUM,
+    FAX_NUM,
+    IDR_UPDT_TS,
+    YEAR(IDR_UPDT_TS) AS IDR_UPDT_TS_YEAR,
+    IDR_TRANS_OBSLT_TS,
+    YEAR(IDR_TRANS_OBSLT_TS) AS IDR_TRANS_OBSLT_TS_YEAR,
+    IDR_TRANS_EFCTV_TS,
+    YEAR(IDR_TRANS_EFCTV_TS) AS IDR_TRANS_EFCTV_TS_YEAR
+
+FROM IDRC_PRD.CMS_VDM_VIEW_MDCR_PRD.V2_PRVDR_ENMRT_ADR_HSTRY
+WHERE YEAR(IDR_TRANS_OBSLT_TS) = 9999 AND YEAR(IDR_TRANS_EFCTV_TS) > 2022 AND ADR_TYPE_DESC = 'PRACTICE'
\ No newline at end of file

From bc4106d42528c6e789a6dfe429d2cb75d86e7f29 Mon Sep 17 00:00:00 2001
From: Fred Trotter
Date: Thu, 11 Sep 2025 00:30:52 -0400
Subject: [PATCH 04/24] move to python

---
 idr/pecos_practice_location.sql      | 23 --------------------
 idr/pecos_recent_practice_address.py | 46 ++++++++++++++++++++++++++++
 2 files changed, 46 insertions(+), 23 deletions(-)
 delete mode 100644 idr/pecos_practice_location.sql
 create mode 100644 idr/pecos_recent_practice_address.py

diff --git a/idr/pecos_practice_location.sql b/idr/pecos_practice_location.sql
deleted file mode 100644
index c0d6a84..0000000
--- a/idr/pecos_practice_location.sql
+++ /dev/null
@@ -1,23 +0,0 @@
--- This pulls out the latest PECOS addresses, by NPI, where the transaction effective date is after 2022
--- YEAR(IDR_TRANS_OBSLT_TS) = 9999 filters out all not-current records.
-SELECT
-    PRVDR_NPI_NUM,
-    ADR_TYPE_DESC,
-    LINE_1_ADR,
-    LINE_2_ADR,
-    GEO_USPS_STATE_CD,
-    ZIP5_CD,
-    ZIP4_CD,
-    CNTRY_CD,
-    PHNE_NUM,
-    PHNE_EXTNSN_NUM,
-    FAX_NUM,
-    IDR_UPDT_TS,
-    YEAR(IDR_UPDT_TS) AS IDR_UPDT_TS_YEAR,
-    IDR_TRANS_OBSLT_TS,
-    YEAR(IDR_TRANS_OBSLT_TS) AS IDR_TRANS_OBSLT_TS_YEAR,
-    IDR_TRANS_EFCTV_TS,
-    YEAR(IDR_TRANS_EFCTV_TS) AS IDR_TRANS_EFCTV_TS_YEAR
-
-FROM IDRC_PRD.CMS_VDM_VIEW_MDCR_PRD.V2_PRVDR_ENMRT_ADR_HSTRY
-WHERE YEAR(IDR_TRANS_OBSLT_TS) = 9999 AND YEAR(IDR_TRANS_EFCTV_TS) > 2022 AND ADR_TYPE_DESC = 'PRACTICE'
\ No newline at end of file
diff --git a/idr/pecos_recent_practice_address.py b/idr/pecos_recent_practice_address.py
new file mode 100644
index 0000000..b033976
--- /dev/null
+++ b/idr/pecos_recent_practice_address.py
@@ -0,0 +1,46 @@
+# Note: you must have an appropriate role chosen and the IDRC_PRD_COMM_WH warehouse selected
+
+# Import python packages
+import streamlit as st
+import pandas as pd
+from datetime import datetime
+
+# We can also use Snowpark for our analyses!
+from snowflake.snowpark.context import get_active_session
+session = get_active_session()
+
+address_sql = f"""
+SELECT
+    PRVDR_NPI_NUM,
+    ADR_TYPE_DESC,
+    LINE_1_ADR,
+    LINE_2_ADR,
+    GEO_USPS_STATE_CD,
+    ZIP5_CD,
+    ZIP4_CD,
+    CNTRY_CD,
+    PHNE_NUM,
+    FAX_NUM,
+    YEAR(IDR_UPDT_TS) AS IDR_UPDT_TS_YEAR,
+    YEAR(IDR_TRANS_OBSLT_TS) AS IDR_TRANS_OBSLT_TS_YEAR,
+    YEAR(IDR_TRANS_EFCTV_TS) AS IDR_TRANS_EFCTV_TS_YEAR
+
+FROM IDRC_PRD.CMS_VDM_VIEW_MDCR_PRD.V2_PRVDR_ENMRT_ADR_HSTRY
+WHERE YEAR(IDR_TRANS_OBSLT_TS) = 9999 AND YEAR(IDR_TRANS_EFCTV_TS) > 2022 AND ADR_TYPE_DESC = 'PRACTICE'
+
+"""
+
+df = session.sql(address_sql).to_pandas()
+
+ts = datetime.now().strftime("%Y_%m_%d_%H%M")
+
+address_file_name = f"pecos_recent_practice_address.{ts}.csv"
+
+session.file.put_stream(
+    df.to_csv(address_file_name, index = False),
+    f"@~/{address_file_name}",
+    auto_compress=False
+)
+
+# To download use:
+# snowsql -c cms_idr -q "GET @~/ file://. PATTERN='.*.csv';"
\ No newline at end of file

From b0f08cb2977cc6a25a1e0a1f52d1bfdb6ac3142a Mon Sep 17 00:00:00 2001
From: Fred Trotter
Date: Thu, 11 Sep 2025 01:24:51 -0400
Subject: [PATCH 05/24] this version generates too much data local to the notebook.

---
 idr/pecos_unexpired_state_license.py | 48 ++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)
 create mode 100644 idr/pecos_unexpired_state_license.py

diff --git a/idr/pecos_unexpired_state_license.py b/idr/pecos_unexpired_state_license.py
new file mode 100644
index 0000000..bd3d49a
--- /dev/null
+++ b/idr/pecos_unexpired_state_license.py
@@ -0,0 +1,48 @@
+# Note: you must have an appropriate role chosen and the IDRC_PRD_COMM_WH warehouse selected
+
+# Import python packages
+import streamlit as st
+import pandas as pd
+from datetime import datetime
+
+# We can also use Snowpark for our analyses!
+from snowflake.snowpark.context import get_active_session
+session = get_active_session()
+
+now = datetime.now()
+last_year = now.year - 1
+
+
+address_sql = f"""
+SELECT
+    PRVDR_NPI_NUM,
+    PRVDR_ENRLMT_LCNS_STATE_CD,
+    PRVDR_ENRLMT_LCNS_NUM,
+    PRVDR_ENRLMT_FORM_CD,
+    PRVDR_ENRLMT_SPCLTY_CD,
+    PRVDR_ENRLMT_SPCLTY_DESC
+FROM IDRC_PRD.CMS_VDM_VIEW_MDCR_PRD.V2_PRVDR_ENRLMT_LCNS_CRNT AS current_license
+JOIN IDRC_PRD.CMS_VDM_VIEW_MDCR_PRD.V2_PRVDR_ENRLMT_NPI_CRNT enrollment_to_npi ON
+    enrollment_to_npi.prvdr_enrlmt_id =
+    current_license.prvdr_enrlmt_id
+JOIN IDRC_PRD.CMS_VDM_VIEW_MDCR_PRD.V2_PRVDR_ENRLMT_SPCLTY_CRNT AS enrollment_speciality ON
+    enrollment_speciality.prvdr_enrlmt_id =
+    enrollment_to_npi.prvdr_enrlmt_id
+WHERE YEAR(PRVDR_ENRLMT_LCNS_EXPRTN_DT) > {last_year}
+
+"""
+
+df = session.sql(address_sql).to_pandas()
+
+ts = datetime.now().strftime("%Y_%m_%d_%H%M")
+
+address_file_name = f"pecos_recent_practice_address.{ts}.csv"
+
+session.file.put_stream(
+    df.to_csv(address_file_name, index = False),
+    f"@~/{address_file_name}",
+    auto_compress=False
+)
+
+# To download use:
+# snowsql -c cms_idr -q "GET @~/ file://. PATTERN='.*.csv';"
PATTERN='.*.csv';" \ No newline at end of file From b8a0a9c210861dcf1b2c77d7eefb45eef7bdd5ac Mon Sep 17 00:00:00 2001 From: ftrotter-gov Date: Thu, 11 Sep 2025 01:29:17 -0400 Subject: [PATCH 06/24] use COPY INTO --- idr/pecos_unexpired_state_license.py | 31 +++++++++++++++++----------- 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/idr/pecos_unexpired_state_license.py b/idr/pecos_unexpired_state_license.py index bd3d49a..545ff47 100644 --- a/idr/pecos_unexpired_state_license.py +++ b/idr/pecos_unexpired_state_license.py @@ -9,11 +9,18 @@ from snowflake.snowpark.context import get_active_session session = get_active_session() +ts = datetime.now().strftime("%Y_%m_%d_%H%M") + +license_file_name = f"@~/pecos_recent_practice_address.{ts}.csv" + + now = datetime.now() last_year = now.year - 1 -address_sql = f""" +license_sql = f""" +COPY INTO {license_file_name} +FROM ( SELECT PRVDR_NPI_NUM, PRVDR_ENRLMT_LCNS_STATE_CD, @@ -29,20 +36,20 @@ enrollment_speciality.prvdr_enrlmt_id = enrollment_to_npi.prvdr_enrlmt_id WHERE YEAR(PRVDR_ENRLMT_LCNS_EXPRTN_DT) > {last_year} - +) +FILE_FORMAT = ( + TYPE = CSV + FIELD_DELIMITER = ',' + FIELD_OPTIONALLY_ENCLOSED_BY = '"' + COMPRESSION = NONE +) +HEADER = TRUE +SINGLE = TRUE +OVERWRITE = TRUE; """ -df = session.sql(address_sql).to_pandas() - -ts = datetime.now().strftime("%Y_%m_%d_%H%M") +session.sql(license_sql).collect() -address_file_name = f"pecos_recent_practice_address.{ts}.csv" - -session.file.put_stream( - df.to_csv(address_file_name, index = False), - f"@~/{address_file_name}", - auto_compress=False -) # To download use: # snowsql -c cms_idr -q "GET @~/ file://. PATTERN='.*.csv';" \ No newline at end of file From b74afb7596f1f6a28686c1872031c6c92d5e2764 Mon Sep 17 00:00:00 2001 From: ftrotter-gov Date: Thu, 11 Sep 2025 01:40:33 -0400 Subject: [PATCH 07/24] merge csv files --- misc_scripts/snowflake_csv_merge.py | 97 +++++++++++++++++++++++++++++ 1 file changed, 97 insertions(+) create mode 100644 misc_scripts/snowflake_csv_merge.py diff --git a/misc_scripts/snowflake_csv_merge.py b/misc_scripts/snowflake_csv_merge.py new file mode 100644 index 0000000..bf8674e --- /dev/null +++ b/misc_scripts/snowflake_csv_merge.py @@ -0,0 +1,97 @@ +#!/usr/bin/env python3 +""" +Written by ChatGPT in response to: + +I am going to have multiple different file patterns that I am downloading in this manner. +Please write my a python script that will merge everything that has the same "root" +from snowflakes download in the same directory. + +Cline used to add command line arguments afterwards. +""" +import os +import glob +import re +import pandas as pd +import argparse +from collections import defaultdict + +def group_files(files): + """ + Group files by their root before the Snowflake part suffix (_0_0_0, etc.). + E.g. 
'foo_0_0_0.csv' and 'foo_0_0_1.csv' -> root 'foo' + """ + groups = defaultdict(list) + for f in files: + base = os.path.basename(f) + # strip compression extension first + if base.endswith(".gz"): + base = base[:-3] + if base.endswith(".csv"): + base = base[:-4] + + # remove the trailing Snowflake suffix if present + root = re.sub(r'(_\d+_\d+_\d+)$', '', base) + groups[root].append(f) + return groups + +def merge_group(root, files, outdir="."): + # Sort files so order is consistent + files = sorted(files) + + outname = os.path.join(outdir, f"{root}.merged.csv") + print(f"Merging {len(files)} files into {outname}") + + # Read first file with header + df = pd.read_csv(files[0], compression="infer") + # Append others without header + for f in files[1:]: + df2 = pd.read_csv(f, header=None, compression="infer") + df = pd.concat([df, df2], ignore_index=True) + + df.to_csv(outname, index=False) + +def main(): + parser = argparse.ArgumentParser( + description="Merge CSV files with the same root name from Snowflake downloads" + ) + parser.add_argument( + "directory", + help="Directory location containing CSV files to merge" + ) + parser.add_argument( + "--output-dir", + help="Output directory for merged files (defaults to input directory)", + default=None + ) + + args = parser.parse_args() + + # Validate directory exists + if not os.path.isdir(args.directory): + print(f"Error: Directory '{args.directory}' does not exist") + return 1 + + # Set output directory + output_dir = args.output_dir if args.output_dir else args.directory + + # Look for all csv/csv.gz files in specified directory + csv_pattern = os.path.join(args.directory, "*.csv") + gz_pattern = os.path.join(args.directory, "*.csv.gz") + files = glob.glob(csv_pattern) + glob.glob(gz_pattern) + + if not files: + print(f"No CSV files found in directory: {args.directory}") + return 1 + + print(f"Found {len(files)} CSV files in {args.directory}") + groups = group_files(files) + + print(f"Identified {len(groups)} file groups to merge") + for root, flist in groups.items(): + merge_group(root, flist, outdir=output_dir) + + print("Merge process completed") + return 0 + +if __name__ == "__main__": + main() From f2343e9faf13ae27b04ae9529b34d81c64ddcdf6 Mon Sep 17 00:00:00 2001 From: Fred Trotter Date: Thu, 11 Sep 2025 01:41:40 -0400 Subject: [PATCH 08/24] multiple csv in output --- idr/pecos_unexpired_state_license.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/idr/pecos_unexpired_state_license.py b/idr/pecos_unexpired_state_license.py index 545ff47..73256c5 100644 --- a/idr/pecos_unexpired_state_license.py +++ b/idr/pecos_unexpired_state_license.py @@ -11,7 +11,7 @@ ts = datetime.now().strftime("%Y_%m_%d_%H%M") -license_file_name = f"@~/pecos_recent_practice_address.{ts}.csv" +license_file_name = f"@~/pecos_unexpired_license.{ts}.csv" now = datetime.now() @@ -44,7 +44,6 @@ COMPRESSION = NONE ) HEADER = TRUE -SINGLE = TRUE OVERWRITE = TRUE; """ From 0464679af6d7c60ca6817d463326b197aa1cbead Mon Sep 17 00:00:00 2001 From: Fred Trotter Date: Thu, 11 Sep 2025 01:44:36 -0400 Subject: [PATCH 09/24] no csv files --- .gitignore | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.gitignore b/.gitignore index b7faf40..fa5266e 100644 --- a/.gitignore +++ b/.gitignore @@ -205,3 +205,7 @@ cython_debug/ marimo/_static/ marimo/_lsp/ __marimo__/ + + +idr_data/**/*.csv +vrdc_data/**/*.csv From fe83d1f361be2abada09c97426a81467d0a8a2be Mon Sep 17 00:00:00 2001 From: Fred Trotter Date: Thu, 11 Sep 2025 01:45:26 -0400 Subject: [PATCH 10/24] 
---
 idr_data/ReadMe.md | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 idr_data/ReadMe.md

diff --git a/idr_data/ReadMe.md b/idr_data/ReadMe.md
new file mode 100644
index 0000000..9fca76e
--- /dev/null
+++ b/idr_data/ReadMe.md
@@ -0,0 +1 @@
+This directory holds the actual downloaded data, and uses .gitignore to ensure that csv files are not committed to GitHub.

From e64c9843eddb71ed107781b95b44cd4130019dbc Mon Sep 17 00:00:00 2001
From: Fred Trotter
Date: Thu, 11 Sep 2025 01:45:50 -0400
Subject: [PATCH 11/24] readme

---
 vrdc_data/ReadMe.md | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 vrdc_data/ReadMe.md

diff --git a/vrdc_data/ReadMe.md b/vrdc_data/ReadMe.md
new file mode 100644
index 0000000..9fca76e
--- /dev/null
+++ b/vrdc_data/ReadMe.md
@@ -0,0 +1 @@
+This directory holds the actual downloaded data, and uses .gitignore to ensure that csv files are not committed to GitHub.

From f049d37a79da98c522c6bddc47a15d8546c51bab Mon Sep 17 00:00:00 2001
From: Fred Trotter
Date: Thu, 11 Sep 2025 01:48:28 -0400
Subject: [PATCH 12/24] initial

---
 idr_data/download_all_snowflake_csv.sh | 2 ++
 1 file changed, 2 insertions(+)
 create mode 100755 idr_data/download_all_snowflake_csv.sh

diff --git a/idr_data/download_all_snowflake_csv.sh b/idr_data/download_all_snowflake_csv.sh
new file mode 100755
index 0000000..ea9f8b3
--- /dev/null
+++ b/idr_data/download_all_snowflake_csv.sh
@@ -0,0 +1,2 @@
+#!/bin/sh
+snowsql -c cms_idr -q "GET @~/ file://. PATTERN='.*.csv';"

From 24c1ee3bccc0abf747018a27a0c6779c2b657cbe Mon Sep 17 00:00:00 2001
From: ftrotter-gov
Date: Thu, 11 Sep 2025 02:04:08 -0400
Subject: [PATCH 13/24] check headers

---
 misc_scripts/snowflake_csv_merge.py | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/misc_scripts/snowflake_csv_merge.py b/misc_scripts/snowflake_csv_merge.py
index bf8674e..5a41628 100644
--- a/misc_scripts/snowflake_csv_merge.py
+++ b/misc_scripts/snowflake_csv_merge.py
@@ -41,14 +41,22 @@ def merge_group(root, files, outdir="."):
     outname = os.path.join(outdir, f"{root}.merged.csv")
     print(f"Merging {len(files)} files into {outname}")
 
-    # Read first file with header
-    df = pd.read_csv(files[0], compression="infer")
-    # Append others without header
+    # Read first file with header to establish column structure
+    df = pd.read_csv(files[0], compression="infer", dtype=str)
+    expected_headers = list(df.columns)
+
+    # Append others, validating headers match
     for f in files[1:]:
-        df2 = pd.read_csv(f, header=None, compression="infer")
+        df2 = pd.read_csv(f, compression="infer", dtype=str)
+
+        # Validate headers match exactly
+        if list(df2.columns) != expected_headers:
+            raise ValueError(f"Header mismatch in file {f}. Expected: {expected_headers}, Got: {list(df2.columns)}")
+
         df = pd.concat([df, df2], ignore_index=True)
 
     df.to_csv(outname, index=False)
+    print(f"Successfully merged {len(files)} files with {len(df)} total rows (excluding header)")
 
 def main():
     parser = argparse.ArgumentParser(

From e0bf58fc860ada9d119565ae66b48f38662d136f Mon Sep 17 00:00:00 2001
From: Fred Trotter
Date: Thu, 11 Sep 2025 02:05:23 -0400
Subject: [PATCH 14/24] automerge the snowflake downloads

---
 idr_data/download_all_snowflake_csv.sh           | 2 --
 idr_data/download_and_merge_all_snowflake_csv.sh | 5 +++++
 2 files changed, 5 insertions(+), 2 deletions(-)
 delete mode 100755 idr_data/download_all_snowflake_csv.sh
 create mode 100755 idr_data/download_and_merge_all_snowflake_csv.sh

diff --git a/idr_data/download_all_snowflake_csv.sh b/idr_data/download_all_snowflake_csv.sh
deleted file mode 100755
index ea9f8b3..0000000
--- a/idr_data/download_all_snowflake_csv.sh
+++ /dev/null
@@ -1,2 +0,0 @@
-#!/bin/sh
-snowsql -c cms_idr -q "GET @~/ file://. PATTERN='.*.csv';"
diff --git a/idr_data/download_and_merge_all_snowflake_csv.sh b/idr_data/download_and_merge_all_snowflake_csv.sh
new file mode 100755
index 0000000..95826a6
--- /dev/null
+++ b/idr_data/download_and_merge_all_snowflake_csv.sh
@@ -0,0 +1,5 @@
+#!/bin/sh
+cd ./unmerged_csv_files/
+#snowsql -c cms_idr -q "GET @~/ file://. PATTERN='.*.csv';"
+cd ..
+python3 ../misc_scripts/snowflake_csv_merge.py ./unmerged_csv_files/ --output-dir .

From 15e712cbba513edbd7ac7cc95c4718fbcb3906f1 Mon Sep 17 00:00:00 2001
From: Fred Trotter
Date: Thu, 11 Sep 2025 02:05:36 -0400
Subject: [PATCH 15/24] we need pandas so...

---
 source_me_to_get_venv.sh | 3 +++
 1 file changed, 3 insertions(+)
 create mode 100755 source_me_to_get_venv.sh

diff --git a/source_me_to_get_venv.sh b/source_me_to_get_venv.sh
new file mode 100755
index 0000000..6659f64
--- /dev/null
+++ b/source_me_to_get_venv.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+# This file exists to remind Fred what the command is because he is a noob
+source ./venv/bin/activate

From ae0662d738be81c4d3b71c391ae743d844f584a0 Mon Sep 17 00:00:00 2001
From: Fred Trotter
Date: Thu, 11 Sep 2025 02:06:26 -0400
Subject: [PATCH 16/24] also for pandas

---
 requirements.txt | 6 ++++++
 1 file changed, 6 insertions(+)
 create mode 100644 requirements.txt

diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..cf8c8a9
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,6 @@
+numpy==2.3.3
+pandas==2.3.2
+python-dateutil==2.9.0.post0
+pytz==2025.2
+six==1.17.0
+tzdata==2025.2

From 162b0517cd19f8f939e3b2a0ba7fdb616270f74a Mon Sep 17 00:00:00 2001
From: ftrotter-gov
Date: Thu, 11 Sep 2025 02:09:44 -0400
Subject: [PATCH 17/24] use COPY INTO

---
 idr/pecos_recent_practice_address.py | 35 +++++++++++++++-------------
 idr/pecos_unexpired_state_license.py |  4 ++--
 2 files changed, 21 insertions(+), 18 deletions(-)

diff --git a/idr/pecos_recent_practice_address.py b/idr/pecos_recent_practice_address.py
index b033976..bb46138 100644
--- a/idr/pecos_recent_practice_address.py
+++ b/idr/pecos_recent_practice_address.py
@@ -1,15 +1,21 @@
 # Note: you must have an appropriate role chosen and the IDRC_PRD_COMM_WH warehouse selected
 
 # Import python packages
-import streamlit as st
+import streamlit as st # type: ignore
 import pandas as pd
 from datetime import datetime
 
 # We can also use Snowpark for our analyses!
-from snowflake.snowpark.context import get_active_session
+from snowflake.snowpark.context import get_active_session # type: ignore
 session = get_active_session()
 
+ts = datetime.now().strftime("%Y_%m_%d_%H%M")
+
+address_file_name = f"@~/pecos_recent_practice_address.{ts}.csv"
+
 address_sql = f"""
+COPY INTO {address_file_name}
+FROM (
 SELECT
     PRVDR_NPI_NUM,
     ADR_TYPE_DESC,
@@ -24,23 +30,20 @@
     YEAR(IDR_UPDT_TS) AS IDR_UPDT_TS_YEAR,
     YEAR(IDR_TRANS_OBSLT_TS) AS IDR_TRANS_OBSLT_TS_YEAR,
     YEAR(IDR_TRANS_EFCTV_TS) AS IDR_TRANS_EFCTV_TS_YEAR
-
 FROM IDRC_PRD.CMS_VDM_VIEW_MDCR_PRD.V2_PRVDR_ENMRT_ADR_HSTRY
 WHERE YEAR(IDR_TRANS_OBSLT_TS) = 9999 AND YEAR(IDR_TRANS_EFCTV_TS) > 2022 AND ADR_TYPE_DESC = 'PRACTICE'
-
+)
+FILE_FORMAT = (
+    TYPE = CSV
+    FIELD_DELIMITER = ','
+    FIELD_OPTIONALLY_ENCLOSED_BY = '"'
+    COMPRESSION = NONE
+)
+HEADER = TRUE
+OVERWRITE = TRUE;
 """
 
-df = session.sql(address_sql).to_pandas()
-
-ts = datetime.now().strftime("%Y_%m_%d_%H%M")
-
-address_file_name = f"pecos_recent_practice_address.{ts}.csv"
-
-session.file.put_stream(
-    df.to_csv(address_file_name, index = False),
-    f"@~/{address_file_name}",
-    auto_compress=False
-)
+session.sql(address_sql).collect()
 
 # To download use:
-# snowsql -c cms_idr -q "GET @~/ file://. PATTERN='.*.csv';"
\ No newline at end of file
+# snowsql -c cms_idr -q "GET @~/ file://. PATTERN='.*.csv';"
diff --git a/idr/pecos_unexpired_state_license.py b/idr/pecos_unexpired_state_license.py
index 73256c5..5a0f69e 100644
--- a/idr/pecos_unexpired_state_license.py
+++ b/idr/pecos_unexpired_state_license.py
@@ -1,12 +1,12 @@
 # Note: you must have an appropriate role chosen and the IDRC_PRD_COMM_WH warehouse selected
 
 # Import python packages
-import streamlit as st
+import streamlit as st # type: ignore
 import pandas as pd
 from datetime import datetime
 
 # We can also use Snowpark for our analyses!
-from snowflake.snowpark.context import get_active_session
+from snowflake.snowpark.context import get_active_session # type: ignore
 session = get_active_session()
 
 ts = datetime.now().strftime("%Y_%m_%d_%H%M")

From 4fc9fb2d3c23bbaa651ad5568c2ecc6e1356c523 Mon Sep 17 00:00:00 2001
From: ftrotter-gov
Date: Thu, 11 Sep 2025 02:10:53 -0400
Subject: [PATCH 18/24] usage notes

---
 idr/pecos_recent_practice_address.py | 1 +
 idr/pecos_unexpired_state_license.py | 3 ++-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/idr/pecos_recent_practice_address.py b/idr/pecos_recent_practice_address.py
index bb46138..3eb2d67 100644
--- a/idr/pecos_recent_practice_address.py
+++ b/idr/pecos_recent_practice_address.py
@@ -47,3 +47,4 @@
 
 # To download use:
 # snowsql -c cms_idr -q "GET @~/ file://. PATTERN='.*.csv';"
+# Or use ../idr_data/download_and_merge_all_snowflake_csv.sh, which downloads the data from the IDR and then re-merges the csv files.
\ No newline at end of file
diff --git a/idr/pecos_unexpired_state_license.py b/idr/pecos_unexpired_state_license.py
index 5a0f69e..94cb4ae 100644
--- a/idr/pecos_unexpired_state_license.py
+++ b/idr/pecos_unexpired_state_license.py
@@ -51,4 +51,5 @@
 
 
 # To download use:
-# snowsql -c cms_idr -q "GET @~/ file://. PATTERN='.*.csv';"
\ No newline at end of file
+# snowsql -c cms_idr -q "GET @~/ file://. PATTERN='.*.csv';"
+# Or use ../idr_data/download_and_merge_all_snowflake_csv.sh, which downloads the data from the IDR and then re-merges the csv files.
\ No newline at end of file

From b27c6b22421dd07c0006f7ed0310e9842d932fab Mon Sep 17 00:00:00 2001
From: ftrotter-gov
Date: Thu, 11 Sep 2025 02:22:04 -0400
Subject: [PATCH 19/24] npi query

---
 idr/pecos_recent_practice_address.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/idr/pecos_recent_practice_address.py b/idr/pecos_recent_practice_address.py
index 3eb2d67..aa06785 100644
--- a/idr/pecos_recent_practice_address.py
+++ b/idr/pecos_recent_practice_address.py
@@ -47,4 +47,14 @@
 
 # To download use:
 # snowsql -c cms_idr -q "GET @~/ file://. PATTERN='.*.csv';"
-# Or use ../idr_data/download_and_merge_all_snowflake_csv.sh, which downloads the data from the IDR and then re-merges the csv files.
\ No newline at end of file
+# Or use ../idr_data/download_and_merge_all_snowflake_csv.sh, which downloads the data from the IDR and then re-merges the csv files.
+
+"""
+possible NPI query
+SELECT my_string,
+       CASE
+         WHEN my_string ~ '^[1][0-9]{9}$' THEN 'Looks like an NPI'
+         ELSE 'Not an NPI'
+       END AS npi_flag
+FROM my_table;
+"""
\ No newline at end of file

From eebeb58ec14aae40ee4fee120e3c417a4efc16c4 Mon Sep 17 00:00:00 2001
From: Fred Trotter
Date: Thu, 11 Sep 2025 03:09:45 -0400
Subject: [PATCH 20/24] initial

---
 idr/medicaid_provider_credentials.py | 50 ++++++++++++++++++++++++++++
 idr/pecos_recent_practice_address.py |  9 -----
 2 files changed, 50 insertions(+), 9 deletions(-)
 create mode 100644 idr/medicaid_provider_credentials.py

diff --git a/idr/medicaid_provider_credentials.py b/idr/medicaid_provider_credentials.py
new file mode 100644
index 0000000..5795ae3
--- /dev/null
+++ b/idr/medicaid_provider_credentials.py
@@ -0,0 +1,50 @@
+# Note: you must have an appropriate role chosen and the IDRC_PRD_COMM_WH warehouse selected
+
+# Import python packages
+import streamlit as st # type: ignore
+import pandas as pd
+from datetime import datetime
+
+# We can also use Snowpark for our analyses!
+from snowflake.snowpark.context import get_active_session # type: ignore
+session = get_active_session()
+
+ts = datetime.now().strftime("%Y_%m_%d_%H%M")
+
+medicaid_credentials_file_name = f"@~/medicaid_credentials.{ts}.csv"
+
+medicaid_credentials_sql = f"""
+COPY INTO {medicaid_credentials_file_name}
+FROM (
+SELECT
+    PRVDR_STATE_MDCD_ID,
+    PRVDR_LCNS_ISSG_ENT_ID,
+    PRVDR_LCNS_OR_ACRDTN_NUM,
+    license_list.PRVDR_MDCD_LCNS_TYPE_CD,
+    PRVDR_MDCD_LCNS_TYPE_CD_DESC
+
+FROM IDRC_PRD.CMS_VDM_VIEW_MDCD_PRD.V2_MDCD_PRVDR_LCNS_CRNT AS license_list
+JOIN IDRC_PRD.CMS_VDM_VIEW_MDCD_PRD.V2_MDCD_PRVDR_LCNS_TYPE_CD AS license_type ON
+    license_type.PRVDR_MDCD_LCNS_TYPE_CD =
+    license_list.PRVDR_MDCD_LCNS_TYPE_CD
+-- The quantifier braces below are doubled because this SQL lives inside a Python f-string.
+WHERE REGEXP_LIKE(PRVDR_STATE_MDCD_ID,'^[1][0-9]{{9}}$')
+AND license_list.PRVDR_MDCD_LCNS_TYPE_CD != '2'
+AND license_list.PRVDR_MDCD_LCNS_TYPE_CD != '~'
+AND PRVDR_LCNS_ISSG_ENT_ID != 'NPI'
+)
+FILE_FORMAT = (
+    TYPE = CSV
+    FIELD_DELIMITER = ','
+    FIELD_OPTIONALLY_ENCLOSED_BY = '"'
+    COMPRESSION = NONE
+)
+HEADER = TRUE
+OVERWRITE = TRUE;
+"""
+
+session.sql(medicaid_credentials_sql).collect()
+
+# To download use:
+# snowsql -c cms_idr -q "GET @~/ file://. PATTERN='.*.csv';"
+# Or use ../idr_data/download_and_merge_all_snowflake_csv.sh, which downloads the data from the IDR and then re-merges the csv files.
diff --git a/idr/pecos_recent_practice_address.py b/idr/pecos_recent_practice_address.py
index aa06785..402972a 100644
--- a/idr/pecos_recent_practice_address.py
+++ b/idr/pecos_recent_practice_address.py
@@ -49,12 +49,3 @@
 # snowsql -c cms_idr -q "GET @~/ file://. PATTERN='.*.csv';"
 # Or use ../idr_data/download_and_merge_all_snowflake_csv.sh, which downloads the data from the IDR and then re-merges the csv files.
 
-"""
-possible NPI query
-SELECT my_string,
-       CASE
-         WHEN my_string ~ '^[1][0-9]{9}$' THEN 'Looks like an NPI'
-         ELSE 'Not an NPI'
-       END AS npi_flag
-FROM my_table;
-"""
\ No newline at end of file

From 043282bde1346a11c1a189d0be11f26d5cf45d19 Mon Sep 17 00:00:00 2001
From: Fred Trotter
Date: Thu, 11 Sep 2025 09:23:21 -0400
Subject: [PATCH 21/24] not as useful as a vtin, but will work for linking

---
 idr/nppes_tinhash.py                             | 53 +++++++++++++++++++++++
 .../download_and_merge_all_snowflake_csv.sh      |  8 ++-
 2 files changed, 60 insertions(+), 1 deletion(-)
 create mode 100644 idr/nppes_tinhash.py

diff --git a/idr/nppes_tinhash.py b/idr/nppes_tinhash.py
new file mode 100644
index 0000000..6457266
--- /dev/null
+++ b/idr/nppes_tinhash.py
@@ -0,0 +1,53 @@
+"""
+
+This creates a map of the salted + hashed ONPI-to-TIN relationships.
+
+This program relies on the 'salt' variable being set in a previous notebook.
+Not setting it in the scope of this program ensures that we do not accidentally
+record the salt into git. Not quite as bad as a password... but still bad.
+
+The salt should be changed every run.
+
+"""
+
+# Note: you must have an appropriate role chosen and the IDRC_PRD_COMM_WH warehouse selected
+
+# Import python packages
+import streamlit as st # type: ignore
+import pandas as pd
+from datetime import datetime
+
+# We can also use Snowpark for our analyses!
+from snowflake.snowpark.context import get_active_session # type: ignore
+session = get_active_session()
+
+ts = datetime.now().strftime("%Y_%m_%d_%H%M")
+
+nppes_onpi_tin_file_name = f"@~/nppes_onpi_tin.{ts}.csv"
+
+nppes_onpi_tin_sql = f"""
+COPY INTO {nppes_onpi_tin_file_name}
+FROM (
+SELECT PRVDR_NPI_NUM,
+PRVDR_ENT_TYPE_CD,
+PRVDR_ORG_SUBRDNT_CD,
+ORG_NAME,
+MD5('{salt}' || PRVDR_EIN_NUM) AS tin_salted_hash
+FROM IDRC_PRD.CMS_VDM_VIEW_MDCR_PRD.V2_PRVDR_ENMRT_DMGRPHCS_CRNT
+WHERE PRVDR_EIN_NUM IS NOT NULL
+)
+FILE_FORMAT = (
+    TYPE = CSV
+    FIELD_DELIMITER = ','
+    FIELD_OPTIONALLY_ENCLOSED_BY = '"'
+    COMPRESSION = NONE
+)
+HEADER = TRUE
+OVERWRITE = TRUE;
+"""
+
+session.sql(nppes_onpi_tin_sql).collect()
+
+# To download use:
+# snowsql -c cms_idr -q "GET @~/ file://. PATTERN='.*.csv';"
+# Or use ../idr_data/download_and_merge_all_snowflake_csv.sh, which downloads the data from the IDR and then re-merges the csv files.
diff --git a/idr_data/download_and_merge_all_snowflake_csv.sh b/idr_data/download_and_merge_all_snowflake_csv.sh
index 95826a6..cd73f75 100755
--- a/idr_data/download_and_merge_all_snowflake_csv.sh
+++ b/idr_data/download_and_merge_all_snowflake_csv.sh
@@ -1,5 +1,11 @@
 #!/bin/sh
+# Delete the previous run
+rm unmerged_csv_files/*.csv
+# move to the download directory to begin the download
 cd ./unmerged_csv_files/
-#snowsql -c cms_idr -q "GET @~/ file://. PATTERN='.*.csv';"
+# download using snowsql. You must have a cms_idr connection configured for snowsql
+snowsql -c cms_idr -q "GET @~/ file://. PATTERN='.*.csv';"
+# go back to the main directory
 cd ..
+# merge the new csv files here.
 python3 ../misc_scripts/snowflake_csv_merge.py ./unmerged_csv_files/ --output-dir .
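
The nppes_tinhash.py notebook above deliberately leaves `salt` undefined; per its docstring, the salt must come from an earlier notebook cell so the value never lands in git, and it should be regenerated for every run. A minimal sketch of what such a companion cell could look like (the use of the `secrets` module here is an assumption, not something the repo specifies):

```python
# Hypothetical companion cell, run *before* nppes_tinhash.py in the same notebook session.
# The salt lives only in notebook memory: never written to disk, never committed.
import secrets

salt = secrets.token_hex(32)  # 64 hex characters from a cryptographically secure RNG
```

Because the salt changes on every run, hashes from different runs will not link to each other; only files produced within the same run share a linkable hash space.
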
From cedcab763be243d6d720d811fbe05ee419108f27 Mon Sep 17 00:00:00 2001
From: ftrotter-gov
Date: Fri, 12 Sep 2025 20:21:54 -0400
Subject: [PATCH 22/24] let's use SHA512 instead

---
 idr/nppes_tinhash.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/idr/nppes_tinhash.py b/idr/nppes_tinhash.py
index 6457266..f1095ca 100644
--- a/idr/nppes_tinhash.py
+++ b/idr/nppes_tinhash.py
@@ -32,7 +32,7 @@
 PRVDR_ENT_TYPE_CD,
 PRVDR_ORG_SUBRDNT_CD,
 ORG_NAME,
-MD5('{salt}' || PRVDR_EIN_NUM) AS tin_salted_hash
+encode(digest('{salt}' || PRVDR_EIN_NUM, 'sha512'), 'hex') AS tin_salted_hash_sha512
 FROM IDRC_PRD.CMS_VDM_VIEW_MDCR_PRD.V2_PRVDR_ENMRT_DMGRPHCS_CRNT
 WHERE PRVDR_EIN_NUM IS NOT NULL
 )

From 1b95a5c5be8de3e1cccb9b80b8d8271ece716446 Mon Sep 17 00:00:00 2001
From: Fred Trotter
Date: Sat, 13 Sep 2025 01:25:46 -0400
Subject: [PATCH 23/24] tweaks and city

---
 idr/nppes_tinhash.py                 | 2 +-
 idr/pecos_recent_practice_address.py | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/idr/nppes_tinhash.py b/idr/nppes_tinhash.py
index f1095ca..91e5029 100644
--- a/idr/nppes_tinhash.py
+++ b/idr/nppes_tinhash.py
@@ -32,7 +32,7 @@
 PRVDR_ENT_TYPE_CD,
 PRVDR_ORG_SUBRDNT_CD,
 ORG_NAME,
-encode(digest('{salt}' || PRVDR_EIN_NUM, 'sha512'), 'hex') AS tin_salted_hash_sha512
+SHA2('{salt}' || PRVDR_EIN_NUM, 512) AS tin_salted_hash_sha512
 FROM IDRC_PRD.CMS_VDM_VIEW_MDCR_PRD.V2_PRVDR_ENMRT_DMGRPHCS_CRNT
 WHERE PRVDR_EIN_NUM IS NOT NULL
 )
diff --git a/idr/pecos_recent_practice_address.py b/idr/pecos_recent_practice_address.py
index 402972a..14add26 100644
--- a/idr/pecos_recent_practice_address.py
+++ b/idr/pecos_recent_practice_address.py
@@ -22,6 +22,7 @@
     LINE_1_ADR,
     LINE_2_ADR,
     GEO_USPS_STATE_CD,
+    ADR_CITY_NAME,
     ZIP5_CD,
     ZIP4_CD,
     CNTRY_CD,

From f30f5e882f0e582f9ee89e357cf3e18ec09ea0dd Mon Sep 17 00:00:00 2001
From: Fred Trotter
Date: Wed, 17 Sep 2025 09:29:30 -0400
Subject: [PATCH 24/24] summary

---
 idr/pecos_recent_practice_address.py | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/idr/pecos_recent_practice_address.py b/idr/pecos_recent_practice_address.py
index 14add26..2c745e8 100644
--- a/idr/pecos_recent_practice_address.py
+++ b/idr/pecos_recent_practice_address.py
@@ -46,6 +46,33 @@
 
 session.sql(address_sql).collect()
 
+
+summary_address_file_name = f"@~/pecos_address_summary.{ts}.csv"
+
+summary_address_sql = f"""
+COPY INTO {summary_address_file_name}
+FROM (
+SELECT
+    PRVDR_NPI_NUM,
+    GEO_USPS_STATE_CD,
+    ADR_CITY_NAME,
+    ZIP5_CD,
+    ZIP4_CD
+FROM IDRC_PRD.CMS_VDM_VIEW_MDCR_PRD.V2_PRVDR_ENMRT_ADR_HSTRY
+WHERE YEAR(IDR_TRANS_OBSLT_TS) = 9999 AND YEAR(IDR_TRANS_EFCTV_TS) > 2022 AND ADR_TYPE_DESC = 'PRACTICE'
+)
+FILE_FORMAT = (
+    TYPE = CSV
+    FIELD_DELIMITER = ','
+    FIELD_OPTIONALLY_ENCLOSED_BY = '"'
+    COMPRESSION = NONE
+)
+HEADER = TRUE
+OVERWRITE = TRUE;
+"""
+
+session.sql(summary_address_sql).collect()
+
 # To download use:
 # snowsql -c cms_idr -q "GET @~/ file://. PATTERN='.*.csv';"
 # Or use ../idr_data/download_and_merge_all_snowflake_csv.sh, which downloads the data from the IDR and then re-merges the csv files.
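
As of PATCH 23, the hash column is produced by Snowflake's SHA2 function, which returns a hex-encoded digest of the UTF-8 input string. That means the same salted hash can be reproduced (or spot-checked) off-platform in Python with hashlib, which is useful when linking the merged CSV extracts. A minimal sketch under that assumption; the example salt and TIN values are made up:

```python
import hashlib

def tin_salted_hash_sha512(salt: str, tin: str) -> str:
    """Mirror of the Snowflake expression SHA2(salt || PRVDR_EIN_NUM, 512)."""
    return hashlib.sha512((salt + tin).encode("utf-8")).hexdigest()

# Extracts produced in the same run share one salt, so equal TINs hash to the
# same value and their merged CSVs can be joined on tin_salted_hash_sha512.
salt = "example-run-salt"  # hypothetical: the salt used for that run (never committed)
assert tin_salted_hash_sha512(salt, "123456789") == tin_salted_hash_sha512(salt, "123456789")
print(tin_salted_hash_sha512(salt, "123456789"))
```
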