Changes from all commits
28 commits
082891b
Document the purpose of the repo
ftrotter-gov Jul 28, 2025
a39a5bc
Update README.md
ftrotter-gov Sep 11, 2025
fda1623
Merge pull request #4 from DSACMS/main
ftrotter-gov Sep 11, 2025
ce65211
intial
ftrotter-gov Sep 11, 2025
bc4106d
move to python
ftrotter-gov Sep 11, 2025
b0f08cb
this version generates too much data local to the notebook.
ftrotter-gov Sep 11, 2025
b8a0a9c
use COPY INTO
ftrotter-gov Sep 11, 2025
b74afb7
merge csv files
ftrotter-gov Sep 11, 2025
f2343e9
multiple csv in output
ftrotter-gov Sep 11, 2025
5882254
Merge branch 'ft_idr_reports' of https://github.com/DSACMS/npd_vrdc_p…
ftrotter-gov Sep 11, 2025
0464679
no csv files
ftrotter-gov Sep 11, 2025
fe83d1f
readme
ftrotter-gov Sep 11, 2025
e64c984
readme
ftrotter-gov Sep 11, 2025
f049d37
initial
ftrotter-gov Sep 11, 2025
24c1ee3
check headers
ftrotter-gov Sep 11, 2025
b2c2eb7
Merge branch 'ft_idr_reports' of https://github.com/DSACMS/npd_vrdc_p…
ftrotter-gov Sep 11, 2025
e0bf58f
automerge the snowflake downloads
ftrotter-gov Sep 11, 2025
15e712c
we need pandas so...
ftrotter-gov Sep 11, 2025
ae0662d
also for pandas
ftrotter-gov Sep 11, 2025
b68be34
Merge branch 'ft_idr_reports' of https://github.com/DSACMS/npd_vrdc_p…
ftrotter-gov Sep 11, 2025
162b051
use COPY INTO
ftrotter-gov Sep 11, 2025
4fc9fb2
usage notes
ftrotter-gov Sep 11, 2025
b27c6b2
npi query
ftrotter-gov Sep 11, 2025
eebeb58
initial
ftrotter-gov Sep 11, 2025
043282b
not as useful as a vtin, but will work for linking
ftrotter-gov Sep 11, 2025
cedcab7
lets use SHA512 instead
ftrotter-gov Sep 13, 2025
1b95a5c
tweaks and city
ftrotter-gov Sep 13, 2025
f30f5e8
summary
ftrotter-gov Sep 17, 2025
4 changes: 4 additions & 0 deletions .gitignore
@@ -205,3 +205,7 @@ cython_debug/
marimo/_static/
marimo/_lsp/
__marimo__/


idr_data/**/*.csv
vrdc_data/**/*.csv
13 changes: 11 additions & 2 deletions README.md
@@ -1,2 +1,11 @@
# vrdc_vtin_to_npi
Calculate VTINs for a VRDC PUF
# VRDC/IDR Projects

This is a meta-project folder that will hold VRDC/IDR projects that create new data files inside the VRDC/IDR environments.

## Purpose and Approach

Some of these data files will become public resources on (probably) data.cms.gov. Others will have private information that will require that the data remain in the CMS VRDC environment.

The projects should generally be implemented using [plainerflow](https://github.com/DSACMS/ndh_plainerflow) so that they will be portable between SQL-based systems.

It is possible that some of these projects will need to be executed in SAS PROC SQL, FedSQL, or Snowflake notebooks, but the default environment will be Databricks notebooks for VRDC and Snowflake workbooks for IDR.
50 changes: 50 additions & 0 deletions idr/medicaid_provider_credentials.py
@@ -0,0 +1,50 @@
# Note: you must have an appropriate role chosen and the IDRC_PRD_COMM_WH warehouse selected
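# A minimal sketch (assumption: the exact role name varies by user) of selecting the
# role and warehouse programmatically, using the `session` created below:
#   session.sql("USE ROLE <your_idr_role>").collect()        # hypothetical role name
#   session.sql("USE WAREHOUSE IDRC_PRD_COMM_WH").collect()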

# Import python packages
import streamlit as st # type: ignore
import pandas as pd
from datetime import datetime

# We can also use Snowpark for our analyses!
from snowflake.snowpark.context import get_active_session # type: ignore
session = get_active_session()

ts = datetime.now().strftime("%Y_%m_%d_%H%M")

medicaid_credentials_file_name = f"@~/medicaid_credentials.{ts}.csv"

medicaid_credentials_sql = f"""
COPY INTO {medicaid_credentials_file_name}
FROM (
SELECT
PRVDR_STATE_MDCD_ID,
PRVDR_LCNS_ISSG_ENT_ID,
PRVDR_LCNS_OR_ACRDTN_NUM,
license_list.PRVDR_MDCD_LCNS_TYPE_CD,
PRVDR_MDCD_LCNS_TYPE_CD_DESC

FROM IDRC_PRD.CMS_VDM_VIEW_MDCD_PRD.V2_MDCD_PRVDR_LCNS_CRNT AS license_list
JOIN IDRC_PRD.CMS_VDM_VIEW_MDCD_PRD.V2_MDCD_PRVDR_LCNS_TYPE_CD AS license_type ON
license_type.PRVDR_MDCD_LCNS_TYPE_CD =
license_list.PRVDR_MDCD_LCNS_TYPE_CD
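-- Keep state Medicaid IDs that look like 10-digit NPIs (leading '1'), drop the
-- excluded license-type codes '2' and '~', and drop rows whose issuing entity is 'NPI'.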
WHERE REGEXP_LIKE(PRVDR_STATE_MDCD_ID,'^[1][0-9]{{9}}$')
AND license_list.PRVDR_MDCD_LCNS_TYPE_CD != '2'
AND license_list.PRVDR_MDCD_LCNS_TYPE_CD != '~'
AND PRVDR_LCNS_ISSG_ENT_ID != 'NPI'
)
FILE_FORMAT = (
TYPE = CSV
FIELD_DELIMITER = ','
FIELD_OPTIONALLY_ENCLOSED_BY = '"'
COMPRESSION = NONE
)
HEADER = TRUE
OVERWRITE = TRUE;
"""

session.sql(medicaid_credentials_sql).collect()

# To download use:
# snowsql -c cms_idr -q "GET @~/ file://. PATTERN='.*.csv';"
# Or use ../idr_data/download_and_merge_all_snowflake_csv.sh, which downloads the data from the IDR and then re-merges the csv files.
53 changes: 53 additions & 0 deletions idr/nppes_tinhash.py
@@ -0,0 +1,53 @@
"""

This creates a map of the salted + hashed TIN to organizational NPI (ONPI) relationships.

This program relies on the 'salt' variable being set in a previous notebook cell.
Not setting it within the scope of this program ensures that we do not accidentally
commit the salt to git. Not quite as bad as a password... but still bad.

The salt should be changed every run.

"""

# Note: you must have an appropriate role chosen and the IDRC_PRD_COMM_WH warehouse selected

# Import python packages
import streamlit as st # type: ignore
import pandas as pd
from datetime import datetime

# We can also use Snowpark for our analyses!
from snowflake.snowpark.context import get_active_session # type: ignore
session = get_active_session()

ts = datetime.now().strftime("%Y_%m_%d_%H%M")

nppes_onpi_tin_file_name = f"@~/nppes_onpi_tin.{ts}.csv"

nppes_onpi_tin_sql = f"""
COPY INTO {nppes_onpi_tin_file_name}
FROM (
SELECT PRVDR_NPI_NUM,
PRVDR_ENT_TYPE_CD,
PRVDR_ORG_SUBRDNT_CD,
ORG_NAME,
SHA2('{salt}' || PRVDR_EIN_NUM, 512) AS tin_salted_hash_sha512
FROM IDRC_PRD.CMS_VDM_VIEW_MDCR_PRD.V2_PRVDR_ENMRT_DMGRPHCS_CRNT
WHERE PRVDR_EIN_NUM IS NOT NULL
)
FILE_FORMAT = (
TYPE = CSV
FIELD_DELIMITER = ','
FIELD_OPTIONALLY_ENCLOSED_BY = '"'
COMPRESSION = NONE
)
HEADER = TRUE
OVERWRITE = TRUE;
"""

session.sql(nppes_onpi_tin_sql).collect()

# To download use:
# snowsql -c cms_idr -q "GET @~/ file://. PATTERN='.*.csv';"
# Or use ../idr_data/download_and_merge_all_snowflake_csv.sh, which downloads the data from the IDR and then re-merges the csv files.
79 changes: 79 additions & 0 deletions idr/pecos_recent_practice_address.py
@@ -0,0 +1,79 @@
# Note: you must have an appropriate role chosen and the IDRC_PRD_COMM_WH warehouse selected

# Import python packages
import streamlit as st # type: ignore
import pandas as pd
from datetime import datetime

# We can also use Snowpark for our analyses!
from snowflake.snowpark.context import get_active_session # type: ignore
session = get_active_session()

ts = datetime.now().strftime("%Y_%m_%d_%H%M")

address_file_name = f"@~/pecos_recent_practice_address.{ts}.csv"

address_sql = f"""
COPY INTO {address_file_name}
FROM (
SELECT
PRVDR_NPI_NUM,
ADR_TYPE_DESC,
LINE_1_ADR,
LINE_2_ADR,
GEO_USPS_STATE_CD,
ADR_CITY_NAME,
ZIP5_CD,
ZIP4_CD,
CNTRY_CD,
PHNE_NUM,
FAX_NUM,
YEAR(IDR_UPDT_TS) AS IDR_UPDT_TS_YEAR,
YEAR(IDR_TRANS_OBSLT_TS) AS IDR_TRANS_OBSLT_TS_YEAR,
YEAR(IDR_TRANS_EFCTV_TS) AS IDR_TRANS_EFCTV_TS_YEAR
FROM IDRC_PRD.CMS_VDM_VIEW_MDCR_PRD.V2_PRVDR_ENMRT_ADR_HSTRY
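-- Assumption about the IDR history convention: an obsolete-timestamp year of 9999
-- marks rows still in effect; effective year > 2022 keeps only recent address records.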
WHERE YEAR(IDR_TRANS_OBSLT_TS) = 9999 AND YEAR(IDR_TRANS_EFCTV_TS) > 2022 AND ADR_TYPE_DESC = 'PRACTICE'
)
FILE_FORMAT = (
TYPE = CSV
FIELD_DELIMITER = ','
FIELD_OPTIONALLY_ENCLOSED_BY = '"'
COMPRESSION = NONE
)
HEADER = TRUE
OVERWRITE = TRUE;
"""

session.sql(address_sql).collect()


summary_address_file_name = f"@~/pecos_address_summary.{ts}.csv"

summary_address_sql = f"""
COPY INTO {summary_address_file_name}
FROM (
SELECT
PRVDR_NPI_NUM,
GEO_USPS_STATE_CD,
ADR_CITY_NAME,
ZIP5_CD,
ZIP4_CD
FROM IDRC_PRD.CMS_VDM_VIEW_MDCR_PRD.V2_PRVDR_ENMRT_ADR_HSTRY
WHERE YEAR(IDR_TRANS_OBSLT_TS) = 9999 AND YEAR(IDR_TRANS_EFCTV_TS) > 2022 AND ADR_TYPE_DESC = 'PRACTICE'
)
FILE_FORMAT = (
TYPE = CSV
FIELD_DELIMITER = ','
FIELD_OPTIONALLY_ENCLOSED_BY = '"'
COMPRESSION = NONE
)
HEADER = TRUE
OVERWRITE = TRUE;
"""

session.sql(summary_address_sql).collect()

# To download use:
# snowsql -c cms_idr -q "GET @~/ file://. PATTERN='.*.csv';"
# Or use ../idr_data/download_and_merge_all_snowflake_csv.sh, which downloads the data from the IDR and then re-merges the csv files.

55 changes: 55 additions & 0 deletions idr/pecos_unexpired_state_license.py
@@ -0,0 +1,55 @@
# Note: you must have an appropriate role chosen and the IDRC_PRD_COMM_WH warehouse selected

# Import python packages
import streamlit as st # type: ignore
import pandas as pd
from datetime import datetime

# We can also use Snowpark for our analyses!
from snowflake.snowpark.context import get_active_session # type: ignore
session = get_active_session()

ts = datetime.now().strftime("%Y_%m_%d_%H%M")

license_file_name = f"@~/pecos_unexpired_license.{ts}.csv"


now = datetime.now()
last_year = now.year - 1
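# Keep licenses whose expiration year is later than last year (i.e. they expire
# during the current year or later).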


license_sql = f"""
COPY INTO {license_file_name}
FROM (
SELECT
PRVDR_NPI_NUM,
PRVDR_ENRLMT_LCNS_STATE_CD,
PRVDR_ENRLMT_LCNS_NUM,
PRVDR_ENRLMT_FORM_CD,
PRVDR_ENRLMT_SPCLTY_CD,
PRVDR_ENRLMT_SPCLTY_DESC
FROM IDRC_PRD.CMS_VDM_VIEW_MDCR_PRD.V2_PRVDR_ENRLMT_LCNS_CRNT AS current_license
JOIN IDRC_PRD.CMS_VDM_VIEW_MDCR_PRD.V2_PRVDR_ENRLMT_NPI_CRNT enrollment_to_npi ON
enrollment_to_npi.prvdr_enrlmt_id =
current_license.prvdr_enrlmt_id
JOIN IDRC_PRD.CMS_VDM_VIEW_MDCR_PRD.V2_PRVDR_ENRLMT_SPCLTY_CRNT AS enrollment_speciality ON
enrollment_speciality.prvdr_enrlmt_id =
enrollment_to_npi.prvdr_enrlmt_id
WHERE YEAR(PRVDR_ENRLMT_LCNS_EXPRTN_DT) > {last_year}
)
FILE_FORMAT = (
TYPE = CSV
FIELD_DELIMITER = ','
FIELD_OPTIONALLY_ENCLOSED_BY = '"'
COMPRESSION = NONE
)
HEADER = TRUE
OVERWRITE = TRUE;
"""

session.sql(license_sql).collect()


# To download use:
# snowsql -c cms_idr -q "GET @~/ file://. PATTERN='.*.csv';"
# Or use ../idr_data/download_and_merge_all_snowflake_csv.sh, which downloads the data from the IDR and then re-merges the csv files.
1 change: 1 addition & 0 deletions idr_data/ReadMe.md
@@ -0,0 +1 @@
This directory holds the actual downloaded data and relies on .gitignore to ensure that CSV files are not committed to GitHub.
11 changes: 11 additions & 0 deletions idr_data/download_and_merge_all_snowflake_csv.sh
@@ -0,0 +1,11 @@
#!/bin/sh
# Delete the previous run
rm unmerged_csv_files/*.csv
# move to the download directory to begin the download
cd ./unmerged_csv_files/
# download using snowsql. You must have cms_idr configured for snowflake
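# A cms_idr connection in ~/.snowsql/config is assumed; a minimal sketch with
# placeholder values might look roughly like:
#   [connections.cms_idr]
#   accountname = <cms_idr_account_locator>
#   username = <your_username>
#   authenticator = externalbrowser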
snowsql -c cms_idr -q "GET @~/ file://. PATTERN='.*.csv';"
# go back to the main directory
cd ..
# merge the new csv files here.
python3 ../misc_scripts/snowflake_csv_merge.py ./unmerged_csv_files/ --output-dir .
105 changes: 105 additions & 0 deletions misc_scripts/snowflake_csv_merge.py
@@ -0,0 +1,105 @@
#!/usr/bin/env python3
"""
Written by ChatGPT in response to:

I am going to have multiple different file patterns that I am downloading in this manner.
Please write me a python script that will merge everything that has the same "root"
from Snowflake's downloads in the same directory.

Cline was used to add the command line arguments afterwards.
"""
import os
import glob
import re
import pandas as pd
import argparse
from collections import defaultdict

def group_files(files):
"""
Group files by their root before the Snowflake part suffix (_0_0_0, etc.).
E.g. 'foo_0_0_0.csv' and 'foo_0_0_1.csv' -> root 'foo'
"""
groups = defaultdict(list)
for f in files:
base = os.path.basename(f)
# strip compression extension first
if base.endswith(".gz"):
base = base[:-3]
if base.endswith(".csv"):
base = base[:-4]

# remove the trailing Snowflake suffix if present
root = re.sub(r'(_\d+_\d+_\d+)$', '', base)
groups[root].append(f)
return groups

def merge_group(root, files, outdir="."):
# Sort files so order is consistent
files = sorted(files)

outname = os.path.join(outdir, f"{root}.merged.csv")
print(f"Merging {len(files)} files into {outname}")

# Read first file with header to establish column structure
df = pd.read_csv(files[0], compression="infer", dtype=str)
expected_headers = list(df.columns)

# Append others, validating headers match
for f in files[1:]:
df2 = pd.read_csv(f, compression="infer", dtype=str)

# Validate headers match exactly
if list(df2.columns) != expected_headers:
raise ValueError(f"Header mismatch in file {f}. Expected: {expected_headers}, Got: {list(df2.columns)}")

df = pd.concat([df, df2], ignore_index=True)

df.to_csv(outname, index=False)
print(f"Successfully merged {len(files)} files with {len(df)} total rows (excluding header)")

def main():
parser = argparse.ArgumentParser(
description="Merge CSV files with the same root name from Snowflake downloads"
)
parser.add_argument(
"directory",
help="Directory location containing CSV files to merge"
)
parser.add_argument(
"--output-dir",
help="Output directory for merged files (defaults to input directory)",
default=None
)

args = parser.parse_args()

# Validate directory exists
if not os.path.isdir(args.directory):
print(f"Error: Directory '{args.directory}' does not exist")
return 1

# Set output directory
output_dir = args.output_dir if args.output_dir else args.directory

# Look for all csv/csv.gz files in specified directory
csv_pattern = os.path.join(args.directory, "*.csv")
gz_pattern = os.path.join(args.directory, "*.csv.gz")
files = glob.glob(csv_pattern) + glob.glob(gz_pattern)

if not files:
print(f"No CSV files found in directory: {args.directory}")
return 1

print(f"Found {len(files)} CSV files in {args.directory}")
groups = group_files(files)

print(f"Identified {len(groups)} file groups to merge")
for root, flist in groups.items():
merge_group(root, flist, outdir=output_dir)

print("Merge process completed")
return 0

if __name__ == "__main__":
    raise SystemExit(main())
6 changes: 6 additions & 0 deletions requirements.txt
@@ -0,0 +1,6 @@
numpy==2.3.3
pandas==2.3.2
python-dateutil==2.9.0.post0
pytz==2025.2
six==1.17.0
tzdata==2025.2
3 changes: 3 additions & 0 deletions source_me_to_get_venv.sh
@@ -0,0 +1,3 @@
#!/bin/bash
# This file exists to remind fred what the command is because he is a noob
source ./venv/bin/activate
1 change: 1 addition & 0 deletions vrdc_data/ReadMe.md
@@ -0,0 +1 @@
This directory holds the actual downloaded data and relies on .gitignore to ensure that CSV files are not committed to GitHub.