Changes from all commits
28 commits
082891b
Document the purpose of the repo
ftrotter-gov Jul 28, 2025
a39a5bc
Update README.md
ftrotter-gov Sep 11, 2025
fda1623
Merge pull request #4 from DSACMS/main
ftrotter-gov Sep 11, 2025
ce65211
intial
ftrotter-gov Sep 11, 2025
bc4106d
move to python
ftrotter-gov Sep 11, 2025
b0f08cb
this version generates too much data local to the notebook.
ftrotter-gov Sep 11, 2025
b8a0a9c
use COPY INTO
ftrotter-gov Sep 11, 2025
b74afb7
merge csv files
ftrotter-gov Sep 11, 2025
f2343e9
multiple csv in output
ftrotter-gov Sep 11, 2025
5882254
Merge branch 'ft_idr_reports' of https://github.com/DSACMS/npd_vrdc_p…
ftrotter-gov Sep 11, 2025
0464679
no csv files
ftrotter-gov Sep 11, 2025
fe83d1f
readme
ftrotter-gov Sep 11, 2025
e64c984
readme
ftrotter-gov Sep 11, 2025
f049d37
initial
ftrotter-gov Sep 11, 2025
24c1ee3
check headers
ftrotter-gov Sep 11, 2025
b2c2eb7
Merge branch 'ft_idr_reports' of https://github.com/DSACMS/npd_vrdc_p…
ftrotter-gov Sep 11, 2025
e0bf58f
automerge the snowflake downloads
ftrotter-gov Sep 11, 2025
15e712c
we need pandas so...
ftrotter-gov Sep 11, 2025
ae0662d
also for pandas
ftrotter-gov Sep 11, 2025
b68be34
Merge branch 'ft_idr_reports' of https://github.com/DSACMS/npd_vrdc_p…
ftrotter-gov Sep 11, 2025
162b051
use COPY INTO
ftrotter-gov Sep 11, 2025
4fc9fb2
usage notes
ftrotter-gov Sep 11, 2025
b27c6b2
npi query
ftrotter-gov Sep 11, 2025
eebeb58
initial
ftrotter-gov Sep 11, 2025
043282b
not as useful as a vtin, but will work for linking
ftrotter-gov Sep 11, 2025
cedcab7
lets use SHA512 instead
ftrotter-gov Sep 13, 2025
1b95a5c
tweaks and city
ftrotter-gov Sep 13, 2025
f30f5e8
summary
ftrotter-gov Sep 17, 2025
4 changes: 4 additions & 0 deletions .gitignore
@@ -205,3 +205,7 @@ cython_debug/
marimo/_static/
marimo/_lsp/
__marimo__/


idr_data/**/*.csv
vrdc_data/**/*.csv
13 changes: 11 additions & 2 deletions README.md
@@ -1,2 +1,11 @@
# vrdc_vtin_to_npi
Calculate VTINs for a VRDC PUF
# VRDC/IDR Projects

This is a meta-project folder that will hold VRDC/IDR projects that create new data files inside the VRDC/IDR environments.

## Purpose and Approach

Some of these data files will become public resources on (probably) data.cms.gov. Others will have private information that will require that the data remain in the CMS VRDC environment.

The projects should generally be implemented using [plainerflow](https://github.com/DSACMS/ndh_plainerflow) so that they will be portable between SQL-based systems.

It is possible that some of these projects will need to be executed in SAS PROC SQL, FedSQL, or Snowflake notebooks, but the default environment will be Databricks notebooks for VRDC and Snowflake workbooks for IDR.
50 changes: 50 additions & 0 deletions idr/medicaid_provider_credentials.py
@@ -0,0 +1,50 @@
# Note: you must have an appropriate role chosen and the IDRC_PRD_COMM_WH warehouse selected
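# A minimal sketch (assumption: the exact role name varies by user) of selecting the
# role and warehouse programmatically, using the `session` created below:
#   session.sql("USE ROLE <your_idr_role>").collect()        # hypothetical role name
#   session.sql("USE WAREHOUSE IDRC_PRD_COMM_WH").collect()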

# Import python packages
import streamlit as st # type: ignore
import pandas as pd
from datetime import datetime

# We can also use Snowpark for our analyses!
from snowflake.snowpark.context import get_active_session # type: ignore
session = get_active_session()

ts = datetime.now().strftime("%Y_%m_%d_%H%M")

medicaid_credentials_file_name = f"@~/medicaid_credentials.{ts}.csv"

medicaid_credentials_sql = f"""
COPY INTO {medicaid_credentials_file_name}
FROM (
SELECT
PRVDR_STATE_MDCD_ID,
PRVDR_LCNS_ISSG_ENT_ID,
PRVDR_LCNS_OR_ACRDTN_NUM,
license_list.PRVDR_MDCD_LCNS_TYPE_CD,
PRVDR_MDCD_LCNS_TYPE_CD_DESC

FROM IDRC_PRD.CMS_VDM_VIEW_MDCD_PRD.V2_MDCD_PRVDR_LCNS_CRNT AS license_list
JOIN IDRC_PRD.CMS_VDM_VIEW_MDCD_PRD.V2_MDCD_PRVDR_LCNS_TYPE_CD AS license_type ON
license_type.PRVDR_MDCD_LCNS_TYPE_CD =
license_list.PRVDR_MDCD_LCNS_TYPE_CD
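-- Keep state Medicaid IDs that look like 10-digit NPIs (leading '1'), drop the
-- excluded license-type codes '2' and '~', and drop rows whose issuing entity is 'NPI'.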
WHERE REGEXP_LIKE(PRVDR_STATE_MDCD_ID,'^[1][0-9]{{9}}$')
AND license_list.PRVDR_MDCD_LCNS_TYPE_CD != '2'
AND license_list.PRVDR_MDCD_LCNS_TYPE_CD != '~'
AND PRVDR_LCNS_ISSG_ENT_ID != 'NPI'
)
FILE_FORMAT = (
TYPE = CSV
FIELD_DELIMITER = ','
FIELD_OPTIONALLY_ENCLOSED_BY = '"'
COMPRESSION = NONE
)
HEADER = TRUE
OVERWRITE = TRUE;
"""

session.sql(medicaid_credentials_sql).collect()

# To download use:
# snowsql -c cms_idr -q "GET @~/ file://. PATTERN='.*.csv';"
# Or use ../idr_data/download_and_merge_all_snowflake_csv.sh, which downloads the data from the IDR and then re-merges the csv files.
53 changes: 53 additions & 0 deletions idr/nppes_tinhash.py
@@ -0,0 +1,53 @@
"""

This creates a map of the salted + hashed TIN to organizational NPI (ONPI) relationships.

This program relies on the 'salt' variable being set in a previous notebook cell.
Not setting it within the scope of this program ensures that we do not accidentally
commit the salt to git. Not quite as bad as a password... but still bad.

The salt should be changed every run.

"""

# Note: you must have an appropriate role chosen and the IDRC_PRD_COMM_WH warehouse selected

# Import python packages
import streamlit as st # type: ignore
import pandas as pd
from datetime import datetime

# We can also use Snowpark for our analyses!
from snowflake.snowpark.context import get_active_session # type: ignore
session = get_active_session()

ts = datetime.now().strftime("%Y_%m_%d_%H%M")

nppes_onpi_tin_file_name = f"@~/nppes_onpi_tin.{ts}.csv"

nppes_onpi_tin_sql = f"""
COPY INTO {nppes_onpi_tin_file_name}
FROM (
SELECT PRVDR_NPI_NUM,
PRVDR_ENT_TYPE_CD,
PRVDR_ORG_SUBRDNT_CD,
ORG_NAME,
SHA2('{salt}' || PRVDR_EIN_NUM, 512) AS tin_salted_hash_sha512
FROM IDRC_PRD.CMS_VDM_VIEW_MDCR_PRD.V2_PRVDR_ENMRT_DMGRPHCS_CRNT
WHERE PRVDR_EIN_NUM IS NOT NULL
)
FILE_FORMAT = (
TYPE = CSV
FIELD_DELIMITER = ','
FIELD_OPTIONALLY_ENCLOSED_BY = '"'
COMPRESSION = NONE
)
HEADER = TRUE
OVERWRITE = TRUE;
"""

session.sql(nppes_onpi_tin_sql).collect()

# To download use:
# snowsql -c cms_idr -q "GET @~/ file://. PATTERN='.*.csv';"
# Or use ../idr_data/download_and_merge_all_snowflake_csv.sh, which downloads the data from the IDR and then re-merges the csv files.
79 changes: 79 additions & 0 deletions idr/pecos_recent_practice_address.py
@@ -0,0 +1,79 @@
# Note: you must have an appropriate role chosen and the IDRC_PRD_COMM_WH warehouse selected

# Import python packages
import streamlit as st # type: ignore
import pandas as pd
from datetime import datetime

# We can also use Snowpark for our analyses!
from snowflake.snowpark.context import get_active_session # type: ignore
session = get_active_session()

ts = datetime.now().strftime("%Y_%m_%d_%H%M")

address_file_name = f"@~/pecos_recent_practice_address.{ts}.csv"

address_sql = f"""
COPY INTO {address_file_name}
FROM (
SELECT
PRVDR_NPI_NUM,
ADR_TYPE_DESC,
LINE_1_ADR,
LINE_2_ADR,
GEO_USPS_STATE_CD,
ADR_CITY_NAME,
ZIP5_CD,
ZIP4_CD,
CNTRY_CD,
PHNE_NUM,
FAX_NUM,
YEAR(IDR_UPDT_TS) AS IDR_UPDT_TS_YEAR,
YEAR(IDR_TRANS_OBSLT_TS) AS IDR_TRANS_OBSLT_TS_YEAR,
YEAR(IDR_TRANS_EFCTV_TS) AS IDR_TRANS_EFCTV_TS_YEAR
FROM IDRC_PRD.CMS_VDM_VIEW_MDCR_PRD.V2_PRVDR_ENMRT_ADR_HSTRY
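-- Assumption about the IDR history convention: an obsolete-timestamp year of 9999
-- marks rows still in effect; effective year > 2022 keeps only recent address records.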
WHERE YEAR(IDR_TRANS_OBSLT_TS) = 9999 AND YEAR(IDR_TRANS_EFCTV_TS) > 2022 AND ADR_TYPE_DESC = 'PRACTICE'
)
FILE_FORMAT = (
TYPE = CSV
FIELD_DELIMITER = ','
FIELD_OPTIONALLY_ENCLOSED_BY = '"'
COMPRESSION = NONE
)
HEADER = TRUE
OVERWRITE = TRUE;
"""

session.sql(address_sql).collect()


summary_address_file_name = f"@~/pecos_address_summary.{ts}.csv"

summary_address_sql = f"""
COPY INTO {summary_address_file_name}
FROM (
SELECT
PRVDR_NPI_NUM,
GEO_USPS_STATE_CD,
ADR_CITY_NAME,
ZIP5_CD,
ZIP4_CD
FROM IDRC_PRD.CMS_VDM_VIEW_MDCR_PRD.V2_PRVDR_ENMRT_ADR_HSTRY
WHERE YEAR(IDR_TRANS_OBSLT_TS) = 9999 AND YEAR(IDR_TRANS_EFCTV_TS) > 2022 AND ADR_TYPE_DESC = 'PRACTICE'
)
FILE_FORMAT = (
TYPE = CSV
FIELD_DELIMITER = ','
FIELD_OPTIONALLY_ENCLOSED_BY = '"'
COMPRESSION = NONE
)
HEADER = TRUE
OVERWRITE = TRUE;
"""

session.sql(summary_address_sql).collect()

# To download use:
# snowsql -c cms_idr -q "GET @~/ file://. PATTERN='.*.csv';"
# Or use ../idr_data/download_and_merge_all_snowflake_csv.sh, which downloads the data from the IDR and then re-merges the csv files.

55 changes: 55 additions & 0 deletions idr/pecos_unexpired_state_license.py
@@ -0,0 +1,55 @@
# Note: you must have an appropriate role chosen and the IDRC_PRD_COMM_WH warehouse selected

# Import python packages
import streamlit as st # type: ignore
import pandas as pd
from datetime import datetime

# We can also use Snowpark for our analyses!
from snowflake.snowpark.context import get_active_session # type: ignore
session = get_active_session()

ts = datetime.now().strftime("%Y_%m_%d_%H%M")

license_file_name = f"@~/pecos_unexpired_license.{ts}.csv"


now = datetime.now()
last_year = now.year - 1
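# Keep licenses whose expiration year is later than last year (i.e. they expire
# during the current year or later).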


license_sql = f"""
COPY INTO {license_file_name}
FROM (
SELECT
PRVDR_NPI_NUM,
PRVDR_ENRLMT_LCNS_STATE_CD,
PRVDR_ENRLMT_LCNS_NUM,
PRVDR_ENRLMT_FORM_CD,
PRVDR_ENRLMT_SPCLTY_CD,
PRVDR_ENRLMT_SPCLTY_DESC
FROM IDRC_PRD.CMS_VDM_VIEW_MDCR_PRD.V2_PRVDR_ENRLMT_LCNS_CRNT AS current_license
JOIN IDRC_PRD.CMS_VDM_VIEW_MDCR_PRD.V2_PRVDR_ENRLMT_NPI_CRNT enrollment_to_npi ON
enrollment_to_npi.prvdr_enrlmt_id =
current_license.prvdr_enrlmt_id
JOIN IDRC_PRD.CMS_VDM_VIEW_MDCR_PRD.V2_PRVDR_ENRLMT_SPCLTY_CRNT AS enrollment_speciality ON
enrollment_speciality.prvdr_enrlmt_id =
enrollment_to_npi.prvdr_enrlmt_id
WHERE YEAR(PRVDR_ENRLMT_LCNS_EXPRTN_DT) > {last_year}
)
FILE_FORMAT = (
TYPE = CSV
FIELD_DELIMITER = ','
FIELD_OPTIONALLY_ENCLOSED_BY = '"'
COMPRESSION = NONE
)
HEADER = TRUE
OVERWRITE = TRUE;
"""

session.sql(license_sql).collect()


# To download use:
# snowsql -c cms_idr -q "GET @~/ file://. PATTERN='.*.csv';"
# Or use ../idr_data/download_and_merge_all_snowflake_csv.sh, which downloads the data from the IDR and then re-merges the csv files.
1 change: 1 addition & 0 deletions idr_data/ReadMe.md
@@ -0,0 +1 @@
This directory holds the actual downloaded data and relies on .gitignore to ensure that CSV files are not committed to GitHub.
11 changes: 11 additions & 0 deletions idr_data/download_and_merge_all_snowflake_csv.sh
@@ -0,0 +1,11 @@
#!/bin/sh
# Delete the previous run
rm unmerged_csv_files/*.csv
# move to the download directory to begin the download
cd ./unmerged_csv_files/
# download using snowsql. You must have cms_idr configured for snowflake
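# A cms_idr connection in ~/.snowsql/config is assumed; a minimal sketch with
# placeholder values might look roughly like:
#   [connections.cms_idr]
#   accountname = <cms_idr_account_locator>
#   username = <your_username>
#   authenticator = externalbrowser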
snowsql -c cms_idr -q "GET @~/ file://. PATTERN='.*.csv';"
# go back to the main directory
cd ..
# merge the new csv files here.
python3 ../misc_scripts/snowflake_csv_merge.py ./unmerged_csv_files/ --output-dir .
105 changes: 105 additions & 0 deletions misc_scripts/snowflake_csv_merge.py
@@ -0,0 +1,105 @@
#!/usr/bin/env python3
"""
Written by ChatGPT in response to:

I am going to have multiple different file patterns that I am downloading in this manner.
Please write me a python script that will merge everything that has the same "root"
from Snowflake's downloads in the same directory.

Cline was used to add the command line arguments afterwards.
"""
import os
import glob
import re
import pandas as pd
import argparse
from collections import defaultdict

def group_files(files):
"""
Group files by their root before the Snowflake part suffix (_0_0_0, etc.).
E.g. 'foo_0_0_0.csv' and 'foo_0_0_1.csv' -> root 'foo'
"""
groups = defaultdict(list)
for f in files:
base = os.path.basename(f)
# strip compression extension first
if base.endswith(".gz"):
base = base[:-3]
if base.endswith(".csv"):
base = base[:-4]

# remove the trailing Snowflake suffix if present
root = re.sub(r'(_\d+_\d+_\d+)$', '', base)
groups[root].append(f)
return groups

def merge_group(root, files, outdir="."):
# Sort files so order is consistent
files = sorted(files)

outname = os.path.join(outdir, f"{root}.merged.csv")
print(f"Merging {len(files)} files into {outname}")

# Read first file with header to establish column structure
df = pd.read_csv(files[0], compression="infer", dtype=str)
expected_headers = list(df.columns)

# Append others, validating headers match
for f in files[1:]:
df2 = pd.read_csv(f, compression="infer", dtype=str)

# Validate headers match exactly
if list(df2.columns) != expected_headers:
raise ValueError(f"Header mismatch in file {f}. Expected: {expected_headers}, Got: {list(df2.columns)}")

df = pd.concat([df, df2], ignore_index=True)

df.to_csv(outname, index=False)
print(f"Successfully merged {len(files)} files with {len(df)} total rows (excluding header)")

def main():
parser = argparse.ArgumentParser(
description="Merge CSV files with the same root name from Snowflake downloads"
)
parser.add_argument(
"directory",
help="Directory location containing CSV files to merge"
)
parser.add_argument(
"--output-dir",
help="Output directory for merged files (defaults to input directory)",
default=None
)

args = parser.parse_args()

# Validate directory exists
if not os.path.isdir(args.directory):
print(f"Error: Directory '{args.directory}' does not exist")
return 1

# Set output directory
output_dir = args.output_dir if args.output_dir else args.directory

# Look for all csv/csv.gz files in specified directory
csv_pattern = os.path.join(args.directory, "*.csv")
gz_pattern = os.path.join(args.directory, "*.csv.gz")
files = glob.glob(csv_pattern) + glob.glob(gz_pattern)

if not files:
print(f"No CSV files found in directory: {args.directory}")
return 1

print(f"Found {len(files)} CSV files in {args.directory}")
groups = group_files(files)

print(f"Identified {len(groups)} file groups to merge")
for root, flist in groups.items():
merge_group(root, flist, outdir=output_dir)

print("Merge process completed")
return 0

if __name__ == "__main__":
    raise SystemExit(main())
6 changes: 6 additions & 0 deletions requirements.txt
@@ -0,0 +1,6 @@
numpy==2.3.3
pandas==2.3.2
python-dateutil==2.9.0.post0
pytz==2025.2
six==1.17.0
tzdata==2025.2
3 changes: 3 additions & 0 deletions source_me_to_get_venv.sh
@@ -0,0 +1,3 @@
#!/bin/bash
# This file exists to remind fred what the command is because he is a noob
source ./venv/bin/activate
1 change: 1 addition & 0 deletions vrdc_data/ReadMe.md
@@ -0,0 +1 @@
This directory holds the actual downloaded data and relies on .gitignore to ensure that CSV files are not committed to GitHub.