diff --git a/scripts/us_epa/ejscreen/ejscreen.py b/scripts/us_epa/ejscreen/ejscreen.py index ea9f15ed3b..37da64beee 100644 --- a/scripts/us_epa/ejscreen/ejscreen.py +++ b/scripts/us_epa/ejscreen/ejscreen.py @@ -2,15 +2,22 @@ Generates cleaned CSV for the EPA EJSCREEN data and TMCF. Usage: python3 ejscreen.py ''' - import io import zipfile import requests import pandas as pd +from absl import logging + +logging.set_verbosity(logging.INFO) +logger = logging -YEARS = ['2015', '2016', '2017', '2018', '2019', '2020'] +YEARS = [ + '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022', '2023', + '2024' +] NORM_CSV_COLUMNS = ['ID', 'DSLPM', 'CANCER', 'RESP', 'OZONE', 'PM25'] +NORM_CSV_COLUMNS1 = ['ID', 'DSLPM', 'OZONE', 'PM25'] # 2015 has different csv column names CSV_COLUMNS_BY_YEAR = { @@ -19,7 +26,11 @@ '2017': NORM_CSV_COLUMNS, '2018': NORM_CSV_COLUMNS, '2019': NORM_CSV_COLUMNS, - '2020': NORM_CSV_COLUMNS + '2020': NORM_CSV_COLUMNS, + '2021': NORM_CSV_COLUMNS, + '2022': NORM_CSV_COLUMNS, + '2023': NORM_CSV_COLUMNS, + '2024': NORM_CSV_COLUMNS1 } ZIP_FILENAMES = { @@ -28,7 +39,11 @@ '2017': None, '2018': 'EJSCREEN_2018_USPR_csv', '2019': 'EJSCREEN_2019_USPR.csv', - '2020': 'EJSCREEN_2020_USPR.csv' + '2020': 'EJSCREEN_2020_USPR.csv', + '2021': 'EJSCREEN_2021_USPR.csv', + '2022': 'EJSCREEN_2022_with_AS_CNMI_GU_VI.csv', + '2023': 'EJSCREEN_2023_BG_with_AS_CNMI_GU_VI.csv', + '2024': 'EJScreen_2024_Tract_with_AS_CNMI_GU_VI.csv' } FILENAMES = { @@ -37,7 +52,11 @@ '2017': 'EJSCREEN_2017_USPR_Public', '2018': 'EJSCREEN_Full_USPR_2018', '2019': 'EJSCREEN_2019_USPR', - '2020': 'EJSCREEN_2020_USPR' + '2020': 'EJSCREEN_2020_USPR', + '2021': 'EJSCREEN_2021_USPR', + '2022': 'EJSCREEN_2022_Full_with_AS_CNMI_GU_VI', + '2023': 'EJSCREEN_2023_BG_with_AS_CNMI_GU_VI', + '2024': 'EJScreen_2024_Tract_with_AS_CNMI_GU_VI' } TEMPLATE_MCF = ''' @@ -57,6 +76,7 @@ observationAbout: C:ejscreen_airpollutants->FIPS observationPeriod: dcs:P1Y value: C:ejscreen_airpollutants->CANCER +unit: dcs:PerMillionPerson Node: E:ejscreen_airpollutants->E2 typeOf: dcs:StatVarObservation @@ -92,10 +112,8 @@ def write_csv(data, outfilename): full_df = pd.DataFrame() for curr_year, one_year_df in data.items(): - one_year_df['year'] = curr_year # add year column - full_df = pd.concat( - [full_df, one_year_df], - ignore_index=True) # concatenate year onto larger dataframe + one_year_df['year'] = curr_year + full_df = pd.concat([full_df, one_year_df], ignore_index=True) # sort by FIPS and make into dcid full_df = full_df.rename(columns={'ID': 'FIPS'}) @@ -115,25 +133,60 @@ def write_tmcf(outfilename): if __name__ == '__main__': dfs = {} for year in YEARS: - print(year) + logger.info(year) columns = CSV_COLUMNS_BY_YEAR[year] - # request file zip_filename = ZIP_FILENAMES[year] + if zip_filename is not None: - response = requests.get( - f'https://gaftp.epa.gov/EJSCREEN/{year}/{zip_filename}.zip') - with zipfile.ZipFile(io.BytesIO(response.content())) as zfile: - with zfile.open(f'{FILENAMES[year]}.csv', 'r') as newfile: - dfs[year] = pd.read_csv(newfile, usecols=columns) - # some years are not zipped + if year == '2024': + + url = f'https://gaftp.epa.gov/EJSCREEN/2024/2.32_August_UseMe/{zip_filename}.zip' + elif year == '2023': + + url = f'https://gaftp.epa.gov/EJSCREEN/2023/2.22_September_UseMe/{zip_filename}.zip' + else: + url = f'https://gaftp.epa.gov/EJSCREEN/{year}/{zip_filename}.zip' + + logger.info(f"Requesting file: {url}") + response = requests.get(url, verify=False) + + if response.status_code == 200: + with zipfile.ZipFile(io.BytesIO(response.content)) as zfile: + with zfile.open(f'{FILENAMES[year]}.csv', 'r') as newfile: + dfs[year] = pd.read_csv(newfile, + engine='python', + encoding='latin1', + usecols=columns) + else: + logger.error( + f"Failed to download file for {year}. HTTP Status Code: {response.status_code}" + ) + + else: + # If the file is not zipped, download the CSV directly + url = f'https://gaftp.epa.gov/EJSCREEN/{year}/{FILENAMES[year]}.csv' + logger.info(f"Requesting CSV file: {url}") + response = requests.get(url, verify=False) + + # Check if the response is successful (status code 200) + if response.status_code == 200: + dfs[year] = pd.read_csv(io.StringIO(response.text), + sep=',', + usecols=columns) + else: + logger.error( + f"Failed to download CSV for {year}. HTTP Status Code: {response.status_code}" + ) + + # Rename weird column names to match other years + if year == '2024': + # Use NORM_CSV_COLUMNS1 for 2024 + cols_renamed = dict(zip(columns, NORM_CSV_COLUMNS1)) else: - response = requests.get( - f'https://gaftp.epa.gov/EJSCREEN/{year}/{FILENAMES[year]}.csv') - dfs[year] = pd.read_csv(response, usecols=columns) - # rename weird column names to match other years - if columns != NORM_CSV_COLUMNS: + # Use NORM_CSV_COLUMNS for other years cols_renamed = dict(zip(columns, NORM_CSV_COLUMNS)) - dfs[year] = dfs[year].rename(columns=cols_renamed) + + dfs[year] = dfs[year].rename(columns=cols_renamed) write_csv(dfs, 'ejscreen_airpollutants.csv') write_tmcf('ejscreen.tmcf')