Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Import EPA_EJSCREEN changes added for 2024 data processing #1128

Open
wants to merge 9 commits into
base: master
Choose a base branch
from
99 changes: 76 additions & 23 deletions scripts/us_epa/ejscreen/ejscreen.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,22 @@
Generates cleaned CSV for the EPA EJSCREEN data and TMCF.
Usage: python3 ejscreen.py
'''

import io
import zipfile
import requests
import pandas as pd
from absl import logging

# Show INFO-level (and more severe) messages from absl logging.
logging.set_verbosity(logging.INFO)
# Alias for the absl logging module, so the script can call
# logger.info(...) / logger.error(...).
logger = logging

# Data years to process, as strings: '2015' through '2024' inclusive.
# They are used as keys into the per-year lookup tables and in download URLs.
YEARS = [str(year) for year in range(2015, 2025)]

# Normalized column names read from a yearly EJSCREEN CSV.
NORM_CSV_COLUMNS = [
    'ID',
    'DSLPM',
    'CANCER',
    'RESP',
    'OZONE',
    'PM25',
]
# Reduced column set: the normalized columns without CANCER and RESP.
NORM_CSV_COLUMNS1 = [
    column for column in NORM_CSV_COLUMNS if column not in ('CANCER', 'RESP')
]

# 2015 uses different CSV column names; 2024 uses a reduced column set (no CANCER/RESP)
CSV_COLUMNS_BY_YEAR = {
Expand All @@ -19,7 +26,11 @@
'2017': NORM_CSV_COLUMNS,
'2018': NORM_CSV_COLUMNS,
'2019': NORM_CSV_COLUMNS,
'2020': NORM_CSV_COLUMNS
'2020': NORM_CSV_COLUMNS,
'2021': NORM_CSV_COLUMNS,
'2022': NORM_CSV_COLUMNS,
'2023': NORM_CSV_COLUMNS,
'2024': NORM_CSV_COLUMNS1
}

ZIP_FILENAMES = {
Expand All @@ -28,7 +39,11 @@
'2017': None,
'2018': 'EJSCREEN_2018_USPR_csv',
'2019': 'EJSCREEN_2019_USPR.csv',
'2020': 'EJSCREEN_2020_USPR.csv'
'2020': 'EJSCREEN_2020_USPR.csv',
'2021': 'EJSCREEN_2021_USPR.csv',
'2022': 'EJSCREEN_2022_with_AS_CNMI_GU_VI.csv',
'2023': 'EJSCREEN_2023_BG_with_AS_CNMI_GU_VI.csv',
'2024': 'EJScreen_2024_Tract_with_AS_CNMI_GU_VI.csv'
}

FILENAMES = {
Expand All @@ -37,7 +52,11 @@
'2017': 'EJSCREEN_2017_USPR_Public',
'2018': 'EJSCREEN_Full_USPR_2018',
'2019': 'EJSCREEN_2019_USPR',
'2020': 'EJSCREEN_2020_USPR'
'2020': 'EJSCREEN_2020_USPR',
'2021': 'EJSCREEN_2021_USPR',
'2022': 'EJSCREEN_2022_Full_with_AS_CNMI_GU_VI',
'2023': 'EJSCREEN_2023_BG_with_AS_CNMI_GU_VI',
'2024': 'EJScreen_2024_Tract_with_AS_CNMI_GU_VI'
}

TEMPLATE_MCF = '''
Expand All @@ -57,6 +76,7 @@
observationAbout: C:ejscreen_airpollutants->FIPS
observationPeriod: dcs:P1Y
value: C:ejscreen_airpollutants->CANCER
unit: dcs:PerMillionPerson

Node: E:ejscreen_airpollutants->E2
typeOf: dcs:StatVarObservation
Expand Down Expand Up @@ -92,10 +112,8 @@
def write_csv(data, outfilename):
full_df = pd.DataFrame()
for curr_year, one_year_df in data.items():
one_year_df['year'] = curr_year # add year column
full_df = pd.concat(
[full_df, one_year_df],
ignore_index=True) # concatenate year onto larger dataframe
one_year_df['year'] = curr_year
full_df = pd.concat([full_df, one_year_df], ignore_index=True)

# sort by FIPS and make into dcid
full_df = full_df.rename(columns={'ID': 'FIPS'})
Expand All @@ -115,25 +133,60 @@ def write_tmcf(outfilename):
if __name__ == '__main__':
    # Download each year's EJSCREEN file, load the selected columns into a
    # DataFrame, normalize the column names, then write the combined CSV and
    # the TMCF.
    base_url = 'https://gaftp.epa.gov/EJSCREEN'
    # Years whose zip archive sits in a release-specific subdirectory.
    release_subdirs = {
        '2023': '2.22_September_UseMe',
        '2024': '2.32_August_UseMe',
    }

    dfs = {}
    failed_years = []
    for year in YEARS:
        logger.info(year)
        columns = CSV_COLUMNS_BY_YEAR[year]
        zip_filename = ZIP_FILENAMES[year]
        csv_filename = f'{FILENAMES[year]}.csv'
        is_zipped = zip_filename is not None

        if is_zipped:
            subdir = release_subdirs.get(year)
            year_path = f'{year}/{subdir}' if subdir else year
            url = f'{base_url}/{year_path}/{zip_filename}.zip'
            logger.info(f"Requesting file: {url}")
        else:
            # Some years are not zipped: download the CSV directly.
            url = f'{base_url}/{year}/{csv_filename}'
            logger.info(f"Requesting CSV file: {url}")

        # NOTE(review): verify=False disables TLS certificate verification.
        # It is kept so downloads behave as before; confirm whether it is
        # still required for gaftp.epa.gov and remove it if not.
        response = requests.get(url, verify=False)

        if response.status_code != 200:
            kind = 'file' if is_zipped else 'CSV'
            logger.error(
                f"Failed to download {kind} for {year}. HTTP Status Code: {response.status_code}"
            )
            # Skip this year: there is no DataFrame to rename or write, and
            # falling through would raise KeyError on dfs[year].
            failed_years.append(year)
            continue

        if is_zipped:
            # response.content is a bytes attribute (not a method); wrap it so
            # zipfile can read the archive from memory.
            with zipfile.ZipFile(io.BytesIO(response.content)) as zfile:
                with zfile.open(csv_filename, 'r') as newfile:
                    dfs[year] = pd.read_csv(newfile,
                                            engine='python',
                                            encoding='latin1',
                                            usecols=columns)
        else:
            dfs[year] = pd.read_csv(io.StringIO(response.text),
                                    sep=',',
                                    usecols=columns)

        # Rename year-specific column names to the normalized names; 2024 is
        # mapped onto the reduced column list.
        target_columns = (NORM_CSV_COLUMNS1
                          if year == '2024' else NORM_CSV_COLUMNS)
        if columns != target_columns:
            cols_renamed = dict(zip(columns, target_columns))
            dfs[year] = dfs[year].rename(columns=cols_renamed)

    if failed_years:
        logger.warning(f"No data downloaded for years: {failed_years}")

    write_csv(dfs, 'ejscreen_airpollutants.csv')
    write_tmcf('ejscreen.tmcf')
Loading