Commit 18f3c44
fix: add docs, move util functions into useful places.

Committed Feb 6, 2025 · 1 parent: d526ff5

File tree: 5 files changed (+61, -50 lines)

.github/workflows/run-archiver.yml (+2, -2)

@@ -6,7 +6,7 @@ on:
     inputs:
       datasets:
         description: 'Comma-separated list of datasets to archive (e.g., "ferc2","ferc6").'
-        default: '"doeiraec","doelead","eia176","eia191","eia757a","eia860","eia860m","eia861","eia923","eia930","eiaaeo","eiacbecs","eiamecs","eianems","eiawater","eia_bulk_elec","epacamd_eia","epacems","epaegrid","epamats","epapcap","ferc1","ferc2","ferc6","ferc60","ferc714","gridpathratoolkit","mshamines","nrelatb","phmsagas","usgsuspvdb","usgsuswtdb","vcerare"'
+        default: '"doeiraec","doelead","eia176","eia191","eia757a","eia860","eia860m","eia861","eia923","eia930","eiaaeo","eiacbecs","eiamecs","eianems","eiarecs","eiawater","eia_bulk_elec","epacamd_eia","epacems","epaegrid","epamats","epapcap","ferc1","ferc2","ferc6","ferc60","ferc714","gridpathratoolkit","mshamines","nrelatb","phmsagas","usgsuspvdb","usgsuswtdb","vcerare"'
         required: true
         type: string
       create_github_issue:

@@ -26,7 +26,7 @@ jobs:
     strategy:
       matrix:
         # Note that we can't pass global env variables to the matrix, so we manually reproduce the list of datasets here.
-        dataset: ${{ fromJSON(format('[{0}]', inputs.datasets || '"doeiraec","doelead","eia176","eia191","eia757a","eia860","eia860m","eia861","eia923","eia930","eiaaeo","eiacbecs","eiamecs","eianems","eiawater","eia_bulk_elec","epacamd_eia","epacems","epaegrid","epamats","epapcap","ferc1","ferc2","ferc6","ferc60","ferc714","gridpathratoolkit","mshamines","nrelatb","phmsagas","usgsuspvdb","usgsuswtdb","vcerare"' )) }}
+        dataset: ${{ fromJSON(format('[{0}]', inputs.datasets || '"doeiraec","doelead","eia176","eia191","eia757a","eia860","eia860m","eia861","eia923","eia930","eiaaeo","eiacbecs","eiamecs","eianems","eiarecs","eiawater","eia_bulk_elec","epacamd_eia","epacems","epaegrid","epamats","epapcap","ferc1","ferc2","ferc6","ferc60","ferc714","gridpathratoolkit","mshamines","nrelatb","phmsagas","usgsuspvdb","usgsuswtdb","vcerare"' )) }}
       fail-fast: false
     runs-on: ubuntu-latest
     permissions:
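
Note: the matrix expression above simply wraps the comma-separated default in brackets and parses it as JSON. A rough Python equivalent of the format('[{0}]', ...) / fromJSON(...) pair, with the dataset list abbreviated for brevity:

import json

datasets_input = '"doeiraec","doelead","eiarecs","vcerare"'  # abbreviated default list
as_json_array = f"[{datasets_input}]"      # what format('[{0}]', inputs.datasets) produces
matrix_values = json.loads(as_json_array)  # what fromJSON(...) parses
print(matrix_values)                       # ['doeiraec', 'doelead', 'eiarecs', 'vcerare']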

src/pudl_archiver/archivers/classes.py (+8)

@@ -14,6 +14,7 @@
 from pathlib import Path
 
 import aiohttp
+import bs4
 import pandas as pd
 
 from pudl_archiver.archivers import validate

@@ -129,6 +130,13 @@ def __init__(
         self.logger = logging.getLogger(f"catalystcoop.{__name__}")
         self.logger.info(f"Archiving {self.name}")
 
+    async def __get_soup(self, url: str) -> bs4.BeautifulSoup:
+        """Get a BeautifulSoup instance for a URL using our existing session."""
+        response = await retry_async(self.session.get, args=[url])
+        # TODO 2025-02-03: for some reason, lxml fails to grab the closing div
+        # tag for tab content - so we use html.parser, which is slower.
+        return bs4.BeautifulSoup(await response.text(), "html.parser")
+
     @abstractmethod
     def get_resources(self) -> ArchiveAwaitable:
         """Abstract method that each data source must implement to download all resources.

src/pudl_archiver/archivers/eia/eiarecs.py (+37, -46)

@@ -1,24 +1,18 @@
 """Archive EIA Residential Energy Consumption Survey (RECS)."""
 
-import logging
 import re
-from collections import defaultdict
 from dataclasses import dataclass
 from io import BytesIO
 from pathlib import Path
 from urllib.parse import urljoin, urlparse
 
-import bs4
-
 from pudl_archiver.archivers.classes import (
     AbstractDatasetArchiver,
     ArchiveAwaitable,
     ResourceInfo,
 )
 from pudl_archiver.frictionless import ZipLayout
-from pudl_archiver.utils import retry_async
-
-logger = logging.getLogger(f"catalystcoop.{__name__}")
+from pudl_archiver.utils import is_html_file
 
 BASE_URL = "https://www.eia.gov/consumption/residential/data/"
 
@@ -38,13 +32,6 @@ class EiaRECSArchiver(AbstractDatasetArchiver):
     name = "eiarecs"
     base_url = "https://www.eia.gov/consumption/residential/data/2020/"
 
-    async def __get_soup(self, url: str) -> bs4.BeautifulSoup:
-        """Get a BeautifulSoup instance for a URL using our existing session."""
-        response = await retry_async(self.session.get, args=[url])
-        # TODO 2025-02-03: for some reason, lxml fails to grab the closing div
-        # tag for tab content - so we use html.parser, which is slower.
-        return bs4.BeautifulSoup(await response.text(), "html.parser")
-
     async def get_resources(self) -> ArchiveAwaitable:
         """Download EIA-RECS resources.
 
8673

8774
tab_infos = await self.__select_tabs(url)
8875

89-
# most tabs for most years can be handled the same way
90-
tab_handlers = {
91-
"housing-characteristics": defaultdict(lambda: self.__get_tab_links),
92-
"consumption-expenditures": defaultdict(lambda: self.__get_tab_links),
93-
"microdata": defaultdict(lambda: self.__get_tab_html_and_links),
94-
"methodology": defaultdict(lambda: self.__get_tab_html_and_links),
95-
"state-data": defaultdict(lambda: self.__get_tab_links),
96-
}
97-
98-
# Add the exceptions - skip the 2009 and 2015 methodology sections for now
99-
tab_handlers["methodology"][2015] = self.__skip
100-
tab_handlers["methodology"][2009] = self.__skip
76+
tab_handlers_overrides = {"methodology": {2009: self.__skip, 2015: self.__skip}}
10177

102-
zip_path = self.download_directory / f"eia-recs-{year}.zip"
78+
zip_path = self.download_directory / f"eiarecs-{year}.zip"
10379
paths_within_archive = []
10480
for tab in tab_infos:
105-
paths_within_archive += await tab_handlers[tab.name][tab.year](
106-
tab_info=tab, zip_path=zip_path
81+
tab_handler = tab_handlers_overrides.get(tab.name, {}).get(
82+
tab.year, self.__get_tab_html_and_links
10783
)
84+
paths_within_archive += await tab_handler(tab_info=tab, zip_path=zip_path)
10885

10986
self.logger.info(f"Looking for original forms for {year}")
11087
original_forms_within_archive = await self.__get_original_forms(year, zip_path)
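
Note: the refactor above collapses the per-tab defaultdict table into a small overrides mapping consulted before a single default handler. A toy sketch of that lookup pattern; the handler functions here are placeholders, not the archiver's real methods:

def handle_default(name: str, year: int) -> str:
    return f"archived {name} {year}"


def skip(name: str, year: int) -> str:
    return f"skipped {name} {year}"


overrides = {"methodology": {2009: skip, 2015: skip}}

for name, year in [("microdata", 2020), ("methodology", 2015)]:
    # Fall back to the default handler unless this tab/year pair is overridden.
    handler = overrides.get(name, {}).get(year, handle_default)
    print(handler(name, year))
# archived microdata 2020
# skipped methodology 2015
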
@@ -137,27 +114,36 @@ async def __add_links_to_archive(
         data_paths_in_archive = []
         for link, output_filename in url_paths.items():
             download_path = self.download_directory / output_filename
-            logger.debug(f"Fetching {link} to {download_path}")
+            self.logger.debug(f"Fetching {link} to {download_path}")
             await self.download_file(link, download_path, timeout=120)
             with download_path.open("rb") as f:
                 # TODO 2025-02-04: check html-ness against the suffix... if we
                 # have a php/html/cfm/etc. we probably actually *do* want the
                 # html file.
-                if self.__is_html_file(f):
-                    logger.info(f"{link} was HTML file - skipping.")
+                if is_html_file(f):
+                    self.logger.info(f"{link} was HTML file - skipping.")
                     continue
                 self.add_to_archive(
                     zip_path=zip_path,
                     filename=output_filename,
                     blob=f,
                 )
-            logger.debug(f"Added {link} to {zip_path} as {output_filename}")
+            self.logger.debug(f"Added {link} to {zip_path} as {output_filename}")
             data_paths_in_archive.append(output_filename)
             download_path.unlink()
         return data_paths_in_archive
 
     async def __get_tab_links(self, tab_info: TabInfo, zip_path: Path) -> list[str]:
-        """Get the data files for a single tab."""
+        """Get the data files for a single tab.
+
+        First, gets a list of all of the <a> tags within the tab contents which have an href attribute.
+
+        These tag objects have the HTML attrs accessible as if they were dictionaries - href, src, etc.
+
+        They also have some Python attributes of their own that you can read: text, contents, children, etc.
+
+        See https://beautiful-soup-4.readthedocs.io/en/latest/#tag for details.
+        """
         soup = await self.__get_soup(tab_info.url)
         links_in_tab = soup.select("div.tab-contentbox a[href]")
         log_scope = f"{tab_info.year}:{tab_info.name}"
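
Note: the new docstring describes how bs4.Tag objects behave. A small illustration of that behavior, with made-up markup: HTML attributes read like dict items, while text/contents/children are Python attributes on the tag itself:

import bs4

html = '<div class="tab-contentbox"><a href="files/table1.xlsx">Table 1</a></div>'
soup = bs4.BeautifulSoup(html, "html.parser")

for link in soup.select("div.tab-contentbox a[href]"):
    print(link["href"])  # dict-style access to the href attribute -> "files/table1.xlsx"
    print(link.text)     # Python attribute on the tag -> "Table 1"
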
@@ -177,7 +163,7 @@ async def __get_tab_links(self, tab_info: TabInfo, zip_path: Path) -> list[str]:
             urljoin(tab_info.url, link["href"]) for link in links_filtered
         ]
         links_with_filenames = {
-            link: f"eia-recs-{tab_info.year}-{tab_info.name}-{self.__get_filename_from_link(link)}"
+            link: f"eiarecs-{tab_info.year}-{tab_info.name}-{self.__get_filename_from_link(link)}"
             for link in resolved_links
         }
 
@@ -194,11 +180,23 @@ async def __get_tab_links(self, tab_info: TabInfo, zip_path: Path) -> list[str]:
     async def __get_tab_html_and_links(
         self, tab_info: TabInfo, zip_path: Path
     ) -> list[str]:
-        """Get the data files in the tab, *and* get the tab content itself."""
+        """Get the data files in the tab, *and* get the tab content itself.
+
+        First, get all the links within the tab that aren't HTML files and
+        aren't mailtos.
+
+        Then, gets the entire HTML contents of div.tab-contentbox, which
+        contains the tab contents.
+
+        Then, makes a new HTML document with an html and a body tag, and shoves
+        the old tab contents in there.
+
+        This makes a new HTML file that can be opened by one's browser and
+        includes the tab's contents - but any links/images will not work.
+        """
         log_scope = f"{tab_info.year}:{tab_info.name}"
         self.logger.info(f"{log_scope}: Getting links in tab")
         links = await self.__get_tab_links(tab_info=tab_info, zip_path=zip_path)
-        self.logger.info(f"{log_scope}: Got {len(links)} links")
 
         soup = await self.__get_soup(tab_info.url)
         tab_content = soup.select_one("div.tab-contentbox")
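
Note: the docstring above describes wrapping the extracted tab contents in a fresh HTML document so it can be opened in a browser. A rough sketch of that step with the markup simplified; the archiver's real output may differ:

import bs4

page = bs4.BeautifulSoup(
    '<div class="tab-contentbox"><p>2020 RECS microdata notes</p></div>',
    "html.parser",
)
tab_content = page.select_one("div.tab-contentbox")

wrapper = bs4.BeautifulSoup("<html><body></body></html>", "html.parser")
wrapper.body.append(tab_content)  # relative links and images inside will not resolve offline

html_blob = wrapper.encode()  # bytes, suitable for writing into the zip archive
print(html_blob.decode())
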
@@ -210,7 +208,7 @@ async def __get_tab_html_and_links(
         # TODO 2025-02-03: consider using some sort of html-to-pdf converter here.
         # use html-sanitizer or something before feeding it into pdf.
 
-        filename = f"eia-recs-{tab_info.year}-{tab_info.name}-tab-contents.html"
+        filename = f"eiarecs-{tab_info.year}-{tab_info.name}-tab-contents.html"
         self.add_to_archive(
             zip_path=zip_path,
             filename=filename,

@@ -235,7 +233,7 @@ async def __get_original_forms(self, year: int, zip_path: Path) -> list[str]:
         resolved_links = [urljoin(forms_url, link["href"]) for link in links_filtered]
 
         links_with_filenames = {
-            link: f"eia-recs-{year}-form-{self.__get_filename_from_link(link)}"
+            link: f"eiarecs-{year}-form-{self.__get_filename_from_link(link)}"
             for link in resolved_links
         }

@@ -248,13 +246,6 @@ def __get_filename_from_link(self, url: str) -> str:
         stem = re.sub(r"\W+", "-", filepath.stem)
         return f"{stem}{filepath.suffix}".lower()
 
-    def __is_html_file(self, fileobj: BytesIO) -> bool:
-        """Check the first 30 bytes of a file to see if there's an HTML header hiding in there."""
-        fileobj.seek(0)
-        header = fileobj.read(30).lower().strip()
-        fileobj.seek(0)
-        return b"<!doctype html" in header
-
     async def __select_tabs(self, url: str) -> set[TabInfo]:
         """Get the clickable tab links from the EIA RECS page layout."""

src/pudl_archiver/archivers/validate.py (+5, -2)

@@ -12,7 +12,7 @@
 from pydantic import BaseModel
 
 from pudl_archiver.frictionless import DataPackage, Resource, ZipLayout
-from pudl_archiver.utils import Url
+from pudl_archiver.utils import Url, is_html_file
 
 logger = logging.getLogger(f"catalystcoop.{__name__}")
 

@@ -277,7 +277,7 @@ def _process_resource_diffs(
     return [*changed_resources, *created_resources, *deleted_resources]
 
 
-def _validate_file_type(path: Path, buffer: BytesIO) -> bool:
+def _validate_file_type(path: Path, buffer: BytesIO) -> bool:  # noqa:C901
     """Check that file appears valid based on extension."""
     extension = path.suffix
 

@@ -310,6 +310,9 @@ def _validate_file_type(path: Path, buffer: BytesIO) -> bool:
         # magic bytes for old-school xls file
         return header.hex() == "d0cf11e0a1b11ae1"
 
+    if extension == ".html":
+        return is_html_file(buffer)
+
     if extension == ".txt":
         return _validate_text(buffer)

src/pudl_archiver/utils.py (+9)

@@ -6,6 +6,7 @@
 import zipfile
 from collections.abc import Awaitable, Callable
 from hashlib import md5
+from io import BytesIO
 from pathlib import Path
 
 import aiohttp

@@ -145,3 +146,11 @@ def compute_md5(file_path: UPath) -> str:
             hash_md5.update(chunk)
 
     return hash_md5.hexdigest()
+
+
+def is_html_file(fileobj: BytesIO) -> bool:
+    """Check the first 30 bytes of a file to see if there's an HTML header hiding in there."""
+    fileobj.seek(0)
+    header = fileobj.read(30).lower().strip()
+    fileobj.seek(0)
+    return b"<!doctype html" in header
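
Note: the relocated is_html_file helper only sniffs the leading bytes for an HTML doctype. A quick usage sketch against in-memory buffers, assuming the pudl_archiver package is importable; the sample byte strings are made up:

from io import BytesIO

from pudl_archiver.utils import is_html_file

disguised = BytesIO(b"<!DOCTYPE html><html><body>not really a spreadsheet</body></html>")
real_data = BytesIO(b"PK\x03\x04 pretend zip/xlsx magic bytes")

print(is_html_file(disguised))  # True - an error page masquerading as data, skip it
print(is_html_file(real_data))  # False - safe to archive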
