Commit 18f3c44
fix: add docs, move util functions into useful places.

Committed Feb 6, 2025 · 1 parent: d526ff5

File tree: 5 files changed (+61, -50 lines)

.github/workflows/run-archiver.yml (+2, -2)

@@ -6,7 +6,7 @@ on:
     inputs:
       datasets:
         description: 'Comma-separated list of datasets to archive (e.g., "ferc2","ferc6").'
-        default: '"doeiraec","doelead","eia176","eia191","eia757a","eia860","eia860m","eia861","eia923","eia930","eiaaeo","eiacbecs","eiamecs","eianems","eiawater","eia_bulk_elec","epacamd_eia","epacems","epaegrid","epamats","epapcap","ferc1","ferc2","ferc6","ferc60","ferc714","gridpathratoolkit","mshamines","nrelatb","phmsagas","usgsuspvdb","usgsuswtdb","vcerare"'
+        default: '"doeiraec","doelead","eia176","eia191","eia757a","eia860","eia860m","eia861","eia923","eia930","eiaaeo","eiacbecs","eiamecs","eianems","eiarecs","eiawater","eia_bulk_elec","epacamd_eia","epacems","epaegrid","epamats","epapcap","ferc1","ferc2","ferc6","ferc60","ferc714","gridpathratoolkit","mshamines","nrelatb","phmsagas","usgsuspvdb","usgsuswtdb","vcerare"'
         required: true
         type: string
       create_github_issue:

@@ -26,7 +26,7 @@ jobs:
     strategy:
       matrix:
         # Note that we can't pass global env variables to the matrix, so we manually reproduce the list of datasets here.
-        dataset: ${{ fromJSON(format('[{0}]', inputs.datasets || '"doeiraec","doelead","eia176","eia191","eia757a","eia860","eia860m","eia861","eia923","eia930","eiaaeo","eiacbecs","eiamecs","eianems","eiawater","eia_bulk_elec","epacamd_eia","epacems","epaegrid","epamats","epapcap","ferc1","ferc2","ferc6","ferc60","ferc714","gridpathratoolkit","mshamines","nrelatb","phmsagas","usgsuspvdb","usgsuswtdb","vcerare"' )) }}
+        dataset: ${{ fromJSON(format('[{0}]', inputs.datasets || '"doeiraec","doelead","eia176","eia191","eia757a","eia860","eia860m","eia861","eia923","eia930","eiaaeo","eiacbecs","eiamecs","eianems","eiarecs","eiawater","eia_bulk_elec","epacamd_eia","epacems","epaegrid","epamats","epapcap","ferc1","ferc2","ferc6","ferc60","ferc714","gridpathratoolkit","mshamines","nrelatb","phmsagas","usgsuspvdb","usgsuswtdb","vcerare"' )) }}
       fail-fast: false
     runs-on: ubuntu-latest
     permissions:
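
Note: the matrix expression above simply wraps the comma-separated default in brackets and parses it as JSON. A rough Python equivalent of the format('[{0}]', ...) / fromJSON(...) pair, with the dataset list abbreviated for brevity:

import json

datasets_input = '"doeiraec","doelead","eiarecs","vcerare"'  # abbreviated default list
as_json_array = f"[{datasets_input}]"      # what format('[{0}]', inputs.datasets) produces
matrix_values = json.loads(as_json_array)  # what fromJSON(...) parses
print(matrix_values)                       # ['doeiraec', 'doelead', 'eiarecs', 'vcerare']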

src/pudl_archiver/archivers/classes.py (+8)

@@ -14,6 +14,7 @@
 from pathlib import Path
 
 import aiohttp
+import bs4
 import pandas as pd
 
 from pudl_archiver.archivers import validate

@@ -129,6 +130,13 @@ def __init__(
         self.logger = logging.getLogger(f"catalystcoop.{__name__}")
         self.logger.info(f"Archiving {self.name}")
 
+    async def __get_soup(self, url: str) -> bs4.BeautifulSoup:
+        """Get a BeautifulSoup instance for a URL using our existing session."""
+        response = await retry_async(self.session.get, args=[url])
+        # TODO 2025-02-03: for some reason, lxml fails to grab the closing div
+        # tag for tab content - so we use html.parser, which is slower.
+        return bs4.BeautifulSoup(await response.text(), "html.parser")
+
     @abstractmethod
     def get_resources(self) -> ArchiveAwaitable:
         """Abstract method that each data source must implement to download all resources.

src/pudl_archiver/archivers/eia/eiarecs.py (+37, -46)

@@ -1,24 +1,18 @@
 """Archive EIA Residential Energy Consumption Survey (RECS)."""
 
-import logging
 import re
-from collections import defaultdict
 from dataclasses import dataclass
 from io import BytesIO
 from pathlib import Path
 from urllib.parse import urljoin, urlparse
 
-import bs4
-
 from pudl_archiver.archivers.classes import (
     AbstractDatasetArchiver,
     ArchiveAwaitable,
     ResourceInfo,
 )
 from pudl_archiver.frictionless import ZipLayout
-from pudl_archiver.utils import retry_async
-
-logger = logging.getLogger(f"catalystcoop.{__name__}")
+from pudl_archiver.utils import is_html_file
 
 BASE_URL = "https://www.eia.gov/consumption/residential/data/"
 
@@ -38,13 +32,6 @@ class EiaRECSArchiver(AbstractDatasetArchiver):
     name = "eiarecs"
     base_url = "https://www.eia.gov/consumption/residential/data/2020/"
 
-    async def __get_soup(self, url: str) -> bs4.BeautifulSoup:
-        """Get a BeautifulSoup instance for a URL using our existing session."""
-        response = await retry_async(self.session.get, args=[url])
-        # TODO 2025-02-03: for some reason, lxml fails to grab the closing div
-        # tag for tab content - so we use html.parser, which is slower.
-        return bs4.BeautifulSoup(await response.text(), "html.parser")
-
     async def get_resources(self) -> ArchiveAwaitable:
         """Download EIA-RECS resources.
 
8673

8774
tab_infos = await self.__select_tabs(url)
8875

89-
# most tabs for most years can be handled the same way
90-
tab_handlers = {
91-
"housing-characteristics": defaultdict(lambda: self.__get_tab_links),
92-
"consumption-expenditures": defaultdict(lambda: self.__get_tab_links),
93-
"microdata": defaultdict(lambda: self.__get_tab_html_and_links),
94-
"methodology": defaultdict(lambda: self.__get_tab_html_and_links),
95-
"state-data": defaultdict(lambda: self.__get_tab_links),
96-
}
97-
98-
# Add the exceptions - skip the 2009 and 2015 methodology sections for now
99-
tab_handlers["methodology"][2015] = self.__skip
100-
tab_handlers["methodology"][2009] = self.__skip
76+
tab_handlers_overrides = {"methodology": {2009: self.__skip, 2015: self.__skip}}
10177

102-
zip_path = self.download_directory / f"eia-recs-{year}.zip"
78+
zip_path = self.download_directory / f"eiarecs-{year}.zip"
10379
paths_within_archive = []
10480
for tab in tab_infos:
105-
paths_within_archive += await tab_handlers[tab.name][tab.year](
106-
tab_info=tab, zip_path=zip_path
81+
tab_handler = tab_handlers_overrides.get(tab.name, {}).get(
82+
tab.year, self.__get_tab_html_and_links
10783
)
84+
paths_within_archive += await tab_handler(tab_info=tab, zip_path=zip_path)
10885

10986
self.logger.info(f"Looking for original forms for {year}")
11087
original_forms_within_archive = await self.__get_original_forms(year, zip_path)
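
Note: the refactor above collapses the per-tab defaultdict table into a small overrides mapping consulted before a single default handler. A toy sketch of that lookup pattern; the handler functions here are placeholders, not the archiver's real methods:

def handle_default(name: str, year: int) -> str:
    return f"archived {name} {year}"


def skip(name: str, year: int) -> str:
    return f"skipped {name} {year}"


overrides = {"methodology": {2009: skip, 2015: skip}}

for name, year in [("microdata", 2020), ("methodology", 2015)]:
    # Fall back to the default handler unless this tab/year pair is overridden.
    handler = overrides.get(name, {}).get(year, handle_default)
    print(handler(name, year))
# archived microdata 2020
# skipped methodology 2015
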
@@ -137,27 +114,36 @@ async def __add_links_to_archive(
         data_paths_in_archive = []
         for link, output_filename in url_paths.items():
             download_path = self.download_directory / output_filename
-            logger.debug(f"Fetching {link} to {download_path}")
+            self.logger.debug(f"Fetching {link} to {download_path}")
             await self.download_file(link, download_path, timeout=120)
             with download_path.open("rb") as f:
                 # TODO 2025-02-04: check html-ness against the suffix... if we
                 # have a php/html/cfm/etc. we probably actually *do* want the
                 # html file.
-                if self.__is_html_file(f):
-                    logger.info(f"{link} was HTML file - skipping.")
+                if is_html_file(f):
+                    self.logger.info(f"{link} was HTML file - skipping.")
                     continue
                 self.add_to_archive(
                     zip_path=zip_path,
                     filename=output_filename,
                     blob=f,
                 )
-            logger.debug(f"Added {link} to {zip_path} as {output_filename}")
+            self.logger.debug(f"Added {link} to {zip_path} as {output_filename}")
             data_paths_in_archive.append(output_filename)
             download_path.unlink()
         return data_paths_in_archive
 
     async def __get_tab_links(self, tab_info: TabInfo, zip_path: Path) -> list[str]:
-        """Get the data files for a single tab."""
+        """Get the data files for a single tab.
+
+        First, gets a list of all of the <a> tags within the tab contents which have an href attribute.
+
+        These tag objects have the HTML attrs accessible as if they were dictionaries - href, src, etc.
+
+        They also have some Python attributes of their own that you can read: text, contents, children, etc.
+
+        See https://beautiful-soup-4.readthedocs.io/en/latest/#tag for details.
+        """
         soup = await self.__get_soup(tab_info.url)
         links_in_tab = soup.select("div.tab-contentbox a[href]")
         log_scope = f"{tab_info.year}:{tab_info.name}"
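
Note: the new docstring describes how bs4.Tag objects behave. A small illustration of that behavior, with made-up markup: HTML attributes read like dict items, while text/contents/children are Python attributes on the tag itself:

import bs4

html = '<div class="tab-contentbox"><a href="files/table1.xlsx">Table 1</a></div>'
soup = bs4.BeautifulSoup(html, "html.parser")

for link in soup.select("div.tab-contentbox a[href]"):
    print(link["href"])  # dict-style access to the href attribute -> "files/table1.xlsx"
    print(link.text)     # Python attribute on the tag -> "Table 1"
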
@@ -177,7 +163,7 @@ async def __get_tab_links(self, tab_info: TabInfo, zip_path: Path) -> list[str]:
             urljoin(tab_info.url, link["href"]) for link in links_filtered
         ]
         links_with_filenames = {
-            link: f"eia-recs-{tab_info.year}-{tab_info.name}-{self.__get_filename_from_link(link)}"
+            link: f"eiarecs-{tab_info.year}-{tab_info.name}-{self.__get_filename_from_link(link)}"
             for link in resolved_links
         }
 
@@ -194,11 +180,23 @@ async def __get_tab_links(self, tab_info: TabInfo, zip_path: Path) -> list[str]:
     async def __get_tab_html_and_links(
         self, tab_info: TabInfo, zip_path: Path
     ) -> list[str]:
-        """Get the data files in the tab, *and* get the tab content itself."""
+        """Get the data files in the tab, *and* get the tab content itself.
+
+        First, get all the links within the tab that aren't HTML files and
+        aren't mailtos.
+
+        Then, gets the entire HTML contents of div.tab-contentbox, which
+        contains the tab contents.
+
+        Then, makes a new HTML document with an html and a body tag, and shoves
+        the old tab contents in there.
+
+        This makes a new HTML file that can be opened by one's browser and
+        includes the tab's contents - but any links/images will not work.
+        """
         log_scope = f"{tab_info.year}:{tab_info.name}"
         self.logger.info(f"{log_scope}: Getting links in tab")
         links = await self.__get_tab_links(tab_info=tab_info, zip_path=zip_path)
-        self.logger.info(f"{log_scope}: Got {len(links)} links")
 
         soup = await self.__get_soup(tab_info.url)
         tab_content = soup.select_one("div.tab-contentbox")
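
Note: the docstring above describes wrapping the extracted tab contents in a fresh HTML document so it can be opened in a browser. A rough sketch of that step with the markup simplified; the archiver's real output may differ:

import bs4

page = bs4.BeautifulSoup(
    '<div class="tab-contentbox"><p>2020 RECS microdata notes</p></div>',
    "html.parser",
)
tab_content = page.select_one("div.tab-contentbox")

wrapper = bs4.BeautifulSoup("<html><body></body></html>", "html.parser")
wrapper.body.append(tab_content)  # relative links and images inside will not resolve offline

html_blob = wrapper.encode()  # bytes, suitable for writing into the zip archive
print(html_blob.decode())
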
@@ -210,7 +208,7 @@ async def __get_tab_html_and_links(
         # TODO 2025-02-03: consider using some sort of html-to-pdf converter here.
         # use html-sanitizer or something before feeding it into pdf.
 
-        filename = f"eia-recs-{tab_info.year}-{tab_info.name}-tab-contents.html"
+        filename = f"eiarecs-{tab_info.year}-{tab_info.name}-tab-contents.html"
         self.add_to_archive(
             zip_path=zip_path,
             filename=filename,

@@ -235,7 +233,7 @@ async def __get_original_forms(self, year: int, zip_path: Path) -> list[str]:
         resolved_links = [urljoin(forms_url, link["href"]) for link in links_filtered]
 
         links_with_filenames = {
-            link: f"eia-recs-{year}-form-{self.__get_filename_from_link(link)}"
+            link: f"eiarecs-{year}-form-{self.__get_filename_from_link(link)}"
             for link in resolved_links
         }

@@ -248,13 +246,6 @@ def __get_filename_from_link(self, url: str) -> str:
         stem = re.sub(r"\W+", "-", filepath.stem)
         return f"{stem}{filepath.suffix}".lower()
 
-    def __is_html_file(self, fileobj: BytesIO) -> bool:
-        """Check the first 30 bytes of a file to see if there's an HTML header hiding in there."""
-        fileobj.seek(0)
-        header = fileobj.read(30).lower().strip()
-        fileobj.seek(0)
-        return b"<!doctype html" in header
-
     async def __select_tabs(self, url: str) -> set[TabInfo]:
         """Get the clickable tab links from the EIA RECS page layout."""

src/pudl_archiver/archivers/validate.py (+5, -2)

@@ -12,7 +12,7 @@
 from pydantic import BaseModel
 
 from pudl_archiver.frictionless import DataPackage, Resource, ZipLayout
-from pudl_archiver.utils import Url
+from pudl_archiver.utils import Url, is_html_file
 
 logger = logging.getLogger(f"catalystcoop.{__name__}")
 

@@ -277,7 +277,7 @@ def _process_resource_diffs(
     return [*changed_resources, *created_resources, *deleted_resources]
 
 
-def _validate_file_type(path: Path, buffer: BytesIO) -> bool:
+def _validate_file_type(path: Path, buffer: BytesIO) -> bool:  # noqa:C901
     """Check that file appears valid based on extension."""
     extension = path.suffix
 

@@ -310,6 +310,9 @@ def _validate_file_type(path: Path, buffer: BytesIO) -> bool:
         # magic bytes for old-school xls file
         return header.hex() == "d0cf11e0a1b11ae1"
 
+    if extension == ".html":
+        return is_html_file(buffer)
+
     if extension == ".txt":
         return _validate_text(buffer)

src/pudl_archiver/utils.py (+9)

@@ -6,6 +6,7 @@
 import zipfile
 from collections.abc import Awaitable, Callable
 from hashlib import md5
+from io import BytesIO
 from pathlib import Path
 
 import aiohttp

@@ -145,3 +146,11 @@ def compute_md5(file_path: UPath) -> str:
             hash_md5.update(chunk)
 
     return hash_md5.hexdigest()
+
+
+def is_html_file(fileobj: BytesIO) -> bool:
+    """Check the first 30 bytes of a file to see if there's an HTML header hiding in there."""
+    fileobj.seek(0)
+    header = fileobj.read(30).lower().strip()
+    fileobj.seek(0)
+    return b"<!doctype html" in header
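
Note: the relocated is_html_file helper only sniffs the leading bytes for an HTML doctype. A quick usage sketch against in-memory buffers, assuming the pudl_archiver package is importable; the sample byte strings are made up:

from io import BytesIO

from pudl_archiver.utils import is_html_file

disguised = BytesIO(b"<!DOCTYPE html><html><body>not really a spreadsheet</body></html>")
real_data = BytesIO(b"PK\x03\x04 pretend zip/xlsx magic bytes")

print(is_html_file(disguised))  # True - an error page masquerading as data, skip it
print(is_html_file(real_data))  # False - safe to archive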
