"""Archive EIA Residential Energy Consumption Survey (RECS)."""

-import logging
import re
-from collections import defaultdict
from dataclasses import dataclass
from io import BytesIO
from pathlib import Path
from urllib.parse import urljoin, urlparse

-import bs4
-
from pudl_archiver.archivers.classes import (
    AbstractDatasetArchiver,
    ArchiveAwaitable,
    ResourceInfo,
)
from pudl_archiver.frictionless import ZipLayout
-from pudl_archiver.utils import retry_async
-
-logger = logging.getLogger(f"catalystcoop.{__name__}")
+from pudl_archiver.utils import is_html_file

BASE_URL = "https://www.eia.gov/consumption/residential/data/"
@@ -38,13 +32,6 @@ class EiaRECSArchiver(AbstractDatasetArchiver):
    name = "eiarecs"
    base_url = "https://www.eia.gov/consumption/residential/data/2020/"

-    async def __get_soup(self, url: str) -> bs4.BeautifulSoup:
-        """Get a BeautifulSoup instance for a URL using our existing session."""
-        response = await retry_async(self.session.get, args=[url])
-        # TODO 2025-02-03: for some reason, lxml fails to grab the closing div
-        # tag for tab content - so we use html.parser, which is slower.
-        return bs4.BeautifulSoup(await response.text(), "html.parser")
-
    async def get_resources(self) -> ArchiveAwaitable:
        """Download EIA-RECS resources.
@@ -86,25 +73,15 @@ async def __get_year_resources(self, url: str, year: int) -> ResourceInfo:

        tab_infos = await self.__select_tabs(url)

-        # most tabs for most years can be handled the same way
-        tab_handlers = {
-            "housing-characteristics": defaultdict(lambda: self.__get_tab_links),
-            "consumption-expenditures": defaultdict(lambda: self.__get_tab_links),
-            "microdata": defaultdict(lambda: self.__get_tab_html_and_links),
-            "methodology": defaultdict(lambda: self.__get_tab_html_and_links),
-            "state-data": defaultdict(lambda: self.__get_tab_links),
-        }
-
-        # Add the exceptions - skip the 2009 and 2015 methodology sections for now
-        tab_handlers["methodology"][2015] = self.__skip
-        tab_handlers["methodology"][2009] = self.__skip
+        tab_handlers_overrides = {"methodology": {2009: self.__skip, 2015: self.__skip}}
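+        # Handler dispatch: use the (tab name -> year -> handler) override if
+        # one exists, otherwise fall back to __get_tab_html_and_links in the
+        # loop below, e.g. ("methodology", 2015) -> self.__skip, while
+        # ("microdata", 2015) -> self.__get_tab_html_and_links (the default).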

-        zip_path = self.download_directory / f"eia-recs-{year}.zip"
+        zip_path = self.download_directory / f"eiarecs-{year}.zip"
        paths_within_archive = []
        for tab in tab_infos:
-            paths_within_archive += await tab_handlers[tab.name][tab.year](
-                tab_info=tab, zip_path=zip_path
+            tab_handler = tab_handlers_overrides.get(tab.name, {}).get(
+                tab.year, self.__get_tab_html_and_links
            )
+            paths_within_archive += await tab_handler(tab_info=tab, zip_path=zip_path)

        self.logger.info(f"Looking for original forms for {year}")
        original_forms_within_archive = await self.__get_original_forms(year, zip_path)
@@ -137,27 +114,36 @@ async def __add_links_to_archive(
        data_paths_in_archive = []
        for link, output_filename in url_paths.items():
            download_path = self.download_directory / output_filename
-            logger.debug(f"Fetching {link} to {download_path}")
+            self.logger.debug(f"Fetching {link} to {download_path}")
            await self.download_file(link, download_path, timeout=120)
            with download_path.open("rb") as f:
                # TODO 2025-02-04: check html-ness against the suffix... if we
                # have a php/html/cfm/etc. we probably actually *do* want the
                # html file.
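+                # (is_html_file replaces the private __is_html_file helper
+                # removed below; that helper sniffed the first 30 bytes for
+                # b"<!doctype html", and the shared util presumably performs
+                # the same kind of header check.)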
-                if self.__is_html_file(f):
-                    logger.info(f"{link} was HTML file - skipping.")
+                if is_html_file(f):
+                    self.logger.info(f"{link} was HTML file - skipping.")
                    continue
                self.add_to_archive(
                    zip_path=zip_path,
                    filename=output_filename,
                    blob=f,
                )
-                logger.debug(f"Added {link} to {zip_path} as {output_filename}")
+                self.logger.debug(f"Added {link} to {zip_path} as {output_filename}")
                data_paths_in_archive.append(output_filename)
            download_path.unlink()
        return data_paths_in_archive

    async def __get_tab_links(self, tab_info: TabInfo, zip_path: Path) -> list[str]:
-        """Get the data files for a single tab."""
+        """Get the data files for a single tab.
+
+        First, gets a list of all of the <a> tags within the tab contents
+        that have an href attribute.
+
+        These tag objects expose their HTML attrs as if they were
+        dictionaries: href, src, etc.
+
+        They also have some Python attributes of their own that you can read:
+        text, contents, children, etc.
+
+        See https://beautiful-soup-4.readthedocs.io/en/latest/#tag for details.
+        """
        soup = await self.__get_soup(tab_info.url)
        links_in_tab = soup.select("div.tab-contentbox a[href]")
        log_scope = f"{tab_info.year}:{tab_info.name}"
@@ -177,7 +163,7 @@ async def __get_tab_links(self, tab_info: TabInfo, zip_path: Path) -> list[str]:
            urljoin(tab_info.url, link["href"]) for link in links_filtered
        ]
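+        # e.g. (illustrative) urljoin("https://www.eia.gov/consumption/residential/data/2020/",
+        #     "hc/hc1.1.xlsx") -> "https://www.eia.gov/consumption/residential/data/2020/hc/hc1.1.xlsx"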
        links_with_filenames = {
-            link: f"eia-recs-{tab_info.year}-{tab_info.name}-{self.__get_filename_from_link(link)}"
+            link: f"eiarecs-{tab_info.year}-{tab_info.name}-{self.__get_filename_from_link(link)}"
            for link in resolved_links
        }
@@ -194,11 +180,23 @@ async def __get_tab_links(self, tab_info: TabInfo, zip_path: Path) -> list[str]:
    async def __get_tab_html_and_links(
        self, tab_info: TabInfo, zip_path: Path
    ) -> list[str]:
-        """Get the data files in the tab, *and* get the tab content itself."""
+        """Get the data files in the tab, *and* get the tab content itself.
+
+        First, gets all the links within the tab that aren't HTML files and
+        aren't mailtos.
+
+        Then, gets the entire HTML contents of div.tab-contentbox, which
+        contains the tab contents.
+
+        Then, makes a new HTML document with an html and a body tag, and
+        shoves the old tab contents in there.
+
+        This makes a new HTML file that can be opened in a browser and
+        includes the tab's contents - but any links/images will not work.
+        """
        log_scope = f"{tab_info.year}:{tab_info.name}"
        self.logger.info(f"{log_scope}: Getting links in tab")
        links = await self.__get_tab_links(tab_info=tab_info, zip_path=zip_path)
-        self.logger.info(f"{log_scope}: Got {len(links)} links")

        soup = await self.__get_soup(tab_info.url)
        tab_content = soup.select_one("div.tab-contentbox")
@@ -210,7 +208,7 @@ async def __get_tab_html_and_links(
        # TODO 2025-02-03: consider using some sort of html-to-pdf converter here.
        # use html-sanitizer or something before feeding it into pdf.

-        filename = f"eia-recs-{tab_info.year}-{tab_info.name}-tab-contents.html"
+        filename = f"eiarecs-{tab_info.year}-{tab_info.name}-tab-contents.html"
        self.add_to_archive(
            zip_path=zip_path,
            filename=filename,
@@ -235,7 +233,7 @@ async def __get_original_forms(self, year: int, zip_path: Path) -> list[str]:
        resolved_links = [urljoin(forms_url, link["href"]) for link in links_filtered]

        links_with_filenames = {
-            link: f"eia-recs-{year}-form-{self.__get_filename_from_link(link)}"
+            link: f"eiarecs-{year}-form-{self.__get_filename_from_link(link)}"
            for link in resolved_links
        }
@@ -248,13 +246,6 @@ def __get_filename_from_link(self, url: str) -> str:
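+        # e.g. (illustrative) a link ending in "CE1.1.xlsx" gives stem
+        # "CE1-1", so the returned filename is "ce1-1.xlsx".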
        stem = re.sub(r"\W+", "-", filepath.stem)
        return f"{stem}{filepath.suffix}".lower()

-    def __is_html_file(self, fileobj: BytesIO) -> bool:
-        """Check the first 30 bytes of a file to see if there's an HTML header hiding in there."""
-        fileobj.seek(0)
-        header = fileobj.read(30).lower().strip()
-        fileobj.seek(0)
-        return b"<!doctype html" in header
-
    async def __select_tabs(self, url: str) -> set[TabInfo]:
        """Get the clickable tab links from the EIA RECS page layout."""