Skip to content

Commit a5ca655

Browse files
authored
Merge pull request #53 from NREL/pp/ignore_https_errors
Add option to ignore https errors
2 parents a323b47 + 69af7d9 commit a5ca655

File tree

3 files changed

+15
-3
lines changed

3 files changed

+15
-3
lines changed

elm/web/html_pw.py

+3 -1
Original file line number | Diff line number | Diff line change
@@ -67,7 +67,9 @@ async def _load_html( # pragma: no cover
6767
logger.trace("launching chromium; browser_semaphore=%r",
6868
browser_semaphore)
6969
browser = await p.chromium.launch(**pw_launch_kwargs)
70-
async with pw_page(browser, intercept_routes=True) as page:
70+
page_kwargs = {"browser": browser, "intercept_routes": True,
71+
"ignore_https_errors": True} # no sensitive inputs
72+
async with pw_page(**page_kwargs) as page:
7173
logger.trace("Navigating to: %r", url)
7274
await page.goto(url)
7375
logger.trace("Waiting for load with timeout: %r", timeout)

elm/web/search/base.py

+2 -1
Original file line number | Diff line number | Diff line change
@@ -122,7 +122,8 @@ async def _search(self, query, num_results=10):
122122
logger.debug("Searching %s: %r", self._SE_NAME, query)
123123
num_results = min(num_results, self.MAX_RESULTS_CONSIDERED_PER_PAGE)
124124

125-
page_kwargs = {"browser": self._browser, "stealth_config": self._SC}
125+
page_kwargs = {"browser": self._browser, "stealth_config": self._SC,
126+
"ignore_https_errors": True} # no sensitive inputs
126127
async with pw_page(**page_kwargs) as page:
127128
await _navigate_to_search_engine(page, se_url=self._SE_URL,
128129
timeout=self.PAGE_LOAD_TIMEOUT)

elm/web/utilities.py

+10 -1
Original file line number | Diff line number | Diff line change
@@ -170,7 +170,8 @@ def write_url_doc_to_file(doc, file_content, out_dir, make_name_unique=False):
170170

171171

172172
@asynccontextmanager
173-
async def pw_page(browser, intercept_routes=False, stealth_config=None):
173+
async def pw_page(browser, intercept_routes=False, stealth_config=None,
174+
ignore_https_errors=False):
174175
"""Create new page from playwright browser context
175176
176177
Parameters
@@ -184,6 +185,13 @@ async def pw_page(browser, intercept_routes=False, stealth_config=None):
184185
Optional playwright stealth configuration object.
185186
By default, ``None``, which uses all the default stealth
186187
options.
188+
ignore_https_errors : bool, default=False
189+
Option to ignore https errors (i.e. SSL cert errors). This is
190+
not generally safe to do - you are susceptible to MITM attacks.
191+
However, if you are doing a simple scrape without providing
192+
any sensitive information (which you probably shouldn't be doing
193+
programmatically anyways), then it's probably ok to ignore these
194+
errors. By default, ``False``.
187195
188196
Yields
189197
------
@@ -202,6 +210,7 @@ async def pw_page(browser, intercept_routes=False, stealth_config=None):
202210
extra_http_headers=DEFAULT_HEADERS,
203211
user_agent=ua,
204212
viewport={"width": randint(800, 1400), "height": randint(800, 1400)},
213+
ignore_https_errors=ignore_https_errors,
205214
)
206215

207216
try:

0 commit comments

Comments
 (0)