Skip to content

Commit ffbf6f0

Browse files
ichard26 and sbidoul authored
perf: Avoid unnecessary URL processing while parsing links (#13132)
There are three optimizations in this commit, in descending order of impact:

- If the file URL in the "project detail" response is already absolute, avoid calling urljoin(), as it's expensive (mostly because it calls urlparse() on both of its URL arguments) and does nothing in that case. While it'd be more correct to check whether the file URL has a scheme, that would require parsing the URL, which is what we're trying to avoid in the first place. By simply checking whether the URL starts with http[s]://, we can avoid slow urljoin() calls for PyPI responses.
- Replace urllib.parse.urlparse() with urllib.parse.urlsplit() in _ensure_quoted_url(). The URL parsing functions are equivalent for our needs[^1]. However, urlsplit() is faster, and we achieve better utilization of its internal cache if we call it directly[^2].
- Calculate the Link.path property in advance, as it's very hot.

[^1]: We don't care about URL parameters AFAIK (which are different from the query component!).
[^2]: urlparse() calls urlsplit() internally, but it passes the authority parameter (unlike any of our calls), so it bypasses the cache.

Co-authored-by: Stéphane Bidoul <[email protected]>
1 parent bc553db commit ffbf6f0

File tree

2 files changed

+21
-6
lines changed

2 files changed

+21
-6
lines changed

news/13132.feature.rst

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Optimize package collection by avoiding unnecessary URL parsing and other processing.

src/pip/_internal/models/link.py

+20-6
Original file line numberDiff line numberDiff line change
@@ -170,12 +170,23 @@ def _ensure_quoted_url(url: str) -> str:
170170
and without double-quoting other characters.
171171
"""
172172
# Split the URL into parts according to the general structure
173-
# `scheme://netloc/path;parameters?query#fragment`.
174-
result = urllib.parse.urlparse(url)
173+
# `scheme://netloc/path?query#fragment`.
174+
result = urllib.parse.urlsplit(url)
175175
# If the netloc is empty, then the URL refers to a local filesystem path.
176176
is_local_path = not result.netloc
177177
path = _clean_url_path(result.path, is_local_path=is_local_path)
178-
return urllib.parse.urlunparse(result._replace(path=path))
178+
return urllib.parse.urlunsplit(result._replace(path=path))
179+
180+
181+
def _absolute_link_url(base_url: str, url: str) -> str:
182+
"""
183+
A faster implementation of urllib.parse.urljoin with a shortcut
184+
for absolute http/https URLs.
185+
"""
186+
if url.startswith(("https://", "http://")):
187+
return url
188+
else:
189+
return urllib.parse.urljoin(base_url, url)
179190

180191

181192
@functools.total_ordering
@@ -185,6 +196,7 @@ class Link:
185196
__slots__ = [
186197
"_parsed_url",
187198
"_url",
199+
"_path",
188200
"_hashes",
189201
"comes_from",
190202
"requires_python",
@@ -241,6 +253,8 @@ def __init__(
241253
# Store the url as a private attribute to prevent accidentally
242254
# trying to set a new value.
243255
self._url = url
256+
# The .path property is hot, so calculate its value ahead of time.
257+
self._path = urllib.parse.unquote(self._parsed_url.path)
244258

245259
link_hash = LinkHash.find_hash_url_fragment(url)
246260
hashes_from_link = {} if link_hash is None else link_hash.as_dict()
@@ -270,7 +284,7 @@ def from_json(
270284
if file_url is None:
271285
return None
272286

273-
url = _ensure_quoted_url(urllib.parse.urljoin(page_url, file_url))
287+
url = _ensure_quoted_url(_absolute_link_url(page_url, file_url))
274288
pyrequire = file_data.get("requires-python")
275289
yanked_reason = file_data.get("yanked")
276290
hashes = file_data.get("hashes", {})
@@ -322,7 +336,7 @@ def from_element(
322336
if not href:
323337
return None
324338

325-
url = _ensure_quoted_url(urllib.parse.urljoin(base_url, href))
339+
url = _ensure_quoted_url(_absolute_link_url(base_url, href))
326340
pyrequire = anchor_attribs.get("data-requires-python")
327341
yanked_reason = anchor_attribs.get("data-yanked")
328342

@@ -421,7 +435,7 @@ def netloc(self) -> str:
421435

422436
@property
423437
def path(self) -> str:
424-
return urllib.parse.unquote(self._parsed_url.path)
438+
return self._path
425439

426440
def splitext(self) -> Tuple[str, str]:
427441
return splitext(posixpath.basename(self.path.rstrip("/")))

0 commit comments

Comments
 (0)