diff --git a/news/13132.feature.rst b/news/13132.feature.rst new file mode 100644 index 00000000000..d8cd57e7c56 --- /dev/null +++ b/news/13132.feature.rst @@ -0,0 +1 @@ +Optimize package collection by avoiding unnecessary URL parsing and other processing. diff --git a/src/pip/_internal/models/link.py b/src/pip/_internal/models/link.py index 2f41f2f6a09..c1c5511137e 100644 --- a/src/pip/_internal/models/link.py +++ b/src/pip/_internal/models/link.py @@ -170,12 +170,23 @@ def _ensure_quoted_url(url: str) -> str: and without double-quoting other characters. """ # Split the URL into parts according to the general structure - # `scheme://netloc/path;parameters?query#fragment`. - result = urllib.parse.urlparse(url) + # `scheme://netloc/path?query#fragment`. + result = urllib.parse.urlsplit(url) # If the netloc is empty, then the URL refers to a local filesystem path. is_local_path = not result.netloc path = _clean_url_path(result.path, is_local_path=is_local_path) - return urllib.parse.urlunparse(result._replace(path=path)) + return urllib.parse.urlunsplit(result._replace(path=path)) + + +def _absolute_link_url(base_url: str, file_url: str) -> str: + """ + A faster implementation of urlib.parse.urljoin with a shortcut + for absolute http/https URLs. + """ + if file_url.startswith(("https://", "http://")): + return file_url + else: + return urllib.parse.urljoin(base_url, file_url) @functools.total_ordering @@ -185,6 +196,7 @@ class Link: __slots__ = [ "_parsed_url", "_url", + "_path", "_hashes", "comes_from", "requires_python", @@ -241,6 +253,8 @@ def __init__( # Store the url as a private attribute to prevent accidentally # trying to set a new value. self._url = url + # The .path property is hot, so calculate its value ahead of time. + self._path = urllib.parse.unquote(self._parsed_url.path) link_hash = LinkHash.find_hash_url_fragment(url) hashes_from_link = {} if link_hash is None else link_hash.as_dict() @@ -270,7 +284,7 @@ def from_json( if file_url is None: return None - url = _ensure_quoted_url(urllib.parse.urljoin(page_url, file_url)) + url = _ensure_quoted_url(_absolute_link_url(page_url, file_url)) pyrequire = file_data.get("requires-python") yanked_reason = file_data.get("yanked") hashes = file_data.get("hashes", {}) @@ -322,7 +336,7 @@ def from_element( if not href: return None - url = _ensure_quoted_url(urllib.parse.urljoin(base_url, href)) + url = _ensure_quoted_url(_absolute_link_url(base_url, href)) pyrequire = anchor_attribs.get("data-requires-python") yanked_reason = anchor_attribs.get("data-yanked") @@ -421,7 +435,7 @@ def netloc(self) -> str: @property def path(self) -> str: - return urllib.parse.unquote(self._parsed_url.path) + return self._path def splitext(self) -> Tuple[str, str]: return splitext(posixpath.basename(self.path.rstrip("/")))