Skip to content

Commit c06bfce

Browse files
FazeelUsmani, jayaddison, AA-Turner
authored
linkcheck: Allow case-insensitive URL comparisons (#14046)
Co-authored-by: James Addison <[email protected]> Co-authored-by: Adam Turner <[email protected]>
1 parent 67762d8 commit c06bfce

File tree

6 files changed

+143
-2
lines changed

6 files changed

+143
-2
lines changed

CHANGES.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,11 @@ Features added
7171
* #14023: Add the new :confval:`mathjax_config_path` option
7272
to load MathJax configuration from a file.
7373
Patch by Randolf Scholz and Adam Turner.
74+
* #14046: linkcheck: Add the :confval:`linkcheck_case_insensitive_urls` option
75+
to allow case-insensitive URL comparison for specific URL patterns.
76+
This is useful for links to websites that normalise URL casing (e.g. GitHub)
77+
or case-insensitive servers.
78+
Patch by Fazeel Usmani and James Addison.
7479

7580
Bugs fixed
7681
----------

doc/usage/configuration.rst

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3813,6 +3813,42 @@ and the number of workers to use.
38133813

38143814
.. versionadded:: 7.3
38153815

3816+
.. confval:: linkcheck_case_insensitive_urls
3817+
:type: :code-py:`Set[str] | Sequence[str]`
3818+
:default: :code-py:`()`
3819+
3820+
A collection of regular expressions that match URLs for which the *linkcheck*
3821+
builder should perform case-insensitive comparisons. This is useful for
3822+
links to websites that are case-insensitive or normalise URL casing.
3823+
3824+
By default, *linkcheck* requires the destination URL to match the
3825+
documented URL case-sensitively.
3826+
For example, a link to ``http://example.org/PATH`` that redirects to
3827+
``http://example.org/path`` will be reported as ``redirected``.
3828+
3829+
If the URL matches a pattern contained in
3830+
:confval:`!linkcheck_case_insensitive_urls`,
3831+
it would instead be reported as ``working``.
3832+
3833+
For example, to treat all GitHub URLs as case-insensitive:
3834+
3835+
.. code-block:: python
3836+
3837+
linkcheck_case_insensitive_urls = [
3838+
r'https://github\.com/.*',
3839+
]
3840+
3841+
Or, to treat all URLs as case-insensitive:
3842+
3843+
.. code-block:: python
3844+
3845+
linkcheck_case_insensitive_urls = ['.*']
3846+
3847+
.. note:: URI fragments (HTML anchors) are not affected by this option.
3848+
They are always checked with case-sensitive comparisons.
3849+
3850+
.. versionadded:: 8.3
3851+
38163852
.. confval:: linkcheck_rate_limit_timeout
38173853
:type: :code-py:`int`
38183854
:default: :code-py:`300`

sphinx/builders/linkcheck.py

Lines changed: 29 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@
3434
from sphinx.util.nodes import get_node_line
3535

3636
if TYPE_CHECKING:
37-
from collections.abc import Callable, Iterator
37+
from collections.abc import Callable, Iterator, Sequence
3838
from typing import Any, Literal, TypeAlias
3939

4040
from requests import Response
@@ -385,6 +385,9 @@ def __init__(
385385
self.documents_exclude: list[re.Pattern[str]] = list(
386386
map(re.compile, config.linkcheck_exclude_documents)
387387
)
388+
self.ignore_case: Sequence[re.Pattern[str]] = tuple(
389+
map(re.compile, config.linkcheck_case_insensitive_urls)
390+
)
388391
self.auth = [
389392
(re.compile(pattern), auth_info)
390393
for pattern, auth_info in config.linkcheck_auth
@@ -629,8 +632,15 @@ def _check_uri(self, uri: str, hyperlink: Hyperlink) -> _URIProperties:
629632
netloc = urlsplit(req_url).netloc
630633
self.rate_limits.pop(netloc, None)
631634

635+
# Check if URL should be normalised case-insensitively
636+
ignore_case = any(pat.match(req_url) for pat in self.ignore_case)
637+
normalised_req_url = self._normalise_url(req_url, ignore_case=ignore_case)
638+
normalised_response_url = self._normalise_url(
639+
response_url, ignore_case=ignore_case
640+
)
641+
632642
if (
633-
(response_url.rstrip('/') == req_url.rstrip('/'))
643+
normalised_response_url == normalised_req_url
634644
or _allowed_redirect(req_url, response_url, self.allowed_redirects)
635645
): # fmt: skip
636646
return _Status.WORKING, '', 0
@@ -676,6 +686,17 @@ def limit_rate(self, response_url: str, retry_after: str | None) -> float | None
676686
self.rate_limits[netloc] = RateLimit(delay, next_check)
677687
return next_check
678688

689+
@staticmethod
690+
def _normalise_url(url: str, *, ignore_case: bool) -> str:
691+
normalised_url = url.rstrip('/')
692+
if not ignore_case:
693+
return normalised_url
694+
# URI fragments are case-sensitive
695+
url_part, sep, fragment = normalised_url.partition('#')
696+
if sep:
697+
return f'{url_part.casefold()}#{fragment}'
698+
return url_part.casefold()
699+
679700

680701
def _get_request_headers(
681702
uri: str,
@@ -816,6 +837,12 @@ def setup(app: Sphinx) -> ExtensionMetadata:
816837
app.add_config_value(
817838
'linkcheck_report_timeouts_as_broken', False, '', types=frozenset({bool})
818839
)
840+
app.add_config_value(
841+
'linkcheck_case_insensitive_urls',
842+
(),
843+
'',
844+
types=frozenset({frozenset, list, set, tuple}),
845+
)
819846

820847
app.add_event('linkcheck-process-uri')
821848

tests/roots/test-linkcheck-case-check/conf.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
# Empty config for linkcheck case sensitivity tests
tests/roots/test-linkcheck-case-check/index.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
`path1 <http://localhost:7777/path1>`_
2+
3+
`path2 <http://localhost:7777/path2>`_
4+
5+
`PATH3 <http://localhost:7777/PATH3>`_

tests/test_builders/test_build_linkcheck.py

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1439,3 +1439,70 @@ def test_linkcheck_exclude_documents(app: SphinxTestApp) -> None:
14391439
'uri': 'https://www.sphinx-doc.org/this-is-another-broken-link',
14401440
'info': 'br0ken_link matched br[0-9]ken_link from linkcheck_exclude_documents',
14411441
} in content
1442+
1443+
1444+
class CapitalisePathHandler(BaseHTTPRequestHandler):
    """Test server that uppercases URL paths via redirects.

    A GET request whose path contains cased characters, all of them
    lowercase, is answered with ``301 Moved Permanently`` and a ``Location``
    header holding the uppercased path.  Any other path is served directly
    with ``200 OK`` and a short body.
    """

    protocol_version = 'HTTP/1.1'

    def do_GET(self) -> None:
        """Redirect lowercase paths to their uppercase form; serve the rest."""
        if self.path.islower():
            # Redirect lowercase paths to uppercase versions
            self.send_response(301, 'Moved Permanently')
            self.send_header('Location', self.path.upper())
            # The redirect carries no body; state that explicitly.
            self.send_header('Content-Length', '0')
            self.end_headers()
        else:
            # Serve uppercase paths
            content = b'ok\n\n'
            self.send_response(200, 'OK')
            self.send_header('Content-Length', str(len(content)))
            self.end_headers()
            self.wfile.write(content)
1463+
1464+
1465+
@pytest.mark.sphinx(
    'linkcheck',
    testroot='linkcheck-case-check',
    freshenv=True,
)
@pytest.mark.parametrize(
    ('case_insensitive_pattern', 'expected_path1', 'expected_path2', 'expected_path3'),
    [
        ([], 'redirected', 'redirected', 'working'),  # default: case-sensitive
        (
            [r'http://localhost:\d+/.*'],
            'working',
            'working',
            'working',
        ),  # all URLs case-insensitive
        (
            [r'http://localhost:\d+/path1'],
            'working',
            'redirected',
            'working',
        ),  # only path1 case-insensitive
    ],
)
def test_linkcheck_case_sensitivity(
    app: SphinxTestApp,
    case_insensitive_pattern: list[str],
    expected_path1: str,
    expected_path2: str,
    expected_path3: str,
) -> None:
    """Test case-sensitive and case-insensitive URL checking.

    Each parametrised case sets ``linkcheck_case_insensitive_urls`` to
    *case_insensitive_pattern*, builds against ``CapitalisePathHandler``,
    and compares the status recorded in ``output.json`` for ``/path1``,
    ``/path2`` and ``/PATH3`` with the expected values.
    """
    app.config.linkcheck_case_insensitive_urls = case_insensitive_pattern

    with serve_application(app, CapitalisePathHandler) as address:
        app.build()

    # output.json holds one JSON object per line; index the rows by URI.
    content = (app.outdir / 'output.json').read_text(encoding='utf8')
    rows = [json.loads(line) for line in content.splitlines()]
    rows_by_uri = {row['uri']: row for row in rows}

    # Verify expected status for each path
    assert rows_by_uri[f'http://{address}/path1']['status'] == expected_path1
    assert rows_by_uri[f'http://{address}/path2']['status'] == expected_path2
    assert rows_by_uri[f'http://{address}/PATH3']['status'] == expected_path3

0 commit comments

Comments
 (0)