class Codec(CodecBase):
    """Codec for the ``http-path`` protocol.

    The string form is one percent-encoded path segment (so ``/`` inside the
    path is written ``%2F``); the binary form is the unescaped path bytes.
    ``SIZE`` is -1, i.e. the value is variable-sized and length-prefixed by
    the caller.
    """

    SIZE = SIZE
    IS_PATH = IS_PATH

    # Matches a "%" that is NOT followed by two hex digits. This covers
    # "%zz", a truncated escape such as "%f", and a bare/trailing "%".
    _INVALID_ESCAPE = re.compile(r"%(?![0-9A-Fa-f]{2})")

    def to_bytes(self, proto: Any, string: str) -> bytes:
        """Convert a percent-encoded HTTP path string to its raw bytes.

        :param proto: protocol descriptor (unused by this codec).
        :param string: percent-encoded path value, without the protocol name.
        :returns: the unescaped path bytes.
        :raises StringParseError: if ``string`` contains a malformed
            percent-escape, cannot be encoded, or unescapes to nothing.
        """
        # Local import: the module-level import list only brings in
        # ``quote``/``unquote``.
        from urllib.parse import unquote_to_bytes

        # Every "%" must introduce a complete two-digit hex escape; a bare
        # "%" is rejected as well.
        if self._INVALID_ESCAPE.search(string):
            raise StringParseError("Invalid percent-escape in path", string)

        # Unescape straight to bytes. ``unquote`` would decode the escapes as
        # UTF-8 with errors="replace", silently turning e.g. "%ff" into
        # U+FFFD and storing different bytes than the caller asked for.
        try:
            unescaped = unquote_to_bytes(string)
        except UnicodeEncodeError as exc:
            # e.g. lone surrogates in ``string`` cannot be encoded as UTF-8.
            raise StringParseError("Invalid HTTP path string", string) from exc

        if not unescaped:
            raise StringParseError("empty http path is not allowed", string)

        return unescaped

    def to_string(self, proto: Any, buf: bytes) -> str:
        """Convert raw HTTP path bytes to their percent-encoded string form.

        :param proto: protocol descriptor (unused by this codec).
        :param buf: unescaped path bytes.
        :returns: ``buf`` percent-encoded with no "safe" characters, so ``/``
            becomes ``%2F``.
        :raises BinaryParseError: if ``buf`` is empty.
        """
        if len(buf) == 0:
            raise BinaryParseError("Empty http path is not allowed", buf, "http-path")

        # Quote the bytes directly: the result is identical for valid UTF-8,
        # and arbitrary byte values are escaped instead of raising
        # UnicodeDecodeError.
        return quote(bytes(buf), safe="")

    def validate(self, b: bytes) -> None:
        """Validate an HTTP path buffer; the only requirement is non-emptiness.

        :param b: unescaped path bytes.
        :raises ValueError: if ``b`` is empty.
        """
        if len(b) == 0:
            raise ValueError("Empty http path is not allowed")
b/multiaddr/transforms.py @@ -27,9 +27,9 @@ def string_to_bytes(string: str) -> bytes: logger.debug(f"[DEBUG string_to_bytes] Encoded protocol code: {encoded_code}") bs.append(encoded_code) - # Special case: protocols with codec=None are flag protocols + # Special case: protocols with codec=None or SIZE=0 are flag protocols # (no value, no length prefix, no buffer) - if codec is None: + if codec is None or getattr(codec, "SIZE", None) == 0: logger.debug( f"[DEBUG string_to_bytes] Protocol {proto.name} has no data, " "skipping value encoding" @@ -93,6 +93,7 @@ def bytes_to_string(buf: bytes) -> str: value = codec.to_string(proto, bs.read(size)) logger.debug(f"[DEBUG] bytes_to_string: proto={proto.name}, value='{value}'") if codec.IS_PATH and value.startswith("/"): + # For path protocols, the codec already handles URL encoding strings.append("/" + proto.name + value) # type: ignore[arg-type] else: strings.append("/" + proto.name + "/" + value) # type: ignore[arg-type] diff --git a/newsfragments/94.feature.rst b/newsfragments/94.feature.rst new file mode 100644 index 0000000..8af4263 --- /dev/null +++ b/newsfragments/94.feature.rst @@ -0,0 +1 @@ +Added support for the http-path protocol, following the go-multiaddr reference implementation. 
def test_http_path_multiaddr_roundtrip():
    """A bare http-path address survives a parse/format round trip."""
    prefix = "/http-path/"
    for encoded in ("foo", "foo%2Fbar", "api%2Fv1%2Fusers"):
        text = prefix + encoded
        addr = Multiaddr(text)
        assert str(addr) == text
        # The extracted protocol value is the address minus the protocol prefix.
        assert addr.value_for_protocol(P_HTTP_PATH) == encoded


def test_http_path_url_encoding():
    """Already percent-encoded input is rendered back unchanged."""
    cases = (
        ("/foo%20bar", "/foo%20bar"),
        ("/path%2Fwith%2Fspecial%21%40%23", "/path%2Fwith%2Fspecial%21%40%23"),
        (
            "/%E3%81%93%E3%82%93%E3%81%AB%E3%81%A1%E3%81%AF",
            "/%E3%81%93%E3%82%93%E3%81%AB%E3%81%A1%E3%81%AF",
        ),
        ("/tmp%2Fbar", "/tmp%2Fbar"),
    )

    for given, rendered in cases:
        addr = Multiaddr(f"/http-path{given}")
        # The string form shows the percent-encoded path.
        assert str(addr) == f"/http-path{rendered}"


def test_http_path_in_complex_multiaddr():
    """http-path works as the tail of a longer protocol chain."""
    cases = (
        ("/ip4/127.0.0.1/tcp/443/tls/http/http-path/api%2Fv1", "api%2Fv1"),
        ("/ip4/127.0.0.1/tcp/80/http/http-path/static%2Fcss", "static%2Fcss"),
        ("/dns/example.com/tcp/443/tls/http/http-path/docs", "docs"),
    )

    for text, wanted in cases:
        addr = Multiaddr(text)
        assert str(addr) == text
        assert addr.value_for_protocol(P_HTTP_PATH) == wanted


def test_http_path_error_cases():
    """Invalid http-path values are rejected with StringParseError."""
    rejected = (
        "/http-path/",  # empty path
        "/http-path",  # missing path value
        "/http-path/invalid%zz",  # invalid URL encoding
    )
    for bad in rejected:
        with pytest.raises(StringParseError):
            Multiaddr(bad)


def test_http_path_value_extraction():
    """value_for_protocol returns the encoded http-path component."""
    cases = (
        ("/http-path/foo", "foo"),
        ("/http-path/foo%2Fbar", "foo%2Fbar"),
        ("/http-path/api%2Fv1%2Fusers", "api%2Fv1%2Fusers"),
        ("/ip4/127.0.0.1/tcp/80/http/http-path/docs", "docs"),
    )

    for text, wanted in cases:
        assert Multiaddr(text).value_for_protocol(P_HTTP_PATH) == wanted


def test_http_path_edge_cases():
    """Percent-encoded spaces, slashes, unicode and symbols are preserved."""
    encoded_paths = (
        "path%20with%20spaces",
        "path%2Fwith%2Fmultiple%2Fslashes",
        "path%2Fwith%2Funicode%2F%E6%B5%8B%E8%AF%95",
        "path%2Fwith%2Fsymbols%21%40%23%24%25%5E%26%2A%28%29",
    )

    for encoded in encoded_paths:
        addr = Multiaddr(f"/http-path/{encoded}")
        assert addr.value_for_protocol(P_HTTP_PATH) == encoded


def test_http_path_only_reads_http_path_part():
    """http-path consumes only its own component, not following protocols."""
    # For /http-path/tmp%2Fbar/p2p-circuit the http-path value must be just
    # "tmp%2Fbar"; the trailing /p2p-circuit is a separate protocol.
    text = "/http-path/tmp%2Fbar/p2p-circuit"
    addr = Multiaddr(text)

    assert addr.value_for_protocol(P_HTTP_PATH) == "tmp%2Fbar"

    # Both components are still present in the full string form.
    assert str(addr) == text


def test_http_path_malformed_percent_escape():
    """A truncated percent-escape ("%f") is rejected."""
    # Mirrors the Go case /http-path/thisIsMissingAfullByte%f, where the
    # escape is missing its second hex digit.
    with pytest.raises(StringParseError, match="Invalid percent-escape"):
        Multiaddr("/http-path/thisIsMissingAfullByte%f")


def test_http_path_raw_value_access():
    """The raw path can be recovered by unquoting the encoded value."""
    from urllib.parse import quote, unquote

    addr = Multiaddr("/http-path/tmp%2Fbar")

    # value_for_protocol yields the percent-encoded form ...
    encoded = addr.value_for_protocol(P_HTTP_PATH)
    assert encoded == "tmp%2Fbar"

    # ... and unquoting it gives the raw value (comparable to Go's
    # component.RawValue()).
    raw = unquote(encoded)
    assert raw == "tmp/bar"

    # Re-quoting with no safe characters restores the encoded form.
    assert quote(raw, safe="") == encoded
def test_http_path_bytes_string_roundtrip():
    """Encoded strings survive to_bytes followed by to_string."""
    from urllib.parse import quote

    codec = http_path.Codec()
    raw_paths = ("/foo", "/foo/bar", "/a b", "/こんにちは", "/path/with/special!@#")

    for raw in raw_paths:
        # The codec is fed percent-encoded input, encoded the same way it
        # encodes its own output.
        encoded = quote(raw, safe="")
        payload = codec.to_bytes(None, encoded)
        assert isinstance(payload, bytes)
        assert codec.to_string(None, payload) == encoded


def test_http_path_empty_string_raises():
    """to_bytes rejects an empty string."""
    with pytest.raises(ValueError):
        http_path.Codec().to_bytes(None, "")


def test_http_path_empty_bytes_raises():
    """to_string rejects an empty buffer."""
    with pytest.raises(BinaryParseError):
        http_path.Codec().to_string(None, b"")


def test_http_path_special_characters():
    """Spaces and non-ASCII characters round-trip through the codec."""
    from urllib.parse import quote

    codec = http_path.Codec()
    encoded = quote("/foo bar/あいうえお", safe="")

    payload = codec.to_bytes(None, encoded)
    assert codec.to_string(None, payload) == encoded


def test_http_path_validate_function():
    """validate accepts a non-empty buffer and rejects an empty one."""
    codec = http_path.Codec()

    # A non-empty path must not raise.
    codec.validate(b"/valid/path")

    with pytest.raises(ValueError):
        codec.validate(b"")