multiformats · acul71 · Sep 22, 2025 · Sep 15, 2025 · Sep 15, 2025 · Sep 19, 2025
diff --git a/multiaddr/codecs/http_path.py b/multiaddr/codecs/http_path.py
@@ -0,0 +1,60 @@
+import re
+from typing import Any
+from urllib.parse import quote, unquote
+
+from ..codecs import CodecBase
+from ..exceptions import BinaryParseError, StringParseError
+
+IS_PATH = False
+SIZE = -1  # LengthPrefixedVarSize
+
+
+class Codec(CodecBase):
+    """
+    Codec for the ``http-path`` protocol.
+
+    The string form is a percent-encoded path component; the binary form is
+    the unescaped path encoded as UTF-8.
+    """
+
+    SIZE = SIZE
+    IS_PATH = IS_PATH
+
+    def to_bytes(self, proto: Any, string: str) -> bytes:
+        """
+        Convert a percent-encoded HTTP path string to bytes.
+
+        :param proto: protocol descriptor (not used by this codec)
+        :param string: percent-encoded path, e.g. ``"foo%2Fbar"``
+        :return: the unescaped path encoded as UTF-8
+        :raises StringParseError: on a malformed percent-escape, on escapes
+            that do not decode to valid UTF-8, or on an empty path
+        """
+        # A "%" must be followed by exactly two hex digits; "%zz", "%f" and a
+        # trailing "%" are all malformed escapes.
+        if re.search(r"%(?![0-9A-Fa-f]{2})", string):
+            raise StringParseError("Invalid percent-escape in path", string)
+
+        # Decode strictly: the default errors="replace" would silently turn an
+        # escape such as "%ff" into U+FFFD, so the address would no longer
+        # round-trip to the string it was parsed from.
+        try:
+            unescaped = unquote(string, errors="strict")
+        except UnicodeDecodeError as exc:
+            raise StringParseError("Invalid HTTP path string", string) from exc
+
+        if not unescaped:
+            raise StringParseError("empty http path is not allowed", string)
+
+        return unescaped.encode("utf-8")
+
+    def to_string(self, proto: Any, buf: bytes) -> str:
+        """
+        Convert bytes to a percent-encoded HTTP path string.
+
+        :param proto: protocol descriptor (not used by this codec)
+        :param buf: the unescaped path encoded as UTF-8
+        :return: the path with every reserved character percent-encoded
+        :raises BinaryParseError: if ``buf`` is empty or is not valid UTF-8
+        """
+        if len(buf) == 0:
+            raise BinaryParseError("Empty http path is not allowed", buf, "http-path")
+
+        try:
+            path = buf.decode("utf-8")
+        except UnicodeDecodeError as exc:
+            raise BinaryParseError("Invalid UTF-8 in http path", buf, "http-path") from exc
+
+        # safe="" also escapes "/", so the value stays a single multiaddr component.
+        return quote(path, safe="")
+
+    def validate(self, b: bytes) -> None:
+        """
+        Validate an HTTP path buffer.
+
+        Only checks that the buffer is non-empty.
+
+        :param b: binary http-path value
+        :raises ValueError: if ``b`` is empty
+        """
+        if len(b) == 0:
+            raise ValueError("Empty http path is not allowed")
diff --git a/multiaddr/multiaddr.py b/multiaddr/multiaddr.py
@@ -354,12 +354,12 @@ def _from_string(self, addr: str) -> None:
                 continue
 
             # Special handling for unix paths
-            if part == "unix":
+            if part in ("unix",):
                 try:
                     # Get the next part as the path value
-                    unix_path_value = next(parts)
-                    if not unix_path_value:
-                        raise exceptions.StringParseError("empty unix path", addr)
+                    protocol_path_value = next(parts)
+                    if not protocol_path_value:
+                        raise exceptions.StringParseError("empty protocol path", addr)
 
                     # Join any remaining parts as part of the path
                     remaining_parts = []
@@ -373,16 +373,16 @@ def _from_string(self, addr: str) -> None:
                             break
 
                     if remaining_parts:
-                        unix_path_value = unix_path_value + "/" + "/".join(remaining_parts)
+                        protocol_path_value = protocol_path_value + "/" + "/".join(remaining_parts)
 
-                    proto = protocol_with_name("unix")
+                    proto = protocol_with_name(part)
                     codec = codec_by_name(proto.codec)
                     if not codec:
                         raise exceptions.StringParseError(f"unknown codec: {proto.codec}", addr)
 
                     try:
                         self._bytes += varint.encode(proto.code)
-                        buf = codec.to_bytes(proto, unix_path_value)
+                        buf = codec.to_bytes(proto, protocol_path_value)
                         # Add length prefix for variable-sized or zero-sized codecs
                         if codec.SIZE <= 0:
                             self._bytes += varint.encode(len(buf))

diff --git a/multiaddr/protocols.py b/multiaddr/protocols.py
@@ -158,6 +158,7 @@ def __repr__(self) -> str:
     Protocol(P_QUIC1, "quic-v1", None),
     Protocol(P_HTTP, "http", None),
     Protocol(P_HTTPS, "https", None),
+    Protocol(P_HTTP_PATH, "http-path", "http_path"),
     Protocol(P_TLS, "tls", None),
     Protocol(P_WS, "ws", None),
     Protocol(P_WSS, "wss", None),

diff --git a/multiaddr/transforms.py b/multiaddr/transforms.py
@@ -27,9 +27,9 @@ def string_to_bytes(string: str) -> bytes:
         logger.debug(f"[DEBUG string_to_bytes] Encoded protocol code: {encoded_code}")
         bs.append(encoded_code)
 
-        # Special case: protocols with codec=None are flag protocols
+        # Special case: protocols with codec=None or SIZE=0 are flag protocols
         # (no value, no length prefix, no buffer)
-        if codec is None:
+        if codec is None or getattr(codec, "SIZE", None) == 0:
             logger.debug(
                 f"[DEBUG string_to_bytes] Protocol {proto.name} has no data, "
                 "skipping value encoding"
@@ -93,6 +93,7 @@ def bytes_to_string(buf: bytes) -> str:
                     value = codec.to_string(proto, bs.read(size))
                 logger.debug(f"[DEBUG] bytes_to_string: proto={proto.name}, value='{value}'")
                 if codec.IS_PATH and value.startswith("/"):
+                    # For path protocols, the codec already handles URL encoding
                     strings.append("/" + proto.name + value)  # type: ignore[arg-type]
                 else:
                     strings.append("/" + proto.name + "/" + value)  # type: ignore[arg-type]

diff --git a/newsfragments/94.feature.rst b/newsfragments/94.feature.rst
@@ -0,0 +1 @@
+Added the http-path protocol, following the go-multiaddr reference implementation.
diff --git a/tests/test_multiaddr.py b/tests/test_multiaddr.py
@@ -10,6 +10,7 @@
 from multiaddr.multiaddr import Multiaddr
 from multiaddr.protocols import (
     P_DNS,
+    P_HTTP_PATH,
     P_IP4,
     P_IP6,
     P_P2P,
@@ -825,3 +826,158 @@ def test_memory_protocol_properties():
     assert proto.code == 777
     assert proto.name == "memory"
     assert proto.codec == "memory"
+
+
+def test_http_path_multiaddr_roundtrip():
+    """An http-path multiaddr string parses and formats back to itself."""
+    prefix = "/http-path/"
+    addresses = (
+        "/http-path/foo",
+        # "%2F" is a percent-encoded forward slash
+        "/http-path/foo%2Fbar",
+        "/http-path/api%2Fv1%2Fusers",
+    )
+
+    for address in addresses:
+        parsed = Multiaddr(address)
+        assert str(parsed) == address
+        # The protocol value is the address without the "/http-path/" prefix.
+        want_value = address.replace(prefix, "")
+        got_value = parsed.value_for_protocol(P_HTTP_PATH)
+        assert got_value == want_value
+
+
+def test_http_path_url_encoding():
+    """Percent-encoded special characters survive a parse/format cycle unchanged."""
+    # Pairs of (input path, expected path in the string form). Every input is
+    # already percent-encoded, so the expected output matches it exactly.
+    cases = (
+        ("/foo%20bar", "/foo%20bar"),
+        (
+            "/path%2Fwith%2Fspecial%21%40%23",
+            "/path%2Fwith%2Fspecial%21%40%23",
+        ),
+        (
+            "/%E3%81%93%E3%82%93%E3%81%AB%E3%81%A1%E3%81%AF",
+            "/%E3%81%93%E3%82%93%E3%81%AB%E3%81%A1%E3%81%AF",
+        ),
+        ("/tmp%2Fbar", "/tmp%2Fbar"),
+    )
+
+    for raw_path, want_path in cases:
+        parsed = Multiaddr(f"/http-path{raw_path}")
+        # The string form must show the percent-encoded path.
+        rendered = str(parsed)
+        assert rendered == f"/http-path{want_path}"
+
+
+def test_http_path_in_complex_multiaddr():
+    """http-path works as the last component of a longer multiaddr."""
+    # Pairs of (full address, expected http-path value).
+    cases = (
+        ("/ip4/127.0.0.1/tcp/443/tls/http/http-path/api%2Fv1", "api%2Fv1"),
+        ("/ip4/127.0.0.1/tcp/80/http/http-path/static%2Fcss", "static%2Fcss"),
+        ("/dns/example.com/tcp/443/tls/http/http-path/docs", "docs"),
+    )
+
+    for address, want_value in cases:
+        parsed = Multiaddr(address)
+        assert str(parsed) == address
+
+        # Only the http-path component is returned.
+        got_value = parsed.value_for_protocol(P_HTTP_PATH)
+        assert got_value == want_value
+
+
+def test_http_path_error_cases():
+    """Invalid http-path values are rejected with StringParseError."""
+    rejected = (
+        "/http-path/",  # empty path
+        "/http-path",  # missing path value
+        "/http-path/invalid%zz",  # invalid URL encoding
+    )
+
+    for address in rejected:
+        with pytest.raises(StringParseError):
+            Multiaddr(address)
+
+
+def test_http_path_value_extraction():
+    """value_for_protocol returns the percent-encoded http-path value."""
+    # Pairs of (address, expected http-path value).
+    cases = (
+        ("/http-path/foo", "foo"),
+        ("/http-path/foo%2Fbar", "foo%2Fbar"),
+        ("/http-path/api%2Fv1%2Fusers", "api%2Fv1%2Fusers"),
+        ("/ip4/127.0.0.1/tcp/80/http/http-path/docs", "docs"),
+    )
+
+    for address, want_value in cases:
+        got_value = Multiaddr(address).value_for_protocol(P_HTTP_PATH)
+        assert got_value == want_value
+
+
+def test_http_path_edge_cases():
+    """Percent-encoded spaces, slashes, unicode and symbols are kept as given."""
+    # All inputs are already percent-encoded.
+    encoded_paths = (
+        "path%20with%20spaces",
+        "path%2Fwith%2Fmultiple%2Fslashes",
+        "path%2Fwith%2Funicode%2F%E6%B5%8B%E8%AF%95",
+        "path%2Fwith%2Fsymbols%21%40%23%24%25%5E%26%2A%28%29",
+    )
+
+    for encoded in encoded_paths:
+        parsed = Multiaddr(f"/http-path/{encoded}")
+        # The extracted value must equal the encoded input.
+        assert parsed.value_for_protocol(P_HTTP_PATH) == encoded
+
+
+def test_http_path_only_reads_http_path_part():
+    """The http-path value stops at the next protocol in the address."""
+    full_address = "/http-path/tmp%2Fbar/p2p-circuit"
+    parsed = Multiaddr(full_address)
+
+    # Only "tmp%2Fbar" belongs to http-path; "/p2p-circuit" is a separate
+    # protocol and must not be part of the returned value.
+    assert parsed.value_for_protocol(P_HTTP_PATH) == "tmp%2Fbar"
+
+    # The string form still carries both components.
+    assert str(parsed) == full_address
+
+
+def test_http_path_malformed_percent_escape():
+    """An incomplete percent-escape is rejected."""
+    # Same case as in Go: the trailing "%f" has only one hex digit, so it is
+    # not a complete percent-escape.
+    incomplete_escape = "/http-path/thisIsMissingAfullByte%f"
+
+    with pytest.raises(StringParseError, match="Invalid percent-escape"):
+        Multiaddr(incomplete_escape)
+
+
+def test_http_path_raw_value_access():
+    """Show how to obtain the unescaped http-path value from a multiaddr."""
+    from urllib.parse import quote, unquote
+
+    parsed = Multiaddr("/http-path/tmp%2Fbar")
+
+    # value_for_protocol returns the percent-encoded form.
+    encoded = parsed.value_for_protocol(P_HTTP_PATH)
+    assert encoded == "tmp%2Fbar"
+
+    # Unquoting it gives the raw path, comparable to Go's component.RawValue().
+    raw = unquote(encoded)
+    assert raw == "tmp/bar"
+
+    # Quoting the raw path again restores the encoded value.
+    assert quote(raw, safe="") == encoded
diff --git a/tests/test_protocols.py b/tests/test_protocols.py
@@ -2,7 +2,7 @@
 import varint
 
 from multiaddr import Multiaddr, exceptions, protocols
-from multiaddr.codecs import memory
+from multiaddr.codecs import http_path, memory
 from multiaddr.exceptions import BinaryParseError
 
 
@@ -269,3 +269,52 @@ def test_memory_integration_invalid_values():
     # Too large (overflow > uint64)
     with pytest.raises(ValueError):
         Multiaddr(f"/memory/{2**64}")
+
+
+def test_http_path_bytes_string_roundtrip():
+    """to_bytes followed by to_string returns the original encoded string."""
+    from urllib.parse import quote
+
+    codec = http_path.Codec()
+    plain_paths = ("/foo", "/foo/bar", "/a b", "/こんにちは", "/path/with/special!@#")
+
+    for plain in plain_paths:
+        # The codec expects percent-encoded input; encode the same way it does.
+        encoded = quote(plain, safe="")
+        raw = codec.to_bytes(None, encoded)
+        assert isinstance(raw, bytes)
+        # Converting back must give the same percent-encoded string.
+        assert codec.to_string(None, raw) == encoded
+
+
+def test_http_path_empty_string_raises():
+    """to_bytes rejects an empty string."""
+    with pytest.raises(ValueError):
+        http_path.Codec().to_bytes(None, "")
+
+
+def test_http_path_empty_bytes_raises():
+    """to_string rejects an empty buffer."""
+    with pytest.raises(BinaryParseError):
+        http_path.Codec().to_string(None, b"")
+
+
+def test_http_path_special_characters():
+    """A path with a space and non-ASCII characters round-trips through the codec."""
+    from urllib.parse import quote
+
+    codec = http_path.Codec()
+    # Encode the same way the codec does (no "safe" characters).
+    encoded = quote("/foo bar/あいうえお", safe="")
+
+    raw = codec.to_bytes(None, encoded)
+    assert codec.to_string(None, raw) == encoded
+
+
+def test_http_path_validate_function():
+    """validate accepts a non-empty buffer and rejects an empty one."""
+    codec = http_path.Codec()
+
+    # A non-empty path must pass without raising.
+    codec.validate(b"/valid/path")
+
+    # An empty buffer must be rejected.
+    with pytest.raises(ValueError):
+        codec.validate(b"")
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		Added the http-path protocol, following the go-multiaddr reference implementation.