From f7aa293014aa3204440667c450df5b5f43fc073c Mon Sep 17 00:00:00 2001 From: Rodrigo Nogueira Date: Sun, 12 Apr 2026 18:32:07 -0300 Subject: [PATCH 1/4] Fix host validation: zone ID characters and NFKC percent bypass MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Finding 1: IPv6 zone IDs were not validated even when validate_host=True. Any character — including CR, LF, and null bytes — could be embedded in url.host via URL.build(host='::1%<zone>'). This creates an asymmetry: regular hostnames are correctly rejected for control characters but zone IDs were passed through verbatim. Fix: add _ZONE_ID_RE regex (RFC 6874 unreserved + sub-delims) and validate the zone portion of IPv6 addresses in _encode_host() when validate_host=True. Finding 2: _check_netloc() normalizes the netloc via NFKC and checks for URL-reserved characters but '%' was missing from the checked set. U+FF05 (FULLWIDTH PERCENT SIGN) and U+FE6A (SMALL PERCENT SIGN) both normalize to '%' under NFKC and were accepted, ultimately producing a literal '%' in url.host via the stdlib IDNA fallback in _idna_encode(). Fix: add '%' to the character set checked in _check_netloc(). --- CHANGES/1655.bugfix.1.rst | 4 ++++ CHANGES/1655.bugfix.2.rst | 5 +++++ tests/test_url.py | 15 +++++++++++++++ tests/test_url_build.py | 15 +++++++++++++++ yarl/_parse.py | 2 +- yarl/_url.py | 8 ++++++++ 6 files changed, 48 insertions(+), 1 deletion(-) create mode 100644 CHANGES/1655.bugfix.1.rst create mode 100644 CHANGES/1655.bugfix.2.rst diff --git a/CHANGES/1655.bugfix.1.rst b/CHANGES/1655.bugfix.1.rst new file mode 100644 index 000000000..2e82a62b1 --- /dev/null +++ b/CHANGES/1655.bugfix.1.rst @@ -0,0 +1,4 @@ +Fixed :meth:`~yarl.URL.build` failing to validate characters in the zone ID +portion of IPv6 addresses when ``validate_host=True``, allowing control +characters such as CR and LF to pass through into ``url.host`` +-- by :user:`rodrigobnogueira`. 
diff --git a/CHANGES/1655.bugfix.2.rst b/CHANGES/1655.bugfix.2.rst new file mode 100644 index 000000000..fd7005f8b --- /dev/null +++ b/CHANGES/1655.bugfix.2.rst @@ -0,0 +1,5 @@ +Fixed ``_check_netloc()`` missing ``%`` from its NFKC normalization character +check, which allowed Unicode characters U+FF05 (FULLWIDTH PERCENT SIGN) and +U+FE6A (SMALL PERCENT SIGN) to produce a literal ``%`` in ``url.host`` via +the standard library IDNA fallback +-- by :user:`rodrigobnogueira`. diff --git a/tests/test_url.py b/tests/test_url.py index 37871fedb..8f7dd2b16 100644 --- a/tests/test_url.py +++ b/tests/test_url.py @@ -12,6 +12,8 @@ _VERTICAL_COLON = "\ufe13" # normalizes to ":" _FULL_WITH_NUMBER_SIGN = "\uff03" # normalizes to "#" _ACCOUNT_OF = "\u2100" # normalizes to "a/c" +_FULLWIDTH_PERCENT = "\uff05" # normalizes to "%" +_SMALL_PERCENT = "\ufe6a" # normalizes to "%" def test_inheritance() -> None: @@ -2465,3 +2467,16 @@ def test_url_with_invalid_unicode(disallowed_unicode: str) -> None: ValueError, match="contains invalid characters under NFKC normalization" ): URL(f"http://example.{disallowed_unicode}.com/frag") + + +@pytest.mark.parametrize( + "percent_char", + [_FULLWIDTH_PERCENT, _SMALL_PERCENT], + ids=["fullwidth-percent-U+FF05", "small-percent-U+FE6A"], +) +def test_url_with_fullwidth_percent_rejected(percent_char: str) -> None: + """NFKC normalization of fullwidth/small percent signs must be caught.""" + with pytest.raises( + ValueError, match="contains invalid characters under NFKC normalization" + ): + URL(f"http://evil.com{percent_char}2e.internal/") diff --git a/tests/test_url_build.py b/tests/test_url_build.py index d80fe08c6..8eab020f8 100644 --- a/tests/test_url_build.py +++ b/tests/test_url_build.py @@ -35,6 +35,21 @@ def test_url_ipv4_in_ipv6() -> None: assert str(u) == "http://[2001:db8:122:344::c000:221]" +@pytest.mark.parametrize( + ("zone", "desc"), + ( + ("\r\nX-Injected: evil", "crlf-injection"), + ("\x00evil", "null-byte"), + ("zone with spaces", 
"spaces"), + ), + ids=("crlf-injection", "null-byte", "spaces"), +) +def test_url_build_ipv6_zone_id_invalid_chars(zone: str, desc: str) -> None: + """Zone IDs with control characters must be rejected by validate_host.""" + with pytest.raises(ValueError, match="Invalid characters in IPv6 zone ID"): + URL.build(scheme="http", host=f"::1%{zone}", path="/") + + def test_build_with_scheme() -> None: u = URL.build(scheme="blob", path="path") assert str(u) == "blob:path" diff --git a/yarl/_parse.py b/yarl/_parse.py index bb64165c7..777ffcca5 100644 --- a/yarl/_parse.py +++ b/yarl/_parse.py @@ -96,7 +96,7 @@ def _check_netloc(netloc: str) -> None: # Note that there are no unicode decompositions for the character '@' so # its currently impossible to have test coverage for this branch, however if the # one should be added in the future we want to make sure its still checked. - for c in "/?#@:": # pragma: no branch + for c in "/?#@:%": # pragma: no branch if c in normalized_netloc: raise ValueError( f"netloc '{netloc}' contains invalid " diff --git a/yarl/_url.py b/yarl/_url.py index 5a79fe1f8..096d8ea0e 100644 --- a/yarl/_url.py +++ b/yarl/_url.py @@ -89,6 +89,10 @@ re.VERBOSE, ) +# RFC 6874 ZoneID = 1*( unreserved / pct-encoded ) +# In practice, sub-delimiters are also used (e.g. eth0, Ethernet+1). 
+_ZONE_ID_RE = re.compile(r"^[A-Za-z0-9._~!$&'()*+,;=%-]+$") + _T = TypeVar("_T") if sys.version_info >= (3, 11): @@ -1574,6 +1578,10 @@ def _encode_host(host: str, validate_host: bool) -> str: except ValueError: pass else: + if sep and validate_host and not _ZONE_ID_RE.match(zone): + raise ValueError( + f"Invalid characters in IPv6 zone ID: {zone!r}" + ) # These checks should not happen in the # LRU to keep the cache size small host = ip.compressed From 979a271336814bfd8e07d157416b05bbeeb56a85 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 12 Apr 2026 21:37:42 +0000 Subject: [PATCH 2/4] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- yarl/_url.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/yarl/_url.py b/yarl/_url.py index 096d8ea0e..db76279aa 100644 --- a/yarl/_url.py +++ b/yarl/_url.py @@ -1579,9 +1579,7 @@ def _encode_host(host: str, validate_host: bool) -> str: pass else: if sep and validate_host and not _ZONE_ID_RE.match(zone): - raise ValueError( - f"Invalid characters in IPv6 zone ID: {zone!r}" - ) + raise ValueError(f"Invalid characters in IPv6 zone ID: {zone!r}") # These checks should not happen in the # LRU to keep the cache size small host = ip.compressed From ad025da51ab9b3dc07686e72913a43310c6f6734 Mon Sep 17 00:00:00 2001 From: Rodrigo Nogueira Date: Tue, 14 Apr 2026 23:43:10 -0300 Subject: [PATCH 3/4] Address reviewer comments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Remove unused 'desc' parameter from zone ID test parametrize tuple - Update _ZONE_ID_RE comment: cite RFC 9844 (which obsoletes RFC 6874 for UI usage) and add a direct link to RFC 6874 §2 for the ZoneID ABNF grammar (unreserved / pct-encoded) --- tests/test_url_build.py | 10 +++++----- yarl/_url.py | 4 +++- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git 
a/tests/test_url_build.py b/tests/test_url_build.py index 8eab020f8..7bfef38cb 100644 --- a/tests/test_url_build.py +++ b/tests/test_url_build.py @@ -36,15 +36,15 @@ def test_url_ipv4_in_ipv6() -> None: @pytest.mark.parametrize( - ("zone", "desc"), + "zone", ( - ("\r\nX-Injected: evil", "crlf-injection"), - ("\x00evil", "null-byte"), - ("zone with spaces", "spaces"), + "\r\nX-Injected: evil", + "\x00evil", + "zone with spaces", ), ids=("crlf-injection", "null-byte", "spaces"), ) -def test_url_build_ipv6_zone_id_invalid_chars(zone: str, desc: str) -> None: +def test_url_build_ipv6_zone_id_invalid_chars(zone: str) -> None: """Zone IDs with control characters must be rejected by validate_host.""" with pytest.raises(ValueError, match="Invalid characters in IPv6 zone ID"): URL.build(scheme="http", host=f"::1%{zone}", path="/") diff --git a/yarl/_url.py b/yarl/_url.py index db76279aa..29979622c 100644 --- a/yarl/_url.py +++ b/yarl/_url.py @@ -89,7 +89,9 @@ re.VERBOSE, ) -# RFC 6874 ZoneID = 1*( unreserved / pct-encoded ) +# Zone IDs in URIs are defined by RFC 6874 (obsoleted by RFC 9844 for UI usage): +# ZoneID = 1*( unreserved / pct-encoded ) +# https://www.rfc-editor.org/rfc/rfc6874#section-2 # In practice, sub-delimiters are also used (e.g. eth0, Ethernet+1). _ZONE_ID_RE = re.compile(r"^[A-Za-z0-9._~!$&'()*+,;=%-]+$") From eb198502ea442b442496f143ed50c6917c2f15aa Mon Sep 17 00:00:00 2001 From: Rodrigo Nogueira Date: Sun, 19 Apr 2026 16:23:29 -0300 Subject: [PATCH 4/4] Relax zone ID validation: reject CTL characters per RFC 4007/9844 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The _ZONE_ID_RE allowlist was based on RFC 6874's ABNF grammar, which was overly restrictive. RFC 4007 §11.2 specifies that zone IDs are OS-defined text strings with no format restriction (interface names like 'eth0', 'Ethernet (LAN)', and numeric indices are all valid). RFC 9844 §6.3 recommends rejecting characters inappropriate for the environment. 
For yarl this means ASCII control characters (CTL). Changes: - Replace _ZONE_ID_RE with _ZONE_ID_UNSAFE_RE that rejects CTL chars - Add an explicit empty-zone check (::1% is still invalid) - Update tests: remove 'spaces' from invalid cases, add valid cases - Update changelog to cite RFC 9844 §6.3 --- CHANGES/1655.bugfix.1.rst | 4 +++- tests/test_url.py | 36 ++++++++++++++---------------------- tests/test_url_build.py | 20 ++++++++++++++++++-- yarl/_url.py | 13 +++++++------ 4 files changed, 42 insertions(+), 31 deletions(-) diff --git a/CHANGES/1655.bugfix.1.rst b/CHANGES/1655.bugfix.1.rst index 2e82a62b1..a7e29c19e 100644 --- a/CHANGES/1655.bugfix.1.rst +++ b/CHANGES/1655.bugfix.1.rst @@ -1,4 +1,6 @@ Fixed :meth:`~yarl.URL.build` failing to validate characters in the zone ID portion of IPv6 addresses when ``validate_host=True``, allowing control -characters such as CR and LF to pass through into ``url.host`` +characters such as CR, LF, and NUL to pass through into ``url.host``. +Zone IDs now reject ASCII control characters per +`RFC 9844 §6.3 <https://datatracker.ietf.org/doc/html/rfc9844#section-6-3>`_ -- by :user:`rodrigobnogueira`. 
diff --git a/tests/test_url.py b/tests/test_url.py index 8f7dd2b16..61cfbe7cb 100644 --- a/tests/test_url.py +++ b/tests/test_url.py @@ -1808,8 +1808,8 @@ def test_to_idna() -> None: def test_from_ascii_login() -> None: - url = URL("http://" "%D0%B2%D0%B0%D1%81%D1%8F" "@host:1234/") - assert ("http://" "%D0%B2%D0%B0%D1%81%D1%8F" "@host:1234/") == str(url) + url = URL("http://%D0%B2%D0%B0%D1%81%D1%8F@host:1234/") + assert ("http://%D0%B2%D0%B0%D1%81%D1%8F@host:1234/") == str(url) def test_from_non_ascii_login() -> None: @@ -1843,16 +1843,16 @@ def test_from_non_ascii_login_and_password() -> None: def test_from_ascii_path() -> None: - url = URL("http://example.com/" "%D0%BF%D1%83%D1%82%D1%8C/%D1%82%D1%83%D0%B4%D0%B0") + url = URL("http://example.com/%D0%BF%D1%83%D1%82%D1%8C/%D1%82%D1%83%D0%B4%D0%B0") assert ( - "http://example.com/" "%D0%BF%D1%83%D1%82%D1%8C/%D1%82%D1%83%D0%B4%D0%B0" + "http://example.com/%D0%BF%D1%83%D1%82%D1%8C/%D1%82%D1%83%D0%B4%D0%B0" ) == str(url) def test_from_ascii_path_lower_case() -> None: - url = URL("http://example.com/" "%d0%bf%d1%83%d1%82%d1%8c/%d1%82%d1%83%d0%b4%d0%b0") + url = URL("http://example.com/%d0%bf%d1%83%d1%82%d1%8c/%d1%82%d1%83%d0%b4%d0%b0") assert ( - "http://example.com/" "%D0%BF%D1%83%D1%82%D1%8C/%D1%82%D1%83%D0%B4%D0%B0" + "http://example.com/%D0%BF%D1%83%D1%82%D1%8C/%D1%82%D1%83%D0%B4%D0%B0" ) == str(url) @@ -1873,23 +1873,17 @@ def test_bytes() -> None: def test_from_ascii_query_parts() -> None: url = URL( - "http://example.com/" - "?%D0%BF%D0%B0%D1%80%D0%B0%D0%BC" - "=%D0%B7%D0%BD%D0%B0%D1%87" + "http://example.com/?%D0%BF%D0%B0%D1%80%D0%B0%D0%BC=%D0%B7%D0%BD%D0%B0%D1%87" ) assert ( - "http://example.com/" - "?%D0%BF%D0%B0%D1%80%D0%B0%D0%BC" - "=%D0%B7%D0%BD%D0%B0%D1%87" + "http://example.com/?%D0%BF%D0%B0%D1%80%D0%B0%D0%BC=%D0%B7%D0%BD%D0%B0%D1%87" ) == str(url) def test_from_non_ascii_query_parts() -> None: url = URL("http://example.com/?парам=знач") assert ( - "http://example.com/" - 
"?%D0%BF%D0%B0%D1%80%D0%B0%D0%BC" - "=%D0%B7%D0%BD%D0%B0%D1%87" + "http://example.com/?%D0%BF%D0%B0%D1%80%D0%B0%D0%BC=%D0%B7%D0%BD%D0%B0%D1%87" ) == str(url) @@ -1899,16 +1893,16 @@ def test_from_non_ascii_query_parts2() -> None: def test_from_ascii_fragment() -> None: - url = URL("http://example.com/" "#%D1%84%D1%80%D0%B0%D0%B3%D0%BC%D0%B5%D0%BD%D1%82") + url = URL("http://example.com/#%D1%84%D1%80%D0%B0%D0%B3%D0%BC%D0%B5%D0%BD%D1%82") assert ( - "http://example.com/" "#%D1%84%D1%80%D0%B0%D0%B3%D0%BC%D0%B5%D0%BD%D1%82" + "http://example.com/#%D1%84%D1%80%D0%B0%D0%B3%D0%BC%D0%B5%D0%BD%D1%82" ) == str(url) def test_from_bytes_with_non_ascii_fragment() -> None: url = URL("http://example.com/#фрагмент") assert ( - "http://example.com/" "#%D1%84%D1%80%D0%B0%D0%B3%D0%BC%D0%B5%D0%BD%D1%82" + "http://example.com/#%D1%84%D1%80%D0%B0%D0%B3%D0%BC%D0%B5%D0%BD%D1%82" ) == str(url) @@ -1919,12 +1913,10 @@ def test_to_str() -> None: def test_to_str_long() -> None: url = URL( - "https://host-12345678901234567890123456789012345678901234567890" "-name:8888/" + "https://host-12345678901234567890123456789012345678901234567890-name:8888/" ) expected = ( - "https://host-" - "12345678901234567890123456789012345678901234567890" - "-name:8888/" + "https://host-12345678901234567890123456789012345678901234567890-name:8888/" ) assert expected == str(url) diff --git a/tests/test_url_build.py b/tests/test_url_build.py index 7bfef38cb..bf09d2b7b 100644 --- a/tests/test_url_build.py +++ b/tests/test_url_build.py @@ -40,9 +40,8 @@ def test_url_ipv4_in_ipv6() -> None: ( "\r\nX-Injected: evil", "\x00evil", - "zone with spaces", ), - ids=("crlf-injection", "null-byte", "spaces"), + ids=("crlf-injection", "null-byte"), ) def test_url_build_ipv6_zone_id_invalid_chars(zone: str) -> None: """Zone IDs with control characters must be rejected by validate_host.""" @@ -50,6 +49,23 @@ def test_url_build_ipv6_zone_id_invalid_chars(zone: str) -> None: URL.build(scheme="http", host=f"::1%{zone}", path="/") 
+@pytest.mark.parametrize( + "zone", + ( + "eth0", + "1", + "zone with spaces", + "Ethernet (LAN)", + "日本語", + ), + ids=("iface-name", "numeric", "spaces", "parens", "unicode"), +) +def test_url_build_ipv6_zone_id_valid(zone: str) -> None: + """Zone IDs accept any non-CTL text per RFC 4007 §11.2.""" + u = URL.build(scheme="http", host=f"::1%{zone}", path="/") + assert u.host == f"::1%{zone}" + + def test_build_with_scheme() -> None: u = URL.build(scheme="blob", path="path") assert str(u) == "blob:path" diff --git a/yarl/_url.py b/yarl/_url.py index 29979622c..36b850992 100644 --- a/yarl/_url.py +++ b/yarl/_url.py @@ -89,11 +89,12 @@ re.VERBOSE, ) -# Zone IDs in URIs are defined by RFC 6874 (obsoleted by RFC 9844 for UI usage): -# ZoneID = 1*( unreserved / pct-encoded ) -# https://www.rfc-editor.org/rfc/rfc6874#section-2 -# In practice, sub-delimiters are also used (e.g. eth0, Ethernet+1). -_ZONE_ID_RE = re.compile(r"^[A-Za-z0-9._~!$&'()*+,;=%-]+$") +# Zone IDs are OS-specific text strings with no format defined by the RFCs: +# https://datatracker.ietf.org/doc/html/rfc4007#section-11.2 +# RFC 9844 §6.3 recommends rejecting characters inappropriate for the +# environment; for yarl we reject ASCII control characters (CTL): +# https://datatracker.ietf.org/doc/html/rfc9844#section-6-3 +_ZONE_ID_UNSAFE_RE = re.compile(r"[\x00-\x1f\x7f]") _T = TypeVar("_T") @@ -1580,7 +1581,7 @@ def _encode_host(host: str, validate_host: bool) -> str: except ValueError: pass else: - if sep and validate_host and not _ZONE_ID_RE.match(zone): + if sep and validate_host and (not zone or _ZONE_ID_UNSAFE_RE.search(zone)): raise ValueError(f"Invalid characters in IPv6 zone ID: {zone!r}") # These checks should not happen in the # LRU to keep the cache size small