diff --git a/CHANGES/1655.bugfix.1.rst b/CHANGES/1655.bugfix.1.rst new file mode 100644 index 000000000..a7e29c19e --- /dev/null +++ b/CHANGES/1655.bugfix.1.rst @@ -0,0 +1,6 @@ +Fixed :meth:`~yarl.URL.build` failing to validate characters in the zone ID +portion of IPv6 addresses when ``validate_host=True``, allowing control +characters such as CR, LF, and NUL to pass through into ``url.host``. +Zone IDs now reject ASCII control characters per +`RFC 9844 §6.3 <https://datatracker.ietf.org/doc/html/rfc9844#section-6-3>`_ +-- by :user:`rodrigobnogueira`. diff --git a/CHANGES/1655.bugfix.2.rst b/CHANGES/1655.bugfix.2.rst new file mode 100644 index 000000000..fd7005f8b --- /dev/null +++ b/CHANGES/1655.bugfix.2.rst @@ -0,0 +1,5 @@ +Fixed ``_check_netloc()`` missing ``%`` from its NFKC normalization character +check, which allowed Unicode characters U+FF05 (FULLWIDTH PERCENT SIGN) and +U+FE6A (SMALL PERCENT SIGN) to produce a literal ``%`` in ``url.host`` via +the standard library IDNA fallback +-- by :user:`rodrigobnogueira`. diff --git a/tests/test_url.py b/tests/test_url.py index 37871fedb..61cfbe7cb 100644 --- a/tests/test_url.py +++ b/tests/test_url.py @@ -12,6 +12,8 @@ _VERTICAL_COLON = "\ufe13" # normalizes to ":" _FULL_WITH_NUMBER_SIGN = "\uff03" # normalizes to "#" _ACCOUNT_OF = "\u2100" # normalizes to "a/c" +_FULLWIDTH_PERCENT = "\uff05" # normalizes to "%" +_SMALL_PERCENT = "\ufe6a" # normalizes to "%" def test_inheritance() -> None: @@ -1806,8 +1808,8 @@ def test_to_idna() -> None: def test_from_ascii_login() -> None: - url = URL("http://" "%D0%B2%D0%B0%D1%81%D1%8F" "@host:1234/") - assert ("http://" "%D0%B2%D0%B0%D1%81%D1%8F" "@host:1234/") == str(url) + url = URL("http://%D0%B2%D0%B0%D1%81%D1%8F@host:1234/") + assert ("http://%D0%B2%D0%B0%D1%81%D1%8F@host:1234/") == str(url) def test_from_non_ascii_login() -> None: @@ -1841,16 +1843,16 @@ def test_from_non_ascii_login_and_password() -> None: def test_from_ascii_path() -> None: - url = URL("http://example.com/" "%D0%BF%D1%83%D1%82%D1%8C/%D1%82%D1%83%D0%B4%D0%B0") + 
url = URL("http://example.com/%D0%BF%D1%83%D1%82%D1%8C/%D1%82%D1%83%D0%B4%D0%B0") assert ( - "http://example.com/" "%D0%BF%D1%83%D1%82%D1%8C/%D1%82%D1%83%D0%B4%D0%B0" + "http://example.com/%D0%BF%D1%83%D1%82%D1%8C/%D1%82%D1%83%D0%B4%D0%B0" ) == str(url) def test_from_ascii_path_lower_case() -> None: - url = URL("http://example.com/" "%d0%bf%d1%83%d1%82%d1%8c/%d1%82%d1%83%d0%b4%d0%b0") + url = URL("http://example.com/%d0%bf%d1%83%d1%82%d1%8c/%d1%82%d1%83%d0%b4%d0%b0") assert ( - "http://example.com/" "%D0%BF%D1%83%D1%82%D1%8C/%D1%82%D1%83%D0%B4%D0%B0" + "http://example.com/%D0%BF%D1%83%D1%82%D1%8C/%D1%82%D1%83%D0%B4%D0%B0" ) == str(url) @@ -1871,23 +1873,17 @@ def test_bytes() -> None: def test_from_ascii_query_parts() -> None: url = URL( - "http://example.com/" - "?%D0%BF%D0%B0%D1%80%D0%B0%D0%BC" - "=%D0%B7%D0%BD%D0%B0%D1%87" + "http://example.com/?%D0%BF%D0%B0%D1%80%D0%B0%D0%BC=%D0%B7%D0%BD%D0%B0%D1%87" ) assert ( - "http://example.com/" - "?%D0%BF%D0%B0%D1%80%D0%B0%D0%BC" - "=%D0%B7%D0%BD%D0%B0%D1%87" + "http://example.com/?%D0%BF%D0%B0%D1%80%D0%B0%D0%BC=%D0%B7%D0%BD%D0%B0%D1%87" ) == str(url) def test_from_non_ascii_query_parts() -> None: url = URL("http://example.com/?парам=знач") assert ( - "http://example.com/" - "?%D0%BF%D0%B0%D1%80%D0%B0%D0%BC" - "=%D0%B7%D0%BD%D0%B0%D1%87" + "http://example.com/?%D0%BF%D0%B0%D1%80%D0%B0%D0%BC=%D0%B7%D0%BD%D0%B0%D1%87" ) == str(url) @@ -1897,16 +1893,16 @@ def test_from_non_ascii_query_parts2() -> None: def test_from_ascii_fragment() -> None: - url = URL("http://example.com/" "#%D1%84%D1%80%D0%B0%D0%B3%D0%BC%D0%B5%D0%BD%D1%82") + url = URL("http://example.com/#%D1%84%D1%80%D0%B0%D0%B3%D0%BC%D0%B5%D0%BD%D1%82") assert ( - "http://example.com/" "#%D1%84%D1%80%D0%B0%D0%B3%D0%BC%D0%B5%D0%BD%D1%82" + "http://example.com/#%D1%84%D1%80%D0%B0%D0%B3%D0%BC%D0%B5%D0%BD%D1%82" ) == str(url) def test_from_bytes_with_non_ascii_fragment() -> None: url = URL("http://example.com/#фрагмент") assert ( - "http://example.com/" 
"#%D1%84%D1%80%D0%B0%D0%B3%D0%BC%D0%B5%D0%BD%D1%82" + "http://example.com/#%D1%84%D1%80%D0%B0%D0%B3%D0%BC%D0%B5%D0%BD%D1%82" ) == str(url) @@ -1917,12 +1913,10 @@ def test_to_str() -> None: def test_to_str_long() -> None: url = URL( - "https://host-12345678901234567890123456789012345678901234567890" "-name:8888/" + "https://host-12345678901234567890123456789012345678901234567890-name:8888/" ) expected = ( - "https://host-" - "12345678901234567890123456789012345678901234567890" - "-name:8888/" + "https://host-12345678901234567890123456789012345678901234567890-name:8888/" ) assert expected == str(url) @@ -2465,3 +2459,16 @@ def test_url_with_invalid_unicode(disallowed_unicode: str) -> None: ValueError, match="contains invalid characters under NFKC normalization" ): URL(f"http://example.{disallowed_unicode}.com/frag") + + +@pytest.mark.parametrize( + "percent_char", + [_FULLWIDTH_PERCENT, _SMALL_PERCENT], + ids=["fullwidth-percent-U+FF05", "small-percent-U+FE6A"], +) +def test_url_with_fullwidth_percent_rejected(percent_char: str) -> None: + """NFKC normalization of fullwidth/small percent signs must be caught.""" + with pytest.raises( + ValueError, match="contains invalid characters under NFKC normalization" + ): + URL(f"http://evil.com{percent_char}2e.internal/") diff --git a/tests/test_url_build.py b/tests/test_url_build.py index d80fe08c6..bf09d2b7b 100644 --- a/tests/test_url_build.py +++ b/tests/test_url_build.py @@ -35,6 +35,37 @@ def test_url_ipv4_in_ipv6() -> None: assert str(u) == "http://[2001:db8:122:344::c000:221]" +@pytest.mark.parametrize( + "zone", + ( + "\r\nX-Injected: evil", + "\x00evil", + ), + ids=("crlf-injection", "null-byte"), +) +def test_url_build_ipv6_zone_id_invalid_chars(zone: str) -> None: + """Zone IDs with control characters must be rejected by validate_host.""" + with pytest.raises(ValueError, match="Invalid characters in IPv6 zone ID"): + URL.build(scheme="http", host=f"::1%{zone}", path="/") + + +@pytest.mark.parametrize( + "zone", + 
( + "eth0", + "1", + "zone with spaces", + "Ethernet (LAN)", + "日本語", + ), + ids=("iface-name", "numeric", "spaces", "parens", "unicode"), +) +def test_url_build_ipv6_zone_id_valid(zone: str) -> None: + """Zone IDs accept any non-CTL text per RFC 4007 §11.2.""" + u = URL.build(scheme="http", host=f"::1%{zone}", path="/") + assert u.host == f"::1%{zone}" + + def test_build_with_scheme() -> None: u = URL.build(scheme="blob", path="path") assert str(u) == "blob:path" diff --git a/yarl/_parse.py b/yarl/_parse.py index bb64165c7..777ffcca5 100644 --- a/yarl/_parse.py +++ b/yarl/_parse.py @@ -96,7 +96,7 @@ def _check_netloc(netloc: str) -> None: # Note that there are no unicode decompositions for the character '@' so # its currently impossible to have test coverage for this branch, however if the # one should be added in the future we want to make sure its still checked. - for c in "/?#@:": # pragma: no branch + for c in "/?#@:%": # pragma: no branch if c in normalized_netloc: raise ValueError( f"netloc '{netloc}' contains invalid " diff --git a/yarl/_url.py b/yarl/_url.py index 5a79fe1f8..36b850992 100644 --- a/yarl/_url.py +++ b/yarl/_url.py @@ -89,6 +89,13 @@ re.VERBOSE, ) +# Zone IDs are OS-specific text strings with no format defined by the RFCs: +# https://datatracker.ietf.org/doc/html/rfc4007#section-11.2 +# RFC 9844 §6.3 recommends rejecting characters inappropriate for the +# environment; for yarl we reject ASCII control characters (CTL): +# https://datatracker.ietf.org/doc/html/rfc9844#section-6-3 +_ZONE_ID_UNSAFE_RE = re.compile(r"[\x00-\x1f\x7f]") + _T = TypeVar("_T") if sys.version_info >= (3, 11): @@ -1574,6 +1581,8 @@ def _encode_host(host: str, validate_host: bool) -> str: except ValueError: pass else: + if sep and validate_host and (not zone or _ZONE_ID_UNSAFE_RE.search(zone)): + raise ValueError(f"Invalid characters in IPv6 zone ID: {zone!r}") # These checks should not happen in the # LRU to keep the cache size small host = ip.compressed