Coverage for src/httpx/_urlparse.py: 72%
191 statements
« prev ^ index » next coverage.py v7.6.12, created at 2025-05-23 21:03 +0100
« prev ^ index » next coverage.py v7.6.12, created at 2025-05-23 21:03 +0100
1"""
2An implementation of `urlparse` that provides URL validation and normalization
3as described by RFC3986.
5We rely on this implementation rather than the one in Python's stdlib, because:
7* It provides more complete URL validation.
8* It properly differentiates between an empty querystring and an absent querystring,
9 to distinguish URLs with a trailing '?'.
10* It handles scheme, hostname, port, and path normalization.
11* It supports IDNA hostnames, normalizing them to their encoded form.
12* The API supports passing individual components, as well as the complete URL string.
14Previously we relied on the excellent `rfc3986` package to handle URL parsing and
15validation, but this module provides a simpler alternative, with less indirection
16required.
17"""
19import ipaddress
20import re
21import typing
class InvalidURL(ValueError):
    """Raised when a URL, or one of its components, fails validation."""
# Hard upper bound on the length of a URL, and of any individual component.
MAX_URL_LENGTH = 65536

# Characters that never need to be percent-escaped.
# https://datatracker.ietf.org/doc/html/rfc3986.html#section-2.3
UNRESERVED_CHARACTERS = (
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "abcdefghijklmnopqrstuvwxyz"
    "0123456789"
    "-._~"
)
SUB_DELIMS = "!$&'()*+,;="

# Matches a single, already percent-encoded '%xx' escape sequence.
PERCENT_ENCODED_REGEX = re.compile("%[A-Fa-f0-9]{2}")

# https://url.spec.whatwg.org/#percent-encoded-bytes
#
# Each "*_SAFE" constant below holds the printable ASCII characters
# (U+0020 to U+007E) that are NOT in the corresponding WHATWG percent-encode
# set, in ascending code point order.

# The fragment percent-encode set is the C0 control percent-encode set
# and U+0020 SPACE, U+0022 ("), U+003C (<), U+003E (>), and U+0060 (`).
FRAG_SAFE = "".join(
    char for char in map(chr, range(0x20, 0x7F)) if char not in ' "<>`'
)

# The query percent-encode set is the C0 control percent-encode set
# and U+0020 SPACE, U+0022 ("), U+0023 (#), U+003C (<), and U+003E (>).
QUERY_SAFE = "".join(
    char for char in map(chr, range(0x20, 0x7F)) if char not in ' "#<>'
)

# The path percent-encode set is the query percent-encode set
# and U+003F (?), U+0060 (`), U+007B ({), and U+007D (}).
PATH_SAFE = "".join(
    char
    for char in map(chr, range(0x20, 0x7F))
    if char not in ' "#<>' + "?`{}"
)

# The userinfo percent-encode set is the path percent-encode set
# and U+002F (/), U+003A (:), U+003B (;), U+003D (=), U+0040 (@),
# U+005B ([) to U+005E (^), inclusive, and U+007C (|).
USERNAME_SAFE = "".join(
    char
    for char in map(chr, range(0x20, 0x7F))
    if char not in ' "#<>' + "?`{}" + "/:;=@[\\]^|"
)
# Passwords are quoted with exactly the same safe set as usernames.
PASSWORD_SAFE = USERNAME_SAFE

# Note... The terminology 'userinfo' percent-encode set in the WHATWG document
# is used for the username and password quoting. The joint userinfo component
# has the form "username:password", so for it U+003A (:) is dropped from the
# percent-encode set, i.e. ':' is additionally treated as safe.
USERINFO_SAFE = "".join(
    char
    for char in map(chr, range(0x20, 0x7F))
    if char not in ' "#<>' + "?`{}" + "/;=@[\\]^|"
)
# A URL reference is made up of five components, in this order:
#
#   {scheme}:       (optional)
#   //{authority}   (optional)
#   {path}
#   ?{query}        (optional)
#   #{fragment}     (optional)
URL_REGEX = re.compile(
    r"(?:(?P<scheme>([a-zA-Z][a-zA-Z0-9+.-]*)?):)?"
    r"(?://(?P<authority>[^/?#]*))?"
    r"(?P<path>[^?#]*)"
    r"(?:\?(?P<query>[^#]*))?"
    r"(?:#(?P<fragment>.*))?"
)

# The authority component is in turn made up of:
#
#   {userinfo}@     (optional)
#   {host}
#   :{port}         (optional)
AUTHORITY_REGEX = re.compile(
    # userinfo: Any character sequence.
    r"(?:(?P<userinfo>.*)@)?"
    # host: Either an IPv6 address enclosed within square brackets,
    # or any character sequence excluding ':' or '@'.
    r"(?P<host>(\[.*\]|[^:@]*))"
    # port: Any character sequence.
    r":?(?P<port>.*)?"
)


# If we call urlparse with an individual component, then we need to regex
# validate that component individually.
# These largely repeat the sub-patterns used above, although "userinfo"
# and "host" are expressed slightly differently here.
COMPONENT_REGEX = {
    name: re.compile(pattern)
    for name, pattern in (
        ("scheme", r"([a-zA-Z][a-zA-Z0-9+.-]*)?"),
        ("authority", r"[^/?#]*"),
        ("path", r"[^?#]*"),
        ("query", r"[^#]*"),
        ("fragment", r".*"),
        ("userinfo", r"[^@]*"),
        ("host", r"(\[.*\]|[^:]*)"),
        ("port", r".*"),
    )
}


# We use these simple regexs as a first pass before handing off to
# the stdlib 'ipaddress' module for IP address validation.
IPv4_STYLE_HOSTNAME = re.compile(r"^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$")
IPv6_STYLE_HOSTNAME = re.compile(r"^\[.*\]$")
157class ParseResult(typing.NamedTuple):
158 scheme: str
159 userinfo: str
160 host: str
161 port: int | None
162 path: str
163 query: str | None
164 fragment: str | None
166 @property
167 def authority(self) -> str:
168 return "".join(
169 [
170 f"{self.userinfo}@" if self.userinfo else "",
171 f"[{self.host}]" if ":" in self.host else self.host,
172 f":{self.port}" if self.port is not None else "",
173 ]
174 )
176 @property
177 def netloc(self) -> str:
178 return "".join(
179 [
180 f"[{self.host}]" if ":" in self.host else self.host,
181 f":{self.port}" if self.port is not None else "",
182 ]
183 )
185 def copy_with(self, **kwargs: str | None) -> "ParseResult":
186 if not kwargs:
187 return self
189 defaults = {
190 "scheme": self.scheme,
191 "authority": self.authority,
192 "path": self.path,
193 "query": self.query,
194 "fragment": self.fragment,
195 }
196 defaults.update(kwargs)
197 return urlparse("", **defaults)
199 def __str__(self) -> str:
200 authority = self.authority
201 return "".join(
202 [
203 f"{self.scheme}:" if self.scheme else "",
204 f"//{authority}" if authority else "",
205 self.path,
206 f"?{self.query}" if self.query is not None else "",
207 f"#{self.fragment}" if self.fragment is not None else "",
208 ]
209 )
def urlparse(url: str = "", **kwargs: str | None) -> ParseResult:
    """
    Parse, validate, and normalize a URL, returning a `ParseResult`.

    * `url`: The URL string to parse. May be left empty when the URL is
      described entirely by keyword arguments.
    * `kwargs`: Individual components, which take precedence over whatever
      was parsed out of `url`. The components validated directly are
      "scheme", "authority", "path", "query", "fragment", "userinfo", "host",
      and "port". The convenience keys "netloc", "username", "password", and
      "raw_path" are translated into those components first.

    Raises `InvalidURL` if the URL or a component is too long, contains a
    non-printable ASCII character, or fails validation.
    """
    # Initial basic checks on allowable URLs.
    # ---------------------------------------

    # Hard limit the maximum allowable URL length.
    if len(url) > MAX_URL_LENGTH:
        raise InvalidURL("URL too long")

    # If a URL includes any ASCII control characters including \t, \r, \n,
    # then treat it as invalid. Report the first offending character.
    for idx, char in enumerate(url):
        if char.isascii() and not char.isprintable():
            error = (
                "Invalid non-printable ASCII character in URL, "
                f"{char!r} at position {idx}."
            )
            raise InvalidURL(error)

    # Some keyword arguments require special handling.
    # ------------------------------------------------

    # Coerce "port" to a string, if it is provided as an integer.
    if "port" in kwargs:
        port = kwargs["port"]
        kwargs["port"] = str(port) if isinstance(port, int) else port

    # Replace "netloc" with "host" and "port".
    if "netloc" in kwargs:
        netloc = kwargs.pop("netloc") or ""
        if netloc.startswith("["):
            # Bracketed IPv6 literal, such as "[::1]:8080". The host itself
            # contains ':' characters, so the port separator is the ':' that
            # follows the closing bracket, not the first ':' in the string.
            literal, bracket, remainder = netloc.partition("]")
            kwargs["host"] = literal + bracket
            kwargs["port"] = remainder[1:] if remainder[:1] == ":" else remainder
        else:
            kwargs["host"], _, kwargs["port"] = netloc.partition(":")

    # Replace "username" and/or "password" with "userinfo".
    if "username" in kwargs or "password" in kwargs:
        username = quote(kwargs.pop("username", "") or "", safe=USERNAME_SAFE)
        password = quote(kwargs.pop("password", "") or "", safe=PASSWORD_SAFE)
        kwargs["userinfo"] = f"{username}:{password}" if password else username

    # Replace "raw_path" with "path" and "query".
    if "raw_path" in kwargs:
        raw_path = kwargs.pop("raw_path") or ""
        kwargs["path"], separator, kwargs["query"] = raw_path.partition("?")
        if not separator:
            # No '?' at all means an absent query, rather than an empty one.
            kwargs["query"] = None

    # Ensure that IPv6 "host" addresses are always escaped with "[...]".
    if "host" in kwargs:
        host = kwargs.get("host") or ""
        if ":" in host and not (host.startswith("[") and host.endswith("]")):
            kwargs["host"] = f"[{host}]"

    # If any keyword arguments are provided, ensure they are valid.
    # -------------------------------------------------------------

    for key, value in kwargs.items():
        if value is None:
            continue

        if len(value) > MAX_URL_LENGTH:
            raise InvalidURL(f"URL component '{key}' too long")

        # If a component includes any ASCII control characters including
        # \t, \r, \n, then treat it as invalid.
        for idx, char in enumerate(value):
            if char.isascii() and not char.isprintable():
                error = (
                    f"Invalid non-printable ASCII character in URL {key} component, "
                    f"{char!r} at position {idx}."
                )
                raise InvalidURL(error)

        # Ensure that keyword arguments match as a valid regex.
        if not COMPONENT_REGEX[key].fullmatch(value):
            raise InvalidURL(f"Invalid URL component '{key}'")

    # The URL_REGEX will always match, but may have empty components.
    url_match = URL_REGEX.match(url)
    assert url_match is not None
    url_dict = url_match.groupdict()

    # * 'scheme', 'authority', and 'path' may be empty strings.
    # * 'query' may be 'None', indicating no trailing "?" portion.
    #   Any string including the empty string, indicates a trailing "?".
    # * 'fragment' may be 'None', indicating no trailing "#" portion.
    #   Any string including the empty string, indicates a trailing "#".
    scheme = kwargs.get("scheme", url_dict["scheme"]) or ""
    authority = kwargs.get("authority", url_dict["authority"]) or ""
    path = kwargs.get("path", url_dict["path"]) or ""
    query = kwargs.get("query", url_dict["query"])
    frag = kwargs.get("fragment", url_dict["fragment"])

    # The AUTHORITY_REGEX will always match, but may have empty components.
    authority_match = AUTHORITY_REGEX.match(authority)
    assert authority_match is not None
    authority_dict = authority_match.groupdict()

    # * 'userinfo' and 'host' may be empty strings.
    # * 'port' may be 'None'.
    userinfo = kwargs.get("userinfo", authority_dict["userinfo"]) or ""
    host = kwargs.get("host", authority_dict["host"]) or ""
    port = kwargs.get("port", authority_dict["port"])

    # Normalize and validate each component.
    # We end up with a parsed representation of the URL,
    # with components that are plain ASCII bytestrings.
    parsed_scheme: str = scheme.lower()
    parsed_userinfo: str = quote(userinfo, safe=USERINFO_SAFE)
    parsed_host: str = encode_host(host)
    # Use the lowercased scheme here, so that the scheme's default port is
    # also dropped for URLs such as "HTTP://example.com:80".
    parsed_port: int | None = normalize_port(port, parsed_scheme)

    has_scheme = parsed_scheme != ""
    has_authority = (
        parsed_userinfo != "" or parsed_host != "" or parsed_port is not None
    )
    validate_path(path, has_scheme=has_scheme, has_authority=has_authority)
    if has_scheme or has_authority:
        path = normalize_path(path)

    parsed_path: str = quote(path, safe=PATH_SAFE)
    parsed_query: str | None = None if query is None else quote(query, safe=QUERY_SAFE)
    parsed_frag: str | None = None if frag is None else quote(frag, safe=FRAG_SAFE)

    # The parsed ASCII bytestrings are our canonical form.
    # All properties of the URL are derived from these.
    return ParseResult(
        parsed_scheme,
        parsed_userinfo,
        parsed_host,
        parsed_port,
        parsed_path,
        parsed_query,
        parsed_frag,
    )
def encode_host(host: str) -> str:
    """
    Validate a URL host and return it in normalized form.

    * An empty host is returned as-is.
    * IPv4-style hosts (#.#.#.#) are validated and returned unchanged.
    * IPv6-style hosts ([...]) are validated and returned without the
      enclosing square brackets.
    * Other ASCII hosts are lowercased and percent-escaped.
    * Non-ASCII hosts are lowercased and IDNA encoded, which requires the
      optional 'idna' package.

    Raises `InvalidURL` if the host fails validation, or if an IDNA hostname
    is given while the 'idna' package is not installed.
    """
    if not host:
        return ""

    if IPv4_STYLE_HOSTNAME.match(host):
        # From https://datatracker.ietf.org/doc/html/rfc3986/#section-3.2.2
        #
        # IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet
        try:
            ipaddress.IPv4Address(host)
        except ipaddress.AddressValueError:
            raise InvalidURL(f"Invalid IPv4 address: {host!r}")
        return host

    if IPv6_STYLE_HOSTNAME.match(host):
        # From https://datatracker.ietf.org/doc/html/rfc3986/#section-3.2.2
        #
        # "A host identified by an Internet Protocol literal address, version 6
        # [RFC3513] or later, is distinguished by enclosing the IP literal
        # within square brackets ("[" and "]"). This is the only place where
        # square bracket characters are allowed in the URI syntax."
        address = host[1:-1]
        try:
            ipaddress.IPv6Address(address)
        except ipaddress.AddressValueError:
            raise InvalidURL(f"Invalid IPv6 address: {host!r}")
        return address

    if host.isascii():
        # Regular ASCII hostnames.
        #
        # From https://datatracker.ietf.org/doc/html/rfc3986/#section-3.2.2
        #
        # reg-name = *( unreserved / pct-encoded / sub-delims )
        #
        # A handful of further characters are also left unescaped.
        extra_safe = '"`{}%|\\'
        return quote(host.lower(), safe=SUB_DELIMS + extra_safe)

    # Anything else is an IDNA hostname. The 'idna' package is only imported
    # at this point, so it is only required for URLs that actually need it.
    try:
        import idna  # type: ignore
    except ImportError:
        raise InvalidURL(
            f"Cannot handle URL with IDNA hostname: {host!r}. "
            f"Package 'idna' is not installed."
        )

    try:
        return idna.encode(host.lower()).decode("ascii")
    except idna.IDNAError:
        raise InvalidURL(f"Invalid IDNA hostname: {host!r}")
402def normalize_port(port: str | int | None, scheme: str) -> int | None:
403 # From https://tools.ietf.org/html/rfc3986#section-3.2.3
404 #
405 # "A scheme may define a default port. For example, the "http" scheme
406 # defines a default port of "80", corresponding to its reserved TCP
407 # port number. The type of port designated by the port number (e.g.,
408 # TCP, UDP, SCTP) is defined by the URI scheme. URI producers and
409 # normalizers should omit the port component and its ":" delimiter if
410 # port is empty or if its value would be the same as that of the
411 # scheme's default."
412 if port is None or port == "":
413 return None
415 try:
416 port_as_int = int(port)
417 except ValueError:
418 raise InvalidURL(f"Invalid port: {port!r}")
420 # See https://url.spec.whatwg.org/#url-miscellaneous
421 default_port = {"ftp": 21, "http": 80, "https": 443, "ws": 80, "wss": 443}.get(
422 scheme
423 )
424 if port_as_int == default_port:
425 return None
426 return port_as_int
def validate_path(path: str, has_scheme: bool, has_authority: bool) -> None:
    """
    Check a URL path against the rules that depend on whether the URL
    also has a scheme or an authority component.

    Raises `InvalidURL` if the path is not allowed in that context.

    See https://datatracker.ietf.org/doc/html/rfc3986.html#section-3.3
    """
    if has_authority:
        # "If a URI contains an authority component, then the path component
        # must either be empty or begin with a slash ("/") character."
        if path and not path.startswith("/"):
            raise InvalidURL("For absolute URLs, path must be empty or begin with '/'")
        return

    if has_scheme:
        # The remaining rules are only applied to relative references.
        return

    # "If a URI does not contain an authority component, then the path cannot
    # begin with two slash characters ("//")."
    if path.startswith("//"):
        raise InvalidURL("Relative URLs cannot have a path starting with '//'")

    # "In addition, a URI reference (Section 4.1) may be a relative-path
    # reference, in which case the first path segment cannot contain a colon
    # (":") character." Only a leading colon is rejected here.
    if path.startswith(":"):
        raise InvalidURL("Relative URLs cannot have a path starting with ':'")
def normalize_path(path: str) -> str:
    """
    Remove "." and ".." segments from a URL path.

    A ".." segment also removes the segment before it, except that the
    leading empty segment of an absolute path is never removed.

    For example:

    normalize_path("/path/./to/somewhere/..") == "/path/to"
    """
    # A path without any '.' characters cannot contain dot segments.
    if "." not in path:
        return path

    segments = path.split("/")

    # Nothing to do if no segment is exactly '.' or '..'.
    if not any(segment in (".", "..") for segment in segments):
        return path

    # https://datatracker.ietf.org/doc/html/rfc3986#section-5.2.4
    kept: list[str] = []
    for segment in segments:
        if segment == ".":
            continue
        if segment != "..":
            kept.append(segment)
        elif kept and kept != [""]:
            # Step back up one level, but never above the root.
            kept.pop()
    return "/".join(kept)
def PERCENT(string: str) -> str:
    """
    Percent-encode every byte of the UTF-8 encoding of `string`,
    using uppercase hexadecimal digits.
    """
    return "".join(map("%{:02X}".format, string.encode("utf-8")))
def percent_encoded(string: str, safe: str) -> str:
    """
    Percent-encode every character of `string` that is neither an
    unreserved character nor listed in `safe`.
    """
    allowed = UNRESERVED_CHARACTERS + safe

    # Fast path: stripping the allowed characters leaves nothing behind,
    # so the string consists only of characters that need no escaping.
    if string.rstrip(allowed) == "":
        return string

    return "".join(
        PERCENT(char) if char not in allowed else char for char in string
    )
def quote(string: str, safe: str) -> str:
    """
    Percent-encode a string, leaving any existing '%xx' escape sequences as-is.

    See: https://www.rfc-editor.org/rfc/rfc3986#section-2.1

    * `string`: The string to be percent-escaped.
    * `safe`: A string containing characters that may be treated as safe, and do not
        need to be escaped. Unreserved characters are always treated as safe.
        See: https://www.rfc-editor.org/rfc/rfc3986#section-2.3
    """
    pieces = []
    cursor = 0
    for escape in PERCENT_ENCODED_REGEX.finditer(string):
        # Encode whatever text sits between the previous escape sequence
        # (or the start of the string) and this one.
        if escape.start() != cursor:
            pieces.append(percent_encoded(string[cursor : escape.start()], safe=safe))

        # The '%xx' escape sequence itself is copied through untouched.
        pieces.append(escape.group(0))
        cursor = escape.end()

    # Encode any text after the final '%xx' escape sequence.
    if cursor != len(string):
        pieces.append(percent_encoded(string[cursor:], safe=safe))

    return "".join(pieces)