Coverage for src/httpx/_urlparse.py: 72%

191 statements  

« prev     ^ index     » next       coverage.py v7.6.12, created at 2025-05-23 21:03 +0100

1""" 

2An implementation of `urlparse` that provides URL validation and normalization 

3as described by RFC3986. 

4 

5We rely on this implementation rather than the one in Python's stdlib, because: 

6 

7* It provides more complete URL validation. 

8* It properly differentiates between an empty querystring and an absent querystring, 

9 to distinguish URLs with a trailing '?'. 

10* It handles scheme, hostname, port, and path normalization. 

11* It supports IDNA hostnames, normalizing them to their encoded form. 

12* The API supports passing individual components, as well as the complete URL string. 

13 

14Previously we relied on the excellent `rfc3986` package to handle URL parsing and 

15validation, but this module provides a simpler alternative, with less indirection 

16required. 

17""" 

18 

19import ipaddress 

20import re 

21import typing 

22 

23 

class InvalidURL(ValueError):
    """Raised when a URL, or one of its components, fails validation."""

26 

27 

# Hard upper bound on the length of a URL, or of any single URL component.
MAX_URL_LENGTH = 65536

# https://datatracker.ietf.org/doc/html/rfc3986.html#section-2.3
UNRESERVED_CHARACTERS = (
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-._~"
)
SUB_DELIMS = "!$&'()*+,;="

# Matches an existing '%xx' escape sequence.
PERCENT_ENCODED_REGEX = re.compile("%[A-Fa-f0-9]{2}")

# https://url.spec.whatwg.org/#percent-encoded-bytes
#
# Each "*_SAFE" constant below holds the printable ASCII characters
# (U+0020 to U+007E) that are *not* in the corresponding percent-encode set,
# in ascending code point order.

# The fragment percent-encode set is the C0 control percent-encode set
# and U+0020 SPACE, U+0022 ("), U+003C (<), U+003E (>), and U+0060 (`).
FRAG_SAFE = "".join(
    chr(code) for code in range(0x20, 0x7F) if chr(code) not in ' "<>`'
)

# The query percent-encode set is the C0 control percent-encode set
# and U+0020 SPACE, U+0022 ("), U+0023 (#), U+003C (<), and U+003E (>).
QUERY_SAFE = "".join(
    chr(code) for code in range(0x20, 0x7F) if chr(code) not in ' "#<>'
)

# The path percent-encode set is the query percent-encode set
# and U+003F (?), U+0060 (`), U+007B ({), and U+007D (}).
PATH_SAFE = "".join(
    chr(code) for code in range(0x20, 0x7F) if chr(code) not in ' "#<>' + "?`{}"
)

# The userinfo percent-encode set is the path percent-encode set
# and U+002F (/), U+003A (:), U+003B (;), U+003D (=), U+0040 (@),
# U+005B ([) to U+005E (^), inclusive, and U+007C (|).
USERNAME_SAFE = "".join(
    chr(code)
    for code in range(0x20, 0x7F)
    if chr(code) not in ' "#<>' + "?`{}" + "/:;=@[\\]^|"
)
PASSWORD_SAFE = "".join(
    chr(code)
    for code in range(0x20, 0x7F)
    if chr(code) not in ' "#<>' + "?`{}" + "/:;=@[\\]^|"
)
# Note... The terminology 'userinfo' percent-encode set in the WHATWG document
# is used for the username and password quoting. For the joint userinfo component
# U+003A (:) is left out of the percent-encode set, so that the ':' separating
# the username from the password is treated as safe and is not escaped.
USERINFO_SAFE = "".join(
    chr(code)
    for code in range(0x20, 0x7F)
    if chr(code) not in ' "#<>' + "?`{}" + "/;=@[\\]^|"
)

98 

99 

# Top-level URL structure:
#
# {scheme}:      (optional)
# //{authority}  (optional)
# {path}
# ?{query}       (optional)
# #{fragment}    (optional)
URL_REGEX = re.compile(
    r"(?:(?P<scheme>([a-zA-Z][a-zA-Z0-9+.-]*)?):)?"
    r"(?://(?P<authority>[^/?#]*))?"
    r"(?P<path>[^?#]*)"
    r"(?:\?(?P<query>[^#]*))?"
    r"(?:#(?P<fragment>.*))?"
)

# Authority structure:
#
# {userinfo}@    (optional)
# {host}
# :{port}        (optional)
AUTHORITY_REGEX = re.compile(
    # userinfo: any character sequence.
    r"(?:(?P<userinfo>.*)@)?"
    # host: either an IPv6 address enclosed within square brackets,
    # or any character sequence excluding ':' or '@'.
    r"(?P<host>(\[.*\]|[^:@]*))"
    # port: any character sequence.
    r":?(?P<port>.*)?"
)


# If we call urlparse with an individual component, then we need to regex
# validate that component individually.
# Note that the patterns here mostly repeat the ones embedded above.
COMPONENT_REGEX = {
    name: re.compile(pattern)
    for name, pattern in (
        ("scheme", r"([a-zA-Z][a-zA-Z0-9+.-]*)?"),
        ("authority", r"[^/?#]*"),
        ("path", r"[^?#]*"),
        ("query", r"[^#]*"),
        ("fragment", r".*"),
        ("userinfo", r"[^@]*"),
        ("host", r"(\[.*\]|[^:]*)"),
        ("port", r".*"),
    )
}


# We use these simple regexs as a first pass before handing off to
# the stdlib 'ipaddress' module for IP address validation.
IPv4_STYLE_HOSTNAME = re.compile(r"^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$")
IPv6_STYLE_HOSTNAME = re.compile(r"^\[.*\]$")

155 

156 

157class ParseResult(typing.NamedTuple): 

158 scheme: str 

159 userinfo: str 

160 host: str 

161 port: int | None 

162 path: str 

163 query: str | None 

164 fragment: str | None 

165 

166 @property 

167 def authority(self) -> str: 

168 return "".join( 

169 [ 

170 f"{self.userinfo}@" if self.userinfo else "", 

171 f"[{self.host}]" if ":" in self.host else self.host, 

172 f":{self.port}" if self.port is not None else "", 

173 ] 

174 ) 

175 

176 @property 

177 def netloc(self) -> str: 

178 return "".join( 

179 [ 

180 f"[{self.host}]" if ":" in self.host else self.host, 

181 f":{self.port}" if self.port is not None else "", 

182 ] 

183 ) 

184 

185 def copy_with(self, **kwargs: str | None) -> "ParseResult": 

186 if not kwargs: 

187 return self 

188 

189 defaults = { 

190 "scheme": self.scheme, 

191 "authority": self.authority, 

192 "path": self.path, 

193 "query": self.query, 

194 "fragment": self.fragment, 

195 } 

196 defaults.update(kwargs) 

197 return urlparse("", **defaults) 

198 

199 def __str__(self) -> str: 

200 authority = self.authority 

201 return "".join( 

202 [ 

203 f"{self.scheme}:" if self.scheme else "", 

204 f"//{authority}" if authority else "", 

205 self.path, 

206 f"?{self.query}" if self.query is not None else "", 

207 f"#{self.fragment}" if self.fragment is not None else "", 

208 ] 

209 ) 

210 

211 

def urlparse(url: str = "", **kwargs: str | None) -> ParseResult:
    """
    Parse, validate and normalize a URL into a `ParseResult`.

    * `url`: The complete URL string. May be empty if the URL is being
      described entirely by keyword arguments.
    * `kwargs`: Individual components, which take precedence over whatever
      was parsed from `url`. The validated components are "scheme",
      "authority", "path", "query", "fragment", "userinfo", "host" and
      "port". The convenience keys "netloc", "username", "password" and
      "raw_path" are rewritten into those components first.

    Raises `InvalidURL` if the URL or any component is too long, contains a
    non-printable ASCII character, or fails validation.
    """
    # Initial basic checks on allowable URLs.
    # ---------------------------------------

    # Hard limit the maximum allowable URL length.
    if len(url) > MAX_URL_LENGTH:
        raise InvalidURL("URL too long")

    # If a URL includes any ASCII control characters including \t, \r, \n,
    # then treat it as invalid. The first offending character is reported.
    for idx, char in enumerate(url):
        if char.isascii() and not char.isprintable():
            error = (
                f"Invalid non-printable ASCII character in URL, {char!r} at position {idx}."
            )
            raise InvalidURL(error)

    # Some keyword arguments require special handling.
    # ------------------------------------------------

    # Coerce "port" to a string, if it is provided as an integer.
    if "port" in kwargs:
        port = kwargs["port"]
        kwargs["port"] = str(port) if isinstance(port, int) else port

    # Replace "netloc" with "host" and "port".
    if "netloc" in kwargs:
        netloc = kwargs.pop("netloc") or ""
        # Only look for the port separator after any "[...]" IPv6 literal, so
        # that the colons inside the address are not mistaken for it.
        colon = netloc.find(":", netloc.rfind("]") + 1)
        if colon == -1:
            kwargs["host"], kwargs["port"] = netloc, ""
        else:
            kwargs["host"], kwargs["port"] = netloc[:colon], netloc[colon + 1 :]

    # Replace "username" and/or "password" with "userinfo".
    if "username" in kwargs or "password" in kwargs:
        username = quote(kwargs.pop("username", "") or "", safe=USERNAME_SAFE)
        password = quote(kwargs.pop("password", "") or "", safe=PASSWORD_SAFE)
        kwargs["userinfo"] = f"{username}:{password}" if password else username

    # Replace "raw_path" with "path" and "query".
    if "raw_path" in kwargs:
        raw_path = kwargs.pop("raw_path") or ""
        kwargs["path"], separator, kwargs["query"] = raw_path.partition("?")
        if not separator:
            # No '?' at all: the query is absent, rather than empty.
            kwargs["query"] = None

    # Ensure that IPv6 "host" addresses are always escaped with "[...]".
    if "host" in kwargs:
        host = kwargs.get("host") or ""
        if ":" in host and not (host.startswith("[") and host.endswith("]")):
            kwargs["host"] = f"[{host}]"

    # If any keyword arguments are provided, ensure they are valid.
    # -------------------------------------------------------------

    for key, value in kwargs.items():
        if value is None:
            continue

        if len(value) > MAX_URL_LENGTH:
            raise InvalidURL(f"URL component '{key}' too long")

        # If a component includes any ASCII control characters including \t, \r, \n,
        # then treat it as invalid.
        for idx, char in enumerate(value):
            if char.isascii() and not char.isprintable():
                error = (
                    f"Invalid non-printable ASCII character in URL {key} component, "
                    f"{char!r} at position {idx}."
                )
                raise InvalidURL(error)

        # Ensure that keyword arguments match as a valid regex.
        if not COMPONENT_REGEX[key].fullmatch(value):
            raise InvalidURL(f"Invalid URL component '{key}'")

    # The URL_REGEX will always match, but may have empty components.
    url_match = URL_REGEX.match(url)
    assert url_match is not None
    url_dict = url_match.groupdict()

    # * 'scheme', 'authority', and 'path' may be empty strings.
    # * 'query' may be 'None', indicating no trailing "?" portion.
    #   Any string including the empty string, indicates a trailing "?".
    # * 'fragment' may be 'None', indicating no trailing "#" portion.
    #   Any string including the empty string, indicates a trailing "#".
    scheme = kwargs.get("scheme", url_dict["scheme"]) or ""
    authority = kwargs.get("authority", url_dict["authority"]) or ""
    path = kwargs.get("path", url_dict["path"]) or ""
    query = kwargs.get("query", url_dict["query"])
    frag = kwargs.get("fragment", url_dict["fragment"])

    # The AUTHORITY_REGEX will always match, but may have empty components.
    authority_match = AUTHORITY_REGEX.match(authority)
    assert authority_match is not None
    authority_dict = authority_match.groupdict()

    # * 'userinfo' and 'host' may be empty strings.
    # * 'port' may be 'None'.
    userinfo = kwargs.get("userinfo", authority_dict["userinfo"]) or ""
    host = kwargs.get("host", authority_dict["host"]) or ""
    port = kwargs.get("port", authority_dict["port"])

    # Normalize and validate each component.
    # We end up with a parsed representation of the URL,
    # with components that are plain ASCII bytestrings.
    parsed_scheme: str = scheme.lower()
    parsed_userinfo: str = quote(userinfo, safe=USERINFO_SAFE)
    parsed_host: str = encode_host(host)
    # Use the lower-cased scheme, so that a default port is also dropped
    # for URLs such as "HTTP://example.com:80".
    parsed_port: int | None = normalize_port(port, parsed_scheme)

    has_scheme = parsed_scheme != ""
    has_authority = (
        parsed_userinfo != "" or parsed_host != "" or parsed_port is not None
    )
    validate_path(path, has_scheme=has_scheme, has_authority=has_authority)
    if has_scheme or has_authority:
        path = normalize_path(path)

    parsed_path: str = quote(path, safe=PATH_SAFE)
    parsed_query: str | None = None if query is None else quote(query, safe=QUERY_SAFE)
    parsed_frag: str | None = None if frag is None else quote(frag, safe=FRAG_SAFE)

    # The parsed ASCII bytestrings are our canonical form.
    # All properties of the URL are derived from these.
    return ParseResult(
        parsed_scheme,
        parsed_userinfo,
        parsed_host,
        parsed_port,
        parsed_path,
        parsed_query,
        parsed_frag,
    )

345 

346 

def encode_host(host: str) -> str:
    """
    Validate and normalize the host component of a URL.

    * An empty host is returned as the empty string.
    * IPv4-style hosts ("#.#.#.#") are validated and returned unchanged.
    * Bracketed hosts ("[...]") are validated as IPv6 addresses and returned
      without the enclosing square brackets.
    * Non-ASCII hosts are lower-cased and IDNA encoded. This requires the
      optional 'idna' package.
    * Any other host is lower-cased and percent-encoded.

    Raises `InvalidURL` if the host fails validation.
    """
    if not host:
        return ""

    if IPv4_STYLE_HOSTNAME.match(host):
        # Validate IPv4 hostnames like #.#.#.#
        #
        # From https://datatracker.ietf.org/doc/html/rfc3986/#section-3.2.2
        #
        # IPv4address = dec-octet "." dec-octet "." dec-octet "." dec-octet
        try:
            ipaddress.IPv4Address(host)
        except ipaddress.AddressValueError:
            raise InvalidURL(f"Invalid IPv4 address: {host!r}")
        return host

    if IPv6_STYLE_HOSTNAME.match(host):
        # Validate IPv6 hostnames like [...]
        #
        # From https://datatracker.ietf.org/doc/html/rfc3986/#section-3.2.2
        #
        # "A host identified by an Internet Protocol literal address, version 6
        # [RFC3513] or later, is distinguished by enclosing the IP literal
        # within square brackets ("[" and "]"). This is the only place where
        # square bracket characters are allowed in the URI syntax."
        address = host[1:-1]
        try:
            ipaddress.IPv6Address(address)
        except ipaddress.AddressValueError:
            raise InvalidURL(f"Invalid IPv6 address: {host!r}")
        return address

    if host.isascii():
        # Regular ASCII hostnames
        #
        # From https://datatracker.ietf.org/doc/html/rfc3986/#section-3.2.2
        #
        # reg-name    = *( unreserved / pct-encoded / sub-delims )
        whatwg_safe = '"`{}%|\\'
        return quote(host.lower(), safe=SUB_DELIMS + whatwg_safe)

    # IDNA hostnames
    try:
        import idna  # type: ignore
    except ImportError:
        raise InvalidURL(
            f"Cannot handle URL with IDNA hostname: {host!r}. "
            f"Package 'idna' is not installed."
        )

    try:
        return idna.encode(host.lower()).decode("ascii")
    except idna.IDNAError:
        raise InvalidURL(f"Invalid IDNA hostname: {host!r}")

400 

401 

402def normalize_port(port: str | int | None, scheme: str) -> int | None: 

403 # From https://tools.ietf.org/html/rfc3986#section-3.2.3 

404 # 

405 # "A scheme may define a default port. For example, the "http" scheme 

406 # defines a default port of "80", corresponding to its reserved TCP 

407 # port number. The type of port designated by the port number (e.g., 

408 # TCP, UDP, SCTP) is defined by the URI scheme. URI producers and 

409 # normalizers should omit the port component and its ":" delimiter if 

410 # port is empty or if its value would be the same as that of the 

411 # scheme's default." 

412 if port is None or port == "": 

413 return None 

414 

415 try: 

416 port_as_int = int(port) 

417 except ValueError: 

418 raise InvalidURL(f"Invalid port: {port!r}") 

419 

420 # See https://url.spec.whatwg.org/#url-miscellaneous 

421 default_port = {"ftp": 21, "http": 80, "https": 443, "ws": 80, "wss": 443}.get( 

422 scheme 

423 ) 

424 if port_as_int == default_port: 

425 return None 

426 return port_as_int 

427 

428 

def validate_path(path: str, has_scheme: bool, has_authority: bool) -> None:
    """
    Check the path against the rules that depend on whether the URL
    has a scheme or an authority component.

    Raises `InvalidURL` if the path is not permitted. Returns nothing otherwise.

    See https://datatracker.ietf.org/doc/html/rfc3986.html#section-3.3
    """
    # "If a URI contains an authority component, then the path component
    # must either be empty or begin with a slash ("/") character."
    if has_authority and path and not path.startswith("/"):
        raise InvalidURL("For absolute URLs, path must be empty or begin with '/'")

    # The remaining rules only apply to URLs with neither a scheme
    # nor an authority.
    if has_scheme or has_authority:
        return

    # If a URI does not contain an authority component, then the path cannot begin
    # with two slash characters ("//").
    if path.startswith("//"):
        raise InvalidURL("Relative URLs cannot have a path starting with '//'")

    # In addition, a URI reference (Section 4.1) may be a relative-path reference,
    # in which case the first path segment cannot contain a colon (":") character.
    # Only a leading ':' is checked for here.
    if path.startswith(":"):
        raise InvalidURL("Relative URLs cannot have a path starting with ':'")

452 

453 

def normalize_path(path: str) -> str:
    """
    Remove "." and ".." segments from a URL path.

    A "." segment is dropped. A ".." segment is dropped together with the
    segment before it, if there is one to remove.

    For example:

        normalize_path("/path/./to/somewhere/..") == "/path/to"
    """
    # Without any '.' characters there cannot be any dot segments.
    if "." not in path:
        return path

    segments = path.split("/")

    # The path contains dots, but none of them form a '.' or '..' segment.
    if not any(segment in (".", "..") for segment in segments):
        return path

    # https://datatracker.ietf.org/doc/html/rfc3986#section-5.2.4
    kept: list[str] = []
    for segment in segments:
        if segment == "..":
            # Step back up one level, but never above the root. The leading
            # empty segment of an absolute path is always kept.
            if kept not in ([], [""]):
                kept.pop()
        elif segment != ".":
            kept.append(segment)
    return "/".join(kept)

483 

484 

def PERCENT(string: str) -> str:
    """
    Percent-encode every byte of the UTF-8 encoding of `string`,
    using upper-case hexadecimal digits.
    """
    encoded = string.encode("utf-8")
    return "".join(map("%{:02X}".format, encoded))

487 

488 

def percent_encoded(string: str, safe: str) -> str:
    """
    Use percent-encoding to quote a string.

    Every character that is neither an unreserved character nor listed in
    `safe` is replaced by its percent-encoded form.
    """
    allowed = UNRESERVED_CHARACTERS + safe

    # Fast path: stripping the allowed characters leaves nothing behind,
    # so there is nothing that needs escaping.
    if not string.rstrip(allowed):
        return string

    pieces = []
    for char in string:
        pieces.append(char if char in allowed else PERCENT(char))
    return "".join(pieces)

502 

503 

def quote(string: str, safe: str) -> str:
    """
    Use percent-encoding to quote a string, omitting existing '%xx' escape sequences.

    See: https://www.rfc-editor.org/rfc/rfc3986#section-2.1

    * `string`: The string to be percent-escaped.
    * `safe`: A string containing characters that may be treated as safe, and do not
        need to be escaped. Unreserved characters are always treated as safe.
        See: https://www.rfc-editor.org/rfc/rfc3986#section-2.3
    """
    pieces = []
    cursor = 0
    for escape in PERCENT_ENCODED_REGEX.finditer(string):
        # Quote the plain text, if any, that sits before this '%xx' sequence.
        if escape.start() > cursor:
            pieces.append(percent_encoded(string[cursor : escape.start()], safe=safe))

        # Existing '%xx' sequences are passed through untouched.
        pieces.append(escape.group(0))
        cursor = escape.end()

    # Quote the plain text, if any, that follows the final '%xx' sequence.
    if cursor < len(string):
        pieces.append(percent_encoded(string[cursor:], safe=safe))

    return "".join(pieces)