|
2 | 2 | Functions for handling encoding of web pages
|
3 | 3 | """
|
4 | 4 | import re, codecs, encodings
|
5 |
| -from sys import version_info |
6 | 5 | from typing import Callable, Match, Optional, Tuple, Union, cast
|
7 | 6 | from w3lib._types import AnyUnicodeError, StrOrBytes
|
8 |
| -from w3lib.util import to_native_str |
| 7 | +import w3lib.util |
9 | 8 |
|
10 | 9 | _HEADER_ENCODING_RE = re.compile(r"charset=([\w-]+)", re.I)
|
11 | 10 |
|
@@ -46,6 +45,7 @@ def http_content_type_encoding(content_type: Optional[str]) -> Optional[str]:
|
46 | 45 | _XML_ENCODING_RE = _TEMPLATE % ("encoding", r"(?P<xmlcharset>[\w-]+)")
|
47 | 46 |
|
48 | 47 | # check for meta tags, or xml decl. and stop search if a body tag is encountered
|
| 48 | +# pylint: disable=consider-using-f-string |
49 | 49 | _BODY_ENCODING_PATTERN = (
|
50 | 50 | r"<\s*(?:meta%s(?:(?:\s+%s|\s+%s){2}|\s+%s)|\?xml\s[^>]+%s|body)"
|
51 | 51 | % (_SKIP_ATTRS, _HTTPEQUIV_RE, _CONTENT_RE, _CONTENT2_RE, _XML_ENCODING_RE)
|
@@ -93,7 +93,7 @@ def html_body_declared_encoding(html_body_str: StrOrBytes) -> Optional[str]:
|
93 | 93 | or match.group("xmlcharset")
|
94 | 94 | )
|
95 | 95 | if encoding:
|
96 |
| - return resolve_encoding(to_native_str(encoding)) |
| 96 | + return resolve_encoding(w3lib.util.to_unicode(encoding)) |
97 | 97 |
|
98 | 98 | return None
|
99 | 99 |
|
@@ -163,7 +163,7 @@ def resolve_encoding(encoding_alias: str) -> Optional[str]:
|
163 | 163 | (codecs.BOM_UTF16_LE, "utf-16-le"),
|
164 | 164 | (codecs.BOM_UTF8, "utf-8"),
|
165 | 165 | ]
|
166 |
| -_FIRST_CHARS = set(c[0] for (c, _) in _BOM_TABLE) |
| 166 | +_FIRST_CHARS = {c[0] for (c, _) in _BOM_TABLE} |
167 | 167 |
|
168 | 168 |
|
169 | 169 | def read_bom(data: bytes) -> Union[Tuple[None, None], Tuple[str, bytes]]:
|
@@ -208,9 +208,7 @@ def to_unicode(data_str: bytes, encoding: str) -> str:
|
208 | 208 | Characters that cannot be converted will be converted to ``\\ufffd`` (the
|
209 | 209 | unicode replacement character).
|
210 | 210 | """
|
211 |
| - return data_str.decode( |
212 |
| - encoding, "replace" if version_info[0:2] >= (3, 3) else "w3lib_replace" |
213 |
| - ) |
| 211 | + return data_str.decode(encoding, "replace") |
214 | 212 |
|
215 | 213 |
|
216 | 214 | def html_to_unicode(
|
|
0 commit comments