From 2c508a11dd8eca32f8dbc83a2bca663054ddea30 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adri=C3=A1n=20Chaves?=
Date: Sat, 20 Mar 2021 12:50:05 +0100
Subject: [PATCH 1/8] Remove Python 2.7 and 3.5 support, add Python 3.9
 support

---
 .github/workflows/build.yml   | 6 +++---
 .github/workflows/publish.yml | 4 ++--
 .github/workflows/tests.yml   | 2 +-
 README.rst                    | 2 +-
 docs/index.rst                | 2 +-
 setup.py                      | 4 +---
 6 files changed, 9 insertions(+), 11 deletions(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 986099db..37b9a64b 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -18,13 +18,13 @@ jobs:
         - python-version: 3.7
           env:
             TOXENV: docs
-        - python-version: 3.8
+        - python-version: 3.9
           env:
            TOXENV: flake8
-        - python-version: 3.8
+        - python-version: 3.9
           env:
            TOXENV: pylint
-        - python-version: 3.8
+        - python-version: 3.9
           env:
            TOXENV: security
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
index 9390e788..26b1c58a 100644
--- a/.github/workflows/publish.yml
+++ b/.github/workflows/publish.yml
@@ -12,10 +12,10 @@ jobs:
     steps:
     - uses: actions/checkout@v2
 
-    - name: Set up Python 3.8
+    - name: Set up Python 3.9
       uses: actions/setup-python@v2
       with:
-        python-version: 3.8
+        python-version: 3.9
 
     - name: Check Tag
       id: check-release-tag
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index c47af5f8..643c2655 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -14,7 +14,7 @@ jobs:
     runs-on: ubuntu-20.04
     strategy:
       matrix:
-        python-version: [2.7, 3.5, 3.6, 3.7, 3.8, pypy3]
+        python-version: [3.6, 3.7, 3.8, 3.9, pypy3]
 
     steps:
     - uses: actions/checkout@v2
diff --git a/README.rst b/README.rst
index e9cbd231..d097492b 100644
--- a/README.rst
+++ b/README.rst
@@ -27,7 +27,7 @@ This is a Python library of web-related functions, such as:
 Requirements
 ============
 
-Python 2.7 or Python 3.5+
+Python 3.6+
 
 Install
 =======
diff --git a/docs/index.rst b/docs/index.rst
index fdbda607..bd14188b 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -28,7 +28,7 @@ Modules
 Requirements
 ============
 
-Python 2.7 or Python 3.3+
+Python 3.6+
 
 Install
 =======
diff --git a/setup.py b/setup.py
index ea0ca0a9..75de11d3 100644
--- a/setup.py
+++ b/setup.py
@@ -18,13 +18,11 @@
         'License :: OSI Approved :: BSD License',
         'Operating System :: OS Independent',
         'Programming Language :: Python',
-        'Programming Language :: Python :: 2',
-        'Programming Language :: Python :: 2.7',
         'Programming Language :: Python :: 3',
-        'Programming Language :: Python :: 3.5',
         'Programming Language :: Python :: 3.6',
         'Programming Language :: Python :: 3.7',
         'Programming Language :: Python :: 3.8',
+        'Programming Language :: Python :: 3.9',
         'Programming Language :: Python :: Implementation :: CPython',
         'Programming Language :: Python :: Implementation :: PyPy',
         'Topic :: Internet :: WWW/HTTP',

From c16d7bac3af3148b7018c67ef7922a5da6b3e640 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adri=C3=A1n=20Chaves?=
Date: Sat, 20 Mar 2021 13:14:30 +0100
Subject: [PATCH 2/8] Remove six

---
 setup.py               |   1 -
 stdeb.cfg              |   2 --
 tests/test_encoding.py |  27 ++++---
 tests/test_html.py     |  91 +++++++++++----------
 tests/test_url.py      |  24 ++++--
 w3lib/form.py          |   7 +-
 w3lib/html.py          |  26 +++---
 w3lib/url.py           | 179 ++++++++++++++++++++---------------------
 w3lib/util.py          |  18 ++---
 9 files changed, 193 insertions(+), 182 deletions(-)
 delete mode 100644 stdeb.cfg

diff --git a/setup.py b/setup.py
index 75de11d3..6f24f5eb 100644
--- a/setup.py
+++ b/setup.py
@@ -27,5 +27,4 @@
         'Programming Language :: Python :: Implementation :: PyPy',
         'Topic :: Internet :: WWW/HTTP',
     ],
-    install_requires=['six >= 1.4.1'],
 )
diff --git a/stdeb.cfg b/stdeb.cfg
deleted file mode 100644
index 5a7e8a2d..00000000
--- a/stdeb.cfg
+++ /dev/null
@@ -1,2 +0,0 @@
-[w3lib]
-Depends: python-six (>= 1.4.1)
diff --git a/tests/test_encoding.py b/tests/test_encoding.py
index 649c189a..3d3795cc 100644
--- a/tests/test_encoding.py
+++ b/tests/test_encoding.py
@@ -1,7 +1,14 @@
-import unittest, codecs
-import six
-from w3lib.encoding import (html_body_declared_encoding, read_bom, to_unicode,
-    http_content_type_encoding, resolve_encoding, html_to_unicode)
+import codecs
+import unittest
+
+from w3lib.encoding import (
+    html_body_declared_encoding,
+    http_content_type_encoding,
+    html_to_unicode,
+    read_bom,
+    resolve_encoding,
+    to_unicode,
+)
 
 class RequestEncodingTests(unittest.TestCase):
     utf8_fragments = [
@@ -107,18 +114,18 @@ def test_unicode_body(self):
         original_string = unicode_string.encode('cp1251')
         encoding, body_unicode = html_to_unicode(ct('cp1251'), original_string)
         # check body_as_unicode
-        self.assertTrue(isinstance(body_unicode, six.text_type))
+        self.assertTrue(isinstance(body_unicode, str))
         self.assertEqual(body_unicode, unicode_string)
 
     def _assert_encoding(self, content_type, body, expected_encoding,
                          expected_unicode):
-        assert not isinstance(body, six.text_type)
+        assert not isinstance(body, str)
         encoding, body_unicode = html_to_unicode(ct(content_type), body)
-        self.assertTrue(isinstance(body_unicode, six.text_type))
+        self.assertTrue(isinstance(body_unicode, str))
         self.assertEqual(norm_encoding(encoding),
                         norm_encoding(expected_encoding))
 
-        if isinstance(expected_unicode, six.string_types):
+        if isinstance(expected_unicode, str):
             self.assertEqual(body_unicode, expected_unicode)
         else:
             self.assertTrue(
@@ -177,9 +184,9 @@ def test_replace_wrong_encoding(self):
 
     def _assert_encoding_detected(self, content_type, expected_encoding, body,
                                   **kwargs):
-        assert not isinstance(body, six.text_type)
+        assert not isinstance(body, str)
         encoding, body_unicode = html_to_unicode(ct(content_type), body, **kwargs)
-        self.assertTrue(isinstance(body_unicode, six.text_type))
+        self.assertTrue(isinstance(body_unicode, str))
         self.assertEqual(norm_encoding(encoding), norm_encoding(expected_encoding))
 
     def test_BOM(self):
diff --git a/tests/test_html.py b/tests/test_html.py
index a3c31d87..89a651e4 100644
--- a/tests/test_html.py
+++ b/tests/test_html.py
@@ -1,18 +1,25 @@
-# -*- coding: utf-8 -*-
 import unittest
-import six
-from w3lib.html import (replace_entities, replace_tags, remove_comments,
-    remove_tags_with_content, replace_escape_chars, remove_tags, unquote_markup,
-    get_base_url, get_meta_refresh)
+
+from w3lib.html import (
+    get_base_url,
+    get_meta_refresh,
+    remove_comments,
+    remove_tags,
+    remove_tags_with_content,
+    replace_entities,
+    replace_escape_chars,
+    replace_tags,
+    unquote_markup,
+)
 
 
 class RemoveEntitiesTest(unittest.TestCase):
     def test_returns_unicode(self):
         # make sure it always return uncode
-        assert isinstance(replace_entities(b'no entities'), six.text_type)
-        assert isinstance(replace_entities(b'Price: &pound;100!'), six.text_type)
-        assert isinstance(replace_entities(u'no entities'), six.text_type)
-        assert isinstance(replace_entities(u'Price: &pound;100!'), six.text_type)
+        assert isinstance(replace_entities(b'no entities'), str)
+        assert isinstance(replace_entities(b'Price: &pound;100!'), str)
+        assert isinstance(replace_entities(u'no entities'), str)
+        assert isinstance(replace_entities(u'Price: &pound;100!'), str)
 
     def test_regular(self):
         # regular conversions
@@ -71,8 +78,8 @@ def test_encoding(self):
 class ReplaceTagsTest(unittest.TestCase):
     def test_returns_unicode(self):
         # make sure it always return uncode
-        assert isinstance(replace_tags(b'no entities'), six.text_type)
-        assert isinstance(replace_tags('no entities'), six.text_type)
+        assert isinstance(replace_tags(b'no entities'), str)
+        assert isinstance(replace_tags('no entities'), str)
 
     def test_replace_tags(self):
         self.assertEqual(replace_tags(u'This text contains <a>some tag</a>'),
@@ -88,10 +95,10 @@ def test_replace_tags_multiline(self):
 class RemoveCommentsTest(unittest.TestCase):
     def test_returns_unicode(self):
         # make sure it always return unicode
-        assert isinstance(remove_comments(b'without comments'), six.text_type)
-        assert isinstance(remove_comments(b'<!-- with comments -->'), six.text_type)
-        assert isinstance(remove_comments(u'without comments'), six.text_type)
-        assert isinstance(remove_comments(u'<!-- with comments -->'), six.text_type)
+        assert isinstance(remove_comments(b'without comments'), str)
+        assert isinstance(remove_comments(b'<!-- with comments -->'), str)
+        assert isinstance(remove_comments(u'without comments'), str)
+        assert isinstance(remove_comments(u'<!-- with comments -->'), str)
 
     def test_no_comments(self):
         # text without comments
@@ -112,16 +119,16 @@ def test_remove_comments(self):
 
 class RemoveTagsTest(unittest.TestCase):
     def test_returns_unicode(self):
         # make sure it always return unicode
-        assert isinstance(remove_tags(b'no tags'), six.text_type)
-        assert isinstance(remove_tags(b'no tags', which_ones=('p',)), six.text_type)
-        assert isinstance(remove_tags(b'<p>one tag</p>'), six.text_type)
-        assert isinstance(remove_tags(b'<p>one tag</p>', which_ones=('p')), six.text_type)
-        assert isinstance(remove_tags(b'<a href="">link</a>', which_ones=('b',)), six.text_type)
-        assert isinstance(remove_tags(u'no tags'), six.text_type)
-        assert isinstance(remove_tags(u'no tags', which_ones=('p',)), six.text_type)
-        assert isinstance(remove_tags(u'<p>one tag</p>'), six.text_type)
-        assert isinstance(remove_tags(u'<p>one tag</p>', which_ones=('p')), six.text_type)
-        assert isinstance(remove_tags(u'<a href="">link</a>', which_ones=('b',)), six.text_type)
+        assert isinstance(remove_tags(b'no tags'), str)
+        assert isinstance(remove_tags(b'no tags', which_ones=('p',)), str)
+        assert isinstance(remove_tags(b'<p>one tag</p>'), str)
+        assert isinstance(remove_tags(b'<p>one tag</p>', which_ones=('p')), str)
+        assert isinstance(remove_tags(b'<a href="">link</a>', which_ones=('b',)), str)
+        assert isinstance(remove_tags(u'no tags'), str)
+        assert isinstance(remove_tags(u'no tags', which_ones=('p',)), str)
+        assert isinstance(remove_tags(u'<p>one tag</p>'), str)
+        assert isinstance(remove_tags(u'<p>one tag</p>', which_ones=('p')), str)
+        assert isinstance(remove_tags(u'<a href="">link</a>', which_ones=('b',)), str)
 
     def test_remove_tags_without_tags(self):
         # text without tags
@@ -160,14 +167,14 @@ def test_uppercase_tags(self):
 
 class RemoveTagsWithContentTest(unittest.TestCase):
     def test_returns_unicode(self):
         # make sure it always return unicode
-        assert isinstance(remove_tags_with_content(b'no tags'), six.text_type)
-        assert isinstance(remove_tags_with_content(b'no tags', which_ones=('p',)), six.text_type)
-        assert isinstance(remove_tags_with_content(b'<p>one tag</p>', which_ones=('p',)), six.text_type)
-        assert isinstance(remove_tags_with_content(b'<a href="">link</a>', which_ones=('b',)), six.text_type)
-        assert isinstance(remove_tags_with_content(u'no tags'), six.text_type)
-        assert isinstance(remove_tags_with_content(u'no tags', which_ones=('p',)), six.text_type)
-        assert isinstance(remove_tags_with_content(u'<p>one tag</p>', which_ones=('p',)), six.text_type)
-        assert isinstance(remove_tags_with_content(u'<a href="">link</a>', which_ones=('b',)), six.text_type)
+        assert isinstance(remove_tags_with_content(b'no tags'), str)
+        assert isinstance(remove_tags_with_content(b'no tags', which_ones=('p',)), str)
+        assert isinstance(remove_tags_with_content(b'<p>one tag</p>', which_ones=('p',)), str)
+        assert isinstance(remove_tags_with_content(b'<a href="">link</a>', which_ones=('b',)), str)
+        assert isinstance(remove_tags_with_content(u'no tags'), str)
+        assert isinstance(remove_tags_with_content(u'no tags', which_ones=('p',)), str)
+        assert isinstance(remove_tags_with_content(u'<p>one tag</p>', which_ones=('p',)), str)
+        assert isinstance(remove_tags_with_content(u'<a href="">link</a>', which_ones=('b',)), str)
 
     def test_without_tags(self):
         # text without tags
@@ -194,13 +201,13 @@ def test_tags_with_shared_prefix(self):
 
 class ReplaceEscapeCharsTest(unittest.TestCase):
     def test_returns_unicode(self):
         # make sure it always return unicode
-        assert isinstance(replace_escape_chars(b'no ec'), six.text_type)
-        assert isinstance(replace_escape_chars(b'no ec', replace_by='str'), six.text_type)
-        assert isinstance(replace_escape_chars(b'no ec', replace_by=u'str'), six.text_type)
-        assert isinstance(replace_escape_chars(b'no ec', which_ones=('\n', '\t',)), six.text_type)
-        assert isinstance(replace_escape_chars(u'no ec'), six.text_type)
-        assert isinstance(replace_escape_chars(u'no ec', replace_by=u'str'), six.text_type)
-        assert isinstance(replace_escape_chars(u'no ec', which_ones=('\n', '\t',)), six.text_type)
+        assert isinstance(replace_escape_chars(b'no ec'), str)
+        assert isinstance(replace_escape_chars(b'no ec', replace_by='str'), str)
+        assert isinstance(replace_escape_chars(b'no ec', replace_by=u'str'), str)
+        assert isinstance(replace_escape_chars(b'no ec', which_ones=('\n', '\t',)), str)
+        assert isinstance(replace_escape_chars(u'no ec'), str)
+        assert isinstance(replace_escape_chars(u'no ec', replace_by=u'str'), str)
+        assert isinstance(replace_escape_chars(u'no ec', which_ones=('\n', '\t',)), str)
 
     def test_without_escape_chars(self):
         # text without escape chars
@@ -226,8 +233,8 @@ class UnquoteMarkupTest(unittest.TestCase):
 
     def test_returns_unicode(self):
         # make sure it always return unicode
-        assert isinstance(unquote_markup(self.sample_txt1.encode('latin-1')), six.text_type)
-        assert isinstance(unquote_markup(self.sample_txt2), six.text_type)
+        assert isinstance(unquote_markup(self.sample_txt1.encode('latin-1')), str)
+        assert isinstance(unquote_markup(self.sample_txt2), str)
 
     def test_unquote_markup(self):
         self.assertEqual(unquote_markup(self.sample_txt1), u"""<node1>hi, this is sample text with entities: & \xa9
diff --git a/tests/test_url.py b/tests/test_url.py
index 07695500..9b854232 100644
--- a/tests/test_url.py
+++ b/tests/test_url.py
@@ -1,15 +1,25 @@
-# -*- coding: utf-8 -*-
-from __future__ import absolute_import
 import os
 import unittest
+from urllib.parse import urlparse
 
 import pytest
-from six.moves.urllib.parse import urlparse
 
-from w3lib.url import (is_url, safe_url_string, safe_download_url,
-    url_query_parameter, add_or_replace_parameter, url_query_cleaner,
-    file_uri_to_path, parse_data_uri, path_to_file_uri, any_to_uri,
-    urljoin_rfc, canonicalize_url, parse_url, add_or_replace_parameters)
+from w3lib.url import (
+    add_or_replace_parameter,
+    add_or_replace_parameters,
+    any_to_uri,
+    canonicalize_url,
+    file_uri_to_path,
+    is_url,
+    parse_data_uri,
+    parse_url,
+    path_to_file_uri,
+    safe_download_url,
+    safe_url_string,
+    url_query_parameter,
+    url_query_cleaner,
+    urljoin_rfc,
+)
 
 
 class UrlTests(unittest.TestCase):
diff --git a/w3lib/form.py b/w3lib/form.py
index 6a5eb403..9181b057 100644
--- a/w3lib/form.py
+++ b/w3lib/form.py
@@ -1,9 +1,6 @@
 import warnings
-import six
-if six.PY2:
-    from cStringIO import StringIO as BytesIO
-else:
-    from io import BytesIO
+from io import BytesIO
+
 from w3lib.util import unicode_to_str
diff --git a/w3lib/html.py b/w3lib/html.py
index 87d8d214..e87d96a1 100644
--- a/w3lib/html.py
+++ b/w3lib/html.py
@@ -5,16 +5,16 @@
 import warnings
 import re
-import six
-from six import moves
+from html.entities import name2codepoint
+from urllib.parse import urljoin
 
 from w3lib.util import to_bytes, to_unicode
 from w3lib.url import safe_url_string
 
 _ent_re = re.compile(r'&((?P<named>[a-z\d]+)|#(?P<dec>\d+)|#x(?P<hex>[a-f\d]+))(?P<semicolon>;?)', re.IGNORECASE)
 _tag_re = re.compile(r'<[a-zA-Z\/!].*?>', re.DOTALL)
-_baseurl_re = re.compile(six.u(r'<base\s[^>]*href\s*=\s*[\"\']\s*([^\"\'\s]+)\s*[\"\']'), re.I)
-_meta_refresh_re = re.compile(six.u(r'<meta\s[^>]*http-equiv[^>]*refresh[^>]*content\s*=\s*(?P<quote>["\'])(?P<int>(\d*\.)?\d+)\s*;\s*url=\s*(?P<url>.*?)(?P=quote)'), re.DOTALL | re.IGNORECASE)
+_baseurl_re = re.compile(r'<base\s[^>]*href\s*=\s*[\"\']\s*([^\"\'\s]+)\s*[\"\']', re.I)
+_meta_refresh_re = re.compile(r'<meta\s[^>]*http-equiv[^>]*refresh[^>]*content\s*=\s*(?P<quote>["\'])(?P<int>(\d*\.)?\d+)\s*;\s*url=\s*(?P<url>.*?)(?P=quote)', re.DOTALL | re.IGNORECASE)
 _cdata_re = re.compile(r'((?P<cdata_s><!\[CDATA\[)(?P<cdata_d>.*?)(?P<cdata_e>\]\]>))', re.DOTALL)
 
 HTML5_WHITESPACE = ' \t\n\r\x0c'
@@ -77,8 +77,10 @@ def convert_entity(m):
         if entity_name.lower() in keep:
             return m.group(0)
         else:
-            number = (moves.html_entities.name2codepoint.get(entity_name) or
-                      moves.html_entities.name2codepoint.get(entity_name.lower()))
+            number = (
+                name2codepoint.get(entity_name)
+                or name2codepoint.get(entity_name.lower())
+            )
         if number is not None:
             # Numeric character references in the 80-9F range are typically
             # interpreted by browsers as representing the characters mapped
@@ -86,9 +88,9 @@ def convert_entity(m):
             # see: http://en.wikipedia.org/wiki/Character_encodings_in_HTML
             try:
                 if 0x80 <= number <= 0x9f:
-                    return six.int2byte(number).decode('cp1252')
+                    return bytes((number,)).decode('cp1252')
                 else:
-                    return six.unichr(number)
+                    return chr(number)
             except ValueError:
                 pass
@@ -265,7 +267,7 @@ def _get_fragments(txt, pattern):
 
     text = to_unicode(text, encoding)
     ret_text = u''
     for fragment in _get_fragments(text, _cdata_re):
-        if isinstance(fragment, six.string_types):
+        if isinstance(fragment, str):
             # it's not a CDATA (so we try to remove its entities)
             ret_text += replace_entities(fragment, keep=keep, remove_illegal=remove_illegal)
         else:
@@ -284,7 +286,7 @@ def get_base_url(text, baseurl='', encoding='utf-8'):
     text = to_unicode(text, encoding)
     m = _baseurl_re.search(text)
     if m:
-        return moves.urllib.parse.urljoin(
+        return urljoin(
             safe_url_string(baseurl),
             safe_url_string(m.group(1), encoding=encoding)
         )
@@ -301,8 +303,6 @@ def get_meta_refresh(text, baseurl='', encoding='utf-8', ignore_tags=('script',
 
     """
 
-    if six.PY2:
-        baseurl = to_bytes(baseurl, encoding)
     try:
         text = to_unicode(text, encoding)
     except UnicodeDecodeError:
@@ -314,7 +314,7 @@ def get_meta_refresh(text, baseurl='', encoding='utf-8', ignore_tags=('script',
     if m:
         interval = float(m.group('int'))
         url = safe_url_string(m.group('url').strip(' "\''), encoding)
-        url = moves.urllib.parse.urljoin(baseurl, url)
+        url = urljoin(baseurl, url)
         return interval, url
     else:
         return None, None
diff --git a/w3lib/url.py b/w3lib/url.py
index bf12745d..e98da51d 100644
--- a/w3lib/url.py
+++ b/w3lib/url.py
@@ -5,17 +5,28 @@
 import base64
 import codecs
 import os
-import re
 import posixpath
-import warnings
+import re
 import string
+import warnings
 from collections import namedtuple
-import six
-from six.moves.urllib.parse import (urljoin, urlsplit, urlunsplit,
-    urldefrag, urlencode, urlparse,
-    quote, parse_qs, parse_qsl,
-    ParseResult, unquote, urlunparse)
-from six.moves.urllib.request import pathname2url, url2pathname
+from urllib.parse import (
+    _coerce_args,
+    parse_qs,
+    parse_qsl,
+    ParseResult,
+    quote,
+    unquote,
+    unquote_to_bytes,
+    urldefrag,
+    urlencode,
+    urljoin,
+    urlparse,
+    urlsplit,
+    urlunparse,
+    urlunsplit,
+)
+from urllib.request import pathname2url, url2pathname
 
 from w3lib.util import to_bytes, to_native_str, to_unicode
@@ -185,7 +196,7 @@ def url_query_cleaner(url, parameterlist=(), sep='&', kvsep='=', remove=False, u
 
     """
 
-    if isinstance(parameterlist, (six.text_type, bytes)):
+    if isinstance(parameterlist, (str, bytes)):
         parameterlist = [parameterlist]
     url, fragment = urldefrag(url)
     base, _, query = url.partition('?')
@@ -347,10 +358,7 @@ def parse_data_uri(uri):
     # delimiters, but it makes parsing easier and should not affect
     # well-formed URIs, as the delimiters used in this URI scheme are not
     # allowed, percent-encoded or not, in tokens.
-    if six.PY2:
-        uri = unquote(uri)
-    else:
-        uri = unquote_to_bytes(uri)
+    uri = unquote_to_bytes(uri)
 
     media_type = "text/plain"
     media_type_params = {}
@@ -470,33 +478,32 @@ def canonicalize_url(url, keep_blank_values=True, keep_fragments=False,
     # 1. decode query-string as UTF-8 (or keep raw bytes),
     #    sort values,
     #    and percent-encode them back
-    if six.PY2:
-        keyvals = parse_qsl(query, keep_blank_values)
-    else:
-        # Python3's urllib.parse.parse_qsl does not work as wanted
-        # for percent-encoded characters that do not match passed encoding,
-        # they get lost.
-        #
-        # e.g., 'q=b%a3' becomes [('q', 'b\ufffd')]
-        # (ie. with 'REPLACEMENT CHARACTER' (U+FFFD),
-        # instead of \xa3 that you get with Python2's parse_qsl)
-        #
-        # what we want here is to keep raw bytes, and percent encode them
-        # so as to preserve whatever encoding what originally used.
-        #
-        # See https://tools.ietf.org/html/rfc3987#section-6.4:
-        #
-        # For example, it is possible to have a URI reference of
-        # "http://www.example.org/r%E9sum%E9.xml#r%C3%A9sum%C3%A9", where the
-        # document name is encoded in iso-8859-1 based on server settings, but
-        # where the fragment identifier is encoded in UTF-8 according to
-        # [XPointer]. The IRI corresponding to the above URI would be (in XML
-        # notation)
-        # "http://www.example.org/r%E9sum%E9.xml#r&#xE9;sum&#xE9;".
-        # Similar considerations apply to query parts. The functionality of
-        # IRIs (namely, to be able to include non-ASCII characters) can only be
-        # used if the query part is encoded in UTF-8.
-        keyvals = parse_qsl_to_bytes(query, keep_blank_values)
+
+    # Python's urllib.parse.parse_qsl does not work as wanted
+    # for percent-encoded characters that do not match passed encoding,
+    # they get lost.
+    #
+    # e.g., 'q=b%a3' becomes [('q', 'b\ufffd')]
+    # (ie. with 'REPLACEMENT CHARACTER' (U+FFFD),
+    # instead of \xa3 that you get with Python2's parse_qsl)
+    #
+    # what we want here is to keep raw bytes, and percent encode them
+    # so as to preserve whatever encoding what originally used.
+    #
+    # See https://tools.ietf.org/html/rfc3987#section-6.4:
+    #
+    # For example, it is possible to have a URI reference of
+    # "http://www.example.org/r%E9sum%E9.xml#r%C3%A9sum%C3%A9", where the
+    # document name is encoded in iso-8859-1 based on server settings, but
+    # where the fragment identifier is encoded in UTF-8 according to
+    # [XPointer]. The IRI corresponding to the above URI would be (in XML
+    # notation)
+    # "http://www.example.org/r%E9sum%E9.xml#r&#xE9;sum&#xE9;".
+    # Similar considerations apply to query parts. The functionality of
+    # IRIs (namely, to be able to include non-ASCII characters) can only be
+    # used if the query part is encoded in UTF-8.
+    keyvals = parse_qsl_to_bytes(query, keep_blank_values)
+
     keyvals.sort()
     query = urlencode(keyvals)
 
@@ -520,17 +527,12 @@ def _unquotepath(path):
     for reserved in ('2f', '2F', '3f', '3F'):
         path = path.replace('%' + reserved, '%25' + reserved.upper())
 
-    if six.PY2:
-        # in Python 2, '%a3' becomes '\xa3', which is what we want
-        return unquote(path)
-    else:
-        # in Python 3,
-        # standard lib's unquote() does not work for non-UTF-8
-        # percent-escaped characters, they get lost.
-        # e.g., '%a3' becomes 'REPLACEMENT CHARACTER' (U+FFFD)
-        #
-        # unquote_to_bytes() returns raw bytes instead
-        return unquote_to_bytes(path)
+    # standard lib's unquote() does not work for non-UTF-8
+    # percent-escaped characters, they get lost.
+    # e.g., '%a3' becomes 'REPLACEMENT CHARACTER' (U+FFFD)
+    #
+    # unquote_to_bytes() returns raw bytes instead
+    return unquote_to_bytes(path)
 
 
 def parse_url(url, encoding=None):
@@ -542,51 +544,48 @@ def parse_url(url, encoding=None):
     return urlparse(to_unicode(url, encoding))
 
 
-if not six.PY2:
-    from urllib.parse import _coerce_args, unquote_to_bytes
-
-    def parse_qsl_to_bytes(qs, keep_blank_values=False):
-        """Parse a query given as a string argument.
-
-        Data are returned as a list of name, value pairs as bytes.
-
-        Arguments:
-
-        qs: percent-encoded query string to be parsed
-
-        keep_blank_values: flag indicating whether blank values in
-            percent-encoded queries should be treated as blank strings. A
-            true value indicates that blanks should be retained as blank
-            strings. The default false value indicates that blank values
-            are to be ignored and treated as if they were not included.
-
-        """
-        # This code is the same as Python3's parse_qsl()
-        # (at https://hg.python.org/cpython/rev/c38ac7ab8d9a)
-        # except for the unquote(s, encoding, errors) calls replaced
-        # with unquote_to_bytes(s)
-        qs, _coerce_result = _coerce_args(qs)
-        pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
-        r = []
-        for name_value in pairs:
-            if not name_value:
-                continue
-            nv = name_value.split('=', 1)
-            if len(nv) != 2:
-                # Handle case of a control-name with no equal sign
-                if keep_blank_values:
-                    nv.append('')
-                else:
-                    continue
-            if len(nv[1]) or keep_blank_values:
-                name = nv[0].replace('+', ' ')
-                name = unquote_to_bytes(name)
-                name = _coerce_result(name)
-                value = nv[1].replace('+', ' ')
-                value = unquote_to_bytes(value)
-                value = _coerce_result(value)
-                r.append((name, value))
-        return r
+def parse_qsl_to_bytes(qs, keep_blank_values=False):
+    """Parse a query given as a string argument.
+
+    Data are returned as a list of name, value pairs as bytes.
+
+    Arguments:
+
+    qs: percent-encoded query string to be parsed
+
+    keep_blank_values: flag indicating whether blank values in
+        percent-encoded queries should be treated as blank strings. A
+        true value indicates that blanks should be retained as blank
+        strings. The default false value indicates that blank values
+        are to be ignored and treated as if they were not included.
+
+    """
+    # This code is the same as Python3's parse_qsl()
+    # (at https://hg.python.org/cpython/rev/c38ac7ab8d9a)
+    # except for the unquote(s, encoding, errors) calls replaced
+    # with unquote_to_bytes(s)
+    qs, _coerce_result = _coerce_args(qs)
+    pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
+    r = []
+    for name_value in pairs:
+        if not name_value:
+            continue
+        nv = name_value.split('=', 1)
+        if len(nv) != 2:
+            # Handle case of a control-name with no equal sign
+            if keep_blank_values:
+                nv.append('')
+            else:
+                continue
+        if len(nv[1]) or keep_blank_values:
+            name = nv[0].replace('+', ' ')
+            name = unquote_to_bytes(name)
+            name = _coerce_result(name)
+            value = nv[1].replace('+', ' ')
+            value = unquote_to_bytes(value)
+            value = _coerce_result(value)
+            r.append((name, value))
+    return r
 
 
 def urljoin_rfc(base, ref, encoding='utf-8'):
diff --git a/w3lib/util.py b/w3lib/util.py
index d8513eef..02deeeea 100644
--- a/w3lib/util.py
+++ b/w3lib/util.py
@@ -1,5 +1,3 @@
-import six
-
 def str_to_unicode(text, encoding=None, errors='strict'):
     if encoding is None:
         encoding = 'utf-8'
@@ -10,16 +8,16 @@ def str_to_unicode(text, encoding=None, errors='strict'):
 def unicode_to_str(text, encoding=None, errors='strict'):
     if encoding is None:
         encoding = 'utf-8'
-    if isinstance(text, six.text_type):
+    if isinstance(text, str):
         return text.encode(encoding, errors)
     return text
 
 def to_unicode(text, encoding=None, errors='strict'):
     """Return the unicode representation of a bytes object `text`. If
     `text` is already an unicode object, return it as-is."""
-    if isinstance(text, six.text_type):
+    if isinstance(text, str):
         return text
-    if not isinstance(text, (bytes, six.text_type)):
+    if not isinstance(text, (bytes, str)):
         raise TypeError('to_unicode must receive a bytes, str or unicode '
                         'object, got %s' % type(text).__name__)
     if encoding is None:
@@ -31,7 +29,7 @@ def to_bytes(text, encoding=None, errors='strict'):
     is already a bytes object, return it as-is."""
     if isinstance(text, bytes):
         return text
-    if not isinstance(text, six.string_types):
+    if not isinstance(text, str):
         raise TypeError('to_bytes must receive a unicode, str or bytes '
                         'object, got %s' % type(text).__name__)
     if encoding is None:
@@ -39,9 +37,5 @@ def to_bytes(text, encoding=None, errors='strict'):
     return text.encode(encoding, errors)
 
 def to_native_str(text, encoding=None, errors='strict'):
-    """ Return str representation of `text`
-    (bytes in Python 2.x and unicode in Python 3.x). """
-    if six.PY2:
-        return to_bytes(text, encoding, errors)
-    else:
-        return to_unicode(text, encoding, errors)
+    """ Return str representation of `text` """
+    return to_unicode(text, encoding, errors)

From fd1f8d0d2ac45d6cabf8e56257a64b771e8f543c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adri=C3=A1n=20Chaves?=
Date: Sat, 20 Mar 2021 13:20:00 +0100
Subject: [PATCH 3/8] Fix style issues

---
 pytest.ini    | 2 ++
 w3lib/html.py | 2 +-
 w3lib/url.py  | 1 -
 3 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/pytest.ini b/pytest.ini
index 4f23e3f8..30701606 100644
--- a/pytest.ini
+++ b/pytest.ini
@@ -1,6 +1,8 @@
 [pytest]
 doctest_optionflags = ALLOW_UNICODE ALLOW_BYTES
 flake8-ignore =
+    W503
+
     docs/conf.py E121 E122 E265 E401 E501
     tests/test_encoding.py E128 E221 E241 E302 E401 E501 E731
     tests/test_form.py E265 E501
diff --git a/w3lib/html.py b/w3lib/html.py
index e87d96a1..cbb1a9b0 100644
--- a/w3lib/html.py
+++ b/w3lib/html.py
@@ -8,7 +8,7 @@
 from html.entities import name2codepoint
 from urllib.parse import urljoin
 
-from w3lib.util import to_bytes, to_unicode
+from w3lib.util import to_unicode
 from w3lib.url import safe_url_string
 
 _ent_re = re.compile(r'&((?P<named>[a-z\d]+)|#(?P<dec>\d+)|#x(?P<hex>[a-f\d]+))(?P<semicolon>;?)', re.IGNORECASE)
diff --git a/w3lib/url.py b/w3lib/url.py
index e98da51d..d27dbd52 100644
--- a/w3lib/url.py
+++ b/w3lib/url.py
@@ -16,7 +16,6 @@
     parse_qsl,
     ParseResult,
     quote,
-    unquote,
     unquote_to_bytes,
     urldefrag,
     urlencode,

From ea2a4ceeaaf3cc0c08408127c4099f8977b72da8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adri=C3=A1n=20Chaves?=
Date: Wed, 24 Mar 2021 11:30:21 +0100
Subject: [PATCH 4/8] Remove unused tests/py3-ignores.txt

---
 tests/py3-ignores.txt | 5 -----
 1 file changed, 5 deletions(-)
 delete mode 100644 tests/py3-ignores.txt

diff --git a/tests/py3-ignores.txt b/tests/py3-ignores.txt
deleted file mode 100644
index 09f34ec9..00000000
--- a/tests/py3-ignores.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-w3lib/encoding.py
-w3lib/form.py
-w3lib/html.py
-w3lib/http.py
-w3lib/url.py

From 9922b177c7e701f72be058b0786ae43b63c3e687 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adri=C3=A1n=20Chaves?=
Date: Wed, 24 Mar 2021 11:30:55 +0100
Subject: [PATCH 5/8] =?UTF-8?q?Provide=20a=20URL=20that=20indicates=20the?=
 =?UTF-8?q?=20reasoning=20behind=20ignoring=20Flake8=E2=80=99s=20W503?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 pytest.ini | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytest.ini b/pytest.ini
index 30701606..94b29688 100644
--- a/pytest.ini
+++ b/pytest.ini
@@ -1,7 +1,7 @@
 [pytest]
 doctest_optionflags = ALLOW_UNICODE ALLOW_BYTES
 flake8-ignore =
-    W503
+    W503  # https://www.flake8rules.com/rules/W503.html
     docs/conf.py E121 E122 E265 E401 E501
     tests/test_encoding.py E128 E221 E241 E302 E401 E501 E731
     tests/test_form.py E265 E501

From 0e049871d0a20c2a20964f3ae17a97e47162ade8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adri=C3=A1n=20Chaves?=
Date: Wed, 24 Mar 2021 11:42:14 +0100
Subject: [PATCH 6/8] Remove the u prefix from literal strings

---
 docs/conf.py           |  16 ++--
 tests/test_encoding.py |  84 ++++++++---------
 tests/test_form.py     |  20 ++---
 tests/test_html.py     | 200 ++++++++++++++++++++---------------------
 tests/test_http.py     |   4 +-
 tests/test_url.py      | 156 ++++++++++++++++----------------
 w3lib/encoding.py      |   4 +-
 w3lib/form.py          |   4 +-
 w3lib/html.py          |  34 +++----
 w3lib/url.py           |   6 +-
 10 files changed, 264 insertions(+), 264 deletions(-)

diff --git a/docs/conf.py b/docs/conf.py
index b786f2b9..d79efcf4 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -47,8 +47,8 @@
 master_doc = 'index'
 
 # General information about the project.
-project = u'w3lib'
-copyright = u'2014, w3lib developers'
+project = 'w3lib'
+copyright = '2014, w3lib developers'
 
 # The version info for the project you're documenting, acts as replacement for
 # |version| and |release|, also used in various other places throughout the
@@ -190,8 +190,8 @@
 # Grouping the document tree into LaTeX files. List of tuples
 # (source start file, target name, title, author, documentclass [howto/manual]).
 latex_documents = [
-  ('index', 'w3lib.tex', u'w3lib Documentation',
-   u'w3lib developers', 'manual'),
+  ('index', 'w3lib.tex', 'w3lib Documentation',
+   'w3lib developers', 'manual'),
 ]
 
 # The name of an image file (relative to this directory) to place at the top of
@@ -220,8 +220,8 @@
 # One entry per manual page. List of tuples
 # (source start file, name, description, authors, manual section).
 man_pages = [
-    ('index', 'w3lib', u'w3lib Documentation',
-     [u'w3lib developers'], 1)
+    ('index', 'w3lib', 'w3lib Documentation',
+     ['w3lib developers'], 1)
 ]
 
 # If true, show URL addresses after external links.
@@ -234,8 +234,8 @@
 # (source start file, target name, title, author,
 #  dir menu entry, description, category)
 texinfo_documents = [
-  ('index', 'w3lib', u'w3lib Documentation',
-   u'w3lib developers', 'w3lib', 'One line description of project.',
+  ('index', 'w3lib', 'w3lib Documentation',
+   'w3lib developers', 'w3lib', 'One line description of project.',
    'Miscellaneous'),
 ]

diff --git a/tests/test_encoding.py b/tests/test_encoding.py
index 3d3795cc..9faced10 100644
--- a/tests/test_encoding.py
+++ b/tests/test_encoding.py
@@ -29,7 +29,7 @@ class RequestEncodingTests(unittest.TestCase):
 
     def test_bom(self):
         # cjk water character in unicode
-        water_unicode = u'\u6C34'
+        water_unicode = '\u6C34'
         # BOM + water character encoded
         utf16be = b'\xfe\xff\x6c\x34'
         utf16le = b'\xff\xfe\x34\x6c'
@@ -69,19 +69,19 @@ def test_html_body_declared_encoding(self):
 
     def test_html_body_declared_encoding_unicode(self):
         # html_body_declared_encoding should work when unicode body is passed
-        self.assertEqual(None, html_body_declared_encoding(u"something else"))
+        self.assertEqual(None, html_body_declared_encoding("something else"))
 
         for fragment in self.utf8_fragments:
             encoding = html_body_declared_encoding(fragment.decode('utf8'))
             self.assertEqual(encoding, 'utf-8', fragment)
 
-        self.assertEqual(None, html_body_declared_encoding(u"""
+        self.assertEqual(None, html_body_declared_encoding("""
        <head></head>body
        this isn't searched
        <meta charset="utf-8">
        """))
         self.assertEqual(None, html_body_declared_encoding(
-            u"""<meta http-equiv="Fake-Content-Type-Header" content="text/html; charset=utf-8">"""))
+            """<meta http-equiv="Fake-Content-Type-Header" content="text/html; charset=utf-8">"""))
 
 
 class CodecsEncodingTestCase(unittest.TestCase):
@@ -95,10 +95,10 @@ def test_resolve_encoding(self):
 
 class UnicodeDecodingTestCase(unittest.TestCase):
 
     def test_utf8(self):
-        self.assertEqual(to_unicode(b'\xc2\xa3', 'utf-8'), u'\xa3')
+        self.assertEqual(to_unicode(b'\xc2\xa3', 'utf-8'), '\xa3')
 
     def test_invalid_utf8(self):
-        self.assertEqual(to_unicode(b'\xc2\xc2\xa3', 'utf-8'), u'\ufffd\xa3')
+        self.assertEqual(to_unicode(b'\xc2\xc2\xa3', 'utf-8'), '\ufffd\xa3')
 
 
 def ct(charset):
@@ -110,7 +110,7 @@ def norm_encoding(enc):
 
 class HtmlConversionTests(unittest.TestCase):
 
     def test_unicode_body(self):
-        unicode_string = u'\u043a\u0438\u0440\u0438\u043b\u043b\u0438\u0447\u0435\u0441\u043a\u0438\u0439 \u0442\u0435\u043a\u0441\u0442'
+        unicode_string = '\u043a\u0438\u0440\u0438\u043b\u043b\u0438\u0447\u0435\u0441\u043a\u0438\u0439 \u0442\u0435\u043a\u0441\u0442'
         original_string = unicode_string.encode('cp1251')
         encoding, body_unicode = html_to_unicode(ct('cp1251'), original_string)
         # check body_as_unicode
@@ -137,23 +137,23 @@ def test_content_type_and_conversion(self):
         """Test content type header is interpreted and text converted as
         expected
         """
-        self._assert_encoding('utf-8', b"\xc2\xa3", 'utf-8', u"\xa3")
+        self._assert_encoding('utf-8', b"\xc2\xa3", 'utf-8', "\xa3")
         # something like this in the scrapy tests - but that's invalid?
-        # self._assert_encoding('', "\xa3", 'utf-8', u"\xa3")
+        # self._assert_encoding('', "\xa3", 'utf-8', "\xa3")
         # iso-8859-1 is overridden to cp1252
-        self._assert_encoding('iso-8859-1', b"\xa3", 'cp1252', u"\xa3")
-        self._assert_encoding('', b"\xc2\xa3", 'utf-8', u"\xa3")
-        self._assert_encoding('none', b"\xc2\xa3", 'utf-8', u"\xa3")
-        self._assert_encoding('gb2312', b"\xa8D", 'gb18030', u"\u2015")
-        self._assert_encoding('gbk', b"\xa8D", 'gb18030', u"\u2015")
-        self._assert_encoding('big5', b"\xf9\xda", 'big5hkscs', u"\u6052")
+        self._assert_encoding('iso-8859-1', b"\xa3", 'cp1252', "\xa3")
+        self._assert_encoding('', b"\xc2\xa3", 'utf-8', "\xa3")
+        self._assert_encoding('none', b"\xc2\xa3", 'utf-8', "\xa3")
+        self._assert_encoding('gb2312', b"\xa8D", 'gb18030', "\u2015")
+        self._assert_encoding('gbk', b"\xa8D", 'gb18030', "\u2015")
+        self._assert_encoding('big5', b"\xf9\xda", 'big5hkscs', "\u6052")
 
     def test_invalid_utf8_encoded_body_with_valid_utf8_BOM(self):
         # unlike scrapy, the BOM is stripped
         self._assert_encoding('utf-8', b"\xef\xbb\xbfWORD\xe3\xabWORD2",
-                              'utf-8', u'WORD\ufffdWORD2')
+                              'utf-8', 'WORD\ufffdWORD2')
         self._assert_encoding(None, b"\xef\xbb\xbfWORD\xe3\xabWORD2",
-                              'utf-8', u'WORD\ufffdWORD2')
+                              'utf-8', 'WORD\ufffdWORD2')
 
     def test_utf8_unexpected_end_of_data_with_valid_utf8_BOM(self):
         # Python implementations handle unexpected end of UTF8 data
@@ -163,24 +163,24 @@ def test_utf8_unexpected_end_of_data_with_valid_utf8_BOM(self):
 
         # unlike scrapy, the BOM is stripped
         self._assert_encoding('utf-8', b"\xef\xbb\xbfWORD\xe3\xab",
-                              'utf-8', [u'WORD\ufffd\ufffd', u'WORD\ufffd'])
+                              'utf-8', ['WORD\ufffd\ufffd', 'WORD\ufffd'])
         self._assert_encoding(None, b"\xef\xbb\xbfWORD\xe3\xab",
-                              'utf-8', [u'WORD\ufffd\ufffd', u'WORD\ufffd'])
+                              'utf-8', ['WORD\ufffd\ufffd', 'WORD\ufffd'])
 
     def test_replace_wrong_encoding(self):
         """Test invalid chars are replaced properly"""
         encoding, body_unicode = html_to_unicode(ct('utf-8'),
             b'PREFIX\xe3\xabSUFFIX')
         # XXX: Policy for replacing invalid chars may suffer minor variations
-        # but it should always contain the unicode replacement char (u'\ufffd')
-        assert u'\ufffd' in body_unicode, repr(body_unicode)
-        assert u'PREFIX' in body_unicode, repr(body_unicode)
-        assert u'SUFFIX' in body_unicode, repr(body_unicode)
+        # but it should always contain the unicode replacement char ('\ufffd')
+        assert '\ufffd' in body_unicode, repr(body_unicode)
+        assert 'PREFIX' in body_unicode, repr(body_unicode)
+        assert 'SUFFIX' in body_unicode, repr(body_unicode)
 
         # Do not destroy html tags due to encoding bugs
         encoding, body_unicode = html_to_unicode(ct('utf-8'),
             b'\xf0<span>value</span>')
-        assert u'<span>value</span>' in body_unicode, repr(body_unicode)
+        assert '<span>value</span>' in body_unicode, repr(body_unicode)
 
     def _assert_encoding_detected(self, content_type, expected_encoding, body,
                                   **kwargs):
@@ -193,39 +193,39 @@ def test_BOM(self):
         # utf-16 cases already tested, as is the BOM detection function
 
         # http header takes precedence, irrespective of BOM
-        bom_be_str = codecs.BOM_UTF16_BE + u"hi".encode('utf-16-be')
-        expected = u'\ufffd\ufffd\x00h\x00i'
+        bom_be_str = codecs.BOM_UTF16_BE + "hi".encode('utf-16-be')
+        expected = '\ufffd\ufffd\x00h\x00i'
         self._assert_encoding('utf-8', bom_be_str, 'utf-8', expected)
 
         # BOM is stripped when it agrees with the encoding, or used to
         # determine encoding
         bom_utf8_str = codecs.BOM_UTF8 + b'hi'
-        self._assert_encoding('utf-8', bom_utf8_str, 'utf-8', u"hi")
-        self._assert_encoding(None, bom_utf8_str, 'utf-8', u"hi")
+        self._assert_encoding('utf-8', bom_utf8_str, 'utf-8', "hi")
+        self._assert_encoding(None, bom_utf8_str, 'utf-8', "hi")
 
     def test_utf16_32(self):
         # tools.ietf.org/html/rfc2781 section 4.3
 
         # USE BOM and strip it
-        bom_be_str = codecs.BOM_UTF16_BE + u"hi".encode('utf-16-be')
-        self._assert_encoding('utf-16', bom_be_str, 'utf-16-be', u"hi")
-        self._assert_encoding(None, bom_be_str, 'utf-16-be', u"hi")
+        bom_be_str = codecs.BOM_UTF16_BE + "hi".encode('utf-16-be')
+        self._assert_encoding('utf-16', bom_be_str, 'utf-16-be', "hi")
+        self._assert_encoding(None, bom_be_str, 'utf-16-be', "hi")
 
-        bom_le_str = codecs.BOM_UTF16_LE + u"hi".encode('utf-16-le')
-        self._assert_encoding('utf-16', bom_le_str, 'utf-16-le', u"hi")
-        self._assert_encoding(None, bom_le_str, 'utf-16-le', u"hi")
+        bom_le_str = codecs.BOM_UTF16_LE + "hi".encode('utf-16-le')
+        self._assert_encoding('utf-16', bom_le_str, 'utf-16-le', "hi")
+        self._assert_encoding(None, bom_le_str, 'utf-16-le', "hi")
 
-        bom_be_str = codecs.BOM_UTF32_BE + u"hi".encode('utf-32-be')
-        self._assert_encoding('utf-32', bom_be_str, 'utf-32-be', u"hi")
-        self._assert_encoding(None, bom_be_str, 'utf-32-be', u"hi")
+        bom_be_str = codecs.BOM_UTF32_BE + "hi".encode('utf-32-be')
+        self._assert_encoding('utf-32', bom_be_str, 'utf-32-be', "hi")
+        self._assert_encoding(None, bom_be_str, 'utf-32-be', "hi")
 
-        bom_le_str = codecs.BOM_UTF32_LE + u"hi".encode('utf-32-le')
-        self._assert_encoding('utf-32', bom_le_str, 'utf-32-le', u"hi")
-        self._assert_encoding(None, bom_le_str, 'utf-32-le', u"hi")
+        bom_le_str = codecs.BOM_UTF32_LE + "hi".encode('utf-32-le')
+        self._assert_encoding('utf-32', bom_le_str, 'utf-32-le', "hi")
+        self._assert_encoding(None, bom_le_str, 'utf-32-le', "hi")
 
         # if there is no BOM, big endian should be chosen
-        self._assert_encoding('utf-16', u"hi".encode('utf-16-be'), 'utf-16-be', u"hi")
-        self._assert_encoding('utf-32', u"hi".encode('utf-32-be'), 'utf-32-be', u"hi")
+        self._assert_encoding('utf-16', "hi".encode('utf-16-be'), 'utf-16-be', "hi")
+        self._assert_encoding('utf-32', "hi".encode('utf-32-be'), 'utf-32-be', "hi")
 
     def test_python_crash(self):
         import random
diff --git a/tests/test_form.py b/tests/test_form.py
index 280d8795..4a6d3052 100644
--- a/tests/test_form.py
+++ b/tests/test_form.py
@@ -23,20 +23,20 @@ def test_encode_multipart(self):
 
     def test_encode_multipart_unicode(self):
         data = OrderedDict([
-            (u'ключ1', u'значение1'.encode('utf8')),
-            (u'ключ2', u'значение2'),
+            ('ключ1', 'значение1'.encode('utf8')),
+            ('ключ2', 'значение2'),
         ])
         with warnings.catch_warnings(record=True):
             body, boundary = encode_multipart(data)
 
         expected_body = (
-            u'\r\n--{boundary}'
-            u'\r\nContent-Disposition: form-data; name="ключ1"\r\n'
-            u'\r\nзначение1'
-            u'\r\n--{boundary}'
-            u'\r\nContent-Disposition: form-data; name="ключ2"\r\n'
-            u'\r\nзначение2'
-            u'\r\n--{boundary}--'
-            u'\r\n'.format(boundary=boundary).encode('utf8')
+            '\r\n--{boundary}'
+            '\r\nContent-Disposition: form-data; name="ключ1"\r\n'
+            '\r\nзначение1'
+            '\r\n--{boundary}'
+            '\r\nContent-Disposition: form-data; name="ключ2"\r\n'
+            '\r\nзначение2'
+            '\r\n--{boundary}--'
+            '\r\n'.format(boundary=boundary).encode('utf8')
         )
         self.assertEqual(body, expected_body)
diff --git a/tests/test_html.py b/tests/test_html.py
index 89a651e4..d6a6c22b 100644
--- a/tests/test_html.py
+++ b/tests/test_html.py
@@ -18,37 +18,37 @@ def test_returns_unicode(self):
         # make sure it always return uncode
         assert isinstance(replace_entities(b'no entities'), str)
         assert isinstance(replace_entities(b'Price: &pound;100!'), str)
-        assert isinstance(replace_entities(u'no entities'), str)
-        assert isinstance(replace_entities(u'Price: &pound;100!'), str)
+        assert isinstance(replace_entities('no entities'), str)
+        assert isinstance(replace_entities('Price: &pound;100!'), str)
 
     def test_regular(self):
         # regular conversions
-        self.assertEqual(replace_entities(u'As low as &#163;100!'),
-                         u'As low as \xa3100!')
+        self.assertEqual(replace_entities('As low as &#163;100!'),
+                         'As low as \xa3100!')
         self.assertEqual(replace_entities(b'As low as &#163;100!'),
-                         u'As low as \xa3100!')
+                         'As low as \xa3100!')
         self.assertEqual(replace_entities('redirectTo=search&searchtext=MR0221Y&aff=buyat&affsrc=d_data&cm_mmc=buyat-_-ELECTRICAL & SEASONAL-_-MR0221Y-_-9-carat gold &frac12;oz solid crucifix pendant'),
-                         u'redirectTo=search&searchtext=MR0221Y&aff=buyat&affsrc=d_data&cm_mmc=buyat-_-ELECTRICAL & SEASONAL-_-MR0221Y-_-9-carat gold \xbdoz solid crucifix pendant')
+                         'redirectTo=search&searchtext=MR0221Y&aff=buyat&affsrc=d_data&cm_mmc=buyat-_-ELECTRICAL & SEASONAL-_-MR0221Y-_-9-carat gold \xbdoz solid crucifix pendant')
 
     def test_keep_entities(self):
         # keep some entities
-        self.assertEqual(replace_entities(b'Low &lt; High &amp; Medium &pound; six', keep=['lt', 'amp']),
-                         u'Low &lt; High &amp; Medium \xa3 six')
-        self.assertEqual(replace_entities(u'Low &lt; High &amp; Medium &pound; six', keep=[u'lt', u'amp']),
-                         u'Low &lt; High &amp; Medium \xa3 six')
+        self.assertEqual(replace_entities(b'Low &lt; High &amp; Medium &pound; six', keep=['lt', 'amp']),
+                         'Low &lt; High &amp; Medium \xa3 six')
+        self.assertEqual(replace_entities('Low &lt; High &amp; Medium &pound; six', keep=['lt', 'amp']),
+                         'Low &lt; High &amp; Medium \xa3 six')
 
     def test_illegal_entities(self):
         self.assertEqual(replace_entities('a &lt; b &illegal; c &#12345678; six', remove_illegal=False),
-                         u'a &lt; b &illegal; c &#12345678; six')
+                         'a &lt; b &illegal; c &#12345678; six')
         self.assertEqual(replace_entities('a &lt; b &illegal; c &#12345678; six', remove_illegal=True),
-                         u'a < b  c  six')
-        self.assertEqual(replace_entities('x&#x2264;y'), u'x\u2264y')
-        self.assertEqual(replace_entities('x&#157;y'), u'xy')
-        self.assertEqual(replace_entities('x&#157;y', remove_illegal=False), u'x&#157;y')
+                         'a < b  c  six')
+        self.assertEqual(replace_entities('x&#x2264;y'), 'x\u2264y')
+        self.assertEqual(replace_entities('x&#157;y'), 'xy')
+        self.assertEqual(replace_entities('x&#157;y', remove_illegal=False), 'x&#157;y')
 
     def test_browser_hack(self):
         # check browser hack for numeric character references in the 80-9F range
-        self.assertEqual(replace_entities('x&#153;y', encoding='cp1252'), u'x\u2122y')
-        self.assertEqual(replace_entities('x&#x99;y', encoding='cp1252'), u'x\u2122y')
+        self.assertEqual(replace_entities('x&#153;y', encoding='cp1252'), 'x\u2122y')
+        self.assertEqual(replace_entities('x&#x99;y', encoding='cp1252'), 'x\u2122y')
 
     def test_missing_semicolon(self):
         for entity, result in (
@@ -60,19 +60,19 @@ def test_missing_semicolon(self):
                 ('&#x41h', 'Ah',),
                 ('&#65!', 'A!',),
                 ('&#65x', 'Ax',),
-                ('&sup3!', u'\u00B3!',),
-                ('&Aacute!', u'\u00C1!',),
-                ('&#9731!', u'\u2603!',),
-                ('&#153', u'\u2122',),
-                ('&#x99', u'\u2122',),
+                ('&sup3!', '\u00B3!',),
+                ('&Aacute!', '\u00C1!',),
+                ('&#9731!', '\u2603!',),
+                ('&#153', '\u2122',),
+                ('&#x99', '\u2122',),
             ):
             self.assertEqual(replace_entities(entity, encoding='cp1252'), result)
-            self.assertEqual(replace_entities(u'x%sy' % entity, encoding='cp1252'), u'x%sy' % result)
+            self.assertEqual(replace_entities('x%sy' % entity, encoding='cp1252'), 'x%sy' % result)
 
     def test_encoding(self):
         self.assertEqual(replace_entities(b'x\x99&#153;&#8482;y', encoding='cp1252'), \
-                         u'x\u2122\u2122\u2122y')
+                         'x\u2122\u2122\u2122y')
 
 
 class ReplaceTagsTest(unittest.TestCase):
@@ -82,14 +82,14 @@ def test_returns_unicode(self):
         assert isinstance(replace_tags('no entities'), str)
 
     def test_replace_tags(self):
-        self.assertEqual(replace_tags(u'This text contains <a>some tag</a>'),
-                         u'This text contains some tag')
+        self.assertEqual(replace_tags('This text contains <a>some tag</a>'),
+                         'This text contains some tag')
         self.assertEqual(replace_tags(b'This text is very im<b>port</b>ant', ' '),
-                         u'This text is very im port ant')
+                         'This text is very im port ant')
 
     def test_replace_tags_multiline(self):
         self.assertEqual(replace_tags(b'Click <a class="one"\r\n href="url">here</a>'),
-                         u'Click here')
+                         'Click here')
 
 
 class RemoveCommentsTest(unittest.TestCase):
@@ -97,23 +97,23 @@ def test_returns_unicode(self):
         # make sure it always return unicode
         assert isinstance(remove_comments(b'without comments'), str)
         assert isinstance(remove_comments(b'<!-- with comments -->'), str)
-        assert isinstance(remove_comments(u'without comments'), str)
-        assert isinstance(remove_comments(u'<!-- with comments -->'), str)
+        assert isinstance(remove_comments('without comments'), str)
+        assert isinstance(remove_comments('<!-- with comments -->'), str)
 
     def test_no_comments(self):
         # text without comments
-        self.assertEqual(remove_comments(u'text without comments'), u'text without comments')
+        self.assertEqual(remove_comments('text without comments'), 'text without comments')
 
     def test_remove_comments(self):
         # text with comments
-        self.assertEqual(remove_comments(u'<!--text with comments-->'), u'')
-        self.assertEqual(remove_comments(u'Hello<!--World-->'), u'Hello')
-        self.assertEqual(remove_comments(u'Hello<!--My\nWorld-->'), u'Hello')
+        self.assertEqual(remove_comments('<!--text with comments-->'), '')
+        self.assertEqual(remove_comments('Hello<!--World-->'), 'Hello')
+        self.assertEqual(remove_comments('Hello<!--My\nWorld-->'), 'Hello')
 
-        self.assertEqual(remove_comments(b"test <!--textcoment--> whatever"), u'test  whatever')
-        self.assertEqual(remove_comments(b"test <!--\ntextcoment\n--> whatever"), u'test  whatever')
+        self.assertEqual(remove_comments(b"test <!--textcoment--> whatever"), 'test  whatever')
+        self.assertEqual(remove_comments(b"test <!--\ntextcoment\n--> whatever"), 'test  whatever')
diff --git a/w3lib/html.py b/w3lib/html.py
index cbb1a9b0..bc92de39 100644
--- a/w3lib/html.py
+++ b/w3lib/html.py
-_REMOVECOMMENTS_RE = re.compile(u'<!--.*?(?:-->|$)', re.DOTALL)
+_REMOVECOMMENTS_RE = re.compile('<!--.*?(?:-->|$)', re.DOTALL)
 
 def remove_comments(text, encoding=None):
     """ Remove HTML Comments.
 
        >>> import w3lib.html
        >>> w3lib.html.remove_comments(b"test <!--textcoment--> whatever")
-       u'test  whatever'
+       'test  whatever'
        >>>
 
     """
 
     text = to_unicode(text, encoding)
-    return _REMOVECOMMENTS_RE.sub(u'', text)
+    return _REMOVECOMMENTS_RE.sub('', text)
 
@@ -158,19 +158,19 @@ def remove_tags(text, which_ones=(), keep=(), encoding=None):
     """ Remove HTML Tags only.
 
        >>> import w3lib.html
        >>> doc = '<div><p><b>This is a link:</b> <a href="http://www.example.com">example</a></p></div>'
        >>> w3lib.html.remove_tags(doc)
-       u'This is a link: example'
+       'This is a link: example'
        >>>
 
        Keep only some tags:
 
        >>> w3lib.html.remove_tags(doc, keep=('div',))
-       u'<div>This is a link: example</div>'
+       '<div>This is a link: example</div>'
        >>>
 
        Remove only specific tags:
 
        >>> w3lib.html.remove_tags(doc, which_ones=('a','b'))
-       u'<div><p>This is a link: example</p></div>'
+       '<div><p>This is a link: example</p></div>'
        >>>
 
        You can't remove some and keep some:
@@ -197,7 +197,7 @@ def will_remove(tag):
 
     def remove_tag(m):
         tag = m.group(1)
-        return u'' if will_remove(tag) else m.group(0)
+        return '' if will_remove(tag) else m.group(0)
 
     regex = '</?([^ >/]+).*?>'
     retags = re.compile(regex, re.DOTALL | re.IGNORECASE)
@@ -213,7 +213,7 @@ def remove_tags_with_content(text, which_ones=(), encoding=None):
 
        >>> import w3lib.html
        >>> doc = '<div><p><b>This is a link:</b> <a href="http://www.example.com">example</a></p></div>'
        >>> w3lib.html.remove_tags_with_content(doc, which_ones=('b',))
-       u'<div><p> <a href="http://www.example.com">example</a></p></div>'
+       '<div><p> <a href="http://www.example.com">example</a></p></div>'
        >>>
 
     """
@@ -222,11 +222,11 @@ def remove_tags_with_content(text, which_ones=(), encoding=None):
     if which_ones:
         tags = '|'.join([r'<%s\b.*?</%s>|<%s\s*/>' % (tag, tag, tag) for tag in which_ones])
         retags = re.compile(tags, re.DOTALL | re.IGNORECASE)
-        text = retags.sub(u'', text)
+        text = retags.sub('', text)
     return text
 
 
-def replace_escape_chars(text, which_ones=('\n', '\t', '\r'), replace_by=u'', \
+def replace_escape_chars(text, which_ones=('\n', '\t', '\r'), replace_by='', \
         encoding=None):
     """Remove escape characters.
 
@@ -265,7 +265,7 @@ def _get_fragments(txt, pattern):
         yield txt[offset:]
 
     text = to_unicode(text, encoding)
-    ret_text = u''
+    ret_text = ''
     for fragment in _get_fragments(text, _cdata_re):
         if isinstance(fragment, str):
             # it's not a CDATA (so we try to remove its entities)
             ret_text += replace_entities(fragment, keep=keep, remove_illegal=remove_illegal)
         else:
diff --git a/w3lib/url.py b/w3lib/url.py
index d27dbd52..e0624228 100644
--- a/w3lib/url.py
+++ b/w3lib/url.py
@@ -456,7 +456,7 @@ def canonicalize_url(url, keep_blank_values=True, keep_fragments=False,
     'http://www.example.com/do?a=50&b=2&b=5&c=3'
     >>>
 
     >>> # UTF-8 conversion + percent-encoding of non-ASCII characters
-    >>> w3lib.url.canonicalize_url(u'http://www.example.com/r\u00e9sum\u00e9')
+    >>> w3lib.url.canonicalize_url('http://www.example.com/r\u00e9sum\u00e9')
     'http://www.example.com/r%C3%A9sum%C3%A9'
     >>>
@@ -602,12 +602,12 @@ def urljoin_rfc(base, ref, encoding='utf-8'):
     Always returns a str.
 
     >>> import w3lib.url
-    >>> w3lib.url.urljoin_rfc('http://www.example.com/path/index.html', u'/otherpath/index2.html')
+    >>> w3lib.url.urljoin_rfc('http://www.example.com/path/index.html', '/otherpath/index2.html')
     'http://www.example.com/otherpath/index2.html'
     >>>
 
     >>> # Note: the following does not work in Python 3
-    >>> w3lib.url.urljoin_rfc(b'http://www.example.com/path/index.html', u'fran\u00e7ais/d\u00e9part.htm') # doctest: +SKIP
+    >>> w3lib.url.urljoin_rfc(b'http://www.example.com/path/index.html', 'fran\u00e7ais/d\u00e9part.htm') # doctest: +SKIP
     'http://www.example.com/path/fran\xc3\xa7ais/d\xc3\xa9part.htm'
     >>>

From d5750880b6da8abb33d17d723b05b9a60c150ef1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adri=C3=A1n=20Chaves?=
Date: Wed, 24 Mar 2021 11:43:24 +0100
Subject: [PATCH 7/8] Remove file encoding comments

---
 docs/conf.py       | 2 --
 tests/test_form.py | 1 -
 tests/test_http.py | 2 --
 w3lib/encoding.py  | 1 -
 w3lib/html.py      | 1 -
 5 files changed, 7 deletions(-)

diff --git a/docs/conf.py b/docs/conf.py
index d79efcf4..eb57263a 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -1,5 +1,3 @@
-# -*- coding: utf-8 -*-
-#
 # w3lib documentation build configuration file, created by
 # sphinx-quickstart on Sun Jan 26 22:19:38 2014.
 #
diff --git a/tests/test_form.py b/tests/test_form.py
index 4a6d3052..93ddab4e 100644
--- a/tests/test_form.py
+++ b/tests/test_form.py
@@ -1,4 +1,3 @@
-# -*- coding: utf-8 -*-
 from __future__ import absolute_import
 import warnings
 import unittest
diff --git a/tests/test_http.py b/tests/test_http.py
index 8b934c96..67dddd3e 100644
--- a/tests/test_http.py
+++ b/tests/test_http.py
@@ -1,5 +1,3 @@
-# -*- coding: utf-8 -*-
-
 import unittest
 from collections import OrderedDict
 from w3lib.http import (basic_auth_header,
diff --git a/w3lib/encoding.py b/w3lib/encoding.py
index d96bd160..4407e789 100644
--- a/w3lib/encoding.py
+++ b/w3lib/encoding.py
@@ -1,4 +1,3 @@
-# -*- coding: utf-8 -*-
 """
 Functions for handling encoding of web pages
 """
diff --git a/w3lib/html.py b/w3lib/html.py
index bc92de39..9f7dbf0d 100644
--- a/w3lib/html.py
+++ b/w3lib/html.py
@@ -1,4 +1,3 @@
-# -*- coding: utf-8 -*-
 """
 Functions for dealing with markup text
 """

From 0579cade9c8e5725457fb4ceef3c3153f3824307 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adri=C3=A1n=20Chaves?=
Date: Wed, 24 Mar 2021 11:44:04 +0100
Subject: [PATCH 8/8] Remove from __future__ import absolute_import

---
 tests/test_form.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/test_form.py b/tests/test_form.py
index 93ddab4e..ac0696b5 100644
--- a/tests/test_form.py
+++ b/tests/test_form.py
@@ -1,4 +1,3 @@
-from __future__ import absolute_import
 import warnings
 import unittest
 from collections import OrderedDict