
Commit 1b6fd0c

Replace tokenization with tokenize_rt (#77)
* Replace tokenization with tokenize_rt

  The standard library module `tokenize` does not round-trip, so we had to
  implement our own tokenization on top of it. However, tokenize_rt does this
  better, so let's adopt it instead.

* Drop version pin

* Fix pip upgrade on Windows
1 parent 0d21b04 commit 1b6fd0c
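
For context, a minimal sketch (not part of this commit) of the round-trip property that motivates the switch, using tokenize_rt's src_to_tokens/tokens_to_src API:

import tokenize_rt

# The stdlib tokenize module discards exact whitespace, so re-emitting source
# from its tokens does not reproduce the input. tokenize_rt keeps enough
# information that tokens_to_src(src_to_tokens(src)) returns src unchanged.
src = "async def f():\n    return await g()  # comment\n"
tokens = tokenize_rt.src_to_tokens(src)
assert tokenize_rt.tokens_to_src(tokens) == src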

File tree

3 files changed (+30 −48):

  ci.sh
  setup.py
  src/unasync/__init__.py


ci.sh

+1 −1

@@ -4,7 +4,7 @@ set -ex
 
 BLACK_VERSION=22.6.0
 
-pip install -U pip setuptools wheel
+python -m pip install -U pip setuptools wheel
 
 python setup.py sdist --formats=zip
 pip install dist/*.zip

setup.py

+1 −1

@@ -17,7 +17,7 @@
     include_package_data=True,
     packages=find_packages("src"),
     package_dir={"": "src"},
-    install_requires=[],
+    install_requires=["tokenize_rt"],
     keywords=["async"],
     python_requires=">=3.7",
     classifiers=[

src/unasync/__init__.py

+28 −46

@@ -6,6 +6,7 @@
 import sys
 import tokenize as std_tokenize
 
+import tokenize_rt
 from setuptools.command import build_py as orig
 
 from ._version import __version__  # NOQA
@@ -65,35 +66,41 @@ def _match(self, filepath):
     def _unasync_file(self, filepath):
         with open(filepath, "rb") as f:
             encoding, _ = std_tokenize.detect_encoding(f.readline)
-            f.seek(0)
-            tokens = _tokenize(f)
+
+        with open(filepath, "rt", encoding=encoding) as f:
+            tokens = tokenize_rt.src_to_tokens(f.read())
             tokens = self._unasync_tokens(tokens)
-            result = _untokenize(tokens)
+            result = tokenize_rt.tokens_to_src(tokens)
             outfilepath = filepath.replace(self.fromdir, self.todir)
             os.makedirs(os.path.dirname(outfilepath), exist_ok=True)
             with open(outfilepath, "wb") as f:
                 f.write(result.encode(encoding))
 
     def _unasync_tokens(self, tokens):
-        # TODO __await__, ...?
-        used_space = None
-        for space, toknum, tokval in tokens:
-            if tokval in ["async", "await"]:
-                # When removing async or await, we want to use the whitespace that
-                # was before async/await before the next token so that
-                # `print(await stuff)` becomes `print(stuff)` and not
-                # `print( stuff)`
-                used_space = space
+        skip_next = False
+        for i, token in enumerate(tokens):
+            if skip_next:
+                skip_next = False
+                continue
+
+            if token.src in ["async", "await"]:
+                # When removing async or await, we want to skip the following whitespace
+                # so that `print(await stuff)` becomes `print(stuff)` and not `print( stuff)`
+                skip_next = True
             else:
-                if toknum == std_tokenize.NAME:
-                    tokval = self._unasync_name(tokval)
-                elif toknum == std_tokenize.STRING:
-                    left_quote, name, right_quote = tokval[0], tokval[1:-1], tokval[-1]
-                    tokval = left_quote + self._unasync_name(name) + right_quote
-                if used_space is None:
-                    used_space = space
-                yield (used_space, tokval)
-                used_space = None
+                if token.name == "NAME":
+                    token = token._replace(src=self._unasync_name(token.src))
+                elif token.name == "STRING":
+                    left_quote, name, right_quote = (
+                        token.src[0],
+                        token.src[1:-1],
+                        token.src[-1],
+                    )
+                    token = token._replace(
+                        src=left_quote + self._unasync_name(name) + right_quote
+                    )
+
+                yield token
 
     def _unasync_name(self, name):
         if name in self.token_replacements:
@@ -122,31 +129,6 @@ def unasync_files(fpath_list, rules):
 Token = collections.namedtuple("Token", ["type", "string", "start", "end", "line"])
 
 
-def _tokenize(f):
-    last_end = (1, 0)
-    for tok in std_tokenize.tokenize(f.readline):
-        if tok.type == std_tokenize.ENCODING:
-            continue
-
-        if last_end[0] < tok.start[0]:
-            yield ("", std_tokenize.STRING, " \\\n")
-            last_end = (tok.start[0], 0)
-
-        space = ""
-        if tok.start > last_end:
-            assert tok.start[0] == last_end[0]
-            space = " " * (tok.start[1] - last_end[1])
-        yield (space, tok.type, tok.string)
-
-        last_end = tok.end
-        if tok.type in [std_tokenize.NEWLINE, std_tokenize.NL]:
-            last_end = (tok.end[0] + 1, 0)
-
-
-def _untokenize(tokens):
-    return "".join(space + tokval for space, tokval in tokens)
-
-
 _DEFAULT_RULE = Rule(fromdir="/_async/", todir="/_sync/")
 
 
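As an aside on the skip_next logic above, a hypothetical snippet (not in the repository) illustrating why skipping the token that follows "async"/"await" also removes the separating space: tokenize_rt emits inter-token whitespace as explicit UNIMPORTANT_WS tokens.

import tokenize_rt

# Print the token stream for a small await expression; the space between
# "await" and "stuff" appears as its own UNIMPORTANT_WS token.
for token in tokenize_rt.src_to_tokens("print(await stuff)\n"):
    print(token.name, repr(token.src))

# Expected output (roughly):
#   ...
#   NAME 'await'
#   UNIMPORTANT_WS ' '
#   NAME 'stuff'
#   ...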