prechelt · shaunhegarty · May 11, 2022 · May 11, 2022 · May 11, 2022 · May 12, 2022
diff --git a/.dockerignore b/.dockerignore
@@ -0,0 +1 @@
+pyproject.toml
diff --git a/.gitattributes b/.gitattributes
@@ -1 +1,3 @@
 *.rtf eol=crlf
+* linguist-vendored
+*.py linguist-vendored=false
diff --git a/.gitignore b/.gitignore
@@ -3,3 +3,6 @@
 *.py[co]
 *.egg-info
 tests/currentoutput/
+.devcontainer
+pyproject.toml
+poetry.lock
diff --git a/Dockerfile b/Dockerfile
@@ -0,0 +1,11 @@
+FROM fkrull/multi-python
+
+WORKDIR /app
+
+RUN apt update && apt install pdftohtml -y
+
+COPY tox.ini .
+
+RUN tox -v; exit 0
+
+COPY . .
diff --git a/README.md b/README.md
@@ -95,6 +95,8 @@ existing reference output files in `tests/rtf-as-html` and `tests/rtf-as-html`.
 The empty or missing output files indicate where functionality is missing,
 which nicely indicates possible places to jump in if you want to help.
 
+To run the tests quietly with Docker and tox, use `docker run --rm $(docker build -q .) tox`. The tests currently run against Python 2.7 and Python 3.6.
+
 
 Dependencies
 ============

diff --git a/pyth/plugins/latex/writer.py b/pyth/plugins/latex/writer.py
@@ -6,7 +6,7 @@
 """
 from __future__ import absolute_import
 
-from six import StringIO
+import six
 import docutils.core
 
 from pyth import document
@@ -15,7 +15,6 @@
 
 
 class LatexWriter(PythWriter):
-
     @classmethod
     def write(klass, document, target=None, stylesheet=""):
         """
@@ -37,7 +36,7 @@ def __init__(self, doc, target=None, stylesheet=""):
         """
         self.document = doc
         self.stylesheet = stylesheet
-        self.target = target if target is not None else StringIO()
+        self.target = target if target is not None else six.BytesIO()
 
     @property
     def full_stylesheet(self):
@@ -57,19 +56,20 @@ def full_stylesheet(self):
         }
         """ % (self.document.properties.get("title"),
                self.document.properties.get("author"),
-               self.document.properties.get("subject"))
+                self.document.properties.get("subject"),
+            )
         return latex_fragment + self.stylesheet
 
     def go(self):
         rst = RSTWriter.write(self.document).getvalue()
-        settings = dict(input_encoding="UTF-8",
-                        output_encoding="UTF-8",
-                        stylesheet="stylesheet.tex")
-        latex = docutils.core.publish_string(rst,
-                                             writer_name="latex",
-                                             settings_overrides=settings)
+        settings = dict(
+            input_encoding="UTF-8", output_encoding="UTF-8", stylesheet="stylesheet.tex"
+        )
+        latex = docutils.core.publish_string(
+            rst, writer_name="latex", settings_overrides=settings
+        )
         # We don't want to keep an \input command in the latex file
-        latex = latex.replace(r"\input{stylesheet.tex}",
-                              self.full_stylesheet)
+        # publish_string() returns encoded bytes here (output_encoding is set), so both replace() operands must be bytes as well.
+        latex = latex.replace(six.ensure_binary(r"\input{stylesheet.tex}"), six.ensure_binary(self.full_stylesheet))
         self.target.write(latex)
         return self.target
diff --git a/pyth/plugins/pdf/writer.py b/pyth/plugins/pdf/writer.py
@@ -3,7 +3,7 @@
 """
 from __future__ import absolute_import
 
-from six import StringIO
+import six
 import cgi # For escape()
 
 from pyth import document
@@ -34,7 +34,7 @@ def write(klass, document, target=None, paragraphStyle=None):
         story = writer.go()
 
         if target is None:
-            target = StringIO()
+            target = six.BytesIO()
 
         doc = SimpleDocTemplate(target)
         doc.build(story)

diff --git a/pyth/plugins/rst/writer.py b/pyth/plugins/rst/writer.py
@@ -1,9 +1,9 @@
 """
 Render documents as reStructuredText.
 """
-from __future__ import absolute_import
+from __future__ import absolute_import, unicode_literals
 import six
-from six import StringIO
+from six import BytesIO
 
 from pyth import document
 from pyth.format import PythWriter
@@ -15,7 +15,7 @@ class RSTWriter(PythWriter):
     @classmethod
     def write(klass, document, target=None):
         if target is None:
-            target = StringIO()
+            target = BytesIO()
 
         writer = RSTWriter(document, target)
         return writer.go()
@@ -28,10 +28,10 @@ def __init__(self, doc, target):
                                   document.Paragraph: self.paragraph}
 
     def go(self):
-        for (i, paragraph) in enumerate(self.document.content):
+        for _, paragraph in enumerate(self.document.content):
             handler = self.paragraphDispatch[paragraph.__class__]
             handler(paragraph)
-            self.target.write("\n")
+            self.target.write(b"\n")
 
         # Heh heh, remove final paragraph spacing
         self.target.seek(-2, 1)
@@ -43,35 +43,35 @@ def text(self, text):
         """
         process a pyth text and return the formatted string
         """
-        ret = u"".join(text.content)
+        ret = "".join(text.content)
         if 'url' in text.properties:
-            return u"`%s`_" % ret
+            return "`%s`_" % ret
         if 'bold' in text.properties:
-            return u"**%s**" % ret
+            return "**%s**" % ret
         if 'italic' in text.properties:
-            return u"*%s*" % ret
+            return "*%s*" % ret
         if 'sub' in text.properties:
-            return six.u(r"\ :sub:`%s`\ " % ret)
+            return r"\ :sub:`%s`\ " % ret
         if 'super' in text.properties:
-            return six.u(r"\ :sup:`%s`\ " % ret)
+            return r"\ :sup:`%s`\ " % ret
         return ret
 
-    def paragraph(self, paragraph, prefix=""):
+    def paragraph(self, paragraph, prefix=b""):
         """
         process a pyth paragraph into the target
         """
         content = []
         for text in paragraph.content:
             content.append(self.text(text))
-        content = u"".join(content).encode("utf-8")
+        content = "".join(content).encode("utf-8")
 
-        for line in content.split("\n"):
-            self.target.write("  " * self.indent)
+        for line in content.split(b"\n"):
+            self.target.write(b"  " * self.indent)
             self.target.write(prefix)
             self.target.write(line)
-            self.target.write("\n")
+            self.target.write(b"\n")
             if prefix:
-                prefix = "  "
+                prefix = b"  "
 
         # handle the links
         if any('url' in text.properties for text in paragraph.content):

diff --git a/pyth/plugins/xhtml/reader.py b/pyth/plugins/xhtml/reader.py
@@ -3,7 +3,7 @@
 """
 from __future__ import absolute_import
 
-from bs4 import BeautifulSoup
+from bs4 import BeautifulSoup, NavigableString
 import six
 
 from pyth import document
@@ -26,9 +26,8 @@ def __init__(self, source, css_source=None, encoding="utf-8", link_callback=None
 
     def go(self):
         soup = BeautifulSoup(self.source,
-                             convertEntities=BeautifulSoup.BeautifulSoup.HTML_ENTITIES,
-                             fromEncoding=self.encoding,
-                             smartQuotesTo=None)
+                             features="xml",
+                             from_encoding=self.encoding)
         # Make sure the document content doesn't use multi-lines
         soup = self.format(soup)
         doc = document.Document()
@@ -58,12 +57,12 @@ def format(self, soup):
                 text = six.text_type(node)
                 lines = [x.strip() for x in text.splitlines()]
                 text = ' '.join(lines)
-                node.replaceWith(BeautifulSoup.BeautifulSoup(text))
-        soup = BeautifulSoup.BeautifulSoup(six.text_type(soup))
+                node.replaceWith(BeautifulSoup(text, features="xml"))
+        soup = BeautifulSoup(six.text_type(soup), features="xml")
         # replace all <br/> tag by newline character
         for node in soup.findAll('br'):
             node.replaceWith("\n")
-        soup = BeautifulSoup.BeautifulSoup(six.text_type(soup))
+        soup = BeautifulSoup(six.text_type(soup), features="xml")
         return soup
 
     def is_bold(self, node):
@@ -143,7 +142,7 @@ def process_into(self, node, obj):
         Process a BeautifulSoup node and fill its elements into a pyth
         base object.
         """
-        if isinstance(node, BeautifulSoup.NavigableString):
+        if isinstance(node, NavigableString):
             text = self.process_text(node)
             if text:
                 obj.append(text)

diff --git a/pyth/plugins/xhtml/writer.py b/pyth/plugins/xhtml/writer.py
@@ -1,6 +1,8 @@
 """
 Render documents as XHTML fragments
 """
+import os
+
 from pyth import document
 from pyth.format import PythWriter
 
@@ -17,11 +19,11 @@
 class XHTMLWriter(PythWriter):
 
     @classmethod
-    def write(klass, document, target=None, cssClasses=True, pretty=False):
+    def write(klass, document, target=None, cssClasses=True, pretty=False, newline=os.linesep):
         if target is None:
             target = six.BytesIO()
 
-        writer = XHTMLWriter(document, target, cssClasses, pretty)
+        writer = XHTMLWriter(document, target, cssClasses, pretty, newline)
         final = writer.go()
         final.seek(0)
 
@@ -37,12 +39,12 @@ def write(klass, document, target=None, cssClasses=True, pretty=False):
 
         return final
 
-
-    def __init__(self, doc, target, cssClasses=True, pretty=False):
+    def __init__(self, doc, target, cssClasses=True, pretty=False, newline=os.linesep):
         self.document = doc
         self.target = target
         self.cssClasses = cssClasses
         self.pretty = pretty
+        self.newline = newline
         self.paragraphDispatch = {
             document.List: self._list,
             document.Paragraph: self._paragraph
@@ -154,7 +156,7 @@ def render(self, target):
 
         if self.tag is not None:
             target.write(('</%s>' % self.tag).encode("utf-8"))
-        
+
 
     def attrString(self):
         return " ".join(

diff --git a/setup.py b/setup.py
@@ -2,7 +2,7 @@
 from setuptools import setup, find_packages
 
 setup(name="pyth3",
-      version="0.7",
+      version="0.7.1",
       packages = find_packages(),
       zip_safe = False,
 

diff --git a/tests/test_readrtf15.py b/tests/test_readrtf15.py
@@ -6,6 +6,8 @@
 """
 from __future__ import absolute_import
 from __future__ import print_function
+from __future__ import unicode_literals
+
 import glob
 import os
 import os.path
@@ -15,6 +17,10 @@
 from pyth.plugins.xhtml.writer import XHTMLWriter, write_html_file
 from pyth.plugins.plaintext.writer import PlaintextWriter
 
+
+TEST_LINE_SEP = '\r\n'  # Reference Outputs use CRLF
+
+
 class TestRtfHTML(unittest.TestCase):
     pass  # will be filled dynamically now:
 
@@ -45,7 +51,7 @@ def testmethod(self):  # the test method to be added
             write_html_file(outputfilename, the_testoutput, print_msg=False)
         elif writer == 'txt':
             with open(outputfilename, "wt") as f:
-                PlaintextWriter.write(document, f)
+                PlaintextWriter.write(document, f, newline=TEST_LINE_SEP)
 
         #--- compute test output:
         with open(outputfilename, "rb") as input:

diff --git a/tests/test_readxhtml.py b/tests/test_readxhtml.py
@@ -18,23 +18,23 @@ def test_basic(self):
         """
         xhtml = "<div></div>"
         doc = XHTMLReader.read(xhtml)
-        self.assert_(isinstance(doc, pyth.document.Document))
-        self.assert_(not doc.content)
+        self.assertTrue(isinstance(doc, pyth.document.Document))
+        self.assertTrue(not doc.content)
 
     def test_paragraphs(self):
         """
         Try to read a simple xhtml document containing tree paragraphs
         """
         xhtml = "<div><p>p0</p><p>p1</p><p>p2</p></div>"
         doc = XHTMLReader.read(xhtml)
-        self.assert_(len(doc.content) == 3)
+        self.assertTrue(len(doc.content) == 3)
         for i, p in enumerate(doc.content):
-            self.assert_(isinstance(p, pyth.document.Paragraph))
-            self.assert_(len(p.content) == 1)
-            self.assert_(isinstance(p.content[0], pyth.document.Text))
+            self.assertTrue(isinstance(p, pyth.document.Paragraph))
+            self.assertTrue(len(p.content) == 1)
+            self.assertTrue(isinstance(p.content[0], pyth.document.Text))
             text = p.content[0]
-            self.assert_(len(text.content) == 1)
-            self.assert_(text.content[0] == 'p%d' % i)
+            self.assertTrue(len(text.content) == 1)
+            self.assertTrue(text.content[0] == 'p%d' % i)
 
     def test_bold(self):
         """