From 6d2e69ce056c24d60d397881d56f018cad2f2475 Mon Sep 17 00:00:00 2001 From: Colin Rofls Date: Tue, 3 Aug 2021 12:14:17 -0400 Subject: [PATCH] Do not modify contents of <note> tag (#86) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Do not modify contents of <note> tag This changes the behaviour around normalizing the <note> field of .glif files. Previously the contents of this element would be broken into lines and reindented, but this was a potentially destructive operation. With this patch, any content of a <note> element is passed through unchanged. For more discussion, see: https://github.com/unified-font-object/ufoNormalizer/issues/85 * Address code review comments - make sure we're escaping XML correctly, and test that - remove dedent_tabs function - add a test for correctly encoding 'é' --- src/ufonormalizer/__init__.py | 86 +--------------------------------- tests/data/glif/format2.glif | 4 +- tests/test_ufonormalizer.py | 88 +++++++++++------------------------ 3 files changed, 30 insertions(+), 148 deletions(-) diff --git a/src/ufonormalizer/__init__.py b/src/ufonormalizer/__init__.py index 1f16a99..87dcb94 100644 --- a/src/ufonormalizer/__init__.py +++ b/src/ufonormalizer/__init__.py @@ -4,11 +4,9 @@ import binascii import time import os -import re import shutil from xml.etree import cElementTree as ET import plistlib -import textwrap import datetime import glob from collections import OrderedDict @@ -838,9 +836,7 @@ def _normalizeGlifNote(element, writer): return if not value.strip(): return - writer.beginElement("note") - writer.text(value) - writer.endElement("note") + writer.simpleElement("note", value=xmlEscapeText(value)) def _normalizeGlifOutlineFormat1(element, writer): @@ -1261,29 +1257,6 @@ def data(self, text): line = "" % text self.raw(line) - def text(self, text): - text = text.strip("\n") - text = dedent_tabs(text) - text = text.strip() - text = xmlEscapeText(text) - paragraphs = [] - for paragraph in text.splitlines(): - 
if not paragraph: - paragraphs.append("") - else: - paragraph = textwrap.wrap( - paragraph.rstrip(), - width=xmlTextMaxLineLength, - expand_tabs=False, - replace_whitespace=False, - drop_whitespace=False, - break_long_words=False, - break_on_hyphens=False - ) - paragraphs.extend(paragraph) - for line in paragraphs: - self.raw(line) - def simpleElement(self, tag, attrs=None, value=None): if attrs: attrs = self.attributesToString(attrs) @@ -1450,63 +1423,6 @@ def xmlConvertInt(value): return str(value) -# --------------- -# Text Operations -# --------------- - -WHITESPACE_ONLY_RE = re.compile(r'^[\s\t]+$', re.MULTILINE) -LEADING_WHITESPACE_RE = re.compile(r'(^(?:\s{4}|\t)*)(?:[^\t\n])', re.MULTILINE) - - -def dedent_tabs(text): - """ - Based on `textwrap.dedent`, but modified to only work on tabs and 4-space indents - - Remove any common leading tabs from every line in `text`. - This can be used to make triple-quoted strings line up with the left - edge of the display, while still presenting them in the source code - in indented form. - - Entirely blank lines are normalized to a newline character. - """ - # Look for the longest leading string of spaces and tabs common to - # all lines. - margin = None - text = WHITESPACE_ONLY_RE.sub('', text) - indents = LEADING_WHITESPACE_RE.findall(text) - for indent in indents: - if margin is None: - margin = indent - - # Current line more deeply indented than previous winner: - # no change (previous winner is still on top). - elif indent.startswith(margin): - pass - - # Current line consistent with and no deeper than previous winner: - # it's the new winner. - elif margin.startswith(indent): - margin = indent - - # Find the largest common whitespace between current line and previous - # winner. 
- else: - for i, (x, y) in enumerate(zip(margin, indent)): - if x != y: - margin = margin[:i] - break - - # sanity check (testing/debugging only) - if 0 and margin: - for line in text.split("\n"): - assert not line or line.startswith(margin), \ - "line = %r, margin = %r" % (line, margin) - - if margin: - text = re.sub(r'(?m)^' + margin, '', text) - return text - - # --------------- # Path Operations # --------------- diff --git a/tests/data/glif/format2.glif b/tests/data/glif/format2.glif index 83c5f6d..19cba54 100644 --- a/tests/data/glif/format2.glif +++ b/tests/data/glif/format2.glif @@ -35,7 +35,5 @@ 1,0,0,0.5 - - arbitrary text about the glyph - + arbitrary text about the glyph diff --git a/tests/test_ufonormalizer.py b/tests/test_ufonormalizer.py index 4589f80..f242409 100644 --- a/tests/test_ufonormalizer.py +++ b/tests/test_ufonormalizer.py @@ -107,9 +107,7 @@ 1,0,0,0.5 - - arbitrary text about the glyph - + arbitrary text about the glyph ''' @@ -767,25 +765,35 @@ def test_normalizeGLIF_lib_undefined(self): self.assertEqual(writer.getText(), '') def test_normalizeGLIF_note_defined(self): + """ Serialization of notes is non-fancy: we take the note text and + use it, unchanged, as the body of the element. In previous + version of ufonormalizer we would break the user text into lines. See + https://github.com/unified-font-object/ufoNormalizer/issues/85 for some + background. 
+ """ + element = ET.fromstring("Blah") writer = XMLWriter(declaration=None) _normalizeGlifNote(element, writer) - self.assertEqual(writer.getText(), "\n\tBlah\n") + self.assertEqual(writer.getText(), "Blah") - element = ET.fromstring(" Blah \t\n\t ") + # encode accent correctly + element = ET.fromstring( + tobytes("Don't forget to check the béziers!!", + encoding="utf8")) writer = XMLWriter(declaration=None) _normalizeGlifNote(element, writer) - self.assertEqual(writer.getText(), "\n\tBlah\n") + self.assertEqual( + writer.getText(), + "Don't forget to check the b\xe9ziers!!") - element = ET.fromstring( - tobytes("Don't forget to check the béziers!!", - encoding="utf8")) + # trailing whitespace is preserved + element = ET.fromstring(" Blah \t\n\t ") writer = XMLWriter(declaration=None) _normalizeGlifNote(element, writer) - self.assertEqual( - writer.getText(), - "\n\tDon't forget to check the b\xe9ziers!!\n") + self.assertEqual(writer.getText(), " Blah \t\n\t ") + # multiline strings are preserved element = ET.fromstring( tobytes("A quick brown fox jumps over the lazy dog.\n" "Příliš žluťoučký kůň úpěl ďábelské ódy.", @@ -794,64 +802,24 @@ def test_normalizeGLIF_note_defined(self): _normalizeGlifNote(element, writer) self.assertEqual( writer.getText(), - "\n\tA quick brown fox jumps over the lazy dog.\n\t" + "A quick brown fox jumps over the lazy dog.\n" "P\u0159\xedli\u0161 \u017elu\u0165ou\u010dk\xfd k\u016f\u0148 " - "\xfap\u011bl \u010f\xe1belsk\xe9 \xf3dy.\n") - - element = ET.fromstring( - " Line1 \t\n\n Line3\t ") - writer = XMLWriter(declaration=None) - _normalizeGlifNote(element, writer) - self.assertEqual( - writer.getText(), - "\n\tLine1\n\t\n\t Line3\n") - - # Normalizer should not indent Line2 and Line3 more than already indented - element = ET.fromstring( - "\n\tLine1\n\tLine2\n\tLine3\n") - writer = XMLWriter(declaration=None) - _normalizeGlifNote(element, writer) - self.assertEqual( - writer.getText(), - "\n\tLine1\n\tLine2\n\tLine3\n") - - # 
Normalizer should keep the extra tab in line 2 - element = ET.fromstring( - "\n\tLine1\n\t\tLine2\n\tLine3\n") - writer = XMLWriter(declaration=None) - _normalizeGlifNote(element, writer) - self.assertEqual( - writer.getText(), - "\n\tLine1\n\t\tLine2\n\tLine3\n") + "\xfap\u011bl \u010f\xe1belsk\xe9 \xf3dy.") - # Normalizer should keep the extra spaces on line 2 + # Everything is always preserved element = ET.fromstring( - "\n\tLine1\n\t Line2\n\tLine3\n") + "\n\tLine1\n\t\tLine2\n\t Line3\n") writer = XMLWriter(declaration=None) _normalizeGlifNote(element, writer) self.assertEqual( writer.getText(), - "\n\tLine1\n\t Line2\n\tLine3\n") + "\n\tLine1\n\t\tLine2\n\t Line3\n") - # Normalizer should remove the extra tab all lines have in common, - # but leave the additional tab on line 2 - element = ET.fromstring( - "\n\t\tLine1\n\t\t\tLine2\n\t\tLine3\n") + # correctly escape xml + element = ET.fromstring("escape<br />me!") writer = XMLWriter(declaration=None) _normalizeGlifNote(element, writer) - self.assertEqual( - writer.getText(), - "\n\tLine1\n\t\tLine2\n\tLine3\n") - - # Normalizer should remove the extra 4-space all lines have in common, - # but leave the additional 4-space on line 2 - element = ET.fromstring( - "\n Line1\n Line2\n Line3\n") - writer = XMLWriter(declaration=None) - _normalizeGlifNote(element, writer) - self.assertEqual( - writer.getText(), - "\n\tLine1\n\t Line2\n\tLine3\n") + self.assertEqual(writer.getText(), "escape<br />me!") def test_normalizeGLIF_note_undefined(self): element = ET.fromstring("")