From 21f1262bcb68e3395ce1cff9fb9284291b3dd066 Mon Sep 17 00:00:00 2001
From: Niels Thykier <niels@thykier.net>
Date: Tue, 19 May 2020 18:18:29 +0000
Subject: [PATCH 1/4] Avoid creating single-use-throw-away lists for string
 join

There is no need to create a list of it only to discard it after a
single use with join (which gladly accepts an iterator/generator
instead).

Signed-off-by: Niels Thykier <niels@thykier.net>
---
 scour/scour.py | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/scour/scour.py b/scour/scour.py
index 638ee5f..150b8e4 100644
--- a/scour/scour.py
+++ b/scour/scour.py
@@ -562,7 +562,7 @@ def findReferencedElements(node, ids=None):
         # one stretch of text, please! (we could use node.normalize(), but
         # this actually modifies the node, and we don't want to keep
         # whitespace around if there's any)
-        stylesheet = "".join([child.nodeValue for child in node.childNodes])
+        stylesheet = "".join(child.nodeValue for child in node.childNodes)
         if stylesheet != '':
             cssRules = parseCssString(stylesheet)
             for rule in cssRules:
@@ -853,7 +853,7 @@ def renameID(idFrom, idTo, identifiedElements, referringNodes):
                     # there's a CDATASection node surrounded by whitespace
                     # nodes
                     # (node.normalize() will NOT work here, it only acts on Text nodes)
-                    oldValue = "".join([child.nodeValue for child in node.childNodes])
+                    oldValue = "".join(child.nodeValue for child in node.childNodes)
                     # not going to reparse the whole thing
                     newValue = oldValue.replace('url(#' + idFrom + ')', 'url(#' + idTo + ')')
                     newValue = newValue.replace("url(#'" + idFrom + "')", 'url(#' + idTo + ')')
@@ -1617,7 +1617,7 @@ def _getStyle(node):
 
 def _setStyle(node, styleMap):
     u"""Sets the style attribute of a node to the dictionary ``styleMap``."""
-    fixedStyle = ';'.join([prop + ':' + styleMap[prop] for prop in styleMap])
+    fixedStyle = ';'.join(prop + ':' + styleMap[prop] for prop in styleMap)
     if fixedStyle != '':
         node.setAttribute('style', fixedStyle)
     elif node.getAttribute('style'):
@@ -2837,18 +2837,18 @@ def serializePath(pathObj, options):
     """
     # elliptical arc commands must have comma/wsp separating the coordinates
     # this fixes an issue outlined in Fix https://bugs.launchpad.net/scour/+bug/412754
-    return ''.join([cmd + scourCoordinates(data, options,
-                                           control_points=controlPoints(cmd, data),
-                                           flags=flags(cmd, data))
-                    for cmd, data in pathObj])
+    return ''.join(cmd + scourCoordinates(data, options,
+                                          control_points=controlPoints(cmd, data),
+                                          flags=flags(cmd, data))
+                   for cmd, data in pathObj)
 
 
 def serializeTransform(transformObj):
     """
        Reserializes the transform data with some cleanups.
     """
-    return ' '.join([command + '(' + ' '.join([scourUnitlessLength(number) for number in numbers]) + ')'
-                     for command, numbers in transformObj])
+    return ' '.join(command + '(' + ' '.join(scourUnitlessLength(number) for number in numbers) + ')'
+                    for command, numbers in transformObj)
 
 
 def scourCoordinates(data, options, force_whitespace=False, control_points=[], flags=[]):
@@ -3408,7 +3408,7 @@ def makeWellFormed(str, quote=''):
     xml_ents = {'<': '&lt;', '>': '&gt;', '&': '&amp;'}
     if quote:
         xml_ents[quote] = '&apos;' if (quote == "'") else "&quot;"
-    return ''.join([xml_ents[c] if c in xml_ents else c for c in str])
+    return ''.join(xml_ents[c] if c in xml_ents else c for c in str)
 
 
 def chooseQuoteCharacter(str):
@@ -3477,7 +3477,7 @@ def serializeXML(element, options, indent_depth=0, preserveWhitespace=False):
 
         if attr.nodeName == 'style':
             # sort declarations
-            attrValue = ';'.join([p for p in sorted(attrValue.split(';'))])
+            attrValue = ';'.join(sorted(attrValue.split(';')))
 
         outParts.append(' ')
         # preserve xmlns: if it is a namespace prefix declaration

From 5be6b03d7cb48c4354e5f7f313b98292cb88d72d Mon Sep 17 00:00:00 2001
From: Niels Thykier <niels@thykier.net>
Date: Tue, 19 May 2020 21:36:58 +0000
Subject: [PATCH 2/4] Serialization: Avoid creating a single-use dict in each
 call to make_well_formed

Signed-off-by: Niels Thykier <niels@thykier.net>
---
 scour/scour.py | 40 +++++++++++++++++++++++-----------------
 test_scour.py  | 21 +++++++++++----------
 2 files changed, 34 insertions(+), 27 deletions(-)

diff --git a/scour/scour.py b/scour/scour.py
index 150b8e4..15de2bd 100644
--- a/scour/scour.py
+++ b/scour/scour.py
@@ -74,6 +74,12 @@ VER = __version__
 COPYRIGHT = u'Copyright Jeff Schiller, Louis Simard, 2010'
 
 
+XML_ENTS_NO_QUOTES = {'<': '&lt;', '>': '&gt;', '&': '&amp;'}
+XML_ENTS_ESCAPE_APOS = XML_ENTS_NO_QUOTES.copy()
+XML_ENTS_ESCAPE_APOS["'"] = '&apos;'
+XML_ENTS_ESCAPE_QUOT = XML_ENTS_NO_QUOTES.copy()
+XML_ENTS_ESCAPE_QUOT['"'] = '&quot;'
+
 NS = {'SVG':      'http://www.w3.org/2000/svg',
       'XLINK':    'http://www.w3.org/1999/xlink',
       'SODIPODI': 'http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd',
@@ -3404,23 +3410,23 @@ def remapNamespacePrefix(node, oldprefix, newprefix):
         remapNamespacePrefix(child, oldprefix, newprefix)
 
 
-def makeWellFormed(str, quote=''):
-    xml_ents = {'<': '&lt;', '>': '&gt;', '&': '&amp;'}
-    if quote:
-        xml_ents[quote] = '&apos;' if (quote == "'") else "&quot;"
-    return ''.join(xml_ents[c] if c in xml_ents else c for c in str)
+def make_well_formed(text, quote_dict=None):
+    if quote_dict is None:
+        quote_dict = XML_ENTS_NO_QUOTES
+    return ''.join(quote_dict[c] if c in quote_dict else c for c in text)
 
 
-def chooseQuoteCharacter(str):
-    quotCount = str.count('"')
-    aposCount = str.count("'")
-    if quotCount > aposCount:
-        quote = "'"
-        hasEmbeddedQuote = aposCount
-    else:
+def choose_quote_character(value):
+    quot_count = value.count('"')
+    if quot_count == 0 or quot_count <= value.count("'"):
+        # Fewest "-symbols (if there are 0, we pick this to avoid spending
+        # time counting the '-symbols as it won't matter)
         quote = '"'
-        hasEmbeddedQuote = quotCount
-    return (quote, hasEmbeddedQuote)
+        xml_ent = XML_ENTS_ESCAPE_QUOT
+    else:
+        quote = "'"
+        xml_ent = XML_ENTS_ESCAPE_APOS
+    return quote, xml_ent
 
 
 TEXT_CONTENT_ELEMENTS = ['text', 'tspan', 'tref', 'textPath', 'altGlyph',
@@ -3472,8 +3478,8 @@ def serializeXML(element, options, indent_depth=0, preserveWhitespace=False):
         attr = attrList.item(index)
 
         attrValue = attr.nodeValue
-        (quote, hasEmbeddedQuote) = chooseQuoteCharacter(attrValue)
-        attrValue = makeWellFormed(attrValue, quote if hasEmbeddedQuote else '')
+        quote, xml_ent = choose_quote_character(attrValue)
+        attrValue = make_well_formed(attrValue, xml_ent)
 
         if attr.nodeName == 'style':
             # sort declarations
@@ -3532,7 +3538,7 @@ def serializeXML(element, options, indent_depth=0, preserveWhitespace=False):
                             text_content = text_content.replace('  ', ' ')
                     else:
                         text_content = text_content.strip()
-                outParts.append(makeWellFormed(text_content))
+                outParts.append(make_well_formed(text_content))
             # CDATA node
             elif child.nodeType == Node.CDATA_SECTION_NODE:
                 outParts.extend(['<![CDATA[', child.nodeValue, ']]>'])
diff --git a/test_scour.py b/test_scour.py
index 6c4c7ce..e55b9db 100755
--- a/test_scour.py
+++ b/test_scour.py
@@ -30,7 +30,8 @@ import unittest
 import six
 from six.moves import map, range
 
-from scour.scour import makeWellFormed, parse_args, scourString, scourXmlFile, start, run
+from scour.scour import (make_well_formed, parse_args, scourString, scourXmlFile, start, run,
+                         XML_ENTS_ESCAPE_APOS, XML_ENTS_ESCAPE_QUOT)
 from scour.svg_regex import svg_parser
 from scour import __version__
 
@@ -1893,26 +1894,26 @@ class EnsureLineEndings(unittest.TestCase):
 class XmlEntities(unittest.TestCase):
 
     def runTest(self):
-        self.assertEqual(makeWellFormed('<>&'), '&lt;&gt;&amp;',
+        self.assertEqual(make_well_formed('<>&'), '&lt;&gt;&amp;',
                          'Incorrectly translated unquoted XML entities')
-        self.assertEqual(makeWellFormed('<>&', "'"), '&lt;&gt;&amp;',
+        self.assertEqual(make_well_formed('<>&', XML_ENTS_ESCAPE_APOS), '&lt;&gt;&amp;',
                          'Incorrectly translated single-quoted XML entities')
-        self.assertEqual(makeWellFormed('<>&', '"'), '&lt;&gt;&amp;',
+        self.assertEqual(make_well_formed('<>&', XML_ENTS_ESCAPE_QUOT), '&lt;&gt;&amp;',
                          'Incorrectly translated double-quoted XML entities')
 
-        self.assertEqual(makeWellFormed("'"), "'",
+        self.assertEqual(make_well_formed("'"), "'",
                          'Incorrectly translated unquoted single quote')
-        self.assertEqual(makeWellFormed('"'), '"',
+        self.assertEqual(make_well_formed('"'), '"',
                          'Incorrectly translated unquoted double quote')
 
-        self.assertEqual(makeWellFormed("'", '"'), "'",
+        self.assertEqual(make_well_formed("'", XML_ENTS_ESCAPE_QUOT), "'",
                          'Incorrectly translated double-quoted single quote')
-        self.assertEqual(makeWellFormed('"', "'"), '"',
+        self.assertEqual(make_well_formed('"', XML_ENTS_ESCAPE_APOS), '"',
                          'Incorrectly translated single-quoted double quote')
 
-        self.assertEqual(makeWellFormed("'", "'"), '&apos;',
+        self.assertEqual(make_well_formed("'", XML_ENTS_ESCAPE_APOS), '&apos;',
                          'Incorrectly translated single-quoted single quote')
-        self.assertEqual(makeWellFormed('"', '"'), '&quot;',
+        self.assertEqual(make_well_formed('"', XML_ENTS_ESCAPE_QUOT), '&quot;',
                          'Incorrectly translated double-quoted double quote')
 
 

From 9656569a72a8a46cbf5465f43515f9fc38bfecea Mon Sep 17 00:00:00 2001
From: Niels Thykier <niels@thykier.net>
Date: Thu, 21 May 2020 12:15:32 +0000
Subject: [PATCH 3/4] serializeXML: Refactor the attribute ordering code

Rewrite the code for ordering attributes in the output and extract it
into a function.  As a side-effect, we ensure we only use the
`.item(index)` method once per attribute because it is inefficient
(see https://bugs.python.org/issue40689).

Signed-off-by: Niels Thykier <niels@thykier.net>
---
 scour/scour.py | 62 +++++++++++++++++++++++++++++++-------------------
 1 file changed, 38 insertions(+), 24 deletions(-)

diff --git a/scour/scour.py b/scour/scour.py
index 15de2bd..473a2ed 100644
--- a/scour/scour.py
+++ b/scour/scour.py
@@ -3433,6 +3433,42 @@ TEXT_CONTENT_ELEMENTS = ['text', 'tspan', 'tref', 'textPath', 'altGlyph',
                          'flowDiv', 'flowPara', 'flowSpan', 'flowTref', 'flowLine']
 
 
+KNOWN_ATTRS = [
+        # TODO: Maybe update with full list from https://www.w3.org/TR/SVG/attindex.html
+        # (but should be kept intuitively ordered)
+        'id', 'xml:id', 'class',
+        'transform',
+        'x', 'y', 'z', 'width', 'height', 'x1', 'x2', 'y1', 'y2',
+        'dx', 'dy', 'rotate', 'startOffset', 'method', 'spacing',
+        'cx', 'cy', 'r', 'rx', 'ry', 'fx', 'fy',
+        'd', 'points',
+    ] + sorted(svgAttributes) + [
+        'style',
+    ]
+
+KNOWN_ATTRS_ORDER_BY_NAME = defaultdict(lambda: len(KNOWN_ATTRS),
+                                        {name: order for order, name in enumerate(KNOWN_ATTRS)})
+
+
+# use custom order for known attributes and alphabetical order for the rest
+def _attribute_sort_key_function(attribute):
+    name = attribute.name
+    order_value = KNOWN_ATTRS_ORDER_BY_NAME[name]
+    return order_value, name
+
+
+def attributes_ordered_for_output(element):
+    if not element.hasAttributes():
+        return []
+    attribute = element.attributes
+    # The .item(i) call is painfully slow (bpo#40689). Therefore we ensure we
+    # call it at most once per attribute.
+    # - it would be many times faster to use `attribute.values()` but sadly
+    #   that is an "experimental" interface.
+    return sorted((attribute.item(i) for i in range(attribute.length)),
+                  key=_attribute_sort_key_function)
+
+
 # hand-rolled serialization function that has the following benefits:
 # - pretty printing
 # - somewhat judicious use of whitespace
@@ -3453,30 +3489,8 @@ def serializeXML(element, options, indent_depth=0, preserveWhitespace=False):
     outParts.extend([(indent_type * indent_depth), '<', element.nodeName])
 
     # now serialize the other attributes
-    known_attr = [
-        # TODO: Maybe update with full list from https://www.w3.org/TR/SVG/attindex.html
-        # (but should be kept inuitively ordered)
-        'id', 'xml:id', 'class',
-        'transform',
-        'x', 'y', 'z', 'width', 'height', 'x1', 'x2', 'y1', 'y2',
-        'dx', 'dy', 'rotate', 'startOffset', 'method', 'spacing',
-        'cx', 'cy', 'r', 'rx', 'ry', 'fx', 'fy',
-        'd', 'points',
-    ] + sorted(svgAttributes) + [
-        'style',
-    ]
-    attrList = element.attributes
-    attrName2Index = dict([(attrList.item(i).nodeName, i) for i in range(attrList.length)])
-    # use custom order for known attributes and alphabetical order for the rest
-    attrIndices = []
-    for name in known_attr:
-        if name in attrName2Index:
-            attrIndices.append(attrName2Index[name])
-            del attrName2Index[name]
-    attrIndices += [attrName2Index[name] for name in sorted(attrName2Index)]
-    for index in attrIndices:
-        attr = attrList.item(index)
-
+    attrs = attributes_ordered_for_output(element)
+    for attr in attrs:
         attrValue = attr.nodeValue
         quote, xml_ent = choose_quote_character(attrValue)
         attrValue = make_well_formed(attrValue, xml_ent)

From 397ffc55297c8a56b7c25a5f9c3eddd2d78723fe Mon Sep 17 00:00:00 2001
From: Niels Thykier <niels@thykier.net>
Date: Thu, 21 May 2020 12:40:04 +0000
Subject: [PATCH 4/4] make_well_formed: Optimize for the common case of nothing
 needs to be escaped

Signed-off-by: Niels Thykier <niels@thykier.net>
---
 scour/scour.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/scour/scour.py b/scour/scour.py
index 473a2ed..18a81d2 100644
--- a/scour/scour.py
+++ b/scour/scour.py
@@ -3413,6 +3413,11 @@ def remapNamespacePrefix(node, oldprefix, newprefix):
 def make_well_formed(text, quote_dict=None):
     if quote_dict is None:
         quote_dict = XML_ENTS_NO_QUOTES
+    if not any(c in text for c in quote_dict):
+        # The quote-able characters are quite rare in SVG (they mostly only
+        # occur in text elements in practice).  Therefore it make sense to
+        # optimize for this common case
+        return text
     return ''.join(quote_dict[c] if c in quote_dict else c for c in text)