Merge pull request #199 from Ede123/newline_handling

Several improvements for handling whitespace including newlines, especially in text nodes
2018-07-03 22:56:36 +02:00 · 2018-07-03 22:56:36 +02:00 · 718748ff22
commit 718748ff22
parent 06ea23d0e1 651694a6c0
6 changed files with 190 additions and 55 deletions
--- a/scour/scour.py
+++ b/scour/scour.py
@@ -3334,8 +3334,6 @@ def serializeXML(element, options, indent_depth=0, preserveWhitespace=False):
    children = element.childNodes
    if children.length == 0:
        outParts.append('/>')
-        if indent_depth > 0:
-            outParts.append(newline)
    else:
        outParts.append('>')

@@ -3343,34 +3341,47 @@ def serializeXML(element, options, indent_depth=0, preserveWhitespace=False):
        for child in element.childNodes:
            # element node
            if child.nodeType == Node.ELEMENT_NODE:
-                if preserveWhitespace:
+                # do not indent inside text content elements as in SVG there's a difference between
+                #    "text1\ntext2" and
+                #    "text1\n text2"
+                # see https://www.w3.org/TR/SVG/text.html#WhiteSpace
+                if preserveWhitespace or element.nodeName in ['text', 'tspan', 'tref', 'textPath', 'altGlyph']:
                    outParts.append(serializeXML(child, options, 0, preserveWhitespace))
                else:
                    outParts.extend([newline, serializeXML(child, options, indent_depth + 1, preserveWhitespace)])
                    onNewLine = True
            # text node
            elif child.nodeType == Node.TEXT_NODE:
-                # trim it only in the case of not being a child of an element
-                # where whitespace might be important
-                if preserveWhitespace:
-                    outParts.append(makeWellFormed(child.nodeValue))
-                else:
-                    outParts.append(makeWellFormed(child.nodeValue.strip()))
+                text_content = child.nodeValue
+                if not preserveWhitespace:
+                    # strip / consolidate whitespace according to spec, see
+                    #    https://www.w3.org/TR/SVG/text.html#WhiteSpace
+                    if element.nodeName in ['text', 'tspan', 'tref', 'textPath', 'altGlyph']:
+                        text_content = text_content.replace('\n', '')
+                        text_content = text_content.replace('\t', ' ')
+                        if child == element.firstChild:
+                            text_content = text_content.lstrip()
+                        elif child == element.lastChild:
+                            text_content = text_content.rstrip()
+                        while '  ' in text_content:
+                            text_content = text_content.replace('  ', ' ')
+                    else:
+                        text_content = text_content.strip()
+                outParts.append(makeWellFormed(text_content))
            # CDATA node
            elif child.nodeType == Node.CDATA_SECTION_NODE:
                outParts.extend(['<![CDATA[', child.nodeValue, ']]>'])
            # Comment node
            elif child.nodeType == Node.COMMENT_NODE:
-                outParts.extend(['<!--', child.nodeValue, '-->'])
+                outParts.extend([newline, indent_type * (indent_depth+1), '<!--', child.nodeValue, '-->'])
            # TODO: entities, processing instructions, what else?
            else:  # ignore the rest
                pass

        if onNewLine:
+            outParts.append(newline)
            outParts.append(indent_type * indent_depth)
        outParts.extend(['</', element.nodeName, '>'])
-        if indent_depth > 0:
-            outParts.append(newline)

    return "".join(outParts)

@@ -3632,13 +3643,6 @@ def scourString(in_string, options=None):
 #  out_string = doc.documentElement.toprettyxml(' ')
    out_string = serializeXML(doc.documentElement, options) + '\n'

-    # now strip out empty lines
-    lines = []
-    # Get rid of empty lines
-    for line in out_string.splitlines(True):
-        if line.strip():
-            lines.append(line)
-
    # return the string with its XML prolog and surrounding comments
    if options.strip_xml_prolog is False:
        total_output = '<?xml version="1.0" encoding="UTF-8"'
@@ -3650,7 +3654,7 @@ def scourString(in_string, options=None):

    for child in doc.childNodes:
        if child.nodeType == Node.ELEMENT_NODE:
-            total_output += "".join(lines)
+            total_output += out_string
        else:  # doctypes, entities, comments
            total_output += child.toxml() + '\n'

--- a/testscour.py
+++ b/testscour.py
@@ -1744,34 +1744,83 @@ class DoNotRemoveGradientsWhenReferencedInStyleCss(unittest.TestCase):
                         'Gradients removed when referenced in CSS')


-class DoNotPrettyPrintWhenWhitespacePreserved(unittest.TestCase):
+class Whitespace(unittest.TestCase):

-    def runTest(self):
-        with open('unittests/whitespace-important.svg') as f:
-            s = scourString(f.read()).splitlines()
-        c = '''<?xml version="1.0" encoding="UTF-8"?>
-<svg xmlns="http://www.w3.org/2000/svg">
- <text xml:space="preserve">This is some <tspan font-style="italic">messed-up</tspan> markup</text>
-</svg>
-'''.splitlines()
-        for i in range(4):
-            self.assertEqual(s[i], c[i],
-                             'Whitespace not preserved for line ' + str(i))
+    def setUp(self):
+        self.doc = scourXmlFile('unittests/whitespace.svg')

+    def test_basic(self):
+        text = self.doc.getElementById('txt_a1')
+        self.assertIn('text1 text2', text.toxml(),
+                      'Multiple spaces not stripped from text element')
+        text = self.doc.getElementById('txt_a2')
+        self.assertIn('text1 text2', text.toxml(),
+                      'Tab not replaced with space in text element')
+        text = self.doc.getElementById('txt_a3')
+        self.assertIn('text1 text2', text.toxml(),
+                      'Multiple spaces not stripped from text element with xml:space="default"')
+        text = self.doc.getElementById('txt_a4')
+        self.assertIn('text1 text2', text.toxml(),
+                      'Tab not replaced with space in text element with xml:space="default"')
+        text = self.doc.getElementById('txt_a5')
+        self.assertIn('text1    text2', text.toxml(),
+                      'Multiple spaces not preserved in text element with xml:space="preserve"')
+        text = self.doc.getElementById('txt_a6')
+        self.assertIn('text1\ttext2', text.toxml(),
+                      'Tab not preserved in text element with xml:space="preserve"')

-class DoNotPrettyPrintWhenNestedWhitespacePreserved(unittest.TestCase):
+    def test_newlines(self):
+        text = self.doc.getElementById('txt_b1')
+        self.assertIn('text1 text2', text.toxml(),
+                      'Newline not replaced with space in text element')
+        text = self.doc.getElementById('txt_b2')
+        self.assertIn('text1 text2', text.toxml(),
+                      'Newline not replaced with space in text element with xml:space="default"')
+        text = self.doc.getElementById('txt_b3')
+        self.assertIn('text1\n       text2', text.toxml(),
+                      'Newline not preserved in text element with xml:space="preserve"')

-    def runTest(self):
-        with open('unittests/whitespace-nested.svg') as f:
-            s = scourString(f.read()).splitlines()
-        c = '''<?xml version="1.0" encoding="UTF-8"?>
-<svg xmlns="http://www.w3.org/2000/svg">
- <text xml:space="preserve"><tspan font-style="italic">Use <tspan font-style="bold">bold</tspan> text</tspan></text>
-</svg>
-'''.splitlines()
-        for i in range(4):
-            self.assertEqual(s[i], c[i],
-                             'Whitespace not preserved when nested for line ' + str(i))
+    def test_inheritance(self):
+        text = self.doc.getElementById('txt_c1')
+        self.assertIn('text1    text2', text.toxml(),
+                      '<tspan> does not inherit xml:space="preserve" of parent text element')
+        text = self.doc.getElementById('txt_c2')
+        self.assertIn('text1 text2', text.toxml(),
+                      'xml:space="default" of <tspan> does not overwrite xml:space="preserve" of parent text element')
+        text = self.doc.getElementById('txt_c3')
+        self.assertIn('text1    text2', text.toxml(),
+                      'xml:space="preserve" of <tspan> does not overwrite xml:space="default" of parent text element')
+        text = self.doc.getElementById('txt_c4')
+        self.assertIn('text1    text2', text.toxml(),
+                      '<text> does not inherit xml:space="preserve" of parent group')
+        text = self.doc.getElementById('txt_c5')
+        self.assertIn('text1 text2', text.toxml(),
+                      'xml:space="default" of text element does not overwrite xml:space="preserve" of parent group')
+        text = self.doc.getElementById('txt_c6')
+        self.assertIn('text1    text2', text.toxml(),
+                      'xml:space="preserve" of text element does not overwrite xml:space="default" of parent group')
+
+    def test_important_whitespace(self):
+        text = self.doc.getElementById('txt_d1')
+        self.assertIn('text1 text2', text.toxml(),
+                      'Newline with whitespace collapsed in text element')
+        text = self.doc.getElementById('txt_d2')
+        self.assertIn('text1 <tspan>tspan1</tspan> text2', text.toxml(),
+                      'Whitespace stripped from the middle of a text element')
+        text = self.doc.getElementById('txt_d3')
+        self.assertIn('text1 <tspan>tspan1 <tspan>tspan2</tspan> text2</tspan>', text.toxml(),
+                      'Whitespace stripped from the middle of a text element')
+
+    def test_incorrect_whitespace(self):
+        text = self.doc.getElementById('txt_e1')
+        self.assertIn('text1text2', text.toxml(),
+                      'Whitespace introduced in text element with newline')
+        text = self.doc.getElementById('txt_e2')
+        self.assertIn('text1<tspan>tspan</tspan>text2', text.toxml(),
+                      'Whitespace introduced in text element with <tspan>')
+        text = self.doc.getElementById('txt_e3')
+        self.assertIn('text1<tspan>tspan</tspan>text2', text.toxml(),
+                      'Whitespace introduced in text element with <tspan> and newlines')


 class GetAttrPrefixRight(unittest.TestCase):
@@ -1807,10 +1856,10 @@ class HandleEmptyStyleElement(unittest.TestCase):
 class EnsureLineEndings(unittest.TestCase):

    def runTest(self):
-        with open('unittests/whitespace-important.svg') as f:
+        with open('unittests/newlines.svg') as f:
            s = scourString(f.read())
-        self.assertEqual(len(s.splitlines()), 4,
-                         'Did not output line ending character correctly')
+        self.assertEqual(len(s.splitlines()), 24,
+                         'Did handle reading or outputting line ending characters correctly')


 class XmlEntities(unittest.TestCase):
--- a/unittests/newlines.svg
+++ b/unittests/newlines.svg
@@ -0,0 +1,50 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+	
+
+<svg xmlns="http://www.w3.org/2000/svg"
+
+>
+
+
+
+     <!-- this file has pretty messed up formatting -->       <rect width="100" height="100"/>	
+  
<rect width="100" height="100"/>
+  <rect width="100" height="100"/>
+  
+  
+ 	
+  
+  
+  <rect width="100" height="100"/>
+                 <rect width="100" height="100"/>	
+  <rect width="100" height="100"/>            
+  
+  
+  
+  
+  
+  
+  <!-- we have mixed newline
+  characters, carriage returns
  and both of them
+  as well as tabs 	   		and spaces        
+  -->
+  
+  <rect width="100" height="100"/><rect width="100" height="100"/>   <rect width="100" height="100"/>
+  
+  <rect width="100" height="100"/>
<rect width="100" height="100"/>   <rect width="100" height="100"/>
+  
+	
+  
+		
+  
+  <rect width="100" height="100"/>		  	<rect width="100" height="100"/>
+  
+  
+</svg>
+
+
+
+
+
+<!-- OMG, really? -->
--- a/unittests/whitespace-important.svg
+++ b/unittests/whitespace-important.svg
@@ -1,4 +0,0 @@
-<?xml version="1.0" encoding="UTF-8" standalone="no"?>
-<svg xmlns="http://www.w3.org/2000/svg">
- <text xml:space="preserve">This is some <tspan font-style="italic">messed-up</tspan> markup</text>
-</svg>
--- a/unittests/whitespace-nested.svg
+++ b/unittests/whitespace-nested.svg
@@ -1,4 +0,0 @@
-<?xml version="1.0" encoding="UTF-8" standalone="no"?>
-<svg xmlns="http://www.w3.org/2000/svg">
- <text xml:space="preserve"><tspan font-style="italic">Use <tspan font-style="bold">bold</tspan> text</tspan></text>
-</svg>
--- a/unittests/whitespace.svg
+++ b/unittests/whitespace.svg
@@ -0,0 +1,40 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<svg xmlns="http://www.w3.org/2000/svg">
+ <!-- basic tests -->
+ <text id="txt_a1">text1    text2</text> <!-- multiple spaces -->
+ <text id="txt_a2">text1	text2</text> <!-- tab -->
+ <text id="txt_a3" xml:space="default">text1    text2</text> <!-- multiple spaces -->
+ <text id="txt_a4" xml:space="default">text1	text2</text> <!-- tab -->
+ <text id="txt_a5" xml:space="preserve">text1    text2</text> <!-- multiple spaces -->
+ <text id="txt_a6" xml:space="preserve">text1	text2</text> <!-- tab -->
+
+ <!-- newlines -->
+ <text id="txt_b1">text1
+       text2</text>
+ <text id="txt_b2" xml:space="default">text1
+       text2</text>
+ <text id="txt_b3" xml:space="preserve">text1
+       text2</text>
+
+ <!-- inheritance -->
+ <text id="txt_c1" xml:space="preserve"><tspan>text1    text2</tspan></text>
+ <text id="txt_c2" xml:space="preserve"><tspan xml:space="default">text1    text2</tspan></text>
+ <text id="txt_c3" xml:space="default"><tspan xml:space="preserve">text1    text2</tspan></text>
+ <g xml:space="preserve"><text id="txt_c4">text1    text2</text></g>
+ <g xml:space="preserve"><text id="txt_c5" xml:space="default">text1    text2</text></g>
+ <g xml:space="default"><text id="txt_c6" xml:space="preserve">text1    text2</text></g>
+
+ <!-- important whitespace that must not be stripped -->
+ <text id="txt_d1">text1
+ text2</text>
+ <text id="txt_d2">text1 <tspan>tspan1</tspan> text2</text>
+ <text id="txt_d3">text1 <tspan>tspan1 <tspan>tspan2</tspan> text2</tspan></text>
+ 
+ <!-- whitespace must not be introduced -->
+ <text id="txt_e1">text1
+text2</text>
+ <text id="txt_e2">text1<tspan>tspan</tspan>text2</text>
+ <text id="txt_e3">text1
+<tspan>tspan</tspan>
+text2</text>
+</svg>