Improve whitespace handling in text content elements

SVG specifies special logic for handling whitespace; see
   https://www.w3.org/TR/SVG/text.html#WhiteSpace
By implementing it, we can even shave off some unneeded bytes here
and there (e.g. consecutive spaces).

Unfortunately, the handling of newlines by renderers is inconsistent:
sometimes they are replaced by a single space, sometimes they
are removed from the output.
As we cannot know the expected behavior, work around this by keeping
newlines inside text content elements intact.

Fixes #160.
This commit is contained in:
Eduard Braun 2018-07-01 20:16:51 +02:00
parent 7d28f5e051
commit e1c2699f07

View file

@ -3341,19 +3341,30 @@ def serializeXML(element, options, indent_depth=0, preserveWhitespace=False):
for child in element.childNodes: for child in element.childNodes:
# element node # element node
if child.nodeType == Node.ELEMENT_NODE: if child.nodeType == Node.ELEMENT_NODE:
if preserveWhitespace: # do not indent inside text content elements as in SVG there's a difference between
# "text1\ntext2" and
# "text1\n text2"
# see https://www.w3.org/TR/SVG/text.html#WhiteSpace
if preserveWhitespace or element.nodeName in ['text', 'tspan', 'tref', 'textPath', 'altGlyph']:
outParts.append(serializeXML(child, options, 0, preserveWhitespace)) outParts.append(serializeXML(child, options, 0, preserveWhitespace))
else: else:
outParts.extend([newline, serializeXML(child, options, indent_depth + 1, preserveWhitespace)]) outParts.extend([newline, serializeXML(child, options, indent_depth + 1, preserveWhitespace)])
onNewLine = True onNewLine = True
# text node # text node
elif child.nodeType == Node.TEXT_NODE: elif child.nodeType == Node.TEXT_NODE:
# trim it only in the case of not being a child of an element text_content = child.nodeValue
# where whitespace might be important if not preserveWhitespace:
if preserveWhitespace: # strip / consolidate whitespace according to spec, see
outParts.append(makeWellFormed(child.nodeValue)) # https://www.w3.org/TR/SVG/text.html#WhiteSpace
# As a workaround for inconsistent handling of renderers keep newlines if they were in the original
if element.nodeName in ['text', 'tspan', 'tref', 'textPath', 'altGlyph']:
text_content = text_content.replace('\t', ' ')
text_content = text_content.strip(' ')
while '  ' in text_content:
text_content = text_content.replace('  ', ' ')
else: else:
outParts.append(makeWellFormed(child.nodeValue.strip())) text_content = text_content.strip()
outParts.append(makeWellFormed(text_content))
# CDATA node # CDATA node
elif child.nodeType == Node.CDATA_SECTION_NODE: elif child.nodeType == Node.CDATA_SECTION_NODE:
outParts.extend(['<![CDATA[', child.nodeValue, ']]>']) outParts.extend(['<![CDATA[', child.nodeValue, ']]>'])