Add hand-rolled XML serialization function to improve XML output. Add --indent option to choose the indentation type (space, tab, none).

This commit is contained in:
JSCHILL1 2009-08-05 22:53:31 -05:00
parent 04487ed1ec
commit bac229dd14
3 changed files with 102 additions and 8 deletions

View file

@ -13,10 +13,12 @@
<header> <header>
<h2><a href="#0.18">Version 0.18</a></h2> <h2><a href="#0.18">Version 0.18</a></h2>
</header> </header>
<p>Aug 3rd, 2009</p> <p>Aug 5th, 2009</p>
<ul> <ul>
<li>Remove attributes of gradients if they contain default values</li> <li>Remove attributes of gradients if they contain default values</li>
<li>Reduce bezier/quadratic (c/q) segments to their shorthand equivalents (s/t)</li> <li>Reduce bezier/quadratic (c/q) segments to their shorthand equivalents (s/t)</li>
<li>Custom XML serialization such that id/xml:id is printed first (Thanks to Richard Hutch for the suggestion)</li>
<li>Added --indent option to specify indentation type (default='space', other options: 'none', 'tab')</li>
</ul> </ul>
</section> </section>

View file

@ -51,13 +51,12 @@
# <rect /> # <rect />
# </g> # </g>
# Suggestion from Richard Hutch:
# * Put id attributes first in the serialization (or make the d attribute last)
# This would require my own serialization of the DOM objects (not impossible)
# Next Up: # Next Up:
# + Remove some attributes that have default values # + Remove some attributes that have default values
# + Convert c/q path segments into shorthand equivalents where possible: # + Convert c/q path segments into shorthand equivalents where possible:
# + custom serialization of SVG that prints out id/xml:id first (suggestion by Richard Hutch)
# + --indent option to specify how indent should work: space, tab, none
# - option to remove metadata
# - parse transform attribute # - parse transform attribute
# - if a <g> has only one element in it, collapse the <g> (ensure transform, etc are carried down) # - if a <g> has only one element in it, collapse the <g> (ensure transform, etc are carried down)
# - remove id if it matches the Inkscape-style of IDs (also provide a switch to disable this) # - remove id if it matches the Inkscape-style of IDs (also provide a switch to disable this)
@ -1876,6 +1875,82 @@ def remapNamespacePrefix(node, oldprefix, newprefix):
for child in node.childNodes : for child in node.childNodes :
remapNamespacePrefix(child, oldprefix, newprefix) remapNamespacePrefix(child, oldprefix, newprefix)
# hand-rolled serialization function that has the following benefits:
# - pretty printing
# - somewhat judicious use of whitespace
# - ensure id attributes are first
def serializeXML(element, options, ind=0):
    """Serialize a DOM element (and its subtree) to an XML string.

    The id and xml:id attributes are always written first, followed by the
    remaining attributes in the order the DOM reports them.  Attribute
    values and text nodes are escaped so the result is well-formed XML even
    when the DOM values contain '&', '<', or both kinds of quote character.

    element -- DOM element node to serialize
    options -- object with an ``indent_type`` attribute: 'tab' or 'space'
               select the indentation unit; any other value (e.g. 'none')
               disables indentation
    ind     -- nesting depth of ``element`` (0 for the outermost element)

    Returns the serialized markup.  Nested elements (ind > 0) end with a
    newline; the outermost element does not.  An id/xml:id attribute whose
    value is the empty string is not written at all.
    """
    # Local import keeps this function self-contained regardless of what the
    # module imports at file level.
    from xml.sax.saxutils import escape, quoteattr

    if options.indent_type == 'tab':
        unit = '\t'
    elif options.indent_type == 'space':
        unit = ' '
    else:
        unit = ''
    pad = unit * ind

    parts = [pad, '<', element.nodeName]

    # always serialize the id or xml:id attributes first
    for name in ('id', 'xml:id'):
        value = element.getAttribute(name)
        if value != '':
            # quoteattr() escapes markup characters and chooses a quote
            # style that is safe for the value (double quotes by default,
            # single quotes when the value contains a double quote).
            parts.append(' ' + name + '=' + quoteattr(value))

    # now serialize the other attributes
    attrList = element.attributes
    for num in range(attrList.length):
        attr = attrList.item(num)
        if attr.nodeName in ('id', 'xml:id'):
            continue
        parts.append(' ' + attr.nodeName + '=' + quoteattr(attr.nodeValue))

    children = element.childNodes
    if children.length > 0:
        parts.append('>')

        # text is trimmed unless the parent is an element in which
        # whitespace might be important
        keepWhitespace = element.nodeName in ["text", "tspan", "textPath",
                                              "tref", "title", "desc",
                                              "textArea"]
        onNewLine = False
        for child in children:
            if child.nodeType == 1:
                # element node: always starts on its own line
                parts.append('\n' + serializeXML(child, options, ind + 1))
                onNewLine = True
            elif child.nodeType == 3:
                # text node: the DOM holds the unescaped characters, so they
                # must be re-escaped on output
                text = child.nodeValue
                if not keepWhitespace:
                    text = text.strip()
                parts.append(escape(text))
            elif child.nodeType == 4:
                # CDATA node: content is emitted verbatim; a literal ']]>'
                # is split across two sections so it cannot end the section
                data = child.nodeValue.replace(']]>', ']]]]><![CDATA[>')
                parts.append('<![CDATA[' + data + ']]>')
            elif child.nodeType == 8:
                # comment node
                parts.append('<!--' + child.nodeValue + '-->')
            # TODO: entities, processing instructions, what else?
            # every other node type is ignored

        # only indent the closing tag when it follows a child element
        if onNewLine:
            parts.append(pad)
        parts.append('</' + element.nodeName + '>')
    else:
        # no children: self-close
        parts.append('/>')

    if ind > 0:
        parts.append('\n')
    return ''.join(parts)
# this is the main method # this is the main method
# input is a string representation of the input XML # input is a string representation of the input XML
# returns a string representation of the output XML # returns a string representation of the output XML
@ -2004,7 +2079,6 @@ def scourString(in_string, options=None):
elem.setAttribute(attr, scourLength(elem.getAttribute(attr))) elem.setAttribute(attr, scourLength(elem.getAttribute(attr)))
# remove default values of attributes # remove default values of attributes
# print doc.documentElement.toxml()
numAttrsRemoved += removeDefaultAttributeValues(doc.documentElement, options) numAttrsRemoved += removeDefaultAttributeValues(doc.documentElement, options)
# convert rasters references to base64-encoded strings # convert rasters references to base64-encoded strings
@ -2018,8 +2092,9 @@ def scourString(in_string, options=None):
# output the document as a pretty string with a single space for indent # output the document as a pretty string with a single space for indent
# NOTE: removed pretty printing because of this problem: # NOTE: removed pretty printing because of this problem:
# http://ronrothman.com/public/leftbraned/xml-dom-minidom-toprettyxml-and-silly-whitespace/ # http://ronrothman.com/public/leftbraned/xml-dom-minidom-toprettyxml-and-silly-whitespace/
# rolled our own serialize function here to save on space, put id first, customize indentation, etc
# out_string = doc.documentElement.toprettyxml(' ') # out_string = doc.documentElement.toprettyxml(' ')
out_string = doc.documentElement.toxml() out_string = serializeXML(doc.documentElement, options)
# now strip out empty lines # now strip out empty lines
lines = [] lines = []
@ -2096,6 +2171,9 @@ _options_parser.add_option("-i",
action="store", dest="infilename", help=optparse.SUPPRESS_HELP) action="store", dest="infilename", help=optparse.SUPPRESS_HELP)
_options_parser.add_option("-o", _options_parser.add_option("-o",
action="store", dest="outfilename", help=optparse.SUPPRESS_HELP) action="store", dest="outfilename", help=optparse.SUPPRESS_HELP)
# --indent selects the indentation unit used when serializing the output.
# Declared as a "choice" option so optparse itself rejects unsupported values
# and reports the list of valid ones.
_options_parser.add_option("--indent",
    action="store", type="choice", dest="indent_type", default="space",
    choices=["none", "space", "tab"],
    help="indentation of the output: none, space, tab (default: %default)")
def maybe_gziped_file(filename, mode="r"): def maybe_gziped_file(filename, mode="r"):
if os.path.splitext(filename)[1].lower() in (".svgz", ".gz"): if os.path.splitext(filename)[1].lower() in (".svgz", ".gz"):
@ -2109,6 +2187,9 @@ def parse_args(args=None):
_options_parser.error("Additional arguments not handled: %r, see --help" % rargs) _options_parser.error("Additional arguments not handled: %r, see --help" % rargs)
if options.digits < 0: if options.digits < 0:
_options_parser.error("Can't have negative significant digits, see --help") _options_parser.error("Can't have negative significant digits, see --help")
if not options.indent_type in ["tab", "space", "none"]:
_options_parser.error("Invalid value for --indent, see --help")
if options.infilename: if options.infilename:
infile = maybe_gziped_file(options.infilename) infile = maybe_gziped_file(options.infilename)
# GZ: could catch a raised IOError here and report # GZ: could catch a raised IOError here and report

View file

@ -820,6 +820,17 @@ class RemoveDefaultGradFYValue(unittest.TestCase):
self.assertEquals( g.getAttribute('fy'), '', self.assertEquals( g.getAttribute('fy'), '',
'fy matching cy not removed') 'fy matching cy not removed')
# Checks that scouring unittests/cdata.svg yields exactly the expected
# document text below: the <script> content stays inside a CDATA section and
# the '&' in it is not entity-escaped.  The comparison is an exact string
# match, so whitespace inside the expected literal is significant.
class CDATAInXml(unittest.TestCase):
def runTest(self):
self.assertEquals( scour.scourString(open('unittests/cdata.svg').read()),
'''<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<svg xmlns="http://www.w3.org/2000/svg">
<script type="application/ecmascript"><![CDATA[
alert('pb&j');
]]></script>
</svg>''',
'Improperly serialized the cdata unit tests')
# TODO: write a test for embedding rasters # TODO: write a test for embedding rasters
# TODO: write a test for --disable-embed-rasters # TODO: write a test for --disable-embed-rasters
# TODO: write tests for --keep-editor-data # TODO: write tests for --keep-editor-data