Add hand-rolled XML serialization function to improve XML output. Add --indent option to choose the indentation mechanism (space, tab, none).

2009-08-05 22:53:31 -05:00 · 2009-08-05 22:53:31 -05:00 · bac229dd14
commit bac229dd14
parent 04487ed1ec
3 changed files with 102 additions and 8 deletions
--- a/release-notes.html
+++ b/release-notes.html
@ -13,10 +13,12 @@
 	<header>
 		<h2><a href="#0.18">Version 0.18</a></h2>
 	</header>
-	<p>Aug 3rd, 2009</p>
+	<p>Aug 5th, 2009</p>
 	<ul>
 		<li>Remove attributes of gradients if they contain default values</li>
 		<li>Reduce bezier/quadratic (c/q) segments to their shorthand equivalents (s/t)</li>
 		<li>Custom XML serialization such that id/xml:id is printed first (Thanks to Richard Hutch for the suggestion)</li>
 		<li>Added --indent option to specify indentation type (default='space', other options: 'none', 'tab')</li>
 	</ul>
 </section>
--- a/scour.py
+++ b/scour.py
@ -51,13 +51,12 @@
 #      <rect />
 #    </g>
 # Suggestion from Richard Hutch:
 #  * Put id attributes first in the serialization (or make the d attribute last)
 #    This would require my own serialization of the DOM objects (not impossible)
 # Next Up:
 # + Remove some attributes that have default values
 # + Convert c/q path segments into shorthand equivalents where possible: 
 # + custom serialization of SVG that prints out id/xml:id first (suggestion by Richard Hutch)
 # + --indent option to specify how indent should work: space, tab, none
 # - option to remove metadata
 # - parse transform attribute
 # - if a <g> has only one element in it, collapse the <g> (ensure transform, etc are carried down)
 # - remove id if it matches the Inkscape-style of IDs (also provide a switch to disable this)
@ -1876,6 +1875,82 @@ def remapNamespacePrefix(node, oldprefix, newprefix):
 	for child in node.childNodes :
 		remapNamespacePrefix(child, oldprefix, newprefix)	
 # hand-rolled serialization function that has the following benefits:
 # - pretty printing
 # - somewhat judicious use of whitespace
 # - ensure id attributes are first
 def serializeXML(element, options, ind = 0):
 	"""Serialize a DOM element and its subtree into an XML string.
 	The id and xml:id attributes are written before all other attributes.
 	Attribute values and text nodes are XML-escaped, because the DOM hands
 	back entity-decoded values; writing them verbatim would emit malformed
 	XML for any value containing '&' or '<'.
 	element -- DOM element node to serialize
 	options -- object whose indent_type attribute is 'tab', 'space' or
 	           anything else (treated as no indentation)
 	ind     -- nesting depth of element; the indent unit is repeated this
 	           many times, and a trailing newline is added when ind > 0
 	Returns the serialized markup as a string.
 	"""
 	# local import keeps this function independent of the module's import list
 	from xml.sax.saxutils import escape, quoteattr
 	I=''
 	if options.indent_type == 'tab': I='\t'
 	elif options.indent_type == 'space': I=' '
 	pieces = [I * ind, '<', element.nodeName]
 	# always serialize the id or xml:id attributes first
 	# (an id that is present but empty is dropped, as before)
 	for idName in ('id', 'xml:id'):
 		idValue = element.getAttribute(idName)
 		if idValue != '':
 			pieces.append(' ' + idName + '=' + quoteattr(idValue))
 	# now serialize the other attributes
 	attrList = element.attributes
 	for num in range(attrList.length) :
 		attr = attrList.item(num)
 		if attr.nodeName in ('id', 'xml:id'): continue
 		# quoteattr uses double quotes unless the value contains one, then
 		# single quotes; a value containing both gets &quot; escapes
 		pieces.append(' ' + attr.nodeName + '=' + quoteattr(attr.nodeValue))
 	# if no children, self-close
 	children = element.childNodes
 	if children.length > 0:
 		pieces.append('>')
 		onNewLine = False
 		for child in children:
 			# element node
 			if child.nodeType == 1:
 				pieces.append('\n' + serializeXML(child, options, ind + 1))
 				onNewLine = True
 			# text node
 			elif child.nodeType == 3:
 				# trim it only in the case of not being a child of an element
 				# where whitespace might be important
 				if element.nodeName in ["text", "tspan", "textPath", "tref", "title", "desc", "textArea"]:
 					pieces.append(escape(child.nodeValue))
 				else:
 					pieces.append(escape(child.nodeValue.strip()))
 			# CDATA node: content is literal, so it must not be escaped
 			elif child.nodeType == 4:
 				pieces.append('<![CDATA[' + child.nodeValue + ']]>')
 			# Comment node
 			elif child.nodeType == 8:
 				pieces.append('<!--' + child.nodeValue + '-->')
 			# TODO: entities, processing instructions, what else?
 			else: # ignore the rest
 				pass
 		# child elements end with a newline, so re-indent before the end tag
 		if onNewLine: pieces.append(I * ind)
 		pieces.append('</' + element.nodeName + '>')
 	else:
 		pieces.append('/>')
 	# nested elements terminate their own line; the root does not
 	if ind > 0: pieces.append('\n')
 	return ''.join(pieces)
 # this is the main method
 # input is a string representation of the input XML
 # returns a string representation of the output XML
@ -2004,7 +2079,6 @@ def scourString(in_string, options=None):
 					elem.setAttribute(attr, scourLength(elem.getAttribute(attr)))
 	# remove default values of attributes
 #	print doc.documentElement.toxml()
 	numAttrsRemoved += removeDefaultAttributeValues(doc.documentElement, options)		
 	# convert rasters references to base64-encoded strings 
@ -2018,8 +2092,9 @@ def scourString(in_string, options=None):
 	# output the document as a pretty-printed string, indented according to options.indent_type
 	# NOTE: removed pretty printing because of this problem:
 	# http://ronrothman.com/public/leftbraned/xml-dom-minidom-toprettyxml-and-silly-whitespace/
 	# rolled our own serialize function here to save on space, put id first, customize indentation, etc
 #	out_string = doc.documentElement.toprettyxml(' ')
-	out_string = doc.documentElement.toxml()
+	out_string = serializeXML(doc.documentElement, options)
 	# now strip out empty lines
 	lines = []
@ -2096,6 +2171,9 @@ _options_parser.add_option("-i",
 	action="store", dest="infilename", help=optparse.SUPPRESS_HELP)
 _options_parser.add_option("-o",
 	action="store", dest="outfilename", help=optparse.SUPPRESS_HELP)
 # --indent is stored as options.indent_type, which serializeXML reads to pick
 # the per-level indent string; parse_args rejects values other than
 # tab, space and none
 _options_parser.add_option("--indent",
 	action="store", type="string", dest="indent_type", default="space",
 	help="indentation of the output: none, space, tab (default: %default)")
 def maybe_gziped_file(filename, mode="r"):
 	if os.path.splitext(filename)[1].lower() in (".svgz", ".gz"):
@ -2109,6 +2187,9 @@ def parse_args(args=None):
 		_options_parser.error("Additional arguments not handled: %r, see --help" % rargs)
 	if options.digits < 0:
 		_options_parser.error("Can't have negative significant digits, see --help")
 	if not options.indent_type in ["tab", "space", "none"]:
 		_options_parser.error("Invalid value for --indent, see --help")
 	if options.infilename:
 		infile = maybe_gziped_file(options.infilename)
 		# GZ: could catch a raised IOError here and report
--- a/testscour.py
+++ b/testscour.py
@ -820,6 +820,17 @@ class RemoveDefaultGradFYValue(unittest.TestCase):
 		self.assertEquals( g.getAttribute('fy'), '',
 			'fy matching cy not removed')
 class CDATAInXml(unittest.TestCase):
 	"""Compare the scoured output of unittests/cdata.svg with the expected
 	serialization, which contains a CDATA section."""
 	def runTest(self):
 		# read the fixture under try/finally so the file handle is always closed
 		svgFile = open('unittests/cdata.svg')
 		try:
 			svgSource = svgFile.read()
 		finally:
 			svgFile.close()
 		self.assertEquals( scour.scourString(svgSource), 
 			'''<?xml version="1.0" encoding="UTF-8" standalone="no"?>
 <svg xmlns="http://www.w3.org/2000/svg">
 <script type="application/ecmascript"><![CDATA[
  	alert('pb&j');
 ]]></script>
 </svg>''',
 			'Improperly serialized the cdata unit tests')
 # TODO: write a test for embedding rasters
 # TODO: write a test for --disable-embed-rasters
 # TODO: write tests for --keep-editor-data