Open input file in binary mode and let XML parser deal with encoding.

Fixes #26
This commit is contained in:
Eduard Braun 2015-12-08 23:38:06 +01:00
parent 1a6ff29c14
commit 4eade69201
3 changed files with 28 additions and 8 deletions

View file

@ -3097,7 +3097,7 @@ def scourString(in_string, options=None):
# input is a filename # input is a filename
# returns the minidom doc representation of the SVG # returns the minidom doc representation of the SVG
def scourXmlFile(filename, options=None): def scourXmlFile(filename, options=None):
with open(filename) as f: with open(filename, "rb") as f:
in_string = f.read() in_string = f.read()
out_string = scourString(in_string, options) out_string = scourString(in_string, options)
return xml.dom.minidom.parseString(out_string.encode('utf-8')) return xml.dom.minidom.parseString(out_string.encode('utf-8'))
@ -3235,7 +3235,7 @@ def parse_args(args=None, ignore_additional_args=False):
_options_parser.error("Input filename is the same as output filename") _options_parser.error("Input filename is the same as output filename")
if options.infilename: if options.infilename:
infile = maybe_gziped_file(options.infilename) infile = maybe_gziped_file(options.infilename, "rb")
# GZ: could catch a raised IOError here and report # GZ: could catch a raised IOError here and report
else: else:
# GZ: could sniff for gzip compression here # GZ: could sniff for gzip compression here

View file

@ -604,12 +604,18 @@ class ChangeQuadToShorthandInPath(unittest.TestCase):
self.assertEqual(path.getAttribute('d'), 'm10 100q50-50 100 0t100 0', self.assertEqual(path.getAttribute('d'), 'm10 100q50-50 100 0t100 0',
'Did not change quadratic curves into shorthand curve segments in path') 'Did not change quadratic curves into shorthand curve segments in path')
class HandleNonAsciiUtf8(unittest.TestCase): class HandleUTF8(unittest.TestCase):
def runTest(self): def runTest(self):
doc = scour.scourXmlFile('unittests/utf8.svg') doc = scour.scourXmlFile('unittests/utf8.svg')
text = u'Hello in many languages:\nar: أهلا\nbn: হ্যালো\nel: Χαίρετε\nen: Hello\nhi: नमस्ते\niw: שלום\nja: こんにちは\nkm: ជំរាបសួរ\nml: ഹലോ\nru: Здравствуйте\nur: ہیلو\nzh: 您好'
desc = six.text_type(doc.getElementsByTagNameNS(SVGNS, 'desc')[0].firstChild.wholeText).strip() desc = six.text_type(doc.getElementsByTagNameNS(SVGNS, 'desc')[0].firstChild.wholeText).strip()
self.assertEqual( desc, u'ú', self.assertEqual( desc, text, 'Did not handle international UTF8 characters' )
'Did not handle non-ASCII characters' ) desc = six.text_type(doc.getElementsByTagNameNS(SVGNS, 'desc')[1].firstChild.wholeText).strip()
self.assertEqual( desc, u'“”—…°©®™•½¼¾⅓⅔†‡µ¢£€«»♠♣♥♦¿<EFBFBD>', 'Did not handle common UTF8 characters' )
desc = six.text_type(doc.getElementsByTagNameNS(SVGNS, 'desc')[2].firstChild.wholeText).strip()
self.assertEqual( desc, u':-×÷±∞π∅≤≥≠≈∧∨∩∪∈∀∃∄∑∏←↑→↓↔↕↖↗↘↙↺↻⇒⇔', 'Did not handle mathematical UTF8 characters' )
desc = six.text_type(doc.getElementsByTagNameNS(SVGNS, 'desc')[3].firstChild.wholeText).strip()
self.assertEqual( desc, u'⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁽⁾ⁿⁱ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎', 'Did not handle superscript/subscript UTF8 characters' )
class HandleSciNoInPathData(unittest.TestCase): class HandleSciNoInPathData(unittest.TestCase):
def runTest(self): def runTest(self):

View file

@ -1,5 +1,19 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?> <?xml version="1.0" encoding="UTF-8" standalone="no"?>
<svg xmlns:xlink="http://www.w3.org/1999/xlink" <svg xmlns="http://www.w3.org/2000/svg">
xmlns="http://www.w3.org/2000/svg"> <desc id="hello">Hello in many languages:
<desc>ú</desc> ar: أهلا
bn: হ্যালো
el: Χαίρετε
en: Hello
hi: नमस्ते
iw: שלום
ja: こんにちは
km: ជំរាបសួរ
ml: ഹലോ
ru: Здравствуйте
ur: ہیلو
zh: 您好</desc>
<desc id="common">“”—…°©®™•½¼¾⅓⅔†‡µ¢£€«»♠♣♥♦¿<EFBFBD></desc>
<desc id="math">:-×÷±∞π∅≤≥≠≈∧∨∩∪∈∀∃∄∑∏←↑→↓↔↕↖↗↘↙↺↻⇒⇔</desc>
<desc id="supersub">⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁽⁾ⁿⁱ₀₁₂₃₄₅₆₇₈₉₊₋₌₍₎</desc>
</svg> </svg>

Before

Width:  |  Height:  |  Size: 168 B

After

Width:  |  Height:  |  Size: 731 B

Before After
Before After