Merge pull request #187 from nthykier/fix-gh-186-shorten-id-recycle-used-ids

Enable shortenIDs to recycle existing IDs
This commit is contained in:
Patrick Storz 2020-05-17 16:48:18 +02:00 committed by GitHub
commit 4fe2655f86
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 141 additions and 46 deletions

View file

@ -690,19 +690,20 @@ def removeUnreferencedElements(doc, keepDefs):
return num return num
def shortenIDs(doc, prefix, unprotectedElements=None): def shortenIDs(doc, prefix, options):
""" """
Shortens ID names used in the document. ID names referenced the most often are assigned the Shortens ID names used in the document. ID names referenced the most often are assigned the
shortest ID names. shortest ID names.
If the list unprotectedElements is provided, only IDs from this list will be shortened.
Returns the number of bytes saved by shortening ID names in the document. Returns the number of bytes saved by shortening ID names in the document.
""" """
num = 0 num = 0
identifiedElements = findElementsWithId(doc.documentElement) identifiedElements = findElementsWithId(doc.documentElement)
if unprotectedElements is None: # This map contains maps the (original) ID to the nodes referencing it.
unprotectedElements = identifiedElements # At the end of this function, it will no longer be valid and while we
# could keep it up to date, it will complicate the code for no gain
# (as we do not reuse the data structure beyond this function).
referencedIDs = findReferencedElements(doc.documentElement) referencedIDs = findReferencedElements(doc.documentElement)
# Make idList (list of idnames) sorted by reference count # Make idList (list of idnames) sorted by reference count
@ -710,31 +711,99 @@ def shortenIDs(doc, prefix, unprotectedElements=None):
# First check that there's actually a defining element for the current ID name. # First check that there's actually a defining element for the current ID name.
# (Cyn: I've seen documents with #id references but no element with that ID!) # (Cyn: I've seen documents with #id references but no element with that ID!)
idList = [(len(referencedIDs[rid]), rid) for rid in referencedIDs idList = [(len(referencedIDs[rid]), rid) for rid in referencedIDs
if rid in unprotectedElements] if rid in identifiedElements]
idList.sort(reverse=True) idList.sort(reverse=True)
idList = [rid for count, rid in idList] idList = [rid for count, rid in idList]
# Add unreferenced IDs to end of idList in arbitrary order # Add unreferenced IDs to end of idList in arbitrary order
idList.extend([rid for rid in unprotectedElements if rid not in idList]) idList.extend([rid for rid in identifiedElements if rid not in idList])
# Ensure we do not reuse a protected ID by accident
protectedIDs = protected_ids(identifiedElements, options)
# IDs that have been allocated and should not be remapped.
consumedIDs = set()
# List of IDs that need to be assigned a new ID. The list is ordered
# such that earlier entries will be assigned a shorter ID than those
# later in the list. IDs in this list *can* obtain an ID that is
# longer than they already are.
need_new_id = []
id_allocations = list(compute_id_lengths(len(idList) + 1))
# Reverse so we can use it as a stack and still work from "shortest to
# longest" ID.
id_allocations.reverse()
# Here we loop over all current IDs (that we /might/ want to remap)
# and group them into two. 1) The IDs that already have a perfect
# length (these are added to consumedIDs) and 2) the IDs that need
# to change length (these are appended to need_new_id).
optimal_id_length, id_use_limit = 0, 0
for current_id in idList:
# If we are out of IDs of the current length, then move on
# to the next length
if id_use_limit < 1:
optimal_id_length, id_use_limit = id_allocations.pop()
# Reserve an ID from this length
id_use_limit -= 1
# We check for strictly equal to optimal length because our ID
# remapping may have to assign one node a longer ID because
# another node needs a shorter ID.
if len(current_id) == optimal_id_length:
# This rid is already of optimal length - lets just keep it.
consumedIDs.add(current_id)
else:
# Needs a new (possibly longer) ID.
need_new_id.append(current_id)
curIdNum = 1 curIdNum = 1
for rid in idList: for old_id in need_new_id:
curId = intToID(curIdNum, prefix) new_id = intToID(curIdNum, prefix)
# First make sure that *this* element isn't already using
# the ID name we want to give it. # Skip ahead if the new ID has already been used or is protected.
if curId != rid: while new_id in protectedIDs or new_id in consumedIDs:
# Then, skip ahead if the new ID is already in identifiedElement. curIdNum += 1
while curId in identifiedElements: new_id = intToID(curIdNum, prefix)
curIdNum += 1
curId = intToID(curIdNum, prefix) # Now that we have found the first available ID, do the remap.
# Then go rename it. num += renameID(old_id, new_id, identifiedElements, referencedIDs.get(old_id))
num += renameID(doc, rid, curId, identifiedElements, referencedIDs)
curIdNum += 1 curIdNum += 1
return num return num
def compute_id_lengths(highest):
"""Compute how many IDs are available of a given size
Example:
>>> lengths = list(compute_id_lengths(512))
>>> lengths
[(1, 26), (2, 676)]
>>> total_limit = sum(x[1] for x in lengths)
>>> total_limit
702
>>> intToID(total_limit, '')
'zz'
Which tells us that we got 26 IDs of length 1 and up to 676 IDs of length two
if we need to allocate 512 IDs.
:param highest: Highest ID that need to be allocated
:return: An iterator that returns tuples of (id-length, use-limit). The
use-limit applies only to the given id-length (i.e. it is excluding IDs
of shorter length). Note that the sum of the use-limit values is always
equal to or greater than the highest param.
"""
step = 26
id_length = 0
use_limit = 1
while highest:
id_length += 1
use_limit *= step
yield (id_length, use_limit)
highest = int((highest - 1) / step)
def intToID(idnum, prefix): def intToID(idnum, prefix):
""" """
Returns the ID name for the given ID number, spreadsheet-style, i.e. from a to z, Returns the ID name for the given ID number, spreadsheet-style, i.e. from a to z,
@ -750,14 +819,12 @@ def intToID(idnum, prefix):
return prefix + rid return prefix + rid
def renameID(doc, idFrom, idTo, identifiedElements, referencedIDs): def renameID(idFrom, idTo, identifiedElements, referringNodes):
""" """
Changes the ID name from idFrom to idTo, on the declaring element Changes the ID name from idFrom to idTo, on the declaring element
as well as all references in the document doc. as well as all nodes in referringNodes.
Updates identifiedElements and referencedIDs. Updates identifiedElements.
Does not handle the case where idTo is already the ID name
of another element in doc.
Returns the number of bytes saved by this replacement. Returns the number of bytes saved by this replacement.
""" """
@ -771,7 +838,6 @@ def renameID(doc, idFrom, idTo, identifiedElements, referencedIDs):
num += len(idFrom) - len(idTo) num += len(idFrom) - len(idTo)
# Update references to renamed node # Update references to renamed node
referringNodes = referencedIDs.get(idFrom)
if referringNodes is not None: if referringNodes is not None:
# Look for the idFrom ID name in each of the referencing elements, # Look for the idFrom ID name in each of the referencing elements,
@ -822,34 +888,39 @@ def renameID(doc, idFrom, idTo, identifiedElements, referencedIDs):
node.setAttribute(attr, newValue) node.setAttribute(attr, newValue)
num += len(oldValue) - len(newValue) num += len(oldValue) - len(newValue)
del referencedIDs[idFrom]
referencedIDs[idTo] = referringNodes
return num return num
def protected_ids(seenIDs, options):
"""Return a list of protected IDs out of the seenIDs"""
protectedIDs = []
if options.protect_ids_prefix or options.protect_ids_noninkscape or options.protect_ids_list:
protect_ids_prefixes = []
protect_ids_list = []
if options.protect_ids_list:
protect_ids_list = options.protect_ids_list.split(",")
if options.protect_ids_prefix:
protect_ids_prefixes = options.protect_ids_prefix.split(",")
for id in seenIDs:
protected = False
if options.protect_ids_noninkscape and not id[-1].isdigit():
protected = True
elif protect_ids_list and id in protect_ids_list:
protected = True
elif protect_ids_prefixes:
if any(id.startswith(prefix) for prefix in protect_ids_prefixes):
protected = True
if protected:
protectedIDs.append(id)
return protectedIDs
def unprotected_ids(doc, options): def unprotected_ids(doc, options):
u"""Returns a list of unprotected IDs within the document doc.""" u"""Returns a list of unprotected IDs within the document doc."""
identifiedElements = findElementsWithId(doc.documentElement) identifiedElements = findElementsWithId(doc.documentElement)
if not (options.protect_ids_noninkscape or protectedIDs = protected_ids(identifiedElements, options)
options.protect_ids_list or if protectedIDs:
options.protect_ids_prefix): for id in protectedIDs:
return identifiedElements
if options.protect_ids_list:
protect_ids_list = options.protect_ids_list.split(",")
if options.protect_ids_prefix:
protect_ids_prefixes = options.protect_ids_prefix.split(",")
for id in list(identifiedElements):
protected = False
if options.protect_ids_noninkscape and not id[-1].isdigit():
protected = True
if options.protect_ids_list and id in protect_ids_list:
protected = True
if options.protect_ids_prefix:
for prefix in protect_ids_prefixes:
if id.startswith(prefix):
protected = True
if protected:
del identifiedElements[id] del identifiedElements[id]
return identifiedElements return identifiedElements
@ -3623,7 +3694,7 @@ def scourString(in_string, options=None):
# shorten ID names as much as possible # shorten ID names as much as possible
if options.shorten_ids: if options.shorten_ids:
_num_bytes_saved_in_ids += shortenIDs(doc, options.shorten_ids_prefix, unprotected_ids(doc, options)) _num_bytes_saved_in_ids += shortenIDs(doc, options.shorten_ids_prefix, options)
# scour lengths (including coordinates) # scour lengths (including coordinates)
for type in ['svg', 'image', 'rect', 'circle', 'ellipse', 'line', for type in ['svg', 'image', 'rect', 'circle', 'ellipse', 'line',

View file

@ -2045,6 +2045,19 @@ class ShortenIDsOption(unittest.TestCase):
'Did not update reference to shortened ID') 'Did not update reference to shortened ID')
class ShortenIDsStableOutput(unittest.TestCase):
def runTest(self):
doc = scourXmlFile('unittests/shorten-ids-stable-output.svg',
parse_args(['--shorten-ids']))
use_tags = doc.getElementsByTagName('use')
hrefs_ordered = [x.getAttributeNS('http://www.w3.org/1999/xlink', 'href')
for x in use_tags]
expected = ['#a', '#b', '#b']
self.assertEquals(hrefs_ordered, expected,
'--shorten-ids pointlessly reassigned ids')
class MustKeepGInSwitch(unittest.TestCase): class MustKeepGInSwitch(unittest.TestCase):
def runTest(self): def runTest(self):

View file

@ -0,0 +1,11 @@
<?xml version="1.0" encoding="UTF-8"?>
<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
<defs>
<rect id="a" width="80" height="50" fill="red"/>
<rect id="long_id" width="80" height="50" fill="blue"/>
</defs>
<use xlink:href="#a"/>
<use xlink:href="#long_id"/>
<use xlink:href="#long_id"/>
</svg>

After

Width:  |  Height:  |  Size: 341 B