Merge pull request #187 from nthykier/fix-gh-186-shorten-id-recycle-used-ids
Enable shortenIDs to recycle existing IDs
This commit is contained in:
commit
4fe2655f86
3 changed files with 141 additions and 46 deletions
161
scour/scour.py
161
scour/scour.py
|
|
@ -690,19 +690,20 @@ def removeUnreferencedElements(doc, keepDefs):
|
||||||
return num
|
return num
|
||||||
|
|
||||||
|
|
||||||
def shortenIDs(doc, prefix, unprotectedElements=None):
|
def shortenIDs(doc, prefix, options):
|
||||||
"""
|
"""
|
||||||
Shortens ID names used in the document. ID names referenced the most often are assigned the
|
Shortens ID names used in the document. ID names referenced the most often are assigned the
|
||||||
shortest ID names.
|
shortest ID names.
|
||||||
If the list unprotectedElements is provided, only IDs from this list will be shortened.
|
|
||||||
|
|
||||||
Returns the number of bytes saved by shortening ID names in the document.
|
Returns the number of bytes saved by shortening ID names in the document.
|
||||||
"""
|
"""
|
||||||
num = 0
|
num = 0
|
||||||
|
|
||||||
identifiedElements = findElementsWithId(doc.documentElement)
|
identifiedElements = findElementsWithId(doc.documentElement)
|
||||||
if unprotectedElements is None:
|
# This map contains maps the (original) ID to the nodes referencing it.
|
||||||
unprotectedElements = identifiedElements
|
# At the end of this function, it will no longer be valid and while we
|
||||||
|
# could keep it up to date, it will complicate the code for no gain
|
||||||
|
# (as we do not reuse the data structure beyond this function).
|
||||||
referencedIDs = findReferencedElements(doc.documentElement)
|
referencedIDs = findReferencedElements(doc.documentElement)
|
||||||
|
|
||||||
# Make idList (list of idnames) sorted by reference count
|
# Make idList (list of idnames) sorted by reference count
|
||||||
|
|
@ -710,31 +711,99 @@ def shortenIDs(doc, prefix, unprotectedElements=None):
|
||||||
# First check that there's actually a defining element for the current ID name.
|
# First check that there's actually a defining element for the current ID name.
|
||||||
# (Cyn: I've seen documents with #id references but no element with that ID!)
|
# (Cyn: I've seen documents with #id references but no element with that ID!)
|
||||||
idList = [(len(referencedIDs[rid]), rid) for rid in referencedIDs
|
idList = [(len(referencedIDs[rid]), rid) for rid in referencedIDs
|
||||||
if rid in unprotectedElements]
|
if rid in identifiedElements]
|
||||||
idList.sort(reverse=True)
|
idList.sort(reverse=True)
|
||||||
idList = [rid for count, rid in idList]
|
idList = [rid for count, rid in idList]
|
||||||
|
|
||||||
# Add unreferenced IDs to end of idList in arbitrary order
|
# Add unreferenced IDs to end of idList in arbitrary order
|
||||||
idList.extend([rid for rid in unprotectedElements if rid not in idList])
|
idList.extend([rid for rid in identifiedElements if rid not in idList])
|
||||||
|
# Ensure we do not reuse a protected ID by accident
|
||||||
|
protectedIDs = protected_ids(identifiedElements, options)
|
||||||
|
# IDs that have been allocated and should not be remapped.
|
||||||
|
consumedIDs = set()
|
||||||
|
|
||||||
|
# List of IDs that need to be assigned a new ID. The list is ordered
|
||||||
|
# such that earlier entries will be assigned a shorter ID than those
|
||||||
|
# later in the list. IDs in this list *can* obtain an ID that is
|
||||||
|
# longer than they already are.
|
||||||
|
need_new_id = []
|
||||||
|
|
||||||
|
id_allocations = list(compute_id_lengths(len(idList) + 1))
|
||||||
|
# Reverse so we can use it as a stack and still work from "shortest to
|
||||||
|
# longest" ID.
|
||||||
|
id_allocations.reverse()
|
||||||
|
|
||||||
|
# Here we loop over all current IDs (that we /might/ want to remap)
|
||||||
|
# and group them into two. 1) The IDs that already have a perfect
|
||||||
|
# length (these are added to consumedIDs) and 2) the IDs that need
|
||||||
|
# to change length (these are appended to need_new_id).
|
||||||
|
optimal_id_length, id_use_limit = 0, 0
|
||||||
|
for current_id in idList:
|
||||||
|
# If we are out of IDs of the current length, then move on
|
||||||
|
# to the next length
|
||||||
|
if id_use_limit < 1:
|
||||||
|
optimal_id_length, id_use_limit = id_allocations.pop()
|
||||||
|
# Reserve an ID from this length
|
||||||
|
id_use_limit -= 1
|
||||||
|
# We check for strictly equal to optimal length because our ID
|
||||||
|
# remapping may have to assign one node a longer ID because
|
||||||
|
# another node needs a shorter ID.
|
||||||
|
if len(current_id) == optimal_id_length:
|
||||||
|
# This rid is already of optimal length - lets just keep it.
|
||||||
|
consumedIDs.add(current_id)
|
||||||
|
else:
|
||||||
|
# Needs a new (possibly longer) ID.
|
||||||
|
need_new_id.append(current_id)
|
||||||
|
|
||||||
curIdNum = 1
|
curIdNum = 1
|
||||||
|
|
||||||
for rid in idList:
|
for old_id in need_new_id:
|
||||||
curId = intToID(curIdNum, prefix)
|
new_id = intToID(curIdNum, prefix)
|
||||||
# First make sure that *this* element isn't already using
|
|
||||||
# the ID name we want to give it.
|
# Skip ahead if the new ID has already been used or is protected.
|
||||||
if curId != rid:
|
while new_id in protectedIDs or new_id in consumedIDs:
|
||||||
# Then, skip ahead if the new ID is already in identifiedElement.
|
|
||||||
while curId in identifiedElements:
|
|
||||||
curIdNum += 1
|
curIdNum += 1
|
||||||
curId = intToID(curIdNum, prefix)
|
new_id = intToID(curIdNum, prefix)
|
||||||
# Then go rename it.
|
|
||||||
num += renameID(doc, rid, curId, identifiedElements, referencedIDs)
|
# Now that we have found the first available ID, do the remap.
|
||||||
|
num += renameID(old_id, new_id, identifiedElements, referencedIDs.get(old_id))
|
||||||
curIdNum += 1
|
curIdNum += 1
|
||||||
|
|
||||||
return num
|
return num
|
||||||
|
|
||||||
|
|
||||||
|
def compute_id_lengths(highest):
|
||||||
|
"""Compute how many IDs are available of a given size
|
||||||
|
|
||||||
|
Example:
|
||||||
|
>>> lengths = list(compute_id_lengths(512))
|
||||||
|
>>> lengths
|
||||||
|
[(1, 26), (2, 676)]
|
||||||
|
>>> total_limit = sum(x[1] for x in lengths)
|
||||||
|
>>> total_limit
|
||||||
|
702
|
||||||
|
>>> intToID(total_limit, '')
|
||||||
|
'zz'
|
||||||
|
|
||||||
|
Which tells us that we got 26 IDs of length 1 and up to 676 IDs of length two
|
||||||
|
if we need to allocate 512 IDs.
|
||||||
|
|
||||||
|
:param highest: Highest ID that need to be allocated
|
||||||
|
:return: An iterator that returns tuples of (id-length, use-limit). The
|
||||||
|
use-limit applies only to the given id-length (i.e. it is excluding IDs
|
||||||
|
of shorter length). Note that the sum of the use-limit values is always
|
||||||
|
equal to or greater than the highest param.
|
||||||
|
"""
|
||||||
|
step = 26
|
||||||
|
id_length = 0
|
||||||
|
use_limit = 1
|
||||||
|
while highest:
|
||||||
|
id_length += 1
|
||||||
|
use_limit *= step
|
||||||
|
yield (id_length, use_limit)
|
||||||
|
highest = int((highest - 1) / step)
|
||||||
|
|
||||||
|
|
||||||
def intToID(idnum, prefix):
|
def intToID(idnum, prefix):
|
||||||
"""
|
"""
|
||||||
Returns the ID name for the given ID number, spreadsheet-style, i.e. from a to z,
|
Returns the ID name for the given ID number, spreadsheet-style, i.e. from a to z,
|
||||||
|
|
@ -750,14 +819,12 @@ def intToID(idnum, prefix):
|
||||||
return prefix + rid
|
return prefix + rid
|
||||||
|
|
||||||
|
|
||||||
def renameID(doc, idFrom, idTo, identifiedElements, referencedIDs):
|
def renameID(idFrom, idTo, identifiedElements, referringNodes):
|
||||||
"""
|
"""
|
||||||
Changes the ID name from idFrom to idTo, on the declaring element
|
Changes the ID name from idFrom to idTo, on the declaring element
|
||||||
as well as all references in the document doc.
|
as well as all nodes in referringNodes.
|
||||||
|
|
||||||
Updates identifiedElements and referencedIDs.
|
Updates identifiedElements.
|
||||||
Does not handle the case where idTo is already the ID name
|
|
||||||
of another element in doc.
|
|
||||||
|
|
||||||
Returns the number of bytes saved by this replacement.
|
Returns the number of bytes saved by this replacement.
|
||||||
"""
|
"""
|
||||||
|
|
@ -771,7 +838,6 @@ def renameID(doc, idFrom, idTo, identifiedElements, referencedIDs):
|
||||||
num += len(idFrom) - len(idTo)
|
num += len(idFrom) - len(idTo)
|
||||||
|
|
||||||
# Update references to renamed node
|
# Update references to renamed node
|
||||||
referringNodes = referencedIDs.get(idFrom)
|
|
||||||
if referringNodes is not None:
|
if referringNodes is not None:
|
||||||
|
|
||||||
# Look for the idFrom ID name in each of the referencing elements,
|
# Look for the idFrom ID name in each of the referencing elements,
|
||||||
|
|
@ -822,34 +888,39 @@ def renameID(doc, idFrom, idTo, identifiedElements, referencedIDs):
|
||||||
node.setAttribute(attr, newValue)
|
node.setAttribute(attr, newValue)
|
||||||
num += len(oldValue) - len(newValue)
|
num += len(oldValue) - len(newValue)
|
||||||
|
|
||||||
del referencedIDs[idFrom]
|
|
||||||
referencedIDs[idTo] = referringNodes
|
|
||||||
|
|
||||||
return num
|
return num
|
||||||
|
|
||||||
|
|
||||||
|
def protected_ids(seenIDs, options):
|
||||||
|
"""Return a list of protected IDs out of the seenIDs"""
|
||||||
|
protectedIDs = []
|
||||||
|
if options.protect_ids_prefix or options.protect_ids_noninkscape or options.protect_ids_list:
|
||||||
|
protect_ids_prefixes = []
|
||||||
|
protect_ids_list = []
|
||||||
|
if options.protect_ids_list:
|
||||||
|
protect_ids_list = options.protect_ids_list.split(",")
|
||||||
|
if options.protect_ids_prefix:
|
||||||
|
protect_ids_prefixes = options.protect_ids_prefix.split(",")
|
||||||
|
for id in seenIDs:
|
||||||
|
protected = False
|
||||||
|
if options.protect_ids_noninkscape and not id[-1].isdigit():
|
||||||
|
protected = True
|
||||||
|
elif protect_ids_list and id in protect_ids_list:
|
||||||
|
protected = True
|
||||||
|
elif protect_ids_prefixes:
|
||||||
|
if any(id.startswith(prefix) for prefix in protect_ids_prefixes):
|
||||||
|
protected = True
|
||||||
|
if protected:
|
||||||
|
protectedIDs.append(id)
|
||||||
|
return protectedIDs
|
||||||
|
|
||||||
|
|
||||||
def unprotected_ids(doc, options):
|
def unprotected_ids(doc, options):
|
||||||
u"""Returns a list of unprotected IDs within the document doc."""
|
u"""Returns a list of unprotected IDs within the document doc."""
|
||||||
identifiedElements = findElementsWithId(doc.documentElement)
|
identifiedElements = findElementsWithId(doc.documentElement)
|
||||||
if not (options.protect_ids_noninkscape or
|
protectedIDs = protected_ids(identifiedElements, options)
|
||||||
options.protect_ids_list or
|
if protectedIDs:
|
||||||
options.protect_ids_prefix):
|
for id in protectedIDs:
|
||||||
return identifiedElements
|
|
||||||
if options.protect_ids_list:
|
|
||||||
protect_ids_list = options.protect_ids_list.split(",")
|
|
||||||
if options.protect_ids_prefix:
|
|
||||||
protect_ids_prefixes = options.protect_ids_prefix.split(",")
|
|
||||||
for id in list(identifiedElements):
|
|
||||||
protected = False
|
|
||||||
if options.protect_ids_noninkscape and not id[-1].isdigit():
|
|
||||||
protected = True
|
|
||||||
if options.protect_ids_list and id in protect_ids_list:
|
|
||||||
protected = True
|
|
||||||
if options.protect_ids_prefix:
|
|
||||||
for prefix in protect_ids_prefixes:
|
|
||||||
if id.startswith(prefix):
|
|
||||||
protected = True
|
|
||||||
if protected:
|
|
||||||
del identifiedElements[id]
|
del identifiedElements[id]
|
||||||
return identifiedElements
|
return identifiedElements
|
||||||
|
|
||||||
|
|
@ -3623,7 +3694,7 @@ def scourString(in_string, options=None):
|
||||||
|
|
||||||
# shorten ID names as much as possible
|
# shorten ID names as much as possible
|
||||||
if options.shorten_ids:
|
if options.shorten_ids:
|
||||||
_num_bytes_saved_in_ids += shortenIDs(doc, options.shorten_ids_prefix, unprotected_ids(doc, options))
|
_num_bytes_saved_in_ids += shortenIDs(doc, options.shorten_ids_prefix, options)
|
||||||
|
|
||||||
# scour lengths (including coordinates)
|
# scour lengths (including coordinates)
|
||||||
for type in ['svg', 'image', 'rect', 'circle', 'ellipse', 'line',
|
for type in ['svg', 'image', 'rect', 'circle', 'ellipse', 'line',
|
||||||
|
|
|
||||||
13
testscour.py
13
testscour.py
|
|
@ -2045,6 +2045,19 @@ class ShortenIDsOption(unittest.TestCase):
|
||||||
'Did not update reference to shortened ID')
|
'Did not update reference to shortened ID')
|
||||||
|
|
||||||
|
|
||||||
|
class ShortenIDsStableOutput(unittest.TestCase):
|
||||||
|
|
||||||
|
def runTest(self):
|
||||||
|
doc = scourXmlFile('unittests/shorten-ids-stable-output.svg',
|
||||||
|
parse_args(['--shorten-ids']))
|
||||||
|
use_tags = doc.getElementsByTagName('use')
|
||||||
|
hrefs_ordered = [x.getAttributeNS('http://www.w3.org/1999/xlink', 'href')
|
||||||
|
for x in use_tags]
|
||||||
|
expected = ['#a', '#b', '#b']
|
||||||
|
self.assertEquals(hrefs_ordered, expected,
|
||||||
|
'--shorten-ids pointlessly reassigned ids')
|
||||||
|
|
||||||
|
|
||||||
class MustKeepGInSwitch(unittest.TestCase):
|
class MustKeepGInSwitch(unittest.TestCase):
|
||||||
|
|
||||||
def runTest(self):
|
def runTest(self):
|
||||||
|
|
|
||||||
11
unittests/shorten-ids-stable-output.svg
Normal file
11
unittests/shorten-ids-stable-output.svg
Normal file
|
|
@ -0,0 +1,11 @@
|
||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
|
||||||
|
<defs>
|
||||||
|
<rect id="a" width="80" height="50" fill="red"/>
|
||||||
|
<rect id="long_id" width="80" height="50" fill="blue"/>
|
||||||
|
</defs>
|
||||||
|
<use xlink:href="#a"/>
|
||||||
|
<use xlink:href="#long_id"/>
|
||||||
|
<use xlink:href="#long_id"/>
|
||||||
|
</svg>
|
||||||
|
|
||||||
|
After Width: | Height: | Size: 341 B |
Loading…
Add table
Add a link
Reference in a new issue