From ace24df5c3245a556581839f7cbbd6fe92b214f4 Mon Sep 17 00:00:00 2001 From: Niels Thykier Date: Mon, 18 May 2020 21:00:55 +0000 Subject: [PATCH 1/7] removeDuplicateGradients: Avoid compiling regex unless we need it Signed-off-by: Niels Thykier --- scour/scour.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scour/scour.py b/scour/scour.py index 5ba03fd..97ec672 100644 --- a/scour/scour.py +++ b/scour/scour.py @@ -1581,11 +1581,11 @@ def removeDuplicateGradients(doc): # for each element that referenced the gradient we are going to replace dup_id with master_id dup_id = dupGrad.getAttribute('id') - funcIRI = re.compile('url\\([\'"]?#' + dup_id + '[\'"]?\\)') # matches url(#a), url('#a') and url("#a") # With --keep-unreferenced-defs, we can end up with # unreferenced gradients. See GH#156. if dup_id in referencedIDs: + funcIRI = re.compile('url\\([\'"]?#' + dup_id + '[\'"]?\\)') # matches url(#a), url('#a') and url("#a") for elem in referencedIDs[dup_id]: # find out which attribute referenced the duplicate gradient for attr in ['fill', 'stroke']: From 9e3a5f2e40c39877bcda9a877bd65dac19aba829 Mon Sep 17 00:00:00 2001 From: Niels Thykier Date: Tue, 19 May 2020 18:02:25 +0000 Subject: [PATCH 2/7] removeDuplicateGradients: Refactor how duplicates are passed around This commit is mostly to enable the following commit to make improvements. It does reduce the number of duplicate getAttribute calls by a tiny bit but it is unlikely to matter in practice. Signed-off-by: Niels Thykier --- scour/scour.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/scour/scour.py b/scour/scour.py index 97ec672..1b9c150 100644 --- a/scour/scour.py +++ b/scour/scour.py @@ -1536,7 +1536,7 @@ def removeDuplicateGradients(doc): global _num_elements_removed num = 0 - gradientsToRemove = {} + gradients_to_remove = [] for gradType in ['linearGradient', 'radialGradient']: grads = doc.getElementsByTagName(gradType) @@ -1553,35 +1553,35 @@ def removeDuplicateGradients(doc): continue master = bucket[0] duplicates = bucket[1:] + duplicates_ids = [d.getAttribute('id') for d in duplicates] master_id = master.getAttribute('id') if not master_id: # If our selected "master" copy does not have an ID, # then replace it with one that does (assuming any of # them has one). This avoids broken images like we # saw in GH#203 - for i in range(len(duplicates)): - dup = duplicates[i] - dup_id = dup.getAttribute('id') + for i in range(len(duplicates_ids)): + dup_id = duplicates_ids[i] if dup_id: + # We do not bother updating the master field + # as it is not used any more. 
+                        master_id = duplicates_ids[i]
                         duplicates[i] = master
-                        master = dup
+                        # Clear the old id to avoid a redundant remapping
+                        duplicates_ids[i] = ""
                         break
 
-        gradientsToRemove[master] = duplicates
+        gradients_to_remove.append((master_id, duplicates_ids, duplicates))
 
     # get a collection of all elements that are referenced and their referencing elements
     referencedIDs = findReferencedElements(doc.documentElement)
-    for masterGrad in gradientsToRemove:
-        master_id = masterGrad.getAttribute('id')
-        for dupGrad in gradientsToRemove[masterGrad]:
+    for master_id, duplicates_ids, duplicates in gradients_to_remove:
+        for dup_id, dupGrad in zip(duplicates_ids, duplicates):
             # if the duplicate gradient no longer has a parent that means it was
             # already re-mapped to another master gradient
             if not dupGrad.parentNode:
                 continue
 
-            # for each element that referenced the gradient we are going to replace dup_id with master_id
-            dup_id = dupGrad.getAttribute('id')
-
             # With --keep-unreferenced-defs, we can end up with
             # unreferenced gradients. See GH#156.
             if dup_id in referencedIDs:

From 36ee0932a4595f5a26e3db720672bd9b45dd47bf Mon Sep 17 00:00:00 2001
From: Niels Thykier
Date: Tue, 19 May 2020 18:04:48 +0000
Subject: [PATCH 3/7] removeDuplicateGradients: Compile at most one regex per
 master gradient

Regex compilation is by far the most expensive part of
removeDuplicateGradients. This commit reduces the pain a bit by trading
"many small regexes" for "a few larger regexes", which avoids some of
the compilation overhead.

Signed-off-by: Niels Thykier
---
 scour/scour.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/scour/scour.py b/scour/scour.py
index 1b9c150..fb8f9d1 100644
--- a/scour/scour.py
+++ b/scour/scour.py
@@ -1576,6 +1576,7 @@ def removeDuplicateGradients(doc):
     # get a collection of all elements that are referenced and their referencing elements
     referencedIDs = findReferencedElements(doc.documentElement)
     for master_id, duplicates_ids, duplicates in gradients_to_remove:
+        funcIRI = None
         for dup_id, dupGrad in zip(duplicates_ids, duplicates):
             # if the duplicate gradient no longer has a parent that means it was
             # already re-mapped to another master gradient
@@ -1585,7 +1586,10 @@ def removeDuplicateGradients(doc):
             # With --keep-unreferenced-defs, we can end up with
             # unreferenced gradients. See GH#156.
             if dup_id in referencedIDs:
-                funcIRI = re.compile('url\\([\'"]?#' + dup_id + '[\'"]?\\)')   # matches url(#a), url('#a') and url("#a")
+                if funcIRI is None:
+                    # matches url(#<id>), url('#<id>') and url("#<id>") for each duplicate id
+                    dup_id_regex = "|".join(duplicates_ids)
+                    funcIRI = re.compile('url\\([\'"]?#(?:' + dup_id_regex + ')[\'"]?\\)')
                 for elem in referencedIDs[dup_id]:
                     # find out which attribute referenced the duplicate gradient
                     for attr in ['fill', 'stroke']:

From a3f761f40c16c038f1a43319a900657c19a1208f Mon Sep 17 00:00:00 2001
From: Niels Thykier
Date: Thu, 21 May 2020 13:44:11 +0000
Subject: [PATCH 4/7] Refactor some code out of removeDuplicateGradients

This commit enables a future optimization (but is not a notable
optimization in itself).
Signed-off-by: Niels Thykier
---
 scour/scour.py | 113 ++++++++++++++++++++++++++++---------------------
 1 file changed, 65 insertions(+), 48 deletions(-)

diff --git a/scour/scour.py b/scour/scour.py
index fb8f9d1..519e06a 100644
--- a/scour/scour.py
+++ b/scour/scour.py
@@ -1532,21 +1532,26 @@ def computeGradientBucketKey(grad):
     return "\x1e".join(subKeys)
 
 
-def removeDuplicateGradients(doc):
-    global _num_elements_removed
-    num = 0
+def detect_duplicate_gradients(*grad_lists):
+    """Detects duplicate gradients in each iterable/generator given as an argument
 
-    gradients_to_remove = []
-
-    for gradType in ['linearGradient', 'radialGradient']:
-        grads = doc.getElementsByTagName(gradType)
-        gradBuckets = defaultdict(list)
+    Yields (master_id, duplicates_ids, duplicates) tuples where:
+      * master_id: The ID attribute of the master element.  This will always be non-empty
+        and not None as long as at least one of the gradients has a valid ID.
+      * duplicates_ids: List of ID attributes of the duplicate gradient elements (an entry
+        can be empty where the gradient had no ID attribute)
+      * duplicates: List of elements that are duplicates of the master element.  Will
+        never include the master element.  Has the same order as `duplicates_ids` - i.e.
+        `duplicates[X].getAttribute("id") == duplicates_ids[X]`.
+    """
+    for grads in grad_lists:
+        grad_buckets = defaultdict(list)
         for grad in grads:
             key = computeGradientBucketKey(grad)
-            gradBuckets[key].append(grad)
+            grad_buckets[key].append(grad)
 
-        for bucket in six.itervalues(gradBuckets):
+        for bucket in six.itervalues(grad_buckets):
             if len(bucket) < 2:
                 # The gradient must be unique if it is the only one in
                 # this bucket.
@@ -1571,47 +1576,59 @@
                         duplicates_ids[i] = ""
                         break
 
-        gradients_to_remove.append((master_id, duplicates_ids, duplicates))
+            yield master_id, duplicates_ids, duplicates
+
+
+def dedup_gradient(master_id, duplicates_ids, duplicates, referenced_ids):
+    func_iri = None
+    for dup_id, dup_grad in zip(duplicates_ids, duplicates):
+        # if the duplicate gradient no longer has a parent that means it was
+        # already re-mapped to another master gradient
+        if not dup_grad.parentNode:
+            continue
+
+        # With --keep-unreferenced-defs, we can end up with
+        # unreferenced gradients. See GH#156.
+        if dup_id in referenced_ids:
+            if func_iri is None:
+                # matches url(#<id>), url('#<id>') and url("#<id>") for each duplicate id
+                dup_id_regex = "|".join(duplicates_ids)
+                func_iri = re.compile('url\\([\'"]?#(?:' + dup_id_regex + ')[\'"]?\\)')
+            for elem in referenced_ids[dup_id]:
+                # find out which attribute referenced the duplicate gradient
+                for attr in ['fill', 'stroke']:
+                    v = elem.getAttribute(attr)
+                    (v_new, n) = func_iri.subn('url(#' + master_id + ')', v)
+                    if n > 0:
+                        elem.setAttribute(attr, v_new)
+                if elem.getAttributeNS(NS['XLINK'], 'href') == '#' + dup_id:
+                    elem.setAttributeNS(NS['XLINK'], 'href', '#' + master_id)
+                styles = _getStyle(elem)
+                for style in styles:
+                    v = styles[style]
+                    (v_new, n) = func_iri.subn('url(#' + master_id + ')', v)
+                    if n > 0:
+                        styles[style] = v_new
+                _setStyle(elem, styles)
+
+        # now that all referencing elements have been re-mapped to the master
+        # it is safe to remove this gradient from the document
+        dup_grad.parentNode.removeChild(dup_grad)
+
+
+def removeDuplicateGradients(doc):
+    global _num_elements_removed
+    num = 0
+
+    linear_gradients = doc.getElementsByTagName('linearGradient')
+    radial_gradients = doc.getElementsByTagName('radialGradient')
 
     # get a collection of all elements that are referenced and their referencing elements
     referencedIDs = findReferencedElements(doc.documentElement)
-    for master_id, duplicates_ids, duplicates in gradients_to_remove:
-        funcIRI = None
-        for dup_id, dupGrad in zip(duplicates_ids, duplicates):
-            # if the duplicate gradient no longer has a parent that means it was
-            # already re-mapped to another master gradient
-            if not dupGrad.parentNode:
-                continue
-
-            # With --keep-unreferenced-defs, we can end up with
-            # unreferenced gradients. See GH#156.
-            if dup_id in referencedIDs:
-                if funcIRI is None:
-                    # matches url(#<id>), url('#<id>') and url("#<id>") for each duplicate id
-                    dup_id_regex = "|".join(duplicates_ids)
-                    funcIRI = re.compile('url\\([\'"]?#(?:' + dup_id_regex + ')[\'"]?\\)')
-                for elem in referencedIDs[dup_id]:
-                    # find out which attribute referenced the duplicate gradient
-                    for attr in ['fill', 'stroke']:
-                        v = elem.getAttribute(attr)
-                        (v_new, n) = funcIRI.subn('url(#' + master_id + ')', v)
-                        if n > 0:
-                            elem.setAttribute(attr, v_new)
-                    if elem.getAttributeNS(NS['XLINK'], 'href') == '#' + dup_id:
-                        elem.setAttributeNS(NS['XLINK'], 'href', '#' + master_id)
-                    styles = _getStyle(elem)
-                    for style in styles:
-                        v = styles[style]
-                        (v_new, n) = funcIRI.subn('url(#' + master_id + ')', v)
-                        if n > 0:
-                            styles[style] = v_new
-                    _setStyle(elem, styles)
-
-            # now that all referencing elements have been re-mapped to the master
-            # it is safe to remove this gradient from the document
-            dupGrad.parentNode.removeChild(dupGrad)
-            _num_elements_removed += 1
-            num += 1
+    for master_id, duplicates_ids, duplicates in detect_duplicate_gradients(linear_gradients, radial_gradients):
+        dedup_gradient(master_id, duplicates_ids, duplicates, referencedIDs)
+        _num_elements_removed += len(duplicates)
+        num += len(duplicates)
 
     return num

From 0e82b8dcad7b9fcc51327122468ae867f5a6eb6d Mon Sep 17 00:00:00 2001
From: Niels Thykier
Date: Thu, 21 May 2020 13:53:06 +0000
Subject: [PATCH 5/7] Refactor removeDuplicateGradients to loop until it
 reaches a fixed point

This commit enables a future optimization (but is not a notable
optimization in itself).
Signed-off-by: Niels Thykier --- scour/scour.py | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/scour/scour.py b/scour/scour.py index 519e06a..14e17ba 100644 --- a/scour/scour.py +++ b/scour/scour.py @@ -1617,18 +1617,21 @@ def dedup_gradient(master_id, duplicates_ids, duplicates, referenced_ids): def removeDuplicateGradients(doc): - global _num_elements_removed + prev_num = -1 num = 0 - linear_gradients = doc.getElementsByTagName('linearGradient') - radial_gradients = doc.getElementsByTagName('radialGradient') + while prev_num != num: + prev_num = num + + linear_gradients = doc.getElementsByTagName('linearGradient') + radial_gradients = doc.getElementsByTagName('radialGradient') + + # get a collection of all elements that are referenced and their referencing elements + referenced_ids = findReferencedElements(doc.documentElement) + for master_id, duplicates_ids, duplicates in detect_duplicate_gradients(linear_gradients, radial_gradients): + dedup_gradient(master_id, duplicates_ids, duplicates, referenced_ids) + num += len(duplicates) - # get a collection of all elements that are referenced and their referencing elements - referencedIDs = findReferencedElements(doc.documentElement) - for master_id, duplicates_ids, duplicates in detect_duplicate_gradients(linear_gradients, radial_gradients): - dedup_gradient(master_id, duplicates_ids, duplicates, referencedIDs) - _num_elements_removed += len(duplicates) - num += len(duplicates) return num @@ -3775,8 +3778,7 @@ def scourString(in_string, options=None): pass # remove duplicate gradients - while removeDuplicateGradients(doc) > 0: - pass + _num_elements_removed += removeDuplicateGradients(doc) if options.group_collapse: _num_elements_removed += mergeSiblingGroupsWithCommonAttributes(doc.documentElement) From 3d29029c721bb6a0ed40c528ed1dc753c4331484 Mon Sep 17 00:00:00 2001 From: Niels Thykier Date: Mon, 18 May 2020 20:26:21 +0000 Subject: [PATCH 6/7] findReferencedElements: Use a set instead of list for tracking nodes Except for one caller, nothing cares what kind of collection is used. By migrating to a set, we can enable a future rewrite. Signed-off-by: Niels Thykier --- scour/scour.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/scour/scour.py b/scour/scour.py index 14e17ba..48f7d92 100644 --- a/scour/scour.py +++ b/scour/scour.py @@ -556,7 +556,7 @@ def findReferencedElements(node, ids=None): Returns IDs of all referenced elements - node is the node at which to start the search. 
     - returns a map which has the id as key and
-      each value is is a list of nodes
+      each value is a set of nodes
 
     Currently looks at 'xlink:href' and all attributes in 'referencingProps'
    """
@@ -586,9 +586,9 @@ def findReferencedElements(node, ids=None):
             # we remove the hash mark from the beginning of the id
             id = href[1:]
             if id in ids:
-                ids[id].append(node)
+                ids[id].add(node)
             else:
-                ids[id] = [node]
+                ids[id] = {node}
 
     # now get all style properties and the fill, stroke, filter attributes
     styles = node.getAttribute('style').split(';')
@@ -619,9 +619,9 @@ def findReferencingProperty(node, prop, val, ids):
     if len(val) >= 7 and val[0:5] == 'url(#':
         id = val[5:val.find(')')]
         if id in ids:
-            ids[id].append(node)
+            ids[id].add(node)
         else:
-            ids[id] = [node]
+            ids[id] = {node}
     # if the url has a quote in it, we need to compensate
     elif len(val) >= 8:
         id = None
@@ -633,9 +633,9 @@ def findReferencingProperty(node, prop, val, ids):
             id = val[6:val.find("')")]
         if id is not None:
             if id in ids:
-                ids[id].append(node)
+                ids[id].add(node)
             else:
-                ids[id] = [node]
+                ids[id] = {node}


def removeUnusedDefs(doc, defElem, elemsToRemove=None, referencedIDs=None):
@@ -1457,7 +1457,7 @@ def collapseSinglyReferencedGradients(doc):
                 elem.namespaceURI == NS['SVG']
             ):
             # found a gradient that is referenced by only 1 other element
-            refElem = nodes[0]
+            refElem = nodes.pop()
             if refElem.nodeType == Node.ELEMENT_NODE and refElem.nodeName in ['linearGradient', 'radialGradient'] \
                     and refElem.namespaceURI == NS['SVG']:
                 # elem is a gradient referenced by only one other gradient (refElem)

From ca2b32c0b38b0d0d224cadf2db12ce9df6339c63 Mon Sep 17 00:00:00 2001
From: Niels Thykier
Date: Thu, 21 May 2020 14:14:25 +0000
Subject: [PATCH 7/7] removeDuplicateGradients: Maintain referenced_ids

This avoids calling `findReferencedElements` more than once per
removeDuplicateGradients. This is good for performance as
`findReferencedElements` is one of the slowest functions in scour.

Signed-off-by: Niels Thykier
---
 scour/scour.py | 24 ++++++++++++++++++++++--
 1 file changed, 22 insertions(+), 2 deletions(-)

diff --git a/scour/scour.py b/scour/scour.py
index 48f7d92..a3e0c82 100644
--- a/scour/scour.py
+++ b/scour/scour.py
@@ -1615,19 +1615,39 @@ def dedup_gradient(master_id, duplicates_ids, duplicates, referenced_ids):
         # it is safe to remove this gradient from the document
         dup_grad.parentNode.removeChild(dup_grad)
 
+    # If the gradients have an ID, we update referenced_ids to match the newly remapped IDs.
+    # This enables us to avoid calling findReferencedElements once per loop, which is helpful as it is
+    # one of the slowest functions in scour.
+    if master_id:
+        try:
+            master_references = referenced_ids[master_id]
+        except KeyError:
+            master_references = set()
+
+        for dup_id in duplicates_ids:
+            references = referenced_ids.pop(dup_id, None)
+            if references is None:
+                continue
+            master_references.update(references)
+
+        # Usually a no-op, but needed if the master gradient did
+        # not have any references originally
+        referenced_ids[master_id] = master_references
+
 
 def removeDuplicateGradients(doc):
     prev_num = -1
     num = 0
 
+    # get a collection of all elements that are referenced and their referencing elements
+    referenced_ids = findReferencedElements(doc.documentElement)
+
     while prev_num != num:
         prev_num = num
 
         linear_gradients = doc.getElementsByTagName('linearGradient')
         radial_gradients = doc.getElementsByTagName('radialGradient')
 
-        # get a collection of all elements that are referenced and their referencing elements
-        referenced_ids = findReferencedElements(doc.documentElement)
         for master_id, duplicates_ids, duplicates in detect_duplicate_gradients(linear_gradients, radial_gradients):
            dedup_gradient(master_id, duplicates_ids, duplicates, referenced_ids)
             num += len(duplicates)
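
For readers following the series, below is a minimal, standalone sketch of the bucket-and-remap
scheme the patches converge on: gradients are grouped by a bucket key, a master is picked per
bucket, and one combined regex per master (patch 3's trade-off) rewrites all functional-IRI
references at once. It operates on plain strings rather than the DOM; `grads`, `refs`, and the
bucket keys are invented stand-ins for scour's internals, not scour APIs.

import re
from collections import defaultdict

# Gradient id -> rendered definition; 'a' and 'b' share a definition,
# so they are duplicates and 'a' becomes the master.
grads = {'a': 'linear|0,0,1,1', 'b': 'linear|0,0,1,1', 'c': 'radial|5,5,2'}
# Attribute values that reference gradients via functional IRIs.
refs = ["fill:url(#a)", "stroke:url('#b')", 'fill:url("#c")']

# Bucket gradients by their definition, standing in for computeGradientBucketKey().
buckets = defaultdict(list)
for gid, key in grads.items():
    buckets[key].append(gid)

for ids in buckets.values():
    if len(ids) < 2:
        continue  # unique gradient, nothing to merge
    master_id, duplicate_ids = ids[0], ids[1:]
    # One compiled regex per master instead of one per duplicate; matches
    # url(#b), url('#b') and url("#b") for every duplicate id at once.
    func_iri = re.compile('url\\([\'"]?#(?:' + '|'.join(map(re.escape, duplicate_ids)) + ')[\'"]?\\)')
    refs = [func_iri.sub('url(#' + master_id + ')', v) for v in refs]

print(refs)  # ['fill:url(#a)', 'stroke:url(#a)', 'fill:url("#c")']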