From ace24df5c3245a556581839f7cbbd6fe92b214f4 Mon Sep 17 00:00:00 2001 From: Niels Thykier Date: Mon, 18 May 2020 21:00:55 +0000 Subject: [PATCH 1/7] removeDuplicateGradients: Avoid compiling regex unless we need it Signed-off-by: Niels Thykier --- scour/scour.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scour/scour.py b/scour/scour.py index 5ba03fd..97ec672 100644 --- a/scour/scour.py +++ b/scour/scour.py @@ -1581,11 +1581,11 @@ def removeDuplicateGradients(doc): # for each element that referenced the gradient we are going to replace dup_id with master_id dup_id = dupGrad.getAttribute('id') - funcIRI = re.compile('url\\([\'"]?#' + dup_id + '[\'"]?\\)') # matches url(#a), url('#a') and url("#a") # With --keep-unreferenced-defs, we can end up with # unreferenced gradients. See GH#156. if dup_id in referencedIDs: + funcIRI = re.compile('url\\([\'"]?#' + dup_id + '[\'"]?\\)') # matches url(#a), url('#a') and url("#a") for elem in referencedIDs[dup_id]: # find out which attribute referenced the duplicate gradient for attr in ['fill', 'stroke']: From 9e3a5f2e40c39877bcda9a877bd65dac19aba829 Mon Sep 17 00:00:00 2001 From: Niels Thykier Date: Tue, 19 May 2020 18:02:25 +0000 Subject: [PATCH 2/7] removeDuplicateGradients: Refactor how duplicates are passed around This commit is mostly to enable the following commit to make improvements. It does reduce the number of duplicate getAttribute calls by a tiny bit but it is unlikely to matter in practice. Signed-off-by: Niels Thykier --- scour/scour.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/scour/scour.py b/scour/scour.py index 97ec672..1b9c150 100644 --- a/scour/scour.py +++ b/scour/scour.py @@ -1536,7 +1536,7 @@ def removeDuplicateGradients(doc): global _num_elements_removed num = 0 - gradientsToRemove = {} + gradients_to_remove = [] for gradType in ['linearGradient', 'radialGradient']: grads = doc.getElementsByTagName(gradType) @@ -1553,35 +1553,35 @@ def removeDuplicateGradients(doc): continue master = bucket[0] duplicates = bucket[1:] + duplicates_ids = [d.getAttribute('id') for d in duplicates] master_id = master.getAttribute('id') if not master_id: # If our selected "master" copy does not have an ID, # then replace it with one that does (assuming any of # them has one). This avoids broken images like we # saw in GH#203 - for i in range(len(duplicates)): - dup = duplicates[i] - dup_id = dup.getAttribute('id') + for i in range(len(duplicates_ids)): + dup_id = duplicates_ids[i] if dup_id: + # We do not bother updating the master field + # as it is not used any more. 
+                        master_id = duplicates_ids[i]
                         duplicates[i] = master
-                        master = dup
+                        # Clear the old id to avoid a redundant remapping
+                        duplicates_ids[i] = ""
                         break
 
-        gradientsToRemove[master] = duplicates
+        gradients_to_remove.append((master_id, duplicates_ids, duplicates))
 
     # get a collection of all elements that are referenced and their referencing elements
     referencedIDs = findReferencedElements(doc.documentElement)
-    for masterGrad in gradientsToRemove:
-        master_id = masterGrad.getAttribute('id')
-        for dupGrad in gradientsToRemove[masterGrad]:
+    for master_id, duplicates_ids, duplicates in gradients_to_remove:
+        for dup_id, dupGrad in zip(duplicates_ids, duplicates):
             # if the duplicate gradient no longer has a parent that means it was
             # already re-mapped to another master gradient
             if not dupGrad.parentNode:
                 continue
 
-            # for each element that referenced the gradient we are going to replace dup_id with master_id
-            dup_id = dupGrad.getAttribute('id')
-
             # With --keep-unreferenced-defs, we can end up with
             # unreferenced gradients. See GH#156.
             if dup_id in referencedIDs:

From 36ee0932a4595f5a26e3db720672bd9b45dd47bf Mon Sep 17 00:00:00 2001
From: Niels Thykier
Date: Tue, 19 May 2020 18:04:48 +0000
Subject: [PATCH 3/7] removeDuplicateGradients: Compile at most one regex per
 master gradient

Regex compilation is by far the most expensive part of
removeDuplicateGradients. This commit reduces the pain a bit by trading
"many small regexes" for "a few larger regexes", which avoids some of
the compilation overhead.

Signed-off-by: Niels Thykier
---
 scour/scour.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/scour/scour.py b/scour/scour.py
index 1b9c150..fb8f9d1 100644
--- a/scour/scour.py
+++ b/scour/scour.py
@@ -1576,6 +1576,7 @@ def removeDuplicateGradients(doc):
     # get a collection of all elements that are referenced and their referencing elements
     referencedIDs = findReferencedElements(doc.documentElement)
     for master_id, duplicates_ids, duplicates in gradients_to_remove:
+        funcIRI = None
         for dup_id, dupGrad in zip(duplicates_ids, duplicates):
             # if the duplicate gradient no longer has a parent that means it was
             # already re-mapped to another master gradient
@@ -1585,7 +1586,10 @@ def removeDuplicateGradients(doc):
             # With --keep-unreferenced-defs, we can end up with
             # unreferenced gradients. See GH#156.
             if dup_id in referencedIDs:
-                funcIRI = re.compile('url\\([\'"]?#' + dup_id + '[\'"]?\\)')   # matches url(#a), url('#a') and url("#a")
+                if funcIRI is None:
+                    # matches url(#<id>), url('#<id>') and url("#<id>") for each duplicate id
+                    dup_id_regex = "|".join(duplicates_ids)
+                    funcIRI = re.compile('url\\([\'"]?#(?:' + dup_id_regex + ')[\'"]?\\)')
                 for elem in referencedIDs[dup_id]:
                     # find out which attribute referenced the duplicate gradient
                     for attr in ['fill', 'stroke']:

From a3f761f40c16c038f1a43319a900657c19a1208f Mon Sep 17 00:00:00 2001
From: Niels Thykier
Date: Thu, 21 May 2020 13:44:11 +0000
Subject: [PATCH 4/7] Refactor some code out of removeDuplicateGradients

This commit enables a future optimization (but is not a notable
optimization in itself).
Signed-off-by: Niels Thykier
---
 scour/scour.py | 113 ++++++++++++++++++++++++++++---------------------
 1 file changed, 65 insertions(+), 48 deletions(-)

diff --git a/scour/scour.py b/scour/scour.py
index fb8f9d1..519e06a 100644
--- a/scour/scour.py
+++ b/scour/scour.py
@@ -1532,21 +1532,26 @@ def computeGradientBucketKey(grad):
     return "\x1e".join(subKeys)
 
 
-def removeDuplicateGradients(doc):
-    global _num_elements_removed
-    num = 0
+def detect_duplicate_gradients(*grad_lists):
+    """Detects duplicate gradients in each iterable/generator given as an argument
 
-    gradients_to_remove = []
-
-    for gradType in ['linearGradient', 'radialGradient']:
-        grads = doc.getElementsByTagName(gradType)
-        gradBuckets = defaultdict(list)
+    Yields (master_id, duplicates_ids, duplicates) tuples where:
+      * master_id: The ID attribute of the master element.  This will always be non-empty
+        and not None as long as at least one of the gradients has a valid ID.
+      * duplicates_ids: List of ID attributes of the duplicate gradient elements (an entry
+        can be empty where the gradient had no ID attribute)
+      * duplicates: List of elements that are duplicates of the master element.  Will
+        never include the master element.  Has the same order as `duplicates_ids` - i.e.
+        `duplicates[X].getAttribute("id") == duplicates_ids[X]`.
+    """
+    for grads in grad_lists:
+        grad_buckets = defaultdict(list)
         for grad in grads:
             key = computeGradientBucketKey(grad)
-            gradBuckets[key].append(grad)
+            grad_buckets[key].append(grad)
 
-        for bucket in six.itervalues(gradBuckets):
+        for bucket in six.itervalues(grad_buckets):
             if len(bucket) < 2:
                 # The gradient must be unique if it is the only one in
                 # this bucket.
@@ -1571,47 +1576,59 @@
                         duplicates_ids[i] = ""
                         break
 
-        gradients_to_remove.append((master_id, duplicates_ids, duplicates))
+            yield master_id, duplicates_ids, duplicates
+
+
+def dedup_gradient(master_id, duplicates_ids, duplicates, referenced_ids):
+    func_iri = None
+    for dup_id, dup_grad in zip(duplicates_ids, duplicates):
+        # if the duplicate gradient no longer has a parent that means it was
+        # already re-mapped to another master gradient
+        if not dup_grad.parentNode:
+            continue
+
+        # With --keep-unreferenced-defs, we can end up with
+        # unreferenced gradients. See GH#156.
+        if dup_id in referenced_ids:
+            if func_iri is None:
+                # matches url(#<id>), url('#<id>') and url("#<id>") for each duplicate id
+                dup_id_regex = "|".join(duplicates_ids)
+                func_iri = re.compile('url\\([\'"]?#(?:' + dup_id_regex + ')[\'"]?\\)')
+            for elem in referenced_ids[dup_id]:
+                # find out which attribute referenced the duplicate gradient
+                for attr in ['fill', 'stroke']:
+                    v = elem.getAttribute(attr)
+                    (v_new, n) = func_iri.subn('url(#' + master_id + ')', v)
+                    if n > 0:
+                        elem.setAttribute(attr, v_new)
+                if elem.getAttributeNS(NS['XLINK'], 'href') == '#' + dup_id:
+                    elem.setAttributeNS(NS['XLINK'], 'href', '#' + master_id)
+                styles = _getStyle(elem)
+                for style in styles:
+                    v = styles[style]
+                    (v_new, n) = func_iri.subn('url(#' + master_id + ')', v)
+                    if n > 0:
+                        styles[style] = v_new
+                _setStyle(elem, styles)
+
+        # now that all referencing elements have been re-mapped to the master
+        # it is safe to remove this gradient from the document
+        dup_grad.parentNode.removeChild(dup_grad)
+
+
+def removeDuplicateGradients(doc):
+    global _num_elements_removed
+    num = 0
+
+    linear_gradients = doc.getElementsByTagName('linearGradient')
+    radial_gradients = doc.getElementsByTagName('radialGradient')
 
     # get a collection of all elements that are referenced and their referencing elements
     referencedIDs = findReferencedElements(doc.documentElement)
-    for master_id, duplicates_ids, duplicates in gradients_to_remove:
-        funcIRI = None
-        for dup_id, dupGrad in zip(duplicates_ids, duplicates):
-            # if the duplicate gradient no longer has a parent that means it was
-            # already re-mapped to another master gradient
-            if not dupGrad.parentNode:
-                continue
-
-            # With --keep-unreferenced-defs, we can end up with
-            # unreferenced gradients. See GH#156.
-            if dup_id in referencedIDs:
-                if funcIRI is None:
-                    # matches url(#<id>), url('#<id>') and url("#<id>") for each duplicate id
-                    dup_id_regex = "|".join(duplicates_ids)
-                    funcIRI = re.compile('url\\([\'"]?#(?:' + dup_id_regex + ')[\'"]?\\)')
-                for elem in referencedIDs[dup_id]:
-                    # find out which attribute referenced the duplicate gradient
-                    for attr in ['fill', 'stroke']:
-                        v = elem.getAttribute(attr)
-                        (v_new, n) = funcIRI.subn('url(#' + master_id + ')', v)
-                        if n > 0:
-                            elem.setAttribute(attr, v_new)
-                    if elem.getAttributeNS(NS['XLINK'], 'href') == '#' + dup_id:
-                        elem.setAttributeNS(NS['XLINK'], 'href', '#' + master_id)
-                    styles = _getStyle(elem)
-                    for style in styles:
-                        v = styles[style]
-                        (v_new, n) = funcIRI.subn('url(#' + master_id + ')', v)
-                        if n > 0:
-                            styles[style] = v_new
-                    _setStyle(elem, styles)
-
-            # now that all referencing elements have been re-mapped to the master
-            # it is safe to remove this gradient from the document
-            dupGrad.parentNode.removeChild(dupGrad)
-            _num_elements_removed += 1
-            num += 1
+    for master_id, duplicates_ids, duplicates in detect_duplicate_gradients(linear_gradients, radial_gradients):
+        dedup_gradient(master_id, duplicates_ids, duplicates, referencedIDs)
+        _num_elements_removed += len(duplicates)
+        num += len(duplicates)
 
     return num

From 0e82b8dcad7b9fcc51327122468ae867f5a6eb6d Mon Sep 17 00:00:00 2001
From: Niels Thykier
Date: Thu, 21 May 2020 13:53:06 +0000
Subject: [PATCH 5/7] Refactor removeDuplicateGradients to loop until it
 reaches a fixed point

This commit enables a future optimization (but is not a notable
optimization in itself).
Signed-off-by: Niels Thykier --- scour/scour.py | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/scour/scour.py b/scour/scour.py index 519e06a..14e17ba 100644 --- a/scour/scour.py +++ b/scour/scour.py @@ -1617,18 +1617,21 @@ def dedup_gradient(master_id, duplicates_ids, duplicates, referenced_ids): def removeDuplicateGradients(doc): - global _num_elements_removed + prev_num = -1 num = 0 - linear_gradients = doc.getElementsByTagName('linearGradient') - radial_gradients = doc.getElementsByTagName('radialGradient') + while prev_num != num: + prev_num = num + + linear_gradients = doc.getElementsByTagName('linearGradient') + radial_gradients = doc.getElementsByTagName('radialGradient') + + # get a collection of all elements that are referenced and their referencing elements + referenced_ids = findReferencedElements(doc.documentElement) + for master_id, duplicates_ids, duplicates in detect_duplicate_gradients(linear_gradients, radial_gradients): + dedup_gradient(master_id, duplicates_ids, duplicates, referenced_ids) + num += len(duplicates) - # get a collection of all elements that are referenced and their referencing elements - referencedIDs = findReferencedElements(doc.documentElement) - for master_id, duplicates_ids, duplicates in detect_duplicate_gradients(linear_gradients, radial_gradients): - dedup_gradient(master_id, duplicates_ids, duplicates, referencedIDs) - _num_elements_removed += len(duplicates) - num += len(duplicates) return num @@ -3775,8 +3778,7 @@ def scourString(in_string, options=None): pass # remove duplicate gradients - while removeDuplicateGradients(doc) > 0: - pass + _num_elements_removed += removeDuplicateGradients(doc) if options.group_collapse: _num_elements_removed += mergeSiblingGroupsWithCommonAttributes(doc.documentElement) From 3d29029c721bb6a0ed40c528ed1dc753c4331484 Mon Sep 17 00:00:00 2001 From: Niels Thykier Date: Mon, 18 May 2020 20:26:21 +0000 Subject: [PATCH 6/7] findReferencedElements: Use a set instead of list for tracking nodes Except for one caller, nothing cares what kind of collection is used. By migrating to a set, we can enable a future rewrite. Signed-off-by: Niels Thykier --- scour/scour.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/scour/scour.py b/scour/scour.py index 14e17ba..48f7d92 100644 --- a/scour/scour.py +++ b/scour/scour.py @@ -556,7 +556,7 @@ def findReferencedElements(node, ids=None): Returns IDs of all referenced elements - node is the node at which to start the search. 
     - returns a map which has the id as key and
-      each value is is a list of nodes
+      each value is a set of nodes
 
     Currently looks at 'xlink:href' and all attributes in 'referencingProps'
    """
@@ -586,9 +586,9 @@ def findReferencedElements(node, ids=None):
             # we remove the hash mark from the beginning of the id
             id = href[1:]
             if id in ids:
-                ids[id].append(node)
+                ids[id].add(node)
             else:
-                ids[id] = [node]
+                ids[id] = {node}
 
     # now get all style properties and the fill, stroke, filter attributes
     styles = node.getAttribute('style').split(';')
@@ -619,9 +619,9 @@ def findReferencingProperty(node, prop, val, ids):
     if len(val) >= 7 and val[0:5] == 'url(#':
         id = val[5:val.find(')')]
         if id in ids:
-            ids[id].append(node)
+            ids[id].add(node)
         else:
-            ids[id] = [node]
+            ids[id] = {node}
     # if the url has a quote in it, we need to compensate
     elif len(val) >= 8:
         id = None
@@ -633,9 +633,9 @@ def findReferencingProperty(node, prop, val, ids):
             id = val[6:val.find("')")]
         if id is not None:
             if id in ids:
-                ids[id].append(node)
+                ids[id].add(node)
             else:
-                ids[id] = [node]
+                ids[id] = {node}


def removeUnusedDefs(doc, defElem, elemsToRemove=None, referencedIDs=None):
@@ -1457,7 +1457,7 @@ def collapseSinglyReferencedGradients(doc):
                 elem.namespaceURI == NS['SVG']
             ):
             # found a gradient that is referenced by only 1 other element
-            refElem = nodes[0]
+            refElem = nodes.pop()
             if refElem.nodeType == Node.ELEMENT_NODE and refElem.nodeName in ['linearGradient', 'radialGradient'] \
                     and refElem.namespaceURI == NS['SVG']:
                 # elem is a gradient referenced by only one other gradient (refElem)

From ca2b32c0b38b0d0d224cadf2db12ce9df6339c63 Mon Sep 17 00:00:00 2001
From: Niels Thykier
Date: Thu, 21 May 2020 14:14:25 +0000
Subject: [PATCH 7/7] removeDuplicateGradients: Maintain referenced_ids

This avoids calling `findReferencedElements` more than once per
removeDuplicateGradients. This is good for performance as
`findReferencedElements` is one of the slowest functions in scour.

Signed-off-by: Niels Thykier
---
 scour/scour.py | 24 ++++++++++++++++++++++--
 1 file changed, 22 insertions(+), 2 deletions(-)

diff --git a/scour/scour.py b/scour/scour.py
index 48f7d92..a3e0c82 100644
--- a/scour/scour.py
+++ b/scour/scour.py
@@ -1615,19 +1615,39 @@ def dedup_gradient(master_id, duplicates_ids, duplicates, referenced_ids):
         # it is safe to remove this gradient from the document
         dup_grad.parentNode.removeChild(dup_grad)
 
+    # If the gradients have an ID, we update referenced_ids to match the newly remapped IDs.
+    # This enables us to avoid calling findReferencedElements once per loop, which is helpful as it is
+    # one of the slowest functions in scour.
+    if master_id:
+        try:
+            master_references = referenced_ids[master_id]
+        except KeyError:
+            master_references = set()
+
+        for dup_id in duplicates_ids:
+            references = referenced_ids.pop(dup_id, None)
+            if references is None:
+                continue
+            master_references.update(references)
+
+        # Usually a no-op, but needed if the master gradient did
+        # not have any references originally
+        referenced_ids[master_id] = master_references
+
 
 def removeDuplicateGradients(doc):
     prev_num = -1
     num = 0
 
+    # get a collection of all elements that are referenced and their referencing elements
+    referenced_ids = findReferencedElements(doc.documentElement)
+
     while prev_num != num:
         prev_num = num
 
         linear_gradients = doc.getElementsByTagName('linearGradient')
         radial_gradients = doc.getElementsByTagName('radialGradient')
 
-        # get a collection of all elements that are referenced and their referencing elements
-        referenced_ids = findReferencedElements(doc.documentElement)
         for master_id, duplicates_ids, duplicates in detect_duplicate_gradients(linear_gradients, radial_gradients):
            dedup_gradient(master_id, duplicates_ids, duplicates, referenced_ids)
             num += len(duplicates)
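
For readers following the series, below is a minimal, standalone sketch of the bucket-and-remap
scheme the patches converge on: gradients are grouped by a bucket key, a master is picked per
bucket, and one combined regex per master (patch 3's trade-off) rewrites all functional-IRI
references at once. It operates on plain strings rather than the DOM; `grads`, `refs`, and the
bucket keys are invented stand-ins for scour's internals, not scour APIs.

import re
from collections import defaultdict

# Gradient id -> rendered definition; 'a' and 'b' share a definition,
# so they are duplicates and 'a' becomes the master.
grads = {'a': 'linear|0,0,1,1', 'b': 'linear|0,0,1,1', 'c': 'radial|5,5,2'}
# Attribute values that reference gradients via functional IRIs.
refs = ["fill:url(#a)", "stroke:url('#b')", 'fill:url("#c")']

# Bucket gradients by their definition, standing in for computeGradientBucketKey().
buckets = defaultdict(list)
for gid, key in grads.items():
    buckets[key].append(gid)

for ids in buckets.values():
    if len(ids) < 2:
        continue  # unique gradient, nothing to merge
    master_id, duplicate_ids = ids[0], ids[1:]
    # One compiled regex per master instead of one per duplicate; matches
    # url(#b), url('#b') and url("#b") for every duplicate id at once.
    func_iri = re.compile('url\\([\'"]?#(?:' + '|'.join(map(re.escape, duplicate_ids)) + ')[\'"]?\\)')
    refs = [func_iri.sub('url(#' + master_id + ')', v) for v in refs]

print(refs)  # ['fill:url(#a)', 'stroke:url(#a)', 'fill:url("#c")']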