Optimize removeDuplicateGradients to avoid O(n²) in the average case

The original implementation of removeDuplicateGradients performs an
O(n²) search over all gradients to remove duplicates.  In images with
many gradients (such as [MediaWiki_logo_1.svg]), this becomes a
significant overhead.

This patch optimizes for the average case by splitting gradients into
smaller lists (called "buckets" in the code).  The splitting is done
by selecting some attributes to generate a key with the following
properties:

 * If multiple gradients have the same key, then a subset of those
   gradients /might/ be duplicates of each other.
 * If their keys are not identical, then they cannot be duplicates.
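The bucketing idea can be sketched as follows.  This is a minimal
standalone model, not the actual scour code: the gradients are plain
dicts, and the names bucket_key and find_duplicates are made up for
illustration (the real key also includes the gradient type and the
stop offsets).

```python
from collections import defaultdict


def bucket_key(grad):
    # Join a few selected attributes into a single string key.
    # Gradients with different keys cannot be duplicates; gradients
    # with the same key merely *might* be.
    attrs = ('gradientUnits', 'x1', 'x2', 'spreadMethod')
    return " ".join(str(grad.get(a, '')) for a in attrs)


def find_duplicates(grads):
    # Phase 1: O(n) pass splitting gradients into buckets by key.
    buckets = defaultdict(list)
    for g in grads:
        buckets[bucket_key(g)].append(g)
    # Phase 2: full pairwise comparison, but only within each bucket.
    # If every gradient lands in its own bucket this is O(n); if they
    # all share one key it degenerates back to O(n²).
    dups = []
    for members in buckets.values():
        for i, g in enumerate(members):
            for og in members[i + 1:]:
                if g == og:
                    dups.append((g, og))
    return dups
```

With many small buckets the pairwise phase touches far fewer pairs
than comparing every gradient against every other, which is where the
average-case win comes from.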

Note that in the worst case we still hit O(n²), and it is easy to
construct SVG files that deliberately trigger the O(n²) runtime.

With that caveat aside, this improves the runtime performance on
[MediaWiki_logo_1.svg] by about 25% (8m51s -> 6m40s on 5 runs).

Original:
  $ time for I in $(seq 1 5) ; do \
      python3 -m scour.scour MediaWiki_logo_1.svg out.svg ; \
    done
  Scour processed file "heavy.svg" in 105042 ms: 1582746/4989544 bytes new/orig -> 31.7%
  Scour processed file "heavy.svg" in 103412 ms: 1582746/4989544 bytes new/orig -> 31.7%
  Scour processed file "heavy.svg" in 105334 ms: 1582746/4989544 bytes new/orig -> 31.7%
  Scour processed file "heavy.svg" in 107902 ms: 1582746/4989544 bytes new/orig -> 31.7%
  Scour processed file "heavy.svg" in 108161 ms: 1582746/4989544 bytes new/orig -> 31.7%

  8m51.855s
  ...

Optimized:
  Scour processed file "heavy.svg" in 78162 ms: 1582746/4989544 bytes new/orig -> 31.7%
  Scour processed file "heavy.svg" in 81202 ms: 1582746/4989544 bytes new/orig -> 31.7%
  Scour processed file "heavy.svg" in 81554 ms: 1582746/4989544 bytes new/orig -> 31.7%
  Scour processed file "heavy.svg" in 80067 ms: 1582746/4989544 bytes new/orig -> 31.7%
  Scour processed file "heavy.svg" in 77267 ms: 1582746/4989544 bytes new/orig -> 31.7%

[MediaWiki_logo_1.svg]: https://upload.wikimedia.org/wikipedia/commons/archive/5/54/20120822053933%21MediaWiki_logo_1.svg

Signed-off-by: Niels Thykier <niels@thykier.net>
Niels Thykier 2018-03-10 08:37:39 +00:00
parent 0776d32179
commit 6663b54428

@@ -57,7 +57,7 @@ import sys
 import time
 import xml.dom.minidom
 from xml.dom import Node, NotFoundErr
-from collections import namedtuple
+from collections import namedtuple, defaultdict
 from decimal import Context, Decimal, InvalidOperation, getcontext
 import six
@@ -1355,18 +1355,50 @@ def collapseSinglyReferencedGradients(doc):
     return num


+def computeGradientBucketKey(gradType, grad):
+    # We use these attributes to split gradients into "small" buckets
+    # and then only look for identical gradients inside those buckets.
+    # Note: besides these, we also group by gradient stops as some
+    # gradients only differ on gradient stops.  For that purpose, we
+    # use the attributes in gradStopBucketsAttr.
+    gradBucketAttr = ['gradientUnits', 'x1', 'x2', 'spreadMethod']
+    gradStopBucketsAttr = ['offset']
+    # A linearGradient can never be a duplicate of a
+    # radialGradient (and vice versa)
+    subKeys = [gradType]
+    subKeys.extend(grad.getAttribute(a) for a in gradBucketAttr)
+    stops = grad.getElementsByTagName('stop')
+    if stops.length:
+        for i in range(stops.length):
+            stop = stops.item(i)
+            for attr in gradStopBucketsAttr:
+                stopKey = stop.getAttribute(attr)
+                subKeys.append(stopKey)
+    return " ".join(subKeys)
+
+
 def removeDuplicateGradients(doc):
     global _num_elements_removed
     num = 0
     gradientsToRemove = {}
     duplicateToMaster = {}
+    gradBuckets = defaultdict(list)
+
+    for gradType in ['linearGradient', 'radialGradient']:
+        grads = doc.getElementsByTagName(gradType)
+        for grad in grads:
+            key = computeGradientBucketKey(gradType, grad)
+            gradBuckets[key].append(grad)

     for gradType in ['linearGradient', 'radialGradient']:
         grads = doc.getElementsByTagName(gradType)
         for grad in grads:
+            key = computeGradientBucketKey(gradType, grad)
             # TODO: should slice grads from 'grad' here to optimize
-            for ograd in grads:
+            for ograd in gradBuckets[key]:
                 # do not compare gradient to itself
                 if grad == ograd:
                     continue