Optimize removeDuplicateGradients to avoid O(n²) in the average case
The original implementation of removeDuplicateGradients performs an
O(n²) search over all gradients to remove duplicates. In images with
many gradients (such as [MediaWiki_logo_1.svg]), this becomes a
significant overhead.
This patch optimizes for the average case by splitting the gradients
into smaller lists (called "buckets" in the code). The split is done
by selecting some attributes and generating a key from them, with the
following properties (a sketch of the pattern follows the list):
* If multiple gradients have the same key, then a subset of those
gradients /might/ be duplicates of each other.
* If their keys are not identical, then they cannot be duplicates.
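Concretely, the pass becomes "group first, then compare within
groups". The sketch below shows the pattern in isolation; it is not
the patch itself, and candidate_duplicate_pairs and key_of are
hypothetical names (the real key function is computeGradientBucketKey
in the diff below):

    from collections import defaultdict

    def candidate_duplicate_pairs(gradients, key_of):
        # Group gradients by key; only gradients that share a key can
        # possibly be duplicates of each other.
        buckets = defaultdict(list)
        for grad in gradients:
            buckets[key_of(grad)].append(grad)
        # The quadratic comparison is confined to each bucket, which
        # stays small in the average case.
        for bucket in buckets.values():
            for i, grad in enumerate(bucket):
                for ograd in bucket[i + 1:]:
                    yield grad, ograd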
Note that in the worst case we still hit O(n²), and it is easy to
construct SVG files that deliberately trigger the O(n²) runtime.
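For example, gradients that agree on all bucketed attributes and stop
offsets but differ only in an attribute outside the key (such as
stop-color) all land in a single bucket. A hypothetical generator for
such an input (not part of the patch):

    # Every gradient gets the same bucket key (same x1/x2 and stop
    # offset) but a unique stop-color, so none are duplicates and all
    # pairs in the single bucket still get compared.
    n = 10000
    grads = "\n".join(
        '<linearGradient id="g{0}" x1="0" x2="1">'
        '<stop offset="0" stop-color="#{0:06x}"/>'
        '</linearGradient>'.format(i)
        for i in range(n)
    )
    with open('worst-case.svg', 'w') as fh:
        fh.write('<svg xmlns="http://www.w3.org/2000/svg"><defs>\n'
                 + grads + '\n</defs></svg>\n')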
With that caveat aside, this improves the runtime performance on
[MediaWiki_logo_1.svg] by about 25% (8m51s -> 6m40s on 5 runs).
Original:
$ time for I in $(seq 1 5) ; do \
python3 -m scour.scour MediaWiki_logo_1.svg out.svg ; \
done
Scour processed file "heavy.svg" in 105042 ms: 1582746/4989544 bytes new/orig -> 31.7%
Scour processed file "heavy.svg" in 103412 ms: 1582746/4989544 bytes new/orig -> 31.7%
Scour processed file "heavy.svg" in 105334 ms: 1582746/4989544 bytes new/orig -> 31.7%
Scour processed file "heavy.svg" in 107902 ms: 1582746/4989544 bytes new/orig -> 31.7%
Scour processed file "heavy.svg" in 108161 ms: 1582746/4989544 bytes new/orig -> 31.7%
8m51.855s
...
Optimized:
Scour processed file "heavy.svg" in 78162 ms: 1582746/4989544 bytes new/orig -> 31.7%
Scour processed file "heavy.svg" in 81202 ms: 1582746/4989544 bytes new/orig -> 31.7%
Scour processed file "heavy.svg" in 81554 ms: 1582746/4989544 bytes new/orig -> 31.7%
Scour processed file "heavy.svg" in 80067 ms: 1582746/4989544 bytes new/orig -> 31.7%
Scour processed file "heavy.svg" in 77267 ms: 1582746/4989544 bytes new/orig -> 31.7%
[MediaWiki_logo_1.svg]: https://upload.wikimedia.org/wikipedia/commons/archive/5/54/20120822053933%21MediaWiki_logo_1.svg
Signed-off-by: Niels Thykier <niels@thykier.net>
parent 0776d32179
commit 6663b54428
1 changed file with 34 additions and 2 deletions
scour/scour.py

@@ -57,7 +57,7 @@ import sys
 import time
 import xml.dom.minidom
 from xml.dom import Node, NotFoundErr
-from collections import namedtuple
+from collections import namedtuple, defaultdict
 from decimal import Context, Decimal, InvalidOperation, getcontext
 
 import six
@@ -1355,18 +1355,50 @@ def collapseSinglyReferencedGradients(doc):
     return num
 
 
+def computeGradientBucketKey(gradType, grad):
+    # We use these attributes to split gradients into "small" buckets
+    # and then only look for identical gradients inside those buckets.
+    # Note: besides these, we also group by gradient stops as some
+    # gradients only differ on their gradient stops.  For that
+    # purpose, we use the attributes in gradStopBucketsAttr.
+    gradBucketAttr = ['gradientUnits', 'x1', 'x2', 'spreadMethod']
+    gradStopBucketsAttr = ['offset']
+
+    # A linearGradient can never be a duplicate of a
+    # radialGradient (and vice versa)
+    subKeys = [gradType]
+    subKeys.extend(grad.getAttribute(a) for a in gradBucketAttr)
+    stops = grad.getElementsByTagName('stop')
+    if stops.length:
+        for i in range(stops.length):
+            stop = stops.item(i)
+            for attr in gradStopBucketsAttr:
+                stopKey = stop.getAttribute(attr)
+                subKeys.append(stopKey)
+
+    return " ".join(subKeys)
+
+
 def removeDuplicateGradients(doc):
     global _num_elements_removed
     num = 0
 
     gradientsToRemove = {}
     duplicateToMaster = {}
+    gradBuckets = defaultdict(list)
 
     for gradType in ['linearGradient', 'radialGradient']:
         grads = doc.getElementsByTagName(gradType)
         for grad in grads:
+            key = computeGradientBucketKey(gradType, grad)
+            gradBuckets[key].append(grad)
+
+    for gradType in ['linearGradient', 'radialGradient']:
+        grads = doc.getElementsByTagName(gradType)
+        for grad in grads:
+            key = computeGradientBucketKey(gradType, grad)
             # TODO: should slice grads from 'grad' here to optimize
-            for ograd in grads:
+            for ograd in gradBuckets[key]:
                 # do not compare gradient to itself
                 if grad == ograd:
                     continue
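For reference, here is a small standalone check of the bucketing
behaviour. This is a sketch, not part of the patch; it assumes
computeGradientBucketKey has been defined as in the diff above:

    import xml.dom.minidom

    doc = xml.dom.minidom.parseString(
        '<svg xmlns="http://www.w3.org/2000/svg">'
        '<linearGradient id="a" x1="0" x2="1">'
        '<stop offset="0" stop-color="red"/></linearGradient>'
        '<linearGradient id="b" x1="0" x2="1">'
        '<stop offset="0" stop-color="blue"/></linearGradient>'
        '<linearGradient id="c" x1="0" x2="0.5"/>'
        '</svg>')
    # "a" and "b" only differ on stop-color, which is not part of the
    # key, so they land in the same bucket (and are then compared and
    # found not to be duplicates); "c" gets a different key.
    for grad in doc.getElementsByTagName('linearGradient'):
        print(grad.getAttribute('id'),
              repr(computeGradientBucketKey('linearGradient', grad)))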