From 6663b54428250be0e16f3de87a5a8cc000c33ba8 Mon Sep 17 00:00:00 2001
From: Niels Thykier
Date: Sat, 10 Mar 2018 08:37:39 +0000
Subject: [PATCH] =?UTF-8?q?Optimize=20removeDuplicateGradient=20to=20avoid?=
 =?UTF-8?q?=20O(n=C2=B2)=20in=20the=20average=20case?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The original implementation of removeDuplicateGradient does an O(n²)
search over all gradients to remove duplicates.  In images with many
gradients (such as [MediaWiki_logo_1.svg]), this becomes a significant
overhead.

This patch optimizes for the average case by splitting the gradients
into smaller lists (called "buckets" in the code).  The splitting is
done by selecting some attributes to generate a key with the following
properties:

 * If multiple gradients have the same key, then a subset of those
   gradients /might/ be duplicates of each other.
 * If their keys are not identical, then they cannot be duplicates.

Note that in the worst case we will still hit O(n²), and it is easy to
construct SVG files that deliberately trigger the O(n²) runtime.  With
that caveat aside, this improves the runtime performance on
[MediaWiki_logo_1.svg] by about 25% (8m51s -> 6m40s over 5 runs).

Original:

    $ time for I in $(seq 1 5) ; do \
          python3 -m scour.scour MediaWiki_logo_1.svg out.svg ; \
      done
    Scour processed file "heavy.svg" in 105042 ms: 1582746/4989544 bytes new/orig -> 31.7%
    Scour processed file "heavy.svg" in 103412 ms: 1582746/4989544 bytes new/orig -> 31.7%
    Scour processed file "heavy.svg" in 105334 ms: 1582746/4989544 bytes new/orig -> 31.7%
    Scour processed file "heavy.svg" in 107902 ms: 1582746/4989544 bytes new/orig -> 31.7%
    Scour processed file "heavy.svg" in 108161 ms: 1582746/4989544 bytes new/orig -> 31.7%

    8m51.855s
    ...

Optimized:

    Scour processed file "heavy.svg" in 78162 ms: 1582746/4989544 bytes new/orig -> 31.7%
    Scour processed file "heavy.svg" in 81202 ms: 1582746/4989544 bytes new/orig -> 31.7%
    Scour processed file "heavy.svg" in 81554 ms: 1582746/4989544 bytes new/orig -> 31.7%
    Scour processed file "heavy.svg" in 80067 ms: 1582746/4989544 bytes new/orig -> 31.7%
    Scour processed file "heavy.svg" in 77267 ms: 1582746/4989544 bytes new/orig -> 31.7%

[MediaWiki_logo_1.svg]: https://upload.wikimedia.org/wikipedia/commons/archive/5/54/20120822053933%21MediaWiki_logo_1.svg

Signed-off-by: Niels Thykier
---
 scour/scour.py | 36 ++++++++++++++++++++++++++++++++++--
 1 file changed, 34 insertions(+), 2 deletions(-)

diff --git a/scour/scour.py b/scour/scour.py
index 1c65ccd..e4b38af 100644
--- a/scour/scour.py
+++ b/scour/scour.py
@@ -57,7 +57,7 @@ import sys
 import time
 import xml.dom.minidom
 from xml.dom import Node, NotFoundErr
-from collections import namedtuple
+from collections import namedtuple, defaultdict
 from decimal import Context, Decimal, InvalidOperation, getcontext
 
 import six
@@ -1355,18 +1355,50 @@ def collapseSinglyReferencedGradients(doc):
     return num
 
 
+def computeGradientBucketKey(gradType, grad):
+    # We use these attributes to split gradients into "small" buckets
+    # and then only look for identical gradients inside those buckets.
+    # Note: besides these, we also group by gradient stops, as some
+    # gradients only differ in their gradient stops.  For that purpose,
+    # we use the attributes in gradStopBucketsAttr.
+    gradBucketAttr = ['gradientUnits', 'x1', 'x2', 'spreadMethod']
+    gradStopBucketsAttr = ['offset']
+
+    # A linearGradient can never be a duplicate of a
+    # radialGradient (and vice versa)
+    subKeys = [gradType]
+    subKeys.extend(grad.getAttribute(a) for a in gradBucketAttr)
+    stops = grad.getElementsByTagName('stop')
+    if stops.length:
+        for i in range(stops.length):
+            stop = stops.item(i)
+            for attr in gradStopBucketsAttr:
+                stopKey = stop.getAttribute(attr)
+                subKeys.append(stopKey)
+
+    return " ".join(subKeys)
+
+
 def removeDuplicateGradients(doc):
     global _num_elements_removed
     num = 0
 
     gradientsToRemove = {}
     duplicateToMaster = {}
+    gradBuckets = defaultdict(list)
 
     for gradType in ['linearGradient', 'radialGradient']:
         grads = doc.getElementsByTagName(gradType)
         for grad in grads:
+            key = computeGradientBucketKey(gradType, grad)
+            gradBuckets[key].append(grad)
+
+    for gradType in ['linearGradient', 'radialGradient']:
+        grads = doc.getElementsByTagName(gradType)
+        for grad in grads:
+            key = computeGradientBucketKey(gradType, grad)
             # TODO: should slice grads from 'grad' here to optimize
-            for ograd in grads:
+            for ograd in gradBuckets[key]:
                 # do not compare gradient to itself
                 if grad == ograd:
                     continue
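
Note (illustration, not part of the patch): the bucketing idea above can
be sketched in isolation.  The following is a minimal standalone Python
sketch using plain dicts instead of DOM nodes; the names bucket_key,
find_duplicates and BUCKET_ATTRS are hypothetical and do not exist in
scour.  Unlike the patch, it also applies the TODO above and slices each
bucket so every pair is compared only once.

    from collections import defaultdict

    # Attributes mirrored from gradBucketAttr in the patch.
    BUCKET_ATTRS = ['gradientUnits', 'x1', 'x2', 'spreadMethod']

    def bucket_key(grad):
        # Same key  => the gradients *might* be duplicates.
        # Different => they cannot be duplicates.
        sub_keys = [grad['type']]
        sub_keys.extend(grad.get(a, '') for a in BUCKET_ATTRS)
        sub_keys.extend(s.get('offset', '') for s in grad.get('stops', []))
        return ' '.join(sub_keys)

    def find_duplicates(gradients):
        buckets = defaultdict(list)
        for grad in gradients:
            buckets[bucket_key(grad)].append(grad)
        # The O(n²) pairwise comparison now runs only inside each
        # bucket, which stays small in the average case.
        dups = []
        for bucket in buckets.values():
            for i, grad in enumerate(bucket):
                for ograd in bucket[i + 1:]:
                    if grad == ograd:
                        dups.append((grad, ograd))
        return dups

    gradients = [
        {'type': 'linearGradient', 'x1': '0', 'x2': '1',
         'stops': [{'offset': '0'}, {'offset': '1'}]},
        {'type': 'linearGradient', 'x1': '0', 'x2': '1',
         'stops': [{'offset': '0'}, {'offset': '1'}]},  # duplicate of the first
        {'type': 'radialGradient'},  # different key, never compared
    ]
    print(len(find_duplicates(gradients)))  # -> 1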
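
Note (illustration, not part of the patch): the worst case mentioned in
the commit message can be provoked by gradients that agree on every
keyed attribute (gradientUnits, x1, x2, spreadMethod and the stop
offsets) but differ in an attribute the key ignores, such as y1.  All
of them then land in a single bucket and none are duplicates, so the
pairwise comparison degenerates to O(n²) again.  A hypothetical
generator for such an input:

    # Every gradient shares one bucket key (same x1/x2/offsets) but is
    # pairwise distinct (unique y1), defeating the bucket split.
    def adversarial_svg(n):
        grads = '\n'.join(
            '<linearGradient id="g{0}" x1="0" x2="1" y1="{0}">'
            '<stop offset="0"/><stop offset="1"/>'
            '</linearGradient>'.format(i)
            for i in range(n)
        )
        return ('<svg xmlns="http://www.w3.org/2000/svg"><defs>\n'
                + grads + '\n</defs></svg>')

    with open('adversarial.svg', 'w') as f:
        f.write(adversarial_svg(10000))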