Update the Lexer to accept expected tokens from the parser
The SVG spec allows paths to omit whitespace in some cases and expects parsers to handle this gracefully. In particular, parsers must greedily match as much of the required token as possible but stop as soon as the token no longer matches. The latter bit is where the SVG standard gets interesting. An elliptical arc command (i.e. A or a) will accept, among others: ..., number, flag, flag, number, ... where flag is defined as "0" or "1" (exactly one character). Given those tokens and the following input: "1 010", the spec requires scour to parse that as: "1" (number), "0" (flag), "1" (flag) and "0" (number). It might be tempting to just include "flag" in the default tokenization. Unfortunately this falls apart pretty quickly if you want to follow the spec. E.g. if you have "100" as input and the lexer has no hint about the next token, then it can parse it as: * Three flags * Two flags and a one-digit number (in that order) * A flag and a two-digit number (in that order) * A three-digit number Therefore, to support this, the SVGPathParser must provide the Lexer with a hint about what it is expecting in some cases. This turns out to be trivially possible by exploiting the fact that "lex" is a generator function and can trivially be converted to a "coroutine" (by replacing "next(x)" with "x.send(value)"). Signed-off-by: Niels Thykier <niels@thykier.net>
This commit is contained in:
parent
6ea126d290
commit
d598c2db5c
1 changed files with 62 additions and 31 deletions
|
|
@ -45,8 +45,6 @@ from __future__ import absolute_import
|
||||||
|
|
||||||
import re
|
import re
|
||||||
from decimal import Decimal, getcontext
|
from decimal import Decimal, getcontext
|
||||||
from functools import partial
|
|
||||||
|
|
||||||
|
|
||||||
# Sentinel.
|
# Sentinel.
|
||||||
|
|
||||||
|
|
@ -59,10 +57,18 @@ class _EOF(object):
|
||||||
|
|
||||||
EOF = _EOF()
|
EOF = _EOF()
|
||||||
|
|
||||||
|
# default tokens
|
||||||
|
# (name, default-token, regex pattern)
|
||||||
lexicon = [
|
lexicon = [
|
||||||
('float', r'[-+]?(?:(?:[0-9]*\.[0-9]+)|(?:[0-9]+\.?))(?:[Ee][-+]?[0-9]+)?'),
|
('float', True, r'[-+]?(?:(?:[0-9]*\.[0-9]+)|(?:[0-9]+\.?))(?:[Ee][-+]?[0-9]+)?'),
|
||||||
('int', r'[-+]?[0-9]+'),
|
('int', True, r'[-+]?[0-9]+'),
|
||||||
('command', r'[AaCcHhLlMmQqSsTtVvZz]'),
|
('command', True, r'[AaCcHhLlMmQqSsTtVvZz]'),
|
||||||
|
# The "flag" token is defined as a single 0 or single 1. We
|
||||||
|
# cannot parse this as a float or an int because those tokens
|
||||||
|
# might consume multiple digits (e.g. "int, 11" instead of "flag,
|
||||||
|
# 1" + "flag, 1") and the spec allows SVGs to omit the space after
|
||||||
|
# a flag here.
|
||||||
|
('flag', False, r'[01]'),
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -80,22 +86,47 @@ class Lexer(object):
|
||||||
def __init__(self, lexicon):
|
def __init__(self, lexicon):
|
||||||
self.lexicon = lexicon
|
self.lexicon = lexicon
|
||||||
parts = []
|
parts = []
|
||||||
for name, regex in lexicon:
|
for name, is_default, regex in lexicon:
|
||||||
parts.append('(?P<%s>%s)' % (name, regex))
|
if is_default:
|
||||||
|
parts.append('(?P<%s>%s)' % (name, regex))
|
||||||
|
self.all_token_names = [x for x, _, _ in lexicon]
|
||||||
|
self.single_token_rules = {x: re.compile('(?P<%s>%s)' % (x, y)) for x, _, y in lexicon}
|
||||||
self.regex_string = '|'.join(parts)
|
self.regex_string = '|'.join(parts)
|
||||||
self.regex = re.compile(self.regex_string)
|
self.regex = re.compile(self.regex_string)
|
||||||
|
|
||||||
def lex(self, text):
|
def lex(self, text):
|
||||||
""" Yield (token_type, str_data) tokens.
|
"""Coroutine that yields (token_type, str_data) tokens.
|
||||||
|
|
||||||
|
The parser can send a token name defined in the lexicon if the
|
||||||
|
default token rules are not useful.
|
||||||
|
|
||||||
The last token will be (EOF, None) where EOF is the singleton object
|
The last token will be (EOF, None) where EOF is the singleton object
|
||||||
defined in this module.
|
defined in this module.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
for match in self.regex.finditer(text):
|
offset = 0
|
||||||
for name, _ in self.lexicon:
|
current_pattern = self.regex
|
||||||
m = match.group(name)
|
|
||||||
|
while True:
|
||||||
|
match = current_pattern.search(text, offset)
|
||||||
|
if not match:
|
||||||
|
break
|
||||||
|
offset = match.end()
|
||||||
|
for name in self.all_token_names:
|
||||||
|
try:
|
||||||
|
m = match.group(name)
|
||||||
|
except IndexError:
|
||||||
|
# Thrown if "name" is not defined in the pattern.
|
||||||
|
# This happens when the parser requests a
|
||||||
|
# non-default token as the default token names are
|
||||||
|
# tried before non-default ones.
|
||||||
|
continue
|
||||||
if m is not None:
|
if m is not None:
|
||||||
yield (name, m)
|
pattern_request = (yield (name, m))
|
||||||
|
if pattern_request is None:
|
||||||
|
current_pattern = self.regex
|
||||||
|
else:
|
||||||
|
current_pattern = self.single_token_rules[pattern_request]
|
||||||
break
|
break
|
||||||
yield (EOF, None)
|
yield (EOF, None)
|
||||||
|
|
||||||
|
|
@ -155,8 +186,8 @@ class SVGPathParser(object):
|
||||||
""" Parse a string of SVG <path> data.
|
""" Parse a string of SVG <path> data.
|
||||||
"""
|
"""
|
||||||
gen = self.lexer.lex(text)
|
gen = self.lexer.lex(text)
|
||||||
next_val_fn = partial(next, *(gen,))
|
next_val_fn = gen.send
|
||||||
token = next_val_fn()
|
token = next_val_fn(None)
|
||||||
return self.rule_svg_path(next_val_fn, token)
|
return self.rule_svg_path(next_val_fn, token)
|
||||||
|
|
||||||
def rule_svg_path(self, next_val_fn, token):
|
def rule_svg_path(self, next_val_fn, token):
|
||||||
|
|
@ -171,12 +202,12 @@ class SVGPathParser(object):
|
||||||
|
|
||||||
def rule_closepath(self, next_val_fn, token):
|
def rule_closepath(self, next_val_fn, token):
|
||||||
command = token[1]
|
command = token[1]
|
||||||
token = next_val_fn()
|
token = next_val_fn(None)
|
||||||
return (command, []), token
|
return (command, []), token
|
||||||
|
|
||||||
def rule_moveto_or_lineto(self, next_val_fn, token):
|
def rule_moveto_or_lineto(self, next_val_fn, token):
|
||||||
command = token[1]
|
command = token[1]
|
||||||
token = next_val_fn()
|
token = next_val_fn(None)
|
||||||
coordinates = []
|
coordinates = []
|
||||||
while token[0] in self.number_tokens:
|
while token[0] in self.number_tokens:
|
||||||
pair, token = self.rule_coordinate_pair(next_val_fn, token)
|
pair, token = self.rule_coordinate_pair(next_val_fn, token)
|
||||||
|
|
@ -185,7 +216,7 @@ class SVGPathParser(object):
|
||||||
|
|
||||||
def rule_orthogonal_lineto(self, next_val_fn, token):
|
def rule_orthogonal_lineto(self, next_val_fn, token):
|
||||||
command = token[1]
|
command = token[1]
|
||||||
token = next_val_fn()
|
token = next_val_fn(None)
|
||||||
coordinates = []
|
coordinates = []
|
||||||
while token[0] in self.number_tokens:
|
while token[0] in self.number_tokens:
|
||||||
coord, token = self.rule_coordinate(next_val_fn, token)
|
coord, token = self.rule_coordinate(next_val_fn, token)
|
||||||
|
|
@ -194,7 +225,7 @@ class SVGPathParser(object):
|
||||||
|
|
||||||
def rule_curveto3(self, next_val_fn, token):
|
def rule_curveto3(self, next_val_fn, token):
|
||||||
command = token[1]
|
command = token[1]
|
||||||
token = next_val_fn()
|
token = next_val_fn(None)
|
||||||
coordinates = []
|
coordinates = []
|
||||||
while token[0] in self.number_tokens:
|
while token[0] in self.number_tokens:
|
||||||
pair1, token = self.rule_coordinate_pair(next_val_fn, token)
|
pair1, token = self.rule_coordinate_pair(next_val_fn, token)
|
||||||
|
|
@ -207,7 +238,7 @@ class SVGPathParser(object):
|
||||||
|
|
||||||
def rule_curveto2(self, next_val_fn, token):
|
def rule_curveto2(self, next_val_fn, token):
|
||||||
command = token[1]
|
command = token[1]
|
||||||
token = next_val_fn()
|
token = next_val_fn(None)
|
||||||
coordinates = []
|
coordinates = []
|
||||||
while token[0] in self.number_tokens:
|
while token[0] in self.number_tokens:
|
||||||
pair1, token = self.rule_coordinate_pair(next_val_fn, token)
|
pair1, token = self.rule_coordinate_pair(next_val_fn, token)
|
||||||
|
|
@ -218,7 +249,7 @@ class SVGPathParser(object):
|
||||||
|
|
||||||
def rule_curveto1(self, next_val_fn, token):
|
def rule_curveto1(self, next_val_fn, token):
|
||||||
command = token[1]
|
command = token[1]
|
||||||
token = next_val_fn()
|
token = next_val_fn(None)
|
||||||
coordinates = []
|
coordinates = []
|
||||||
while token[0] in self.number_tokens:
|
while token[0] in self.number_tokens:
|
||||||
pair1, token = self.rule_coordinate_pair(next_val_fn, token)
|
pair1, token = self.rule_coordinate_pair(next_val_fn, token)
|
||||||
|
|
@ -227,46 +258,46 @@ class SVGPathParser(object):
|
||||||
|
|
||||||
def rule_elliptical_arc(self, next_val_fn, token):
|
def rule_elliptical_arc(self, next_val_fn, token):
|
||||||
command = token[1]
|
command = token[1]
|
||||||
token = next_val_fn()
|
token = next_val_fn(None)
|
||||||
arguments = []
|
arguments = []
|
||||||
while token[0] in self.number_tokens:
|
while token[0] in self.number_tokens:
|
||||||
rx = Decimal(token[1]) * 1
|
rx = Decimal(token[1]) * 1
|
||||||
if rx < Decimal("0.0"):
|
if rx < Decimal("0.0"):
|
||||||
raise SyntaxError("expecting a nonnegative number; got %r" % (token,))
|
raise SyntaxError("expecting a nonnegative number; got %r" % (token,))
|
||||||
|
|
||||||
token = next_val_fn()
|
token = next_val_fn(None)
|
||||||
if token[0] not in self.number_tokens:
|
if token[0] not in self.number_tokens:
|
||||||
raise SyntaxError("expecting a number; got %r" % (token,))
|
raise SyntaxError("expecting a number; got %r" % (token,))
|
||||||
ry = Decimal(token[1]) * 1
|
ry = Decimal(token[1]) * 1
|
||||||
if ry < Decimal("0.0"):
|
if ry < Decimal("0.0"):
|
||||||
raise SyntaxError("expecting a nonnegative number; got %r" % (token,))
|
raise SyntaxError("expecting a nonnegative number; got %r" % (token,))
|
||||||
|
|
||||||
token = next_val_fn()
|
token = next_val_fn(None)
|
||||||
if token[0] not in self.number_tokens:
|
if token[0] not in self.number_tokens:
|
||||||
raise SyntaxError("expecting a number; got %r" % (token,))
|
raise SyntaxError("expecting a number; got %r" % (token,))
|
||||||
axis_rotation = Decimal(token[1]) * 1
|
axis_rotation = Decimal(token[1]) * 1
|
||||||
|
|
||||||
token = next_val_fn()
|
token = next_val_fn('flag')
|
||||||
if token[1] not in ('0', '1'):
|
if token[1] not in ('0', '1'):
|
||||||
raise SyntaxError("expecting a boolean flag; got %r" % (token,))
|
raise SyntaxError("expecting a boolean flag; got %r" % (token,))
|
||||||
large_arc_flag = Decimal(token[1]) * 1
|
large_arc_flag = Decimal(token[1]) * 1
|
||||||
|
|
||||||
token = next_val_fn()
|
token = next_val_fn('flag')
|
||||||
if token[1] not in ('0', '1'):
|
if token[1] not in ('0', '1'):
|
||||||
raise SyntaxError("expecting a boolean flag; got %r" % (token,))
|
raise SyntaxError("expecting a boolean flag; got %r" % (token,))
|
||||||
sweep_flag = Decimal(token[1]) * 1
|
sweep_flag = Decimal(token[1]) * 1
|
||||||
|
|
||||||
token = next_val_fn()
|
token = next_val_fn(None)
|
||||||
if token[0] not in self.number_tokens:
|
if token[0] not in self.number_tokens:
|
||||||
raise SyntaxError("expecting a number; got %r" % (token,))
|
raise SyntaxError("expecting a number; got %r" % (token,))
|
||||||
x = Decimal(token[1]) * 1
|
x = Decimal(token[1]) * 1
|
||||||
|
|
||||||
token = next_val_fn()
|
token = next_val_fn(None)
|
||||||
if token[0] not in self.number_tokens:
|
if token[0] not in self.number_tokens:
|
||||||
raise SyntaxError("expecting a number; got %r" % (token,))
|
raise SyntaxError("expecting a number; got %r" % (token,))
|
||||||
y = Decimal(token[1]) * 1
|
y = Decimal(token[1]) * 1
|
||||||
|
|
||||||
token = next_val_fn()
|
token = next_val_fn(None)
|
||||||
arguments.extend([rx, ry, axis_rotation, large_arc_flag, sweep_flag, x, y])
|
arguments.extend([rx, ry, axis_rotation, large_arc_flag, sweep_flag, x, y])
|
||||||
|
|
||||||
return (command, arguments), token
|
return (command, arguments), token
|
||||||
|
|
@ -275,7 +306,7 @@ class SVGPathParser(object):
|
||||||
if token[0] not in self.number_tokens:
|
if token[0] not in self.number_tokens:
|
||||||
raise SyntaxError("expecting a number; got %r" % (token,))
|
raise SyntaxError("expecting a number; got %r" % (token,))
|
||||||
x = getcontext().create_decimal(token[1])
|
x = getcontext().create_decimal(token[1])
|
||||||
token = next_val_fn()
|
token = next_val_fn(None)
|
||||||
return x, token
|
return x, token
|
||||||
|
|
||||||
def rule_coordinate_pair(self, next_val_fn, token):
|
def rule_coordinate_pair(self, next_val_fn, token):
|
||||||
|
|
@ -283,11 +314,11 @@ class SVGPathParser(object):
|
||||||
if token[0] not in self.number_tokens:
|
if token[0] not in self.number_tokens:
|
||||||
raise SyntaxError("expecting a number; got %r" % (token,))
|
raise SyntaxError("expecting a number; got %r" % (token,))
|
||||||
x = getcontext().create_decimal(token[1])
|
x = getcontext().create_decimal(token[1])
|
||||||
token = next_val_fn()
|
token = next_val_fn(None)
|
||||||
if token[0] not in self.number_tokens:
|
if token[0] not in self.number_tokens:
|
||||||
raise SyntaxError("expecting a number; got %r" % (token,))
|
raise SyntaxError("expecting a number; got %r" % (token,))
|
||||||
y = getcontext().create_decimal(token[1])
|
y = getcontext().create_decimal(token[1])
|
||||||
token = next_val_fn()
|
token = next_val_fn(None)
|
||||||
return [x, y], token
|
return [x, y], token
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue