Commit 5d09398

add better parser (NOT HOOKED UP YET)

1 parent e846da1 commit 5d09398

File tree

8 files changed (+296, -81 lines)

.devcontainer/devcontainer.json (+1)

@@ -0,0 +1 @@
+{"image":"mcr.microsoft.com/devcontainers/python:0-3.11-bullseye"}

.gitignore (+1, -1)

@@ -1,2 +1,2 @@
-__pycache__/**
+**/__pycache__
 json_runner.egg-info/**

.vscode/settings.json (+6)

@@ -0,0 +1,6 @@
+{
+    "cSpell.words": [
+        "identbodychars",
+        "pyparsing"
+    ]
+}

json_runner.test.py (+8)

@@ -108,3 +108,11 @@
 - say ((1 2 3) foo bar)
 - say (#[list 1 2 3])
 """))
+
+# test bad parsing (issue #1)
+
+x.eval(yaml.full_load("""
+# this errors because of the unclosed quote
+#      v
+- say I'm a tomato!
+"""))

json_runner.py renamed to json_runner/__init__.py (+11, -80)

@@ -2,75 +2,9 @@
 import random
 from collections import OrderedDict

-__all__ = "parse Signal Done Next Abort Return BareEngine Engine".split()
-
+from .string_parsing import parse

-def parse(
-        string,
-        delimiters,
-        *,
-        singletons=[],
-        openers="[{(\"'",
-        closers="]})\"'",
-        include_delims=False,
-        split_at_parens=True):
-    # Adapted from https://stackoverflow.com/a/64211769
-    current_string = ''
-    stack = []
-    otc = dict(zip(openers, closers))
-    delimiters = sorted(delimiters, key=len, reverse=True)
-    singletons = sorted(singletons, key=len, reverse=True)
-    while string:
-        c = string[0]
-        if c in openers:
-            if stack and otc[c] == stack[-1] == c:
-                stack.pop()
-                if split_at_parens:
-                    yield current_string + c
-                    current_string = ""
-                string = string[1:]
-                continue
-            else:
-                if split_at_parens and not stack and current_string:
-                    yield current_string
-                    current_string = ""
-                stack.append(c)
-        elif c in closers:
-            if not stack:
-                raise SyntaxError("unopened %s" % c)
-            if otc[b := stack.pop()] != c:
-                raise SyntaxError(
-                    f"closing paren '{c}' does not match opening paren '{b}'")
-            if split_at_parens and not stack and current_string:
-                yield current_string + c
-                current_string = c = ""
-        at_split = False
-        if not stack:
-            for d in delimiters:
-                if string.startswith(d):
-                    if current_string:
-                        yield current_string
-                    if include_delims:
-                        yield d
-                    current_string = ""
-                    string = string.removeprefix(d)
-                    at_split = True
-                    break
-        if not at_split:
-            for s in singletons:
-                if stack:
-                    continue
-                if string.startswith(s):
-                    yield from (current_string, s)
-                    current_string = ""
-                    string = string.removeprefix(s)
-                    break
-            else:
-                current_string += c
-                string = string[1:]
-    if stack:
-        raise SyntaxError(f"unmatched '{stack[-1]}'")
-    yield current_string
+__all__ = "parse Signal Done Next Abort Return BareEngine Engine".split()


 PYTHONIZE_MAP = {
@@ -125,26 +59,23 @@ def __init__(self):
     @property
     def ops(self):
         names = [x for x in dir(self) if x.startswith("op_")]
-        nPk = [(int((s := n.removeprefix("op_").split("_", 1))[0]), s[1], n)
+        precedence_name_method = [(int((s := n.removeprefix("op_").split("_", 1))[0]), s[1], n)
                for n in names]
-        sPk = sorted(nPk, key=lambda x: x[0])
-        text_ops = [x[1].replace("_", " ") for x in sPk]
-        for i, o in enumerate(text_ops):
-            for punc, py in PYTHONIZE_MAP.items():
-                o = o.replace(py, punc)
-            text_ops[i] = o
-        py_name_ops = [getattr(self, x[2]) for x in sPk]
-        return OrderedDict(zip(text_ops, py_name_ops))
+        sorted_pnm = sorted(precedence_name_method, key=lambda x: x[0])
+        op_names = [x[1].replace("_", " ") for x in sorted_pnm]
+        for i, o in enumerate(op_names):
+            for punctuation, python in PYTHONIZE_MAP.items():
+                o = o.replace(python, punctuation)
+            op_names[i] = o
+        callbacks = [getattr(self, x[2]) for x in sorted_pnm]
+        return OrderedDict(zip(op_names, callbacks))

     def eval(self, code):
         match code:
             case str():
                 code = code.strip()
                 if not code:
                     return None
-                if ";" in code:
-                    return self.eval(list(parse(code, [";"],
-                                                split_at_parens=False)))
                 return self.call_function(code)
             case list() | tuple():
                 self.scope_stack[-1]["result"] = None
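
For clarity on what the renamed variables hold: each op_<precedence>_<name> method is decoded into a numeric precedence and the operator's display text. A quick illustration with a hypothetical method name (not one from this codebase):

    n = "op_10_is_in"                        # hypothetical op_ method name
    s = n.removeprefix("op_").split("_", 1)  # ["10", "is_in"]
    precedence = int(s[0])                   # 10
    text = s[1].replace("_", " ")            # "is in"
    # PYTHONIZE_MAP then swaps spelled-out words back to punctuation, and
    # ops returns an OrderedDict mapping operator text to bound methods.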

json_runner/string_parsing.py (new file, +252)

@@ -0,0 +1,252 @@
+
+
+from dataclasses import dataclass, field
+from pprint import pprint
+from typing import Any
+import regex
+
+
+@dataclass
+class Token:
+    start: int = field(compare=False)
+    value: Any
+    source: str = field(compare=False)
+    line: str = field(repr=False, compare=False)
+
+
+@dataclass
+class Expression:
+    elements: list[Any]
+
+
+@dataclass
+class FunctionCall:
+    name: str
+    arg: str
+
+
+class ParenList(list, Token):
+    opener: Token | None
+    closer: Token | None
+
+    def __init__(self, *args):
+        super().__init__(*args)
+        self.opener = None
+        self.closer = None
+
+    def __repr__(self):
+        return f"<{self.__class__.__name__} opener={self.opener!r} {super().__repr__()} closer={self.closer!r}>"
+
+
+def parse(
+        string,
+        delimiters,
+        *,
+        singletons=[],
+        openers="[{(\"'",
+        closers="]})\"'",
+        include_delims=False,
+        split_at_parens=True):
+    # Adapted from https://stackoverflow.com/a/64211769
+    current_string = ''
+    stack = []
+    otc = dict(zip(openers, closers))
+    delimiters = sorted(delimiters, key=len, reverse=True)
+    singletons = sorted(singletons, key=len, reverse=True)
+    while string:
+        c = string[0]
+        if c in openers:
+            if stack and otc[c] == stack[-1] == c:
+                stack.pop()
+                if split_at_parens:
+                    yield current_string + c
+                    current_string = ""
+                string = string[1:]
+                continue
+            else:
+                if split_at_parens and not stack and current_string:
+                    yield current_string
+                    current_string = ""
+                stack.append(c)
+        elif c in closers:
+            if not stack:
+                raise SyntaxError("unopened %s" % c)
+            if otc[b := stack.pop()] != c:
+                raise SyntaxError(
+                    f"closing paren '{c}' does not match opening paren '{b}'")
+            if split_at_parens and not stack and current_string:
+                yield current_string + c
+                current_string = c = ""
+        at_split = False
+        if not stack:
+            for d in delimiters:
+                if string.startswith(d):
+                    if current_string:
+                        yield current_string
+                    if include_delims:
+                        yield d
+                    current_string = ""
+                    string = string.removeprefix(d)
+                    at_split = True
+                    break
+        if not at_split:
+            for s in singletons:
+                if stack:
+                    continue
+                if string.startswith(s):
+                    yield from (current_string, s)
+                    current_string = ""
+                    string = string.removeprefix(s)
+                    break
+            else:
+                current_string += c
+                string = string[1:]
+    if stack:
+        raise SyntaxError(f"unmatched '{stack[-1]}'")
+    yield current_string
+
+
+def raise_token_error(tokens, errormsg):
+    tokens = sorted(tokens, key=lambda t: t.start)
+    line = tokens[0].line
+    pointers = list(" " * len(line))
+    for token in tokens:
+        src_len = len(token.source)
+        pointers[token.start:token.start+src_len] = list("^" * src_len)
+    raise SyntaxError(f"{errormsg}\n{line}\n{''.join(pointers)}")
+
+
+def tokenslice(from_, to):
+    return from_.line[from_.start:to.start+len(to.source)]
+
+
+def get_end_token(it, which):
+    while isinstance(it, ParenList):
+        it = it.closer if which else it.opener
+    return it
+
+
+def _parse_secondpass(tree):
+    if not isinstance(tree, ParenList):
+        if isinstance(tree, Token):
+            return tree.value
+        return tree
+    treeval = None
+    match tree.opener.value:
+        case "{":
+            treeval = tokenslice(tree.opener, tree.closer)[
+                1:-1] if tree else ""
+        case "[":
+            if tree:
+                treeval = FunctionCall(tree[0].source, tokenslice(get_end_token(
+                    tree[1], False), get_end_token(tree[-1], True)) if len(tree) > 1 else "")
+        case "(" | _:
+            treeval = Expression(list(map(_parse_secondpass, tree)))
+    return treeval
+
+def _interpolate_secondpass(top):
+    out = []
+    prev = top.opener
+    for item in top:
+        if isinstance(item, ParenList) and item.opener.value == "(" and item.closer.value == ")":
+            if string := tokenslice(prev, item.opener).removesuffix("(").removeprefix(")"):
+                out.append(string)
+            out.append(_parse_secondpass(item))
+            prev = item.closer
+    if string := tokenslice(prev, top.closer).removesuffix("(").removeprefix(")"):
+        out.append(string)
+    return out
+
+
+def _parse_firstpass(line, atoms, mismatch_pred=lambda _: True, notclosed_pred=lambda _: True):
+    # first pass: nesting stuff
+    stack = []
+    current_tokens = ParenList()
+    # everything is wrapped in top level function call
+    current_tokens.opener = Token(0, "[", "[", line)
+    c2o = dict(zip(")]}", "([{"))
+    for token in tokenize(line, atoms):
+        if token.value in c2o.values():
+            stack.append((current_tokens, token.value, token))
+            current_tokens = ParenList()
+            current_tokens.opener = token
+        elif token.value in c2o.keys():
+            closed_open = c2o[token.value]
+            previous, expected_open, open_token = stack.pop()
+            if expected_open != closed_open and mismatch_pred(stack):
+                raise_token_error(
+                    [token, open_token], f"mismatched parens: {expected_open} <-> {token.value}")
+            previous.append(current_tokens)
+            current_tokens.closer = token
+            current_tokens = previous
+        else:
+            current_tokens.append(token)
+    if stack and notclosed_pred(stack):
+        raise_token_error([ctx[2] for ctx in stack],
+                          "these parens were never closed:")
+    elif stack:
+        current_tokens = stack[0][0]
+    current_tokens.closer = Token(len(line) - 1, "]", "]", line)
+    return current_tokens
+
+
+def parse2(line, atoms):
+    return _parse_secondpass(_parse_firstpass(line, atoms))
+
+
+def parse_interpolated(line, atoms):
+    has_open = lambda stack: stack and stack[0][0].opener == "("
+    return _interpolate_secondpass(_parse_firstpass(line, atoms, has_open, has_open))
+
+
+def process_escapes(string):
+    ESCAPES = dict(zip("nteoc", "\n\t\x1b{}"))
+    return regex.sub(r"\\(.)", lambda match: ESCAPES.get(match.group(1), match.group(1)), string)
+
+
+def process_token(token):
+    if token[0] in "'\"":  # quoted token -> string literal
+        return process_escapes(token[1:-1])
+    try:
+        return int(token, base=0)
+    except ValueError:
+        try:
+            return float(token)
+        except ValueError:
+            return token
+
+
+def escape_atom(a):
+    if a[0].isalpha() and a[-1].isalpha():
+        return fr"(?:\b{regex.escape(a)}\b)"
+    return f"(?:{regex.escape(a)})"
+
+
+def tokenize(string, atoms):
+    ATOM_TOKEN = "|".join(map(escape_atom, sorted(atoms, reverse=True)))
+    STRING_TOKEN = r"""(?<=\s|^)(?P<q>['"])(?:\\\S|(?!(?P=q))[\s\S])*?(?P=q)(?=\s|$)"""
+    NUMBER_TOKEN = r"0x\d+|-?\d+(?:\.\d+(?:[eE][+-]\d+)?)?"
+    PAREN_TOKEN = r"[\[\](){}]"
+    NOT_ANYNAME_TOKEN = r"(?P<any>" + "|".join(map(lambda t: f"(?:{t})", filter(
+        None, [PAREN_TOKEN, STRING_TOKEN, ATOM_TOKEN, NUMBER_TOKEN]))) + ")"
+    ANYNAME_TOKEN = r"(?:(?!(?&any))\S)+"
+    ALL_TOKENS = regex.compile(NOT_ANYNAME_TOKEN + "|" + ANYNAME_TOKEN)
+    i = 0
+    while i < len(string):
+        match = ALL_TOKENS.search(string, i)
+        if not match:
+            return
+        token = match.group(0)
+        yield Token(match.start(), process_token(token), token, string)
+        i = match.end()
+
+
+if __name__ == '__main__':
+    line, atoms = "'et a tomato! (Sub-expression) {]{]{] pa(\"'\") spooky! (the orange) And errors: {{{", [
+        "$", "or", "and", "is in"]
+    for t in tokenize(line, atoms):
+        print(t.line)
+        print(" " * t.start + "^" * len(t.source), repr(t.value))
+    parsed = parse_interpolated(line, atoms)
+    print("-" * 80)
+    print(parsed)
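
The module also re-exports the legacy splitter (imported by json_runner/__init__.py above). A minimal usage sketch, assuming the package is importable; the expected output is traced from the generator's logic rather than taken from the repository:

    from json_runner.string_parsing import parse

    # Split on ";" only outside parentheses, as Engine.eval used to do:
    print(list(parse("say (a; b); say c", [";"], split_at_parens=False)))
    # -> ['say (a; b)', ' say c']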
