from dataclasses import dataclass, field
from typing import Any

import regex


@dataclass
class Token:
    # A lexed token. Only `value` takes part in equality; the positional
    # metadata (start, source, line) is excluded from comparisons.
    start: int = field(compare=False)
    value: Any
    source: str = field(compare=False)
    line: str = field(repr=False, compare=False)


@dataclass
class Expression:
    elements: list[Any]


@dataclass
class FunctionCall:
    name: str
    arg: str


class ParenList(list, Token):
    # A parenthesised group of tokens. Subclassing Token lets a whole group
    # stand in wherever a single token is expected; `opener`/`closer` record
    # the delimiter tokens around the group.
    opener: Token | None
    closer: Token | None

    def __init__(self, *args):
        super().__init__(*args)
        self.opener = None
        self.closer = None

    def __repr__(self):
        return (f"<{self.__class__.__name__} opener={self.opener!r}"
                f" {super().__repr__()} closer={self.closer!r}>")


def parse(
        string,
        delimiters,
        *,
        singletons=(),
        openers="[{(\"'",
        closers="]})\"'",
        include_delims=False,
        split_at_parens=True):
    # Split `string` at top-level delimiters while keeping bracketed and
    # quoted spans intact. Adapted from https://stackoverflow.com/a/64211769
    current_string = ""
    stack = []
    otc = dict(zip(openers, closers))  # opener -> matching closer
    # Try longer delimiters/singletons first so e.g. "==" wins over "=".
    delimiters = sorted(delimiters, key=len, reverse=True)
    singletons = sorted(singletons, key=len, reverse=True)
    while string:
        c = string[0]
        if c in openers:
            if stack and otc[c] == stack[-1] == c:
                # A quote closes itself: the same character opens and closes.
                stack.pop()
                if split_at_parens:
                    yield current_string + c
                    current_string = ""
                string = string[1:]
                continue
            else:
                if split_at_parens and not stack and current_string:
                    yield current_string
                    current_string = ""
                stack.append(c)
        elif c in closers:
            if not stack:
                raise SyntaxError(f"unopened '{c}'")
            if otc[b := stack.pop()] != c:
                raise SyntaxError(
                    f"closing paren '{c}' does not match opening paren '{b}'")
            if split_at_parens and not stack and current_string:
                yield current_string + c
                current_string = c = ""
        at_split = False
        if not stack:
            for d in delimiters:
                if string.startswith(d):
                    if current_string:
                        yield current_string
                    if include_delims:
                        yield d
                    current_string = ""
                    string = string.removeprefix(d)
                    at_split = True
                    break
        if not at_split:
            for s in singletons:
                if stack:
                    continue
                if string.startswith(s):
                    yield from (current_string, s)
                    current_string = ""
                    string = string.removeprefix(s)
                    break
            else:
                # No delimiter or singleton here: keep the character and
                # advance. Matches consume their input via removeprefix above.
                current_string += c
                string = string[1:]
    if stack:
        raise SyntaxError(f"unmatched '{stack[-1]}'")
    yield current_string
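
# A rough sketch of how `parse` splits (illustrative; not exercised below):
#   list(parse("a, (b, c), d", [", "]))            -> ['a', '(b, c)', 'd']
#   list(parse("x=1", ["="], include_delims=True)) -> ['x', '=', '1']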


def raise_token_error(tokens, errormsg):
    # Report an error with a caret line underlining each offending token.
    tokens = sorted(tokens, key=lambda t: t.start)
    line = tokens[0].line
    pointers = list(" " * len(line))
    for token in tokens:
        src_len = len(token.source)
        pointers[token.start:token.start+src_len] = list("^" * src_len)
    raise SyntaxError(f"{errormsg}\n{line}\n{''.join(pointers)}")
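
# For example, a hypothetical mismatch in "foo {bar)" would render as:
#   SyntaxError: mismatched parens: { <-> )
#   foo {bar)
#       ^   ^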


def tokenslice(from_, to):
    # The raw source text from the start of `from_` to the end of `to`.
    return from_.line[from_.start:to.start+len(to.source)]


def get_end_token(it, which):
    # Drill into nested ParenLists for the boundary Token:
    # the closing token if `which` is truthy, else the opening one.
    while isinstance(it, ParenList):
        it = it.closer if which else it.opener
    return it


def _parse_secondpass(tree):
    # Second pass: fold the nested ParenList structure into plain values,
    # Expressions and FunctionCalls.
    if not isinstance(tree, ParenList):
        if isinstance(tree, Token):
            return tree.value
        return tree
    treeval = None
    match tree.opener.value:
        case "{":
            # {...} is literal text: the raw source between the braces.
            treeval = tokenslice(tree.opener, tree.closer)[1:-1] if tree else ""
        case "[":
            # [name args...] is a function call whose argument is the raw
            # source text of everything after the name.
            if tree:
                arg = tokenslice(get_end_token(tree[1], False),
                                 get_end_token(tree[-1], True)) if len(tree) > 1 else ""
                treeval = FunctionCall(tree[0].source, arg)
        case _:
            # "(...)" groups (and anything unexpected) become Expressions.
            treeval = Expression(list(map(_parse_secondpass, tree)))
    return treeval


def _interpolate_secondpass(top):
    # Split the top-level token list into literal text and parsed (...)
    # groups, recovering the literal parts verbatim from the source line.
    out = []
    prev = top.opener
    for item in top:
        if isinstance(item, ParenList) and item.opener.value == "(" and item.closer.value == ")":
            # Text between the previous group and this one, minus the
            # surrounding paren characters themselves.
            if string := tokenslice(prev, item.opener).removesuffix("(").removeprefix(")"):
                out.append(string)
            out.append(_parse_secondpass(item))
            prev = item.closer
    if string := tokenslice(prev, top.closer).removesuffix("(").removeprefix(")"):
        out.append(string)
    return out


def _parse_firstpass(line, atoms, mismatch_pred=lambda _: True, notclosed_pred=lambda _: True):
    # First pass: group the flat token stream by paren nesting. The two
    # predicates receive the stack of open groups and decide whether a
    # paren problem is fatal.
    stack = []
    current_tokens = ParenList()
    # Everything is wrapped in a synthetic top-level function call.
    current_tokens.opener = Token(0, "[", "[", line)
    c2o = dict(zip(")]}", "([{"))  # closer -> matching opener
    for token in tokenize(line, atoms):
        if token.value in c2o.values():
            stack.append((current_tokens, token.value, token))
            current_tokens = ParenList()
            current_tokens.opener = token
        elif token.value in c2o.keys():
            if not stack:
                raise_token_error([token], f"unmatched closing paren '{token.value}'")
            closed_open = c2o[token.value]
            previous, expected_open, open_token = stack.pop()
            if expected_open != closed_open and mismatch_pred(stack):
                raise_token_error(
                    [token, open_token],
                    f"mismatched parens: {expected_open} <-> {token.value}")
            previous.append(current_tokens)
            current_tokens.closer = token
            current_tokens = previous
        else:
            current_tokens.append(token)
    if stack and notclosed_pred(stack):
        raise_token_error([ctx[2] for ctx in stack],
                          "these parens were never closed:")
    elif stack:
        # Tolerated unclosed groups: fall back to the outermost list.
        current_tokens = stack[0][0]
    # Synthetic closer spanning to the end of the line.
    current_tokens.closer = Token(len(line) - 1, "]", "]", line)
    return current_tokens
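
# Roughly, for a line like "a (b)" the first pass returns the synthetic
# top-level group ParenList([Token('a' ...), ParenList([Token('b' ...)])]),
# with .opener/.closer recording where each group starts and ends.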


def parse2(line, atoms):
    return _parse_secondpass(_parse_firstpass(line, atoms))
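
# e.g. parse2("add 1 2", ["$"]) should give FunctionCall(name='add', arg='1 2'):
# the whole line parses as one synthetic [...] call whose argument text is
# carried through verbatim.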


def parse_interpolated(line, atoms):
    # Unbalanced parens are only fatal when the outermost unclosed group is
    # an interpolation "(...)"; stray brackets in plain text are tolerated.
    def has_open(stack):
        return bool(stack) and stack[0][1] == "("
    return _interpolate_secondpass(_parse_firstpass(line, atoms, has_open, has_open))
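
# e.g. parse_interpolated("a (b) c", ["$"]) should give
# ['a ', Expression(elements=['b']), ' c']: literal text interleaved with
# the parsed (...) groups.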


def process_escapes(string):
    # n/t/e are the usual newline/tab/escape; o and c give literal
    # opening/closing braces. Any other escaped character stands for itself.
    ESCAPES = dict(zip("nteoc", "\n\t\x1b{}"))
    return regex.sub(r"\\(.)", lambda match: ESCAPES.get(match.group(1), match.group(1)), string)
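
# e.g. process_escapes(r"1\n2 \o3\c \x") -> '1\n2 {3} x'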


def process_token(token):
    if token[0] in "'\"":
        # Quoted string: strip the quotes and resolve escapes.
        return process_escapes(token[1:-1])
    try:
        return int(token, base=0)  # base=0 also accepts 0x/0o/0b literals
    except ValueError:
        try:
            return float(token)
        except ValueError:
            return token
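
# e.g. process_token("0x1f") -> 31, process_token("-2.5") -> -2.5,
# process_token('"hi"') -> 'hi'; anything else falls through unchanged.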


def escape_atom(a):
    # Atoms that look like words get \b guards so they only match whole words.
    if a[0].isalpha() and a[-1].isalpha():
        return fr"(?:\b{regex.escape(a)}\b)"
    return f"(?:{regex.escape(a)})"
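
# e.g. escape_atom("or") -> r"(?:\bor\b)" so "or" cannot match inside
# "orange", while escape_atom("$") -> r"(?:\$)" with no word-boundary guards.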


def tokenize(string, atoms):
    # Reverse lexicographic sort puts an atom before any atom that is a
    # prefix of it ("is in" before "is"), so the alternation prefers the
    # longer match.
    ATOM_TOKEN = "|".join(map(escape_atom, sorted(atoms, reverse=True)))
    # A whole quoted string, honouring backslash escapes; the quotes must be
    # preceded and followed by whitespace (or the ends of the input).
    STRING_TOKEN = r"""(?<=\s|^)(?P<q>['"])(?:\\\S|(?!(?P=q))[\s\S])*?(?P=q)(?=\s|$)"""
    NUMBER_TOKEN = r"0x[0-9a-fA-F]+|-?\d+(?:\.\d+(?:[eE][+-]?\d+)?)?"
    PAREN_TOKEN = r"[\[\](){}]"
    NOT_ANYNAME_TOKEN = r"(?P<any>" + "|".join(map(lambda t: f"(?:{t})", filter(
        None, [PAREN_TOKEN, STRING_TOKEN, ATOM_TOKEN, NUMBER_TOKEN]))) + ")"
    # Any run of non-space characters that does not start one of the known
    # tokens; the (?&any) subroutine call needs the third-party `regex` module.
    ANYNAME_TOKEN = r"(?:(?!(?&any))\S)+"
    ALL_TOKENS = regex.compile(NOT_ANYNAME_TOKEN + "|" + ANYNAME_TOKEN)
    i = 0
    while i < len(string):
        match = ALL_TOKENS.search(string, i)
        if not match:
            return
        token = match.group(0)
        yield Token(match.start(), process_token(token), token, string)
        i = match.end()


if __name__ == '__main__':
    line = "'et a tomato! (Sub-expression) {]{]{] pa(\"'\") spooky! (the orange) And errors: {{{"
    atoms = ["$", "or", "and", "is in"]
    for t in tokenize(line, atoms):
        print(t.line)
        print(" " * t.start + "^" * len(t.source), repr(t.value))
    parsed = parse_interpolated(line, atoms)
    print("-" * 80)
    print(parsed)