@@ -137,6 +137,8 @@ def process_escapes(string):
 
 
 def process_token(token):
+    if not token:
+        return ""
     if token[0] in "'\"":
         return process_escapes(token[1:-1])
     try:
@@ -155,30 +157,45 @@ def escape_atom(a):
 
 
 def tokenize(string, atoms):
-    ATOM_TOKEN = "|".join(map(escape_atom, sorted(atoms, reverse=True)))
-    STRING_TOKEN = r"""(?<=\s|^)(?P<q>['"])(?:\\\S|(?!(?P=q))[\s\S])*?(?P=q)(?=\s|$)"""
-    NUMBER_TOKEN = r"0x\d+|-?\d+(?:\.\d+(?:[eE][+-]\d+)?)?"
-    PAREN_TOKEN = r"[\[\](){}]"
-    NOT_ANYNAME_TOKEN = r"(?P<any>" + "|".join(map(lambda t: f"(?:{t})", filter(
-        None, [PAREN_TOKEN, STRING_TOKEN, ATOM_TOKEN, NUMBER_TOKEN]))) + ")"
-    ANYNAME_TOKEN = r"(?:(?!(?&any))\S)+"
-    ALL_TOKENS = regex.compile(NOT_ANYNAME_TOKEN + "|" + ANYNAME_TOKEN)
+    ATOM_REGEX = "|".join(
+        fr"(?&start){regex.escape(a)}(?&end)"
+        if a[0].isalpha() and a[-1].isalpha()
+        else regex.escape(a)
+        for a in sorted(atoms, reverse=True)
+    )
+    if ATOM_REGEX:
+        ATOM_REGEX = "| (?:%s)" % ATOM_REGEX
+    ALL_TOKENS = r"""
+    (?(DEFINE)
+        (?P<start>(?<=\s|^))
+        (?P<end>(?=\s|$))
+    )
+    (?P<special>
+        (?:[\[\](){}])  # parens
+        | (?:(?&start)(?P<q>['"])(?:\\\S|(?!(?P=q))[\s\S])*?(?P=q)(?&end))
+          # double or single quoted string
+        %s  # an atom (but NOT in a word) -- this will be formatted in below vv
+        | (?:0x\d+|-?\d+(?:\.\d+(?:[eE][+-]\d+)?)?)  # a number
+    ) | (?:(?:(?!(?&special))\S)+)  # anything that is not a special token""" % ATOM_REGEX
+    ALL_TOKENS = regex.compile(ALL_TOKENS, flags=regex.X)
     i = 0
     while i < len(string):
         match = ALL_TOKENS.search(string, i)
         if not match:
             return
         token = match.group(0)
+        if not token:
+            raise_token_error([Token(i, None, " ", string)], f"empty token (internal error) {atoms=}")
         yield Token(match.start(), process_token(token), token, string)
         i = match.end()
 
 
 if __name__ == '__main__':
-    line, atoms = "It's a tomato! (Sub-expression) {]{]{] pa(\"'\\n\\n\") spooky! (the orange) And errors: {{{", [
+    line, atoms = "sandbox world door", [
         "$", "or", "and", "is in"]
     for t in tokenize(line, atoms):
         print(t.line)
         print(" " * t.start + "^" * len(t.source), repr(t.value))
-    parsed = parse_interpolated(line, atoms)
+    parsed = parse2(line, atoms, "()")
     print("-" * 80)
     print(parsed)
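
A quick aside on the construct the rewrite leans on, since stdlib re does not support it: in the third-party regex module (which this pattern requires), (?(DEFINE)...) declares named subpatterns without matching anything, and (?&name) replays a named group's pattern later in the expression. A minimal sketch, assuming only the regex package; the WORD_OR name is illustrative, not from the PR:

    import regex  # third-party: pip install regex

    # (?(DEFINE)...) matches nothing by itself; it only declares named
    # groups whose patterns (?&name) can replay later in the expression.
    WORD_OR = regex.compile(r"""
        (?(DEFINE)
            (?P<start>(?<=\s|^))   # left boundary: whitespace or start of string
            (?P<end>(?=\s|$))      # right boundary: whitespace or end of string
        )
        (?&start)or(?&end)         # match "or" only as a standalone word
    """, flags=regex.X)

    print(bool(WORD_OR.search("red or green")))  # True -- standalone word
    print(bool(WORD_OR.search("world")))         # False -- embedded in a word

This is what lets the start/end whitespace boundaries be written once in the DEFINE block and reused by both the quoted-string pattern and every word-like atom.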
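And a sketch of the atom alternation on its own, to show why the new test line should tokenize as three plain names: atoms that start and end with a letter ("or", "and", "is in") are wrapped in the boundary calls so they match only as whole words, while symbolic atoms like "$" match anywhere; sorted(atoms, reverse=True) also orders "is in" ahead of any shorter alternative sharing its prefix, so the alternation prefers the longest atom. The ATOMS_ONLY name and the expected outputs are illustrative, assuming the pattern pieces behave as in the diff:

    import regex

    atoms = ["$", "or", "and", "is in"]
    ATOM_REGEX = "|".join(
        fr"(?&start){regex.escape(a)}(?&end)"  # word-like atoms: whole words only
        if a[0].isalpha() and a[-1].isalpha()
        else regex.escape(a)                   # symbolic atoms: anywhere
        for a in sorted(atoms, reverse=True)
    )
    ATOMS_ONLY = regex.compile(r"""
        (?(DEFINE)
            (?P<start>(?<=\s|^))
            (?P<end>(?=\s|$))
        )
        (?:%s)""" % ATOM_REGEX, flags=regex.X)

    print([m[0] for m in ATOMS_ONLY.finditer("a or b is in c$d")])
    # expected: ['or', 'is in', '$']
    print([m[0] for m in ATOMS_ONLY.finditer("sandbox world door")])
    # expected: [] -- the "or" embedded in "world" and "door" no longer matches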