# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.

import os
from typing import (
    IO,
    Any,
    Dict,
    Iterator,
    List,
    NoReturn,
    Optional,
    Sequence,
    Tuple,
    Union,
)

from blib2to3.pgen2 import grammar, token, tokenize
from blib2to3.pgen2.tokenize import GoodTokenInfo

Path = Union[str, "os.PathLike[str]"]


class PgenGrammar(grammar.Grammar):
    pass


class ParserGenerator:
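    """Compiles a grammar file into the tables a PgenGrammar needs.

    The constructor runs the whole pipeline: tokenize the grammar file,
    parse each rule into an NFA, convert and simplify the DFAs, and
    compute FIRST sets.  Call make_grammar() afterwards for the result.
    """
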
    filename: Path
    stream: IO[str]
    generator: Iterator[GoodTokenInfo]
    first: Dict[str, Optional[Dict[str, int]]]

    def __init__(self, filename: Path, stream: Optional[IO[str]] = None) -> None:
        close_stream = None
        if stream is None:
            stream = open(filename, encoding="utf-8")
            close_stream = stream.close
        self.filename = filename
        self.stream = stream
        self.generator = tokenize.generate_tokens(stream.readline)
        self.gettoken()  # Initialize lookahead
        self.dfas, self.startsymbol = self.parse()
        if close_stream is not None:
            close_stream()
        self.first = {}  # map from symbol name to set of tokens
        self.addfirstsets()

    def make_grammar(self) -> PgenGrammar:
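        """Pack the parsed DFAs and FIRST sets into a PgenGrammar.

        Nonterminals are numbered from 256 upward (the start symbol
        first) so they cannot collide with token numbers, which stay
        below 256.  Each DFA state becomes a list of (ilabel, next)
        arcs; an extra (0, itself) arc marks an accepting state.
        """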
        c = PgenGrammar()
        names = list(self.dfas.keys())
        names.sort()
        names.remove(self.startsymbol)
        names.insert(0, self.startsymbol)
        for name in names:
            i = 256 + len(c.symbol2number)
            c.symbol2number[name] = i
            c.number2symbol[i] = name
        for name in names:
            dfa = self.dfas[name]
            states = []
            for state in dfa:
                arcs = []
                for label, next in sorted(state.arcs.items()):
                    arcs.append((self.make_label(c, label), dfa.index(next)))
                if state.isfinal:
                    arcs.append((0, dfa.index(state)))
                states.append(arcs)
            c.states.append(states)
            c.dfas[c.symbol2number[name]] = (states, self.make_first(c, name))
        c.start = c.symbol2number[self.startsymbol]
        return c

    def make_first(self, c: PgenGrammar, name: str) -> Dict[int, int]:
        rawfirst = self.first[name]
        assert rawfirst is not None
        first = {}
        for label in sorted(rawfirst):
            ilabel = self.make_label(c, label)
            ##assert ilabel not in first # XXX failed on <> ... !=
            first[ilabel] = 1
        return first

    def make_label(self, c: PgenGrammar, label: str) -> int:
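        """Return the ilabel (index into c.labels) for a grammar label.

        A label is one of: a nonterminal name, a named token type
        (e.g. NAME or NUMBER), a quoted keyword ('if') or soft keyword
        ("match"), or a quoted operator ('+').  Each distinct label is
        stored once in c.labels and its index is reused afterwards.
        """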
        # XXX Maybe this should be a method on a subclass of converter?
        ilabel = len(c.labels)
        if label[0].isalpha():
            # Either a symbol name or a named token
            if label in c.symbol2number:
                # A symbol name (a non-terminal)
                if label in c.symbol2label:
                    return c.symbol2label[label]
                else:
                    c.labels.append((c.symbol2number[label], None))
                    c.symbol2label[label] = ilabel
                    return ilabel
            else:
                # A named token (NAME, NUMBER, STRING)
                itoken = getattr(token, label, None)
                assert isinstance(itoken, int), label
                assert itoken in token.tok_name, label
                if itoken in c.tokens:
                    return c.tokens[itoken]
                else:
                    c.labels.append((itoken, None))
                    c.tokens[itoken] = ilabel
                    return ilabel
        else:
            # Either a keyword or an operator
            assert label[0] in ('"', "'"), label
            value = eval(label)
            if value[0].isalpha():
                if label[0] == '"':
                    keywords = c.soft_keywords
                else:
                    keywords = c.keywords

                # A keyword
                if value in keywords:
                    return keywords[value]
                else:
                    c.labels.append((token.NAME, value))
                    keywords[value] = ilabel
                    return ilabel
            else:
                # An operator (any non-numeric token)
                itoken = grammar.opmap[value]  # Fails if unknown token
                if itoken in c.tokens:
                    return c.tokens[itoken]
                else:
                    c.labels.append((itoken, None))
                    c.tokens[itoken] = ilabel
                    return ilabel

    def addfirstsets(self) -> None:
        names = list(self.dfas.keys())
        names.sort()
        for name in names:
            if name not in self.first:
                self.calcfirst(name)
            # print name, self.first[name].keys()

    def calcfirst(self, name: str) -> None:
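        """Compute the FIRST set of one rule, recursing into sub-rules.

        self.first[name] is set to None while the computation runs so
        that left recursion shows up as an error, and the per-arc FIRST
        sets are cross-checked: if one terminal could begin two
        different alternatives, the grammar is rejected as ambiguous.
        """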
        dfa = self.dfas[name]
        self.first[name] = None  # dummy to detect left recursion
        state = dfa[0]
        totalset: Dict[str, int] = {}
        overlapcheck = {}
        for label in state.arcs:
            if label in self.dfas:
                if label in self.first:
                    fset = self.first[label]
                    if fset is None:
                        raise ValueError("recursion for rule %r" % name)
                else:
                    self.calcfirst(label)
                    fset = self.first[label]
                    assert fset is not None
                totalset.update(fset)
                overlapcheck[label] = fset
            else:
                totalset[label] = 1
                overlapcheck[label] = {label: 1}
        inverse: Dict[str, str] = {}
        for label, itsfirst in overlapcheck.items():
            for symbol in itsfirst:
                if symbol in inverse:
                    raise ValueError(
                        "rule %s is ambiguous; %s is in the first sets of %s as well"
                        " as %s" % (name, symbol, label, inverse[symbol])
                    )
                inverse[symbol] = label
        self.first[name] = totalset

    def parse(self) -> Tuple[Dict[str, List["DFAState"]], str]:
        dfas = {}
        startsymbol: Optional[str] = None
        # MSTART: (NEWLINE | RULE)* ENDMARKER
        while self.type != token.ENDMARKER:
            while self.type == token.NEWLINE:
                self.gettoken()
            # RULE: NAME ':' RHS NEWLINE
            name = self.expect(token.NAME)
            self.expect(token.OP, ":")
            a, z = self.parse_rhs()
            self.expect(token.NEWLINE)
            # self.dump_nfa(name, a, z)
            dfa = self.make_dfa(a, z)
            # self.dump_dfa(name, dfa)
            # oldlen = len(dfa)
            self.simplify_dfa(dfa)
            # newlen = len(dfa)
            dfas[name] = dfa
            # print name, oldlen, newlen
            if startsymbol is None:
                startsymbol = name
        assert startsymbol is not None
        return dfas, startsymbol

    def make_dfa(self, start: "NFAState", finish: "NFAState") -> List["DFAState"]:
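        """Convert an NFA (given as start/finish states) into a DFA.

        This is the classic subset construction: every DFA state stands
        for the epsilon-closure of a set of NFA states, and following a
        label from a DFA state means following that label from each NFA
        state in the set and closing over epsilon arcs again.
        """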
        # To turn an NFA into a DFA, we define the states of the DFA
        # to correspond to *sets* of states of the NFA.  Then do some
        # state reduction.  Let's represent sets as dicts with 1 for
        # values.
        assert isinstance(start, NFAState)
        assert isinstance(finish, NFAState)

        def closure(state: NFAState) -> Dict[NFAState, int]:
            base: Dict[NFAState, int] = {}
            addclosure(state, base)
            return base

        def addclosure(state: NFAState, base: Dict[NFAState, int]) -> None:
            assert isinstance(state, NFAState)
            if state in base:
                return
            base[state] = 1
            for label, next in state.arcs:
                if label is None:
                    addclosure(next, base)

        states = [DFAState(closure(start), finish)]
        for state in states:  # NB states grows while we're iterating
            arcs: Dict[str, Dict[NFAState, int]] = {}
            for nfastate in state.nfaset:
                for label, next in nfastate.arcs:
                    if label is not None:
                        addclosure(next, arcs.setdefault(label, {}))
            for label, nfaset in sorted(arcs.items()):
                for st in states:
                    if st.nfaset == nfaset:
                        break
                else:
                    st = DFAState(nfaset, finish)
                    states.append(st)
                state.addarc(st, label)
        return states  # List of DFAState instances; first one is start

    def dump_nfa(self, name: str, start: "NFAState", finish: "NFAState") -> None:
        print("Dump of NFA for", name)
        todo = [start]
        for i, state in enumerate(todo):
            print("  State", i, state is finish and "(final)" or "")
            for label, next in state.arcs:
                if next in todo:
                    j = todo.index(next)
                else:
                    j = len(todo)
                    todo.append(next)
                if label is None:
                    print("    -> %d" % j)
                else:
                    print("    %s -> %d" % (label, j))

    def dump_dfa(self, name: str, dfa: Sequence["DFAState"]) -> None:
        print("Dump of DFA for", name)
        for i, state in enumerate(dfa):
            print("  State", i, state.isfinal and "(final)" or "")
            for label, next in sorted(state.arcs.items()):
                print("    %s -> %d" % (label, dfa.index(next)))

    def simplify_dfa(self, dfa: List["DFAState"]) -> None:
        # This is not theoretically optimal, but works well enough.
        # Algorithm: repeatedly look for two states that have the same
        # set of arcs (same labels pointing to the same nodes) and
        # unify them, until things stop changing.

        # dfa is a list of DFAState instances
        changes = True
        while changes:
            changes = False
            for i, state_i in enumerate(dfa):
                for j in range(i + 1, len(dfa)):
                    state_j = dfa[j]
                    if state_i == state_j:
                        # print "  unify", i, j
                        del dfa[j]
                        for state in dfa:
                            state.unifystate(state_j, state_i)
                        changes = True
                        break

    def parse_rhs(self) -> Tuple["NFAState", "NFAState"]:
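        """Parse a rule's alternatives into one NFA fragment.

        A single alternative is returned as is; for several, fresh
        start/end states fan out to each alternative with epsilon arcs
        (label None):

            aa --> a1 ... z1 --> zz
            aa --> a2 ... z2 --> zz
        """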
        # RHS: ALT ('|' ALT)*
        a, z = self.parse_alt()
        if self.value != "|":
            return a, z
        else:
            aa = NFAState()
            zz = NFAState()
            aa.addarc(a)
            z.addarc(zz)
            while self.value == "|":
                self.gettoken()
                a, z = self.parse_alt()
                aa.addarc(a)
                z.addarc(zz)
            return aa, zz

    def parse_alt(self) -> Tuple["NFAState", "NFAState"]:
        # ALT: ITEM+
        a, b = self.parse_item()
        while self.value in ("(", "[") or self.type in (token.NAME, token.STRING):
            c, d = self.parse_item()
            b.addarc(c)
            b = d
        return a, b

    def parse_item(self) -> Tuple["NFAState", "NFAState"]:
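        """Parse one item, handling the '[...]', '+' and '*' suffixes.

        '[...]' adds an epsilon arc from start to end, so the item may
        be skipped; '+' adds a back arc end -> start for repetition;
        '*' does the same but returns (a, a), so the item as a whole
        may also be skipped entirely.
        """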
        # ITEM: '[' RHS ']' | ATOM ['+' | '*']
        if self.value == "[":
            self.gettoken()
            a, z = self.parse_rhs()
            self.expect(token.OP, "]")
            a.addarc(z)
            return a, z
        else:
            a, z = self.parse_atom()
            value = self.value
            if value not in ("+", "*"):
                return a, z
            self.gettoken()
            z.addarc(a)
            if value == "+":
                return a, z
            else:
                return a, a

    def parse_atom(self) -> Tuple["NFAState", "NFAState"]:
        # ATOM: '(' RHS ')' | NAME | STRING
        if self.value == "(":
            self.gettoken()
            a, z = self.parse_rhs()
            self.expect(token.OP, ")")
            return a, z
        elif self.type in (token.NAME, token.STRING):
            a = NFAState()
            z = NFAState()
            a.addarc(z, self.value)
            self.gettoken()
            return a, z
        else:
            self.raise_error(
                "expected (...) or NAME or STRING, got %s/%s", self.type, self.value
            )
            raise AssertionError

    def expect(self, type: int, value: Optional[Any] = None) -> str:
        if self.type != type or (value is not None and self.value != value):
            self.raise_error(
                "expected %s/%s, got %s/%s", type, value, self.type, self.value
            )
        value = self.value
        self.gettoken()
        return value

    def gettoken(self) -> None:
        tup = next(self.generator)
        while tup[0] in (tokenize.COMMENT, tokenize.NL):
            tup = next(self.generator)
        self.type, self.value, self.begin, self.end, self.line = tup
        # print token.tok_name[self.type], repr(self.value)

    def raise_error(self, msg: str, *args: Any) -> NoReturn:
        if args:
            try:
                msg = msg % args
            except Exception:
                msg = " ".join([msg] + list(map(str, args)))
        raise SyntaxError(msg, (self.filename, self.end[0], self.end[1], self.line))


class NFAState:
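    """A state in a nondeterministic finite automaton.

    Arcs are (label, next_state) pairs kept in insertion order; a label
    of None is an epsilon arc, a transition that consumes no input.
    """
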
    arcs: List[Tuple[Optional[str], "NFAState"]]

    def __init__(self) -> None:
        self.arcs = []  # list of (label, NFAState) pairs

    def addarc(self, next: "NFAState", label: Optional[str] = None) -> None:
        assert label is None or isinstance(label, str)
        assert isinstance(next, NFAState)
        self.arcs.append((label, next))


class DFAState:
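    """A state in a deterministic finite automaton.

    Wraps the set of NFA states it was built from (a dict used as an
    ordered set); the state is final iff that set contains the NFA's
    finish state.  Arcs map each label to exactly one successor.
    """
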
    nfaset: Dict[NFAState, Any]
    isfinal: bool
    arcs: Dict[str, "DFAState"]

    def __init__(self, nfaset: Dict[NFAState, Any], final: NFAState) -> None:
        assert isinstance(nfaset, dict)
        assert isinstance(next(iter(nfaset)), NFAState)
        assert isinstance(final, NFAState)
        self.nfaset = nfaset
        self.isfinal = final in nfaset
        self.arcs = {}  # map from label to DFAState

    def addarc(self, next: "DFAState", label: str) -> None:
        assert isinstance(label, str)
        assert label not in self.arcs
        assert isinstance(next, DFAState)
        self.arcs[label] = next

    def unifystate(self, old: "DFAState", new: "DFAState") -> None:
        for label, next in self.arcs.items():
            if next is old:
                self.arcs[label] = new

    def __eq__(self, other: Any) -> bool:
        # Equality test -- ignore the nfaset instance variable
        assert isinstance(other, DFAState)
        if self.isfinal != other.isfinal:
            return False
        # Can't just return self.arcs == other.arcs, because that
        # would invoke this method recursively, with cycles...
        if len(self.arcs) != len(other.arcs):
            return False
        for label, next in self.arcs.items():
            if next is not other.arcs.get(label):
                return False
        return True

    __hash__: Any = None  # For Py3 compatibility.


def generate_grammar(filename: Path = "Grammar.txt") -> PgenGrammar:
    p = ParserGenerator(filename)
    return p.make_grammar()
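

# A minimal usage sketch, assuming a pgen-style "Grammar.txt" exists in
# the working directory (the filename is an assumption; point it at any
# grammar file in the format this module parses):
if __name__ == "__main__":
    g = generate_grammar("Grammar.txt")
    print("start symbol:", g.number2symbol[g.start])
    print("nonterminals:", len(g.symbol2number))
    print("labels:", len(g.labels))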