]>
Commit | Line | Data |
---|---|---|
53e6db90 DC |
1 | # Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved. |
2 | # Licensed to PSF under a Contributor Agreement. | |
3 | ||
4 | # Modifications: | |
5 | # Copyright David Halter and Contributors | |
6 | # Modifications are dual-licensed: MIT and PSF. | |
7 | ||
8 | """ | |
9 | This module defines the data structures used to represent a grammar. | |
10 | ||
11 | Specifying grammars in pgen is possible with this grammar:: | |
12 | ||
13 | grammar: (NEWLINE | rule)* ENDMARKER | |
14 | rule: NAME ':' rhs NEWLINE | |
15 | rhs: items ('|' items)* | |
16 | items: item+ | |
17 | item: '[' rhs ']' | atom ['+' | '*'] | |
18 | atom: '(' rhs ')' | NAME | STRING | |
19 | ||
20 | This grammar is self-referencing. | |
21 | ||
22 | This parser generator (pgen2) was created by Guido van Rossum and used for lib2to3. | |
23 | Most of the code has been refactored to make it more Pythonic. Since this was a | |
24 | "copy" of the CPython Parser parser "pgen", there was some work needed to make | |
25 | it more readable. It should also be slightly faster than the original pgen2, | |
26 | because we made some optimizations. | |
27 | """ | |
28 | ||
from ast import literal_eval
from typing import Dict, Generic, Mapping, Sequence, Set, TypeVar, Union

from parso.pgen2.grammar_parser import GrammarParser, NFAState
33 | ||
34 | _TokenTypeT = TypeVar("_TokenTypeT") | |
35 | ||
36 | ||
class Grammar(Generic[_TokenTypeT]):
    """
    Container for the grammar tables consumed by the parsing engine in
    parse.py.  After construction the parser reads the instance attributes
    directly; this class adds no behavior of its own.

    The essential pieces are the DFAs per nonterminal and the transitions
    between those DFAs.
    """

    def __init__(
        self,
        start_nonterminal: str,
        rule_to_dfas: Mapping[str, Sequence['DFAState[_TokenTypeT]']],
        reserved_syntax_strings: Mapping[str, 'ReservedString'],
    ):
        # Rule name -> DFA states generated for that rule.
        self.nonterminal_to_dfas = rule_to_dfas
        # Keywords/operators that appear literally in the grammar text.
        self.reserved_syntax_strings = reserved_syntax_strings
        # The rule the parser starts parsing from (first rule in the grammar).
        self.start_nonterminal = start_nonterminal
54 | ||
55 | ||
class DFAPlan:
    """
    Plans are used for the parser to create stack nodes and do the proper
    DFA state transitions.

    :param next_dfa: The DFA state to move to within the current stack node.
    :param dfa_pushes: DFA states for the new stack nodes that must be pushed
        before moving to ``next_dfa``.  Defaults to an empty sequence.
    """
    # NOTE: the default used to be the mutable literal ``[]``; an immutable
    # empty tuple avoids the shared-mutable-default pitfall while remaining a
    # valid (falsy, indexable) Sequence for all callers.
    def __init__(self, next_dfa: 'DFAState', dfa_pushes: Sequence['DFAState'] = ()):
        self.next_dfa = next_dfa
        self.dfa_pushes = dfa_pushes

    def __repr__(self):
        return '%s(%s, %s)' % (self.__class__.__name__, self.next_dfa, self.dfa_pushes)
67 | ||
68 | ||
class DFAState(Generic[_TokenTypeT]):
    """
    The DFAState object is the core class for pretty much anything. DFAState
    are the vertices of an ordered graph while arcs and transitions are the
    edges.

    Arcs are the initial edges, where most DFAStates are not connected and
    transitions are then calculated to connect the DFA state machines that have
    different nonterminals.
    """
    def __init__(self, from_rule: str, nfa_set: Set[NFAState], final: NFAState):
        assert isinstance(nfa_set, set)
        assert isinstance(next(iter(nfa_set)), NFAState)
        assert isinstance(final, NFAState)
        # Name of the grammar rule this DFA state belongs to.
        self.from_rule = from_rule
        # The set of NFA states this DFA state stands for (powerset
        # construction); used for state-equality during reduction.
        self.nfa_set = nfa_set
        # map from terminals/nonterminals to DFAState.  Annotated as Dict (not
        # Mapping) because add_arc/unifystate mutate it in place.
        self.arcs: Dict[str, DFAState] = {}
        # In an intermediary step we set these nonterminal arcs (which has the
        # same structure as arcs). These don't contain terminals anymore.
        self.nonterminal_arcs: Dict[str, DFAState] = {}

        # Transitions are basically the only thing that the parser is using
        # with is_final. Everything else is purely here to create a parser.
        # Mutated by generate_grammar/_calculate_tree_traversal, hence Dict.
        self.transitions: Dict[Union[_TokenTypeT, ReservedString], DFAPlan] = {}
        # Final iff the NFA's accepting state is inside this state's closure.
        self.is_final = final in nfa_set

    def add_arc(self, next_, label):
        """Add an outgoing arc under ``label``; the label must be new."""
        assert isinstance(label, str)
        assert label not in self.arcs
        assert isinstance(next_, DFAState)
        self.arcs[label] = next_

    def unifystate(self, old, new):
        """Rewrite every arc that points at ``old`` to point at ``new``."""
        for label, next_ in self.arcs.items():
            if next_ is old:
                self.arcs[label] = new

    def __eq__(self, other):
        # Equality test -- ignore the nfa_set instance variable
        assert isinstance(other, DFAState)
        if self.is_final != other.is_final:
            return False
        # Can't just return self.arcs == other.arcs, because that
        # would invoke this method recursively, with cycles...
        if len(self.arcs) != len(other.arcs):
            return False
        for label, next_ in self.arcs.items():
            if next_ is not other.arcs.get(label):
                return False
        return True

    def __repr__(self):
        return '<%s: %s is_final=%s>' % (
            self.__class__.__name__, self.from_rule, self.is_final
        )
125 | ||
126 | ||
class ReservedString:
    """
    Represents a terminal that is written in the grammar as a literal string
    (e.g. the keyword "if" or the operator "*") rather than as a token type
    (e.g. NUMBER).
    """

    def __init__(self, value: str):
        # The unquoted keyword/operator text, e.g. 'if' or '*'.
        self.value = value

    def __repr__(self):
        return f'{type(self).__name__}({self.value})'
139 | ||
140 | ||
141 | def _simplify_dfas(dfas): | |
142 | """ | |
143 | This is not theoretically optimal, but works well enough. | |
144 | Algorithm: repeatedly look for two states that have the same | |
145 | set of arcs (same labels pointing to the same nodes) and | |
146 | unify them, until things stop changing. | |
147 | ||
148 | dfas is a list of DFAState instances | |
149 | """ | |
150 | changes = True | |
151 | while changes: | |
152 | changes = False | |
153 | for i, state_i in enumerate(dfas): | |
154 | for j in range(i + 1, len(dfas)): | |
155 | state_j = dfas[j] | |
156 | if state_i == state_j: | |
157 | del dfas[j] | |
158 | for state in dfas: | |
159 | state.unifystate(state_j, state_i) | |
160 | changes = True | |
161 | break | |
162 | ||
163 | ||
def _make_dfas(start, finish):
    """
    Uses the powerset construction algorithm to create DFA states from sets of
    NFA states.

    Also does state reduction if some states are not needed.

    :param start: The NFA entry state of one grammar rule.
    :param finish: The NFA accepting state of the same rule.
    :return: List of DFAState instances; the first one is the start state.
    """
    # To turn an NFA into a DFA, we define the states of the DFA
    # to correspond to *sets* of states of the NFA.  Then do some
    # state reduction.
    assert isinstance(start, NFAState)
    assert isinstance(finish, NFAState)

    def addclosure(nfa_state, base_nfa_set):
        # Epsilon closure: add nfa_state plus everything reachable through
        # arcs without a label (nonterminal_or_string is None) into
        # base_nfa_set.  Membership check makes the recursion cycle-safe.
        assert isinstance(nfa_state, NFAState)
        if nfa_state in base_nfa_set:
            return
        base_nfa_set.add(nfa_state)
        for nfa_arc in nfa_state.arcs:
            if nfa_arc.nonterminal_or_string is None:
                addclosure(nfa_arc.next, base_nfa_set)

    base_nfa_set = set()
    addclosure(start, base_nfa_set)
    states = [DFAState(start.from_rule, base_nfa_set, finish)]
    for state in states:  # NB states grows while we're iterating
        arcs = {}
        # Find state transitions and store them in arcs.
        # Group the labeled NFA arcs by label; each label maps to the epsilon
        # closure of all its target states.
        for nfa_state in state.nfa_set:
            for nfa_arc in nfa_state.arcs:
                if nfa_arc.nonterminal_or_string is not None:
                    nfa_set = arcs.setdefault(nfa_arc.nonterminal_or_string, set())
                    addclosure(nfa_arc.next, nfa_set)

        # Now create the dfa's with no None's in arcs anymore. All Nones have
        # been eliminated and state transitions (arcs) are properly defined, we
        # just need to create the dfa's.
        for nonterminal_or_string, nfa_set in arcs.items():
            # Reuse an existing DFA state for an identical NFA set, otherwise
            # append a new one (which the outer loop will then process too).
            for nested_state in states:
                if nested_state.nfa_set == nfa_set:
                    # The DFA state already exists for this rule.
                    break
            else:
                nested_state = DFAState(start.from_rule, nfa_set, finish)
                states.append(nested_state)

            state.add_arc(nested_state, nonterminal_or_string)
    return states  # List of DFAState instances; first one is start
212 | ||
213 | ||
214 | def _dump_nfa(start, finish): | |
215 | print("Dump of NFA for", start.from_rule) | |
216 | todo = [start] | |
217 | for i, state in enumerate(todo): | |
218 | print(" State", i, state is finish and "(final)" or "") | |
219 | for arc in state.arcs: | |
220 | label, next_ = arc.nonterminal_or_string, arc.next | |
221 | if next_ in todo: | |
222 | j = todo.index(next_) | |
223 | else: | |
224 | j = len(todo) | |
225 | todo.append(next_) | |
226 | if label is None: | |
227 | print(" -> %d" % j) | |
228 | else: | |
229 | print(" %s -> %d" % (label, j)) | |
230 | ||
231 | ||
232 | def _dump_dfas(dfas): | |
233 | print("Dump of DFA for", dfas[0].from_rule) | |
234 | for i, state in enumerate(dfas): | |
235 | print(" State", i, state.is_final and "(final)" or "") | |
236 | for nonterminal, next_ in state.arcs.items(): | |
237 | print(" %s -> %d" % (nonterminal, dfas.index(next_))) | |
238 | ||
239 | ||
def generate_grammar(bnf_grammar: str, token_namespace) -> Grammar:
    """
    Create a :class:`Grammar` from a grammar description.

    ``bnf_grammar`` is a grammar in extended BNF (using * for repetition, + for
    at-least-once repetition, [] for optional parts, | for alternatives and ()
    for grouping).

    It's not EBNF according to ISO/IEC 14977. It's a dialect Python uses in its
    own parser.

    :param bnf_grammar: The grammar description text.
    :param token_namespace: An object whose attributes map named terminals
        (e.g. NAME, NUMBER) to token types.
    """
    rule_to_dfas = {}
    start_nonterminal = None
    for nfa_a, nfa_z in GrammarParser(bnf_grammar).parse():
        # _dump_nfa(nfa_a, nfa_z)  # uncomment to debug the raw NFA
        dfas = _make_dfas(nfa_a, nfa_z)
        # _dump_dfas(dfas)  # uncomment to debug the reduced DFA
        _simplify_dfas(dfas)
        rule_to_dfas[nfa_a.from_rule] = dfas

        # The first rule in the grammar file is the parser's entry point.
        if start_nonterminal is None:
            start_nonterminal = nfa_a.from_rule

    # Dict (not Mapping): _make_transition inserts new ReservedString
    # instances into this mapping as they are discovered.
    reserved_strings: Dict[str, ReservedString] = {}
    for nonterminal, dfas in rule_to_dfas.items():
        for dfa_state in dfas:
            for terminal_or_nonterminal, next_dfa in dfa_state.arcs.items():
                if terminal_or_nonterminal in rule_to_dfas:
                    # Arc to another rule: keep it for tree traversal below.
                    dfa_state.nonterminal_arcs[terminal_or_nonterminal] = next_dfa
                else:
                    # Arc over a terminal: becomes a direct transition.
                    transition = _make_transition(
                        token_namespace,
                        reserved_strings,
                        terminal_or_nonterminal
                    )
                    dfa_state.transitions[transition] = DFAPlan(next_dfa)

    _calculate_tree_traversal(rule_to_dfas)
    return Grammar(start_nonterminal, rule_to_dfas, reserved_strings)  # type: ignore
280 | ||
281 | ||
282 | def _make_transition(token_namespace, reserved_syntax_strings, label): | |
283 | """ | |
284 | Creates a reserved string ("if", "for", "*", ...) or returns the token type | |
285 | (NUMBER, STRING, ...) for a given grammar terminal. | |
286 | """ | |
287 | if label[0].isalpha(): | |
288 | # A named token (e.g. NAME, NUMBER, STRING) | |
289 | return getattr(token_namespace, label) | |
290 | else: | |
291 | # Either a keyword or an operator | |
292 | assert label[0] in ('"', "'"), label | |
293 | assert not label.startswith('"""') and not label.startswith("'''") | |
294 | value = literal_eval(label) | |
295 | try: | |
296 | return reserved_syntax_strings[value] | |
297 | except KeyError: | |
298 | r = reserved_syntax_strings[value] = ReservedString(value) | |
299 | return r | |
300 | ||
301 | ||
def _calculate_tree_traversal(nonterminal_to_dfas):
    """
    By this point we know how dfas can move around within a stack node, but we
    don't know how we can add a new stack node (nonterminal transitions).

    Converts every nonterminal arc into terminal transitions (DFAPlans) using
    the first-terminal sets, so the parser can decide from a single lookahead
    token which stack nodes to push.

    :raises ValueError: If the grammar is ambiguous (one token would allow
        two different rules) or contains left recursion.
    """
    # Map from grammar rule (nonterminal) name to a set of tokens.
    first_plans = {}

    nonterminals = list(nonterminal_to_dfas.keys())
    # Sorted so results (and error messages) are deterministic.
    nonterminals.sort()
    for nonterminal in nonterminals:
        if nonterminal not in first_plans:
            _calculate_first_plans(nonterminal_to_dfas, first_plans, nonterminal)

    # Now that we have calculated the first terminals, we are sure that
    # there is no left recursion.

    for dfas in nonterminal_to_dfas.values():
        for dfa_state in dfas:
            transitions = dfa_state.transitions
            for nonterminal, next_dfa in dfa_state.nonterminal_arcs.items():
                # Replace the nonterminal arc with one transition per possible
                # first terminal of that nonterminal.
                for transition, pushes in first_plans[nonterminal].items():
                    if transition in transitions:
                        # Two different rules begin with the same terminal from
                        # this state: the grammar is ambiguous for LL(1).
                        prev_plan = transitions[transition]
                        # Make sure these are sorted so that error messages are
                        # at least deterministic
                        choices = sorted([
                            (
                                prev_plan.dfa_pushes[0].from_rule
                                if prev_plan.dfa_pushes
                                else prev_plan.next_dfa.from_rule
                            ),
                            (
                                pushes[0].from_rule
                                if pushes else next_dfa.from_rule
                            ),
                        ])
                        raise ValueError(
                            "Rule %s is ambiguous; given a %s token, we "
                            "can't determine if we should evaluate %s or %s."
                            % (
                                (
                                    dfa_state.from_rule,
                                    transition,
                                ) + tuple(choices)
                            )
                        )
                    transitions[transition] = DFAPlan(next_dfa, pushes)
350 | ||
351 | ||
def _calculate_first_plans(nonterminal_to_dfas, first_plans, nonterminal):
    """
    Calculates the first plan in the first_plans dictionary for every given
    nonterminal. This is going to be used to know when to create stack nodes.

    Recursively fills ``first_plans[nonterminal]`` with a mapping from each
    possible first terminal to the list of DFA states to push, and returns
    that mapping.

    :raises ValueError: If the grammar contains left recursion.
    """
    dfas = nonterminal_to_dfas[nonterminal]
    new_first_plans = {}
    first_plans[nonterminal] = None  # dummy to detect left recursion
    # We only need to check the first dfa. All the following ones are not
    # interesting to find first terminals.
    state = dfas[0]
    for transition, next_ in state.transitions.items():
        # It's a string. We have finally found a possible first token.
        new_first_plans[transition] = [next_.next_dfa]

    for nonterminal2, next_ in state.nonterminal_arcs.items():
        # It's a nonterminal and we have either a left recursion issue
        # in the grammar or we have to recurse.
        try:
            first_plans2 = first_plans[nonterminal2]
        except KeyError:
            # Not calculated yet: recurse into the referenced rule.
            first_plans2 = _calculate_first_plans(nonterminal_to_dfas, first_plans, nonterminal2)
        else:
            # Hitting the None dummy means we re-entered a rule that is still
            # being calculated, i.e. left recursion.
            if first_plans2 is None:
                raise ValueError("left recursion for rule %r" % nonterminal)

        for t, pushes in first_plans2.items():
            # Each first terminal of the sub-rule becomes a first terminal of
            # this rule, with the sub-rule's DFA prepended to the pushes.
            new_first_plans[t] = [next_] + pushes

    first_plans[nonterminal] = new_first_plans
    return new_first_plans