]>
Commit | Line | Data |
---|---|---|
53e6db90 DC |
1 | # Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved. |
2 | # Licensed to PSF under a Contributor Agreement. | |
3 | ||
4 | # Modifications: | |
5 | # Copyright David Halter and Contributors | |
6 | # Modifications are dual-licensed: MIT and PSF. | |
7 | from typing import Optional, Iterator, Tuple, List | |
8 | ||
9 | from parso.python.tokenize import tokenize | |
10 | from parso.utils import parse_version_string | |
11 | from parso.python.token import PythonTokenTypes | |
12 | ||
13 | ||
14 | class NFAArc: | |
15 | def __init__(self, next_: 'NFAState', nonterminal_or_string: Optional[str]): | |
16 | self.next: NFAState = next_ | |
17 | self.nonterminal_or_string: Optional[str] = nonterminal_or_string | |
18 | ||
19 | def __repr__(self): | |
20 | return '<%s: %s>' % (self.__class__.__name__, self.nonterminal_or_string) | |
21 | ||
22 | ||
23 | class NFAState: | |
24 | def __init__(self, from_rule: str): | |
25 | self.from_rule: str = from_rule | |
26 | self.arcs: List[NFAArc] = [] | |
27 | ||
28 | def add_arc(self, next_, nonterminal_or_string=None): | |
29 | assert nonterminal_or_string is None or isinstance(nonterminal_or_string, str) | |
30 | assert isinstance(next_, NFAState) | |
31 | self.arcs.append(NFAArc(next_, nonterminal_or_string)) | |
32 | ||
33 | def __repr__(self): | |
34 | return '<%s: from %s>' % (self.__class__.__name__, self.from_rule) | |
35 | ||
36 | ||
37 | class GrammarParser: | |
38 | """ | |
39 | The parser for Python grammar files. | |
40 | """ | |
41 | def __init__(self, bnf_grammar: str): | |
42 | self._bnf_grammar = bnf_grammar | |
43 | self.generator = tokenize( | |
44 | bnf_grammar, | |
45 | version_info=parse_version_string('3.9') | |
46 | ) | |
47 | self._gettoken() # Initialize lookahead | |
48 | ||
49 | def parse(self) -> Iterator[Tuple[NFAState, NFAState]]: | |
50 | # grammar: (NEWLINE | rule)* ENDMARKER | |
51 | while self.type != PythonTokenTypes.ENDMARKER: | |
52 | while self.type == PythonTokenTypes.NEWLINE: | |
53 | self._gettoken() | |
54 | ||
55 | # rule: NAME ':' rhs NEWLINE | |
56 | self._current_rule_name = self._expect(PythonTokenTypes.NAME) | |
57 | self._expect(PythonTokenTypes.OP, ':') | |
58 | ||
59 | a, z = self._parse_rhs() | |
60 | self._expect(PythonTokenTypes.NEWLINE) | |
61 | ||
62 | yield a, z | |
63 | ||
64 | def _parse_rhs(self): | |
65 | # rhs: items ('|' items)* | |
66 | a, z = self._parse_items() | |
67 | if self.value != "|": | |
68 | return a, z | |
69 | else: | |
70 | aa = NFAState(self._current_rule_name) | |
71 | zz = NFAState(self._current_rule_name) | |
72 | while True: | |
73 | # Add the possibility to go into the state of a and come back | |
74 | # to finish. | |
75 | aa.add_arc(a) | |
76 | z.add_arc(zz) | |
77 | if self.value != "|": | |
78 | break | |
79 | ||
80 | self._gettoken() | |
81 | a, z = self._parse_items() | |
82 | return aa, zz | |
83 | ||
84 | def _parse_items(self): | |
85 | # items: item+ | |
86 | a, b = self._parse_item() | |
87 | while self.type in (PythonTokenTypes.NAME, PythonTokenTypes.STRING) \ | |
88 | or self.value in ('(', '['): | |
89 | c, d = self._parse_item() | |
90 | # Need to end on the next item. | |
91 | b.add_arc(c) | |
92 | b = d | |
93 | return a, b | |
94 | ||
95 | def _parse_item(self): | |
96 | # item: '[' rhs ']' | atom ['+' | '*'] | |
97 | if self.value == "[": | |
98 | self._gettoken() | |
99 | a, z = self._parse_rhs() | |
100 | self._expect(PythonTokenTypes.OP, ']') | |
101 | # Make it also possible that there is no token and change the | |
102 | # state. | |
103 | a.add_arc(z) | |
104 | return a, z | |
105 | else: | |
106 | a, z = self._parse_atom() | |
107 | value = self.value | |
108 | if value not in ("+", "*"): | |
109 | return a, z | |
110 | self._gettoken() | |
111 | # Make it clear that we can go back to the old state and repeat. | |
112 | z.add_arc(a) | |
113 | if value == "+": | |
114 | return a, z | |
115 | else: | |
116 | # The end state is the same as the beginning, nothing must | |
117 | # change. | |
118 | return a, a | |
119 | ||
120 | def _parse_atom(self): | |
121 | # atom: '(' rhs ')' | NAME | STRING | |
122 | if self.value == "(": | |
123 | self._gettoken() | |
124 | a, z = self._parse_rhs() | |
125 | self._expect(PythonTokenTypes.OP, ')') | |
126 | return a, z | |
127 | elif self.type in (PythonTokenTypes.NAME, PythonTokenTypes.STRING): | |
128 | a = NFAState(self._current_rule_name) | |
129 | z = NFAState(self._current_rule_name) | |
130 | # Make it clear that the state transition requires that value. | |
131 | a.add_arc(z, self.value) | |
132 | self._gettoken() | |
133 | return a, z | |
134 | else: | |
135 | self._raise_error("expected (...) or NAME or STRING, got %s/%s", | |
136 | self.type, self.value) | |
137 | ||
138 | def _expect(self, type_, value=None): | |
139 | if self.type != type_: | |
140 | self._raise_error("expected %s, got %s [%s]", | |
141 | type_, self.type, self.value) | |
142 | if value is not None and self.value != value: | |
143 | self._raise_error("expected %s, got %s", value, self.value) | |
144 | value = self.value | |
145 | self._gettoken() | |
146 | return value | |
147 | ||
148 | def _gettoken(self): | |
149 | tup = next(self.generator) | |
150 | self.type, self.value, self.begin, prefix = tup | |
151 | ||
152 | def _raise_error(self, msg, *args): | |
153 | if args: | |
154 | try: | |
155 | msg = msg % args | |
156 | except: | |
157 | msg = " ".join([msg] + list(map(str, args))) | |
158 | line = self._bnf_grammar.splitlines()[self.begin[0] - 1] | |
159 | raise SyntaxError(msg, ('<grammar>', self.begin[0], | |
160 | self.begin[1], line)) |