]>
Commit | Line | Data |
---|---|---|
1 | # Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved. | |
2 | # Licensed to PSF under a Contributor Agreement. | |
3 | ||
4 | # mypy: ignore-errors | |
5 | ||
6 | """Convert graminit.[ch] spit out by pgen to Python code. | |
7 | ||
8 | Pgen is the Python parser generator. It is useful to quickly create a | |
9 | parser from a grammar file in Python's grammar notation. But I don't | |
10 | want my parsers to be written in C (yet), so I'm translating the | |
11 | parsing tables to Python data structures and writing a Python parse | |
12 | engine. | |
13 | ||
14 | Note that the token numbers are constants determined by the standard | |
15 | Python tokenizer. The standard token module defines these numbers and | |
16 | their names (the names are not used much). The token numbers are | |
17 | hardcoded into the Python tokenizer and into pgen. A Python | |
18 | implementation of the Python tokenizer is also available, in the | |
19 | standard tokenize module. | |
20 | ||
21 | On the other hand, symbol numbers (representing the grammar's | |
22 | non-terminals) are assigned by pgen based on the actual grammar | |
23 | input. | |
24 | ||
25 | Note: this module is pretty much obsolete; the pgen module generates | |
26 | equivalent grammar tables directly from the Grammar.txt input file | |
27 | without having to invoke the Python pgen C program. | |
28 | ||
29 | """ | |
30 | ||
31 | # Python imports | |
32 | import re | |
33 | ||
34 | # Local imports | |
35 | from pgen2 import grammar, token | |
36 | ||
37 | ||
38 | class Converter(grammar.Grammar): | |
39 | """Grammar subclass that reads classic pgen output files. | |
40 | ||
41 | The run() method reads the tables as produced by the pgen parser | |
42 | generator, typically contained in two C files, graminit.h and | |
43 | graminit.c. The other methods are for internal use only. | |
44 | ||
45 | See the base class for more documentation. | |
46 | ||
47 | """ | |
48 | ||
49 | def run(self, graminit_h, graminit_c): | |
50 | """Load the grammar tables from the text files written by pgen.""" | |
51 | self.parse_graminit_h(graminit_h) | |
52 | self.parse_graminit_c(graminit_c) | |
53 | self.finish_off() | |
54 | ||
55 | def parse_graminit_h(self, filename): | |
56 | """Parse the .h file written by pgen. (Internal) | |
57 | ||
58 | This file is a sequence of #define statements defining the | |
59 | nonterminals of the grammar as numbers. We build two tables | |
60 | mapping the numbers to names and back. | |
61 | ||
62 | """ | |
63 | try: | |
64 | f = open(filename) | |
65 | except OSError as err: | |
66 | print(f"Can't open {filename}: {err}") | |
67 | return False | |
68 | self.symbol2number = {} | |
69 | self.number2symbol = {} | |
70 | lineno = 0 | |
71 | for line in f: | |
72 | lineno += 1 | |
73 | mo = re.match(r"^#define\s+(\w+)\s+(\d+)$", line) | |
74 | if not mo and line.strip(): | |
75 | print(f"{filename}({lineno}): can't parse {line.strip()}") | |
76 | else: | |
77 | symbol, number = mo.groups() | |
78 | number = int(number) | |
79 | assert symbol not in self.symbol2number | |
80 | assert number not in self.number2symbol | |
81 | self.symbol2number[symbol] = number | |
82 | self.number2symbol[number] = symbol | |
83 | return True | |
84 | ||
85 | def parse_graminit_c(self, filename): | |
86 | """Parse the .c file written by pgen. (Internal) | |
87 | ||
88 | The file looks as follows. The first two lines are always this: | |
89 | ||
90 | #include "pgenheaders.h" | |
91 | #include "grammar.h" | |
92 | ||
93 | After that come four blocks: | |
94 | ||
95 | 1) one or more state definitions | |
96 | 2) a table defining dfas | |
97 | 3) a table defining labels | |
98 | 4) a struct defining the grammar | |
99 | ||
100 | A state definition has the following form: | |
101 | - one or more arc arrays, each of the form: | |
102 | static arc arcs_<n>_<m>[<k>] = { | |
103 | {<i>, <j>}, | |
104 | ... | |
105 | }; | |
106 | - followed by a state array, of the form: | |
107 | static state states_<s>[<t>] = { | |
108 | {<k>, arcs_<n>_<m>}, | |
109 | ... | |
110 | }; | |
111 | ||
112 | """ | |
113 | try: | |
114 | f = open(filename) | |
115 | except OSError as err: | |
116 | print(f"Can't open {filename}: {err}") | |
117 | return False | |
118 | # The code below essentially uses f's iterator-ness! | |
119 | lineno = 0 | |
120 | ||
121 | # Expect the two #include lines | |
122 | lineno, line = lineno + 1, next(f) | |
123 | assert line == '#include "pgenheaders.h"\n', (lineno, line) | |
124 | lineno, line = lineno + 1, next(f) | |
125 | assert line == '#include "grammar.h"\n', (lineno, line) | |
126 | ||
127 | # Parse the state definitions | |
128 | lineno, line = lineno + 1, next(f) | |
129 | allarcs = {} | |
130 | states = [] | |
131 | while line.startswith("static arc "): | |
132 | while line.startswith("static arc "): | |
133 | mo = re.match(r"static arc arcs_(\d+)_(\d+)\[(\d+)\] = {$", line) | |
134 | assert mo, (lineno, line) | |
135 | n, m, k = list(map(int, mo.groups())) | |
136 | arcs = [] | |
137 | for _ in range(k): | |
138 | lineno, line = lineno + 1, next(f) | |
139 | mo = re.match(r"\s+{(\d+), (\d+)},$", line) | |
140 | assert mo, (lineno, line) | |
141 | i, j = list(map(int, mo.groups())) | |
142 | arcs.append((i, j)) | |
143 | lineno, line = lineno + 1, next(f) | |
144 | assert line == "};\n", (lineno, line) | |
145 | allarcs[(n, m)] = arcs | |
146 | lineno, line = lineno + 1, next(f) | |
147 | mo = re.match(r"static state states_(\d+)\[(\d+)\] = {$", line) | |
148 | assert mo, (lineno, line) | |
149 | s, t = list(map(int, mo.groups())) | |
150 | assert s == len(states), (lineno, line) | |
151 | state = [] | |
152 | for _ in range(t): | |
153 | lineno, line = lineno + 1, next(f) | |
154 | mo = re.match(r"\s+{(\d+), arcs_(\d+)_(\d+)},$", line) | |
155 | assert mo, (lineno, line) | |
156 | k, n, m = list(map(int, mo.groups())) | |
157 | arcs = allarcs[n, m] | |
158 | assert k == len(arcs), (lineno, line) | |
159 | state.append(arcs) | |
160 | states.append(state) | |
161 | lineno, line = lineno + 1, next(f) | |
162 | assert line == "};\n", (lineno, line) | |
163 | lineno, line = lineno + 1, next(f) | |
164 | self.states = states | |
165 | ||
166 | # Parse the dfas | |
167 | dfas = {} | |
168 | mo = re.match(r"static dfa dfas\[(\d+)\] = {$", line) | |
169 | assert mo, (lineno, line) | |
170 | ndfas = int(mo.group(1)) | |
171 | for i in range(ndfas): | |
172 | lineno, line = lineno + 1, next(f) | |
173 | mo = re.match(r'\s+{(\d+), "(\w+)", (\d+), (\d+), states_(\d+),$', line) | |
174 | assert mo, (lineno, line) | |
175 | symbol = mo.group(2) | |
176 | number, x, y, z = list(map(int, mo.group(1, 3, 4, 5))) | |
177 | assert self.symbol2number[symbol] == number, (lineno, line) | |
178 | assert self.number2symbol[number] == symbol, (lineno, line) | |
179 | assert x == 0, (lineno, line) | |
180 | state = states[z] | |
181 | assert y == len(state), (lineno, line) | |
182 | lineno, line = lineno + 1, next(f) | |
183 | mo = re.match(r'\s+("(?:\\\d\d\d)*")},$', line) | |
184 | assert mo, (lineno, line) | |
185 | first = {} | |
186 | rawbitset = eval(mo.group(1)) | |
187 | for i, c in enumerate(rawbitset): | |
188 | byte = ord(c) | |
189 | for j in range(8): | |
190 | if byte & (1 << j): | |
191 | first[i * 8 + j] = 1 | |
192 | dfas[number] = (state, first) | |
193 | lineno, line = lineno + 1, next(f) | |
194 | assert line == "};\n", (lineno, line) | |
195 | self.dfas = dfas | |
196 | ||
197 | # Parse the labels | |
198 | labels = [] | |
199 | lineno, line = lineno + 1, next(f) | |
200 | mo = re.match(r"static label labels\[(\d+)\] = {$", line) | |
201 | assert mo, (lineno, line) | |
202 | nlabels = int(mo.group(1)) | |
203 | for i in range(nlabels): | |
204 | lineno, line = lineno + 1, next(f) | |
205 | mo = re.match(r'\s+{(\d+), (0|"\w+")},$', line) | |
206 | assert mo, (lineno, line) | |
207 | x, y = mo.groups() | |
208 | x = int(x) | |
209 | if y == "0": | |
210 | y = None | |
211 | else: | |
212 | y = eval(y) | |
213 | labels.append((x, y)) | |
214 | lineno, line = lineno + 1, next(f) | |
215 | assert line == "};\n", (lineno, line) | |
216 | self.labels = labels | |
217 | ||
218 | # Parse the grammar struct | |
219 | lineno, line = lineno + 1, next(f) | |
220 | assert line == "grammar _PyParser_Grammar = {\n", (lineno, line) | |
221 | lineno, line = lineno + 1, next(f) | |
222 | mo = re.match(r"\s+(\d+),$", line) | |
223 | assert mo, (lineno, line) | |
224 | ndfas = int(mo.group(1)) | |
225 | assert ndfas == len(self.dfas) | |
226 | lineno, line = lineno + 1, next(f) | |
227 | assert line == "\tdfas,\n", (lineno, line) | |
228 | lineno, line = lineno + 1, next(f) | |
229 | mo = re.match(r"\s+{(\d+), labels},$", line) | |
230 | assert mo, (lineno, line) | |
231 | nlabels = int(mo.group(1)) | |
232 | assert nlabels == len(self.labels), (lineno, line) | |
233 | lineno, line = lineno + 1, next(f) | |
234 | mo = re.match(r"\s+(\d+)$", line) | |
235 | assert mo, (lineno, line) | |
236 | start = int(mo.group(1)) | |
237 | assert start in self.number2symbol, (lineno, line) | |
238 | self.start = start | |
239 | lineno, line = lineno + 1, next(f) | |
240 | assert line == "};\n", (lineno, line) | |
241 | try: | |
242 | lineno, line = lineno + 1, next(f) | |
243 | except StopIteration: | |
244 | pass | |
245 | else: | |
246 | assert 0, (lineno, line) | |
247 | ||
248 | def finish_off(self): | |
249 | """Create additional useful structures. (Internal).""" | |
250 | self.keywords = {} # map from keyword strings to arc labels | |
251 | self.tokens = {} # map from numeric token values to arc labels | |
252 | for ilabel, (type, value) in enumerate(self.labels): | |
253 | if type == token.NAME and value is not None: | |
254 | self.keywords[value] = ilabel | |
255 | elif value is None: | |
256 | self.tokens[type] = ilabel |