# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.

"""This module defines the data structures used to represent a grammar.

These are a bit arcane because they are derived from the data
structures used by Python's 'pgen' parser generator.

There's also a table here mapping operators to their names in the
token module; the Python tokenize module reports all operators as the
fallback token code OP, but the parser needs the actual token code.

"""

# Python imports
import os
import pickle
import tempfile
from typing import Any, Dict, List, Optional, Tuple, TypeVar, Union

# Local imports
from . import token

_P = TypeVar("_P", bound="Grammar")
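
# Shapes of the grammar tables, described in detail in the Grammar
# docstring below: a Label pairs a token or symbol number with an
# optional keyword string; a DFA is a list of states, each a list of
# (label, next-state) arcs; and DFAS is a (DFA, first-set dict) pair.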
Label = Tuple[int, Optional[str]]
DFA = List[List[Tuple[int, int]]]
DFAS = Tuple[DFA, Dict[int, int]]
Path = Union[str, "os.PathLike[str]"]


class Grammar:
    """Pgen parsing tables conversion class.

    Once initialized, this class supplies the grammar tables for the
    parsing engine implemented by parse.py. The parsing engine
    accesses the instance variables directly. The class here does not
    provide initialization of the tables; several subclasses exist to
    do this (see the conv and pgen modules).

    The load() method reads the tables from a pickle file, which is
    much faster than the other ways offered by subclasses. The pickle
    file is written by calling dump() (after loading the grammar
    tables using a subclass). The report() method prints a readable
    representation of the tables to stdout, for debugging.

    The instance variables are as follows:

    symbol2number -- a dict mapping symbol names to numbers. Symbol
                     numbers are always 256 or higher, to distinguish
                     them from token numbers, which are between 0 and
                     255 (inclusive).

    number2symbol -- a dict mapping numbers to symbol names;
                     these two are each other's inverse.

    states        -- a list of DFAs, where each DFA is a list of
                     states, each state is a list of arcs, and each
                     arc is an (i, j) pair where i is a label and j is
                     a state number. The DFA number is the index into
                     this list. (This name is slightly confusing.)
                     Final states are represented by a special arc of
                     the form (0, j) where j is its own state number.

    dfas          -- a dict mapping symbol numbers to (DFA, first)
                     pairs, where DFA is an item from the states list
                     above, and first is a set of tokens that can
                     begin this grammar rule (represented by a dict
                     whose values are always 1).

    labels        -- a list of (x, y) pairs where x is either a token
                     number or a symbol number, and y is either None
                     or a string; the strings are keywords. The label
                     number is the index in this list; label numbers
                     are used to mark state transitions (arcs) in the
                     DFAs.

    start         -- the number of the grammar's start symbol.

    keywords      -- a dict mapping keyword strings to arc labels.

    soft_keywords -- a dict mapping soft keyword strings to arc labels.

    tokens        -- a dict mapping token numbers to arc labels.

    symbol2label  -- a dict mapping symbol names to label numbers.

    """

    def __init__(self) -> None:
        self.symbol2number: Dict[str, int] = {}
        self.number2symbol: Dict[int, str] = {}
        self.states: List[DFA] = []
        self.dfas: Dict[int, DFAS] = {}
        self.labels: List[Label] = [(0, "EMPTY")]
        self.keywords: Dict[str, int] = {}
        self.soft_keywords: Dict[str, int] = {}
        self.tokens: Dict[int, int] = {}
        self.symbol2label: Dict[str, int] = {}
        self.version: Tuple[int, int] = (0, 0)
        self.start = 256
        # Python 3.7+ parses async as a keyword, not an identifier
        self.async_keywords = False

    def dump(self, filename: Path) -> None:
        """Dump the grammar tables to a pickle file."""

        # mypyc generates objects that don't have a __dict__, but they
        # do have __getstate__ methods that will return an equivalent
        # dictionary
        if hasattr(self, "__dict__"):
            d = self.__dict__
        else:
            d = self.__getstate__()  # type: ignore

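        # Write to a sibling temporary file and atomically replace the
        # target, so a concurrent reader never sees a partial pickle.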
        with tempfile.NamedTemporaryFile(
            dir=os.path.dirname(filename), delete=False
        ) as f:
            pickle.dump(d, f, pickle.HIGHEST_PROTOCOL)
        os.replace(f.name, filename)
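
    # Round-trip sketch (hypothetical filename; the tables are normally
    # built once by a pgen subclass, then cached for fast reloading):
    #   g.dump("Grammar.pickle")
    #   fresh = Grammar()
    #   fresh.load("Grammar.pickle")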

    def _update(self, attrs: Dict[str, Any]) -> None:
        for k, v in attrs.items():
            setattr(self, k, v)

    def load(self, filename: Path) -> None:
        """Load the grammar tables from a pickle file."""
        with open(filename, "rb") as f:
            d = pickle.load(f)
        self._update(d)

    def loads(self, pkl: bytes) -> None:
        """Load the grammar tables from a pickle bytes object."""
        self._update(pickle.loads(pkl))

    def copy(self: _P) -> _P:
        """
        Copy the grammar.
        """
        new = self.__class__()
        for dict_attr in (
            "symbol2number",
            "number2symbol",
            "dfas",
            "keywords",
            "soft_keywords",
            "tokens",
            "symbol2label",
        ):
            setattr(new, dict_attr, getattr(self, dict_attr).copy())
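        # The list attributes are copied shallowly; the nested state and
        # arc lists are assumed to be treated as read-only downstream.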
        new.labels = self.labels[:]
        new.states = self.states[:]
        new.start = self.start
        new.version = self.version
        new.async_keywords = self.async_keywords
        return new

    def report(self) -> None:
        """Dump the grammar tables to standard output, for debugging."""
        from pprint import pprint

        print("s2n")
        pprint(self.symbol2number)
        print("n2s")
        pprint(self.number2symbol)
        print("states")
        pprint(self.states)
        print("dfas")
        pprint(self.dfas)
        print("labels")
        pprint(self.labels)
        print("start", self.start)


# Map from operator to number (since tokenize doesn't do this)

opmap_raw = """
( LPAR
) RPAR
[ LSQB
] RSQB
: COLON
, COMMA
; SEMI
+ PLUS
- MINUS
* STAR
/ SLASH
| VBAR
& AMPER
< LESS
> GREATER
= EQUAL
. DOT
% PERCENT
` BACKQUOTE
{ LBRACE
} RBRACE
@ AT
@= ATEQUAL
== EQEQUAL
!= NOTEQUAL
<> NOTEQUAL
<= LESSEQUAL
>= GREATEREQUAL
~ TILDE
^ CIRCUMFLEX
<< LEFTSHIFT
>> RIGHTSHIFT
** DOUBLESTAR
+= PLUSEQUAL
-= MINEQUAL
*= STAREQUAL
/= SLASHEQUAL
%= PERCENTEQUAL
&= AMPEREQUAL
|= VBAREQUAL
^= CIRCUMFLEXEQUAL
<<= LEFTSHIFTEQUAL
>>= RIGHTSHIFTEQUAL
**= DOUBLESTAREQUAL
// DOUBLESLASH
//= DOUBLESLASHEQUAL
-> RARROW
:= COLONEQUAL
"""

opmap = {}
for line in opmap_raw.splitlines():
    if line:
        op, name = line.split()
        opmap[op] = getattr(token, name)
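
# Example: opmap["+="] is token.PLUSEQUAL, letting a driver replace the
# generic OP code reported by the tokenize module with the specific
# token code the parser's DFAs expect.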