]>
Commit | Line | Data |
---|---|---|
1 | import re | |
2 | import sys | |
3 | from ast import literal_eval | |
4 | from functools import total_ordering | |
5 | from typing import NamedTuple, Sequence, Union | |
6 | ||
# The following characters are treated as line breaks by str.splitlines, but
# not in Python source code. In Python only \r (Carriage Return, 0xD) and
# \n (Line Feed, 0xA) are allowed to split lines.
_NON_LINE_BREAKS = (
    '\v',  # Vertical Tabulation 0xB
    '\f',  # Form Feed 0xC
    '\x1C',  # File Separator
    '\x1D',  # Group Separator
    '\x1E',  # Record Separator
    '\x85',  # Next Line (NEL - Equivalent to CR+LF.
             # Used to mark end-of-line on some IBM mainframes.)
    '\u2028',  # Line Separator
    '\u2029',  # Paragraph Separator
)
21 | ||
22 | ||
class Version(NamedTuple):
    """
    Three-part version number (``major``, ``minor``, ``micro``) stored as a
    named tuple, so it can be unpacked and compared like a plain tuple.
    """
    # First component of the version number.
    major: int
    # Second component of the version number.
    minor: int
    # Third component of the version number.
    micro: int
27 | ||
28 | ||
def split_lines(string: str, keepends: bool = False) -> Sequence[str]:
    r"""
    Split Python source text into lines.

    Unlike :py:meth:`str.splitlines`, only ``\n``, ``\r\n`` and ``\r`` count
    as line separators; form feeds and the other characters listed in
    ``_NON_LINE_BREAKS`` are treated as ordinary text.

    A further difference: an empty input yields ``[""]``, and an input that
    ends with a line separator yields a trailing ``""`` entry.

    :param string: The text to split.
    :param keepends: If true, every returned line keeps its line separator.
    """
    if not keepends:
        return re.split(r'\n|\r\n|\r', string)

    pieces = string.splitlines(True)

    # str.splitlines also breaks after form feeds and friends. Remember every
    # piece that ends with such a character so it can be glued to its
    # successor again.
    glue_points = [
        pos for pos, piece in enumerate(pieces)
        if piece and piece[-1] in _NON_LINE_BREAKS
    ]

    # Walk backwards so that earlier indices stay valid while pieces are
    # removed from the list.
    for pos in reversed(glue_points):
        follower = pos + 1
        # The very last piece has no successor, so there is nothing to merge.
        if follower < len(pieces):
            pieces[pos] += pieces[follower]
            del pieces[follower]

    # The stdlib is inconsistent about a trailing empty string depending on
    # keepends; always add one here when the text ends with a line break (or
    # is empty) so both modes agree.
    if not string or string.endswith(('\n', '\r')):
        pieces.append('')
    return pieces
71 | ||
72 | ||
def python_bytes_to_unicode(
    source: Union[str, bytes], encoding: str = 'utf-8', errors: str = 'strict'
) -> str:
    """
    Decodes Python source given as bytes, honoring a UTF-8 BOM and PEP 263
    encoding declarations. Input that already is a ``str`` is returned
    unchanged.

    :param source: The source code, either as ``bytes`` or as ``str``.
    :param encoding: Fallback encoding used when neither a BOM nor an
        encoding declaration is found. See :py:meth:`bytes.decode`.
    :param errors: See :py:meth:`bytes.decode` documentation. ``errors`` can be
        ``'strict'``, ``'replace'`` or ``'ignore'``.
    """
    if isinstance(source, str):
        # Nothing to decode, only bytes are converted.
        return source

    def sniff_encoding():
        """
        Determine the encoding of ``source``. For the rules, look at:
        - http://www.python.org/dev/peps/pep-0263/
        - http://docs.python.org/2/reference/lexical_analysis.html#encoding-declarations
        """
        if source.startswith(b'\xef\xbb\xbf'):
            # UTF-8 byte-order mark
            return 'utf-8'

        # An encoding declaration is only looked for in the first two lines.
        head = re.match(br'(?:[^\r\n]*(?:\r\n|\r|\n)){0,2}', source).group(0)
        declaration = re.search(br"coding[=:]\s*([-\w.]+)", head)
        if declaration is None:
            # No declaration -> use the caller supplied default (PEP 263).
            return encoding

        name = declaration.group(1)
        return name if isinstance(name, str) else str(name, 'ascii', 'replace')

    chosen = sniff_encoding()
    try:
        return str(source, chosen, errors)
    except LookupError:
        if errors != 'replace':
            raise
        # The declared encoding does not exist, e.g. `# coding: foo-8`. With
        # errors='replace' the caller asked for a best-effort result, so fall
        # back to UTF-8 instead of failing.
        return str(source, 'utf-8', errors)
123 | ||
124 | ||
def version_info() -> Version:
    """
    Returns parso's own version as a :class:`Version` namedtuple, similar to
    Python's ``sys.version_info``.
    """
    from parso import __version__

    # Break the version string into runs of digits and runs of lowercase
    # letters.
    components = re.findall(r'[a-z]+|\d+', __version__)
    converted = []
    for position, component in enumerate(components):
        # Only the component at index 3 stays a string, all others become
        # integers.
        converted.append(component if position == 3 else int(component))
    return Version(*converted)
133 | ||
134 | ||
class _PythonVersionInfo(NamedTuple):
    # Plain (major, minor) storage for PythonVersionInfo.
    major: int
    minor: int


@total_ordering
class PythonVersionInfo(_PythonVersionInfo):
    """
    A ``(major, minor)`` Python version that can be compared with tuples.

    ``==``, ``!=`` and ``>`` insist on tuples of exactly two elements and raise
    a ``ValueError`` for tuples of any other length. Comparing with something
    that is not a tuple is reported as unsupported (``NotImplemented``), so
    ``==`` is ``False``, ``!=`` is ``True`` and ``>`` raises ``TypeError``.

    ``<``, ``<=`` and ``>=`` are the regular tuple comparisons: ``tuple``
    already defines them, so ``total_ordering`` does not replace them.
    """

    def __gt__(self, other):
        if not isinstance(other, tuple):
            # Let Python try the reflected operation / raise TypeError.
            return NotImplemented
        if len(other) != 2:
            raise ValueError("Can only compare to tuples of length 2.")
        return (self.major, self.minor) > other

    def __eq__(self, other):
        if not isinstance(other, tuple):
            # Let Python fall back to its default, which yields False.
            return NotImplemented
        if len(other) != 2:
            raise ValueError("Can only compare to tuples of length 2.")
        return (self.major, self.minor) == other

    def __ne__(self, other):
        outcome = self.__eq__(other)
        # NotImplemented must be passed through untouched; negating it would
        # be wrong (and is an error on recent Python versions).
        if outcome is NotImplemented:
            return NotImplemented
        return not outcome

    def __hash__(self):
        # Defining __eq__ removes the inherited __hash__. Restore it and keep
        # it consistent with __eq__: equal to the hash of the plain tuple.
        return hash((self.major, self.minor))
160 | ||
161 | ||
def _parse_version(version) -> PythonVersionInfo:
    """
    Turns a version string such as ``"3.8"``, ``"3.10.1"`` or ``"3"`` into a
    :class:`PythonVersionInfo`.

    A micro version and a pre-release suffix (``a``, ``b`` or ``rc`` followed
    by one digit) are accepted and ignored.

    :raises ValueError: If the string does not look like a version.
    :raises NotImplementedError: If only a major version is given and it is
        neither 2 nor 3.
    """
    parsed = re.match(r'(\d+)(?:\.(\d{1,2})(?:\.\d+)?)?((a|b|rc)\d)?$', version)
    if parsed is None:
        raise ValueError('The given version is not in the right format. '
                         'Use something like "3.8" or "3".')

    major_text, minor_text = parsed.group(1, 2)
    major = int(major_text)
    if minor_text is None:
        # Only a major version was given: pick a fixed minor version for it,
        # because the grammars are typically backwards compatible.
        default_minors = {2: "7", 3: "6"}
        if major not in default_minors:
            raise NotImplementedError("Sorry, no support yet for those fancy new/old versions.")
        minor_text = default_minors[major]
    return PythonVersionInfo(major, int(minor_text))
181 | ||
182 | ||
def parse_version_string(version: Union[str, None] = None) -> PythonVersionInfo:
    """
    Checks for a valid version number (e.g. `3.8` or `3.10.1` or `3`) and
    returns the corresponding version info, which always consists of exactly
    two parts: major and minor.

    :param version: The version string. If ``None``, the major and minor
        version of the running interpreter (``sys.version_info``) are used.
    :raises TypeError: If ``version`` is neither ``None`` nor a string.
    :raises ValueError: If the string is not in a supported format.
    :raises NotImplementedError: If only a major version other than 2 or 3 is
        given.
    """
    if version is None:
        # Default to the interpreter that is currently running.
        version = '%s.%s' % sys.version_info[:2]
    if not isinstance(version, str):
        raise TypeError('version must be a string like "3.8"')

    return _parse_version(version)