"""
Simple formatting on strings. Further string formatting code is in trans.py.
"""

import re
import sys
from functools import lru_cache
from typing import Final, List, Match, Pattern

# Project-local imports.
from black._width_table import WIDTH_TABLE
from blib2to3.pytree import Leaf
STRING_PREFIX_CHARS: Final = "furbFURB"  # All possible string prefix characters.
# Splits a string literal into (prefix, rest); DOTALL so multi-line literals match.
STRING_PREFIX_RE: Final = re.compile(
    r"^([" + STRING_PREFIX_CHARS + r"]*)(.*)$", re.DOTALL
)
# Matches leading whitespace containing at least one tab and captures the
# first non-whitespace character that follows it.
FIRST_NON_WHITESPACE_RE: Final = re.compile(r"\s*\t+\s*(\S)")
# Matches backslash escape sequences whose hex digits / names should be
# case-normalized. The run of backslashes is captured so callers can tell
# real escapes (odd count) from escaped backslashes (even count).
UNICODE_ESCAPE_RE: Final = re.compile(
    r"(?P<backslashes>\\+)(?P<body>"
    r"(u(?P<u>[a-fA-F0-9]{4}))"  # Character with 16-bit hex value xxxx
    r"|(U(?P<U>[a-fA-F0-9]{8}))"  # Character with 32-bit hex value xxxxxxxx
    r"|(x(?P<x>[a-fA-F0-9]{2}))"  # Character with hex value hh
    r"|(N\{(?P<N>[a-zA-Z0-9 \-]{2,})\})"  # Character named name in the Unicode database
    r")",
    re.VERBOSE,
)
27 | ||
28 | ||
def sub_twice(regex: Pattern[str], replacement: str, original: str) -> str:
    """Apply ``regex.sub(replacement, ...)`` to `original` two times.

    String normalization uses this to catch overlapping matches that a
    single substitution pass would miss.
    """
    first_pass = regex.sub(replacement, original)
    return regex.sub(replacement, first_pass)
36 | ||
37 | ||
def has_triple_quotes(string: str) -> bool:
    """
    Returns:
        True iff @string starts with three quotation characters.
    """
    # Skip any string prefix characters (f/r/b/u, either case) first.
    unprefixed = string.lstrip(STRING_PREFIX_CHARS)
    return unprefixed.startswith(('"""', "'''"))
45 | ||
46 | ||
def lines_with_leading_tabs_expanded(s: str) -> List[str]:
    """
    Splits string into lines and expands only leading tabs (following the normal
    Python rules)
    """
    result: List[str] = []
    for raw_line in s.splitlines():
        # Locate the first non-whitespace character that follows a whitespace
        # run containing at least one tab; only lines like that need work.
        match = FIRST_NON_WHITESPACE_RE.match(raw_line)
        if match is None:
            result.append(raw_line)
            continue

        # Expand tabs in the leading whitespace only; the remainder of the
        # line (which may contain interior tabs) is kept verbatim.
        split_at = match.start(1)
        result.append(raw_line[:split_at].expandtabs() + raw_line[split_at:])
    return result
67 | ||
68 | ||
def fix_docstring(docstring: str, prefix: str) -> str:
    """Dedent `docstring` and re-indent its continuation lines with `prefix`.

    Follows the PEP 257 algorithm:
    https://www.python.org/dev/peps/pep-0257/#handling-docstring-indentation
    """
    if not docstring:
        return ""
    lines = lines_with_leading_tabs_expanded(docstring)
    # Minimum indentation across continuation lines; the first line is
    # special-cased by PEP 257 and never counts.
    common_indent = sys.maxsize
    for line in lines[1:]:
        body = line.lstrip()
        if body:
            common_indent = min(common_indent, len(line) - len(body))
    trimmed = [lines[0].strip()]
    if common_indent < sys.maxsize:
        # Index of the final line within the lines[1:] enumeration below.
        last = len(lines) - 2
        for i, line in enumerate(lines[1:]):
            dedented = line[common_indent:].rstrip()
            # Blank interior lines collapse to ""; the very last line keeps
            # its prefix even when empty.
            trimmed.append(prefix + dedented if dedented or i == last else "")
    return "\n".join(trimmed)
91 | ||
92 | ||
def get_string_prefix(string: str) -> str:
    """
    Pre-conditions:
        * assert_is_leaf_string(@string)

    Returns:
        @string's prefix (e.g. '', 'r', 'f', or 'rf').
    """
    assert_is_leaf_string(string)

    # Scan forward until the first quote character; everything before it is
    # the prefix. The pre-condition guarantees a quote exists.
    end = 0
    while string[end] in STRING_PREFIX_CHARS:
        end += 1

    return string[:end]
110 | ||
111 | ||
def assert_is_leaf_string(string: str) -> None:
    """
    Checks the pre-condition that @string has the format that you would expect
    of `leaf.value` where `leaf` is some Leaf such that `leaf.type ==
    token.STRING`. A more precise description of the pre-conditions that are
    checked are listed below.

    Pre-conditions:
        * @string starts with either ', ", <prefix>', or <prefix>" where
          `set(<prefix>)` is some subset of `set(STRING_PREFIX_CHARS)`.
        * @string ends with a quote character (' or ").

    Raises:
        AssertionError(...) if the pre-conditions listed above are not
        satisfied.
    """
    # str.find returns -1 on a miss; keep only real hits and take the
    # earliest one (or -1 when neither quote character is present).
    hits = [idx for idx in (string.find('"'), string.find("'")) if idx != -1]
    quote_idx = min(hits) if hits else -1

    assert (
        0 <= quote_idx < len(string) - 1
    ), f"{string!r} is missing a starting quote character (' or \")."
    assert string[-1] in (
        "'",
        '"',
    ), f"{string!r} is missing an ending quote character (' or \")."
    assert set(string[:quote_idx]).issubset(
        set(STRING_PREFIX_CHARS)
    ), f"{set(string[:quote_idx])} is NOT a subset of {set(STRING_PREFIX_CHARS)}."
145 | ||
146 | ||
def normalize_string_prefix(s: str) -> str:
    """Make all string prefixes lowercase."""
    match = STRING_PREFIX_RE.match(s)
    assert match is not None, f"failed to match string {s!r}"
    orig_prefix = match.group(1)
    # Lowercase f/b and drop the redundant u prefix entirely.
    new_prefix = orig_prefix.translate(
        str.maketrans({"F": "f", "B": "b", "U": None, "u": None})
    )

    # Python syntax guarantees max 2 prefixes and that one of them is "r";
    # keep "r" in the leading position.
    if len(new_prefix) == 2 and new_prefix[0].lower() != "r":
        new_prefix = new_prefix[::-1]
    return f"{new_prefix}{match.group(2)}"
163 | ||
164 | ||
165 | # Re(gex) does actually cache patterns internally but this still improves | |
166 | # performance on a long list literal of strings by 5-9% since lru_cache's | |
167 | # caching overhead is much lower. | |
168 | @lru_cache(maxsize=64) | |
169 | def _cached_compile(pattern: str) -> Pattern[str]: | |
170 | return re.compile(pattern) | |
171 | ||
172 | ||
def normalize_string_quotes(s: str) -> str:
    """Prefer double quotes but only if it doesn't cause more escaping.

    Adds or removes backslashes as appropriate. Doesn't parse and fix
    strings nested in f-strings.
    """
    # Determine the original and target quote style from the first quote
    # after any prefix characters.
    value = s.lstrip(STRING_PREFIX_CHARS)
    if value[:3] == '"""':
        return s

    elif value[:3] == "'''":
        orig_quote = "'''"
        new_quote = '"""'
    elif value[0] == '"':
        orig_quote = '"'
        new_quote = "'"
    else:
        orig_quote = "'"
        new_quote = '"'
    first_quote_pos = s.find(orig_quote)
    if first_quote_pos == -1:
        return s  # There's an internal error

    prefix = s[:first_quote_pos]
    # Patterns over the string body: a target quote preceded by an even
    # number of backslashes (i.e. not escaped), and escaped occurrences of
    # the target/original quotes.
    unescaped_new_quote = _cached_compile(rf"(([^\\]|^)(\\\\)*){new_quote}")
    escaped_new_quote = _cached_compile(rf"([^\\]|^)\\((?:\\\\)*){new_quote}")
    escaped_orig_quote = _cached_compile(rf"([^\\]|^)\\((?:\\\\)*){orig_quote}")
    body = s[first_quote_pos + len(orig_quote) : -len(orig_quote)]
    if "r" in prefix.casefold():
        if unescaped_new_quote.search(body):
            # There's at least one unescaped new_quote in this raw string
            # so converting is impossible
            return s

        # Do not introduce or remove backslashes in raw strings
        new_body = body
    else:
        # remove unnecessary escapes
        new_body = sub_twice(escaped_new_quote, rf"\1\2{new_quote}", body)
        if body != new_body:
            # Consider the string without unnecessary escapes as the original
            body = new_body
            s = f"{prefix}{orig_quote}{body}{orig_quote}"
        new_body = sub_twice(escaped_orig_quote, rf"\1\2{orig_quote}", new_body)
        new_body = sub_twice(unescaped_new_quote, rf"\1\\{new_quote}", new_body)
    if "f" in prefix.casefold():
        # Collect the {...} interpolation bodies of the f-string.
        matches = re.findall(
            r"""
            (?:(?<!\{)|^)\{ # start of the string or a non-{ followed by a single {
                ([^{].*?) # contents of the brackets except if begins with {{
            \}(?:(?!\})|$) # A } followed by end of the string or a non-}
            """,
            new_body,
            re.VERBOSE,
        )
        for m in matches:
            if "\\" in str(m):
                # Do not introduce backslashes in interpolated expressions
                return s

    if new_quote == '"""' and new_body[-1:] == '"':
        # edge case: a trailing bare quote would fuse with the closing
        # triple-quote delimiter, so escape it.
        new_body = new_body[:-1] + '\\"'
    orig_escape_count = body.count("\\")
    new_escape_count = new_body.count("\\")
    if new_escape_count > orig_escape_count:
        return s  # Do not introduce more escaping

    if new_escape_count == orig_escape_count and orig_quote == '"':
        return s  # Prefer double quotes

    return f"{prefix}{new_quote}{new_body}{new_quote}"
245 | ||
246 | ||
def normalize_unicode_escape_sequences(leaf: Leaf) -> None:
    """Replace hex codes in Unicode escape sequences with lowercase representation."""
    text = leaf.value
    if "r" in get_string_prefix(text).lower():
        # Raw strings contain no escape sequences; nothing to normalize.
        return

    def lowercase_escape(m: Match[str]) -> str:
        groups = m.groupdict()
        back_slashes = groups["backslashes"]

        # An even number of backslashes means the backslash itself is
        # escaped, so what follows is literal text — leave it alone.
        if len(back_slashes) % 2 == 0:
            return back_slashes + groups["body"]

        # \u / \U / \x escapes: hex digits are normalized to lowercase.
        for escape_char in ("u", "U", "x"):
            if groups[escape_char]:
                return back_slashes + escape_char + groups[escape_char].lower()

        assert groups["N"], f"Unexpected match: {m}"
        # \N{...}: Unicode character names are canonically uppercase.
        return back_slashes + "N{" + groups["N"].upper() + "}"

    leaf.value = re.sub(UNICODE_ESCAPE_RE, lowercase_escape, text)
276 | ||
277 | ||
@lru_cache(maxsize=4096)
def char_width(char: str) -> int:
    """Return the display width of a single character in a terminal or
    editor that respects Unicode East Asian Width.

    Full width characters count as 2, half width characters as 1, and
    control characters as 0.
    """
    codepoint = ord(char)
    # Binary search over the sorted (start, end, width) codepoint ranges.
    lo = 0
    hi = len(WIDTH_TABLE) - 1
    while lo <= hi:
        mid = (lo + hi) // 2
        start_codepoint, end_codepoint, width = WIDTH_TABLE[mid]
        if codepoint < start_codepoint:
            hi = mid - 1
        elif codepoint > end_codepoint:
            lo = mid + 1
        else:
            # Negative widths mark control characters; report them as 0.
            return max(width, 0)
    # Codepoints absent from the table default to half width.
    return 1
303 | ||
304 | ||
def str_width(line_str: str) -> int:
    """Return the width of `line_str` as it would be displayed in a terminal
    or editor (which respects Unicode East Asian Width).

    You could utilize this function to determine, for example, if a string
    is too wide to display in a terminal or editor.
    """
    if line_str.isascii():
        # Fast path: every ASCII character is exactly one column wide.
        return len(line_str)
    return sum(char_width(char) for char in line_str)
316 | ||
317 | ||
def count_chars_in_width(line_str: str, max_width: int) -> int:
    """Count the number of characters in `line_str` that would fit in a
    terminal or editor of `max_width` (which respects Unicode East Asian
    Width).
    """
    used = 0
    for idx, char in enumerate(line_str):
        next_width = char_width(char)
        if used + next_width > max_width:
            # This character would overflow the budget; it is the first one
            # that does not fit.
            return idx
        used += next_width
    return len(line_str)