# Licensed under the Apache License: http://www.apache.org/licenses/LICENSE-2.0
# For details: https://github.com/nedbat/coveragepy/blob/master/NOTICE.txt

"""Better tokenizing for coverage.py."""

import ast
import keyword
import re
import token
import tokenize

from coverage import env
from coverage.misc import contract


def phys_tokens(toks):
    """Return all physical tokens, even line continuations.

    tokenize.generate_tokens() doesn't return a token for the backslash that
    continues lines. This wrapper provides those tokens so that we can
    re-create a faithful representation of the original source.

    Returns the same values as generate_tokens()

    """
    last_line = None
    last_lineno = -1
    last_ttext = None
    for ttype, ttext, (slineno, scol), (elineno, ecol), ltext in toks:
        if last_lineno != elineno:
            if last_line and last_line.endswith("\\\n"):
                # We are at the beginning of a new line, and the last line
                # ended with a backslash. We probably have to inject a
                # backslash token into the stream. Unfortunately, there's more
                # to figure out. This code::
                #
                #   usage = """\
                #       HEY THERE
                #       """
                #
                # triggers this condition, but the token text is::
                #
                #   '"""\\\nHEY THERE\n"""'
                #
                # so we need to figure out if the backslash is already in the
                # string token or not.
                inject_backslash = True
                if last_ttext.endswith("\\"):
                    inject_backslash = False
                elif ttype == token.STRING:
                    if "\n" in ttext and ttext.split('\n', 1)[0][-1] == '\\':
                        # It's a multi-line string and the first line ends with
                        # a backslash, so we don't need to inject another.
                        inject_backslash = False
                if inject_backslash:
                    # Figure out what column the backslash is in.
                    ccol = len(last_line.split("\n")[-2]) - 1
                    # Yield the token, with a fake token type.
                    yield (
                        99999, "\\\n",
                        (slineno, ccol), (slineno, ccol+2),
                        last_line
                    )
        last_line = ltext
        if ttype not in (tokenize.NEWLINE, tokenize.NL):
            last_ttext = ttext
        yield ttype, ttext, (slineno, scol), (elineno, ecol), ltext
        last_lineno = elineno


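# A hedged, illustrative sketch, not part of the original module: it shows the
# extra backslash token (the fake type 99999 above) that phys_tokens() injects
# for a line continued with a backslash. The helper name is purely for demonstration.
def _demo_phys_tokens():
    """Print physical tokens for a backslash-continued assignment."""
    demo_source = "x = 1 + \\\n    2\n"
    readline = iter(demo_source.splitlines(True)).__next__
    for ttype, ttext, start, end, _ in phys_tokens(tokenize.generate_tokens(readline)):
        print(ttype, repr(ttext), start, end)

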
class MatchCaseFinder(ast.NodeVisitor):
    """Helper for finding match/case lines."""
    def __init__(self, source):
        # This will be the set of line numbers that start match or case statements.
        self.match_case_lines = set()
        self.visit(ast.parse(source))

    def visit_Match(self, node):
        """Invoked by ast.NodeVisitor.visit"""
        self.match_case_lines.add(node.lineno)
        for case in node.cases:
            self.match_case_lines.add(case.pattern.lineno)
        self.generic_visit(node)


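# An illustrative sketch, not part of the original module, assuming Python 3.10+
# so that ast.parse() accepts match statements: MatchCaseFinder records the line
# of the `match` statement and of each `case` pattern.
def _demo_match_case_finder():
    """Print the line numbers collected for a small match/case snippet."""
    demo_source = (
        "match command:\n"
        "    case 'go':\n"
        "        move()\n"
        "    case _:\n"
        "        wait()\n"
    )
    # The snippet is only parsed, never executed, so move() and wait() need not exist.
    print(sorted(MatchCaseFinder(demo_source).match_case_lines))  # expected: [1, 2, 4]

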
@contract(source='unicode')
def source_token_lines(source):
    """Generate a series of lines, one for each line in `source`.

    Each line is a list of pairs, each pair is a token::

        [('key', 'def'), ('ws', ' '), ('nam', 'hello'), ('op', '('), ... ]

    Each pair has a token class, and the token text.

    If you concatenate all the token texts, and then join them with newlines,
    you should have your original `source` back, with two differences:
    trailing whitespace is not preserved, and a final line with no newline
    is indistinguishable from a final line with a newline.

    """

    ws_tokens = {token.INDENT, token.DEDENT, token.NEWLINE, tokenize.NL}
    line = []
    col = 0

    source = source.expandtabs(8).replace('\r\n', '\n')
    tokgen = generate_tokens(source)

    if env.PYBEHAVIOR.soft_keywords:
        match_case_lines = MatchCaseFinder(source).match_case_lines

    for ttype, ttext, (sline, scol), (_, ecol), _ in phys_tokens(tokgen):
        mark_start = True
        for part in re.split('(\n)', ttext):
            if part == '\n':
                yield line
                line = []
                col = 0
                mark_end = False
            elif part == '':
                mark_end = False
            elif ttype in ws_tokens:
                mark_end = False
            else:
                if mark_start and scol > col:
                    line.append(("ws", " " * (scol - col)))
                    mark_start = False
                tok_class = tokenize.tok_name.get(ttype, 'xx').lower()[:3]
                if ttype == token.NAME:
                    if keyword.iskeyword(ttext):
                        # Hard keywords are always keywords.
                        tok_class = "key"
                    elif env.PYBEHAVIOR.soft_keywords and keyword.issoftkeyword(ttext):
                        # Soft keywords appear at the start of the line, on lines that start
                        # match or case statements.
                        if len(line) == 0:
                            is_start_of_line = True
                        elif (len(line) == 1) and line[0][0] == "ws":
                            is_start_of_line = True
                        else:
                            is_start_of_line = False
                        if is_start_of_line and sline in match_case_lines:
                            tok_class = "key"
                line.append((tok_class, part))
                mark_end = True
            scol = 0
        if mark_end:
            col = ecol

    if line:
        yield line


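# A hedged usage sketch, not part of the original module: each yielded line is a
# list of (token class, token text) pairs, as described in the docstring above.
def _demo_source_token_lines():
    """Print the token-class pairs for a tiny two-line function."""
    demo_source = "def hello():\n    return 'hi'\n"
    for line in source_token_lines(demo_source):
        print(line)
    # Expected shape of the output:
    #   [('key', 'def'), ('ws', ' '), ('nam', 'hello'), ('op', '('), ('op', ')'), ('op', ':')]
    #   [('ws', '    '), ('key', 'return'), ('ws', ' '), ('str', "'hi'")]

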
class CachedTokenizer:
    """A one-element cache around tokenize.generate_tokens.

    When reporting, coverage.py tokenizes files twice, once to find the
    structure of the file, and once to syntax-color it. Tokenizing is
    expensive, and easily cached.

    This is a one-element cache so that our twice-in-a-row tokenizing doesn't
    actually tokenize twice.

    """
    def __init__(self):
        self.last_text = None
        self.last_tokens = None

    @contract(text='unicode')
    def generate_tokens(self, text):
        """A stand-in for `tokenize.generate_tokens`."""
        if text != self.last_text:
            self.last_text = text
            readline = iter(text.splitlines(True)).__next__
            self.last_tokens = list(tokenize.generate_tokens(readline))
        return self.last_tokens

# Create our generate_tokens cache as a callable replacement function.
generate_tokens = CachedTokenizer().generate_tokens


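# A brief sketch, not part of the original module, of the cache behavior:
# tokenizing the same text twice in a row reuses the cached token list.
def _demo_cached_tokenizer():
    """Show the one-element cache reusing and then evicting its entry."""
    text = "a = 1\n"
    first = generate_tokens(text)
    second = generate_tokens(text)
    print(first is second)                      # True: same cached list
    print(generate_tokens("b = 2\n") is first)  # False: new text replaces the entry

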
COOKIE_RE = re.compile(r"^[ \t]*#.*coding[:=][ \t]*([-\w.]+)", flags=re.MULTILINE)

@contract(source='bytes')
def source_encoding(source):
    """Determine the encoding for `source`, according to PEP 263.

    `source` is a byte string: the text of the program.

    Returns a string, the name of the encoding.

    """
    readline = iter(source.splitlines(True)).__next__
    return tokenize.detect_encoding(readline)[0]


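# A hedged example, not part of the original module: a PEP 263 coding cookie in
# the first two lines determines the result; without one, utf-8 is assumed.
def _demo_source_encoding():
    """Print the detected encodings for two small byte strings."""
    print(source_encoding(b"# -*- coding: latin-1 -*-\nx = 1\n"))  # 'iso-8859-1'
    print(source_encoding(b"x = 1\n"))                             # 'utf-8'

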
@contract(source='unicode')
def compile_unicode(source, filename, mode):
    """Just like the `compile` builtin, but works on any Unicode string.

    Python 2's compile() builtin has a stupid restriction: if the source string
    is Unicode, then it may not have an encoding declaration in it. Why not?
    Who knows! It also decodes to utf-8, and then tries to interpret those
    utf-8 bytes according to the encoding declaration. Why? Who knows!

    This function neuters the coding declaration, and compiles it.

    """
    source = neuter_encoding_declaration(source)
    code = compile(source, filename, mode)
    return code


@contract(source='unicode', returns='unicode')
def neuter_encoding_declaration(source):
    """Return `source`, with any encoding declaration neutered."""
    if COOKIE_RE.search(source):
        source_lines = source.splitlines(True)
        for lineno in range(min(2, len(source_lines))):
            source_lines[lineno] = COOKIE_RE.sub("# (deleted declaration)", source_lines[lineno])
        source = "".join(source_lines)
    return source
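

# An illustrative sketch, not part of the original module: the coding cookie on
# the first line is neutered so compile() never sees it, and compile_unicode()
# then compiles the source as usual.
def _demo_neuter_encoding_declaration():
    """Show a coding declaration being neutered before compilation."""
    src = "# coding: latin-1\nx = 'hi'\n"
    print(neuter_encoding_declaration(src).splitlines()[0])  # '# (deleted declaration)'
    code = compile_unicode(src, "<demo>", "exec")
    print(code.co_filename)                                  # '<demo>'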