            last_ttext = ttext
        yield ttype, ttext, (slineno, scol), (elineno, ecol), ltext
        last_lineno = elineno


class MatchCaseFinder(ast.NodeVisitor):
    """Helper for finding match/case lines."""
    def __init__(self, source):
        # This will be the set of line numbers that start match or case statements.
        self.match_case_lines = set()
        self.visit(ast.parse(source))

    def visit_Match(self, node):
        """Invoked by ast.NodeVisitor.visit"""
        self.match_case_lines.add(node.lineno)
        for case in node.cases:
            self.match_case_lines.add(case.pattern.lineno)
        self.generic_visit(node)
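
# A rough usage sketch (illustrative only; match/case needs Python 3.10+):
#
#     finder = MatchCaseFinder("match cmd:\n    case 'go':\n        x = 1\n")
#     finder.match_case_lines        # {1, 2}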
|

@contract(source='unicode')
def source_token_lines(source):
    """Generate a series of lines, one for each line in `source`.

    Each line is a list of pairs, each pair is a token::
                mark_end = False
            elif ttype in ws_tokens:
                mark_end = False
            else:
                if mark_start and scol > col:
                    line.append(("ws", " " * (scol - col)))
                    mark_start = False
                tok_class = tokenize.tok_name.get(ttype, 'xx').lower()[:3]
                if ttype == token.NAME:
                    if keyword.iskeyword(ttext):
                        # Hard keywords are always keywords.
                        tok_class = "key"
                    elif env.PYBEHAVIOR.soft_keywords and keyword.issoftkeyword(ttext):
                        # Soft keywords appear at the start of the line, on lines that start
                        # match or case statements.
                        if len(line) == 0:
                            is_start_of_line = True
                        elif (len(line) == 1) and line[0][0] == "ws":
                            is_start_of_line = True
                        else:
                            is_start_of_line = False
                        if is_start_of_line and sline in match_case_lines:
                            tok_class = "key"
                line.append((tok_class, part))
                mark_end = True
            scol = 0
        if mark_end:
            col = ecol

    if line:
        yield line
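
# Illustrative behavior of the soft-keyword handling above (Python 3.10+):
# tokenizing "match x:\n    case 1:\n        pass\n" classifies the leading
# "match" and "case" tokens as "key", while in "match = 1\n" the name
# "match" stays an ordinary "nam" token.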


class CachedTokenizer:
    """A one-element cache around tokenize.generate_tokens.

    When reporting, coverage.py tokenizes files twice, once to find the
    structure of the file, and once to syntax-color it. Tokenizing is
    expensive, and easily cached.
    @contract(text='unicode')
    def generate_tokens(self, text):
        """A stand-in for `tokenize.generate_tokens`."""
        if text != self.last_text:
            self.last_text = text
            readline = iter(text.splitlines(True)).__next__
            self.last_tokens = list(tokenize.generate_tokens(readline))
        return self.last_tokens

# Create our generate_tokens cache as a callable replacement function.
generate_tokens = CachedTokenizer().generate_tokens
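
# Cache behavior sketch (illustrative only): repeated calls with the same text
# reuse the cached token list, so after
#
#     toks1 = generate_tokens(src)
#     toks2 = generate_tokens(src)
#
# the two results are the same list object, while a call with different text
# re-tokenizes and replaces the cached result.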


COOKIE_RE = re.compile(r"^[ \t]*#.*coding[:=][ \t]*([-\w.]+)", flags=re.MULTILINE)
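# For example, COOKIE_RE matches a PEP 263 cookie line such as
# "# -*- coding: utf-8 -*-", capturing "utf-8" as the encoding name.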

@contract(source='bytes')
def source_encoding(source):
    """Determine the encoding for `source`, according to PEP 263.

    `source` is a byte string: the text of the program.

    Returns a string, the name of the encoding.

    """
    readline = iter(source.splitlines(True)).__next__
    return tokenize.detect_encoding(readline)[0]
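
# Illustrative examples (these simply exercise tokenize.detect_encoding):
#
#     source_encoding(b"# -*- coding: iso-8859-1 -*-\nx = 1\n")   # 'iso-8859-1'
#     source_encoding(b"x = 1\n")                                 # 'utf-8', the default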


@contract(source='unicode')
def compile_unicode(source, filename, mode):
    """Just like the `compile` builtin, but works on any Unicode string.

    Python 2's compile() builtin has a stupid restriction: if the source string
    is Unicode, then it may not have an encoding declaration in it. Why not?
    Who knows! It also decodes to utf-8, and then tries to interpret those
    utf-8 bytes according to the encoding declaration. Why? Who knows!

    This function neuters the coding declaration, and compiles it.

    """
    source = neuter_encoding_declaration(source)
    code = compile(source, filename, mode)
    return code
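
# A minimal usage sketch (assuming the usual compile() semantics):
#
#     code = compile_unicode("# -*- coding: utf-8 -*-\nx = 1\n", "<src>", "exec")
#     exec(code)   # runs the compiled code; here it binds x to 1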


@contract(source='unicode', returns='unicode')