# Licensed under the Apache License: http://www.apache.org/licenses/LICENSE-2.0
# For details: https://bitbucket.org/ned/coveragepy/src/default/NOTICE.txt

"""Better tokenizing for coverage.py."""

import codecs
import keyword
import re
import token
import tokenize

from coverage import env
from coverage.backward import iternext
from coverage.misc import contract


def phys_tokens(toks):
    """Return all physical tokens, even line continuations.

                    # Comments like this \
                    # should never result in a new token.
                    inject_backslash = False
                elif ttype == token.STRING:
                    if "\n" in ttext and ttext.split('\n', 1)[0][-1] == '\\':
                        # It's a multi-line string and the first line ends with
                        # a backslash, so we don't need to inject another.
                        inject_backslash = False
                if inject_backslash:
                    # Figure out what column the backslash is in.
                    ccol = len(last_line.split("\n")[-2]) - 1
                    # Yield the token, with a fake token type.
                    yield (
                        99999, "\\\n",
                        (slineno, ccol), (slineno, ccol+2),
                        last_line
                        )
        last_line = ltext
        last_ttype = ttype
        yield ttype, ttext, (slineno, scol), (elineno, ecol), ltext
        last_lineno = elineno
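

# A hedged illustration (not part of the original module): phys_tokens wraps a
# token stream so that backslash line continuations, which
# tokenize.generate_tokens() never reports, show up as synthetic tokens with a
# fake token type.  For example, tokenizing u"a = 1 + \\\n    2\n" and passing
# the result through phys_tokens should yield one extra token whose text is
# "\\\n", so the physical source can be reconstructed exactly.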


@contract(source='unicode')
def source_token_lines(source):
    """Generate a series of lines, one for each line in `source`.

    Each line is a list of pairs, each pair is a token::

        [('key', 'def'), ('ws', ' '), ('nam', 'hello'), ('op', '('), ... ]

    Each pair has a token class, and the token text.

    If you concatenate all the token texts, and then join them with newlines,
    you should have your original `source` back, with two differences:
    trailing whitespace is not preserved, and a final line with no newline
    is indistinguishable from a final line with a newline.

    """

    ws_tokens = set([token.INDENT, token.DEDENT, token.NEWLINE, tokenize.NL])
    line = []
    col = 0

    # The \f is because of http://bugs.python.org/issue19035
    source = source.expandtabs(8).replace('\r\n', '\n').replace('\f', ' ')
    tokgen = generate_tokens(source)

    for ttype, ttext, (_, scol), (_, ecol), _ in phys_tokens(tokgen):
        mark_start = True
        for part in re.split('(\n)', ttext):
            if part == '\n':
                yield line
                line = []
                col = 0
                mark_end = False
            elif part == '':
                mark_end = False
            elif ttype in ws_tokens:
                mark_end = False
            else:
                if mark_start and scol > col:
                    line.append(("ws", u" " * (scol - col)))
                    mark_start = False
                tok_class = tokenize.tok_name.get(ttype, 'xx').lower()[:3]
                if ttype == token.NAME and keyword.iskeyword(ttext):
                    tok_class = "key"
                line.append((tok_class, part))
                mark_end = True
            scol = 0
            if mark_end:
                col = ecol

    if line:
        yield line
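

# A hedged usage sketch (not from the original source). The token classes come
# from tokenize.tok_name truncated to three characters, with keywords mapped to
# "key" and whitespace runs to "ws":
#
#     for line in source_token_lines(u"if x:\n    y = 1\n"):
#         print(line)
#     # [('key', u'if'), ('ws', u' '), ('nam', u'x'), ('op', u':')]
#     # [('ws', u'    '), ('nam', u'y'), ('ws', u' '), ('op', u'='),
#     #  ('ws', u' '), ('num', u'1')]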


class CachedTokenizer(object):
    """A one-element cache around tokenize.generate_tokens.

    When reporting, coverage.py tokenizes files twice, once to find the
    structure of the file, and once to syntax-color it. Tokenizing is
    expensive, and easily cached.

    This is a one-element cache so that our twice-in-a-row tokenizing doesn't
    actually tokenize twice.

    """
    def __init__(self):
        self.last_text = None
        self.last_tokens = None

    @contract(text='unicode')
    def generate_tokens(self, text):
        """A stand-in for `tokenize.generate_tokens`."""
        if text != self.last_text:
            self.last_text = text
            readline = iternext(text.splitlines(True))
            self.last_tokens = list(tokenize.generate_tokens(readline))
        return self.last_tokens

# Create our generate_tokens cache as a callable replacement function.
generate_tokens = CachedTokenizer().generate_tokens
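
# A hedged note (not in the original file): because the cache is one element
# deep, two consecutive calls with the same unicode text return the identical
# cached list, so the second call skips tokenizing; for instance
# generate_tokens(src) is generate_tokens(src) would be expected to be True.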


COOKIE_RE = re.compile(r"^\s*#.*coding[:=]\s*([-\w.]+)", flags=re.MULTILINE)
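
# A hedged example (added here, not in the original): COOKIE_RE matches PEP 263
# coding declarations such as "# coding: utf-8" or "# -*- coding: latin-1 -*-",
# capturing the encoding name ("utf-8", "latin-1") in group 1.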

@contract(source='bytes')
def _source_encoding_py2(source):
    """Determine the encoding for `source`, according to PEP 263.

    `source` is a byte string, the text of the program.

    Returns a string, the name of the encoding.

    """
    assert isinstance(source, bytes)

    # Do this so the detect_encode code we copied will work.
    readline = iternext(source.splitlines(True))

    # This is mostly code adapted from Py3.2's tokenize module.

    def _get_normal_name(orig_enc):
        """Imitates get_normal_name in tokenizer.c."""
        # Only care about the first 12 characters.
        enc = orig_enc[:12].lower().replace("_", "-")
        if re.match(r"^utf-8($|-)", enc):
            return "utf-8"
        if re.match(r"^(latin-1|iso-8859-1|iso-latin-1)($|-)", enc):
            return "iso-8859-1"
        return orig_enc

    # From detect_encode():
    # It detects the encoding from the presence of a UTF-8 BOM or an encoding
    # cookie as specified in PEP-0263. If both a BOM and a cookie are present,
    # but disagree, a SyntaxError will be raised. If the encoding cookie is an
    # invalid charset, raise a SyntaxError. Note that if a UTF-8 BOM is found,
    # 'utf-8-sig' is returned.

    # If no encoding is specified, then the default will be returned.
    default = 'ascii'

    bom_found = False
    encoding = None

    def read_or_stop():
        """Get the next source line, or ''."""
        try:
            return readline()
        except StopIteration:
            return ''

    def find_cookie(line):
        """Find an encoding cookie in the line."""
        try:
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            return None

        matches = COOKIE_RE.findall(line_string)
        if not matches:
            return None
        encoding = _get_normal_name(matches[0])
        try:
            codec = codecs.lookup(encoding)
        except LookupError:
            # This behavior mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            # codecs in 2.3 were raw tuples of functions, assume the best.
            codec_name = getattr(codec, 'name', encoding)
            if codec_name != 'utf-8':
                # This behavior mimics the Python interpreter
                raise SyntaxError('encoding problem: utf-8')
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(codecs.BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default

    encoding = find_cookie(first)
    if encoding:
        return encoding

    second = read_or_stop()
    if not second:
        return default

    encoding = find_cookie(second)
    if encoding:
        return encoding

    return default


@contract(source='bytes')
def _source_encoding_py3(source):
    """Determine the encoding for `source`, according to PEP 263.

    `source` is a byte string: the text of the program.

    Returns a string, the name of the encoding.

    """
    readline = iternext(source.splitlines(True))
    return tokenize.detect_encoding(readline)[0]


if env.PY3:
    source_encoding = _source_encoding_py3
else:
    source_encoding = _source_encoding_py2
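
# A hedged usage sketch (not part of the original file): given the byte content
# of a file, source_encoding returns the declared or implied encoding, e.g.
# source_encoding(b"# coding: utf-8\nx = 1\n") should return "utf-8", and
# "utf-8-sig" if the bytes start with a UTF-8 BOM.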


@contract(source='unicode')
def compile_unicode(source, filename, mode):
    """Just like the `compile` builtin, but works on any Unicode string.

    Python 2's compile() builtin has a stupid restriction: if the source string
    is Unicode, then it may not have an encoding declaration in it. Why not?
    Who knows!

    This function catches that exception, neuters the coding declaration, and
    compiles it anyway.

    """
    try:
        code = compile(source, filename, mode)
    except SyntaxError as synerr:
        if "coding declaration in unicode string" not in synerr.args[0].lower():
            raise
        source = neuter_encoding_declaration(source)
        code = compile(source, filename, mode)

    return code
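
# A hedged usage sketch (not in the original source): on Python 2,
# compile(u"# coding: utf-8\nx = 1\n", "<src>", "exec") raises SyntaxError
# because of the coding declaration, while compile_unicode with the same
# arguments should return a code object, since the declaration is neutered
# before the retry.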


@contract(source='unicode', returns='unicode')
def neuter_encoding_declaration(source):
    """Return `source`, with any encoding declaration neutered.

    This function will only ever be called on `source` that has an encoding
    declaration, so some edge cases can be ignored.

    """
    source = COOKIE_RE.sub("# (deleted declaration)", source)
    return source
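
# A hedged example (not from the original module): neutering keeps the number
# of lines intact, e.g. neuter_encoding_declaration(u"# coding: utf-8\nx = 1\n")
# should return u"# (deleted declaration)\nx = 1\n".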