DebugClients/Python/coverage/phystokens.py

changeset 5141:bc64243b7672
parents 5126:d28b92dabc2b, 5140:01484c0afbc6
child 5144:1ab536d25072
# Licensed under the Apache License: http://www.apache.org/licenses/LICENSE-2.0
# For details: https://bitbucket.org/ned/coveragepy/src/default/NOTICE.txt

"""Better tokenizing for coverage.py."""

import codecs
import keyword
import re
import sys
import token
import tokenize

from coverage import env
from coverage.backward import iternext
from coverage.misc import contract


def phys_tokens(toks):
    """Return all physical tokens, even line continuations.

    tokenize.generate_tokens() doesn't return a token for the backslash that
    continues lines.  This wrapper provides those tokens so that we can
    re-create a faithful representation of the original source.

    Returns the same values as generate_tokens().

    """
    last_line = None
    last_lineno = -1
    last_ttype = None
    for ttype, ttext, (slineno, scol), (elineno, ecol), ltext in toks:
        if last_lineno != elineno:
            if last_line and last_line.endswith("\\\n"):
                # We are at the beginning of a new line, and the last line
                # ended with a backslash.  We probably have to inject a
                # backslash token into the stream.  Unfortunately, there's
                # more to figure out.  This code::
                #
                #   usage = """\
                #   HEY THERE
                #   """
                #
                # triggers this condition, but the token text is::
                #
                #   '"""\\\nHEY THERE\n"""'
                #
                # so we need to figure out if the backslash is already in the
                # string token or not.
                inject_backslash = True
                if last_ttype == tokenize.COMMENT:
                    # Comments like this \
                    # should never result in a new token.
                    inject_backslash = False
                elif ttype == token.STRING:
                    if "\n" in ttext and ttext.split('\n', 1)[0][-1] == '\\':
                        # It's a multi-line string and the first line ends
                        # with a backslash, so we don't need to inject
                        # another.
                        inject_backslash = False
                if inject_backslash:
                    # Figure out what column the backslash is in.
                    ccol = len(last_line.split("\n")[-2]) - 1
                    # Yield the token, with a fake token type.
                    yield (
                        99999, "\\\n",
                        (slineno, ccol), (slineno, ccol+2),
                        last_line
                    )
        last_line = ltext
        last_ttype = ttype
        yield ttype, ttext, (slineno, scol), (elineno, ecol), ltext
        last_lineno = elineno

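
# A minimal usage sketch (illustrative, not part of the upstream module):
# tokenizing a backslash-continued line shows the synthetic "\\\n" token,
# with its fake type 99999, that phys_tokens() injects and that
# tokenize.generate_tokens() omits.
def _demo_phys_tokens():
    """Print the physical tokens of a backslash-continued statement."""
    src = u"a = 1 + \\\n    2\n"
    readline = iternext(src.splitlines(True))
    for tok in phys_tokens(tokenize.generate_tokens(readline)):
        print(tok)
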

@contract(source='unicode')
def source_token_lines(source):
    """Generate a series of lines, one for each line in `source`.

    Each line is a list of pairs, each pair is a token::

        [('key', 'def'), ('ws', ' '), ('nam', 'hello'), ('op', '('), ... ]

    Each pair has a token class, and the token text.

    If you concatenate all the token texts, and then join them with newlines,
    you should have your original `source` back, with two differences:
    trailing whitespace is not preserved, and a final line with no newline
    is indistinguishable from a final line with a newline.

    """

    ws_tokens = set([token.INDENT, token.DEDENT, token.NEWLINE, tokenize.NL])
    line = []
    col = 0

    source = source.expandtabs(8).replace('\r\n', '\n')
    tokgen = generate_tokens(source)

    for ttype, ttext, (_, scol), (_, ecol), _ in phys_tokens(tokgen):
        mark_start = True
        for part in re.split('(\n)', ttext):
            if part == '\n':
                yield line
                line = []
                col = 0
                mark_end = False
            elif part == '':
                mark_end = False
            elif ttype in ws_tokens:
                mark_end = False
            else:
                if mark_start and scol > col:
                    line.append(("ws", u" " * (scol - col)))
                    mark_start = False
                tok_class = tokenize.tok_name.get(ttype, 'xx').lower()[:3]
                if ttype == token.NAME and keyword.iskeyword(ttext):
                    tok_class = "key"
                line.append((tok_class, part))
                mark_end = True
            scol = 0
        if mark_end:
            col = ecol

    if line:
        yield line

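
# A minimal sketch (illustrative, not part of the upstream module): the
# token lines for a tiny function.  The first yielded line looks roughly
# like [('key', u'def'), ('ws', u' '), ('nam', u'hello'), ('op', u'('), ...].
def _demo_source_token_lines():
    """Print each tokenized line of a small source snippet."""
    src = u"def hello():\n    return 'hi'\n"
    for line in source_token_lines(src):
        print(line)
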

class CachedTokenizer(object):
    """A one-element cache around tokenize.generate_tokens.

    When reporting, coverage.py tokenizes files twice, once to find the
    structure of the file, and once to syntax-color it.  Tokenizing is
    expensive, and easily cached.

    This is a one-element cache so that our twice-in-a-row tokenizing doesn't
    actually tokenize twice.

    """
    def __init__(self):
        self.last_text = None
        self.last_tokens = None

    @contract(text='unicode')
    def generate_tokens(self, text):
        """A stand-in for `tokenize.generate_tokens`."""
        if text != self.last_text:
            self.last_text = text
            readline = iternext(text.splitlines(True))
            self.last_tokens = list(tokenize.generate_tokens(readline))
        return self.last_tokens

# Create our generate_tokens cache as a callable replacement function.
generate_tokens = CachedTokenizer().generate_tokens

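
# A minimal sketch (illustrative, not part of the upstream module): two
# back-to-back calls with the same text hit the one-element cache and
# return the very same token list object.
def _demo_cached_tokenizer():
    """Show that tokenizing the same text twice only tokenizes once."""
    text = u"x = 1\n"
    assert generate_tokens(text) is generate_tokens(text)
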
COOKIE_RE = re.compile(r"^[ \t]*#.*coding[:=][ \t]*([-\w.]+)", flags=re.MULTILINE)

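
# A quick illustration (not part of the upstream module): COOKIE_RE picks
# the encoding name out of a PEP 263 declaration line.
def _demo_cookie_re():
    """Show COOKIE_RE extracting an encoding name."""
    assert COOKIE_RE.findall(u"# -*- coding: iso8859-1 -*-") == [u"iso8859-1"]
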
@contract(source='bytes')
def _source_encoding_py2(source):
    """Determine the encoding for `source`, according to PEP 263.

    `source` is a byte string, the text of the program.

    Returns a string, the name of the encoding.

    """
    assert isinstance(source, bytes)

    # Do this so the detect_encoding code we copied will work.
    readline = iternext(source.splitlines(True))

    # This is mostly code adapted from Py3.2's tokenize module.

    def _get_normal_name(orig_enc):
        """Imitates get_normal_name in tokenizer.c."""
        # Only care about the first 12 characters.
        enc = orig_enc[:12].lower().replace("_", "-")
        if re.match(r"^utf-8($|-)", enc):
            return "utf-8"
        if re.match(r"^(latin-1|iso-8859-1|iso-latin-1)($|-)", enc):
            return "iso-8859-1"
        return orig_enc

    # From detect_encoding():
    # It detects the encoding from the presence of a UTF-8 BOM or an encoding
    # cookie as specified in PEP 263.  If both a BOM and a cookie are present,
    # but disagree, a SyntaxError will be raised.  If the encoding cookie is
    # an invalid charset, raise a SyntaxError.  Note that if a UTF-8 BOM is
    # found, 'utf-8-sig' is returned.

    # If no encoding is specified, then the default will be returned.
    default = 'ascii'

    bom_found = False
    encoding = None

    def read_or_stop():
        """Get the next source line, or ''."""
        try:
            return readline()
        except StopIteration:
            return ''

    def find_cookie(line):
        """Find an encoding cookie in `line`."""
        try:
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            return None

        matches = COOKIE_RE.findall(line_string)
        if not matches:
            return None
        encoding = _get_normal_name(matches[0])
        try:
            codec = codecs.lookup(encoding)
        except LookupError:
            # This behavior mimics the Python interpreter.
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            # codecs in 2.3 were raw tuples of functions, assume the best.
            codec_name = getattr(codec, 'name', encoding)
            if codec_name != 'utf-8':
                # This behavior mimics the Python interpreter.
                raise SyntaxError('encoding problem: utf-8')
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(codecs.BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default

    encoding = find_cookie(first)
    if encoding:
        return encoding

    second = read_or_stop()
    if not second:
        return default

    encoding = find_cookie(second)
    if encoding:
        return encoding

    return default

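
# A minimal sketch (illustrative, not part of the upstream module; callable
# on Python 2 only): a coding cookie on the first line wins over the
# 'ascii' default.
def _demo_source_encoding_py2():
    """Show cookie-based encoding detection on a byte string."""
    assert _source_encoding_py2(b"# coding: utf-8\nx = 1\n") == "utf-8"
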

@contract(source='bytes')
def _source_encoding_py3(source):
    """Determine the encoding for `source`, according to PEP 263.

    `source` is a byte string, the text of the program.

    Returns a string, the name of the encoding.

    """
    readline = iternext(source.splitlines(True))
    return tokenize.detect_encoding(readline)[0]


if env.PY3:
    source_encoding = _source_encoding_py3
else:
    source_encoding = _source_encoding_py2

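
# A minimal sketch (illustrative, not part of the upstream module): the
# version-appropriate detector behind `source_encoding`, applied to source
# bytes that start with a UTF-8 BOM.
def _demo_source_encoding():
    """Detect the encoding of BOM-prefixed source bytes."""
    assert source_encoding(codecs.BOM_UTF8 + b"x = 1\n") == "utf-8-sig"
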

@contract(source='unicode')
def compile_unicode(source, filename, mode):
    """Just like the `compile` builtin, but works on any Unicode string.

    Python 2's compile() builtin has a stupid restriction: if the source
    string is Unicode, then it may not have an encoding declaration in it.
    Why not?  Who knows!  It also encodes the source to UTF-8 bytes, and then
    tries to interpret those bytes according to the encoding declaration.
    Why?  Who knows!

    This function neuters the coding declaration, and compiles the source.

    """
    source = neuter_encoding_declaration(source)
    if env.PY2 and isinstance(filename, unicode):
        filename = filename.encode(sys.getfilesystemencoding(), "replace")
    code = compile(source, filename, mode)
    return code

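
# A minimal sketch (illustrative, not part of the upstream module): Unicode
# source with a coding declaration, which Python 2's plain compile() would
# reject, compiles and runs fine through compile_unicode().
def _demo_compile_unicode():
    """Compile and execute declared-encoding Unicode source."""
    code = compile_unicode(u"# coding: utf-8\nresult = 42\n", "<demo>", "exec")
    namespace = {}
    exec(code, namespace)
    assert namespace['result'] == 42
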

@contract(source='unicode', returns='unicode')
def neuter_encoding_declaration(source):
    """Return `source`, with any encoding declaration neutered."""
    # PEP 263 only honors a declaration on the first two lines, so two
    # substitutions are enough.
    source = COOKIE_RE.sub("# (deleted declaration)", source, count=2)
    return source

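
# A quick illustration (not part of the upstream module): the coding cookie
# is replaced in place, so line numbering is preserved.
def _demo_neuter_encoding_declaration():
    """Show a coding cookie being neutered."""
    src = u"# coding: utf-8\nx = 1\n"
    assert neuter_encoding_declaration(src) == u"# (deleted declaration)\nx = 1\n"
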
#
# eflag: FileType = Python2
