eric7/DebugClients/Python/coverage/phystokens.py

branch:    eric7
changeset: 8312:800c432b34c8
parent:    7427:362cd1b6f81a
child:     8527:2bd1325d727e
# Licensed under the Apache License: http://www.apache.org/licenses/LICENSE-2.0
# For details: https://github.com/nedbat/coveragepy/blob/master/NOTICE.txt

"""Better tokenizing for coverage.py."""

import codecs
import keyword
import re
import sys
import token
import tokenize

from coverage import env
from coverage.backward import iternext, unicode_class
from coverage.misc import contract


def phys_tokens(toks):
    """Return all physical tokens, even line continuations.

    tokenize.generate_tokens() doesn't return a token for the backslash that
    continues lines.  This wrapper provides those tokens so that we can
    re-create a faithful representation of the original source.

    Returns the same values as generate_tokens().

    """
    last_line = None
    last_lineno = -1
    last_ttext = None
    for ttype, ttext, (slineno, scol), (elineno, ecol), ltext in toks:
        if last_lineno != elineno:
            if last_line and last_line.endswith("\\\n"):
                # We are at the beginning of a new line, and the last line
                # ended with a backslash.  We probably have to inject a
                # backslash token into the stream.  Unfortunately, there's more
                # to figure out.  This code::
                #
                #   usage = """\
                #   HEY THERE
                #   """
                #
                # triggers this condition, but the token text is::
                #
                #   '"""\\\nHEY THERE\n"""'
                #
                # so we need to figure out if the backslash is already in the
                # string token or not.
                inject_backslash = True
                if last_ttext.endswith("\\"):
                    inject_backslash = False
                elif ttype == token.STRING:
                    if "\n" in ttext and ttext.split('\n', 1)[0][-1] == '\\':
                        # It's a multi-line string and the first line ends with
                        # a backslash, so we don't need to inject another.
                        inject_backslash = False
                if inject_backslash:
                    # Figure out what column the backslash is in.
                    ccol = len(last_line.split("\n")[-2]) - 1
                    # Yield the token, with a fake token type.
                    yield (
                        99999, "\\\n",
                        (slineno, ccol), (slineno, ccol+2),
                        last_line
                    )
        last_line = ltext
        if ttype not in (tokenize.NEWLINE, tokenize.NL):
            last_ttext = ttext
        yield ttype, ttext, (slineno, scol), (elineno, ecol), ltext
        last_lineno = elineno
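

# A minimal usage sketch (illustrative, not coverage.py API): running a
# backslash-continued line through phys_tokens injects an extra token, with
# the fake type 99999, between the '+' and the '2'::
#
#     toks = generate_tokens(u"a = 1 + \\\n    2\n")
#     for tok in phys_tokens(toks):
#         print(tok)
#
# Plain tokenize.generate_tokens() yields no token for that backslash.
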

@contract(source='unicode')
def source_token_lines(source):
    """Generate a series of lines, one for each line in `source`.

    Each line is a list of pairs, each pair is a token::

        [('key', 'def'), ('ws', ' '), ('nam', 'hello'), ('op', '('), ... ]

    Each pair has a token class, and the token text.

    If you concatenate all the token texts, and then join them with newlines,
    you should have your original `source` back, with two differences:
    trailing whitespace is not preserved, and a final line with no newline
    is indistinguishable from a final line with a newline.

    """

    ws_tokens = set([token.INDENT, token.DEDENT, token.NEWLINE, tokenize.NL])
    line = []
    col = 0

    source = source.expandtabs(8).replace('\r\n', '\n')
    tokgen = generate_tokens(source)

    for ttype, ttext, (_, scol), (_, ecol), _ in phys_tokens(tokgen):
        mark_start = True
        for part in re.split('(\n)', ttext):
            if part == '\n':
                yield line
                line = []
                col = 0
                mark_end = False
            elif part == '':
                mark_end = False
            elif ttype in ws_tokens:
                mark_end = False
            else:
                if mark_start and scol > col:
                    line.append(("ws", u" " * (scol - col)))
                    mark_start = False
                tok_class = tokenize.tok_name.get(ttype, 'xx').lower()[:3]
                if ttype == token.NAME and keyword.iskeyword(ttext):
                    tok_class = "key"
                line.append((tok_class, part))
                mark_end = True
            scol = 0
        if mark_end:
            col = ecol

    if line:
        yield line
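

# An illustrative sketch of the round-trip property described in the
# docstring above (joining all the token texts recovers the source)::
#
#     src = u"def hello():\n    return 'hi'\n"
#     lines = list(source_token_lines(src))
#     rebuilt = "\n".join("".join(text for _, text in line) for line in lines)
#     assert rebuilt == src.rstrip("\n")
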

class CachedTokenizer(object):
    """A one-element cache around tokenize.generate_tokens.

    When reporting, coverage.py tokenizes files twice, once to find the
    structure of the file, and once to syntax-color it.  Tokenizing is
    expensive, and easily cached.

    This is a one-element cache so that our twice-in-a-row tokenizing doesn't
    actually tokenize twice.

    """
    def __init__(self):
        self.last_text = None
        self.last_tokens = None

    @contract(text='unicode')
    def generate_tokens(self, text):
        """A stand-in for `tokenize.generate_tokens`."""
        if text != self.last_text:
            self.last_text = text
            readline = iternext(text.splitlines(True))
            self.last_tokens = list(tokenize.generate_tokens(readline))
        return self.last_tokens

# Create our generate_tokens cache as a callable replacement function.
generate_tokens = CachedTokenizer().generate_tokens
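
# A sketch of the caching behavior (illustrative): tokenizing the same text
# twice in a row returns the identical cached list, not a re-tokenization::
#
#     toks1 = generate_tokens(u"x = 1\n")
#     toks2 = generate_tokens(u"x = 1\n")
#     assert toks1 is toks2
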

COOKIE_RE = re.compile(r"^[ \t]*#.*coding[:=][ \t]*([-\w.]+)", flags=re.MULTILINE)
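
# For illustration, the cookie regex matches PEP 263 style declarations::
#
#     COOKIE_RE.findall(u"# -*- coding: utf-8 -*-\n")
#     # --> ['utf-8']
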

@contract(source='bytes')
def _source_encoding_py2(source):
    """Determine the encoding for `source`, according to PEP 263.

    `source` is a byte string, the text of the program.

    Returns a string, the name of the encoding.

    """
    assert isinstance(source, bytes)

    # Do this so the detect_encode code we copied will work.
    readline = iternext(source.splitlines(True))

    # This is mostly code adapted from Py3.2's tokenize module.

    def _get_normal_name(orig_enc):
        """Imitates get_normal_name in tokenizer.c."""
        # Only care about the first 12 characters.
        enc = orig_enc[:12].lower().replace("_", "-")
        if re.match(r"^utf-8($|-)", enc):
            return "utf-8"
        if re.match(r"^(latin-1|iso-8859-1|iso-latin-1)($|-)", enc):
            return "iso-8859-1"
        return orig_enc

    # From detect_encode():
    # It detects the encoding from the presence of a UTF-8 BOM or an encoding
    # cookie as specified in PEP-0263.  If both a BOM and a cookie are present,
    # but disagree, a SyntaxError will be raised.  If the encoding cookie is an
    # invalid charset, raise a SyntaxError.  Note that if a UTF-8 BOM is found,
    # 'utf-8-sig' is returned.

    # If no encoding is specified, then the default will be returned.
    default = 'ascii'

    bom_found = False
    encoding = None

    def read_or_stop():
        """Get the next source line, or ''."""
        try:
            return readline()
        except StopIteration:
            return ''

    def find_cookie(line):
        """Find an encoding cookie in `line`."""
        try:
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            return None

        matches = COOKIE_RE.findall(line_string)
        if not matches:
            return None
        encoding = _get_normal_name(matches[0])
        try:
            codec = codecs.lookup(encoding)
        except LookupError:
            # This behavior mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            # codecs in 2.3 were raw tuples of functions, assume the best.
            codec_name = getattr(codec, 'name', encoding)
            if codec_name != 'utf-8':
                # This behavior mimics the Python interpreter
                raise SyntaxError('encoding problem: utf-8')
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(codecs.BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default

    encoding = find_cookie(first)
    if encoding:
        return encoding

    second = read_or_stop()
    if not second:
        return default

    encoding = find_cookie(second)
    if encoding:
        return encoding

    return default
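
# An illustrative sketch of the PEP 263 logic above: a UTF-8 BOM wins over
# the 'ascii' default, and a cookie on the first or second line is honored::
#
#     _source_encoding_py2(b"# coding: iso-8859-1\n")   # --> 'iso-8859-1'
#     _source_encoding_py2(b"\xef\xbb\xbfprint(1)\n")   # --> 'utf-8-sig'
#     _source_encoding_py2(b"print(1)\n")               # --> 'ascii'
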

@contract(source='bytes')
def _source_encoding_py3(source):
    """Determine the encoding for `source`, according to PEP 263.

    `source` is a byte string: the text of the program.

    Returns a string, the name of the encoding.

    """
    readline = iternext(source.splitlines(True))
    return tokenize.detect_encoding(readline)[0]


if env.PY3:
    source_encoding = _source_encoding_py3
else:
    source_encoding = _source_encoding_py2


@contract(source='unicode')
def compile_unicode(source, filename, mode):
    """Just like the `compile` builtin, but works on any Unicode string.

    Python 2's compile() builtin has a stupid restriction: if the source
    string is Unicode, then it may not have an encoding declaration in it.
    Why not?  Who knows!  It also decodes to utf8, and then tries to
    interpret those utf8 bytes according to the encoding declaration.
    Why?  Who knows!

    This function neuters the coding declaration, and compiles the source.

    """
    source = neuter_encoding_declaration(source)
    if env.PY2 and isinstance(filename, unicode_class):
        filename = filename.encode(sys.getfilesystemencoding(), "replace")
    code = compile(source, filename, mode)
    return code
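
# An illustrative sketch: compiling Unicode source that carries a coding
# cookie, which Python 2's bare compile() would reject::
#
#     src = u"# -*- coding: utf-8 -*-\nx = 1\n"
#     code = compile_unicode(src, "<test>", "exec")
#     exec(code)
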

@contract(source='unicode', returns='unicode')
def neuter_encoding_declaration(source):
    """Return `source`, with any encoding declaration neutered."""
    if COOKIE_RE.search(source):
        source_lines = source.splitlines(True)
        for lineno in range(min(2, len(source_lines))):
            source_lines[lineno] = COOKIE_RE.sub("# (deleted declaration)", source_lines[lineno])
        source = "".join(source_lines)
    return source
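
# A quick sketch of the neutering (illustrative)::
#
#     neuter_encoding_declaration(u"# coding: latin-1\nx = 1\n")
#     # --> u'# (deleted declaration)\nx = 1\n'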
