eric7/DebugClients/Python/coverage/phystokens.py

branch:      eric7
changeset:   8775:0802ae193343
parent:      8527:2bd1325d727e
child:       9099:0e511e0e94a3
comparison:  8774:d728227e8ebb vs. 8775:0802ae193343
--- a/eric7/DebugClients/Python/coverage/phystokens.py
+++ b/eric7/DebugClients/Python/coverage/phystokens.py
@@ -1,19 +1,17 @@
 # Licensed under the Apache License: http://www.apache.org/licenses/LICENSE-2.0
 # For details: https://github.com/nedbat/coveragepy/blob/master/NOTICE.txt
 
 """Better tokenizing for coverage.py."""
 
-import codecs
+import ast
 import keyword
 import re
-import sys
 import token
 import tokenize
 
 from coverage import env
-from coverage.backward import iternext, unicode_class
 from coverage.misc import contract
 
 
 def phys_tokens(toks):
     """Return all physical tokens, even line continuations.
@@ -68,10 +66,25 @@
         last_ttext = ttext
         yield ttype, ttext, (slineno, scol), (elineno, ecol), ltext
         last_lineno = elineno
 
 
+class MatchCaseFinder(ast.NodeVisitor):
+    """Helper for finding match/case lines."""
+    def __init__(self, source):
+        # This will be the set of line numbers that start match or case statements.
+        self.match_case_lines = set()
+        self.visit(ast.parse(source))
+
+    def visit_Match(self, node):
+        """Invoked by ast.NodeVisitor.visit"""
+        self.match_case_lines.add(node.lineno)
+        for case in node.cases:
+            self.match_case_lines.add(case.pattern.lineno)
+        self.generic_visit(node)
+
+
 @contract(source='unicode')
 def source_token_lines(source):
     """Generate a series of lines, one for each line in `source`.
 
     Each line is a list of pairs, each pair is a token::
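
The MatchCaseFinder helper added above records the line numbers where match statements and their case patterns begin. A minimal usage sketch follows; it is not part of the changeset and needs Python 3.10+, since only there does ast.parse() produce Match nodes.

# --- usage sketch, not part of the file ---
SOURCE = (
    "match command:\n"
    "    case 'go':\n"
    "        move()\n"
    "    case _:\n"
    "        wait()\n"
)

finder = MatchCaseFinder(SOURCE)
print(sorted(finder.match_case_lines))   # expected: [1, 2, 4]
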
@@ -92,11 +105,14 @@
     col = 0
 
     source = source.expandtabs(8).replace('\r\n', '\n')
     tokgen = generate_tokens(source)
 
-    for ttype, ttext, (_, scol), (_, ecol), _ in phys_tokens(tokgen):
+    if env.PYBEHAVIOR.soft_keywords:
+        match_case_lines = MatchCaseFinder(source).match_case_lines
+
+    for ttype, ttext, (sline, scol), (_, ecol), _ in phys_tokens(tokgen):
         mark_start = True
         for part in re.split('(\n)', ttext):
             if part == '\n':
                 yield line
                 line = []
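
The new code only builds the match/case line set when env.PYBEHAVIOR.soft_keywords is true. coverage/env.py is not part of this changeset, so the following is only a hedged sketch of what such a flag amounts to: a version gate, since the match statement and its soft keywords arrived in Python 3.10.

# --- hedged sketch, coverage/env.py is not shown in this diff ---
import sys

# Assumed shape of the flag: soft-keyword handling is only needed on 3.10+.
soft_keywords = sys.version_info >= (3, 10)
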
@@ -106,26 +122,39 @@
                 mark_end = False
             elif ttype in ws_tokens:
                 mark_end = False
             else:
                 if mark_start and scol > col:
-                    line.append(("ws", u" " * (scol - col)))
+                    line.append(("ws", " " * (scol - col)))
                     mark_start = False
                 tok_class = tokenize.tok_name.get(ttype, 'xx').lower()[:3]
-                if ttype == token.NAME and keyword.iskeyword(ttext):
-                    tok_class = "key"
+                if ttype == token.NAME:
+                    if keyword.iskeyword(ttext):
+                        # Hard keywords are always keywords.
+                        tok_class = "key"
+                    elif env.PYBEHAVIOR.soft_keywords and keyword.issoftkeyword(ttext):
+                        # Soft keywords appear at the start of the line, on lines that start
+                        # match or case statements.
+                        if len(line) == 0:
+                            is_start_of_line = True
+                        elif (len(line) == 1) and line[0][0] == "ws":
+                            is_start_of_line = True
+                        else:
+                            is_start_of_line = False
+                        if is_start_of_line and sline in match_case_lines:
+                            tok_class = "key"
                 line.append((tok_class, part))
                 mark_end = True
             scol = 0
         if mark_end:
             col = ecol
 
     if line:
         yield line
 
 
-class CachedTokenizer(object):
+class CachedTokenizer:
     """A one-element cache around tokenize.generate_tokens.
 
     When reporting, coverage.py tokenizes files twice, once to find the
     structure of the file, and once to syntax-color it. Tokenizing is
     expensive, and easily cached.
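
The classification change above is needed because match and case are soft keywords: they are ordinary identifiers except when they actually start a match/case statement, which is why the code checks both the line position and MatchCaseFinder's result. A small illustration, not part of the changeset, assuming Python 3.10+:

# --- illustration, not part of the file ---
import keyword

print(keyword.iskeyword("match"))       # False: not a hard keyword
print(keyword.issoftkeyword("match"))   # True: only a keyword in context

# Within a file containing a complete match statement, source_token_lines()
# is expected (roughly) to classify:
#   "match = 3"       ->  [('nam', 'match'), ('ws', ' '), ('op', '='), ('ws', ' '), ('num', '3')]
#   "match command:"  ->  [('key', 'match'), ('ws', ' '), ('nam', 'command'), ('op', ':')]
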
@@ -141,149 +170,46 @@
     @contract(text='unicode')
     def generate_tokens(self, text):
         """A stand-in for `tokenize.generate_tokens`."""
         if text != self.last_text:
             self.last_text = text
-            readline = iternext(text.splitlines(True))
+            readline = iter(text.splitlines(True)).__next__
             self.last_tokens = list(tokenize.generate_tokens(readline))
         return self.last_tokens
 
 # Create our generate_tokens cache as a callable replacement function.
 generate_tokens = CachedTokenizer().generate_tokens
 
 
 COOKIE_RE = re.compile(r"^[ \t]*#.*coding[:=][ \t]*([-\w.]+)", flags=re.MULTILINE)
 
 @contract(source='bytes')
-def _source_encoding_py2(source):
+def source_encoding(source):
     """Determine the encoding for `source`, according to PEP 263.
 
-    `source` is a byte string, the text of the program.
+    `source` is a byte string: the text of the program.
 
     Returns a string, the name of the encoding.
 
     """
-    assert isinstance(source, bytes)
-
-    # Do this so the detect_encode code we copied will work.
-    readline = iternext(source.splitlines(True))
-
-    # This is mostly code adapted from Py3.2's tokenize module.
-
-    def _get_normal_name(orig_enc):
-        """Imitates get_normal_name in tokenizer.c."""
-        # Only care about the first 12 characters.
-        enc = orig_enc[:12].lower().replace("_", "-")
-        if re.match(r"^utf-8($|-)", enc):
-            return "utf-8"
-        if re.match(r"^(latin-1|iso-8859-1|iso-latin-1)($|-)", enc):
-            return "iso-8859-1"
-        return orig_enc
-
-    # From detect_encode():
-    # It detects the encoding from the presence of a UTF-8 BOM or an encoding
-    # cookie as specified in PEP-0263. If both a BOM and a cookie are present,
-    # but disagree, a SyntaxError will be raised. If the encoding cookie is an
-    # invalid charset, raise a SyntaxError. Note that if a UTF-8 BOM is found,
-    # 'utf-8-sig' is returned.
-
-    # If no encoding is specified, then the default will be returned.
-    default = 'ascii'
-
-    bom_found = False
-    encoding = None
-
-    def read_or_stop():
-        """Get the next source line, or ''."""
-        try:
-            return readline()
-        except StopIteration:
-            return ''
-
-    def find_cookie(line):
-        """Find an encoding cookie in `line`."""
-        try:
-            line_string = line.decode('ascii')
-        except UnicodeDecodeError:
-            return None
-
-        matches = COOKIE_RE.findall(line_string)
-        if not matches:
-            return None
-        encoding = _get_normal_name(matches[0])
-        try:
-            codec = codecs.lookup(encoding)
-        except LookupError:
-            # This behavior mimics the Python interpreter
-            raise SyntaxError("unknown encoding: " + encoding)
-
-        if bom_found:
-            # codecs in 2.3 were raw tuples of functions, assume the best.
-            codec_name = getattr(codec, 'name', encoding)
-            if codec_name != 'utf-8':
-                # This behavior mimics the Python interpreter
-                raise SyntaxError('encoding problem: utf-8')
-            encoding += '-sig'
-        return encoding
-
-    first = read_or_stop()
-    if first.startswith(codecs.BOM_UTF8):
-        bom_found = True
-        first = first[3:]
-        default = 'utf-8-sig'
-    if not first:
-        return default
-
-    encoding = find_cookie(first)
-    if encoding:
-        return encoding
-
-    second = read_or_stop()
-    if not second:
-        return default
-
-    encoding = find_cookie(second)
-    if encoding:
-        return encoding
-
-    return default
-
-
-@contract(source='bytes')
-def _source_encoding_py3(source):
-    """Determine the encoding for `source`, according to PEP 263.
-
-    `source` is a byte string: the text of the program.
-
-    Returns a string, the name of the encoding.
-
-    """
-    readline = iternext(source.splitlines(True))
+    readline = iter(source.splitlines(True)).__next__
     return tokenize.detect_encoding(readline)[0]
-
-
-if env.PY3:
-    source_encoding = _source_encoding_py3
-else:
-    source_encoding = _source_encoding_py2
 
 
 @contract(source='unicode')
 def compile_unicode(source, filename, mode):
     """Just like the `compile` builtin, but works on any Unicode string.
 
     Python 2's compile() builtin has a stupid restriction: if the source string
     is Unicode, then it may not have a encoding declaration in it. Why not?
-    Who knows! It also decodes to utf8, and then tries to interpret those utf8
-    bytes according to the encoding declaration. Why? Who knows!
+    Who knows! It also decodes to utf-8, and then tries to interpret those
+    utf-8 bytes according to the encoding declaration. Why? Who knows!
 
     This function neuters the coding declaration, and compiles it.
 
     """
     source = neuter_encoding_declaration(source)
-    if env.PY2 and isinstance(filename, unicode_class):
-        filename = filename.encode(sys.getfilesystemencoding(), "replace")
     code = compile(source, filename, mode)
     return code
 
 
 @contract(source='unicode', returns='unicode')
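
With the Python 2 branch gone, source_encoding() now delegates PEP 263 detection entirely to tokenize.detect_encoding(). A brief usage sketch, not part of the changeset; note that cookie-less source defaults to utf-8 on Python 3, whereas the removed Python 2 code defaulted to ascii.

# --- usage sketch, not part of the file ---
# e.g. from coverage.phystokens import source_encoding
print(source_encoding(b"# -*- coding: iso-8859-1 -*-\nx = 1\n"))  # 'iso-8859-1'
print(source_encoding(b"x = 1\n"))                                # 'utf-8'
print(source_encoding(b"\xef\xbb\xbfx = 1\n"))                    # 'utf-8-sig' (UTF-8 BOM)
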
