DebugClients/Python/coverage/phystokens.py

changeset 4489:d0d6e4ad31bd
parent    3499:f2d4b02c7e88
child     4491:0d8612e24fef

comparing 4481:456c58fc64b0 with 4489:d0d6e4ad31bd
@@ -1,10 +1,19 @@
+# Licensed under the Apache License: http://www.apache.org/licenses/LICENSE-2.0
+# For details: https://bitbucket.org/ned/coveragepy/src/default/NOTICE.txt
+
 """Better tokenizing for coverage.py."""
 
-import codecs, keyword, re, sys, token, tokenize
-from .backward import set # pylint: disable=W0622
-from .parser import generate_tokens
+import codecs
+import keyword
+import re
+import token
+import tokenize
+
+from coverage import env
+from coverage.backward import iternext
+from coverage.misc import contract
 
 
 def phys_tokens(toks):
     """Return all physical tokens, even line continuations.
 
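The new imports swap the old relative `.backward`/`.parser` imports for absolute `coverage` ones: `env` carries interpreter checks such as `env.PY3`, `iternext` is the version-neutral way to get an iterator's bound "next" callable, and `contract` supplies the argument-type decorators used further down. The following is a hedged sketch of the behavior this file relies on from `iternext`; `iternext_sketch` and its sample data are hypothetical names, not part of the changeset.

    # Hedged sketch: a zero-argument callable returning successive items,
    # regardless of whether the iterator exposes __next__ (Py3) or next (Py2).
    def iternext_sketch(seq):
        it = iter(seq)
        try:
            return it.__next__      # Python 3: bound __next__ method
        except AttributeError:
            return it.next          # Python 2: bound next method

    readline = iternext_sketch([u"line one\n", u"line two\n"])
    assert readline() == u"line one\n"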
@@ -41,11 +50,11 @@
                     # Comments like this \
                     # should never result in a new token.
                     inject_backslash = False
                 elif ttype == token.STRING:
                     if "\n" in ttext and ttext.split('\n', 1)[0][-1] == '\\':
-                        # It's a multiline string and the first line ends with
+                        # It's a multi-line string and the first line ends with
                         # a backslash, so we don't need to inject another.
                         inject_backslash = False
                 if inject_backslash:
                     # Figure out what column the backslash is in.
                     ccol = len(last_line.split("\n")[-2]) - 1
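The branch above decides whether phys_tokens must synthesize a backslash token: a comment already carries its trailing backslash, and a multi-line string whose first line ends in a backslash needs none either. A minimal usage sketch, not part of the changeset and assuming the upstream coverage package is importable as coverage.phystokens:

    from coverage.phystokens import generate_tokens, phys_tokens

    source = u"x = 1 + \\\n    2\n"
    for ttype, ttext, start, end, _ in phys_tokens(generate_tokens(source)):
        print(ttype, repr(ttext), start, end)
    # Alongside the normal tokens, phys_tokens emits one extra token whose
    # text is "\\\n" for the continuation line, which plain
    # tokenize.generate_tokens never reports on its own.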
@@ -59,10 +68,11 @@
         last_ttype = ttype
         yield ttype, ttext, (slineno, scol), (elineno, ecol), ltext
         last_lineno = elineno
 
 
+@contract(source='unicode')
 def source_token_lines(source):
     """Generate a series of lines, one for each line in `source`.
 
     Each line is a list of pairs, each pair is a token::
 
@@ -74,15 +84,19 @@
     you should have your original `source` back, with two differences:
     trailing whitespace is not preserved, and a final line with no newline
     is indistinguishable from a final line with a newline.
 
     """
+
     ws_tokens = set([token.INDENT, token.DEDENT, token.NEWLINE, tokenize.NL])
     line = []
     col = 0
-    source = source.expandtabs(8).replace('\r\n', '\n')
+
+    # The \f is because of http://bugs.python.org/issue19035
+    source = source.expandtabs(8).replace('\r\n', '\n').replace('\f', ' ')
     tokgen = generate_tokens(source)
+
     for ttype, ttext, (_, scol), (_, ecol), _ in phys_tokens(tokgen):
         mark_start = True
         for part in re.split('(\n)', ttext):
             if part == '\n':
                 yield line
@@ -93,11 +107,11 @@
                 mark_end = False
             elif ttype in ws_tokens:
                 mark_end = False
             else:
                 if mark_start and scol > col:
-                    line.append(("ws", " " * (scol - col)))
+                    line.append(("ws", u" " * (scol - col)))
                     mark_start = False
                 tok_class = tokenize.tok_name.get(ttype, 'xx').lower()[:3]
                 if ttype == token.NAME and keyword.iskeyword(ttext):
                     tok_class = "key"
                 line.append((tok_class, part))
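Together with the closing lines in the next hunk, this is the whole of source_token_lines: it classifies each token with a three-letter class and inserts "ws" pairs for the whitespace between tokens. A small illustration of the pairs it yields, assuming the upstream coverage package is importable as coverage.phystokens:

    from coverage.phystokens import source_token_lines

    for line in source_token_lines(u"def hello():\n    return 'hi'\n"):
        print(line)
    # Roughly:
    #   [('key', 'def'), ('ws', ' '), ('nam', 'hello'), ('op', '('), ('op', ')'), ('op', ':')]
    #   [('ws', '    '), ('key', 'return'), ('ws', ' '), ('str', "'hi'")]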
@@ -107,26 +121,56 @@
             col = ecol
 
     if line:
         yield line
 
-def source_encoding(source):
-    """Determine the encoding for `source` (a string), according to PEP 263.
+
+class CachedTokenizer(object):
+    """A one-element cache around tokenize.generate_tokens.
+
+    When reporting, coverage.py tokenizes files twice, once to find the
+    structure of the file, and once to syntax-color it. Tokenizing is
+    expensive, and easily cached.
+
+    This is a one-element cache so that our twice-in-a-row tokenizing doesn't
+    actually tokenize twice.
+
+    """
+    def __init__(self):
+        self.last_text = None
+        self.last_tokens = None
+
+    @contract(text='unicode')
+    def generate_tokens(self, text):
+        """A stand-in for `tokenize.generate_tokens`."""
+        if text != self.last_text:
+            self.last_text = text
+            readline = iternext(text.splitlines(True))
+            self.last_tokens = list(tokenize.generate_tokens(readline))
+        return self.last_tokens
+
+# Create our generate_tokens cache as a callable replacement function.
+generate_tokens = CachedTokenizer().generate_tokens
+
+
+COOKIE_RE = re.compile(r"^\s*#.*coding[:=]\s*([-\w.]+)", flags=re.MULTILINE)
+
+@contract(source='bytes')
+def _source_encoding_py2(source):
+    """Determine the encoding for `source`, according to PEP 263.
+
+    `source` is a byte string, the text of the program.
 
     Returns a string, the name of the encoding.
 
     """
-    # Note: this function should never be called on Python 3, since py3 has
-    # built-in tools to do this.
-    assert sys.version_info < (3, 0)
+    assert isinstance(source, bytes)
+
+    # Do this so the detect_encode code we copied will work.
+    readline = iternext(source.splitlines(True))
 
     # This is mostly code adapted from Py3.2's tokenize module.
-
-    cookie_re = re.compile(r"coding[:=]\s*([-\w.]+)")
-
-    # Do this so the detect_encode code we copied will work.
-    readline = iter(source.splitlines(True)).next
 
     def _get_normal_name(orig_enc):
         """Imitates get_normal_name in tokenizer.c."""
         # Only care about the first 12 characters.
         enc = orig_enc[:12].lower().replace("_", "-")
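The CachedTokenizer added above memoizes exactly one tokenization, which is enough because reporting tokenizes each file twice in a row. A short sketch of the observable effect, not part of the changeset, assuming the upstream coverage package is importable as coverage.phystokens:

    from coverage.phystokens import generate_tokens

    text = u"a = 1\n"
    first = generate_tokens(text)
    second = generate_tokens(text)
    assert first is second                             # same cached token list
    assert generate_tokens(u"b = 2\n") is not first    # new text replaces the cache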
@@ -135,23 +179,18 @@
         if re.match(r"^(latin-1|iso-8859-1|iso-latin-1)($|-)", enc):
             return "iso-8859-1"
         return orig_enc
 
     # From detect_encode():
-    # It detects the encoding from the presence of a utf-8 bom or an encoding
-    # cookie as specified in pep-0263. If both a bom and a cookie are present,
+    # It detects the encoding from the presence of a UTF-8 BOM or an encoding
+    # cookie as specified in PEP-0263. If both a BOM and a cookie are present,
     # but disagree, a SyntaxError will be raised. If the encoding cookie is an
-    # invalid charset, raise a SyntaxError. Note that if a utf-8 bom is found,
+    # invalid charset, raise a SyntaxError. Note that if a UTF-8 BOM is found,
     # 'utf-8-sig' is returned.
 
-    # If no encoding is specified, then the default will be returned. The
-    # default varied with version.
-
-    if sys.version_info <= (2, 4):
-        default = 'iso-8859-1'
-    else:
-        default = 'ascii'
+    # If no encoding is specified, then the default will be returned.
+    default = 'ascii'
 
     bom_found = False
     encoding = None
 
     def read_or_stop():
@@ -166,25 +205,25 @@
         try:
             line_string = line.decode('ascii')
         except UnicodeDecodeError:
             return None
 
-        matches = cookie_re.findall(line_string)
+        matches = COOKIE_RE.findall(line_string)
         if not matches:
             return None
         encoding = _get_normal_name(matches[0])
         try:
             codec = codecs.lookup(encoding)
         except LookupError:
-            # This behaviour mimics the Python interpreter
+            # This behavior mimics the Python interpreter
             raise SyntaxError("unknown encoding: " + encoding)
 
         if bom_found:
             # codecs in 2.3 were raw tuples of functions, assume the best.
             codec_name = getattr(codec, 'name', encoding)
             if codec_name != 'utf-8':
-                # This behaviour mimics the Python interpreter
+                # This behavior mimics the Python interpreter
                 raise SyntaxError('encoding problem: utf-8')
             encoding += '-sig'
         return encoding
 
     first = read_or_stop()
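The cookie lookup above implements the PEP 263 rules: decode the line as ASCII, search it with COOKIE_RE, normalize the name, and reject a cookie that disagrees with a UTF-8 BOM. A hedged usage sketch of the public source_encoding wrapper defined at the end of the file, assuming the upstream coverage package is importable as coverage.phystokens:

    from coverage.phystokens import source_encoding

    print(source_encoding(b"# -*- coding: latin-1 -*-\nx = 1\n"))
    # 'iso-8859-1' -- the cookie is found and normalized by _get_normal_name()
    print(source_encoding(b"x = 1\n"))
    # no cookie: 'ascii' on the Python 2 path above, 'utf-8' via
    # tokenize.detect_encoding() on the Python 3 path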
@@ -207,7 +246,58 @@
     if encoding:
         return encoding
 
     return default
 
-#
-# eflag: FileType = Python2
+
+@contract(source='bytes')
+def _source_encoding_py3(source):
+    """Determine the encoding for `source`, according to PEP 263.
+
+    `source` is a byte string: the text of the program.
+
+    Returns a string, the name of the encoding.
+
+    """
+    readline = iternext(source.splitlines(True))
+    return tokenize.detect_encoding(readline)[0]
+
+
+if env.PY3:
+    source_encoding = _source_encoding_py3
+else:
+    source_encoding = _source_encoding_py2
+
+
+@contract(source='unicode')
+def compile_unicode(source, filename, mode):
+    """Just like the `compile` builtin, but works on any Unicode string.
+
+    Python 2's compile() builtin has a stupid restriction: if the source string
+    is Unicode, then it may not have a encoding declaration in it. Why not?
+    Who knows!
+
+    This function catches that exception, neuters the coding declaration, and
+    compiles it anyway.
+
+    """
+    try:
+        code = compile(source, filename, mode)
+    except SyntaxError as synerr:
+        if "coding declaration in unicode string" not in synerr.args[0].lower():
+            raise
+        source = neuter_encoding_declaration(source)
+        code = compile(source, filename, mode)
+
+    return code
+
+
+@contract(source='unicode', returns='unicode')
+def neuter_encoding_declaration(source):
+    """Return `source`, with any encoding declaration neutered.
+
+    This function will only ever be called on `source` that has an encoding
+    declaration, so some edge cases can be ignored.
+
+    """
+    source = COOKIE_RE.sub("# (deleted declaration)", source)
+    return source
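compile_unicode works around Python 2's refusal to compile a Unicode string that still contains a coding declaration: it catches that specific SyntaxError, neuters the declaration with COOKIE_RE, and compiles again. A brief sketch, not part of the changeset, assuming the upstream coverage package is importable as coverage.phystokens:

    from coverage.phystokens import compile_unicode, neuter_encoding_declaration

    src = u"# -*- coding: utf-8 -*-\nx = 1\n"
    code = compile_unicode(src, "<string>", "exec")   # succeeds on Python 2 and 3
    namespace = {}
    exec(code, namespace)
    assert namespace["x"] == 1

    print(neuter_encoding_declaration(src))
    # the coding cookie is replaced with "# (deleted declaration)", so a
    # second compile() no longer trips over the declaration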
