DebugClients/Python/coverage/phystokens.py

changeset 3497:7f51ab29a1a2
parent      790:2c0ea0163ef4
child       3499:f2d4b02c7e88

diff -r 71f15675e89f -r 7f51ab29a1a2 DebugClients/Python/coverage/phystokens.py
--- a/DebugClients/Python/coverage/phystokens.py
+++ b/DebugClients/Python/coverage/phystokens.py
1 """Better tokenizing for coverage.py.""" 1 """Better tokenizing for coverage.py."""
2 2
3 import keyword, re, token, tokenize 3 import codecs, keyword, re, sys, token, tokenize
4 from .backward import StringIO # pylint: disable-msg=W0622 4 from .backward import set # pylint: disable=W0622
5 from .parser import generate_tokens
6
5 7
6 def phys_tokens(toks): 8 def phys_tokens(toks):
7 """Return all physical tokens, even line continuations. 9 """Return all physical tokens, even line continuations.
8 10
9 tokenize.generate_tokens() doesn't return a token for the backslash that 11 tokenize.generate_tokens() doesn't return a token for the backslash that
16 last_line = None 18 last_line = None
17 last_lineno = -1 19 last_lineno = -1
18 last_ttype = None 20 last_ttype = None
19 for ttype, ttext, (slineno, scol), (elineno, ecol), ltext in toks: 21 for ttype, ttext, (slineno, scol), (elineno, ecol), ltext in toks:
20 if last_lineno != elineno: 22 if last_lineno != elineno:
21 if last_line and last_line[-2:] == "\\\n": 23 if last_line and last_line.endswith("\\\n"):
22 # We are at the beginning of a new line, and the last line 24 # We are at the beginning of a new line, and the last line
23 # ended with a backslash. We probably have to inject a 25 # ended with a backslash. We probably have to inject a
24 # backslash token into the stream. Unfortunately, there's more 26 # backslash token into the stream. Unfortunately, there's more
25 # to figure out. This code:: 27 # to figure out. This code::
26 # 28 #
72 you should have your original `source` back, with two differences: 74 you should have your original `source` back, with two differences:
73 trailing whitespace is not preserved, and a final line with no newline 75 trailing whitespace is not preserved, and a final line with no newline
74 is indistinguishable from a final line with a newline. 76 is indistinguishable from a final line with a newline.
75 77
76 """ 78 """
77 ws_tokens = [token.INDENT, token.DEDENT, token.NEWLINE, tokenize.NL] 79 ws_tokens = set([token.INDENT, token.DEDENT, token.NEWLINE, tokenize.NL])
78 line = [] 80 line = []
79 col = 0 81 col = 0
80 tokgen = tokenize.generate_tokens(StringIO(source.expandtabs(8)).readline) 82 source = source.expandtabs(8).replace('\r\n', '\n')
83 tokgen = generate_tokens(source)
81 for ttype, ttext, (_, scol), (_, ecol), _ in phys_tokens(tokgen): 84 for ttype, ttext, (_, scol), (_, ecol), _ in phys_tokens(tokgen):
82 mark_start = True 85 mark_start = True
83 for part in re.split('(\n)', ttext): 86 for part in re.split('(\n)', ttext):
84 if part == '\n': 87 if part == '\n':
85 yield line 88 yield line
104 col = ecol 107 col = ecol
105 108
106 if line: 109 if line:
107 yield line 110 yield line
108 111
109 # 112 def source_encoding(source):
110 # eflag: FileType = Python2 113 """Determine the encoding for `source` (a string), according to PEP 263.
114
115 Returns a string, the name of the encoding.
116
117 """
118 # Note: this function should never be called on Python 3, since py3 has
119 # built-in tools to do this.
120 assert sys.version_info < (3, 0)
121
122 # This is mostly code adapted from Py3.2's tokenize module.
123
124 cookie_re = re.compile(r"coding[:=]\s*([-\w.]+)")
125
126 # Do this so the detect_encode code we copied will work.
127 readline = iter(source.splitlines(True)).next
128
129 def _get_normal_name(orig_enc):
130 """Imitates get_normal_name in tokenizer.c."""
131 # Only care about the first 12 characters.
132 enc = orig_enc[:12].lower().replace("_", "-")
133 if re.match(r"^utf-8($|-)", enc):
134 return "utf-8"
135 if re.match(r"^(latin-1|iso-8859-1|iso-latin-1)($|-)", enc):
136 return "iso-8859-1"
137 return orig_enc
138
139 # From detect_encode():
140 # It detects the encoding from the presence of a utf-8 bom or an encoding
141 # cookie as specified in pep-0263. If both a bom and a cookie are present,
142 # but disagree, a SyntaxError will be raised. If the encoding cookie is an
143 # invalid charset, raise a SyntaxError. Note that if a utf-8 bom is found,
144 # 'utf-8-sig' is returned.
145
146 # If no encoding is specified, then the default will be returned. The
147 # default varied with version.
148
149 if sys.version_info <= (2, 4):
150 default = 'iso-8859-1'
151 else:
152 default = 'ascii'
153
154 bom_found = False
155 encoding = None
156
157 def read_or_stop():
158 """Get the next source line, or ''."""
159 try:
160 return readline()
161 except StopIteration:
162 return ''
163
164 def find_cookie(line):
165 """Find an encoding cookie in `line`."""
166 try:
167 line_string = line.decode('ascii')
168 except UnicodeDecodeError:
169 return None
170
171 matches = cookie_re.findall(line_string)
172 if not matches:
173 return None
174 encoding = _get_normal_name(matches[0])
175 try:
176 codec = codecs.lookup(encoding)
177 except LookupError:
178 # This behaviour mimics the Python interpreter
179 raise SyntaxError("unknown encoding: " + encoding)
180
181 if bom_found:
182 # codecs in 2.3 were raw tuples of functions, assume the best.
183 codec_name = getattr(codec, 'name', encoding)
184 if codec_name != 'utf-8':
185 # This behaviour mimics the Python interpreter
186 raise SyntaxError('encoding problem: utf-8')
187 encoding += '-sig'
188 return encoding
189
190 first = read_or_stop()
191 if first.startswith(codecs.BOM_UTF8):
192 bom_found = True
193 first = first[3:]
194 default = 'utf-8-sig'
195 if not first:
196 return default
197
198 encoding = find_cookie(first)
199 if encoding:
200 return encoding
201
202 second = read_or_stop()
203 if not second:
204 return default
205
206 encoding = find_cookie(second)
207 if encoding:
208 return encoding
209
210 return default
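A note on the gap that phys_tokens() fills, with a small demonstration
against the standard library's tokenize module (a hedged sketch using plain
tokenize, not this changeset's generate_tokens wrapper): the backslash that
joins two physical lines never appears in any token's text, so a colorizer
that only replays token text would drop it.

import tokenize
try:
    from StringIO import StringIO   # Python 2, which this module targets
except ImportError:
    from io import StringIO         # Python 3 fallback for the demo

src = "x = 1 + \\\n    2\n"
tokens = tokenize.generate_tokens(StringIO(src).readline)
joined = "".join(ttext for _, ttext, _, _, _ in tokens)
assert "\\" not in joined   # the continuation backslash was dropped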
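The round-trip property claimed in the source_token_lines() docstring can be
checked directly. A minimal sketch, assuming the upstream import path
coverage.phystokens (eric bundles the module under
DebugClients/Python/coverage/ instead):

from coverage.phystokens import source_token_lines   # assumed path

source = "a = 1\nif a:\n    b = 2\n"
rebuilt = "".join(
    "".join(ttext for _ttype, ttext in line) + "\n"
    for line in source_token_lines(source)
)
# Joining each line's (category, text) pairs and re-adding the newline
# reproduces the source, modulo the two documented differences
# (trailing whitespace and the final newline).
assert rebuilt == source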
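And a sketch of what the new source_encoding() returns in the common cases,
as specified by the code above: a PEP 263 cookie on the first or second line
wins, a UTF-8 BOM yields 'utf-8-sig', and plain ASCII falls back to the
'ascii' default. Python 2 syntax, matching the assert at the top of the
function; the import path is again the assumed upstream one.

from coverage.phystokens import source_encoding   # assumed path

print source_encoding("# -*- coding: utf-8 -*-\nx = 1\n")           # utf-8
print source_encoding("#!/usr/bin/env python\n# coding: cp1252\n")  # cp1252
print source_encoding("\xef\xbb\xbfx = 1\n")                        # utf-8-sig
print source_encoding("x = 1\n")                                    # ascii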
