1 """Better tokenizing for coverage.py.""" |
1 """Better tokenizing for coverage.py.""" |
2 |
2 |
3 import keyword, re, token, tokenize |
3 import codecs, keyword, re, sys, token, tokenize |
4 from .backward import StringIO # pylint: disable-msg=W0622 |
4 from .backward import set # pylint: disable=W0622 |
|
5 from .parser import generate_tokens |
|
6 |
5 |
7 |
def phys_tokens(toks):
7 """Return all physical tokens, even line continuations. |
9 """Return all physical tokens, even line continuations. |
8 |
10 |
9 tokenize.generate_tokens() doesn't return a token for the backslash that |
11 tokenize.generate_tokens() doesn't return a token for the backslash that |
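    # An illustrative case (hypothetical source, not from the original
    # file): in
    #     a = 1 + \
    #         2
    # tokenize.generate_tokens() emits no token for the trailing backslash,
    # so phys_tokens re-injects one to keep physical lines reconstructible.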
    last_line = None
    last_lineno = -1
    last_ttype = None
    for ttype, ttext, (slineno, scol), (elineno, ecol), ltext in toks:
        if last_lineno != elineno:
            if last_line and last_line.endswith("\\\n"):
                # We are at the beginning of a new line, and the last line
                # ended with a backslash. We probably have to inject a
                # backslash token into the stream. Unfortunately, there's more
                # to figure out. This code::
                #
    you should have your original `source` back, with two differences:
    trailing whitespace is not preserved, and a final line with no newline
    is indistinguishable from a final line with a newline.

    """
    ws_tokens = set([token.INDENT, token.DEDENT, token.NEWLINE, tokenize.NL])
    line = []
    col = 0
    source = source.expandtabs(8).replace('\r\n', '\n')
    tokgen = generate_tokens(source)
    for ttype, ttext, (_, scol), (_, ecol), _ in phys_tokens(tokgen):
        mark_start = True
        for part in re.split('(\n)', ttext):
            if part == '\n':
                yield line
                col = ecol

    if line:
        yield line
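
# A rough usage sketch, not part of the original module (the token class
# abbreviations shown are illustrative): each yielded line is a list of
# (class, text) pairs, e.g. for the source "if x:\n" something like
#     [('key', 'if'), ('ws', ' '), ('nam', 'x'), ('op', ':')]
# and concatenating the text parts of each pair rebuilds the source line,
# as the docstring above describes.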

def source_encoding(source):
    """Determine the encoding for `source` (a string), according to PEP 263.

    Returns a string, the name of the encoding.

    """
    # Note: this function should never be called on Python 3, since py3 has
    # built-in tools to do this.
    assert sys.version_info < (3, 0)

    # This is mostly code adapted from Py3.2's tokenize module.

    cookie_re = re.compile(r"coding[:=]\s*([-\w.]+)")

    # Do this so the detect_encode code we copied will work.
    readline = iter(source.splitlines(True)).next

    def _get_normal_name(orig_enc):
        """Imitates get_normal_name in tokenizer.c."""
        # Only care about the first 12 characters.
        enc = orig_enc[:12].lower().replace("_", "-")
        if re.match(r"^utf-8($|-)", enc):
            return "utf-8"
        if re.match(r"^(latin-1|iso-8859-1|iso-latin-1)($|-)", enc):
            return "iso-8859-1"
        return orig_enc
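
    # For instance (illustrative values, not from the original source):
    #   _get_normal_name("UTF_8")   -> "utf-8"
    #   _get_normal_name("Latin-1") -> "iso-8859-1"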
|
    # From detect_encode():
    # It detects the encoding from the presence of a utf-8 bom or an encoding
    # cookie as specified in pep-0263. If both a bom and a cookie are present,
    # but disagree, a SyntaxError will be raised. If the encoding cookie is an
    # invalid charset, raise a SyntaxError. Note that if a utf-8 bom is found,
    # 'utf-8-sig' is returned.

    # If no encoding is specified, then the default will be returned. The
    # default varied with version.

    if sys.version_info <= (2, 4):
        default = 'iso-8859-1'
    else:
        default = 'ascii'

    bom_found = False
    encoding = None
|
    def read_or_stop():
        """Get the next source line, or ''."""
        try:
            return readline()
        except StopIteration:
            return ''
|
    def find_cookie(line):
        """Find an encoding cookie in `line`."""
        try:
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            return None

        matches = cookie_re.findall(line_string)
        if not matches:
            return None
        encoding = _get_normal_name(matches[0])
        try:
            codec = codecs.lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            # codecs in 2.3 were raw tuples of functions, assume the best.
            codec_name = getattr(codec, 'name', encoding)
            if codec_name != 'utf-8':
                # This behaviour mimics the Python interpreter
                raise SyntaxError('encoding problem: utf-8')
            encoding += '-sig'
        return encoding
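
    # find_cookie recognizes the common PEP 263 spellings, for example
    # (illustrative lines, not from the original source):
    #   # -*- coding: utf-8 -*-
    #   # vim: set fileencoding=utf-8 :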
|
    first = read_or_stop()
    if first.startswith(codecs.BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default

    encoding = find_cookie(first)
    if encoding:
        return encoding

    second = read_or_stop()
    if not second:
        return default

    encoding = find_cookie(second)
    if encoding:
        return encoding

    return default
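

if __name__ == '__main__':
    # A minimal demonstration sketch, not part of the original module; it
    # assumes the package's relative imports can be resolved (Python 2 only).
    # A PEP 263 cookie on the first line is detected and returned:
    print source_encoding("# -*- coding: iso8859-15 -*-\npass\n")  # iso8859-15
    # A utf-8 BOM with no cookie falls back to the 'utf-8-sig' default:
    print source_encoding(codecs.BOM_UTF8 + "pass\n")              # utf-8-sig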