     ws_tokens = set([token.INDENT, token.DEDENT, token.NEWLINE, tokenize.NL])
     line = []
     col = 0

-    source = source.expandtabs(8).replace('\r\n', '\n')
+    # The \f is because of http://bugs.python.org/issue19035
+    source = source.expandtabs(8).replace('\r\n', '\n').replace('\f', ' ')
     tokgen = generate_tokens(source)

     for ttype, ttext, (_, scol), (_, ecol), _ in phys_tokens(tokgen):
         mark_start = True
         for part in re.split('(\n)', ttext):
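The hunk above normalizes the source before tokenizing it: tabs are expanded to 8-column stops, CRLF line endings become LF, and (new in this change) form feeds become spaces to work around the tokenize behavior referenced in the linked bug report. A minimal sketch of what that chain does to a small, made-up input:

    # Sketch of the normalization chain above on a hypothetical input:
    # tabs -> 8-column stops, CRLF -> LF, form feed -> space.
    raw = "if x:\r\n\tprint(x)\f\r\n"
    cleaned = raw.expandtabs(8).replace('\r\n', '\n').replace('\f', ' ')
    print(repr(cleaned))    # 'if x:\n        print(x) \n'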
 # Create our generate_tokens cache as a callable replacement function.
 generate_tokens = CachedTokenizer().generate_tokens


-COOKIE_RE = re.compile(r"^\s*#.*coding[:=]\s*([-\w.]+)", flags=re.MULTILINE)
+COOKIE_RE = re.compile(r"^[ \t]*#.*coding[:=][ \t]*([-\w.]+)", flags=re.MULTILINE)

 @contract(source='bytes')
 def _source_encoding_py2(source):
     """Determine the encoding for `source`, according to PEP 263.
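COOKIE_RE is the PEP 263 coding-declaration matcher used by the functions below. The change swaps \s for [ \t]: \s also matches newlines, so under re.MULTILINE the looser pattern can run across physical lines, while [ \t] confines the match to a single line. A small sketch using the new pattern on a hypothetical two-line header:

    import re

    # Same pattern as the new COOKIE_RE above; the source text is made up.
    COOKIE_RE = re.compile(r"^[ \t]*#.*coding[:=][ \t]*([-\w.]+)", flags=re.MULTILINE)

    src = "#!/usr/bin/env python\n# -*- coding: utf-8 -*-\nprint('hi')\n"
    match = COOKIE_RE.search(src)
    print(match.group(1))   # utf-8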
 def compile_unicode(source, filename, mode):
     """Just like the `compile` builtin, but works on any Unicode string.

     Python 2's compile() builtin has a stupid restriction: if the source string
     is Unicode, then it may not have a encoding declaration in it. Why not?
-    Who knows!
+    Who knows! It also decodes to utf8, and then tries to interpret those utf8
+    bytes according to the encoding declaration. Why? Who knows!

-    This function catches that exception, neuters the coding declaration, and
-    compiles it anyway.
+    This function neuters the coding declaration, and compiles it.

     """
-    try:
-        code = compile(source, filename, mode)
-    except SyntaxError as synerr:
-        if "coding declaration in unicode string" not in synerr.args[0].lower():
-            raise
-        source = neuter_encoding_declaration(source)
-        code = compile(source, filename, mode)
-
+    source = neuter_encoding_declaration(source)
+    if env.PY2 and isinstance(filename, unicode):
+        filename = filename.encode(sys.getfilesystemencoding(), "replace")
+    code = compile(source, filename, mode)
     return code
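The rewrite above drops the compile-then-catch approach: instead of waiting for the "coding declaration in unicode string" SyntaxError and retrying, the new code always neuters the declaration first, and on Python 2 also encodes a Unicode filename with the filesystem encoding so the compile() builtin will accept it. A usage sketch, assuming the function is importable from coverage.phystokens (the sample source is made up):

    from coverage.phystokens import compile_unicode   # assumed import path

    source = u'# -*- coding: utf-8 -*-\nGREETING = u"hello"\n'
    code = compile_unicode(source, "<sample>", "exec")
    namespace = {}
    exec(code, namespace)
    print(namespace["GREETING"])    # hello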
 @contract(source='unicode', returns='unicode')
 def neuter_encoding_declaration(source):
-    """Return `source`, with any encoding declaration neutered.
-
-    This function will only ever be called on `source` that has an encoding
-    declaration, so some edge cases can be ignored.
-
-    """
-    source = COOKIE_RE.sub("# (deleted declaration)", source)
+    """Return `source`, with any encoding declaration neutered."""
+    source = COOKIE_RE.sub("# (deleted declaration)", source, count=2)
     return source
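Two things change here: the docstring no longer assumes the function is only called on source that actually has a declaration, and count=2 caps the substitution at the first two matches, which is all that can matter since PEP 263 only honours a declaration on the first or second line of the file. A sketch using the new definition on a made-up source:

    import re

    COOKIE_RE = re.compile(r"^[ \t]*#.*coding[:=][ \t]*([-\w.]+)", flags=re.MULTILINE)

    def neuter_encoding_declaration(source):
        """Return `source`, with any encoding declaration neutered."""
        source = COOKIE_RE.sub("# (deleted declaration)", source, count=2)
        return source

    src = "#!/usr/bin/env python\n# -*- coding: latin-1 -*-\nx = 1\n"
    print(neuter_encoding_declaration(src))
    # #!/usr/bin/env python
    # # (deleted declaration) -*-
    # x = 1
    # (the trailing "-*-" survives because the pattern stops after the encoding name)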