diff -r 71f15675e89f -r 7f51ab29a1a2 DebugClients/Python/coverage/phystokens.py
--- a/DebugClients/Python/coverage/phystokens.py	Fri Apr 11 18:37:22 2014 +0200
+++ b/DebugClients/Python/coverage/phystokens.py	Thu Apr 10 23:02:20 2014 +0200
@@ -1,7 +1,9 @@
 """Better tokenizing for coverage.py."""
 
-import keyword, re, token, tokenize
-from .backward import StringIO # pylint: disable-msg=W0622
+import codecs, keyword, re, sys, token, tokenize
+from .backward import set # pylint: disable=W0622
+from .parser import generate_tokens
+
 
 def phys_tokens(toks):
     """Return all physical tokens, even line continuations.
@@ -18,7 +20,7 @@
     last_ttype = None
     for ttype, ttext, (slineno, scol), (elineno, ecol), ltext in toks:
         if last_lineno != elineno:
-            if last_line and last_line[-2:] == "\\\n":
+            if last_line and last_line.endswith("\\\n"):
                 # We are at the beginning of a new line, and the last line
                 # ended with a backslash. We probably have to inject a
                 # backslash token into the stream. Unfortunately, there's more
@@ -74,10 +76,11 @@
     is indistinguishable from a final line with a newline.
 
     """
-    ws_tokens = [token.INDENT, token.DEDENT, token.NEWLINE, tokenize.NL]
+    ws_tokens = set([token.INDENT, token.DEDENT, token.NEWLINE, tokenize.NL])
     line = []
     col = 0
-    tokgen = tokenize.generate_tokens(StringIO(source.expandtabs(8)).readline)
+    source = source.expandtabs(8).replace('\r\n', '\n')
+    tokgen = generate_tokens(source)
     for ttype, ttext, (_, scol), (_, ecol), _ in phys_tokens(tokgen):
         mark_start = True
         for part in re.split('(\n)', ttext):
@@ -106,5 +109,102 @@
     if line:
         yield line
 
-#
-# eflag: FileType = Python2
+
+def source_encoding(source):
+    """Determine the encoding for `source` (a string), according to PEP 263.
+
+    Returns a string, the name of the encoding.
+
+    """
+    # Note: this function should never be called on Python 3, since py3 has
+    # built-in tools to do this.
+    assert sys.version_info < (3, 0)
+
+    # This is mostly code adapted from Py3.2's tokenize module.
+
+    cookie_re = re.compile(r"coding[:=]\s*([-\w.]+)")
+
+    # Do this so the detect_encode code we copied will work.
+    readline = iter(source.splitlines(True)).next
+
+    def _get_normal_name(orig_enc):
+        """Imitates get_normal_name in tokenizer.c."""
+        # Only care about the first 12 characters.
+        enc = orig_enc[:12].lower().replace("_", "-")
+        if re.match(r"^utf-8($|-)", enc):
+            return "utf-8"
+        if re.match(r"^(latin-1|iso-8859-1|iso-latin-1)($|-)", enc):
+            return "iso-8859-1"
+        return orig_enc
+
+    # From detect_encode():
+    # It detects the encoding from the presence of a utf-8 bom or an encoding
+    # cookie as specified in pep-0263. If both a bom and a cookie are present,
+    # but disagree, a SyntaxError will be raised. If the encoding cookie is an
+    # invalid charset, raise a SyntaxError. Note that if a utf-8 bom is found,
+    # 'utf-8-sig' is returned.
+
+    # If no encoding is specified, then the default will be returned. The
+    # default varied with version.
+
+    if sys.version_info <= (2, 4):
+        default = 'iso-8859-1'
+    else:
+        default = 'ascii'
+
+    bom_found = False
+    encoding = None
+
+    def read_or_stop():
+        """Get the next source line, or ''."""
+        try:
+            return readline()
+        except StopIteration:
+            return ''
+
+    def find_cookie(line):
+        """Find an encoding cookie in `line`."""
+        try:
+            line_string = line.decode('ascii')
+        except UnicodeDecodeError:
+            return None
+
+        matches = cookie_re.findall(line_string)
+        if not matches:
+            return None
+        encoding = _get_normal_name(matches[0])
+        try:
+            codec = codecs.lookup(encoding)
+        except LookupError:
+            # This behaviour mimics the Python interpreter
+            raise SyntaxError("unknown encoding: " + encoding)
+
+        if bom_found:
+            # codecs in 2.3 were raw tuples of functions, assume the best.
+            codec_name = getattr(codec, 'name', encoding)
+            if codec_name != 'utf-8':
+                # This behaviour mimics the Python interpreter
+                raise SyntaxError('encoding problem: utf-8')
+            encoding += '-sig'
+        return encoding
+
+    first = read_or_stop()
+    if first.startswith(codecs.BOM_UTF8):
+        bom_found = True
+        first = first[3:]
+        default = 'utf-8-sig'
+    if not first:
+        return default
+
+    encoding = find_cookie(first)
+    if encoding:
+        return encoding
+
+    second = read_or_stop()
+    if not second:
+        return default
+
+    encoding = find_cookie(second)
+    if encoding:
+        return encoding
+
+    return default
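A minimal usage sketch (not part of the patch) of the source_encoding() helper
added above. It assumes Python 2 and that the vendored module is importable as
coverage.phystokens; the sample inputs and expected results are illustrative:

    # Hypothetical demonstration of the PEP 263 detection added in this patch.
    from coverage.phystokens import source_encoding

    # No BOM and no coding cookie: the version-dependent default applies
    # ('ascii' on Python 2.5+, 'iso-8859-1' on <= 2.4).
    print(source_encoding("x = 1\n"))
    # -> 'ascii'

    # A PEP 263 cookie on the first or second line is found and normalized.
    print(source_encoding("# -*- coding: latin_1 -*-\nx = 1\n"))
    # -> 'iso-8859-1'

    # A UTF-8 BOM makes the default 'utf-8-sig'; a BOM combined with a
    # conflicting cookie raises SyntaxError, mimicking the interpreter.
    print(source_encoding('\xef\xbb\xbfx = 1\n'))
    # -> 'utf-8-sig'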