--- a/DebugClients/Python3/coverage/phystokens.py	Sun Oct 04 13:35:09 2015 +0200
+++ b/DebugClients/Python3/coverage/phystokens.py	Sun Oct 04 22:37:56 2015 +0200
@@ -1,8 +1,17 @@
+# Licensed under the Apache License: http://www.apache.org/licenses/LICENSE-2.0
+# For details: https://bitbucket.org/ned/coveragepy/src/default/NOTICE.txt
+
 """Better tokenizing for coverage.py."""
 
-import codecs, keyword, re, sys, token, tokenize
-from .backward import set                       # pylint: disable=W0622
-from .parser import generate_tokens
+import codecs
+import keyword
+import re
+import token
+import tokenize
+
+from coverage import env
+from coverage.backward import iternext
+from coverage.misc import contract
 
 
 def phys_tokens(toks):
@@ -43,7 +52,7 @@
                     inject_backslash = False
                 elif ttype == token.STRING:
                     if "\n" in ttext and ttext.split('\n', 1)[0][-1] == '\\':
-                        # It's a multiline string and the first line ends with
+                        # It's a multi-line string and the first line ends with
                         # a backslash, so we don't need to inject another.
                         inject_backslash = False
                 if inject_backslash:
@@ -61,6 +70,7 @@
         last_lineno = elineno
 
 
+@contract(source='unicode')
 def source_token_lines(source):
     """Generate a series of lines, one for each line in `source`.
 
@@ -76,11 +86,15 @@
     is indistinguishable from a final line with a newline.
 
     """
+
     ws_tokens = set([token.INDENT, token.DEDENT, token.NEWLINE, tokenize.NL])
     line = []
     col = 0
-    source = source.expandtabs(8).replace('\r\n', '\n')
+
+    # The \f is because of http://bugs.python.org/issue19035
+    source = source.expandtabs(8).replace('\r\n', '\n').replace('\f', ' ')
     tokgen = generate_tokens(source)
+
     for ttype, ttext, (_, scol), (_, ecol), _ in phys_tokens(tokgen):
         mark_start = True
         for part in re.split('(\n)', ttext):
@@ -95,7 +109,7 @@
                 mark_end = False
             else:
                 if mark_start and scol > col:
-                    line.append(("ws", " " * (scol - col)))
+                    line.append(("ws", u" " * (scol - col)))
                     mark_start = False
                 tok_class = tokenize.tok_name.get(ttype, 'xx').lower()[:3]
                 if ttype == token.NAME and keyword.iskeyword(ttext):
@@ -109,23 +123,53 @@
     if line:
         yield line
 
-def source_encoding(source):
-    """Determine the encoding for `source` (a string), according to PEP 263.
+
+class CachedTokenizer(object):
+    """A one-element cache around tokenize.generate_tokens.
+
+    When reporting, coverage.py tokenizes files twice, once to find the
+    structure of the file, and once to syntax-color it.  Tokenizing is
+    expensive, and easily cached.
+
+    This is a one-element cache so that our twice-in-a-row tokenizing doesn't
+    actually tokenize twice.
+
+    """
+    def __init__(self):
+        self.last_text = None
+        self.last_tokens = None
+
+    @contract(text='unicode')
+    def generate_tokens(self, text):
+        """A stand-in for `tokenize.generate_tokens`."""
+        if text != self.last_text:
+            self.last_text = text
+            readline = iternext(text.splitlines(True))
+            self.last_tokens = list(tokenize.generate_tokens(readline))
+        return self.last_tokens
+
+# Create our generate_tokens cache as a callable replacement function.
+generate_tokens = CachedTokenizer().generate_tokens
+
+
+COOKIE_RE = re.compile(r"^\s*#.*coding[:=]\s*([-\w.]+)", flags=re.MULTILINE)
+
+@contract(source='bytes')
+def _source_encoding_py2(source):
+    """Determine the encoding for `source`, according to PEP 263.
+
+    `source` is a byte string, the text of the program.
 
     Returns a string, the name of the encoding.
 
     """
-    # Note: this function should never be called on Python 3, since py3 has
-    # built-in tools to do this.
-    assert sys.version_info < (3, 0)
+    assert isinstance(source, bytes)
+
+    # Do this so the detect_encode code we copied will work.
+    readline = iternext(source.splitlines(True))
 
     # This is mostly code adapted from Py3.2's tokenize module.
 
-    cookie_re = re.compile(r"coding[:=]\s*([-\w.]+)")
-
-    # Do this so the detect_encode code we copied will work.
-    readline = iter(source.splitlines(True)).next
-
     def _get_normal_name(orig_enc):
         """Imitates get_normal_name in tokenizer.c."""
         # Only care about the first 12 characters.
@@ -137,19 +181,14 @@
         return orig_enc
 
     # From detect_encode():
-    # It detects the encoding from the presence of a utf-8 bom or an encoding
-    # cookie as specified in pep-0263. If both a bom and a cookie are present,
+    # It detects the encoding from the presence of a UTF-8 BOM or an encoding
+    # cookie as specified in PEP-0263. If both a BOM and a cookie are present,
     # but disagree, a SyntaxError will be raised. If the encoding cookie is an
-    # invalid charset, raise a SyntaxError. Note that if a utf-8 bom is found,
+    # invalid charset, raise a SyntaxError. Note that if a UTF-8 BOM is found,
     # 'utf-8-sig' is returned.
 
-    # If no encoding is specified, then the default will be returned. The
-    # default varied with version.
-
-    if sys.version_info <= (2, 4):
-        default = 'iso-8859-1'
-    else:
-        default = 'ascii'
+    # If no encoding is specified, then the default will be returned.
+    default = 'ascii'
 
     bom_found = False
     encoding = None
@@ -168,21 +207,21 @@
         except UnicodeDecodeError:
             return None
 
-        matches = cookie_re.findall(line_string)
+        matches = COOKIE_RE.findall(line_string)
         if not matches:
             return None
         encoding = _get_normal_name(matches[0])
         try:
             codec = codecs.lookup(encoding)
         except LookupError:
-            # This behaviour mimics the Python interpreter
+            # This behavior mimics the Python interpreter
             raise SyntaxError("unknown encoding: " + encoding)
 
         if bom_found:
             # codecs in 2.3 were raw tuples of functions, assume the best.
             codec_name = getattr(codec, 'name', encoding)
             if codec_name != 'utf-8':
-                # This behaviour mimics the Python interpreter
+                # This behavior mimics the Python interpreter
                 raise SyntaxError('encoding problem: utf-8')
             encoding += '-sig'
         return encoding
@@ -208,3 +247,57 @@
             return encoding
 
     return default
+
+
+@contract(source='bytes')
+def _source_encoding_py3(source):
+    """Determine the encoding for `source`, according to PEP 263.
+
+    `source` is a byte string: the text of the program.
+
+    Returns a string, the name of the encoding.
+
+    """
+    readline = iternext(source.splitlines(True))
+    return tokenize.detect_encoding(readline)[0]
+
+
+if env.PY3:
+    source_encoding = _source_encoding_py3
+else:
+    source_encoding = _source_encoding_py2
+
+
+@contract(source='unicode')
+def compile_unicode(source, filename, mode):
+    """Just like the `compile` builtin, but works on any Unicode string.
+
+    Python 2's compile() builtin has a stupid restriction: if the source string
+    is Unicode, then it may not have an encoding declaration in it. Why not?
+    Who knows!
+
+    This function catches that exception, neuters the coding declaration, and
+    compiles it anyway.
+
+    """
+    try:
+        code = compile(source, filename, mode)
+    except SyntaxError as synerr:
+        if "coding declaration in unicode string" not in synerr.args[0].lower():
+            raise
+        source = neuter_encoding_declaration(source)
+        code = compile(source, filename, mode)
+
+    return code
+
+
+@contract(source='unicode', returns='unicode')
+def neuter_encoding_declaration(source):
+    """Return `source`, with any encoding declaration neutered.
+
+    This function will only ever be called on `source` that has an encoding
+    declaration, so some edge cases can be ignored.
+
+    """
+    source = COOKIE_RE.sub("# (deleted declaration)", source)
+    return source
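
Note on the CachedTokenizer added above: it is a plain one-element memo, so
tokenizing the same text twice in a row parses it only once. A stripped-down,
standard-library-only sketch of the same idea (the coverage-specific pieces,
the @contract decorators and the iternext() helper from coverage.backward,
are replaced here with plain Python 3 equivalents):

    import tokenize

    class CachedTokenizer(object):
        """One-element cache: tokenizing the same text twice is free."""
        def __init__(self):
            self.last_text = None
            self.last_tokens = None

        def generate_tokens(self, text):
            if text != self.last_text:
                self.last_text = text
                # iternext(seq) in coverage.backward is iter(seq).__next__
                readline = iter(text.splitlines(True)).__next__
                self.last_tokens = list(tokenize.generate_tokens(readline))
            return self.last_tokens

    generate_tokens = CachedTokenizer().generate_tokens

    toks_a = generate_tokens("x = 1\n")
    toks_b = generate_tokens("x = 1\n")
    assert toks_a is toks_b  # second call hit the cache; nothing re-parsed

Returning the cached list object itself is safe only because callers treat
the token list as read-only, which fits the tokenize-twice-per-report pattern
described in the class docstring.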
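The encoding half of the patch can be exercised the same way. A minimal
Python 3 sketch of what _source_encoding_py3() and
neuter_encoding_declaration() do (the byte string below is a made-up example,
not from the patch):

    import re
    import tokenize

    # The same cookie pattern the patch introduces as COOKIE_RE.
    COOKIE_RE = re.compile(r"^\s*#.*coding[:=]\s*([-\w.]+)", flags=re.MULTILINE)

    source = b"# -*- coding: iso-8859-1 -*-\nname = 'caf\xe9'\n"

    # _source_encoding_py3() boils down to this: hand a readline callable to
    # the stdlib and keep the encoding name it detects per PEP 263.
    readline = iter(source.splitlines(True)).__next__
    encoding = tokenize.detect_encoding(readline)[0]
    print(encoding)  # iso-8859-1

    # neuter_encoding_declaration() then defuses the cookie so the decoded
    # text can be handed to compile() on Python 2 without a SyntaxError.
    text = source.decode(encoding)
    print(COOKIE_RE.sub("# (deleted declaration)", text))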