diff -r d28b92dabc2b -r bc64243b7672 DebugClients/Python/coverage/phystokens.py
--- a/DebugClients/Python/coverage/phystokens.py	Fri Sep 02 19:08:02 2016 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,297 +0,0 @@
-# Licensed under the Apache License: http://www.apache.org/licenses/LICENSE-2.0
-# For details: https://bitbucket.org/ned/coveragepy/src/default/NOTICE.txt
-
-"""Better tokenizing for coverage.py."""
-
-import codecs
-import keyword
-import re
-import sys
-import token
-import tokenize
-
-from coverage import env
-from coverage.backward import iternext
-from coverage.misc import contract
-
-
-def phys_tokens(toks):
-    """Return all physical tokens, even line continuations.
-
-    tokenize.generate_tokens() doesn't return a token for the backslash that
-    continues lines. This wrapper provides those tokens so that we can
-    re-create a faithful representation of the original source.
-
-    Returns the same values as generate_tokens()
-
-    """
-    last_line = None
-    last_lineno = -1
-    last_ttype = None
-    for ttype, ttext, (slineno, scol), (elineno, ecol), ltext in toks:
-        if last_lineno != elineno:
-            if last_line and last_line.endswith("\\\n"):
-                # We are at the beginning of a new line, and the last line
-                # ended with a backslash. We probably have to inject a
-                # backslash token into the stream. Unfortunately, there's more
-                # to figure out. This code::
-                #
-                #   usage = """\
-                #   HEY THERE
-                #   """
-                #
-                # triggers this condition, but the token text is::
-                #
-                #   '"""\\\nHEY THERE\n"""'
-                #
-                # so we need to figure out if the backslash is already in the
-                # string token or not.
-                inject_backslash = True
-                if last_ttype == tokenize.COMMENT:
-                    # Comments like this \
-                    # should never result in a new token.
-                    inject_backslash = False
-                elif ttype == token.STRING:
-                    if "\n" in ttext and ttext.split('\n', 1)[0][-1] == '\\':
-                        # It's a multi-line string and the first line ends with
-                        # a backslash, so we don't need to inject another.
-                        inject_backslash = False
-                if inject_backslash:
-                    # Figure out what column the backslash is in.
-                    ccol = len(last_line.split("\n")[-2]) - 1
-                    # Yield the token, with a fake token type.
-                    yield (
-                        99999, "\\\n",
-                        (slineno, ccol), (slineno, ccol+2),
-                        last_line
-                    )
-        last_line = ltext
-        last_ttype = ttype
-        yield ttype, ttext, (slineno, scol), (elineno, ecol), ltext
-        last_lineno = elineno
-
-
-@contract(source='unicode')
-def source_token_lines(source):
-    """Generate a series of lines, one for each line in `source`.
-
-    Each line is a list of pairs, each pair is a token::
-
-        [('key', 'def'), ('ws', ' '), ('nam', 'hello'), ('op', '('), ... ]
-
-    Each pair has a token class, and the token text.
-
-    If you concatenate all the token texts, and then join them with newlines,
-    you should have your original `source` back, with two differences:
-    trailing whitespace is not preserved, and a final line with no newline
-    is indistinguishable from a final line with a newline.
-
-    """
-
-    ws_tokens = set([token.INDENT, token.DEDENT, token.NEWLINE, tokenize.NL])
-    line = []
-    col = 0
-
-    source = source.expandtabs(8).replace('\r\n', '\n')
-    tokgen = generate_tokens(source)
-
-    for ttype, ttext, (_, scol), (_, ecol), _ in phys_tokens(tokgen):
-        mark_start = True
-        for part in re.split('(\n)', ttext):
-            if part == '\n':
-                yield line
-                line = []
-                col = 0
-                mark_end = False
-            elif part == '':
-                mark_end = False
-            elif ttype in ws_tokens:
-                mark_end = False
-            else:
-                if mark_start and scol > col:
-                    line.append(("ws", u" " * (scol - col)))
-                    mark_start = False
-                tok_class = tokenize.tok_name.get(ttype, 'xx').lower()[:3]
-                if ttype == token.NAME and keyword.iskeyword(ttext):
-                    tok_class = "key"
-                line.append((tok_class, part))
-                mark_end = True
-            scol = 0
-        if mark_end:
-            col = ecol
-
-    if line:
-        yield line
-
-
-class CachedTokenizer(object):
-    """A one-element cache around tokenize.generate_tokens.
-
-    When reporting, coverage.py tokenizes files twice, once to find the
-    structure of the file, and once to syntax-color it. Tokenizing is
-    expensive, and easily cached.
-
-    This is a one-element cache so that our twice-in-a-row tokenizing doesn't
-    actually tokenize twice.
-
-    """
-    def __init__(self):
-        self.last_text = None
-        self.last_tokens = None
-
-    @contract(text='unicode')
-    def generate_tokens(self, text):
-        """A stand-in for `tokenize.generate_tokens`."""
-        if text != self.last_text:
-            self.last_text = text
-            readline = iternext(text.splitlines(True))
-            self.last_tokens = list(tokenize.generate_tokens(readline))
-        return self.last_tokens
-
-# Create our generate_tokens cache as a callable replacement function.
-generate_tokens = CachedTokenizer().generate_tokens
-
-
-COOKIE_RE = re.compile(r"^[ \t]*#.*coding[:=][ \t]*([-\w.]+)", flags=re.MULTILINE)
-
-@contract(source='bytes')
-def _source_encoding_py2(source):
-    """Determine the encoding for `source`, according to PEP 263.
-
-    `source` is a byte string, the text of the program.
-
-    Returns a string, the name of the encoding.
-
-    """
-    assert isinstance(source, bytes)
-
-    # Do this so the detect_encode code we copied will work.
-    readline = iternext(source.splitlines(True))
-
-    # This is mostly code adapted from Py3.2's tokenize module.
-
-    def _get_normal_name(orig_enc):
-        """Imitates get_normal_name in tokenizer.c."""
-        # Only care about the first 12 characters.
-        enc = orig_enc[:12].lower().replace("_", "-")
-        if re.match(r"^utf-8($|-)", enc):
-            return "utf-8"
-        if re.match(r"^(latin-1|iso-8859-1|iso-latin-1)($|-)", enc):
-            return "iso-8859-1"
-        return orig_enc
-
-    # From detect_encode():
-    # It detects the encoding from the presence of a UTF-8 BOM or an encoding
-    # cookie as specified in PEP-0263. If both a BOM and a cookie are present,
-    # but disagree, a SyntaxError will be raised. If the encoding cookie is an
-    # invalid charset, raise a SyntaxError. Note that if a UTF-8 BOM is found,
-    # 'utf-8-sig' is returned.
-
-    # If no encoding is specified, then the default will be returned.
-    default = 'ascii'
-
-    bom_found = False
-    encoding = None
-
-    def read_or_stop():
-        """Get the next source line, or ''."""
-        try:
-            return readline()
-        except StopIteration:
-            return ''
-
-    def find_cookie(line):
-        """Find an encoding cookie in `line`."""
-        try:
-            line_string = line.decode('ascii')
-        except UnicodeDecodeError:
-            return None
-
-        matches = COOKIE_RE.findall(line_string)
-        if not matches:
-            return None
-        encoding = _get_normal_name(matches[0])
-        try:
-            codec = codecs.lookup(encoding)
-        except LookupError:
-            # This behavior mimics the Python interpreter
-            raise SyntaxError("unknown encoding: " + encoding)
-
-        if bom_found:
-            # codecs in 2.3 were raw tuples of functions, assume the best.
-            codec_name = getattr(codec, 'name', encoding)
-            if codec_name != 'utf-8':
-                # This behavior mimics the Python interpreter
-                raise SyntaxError('encoding problem: utf-8')
-            encoding += '-sig'
-        return encoding
-
-    first = read_or_stop()
-    if first.startswith(codecs.BOM_UTF8):
-        bom_found = True
-        first = first[3:]
-        default = 'utf-8-sig'
-    if not first:
-        return default
-
-    encoding = find_cookie(first)
-    if encoding:
-        return encoding
-
-    second = read_or_stop()
-    if not second:
-        return default
-
-    encoding = find_cookie(second)
-    if encoding:
-        return encoding
-
-    return default
-
-
-@contract(source='bytes')
-def _source_encoding_py3(source):
-    """Determine the encoding for `source`, according to PEP 263.
-
-    `source` is a byte string: the text of the program.
-
-    Returns a string, the name of the encoding.
-
-    """
-    readline = iternext(source.splitlines(True))
-    return tokenize.detect_encoding(readline)[0]
-
-
-if env.PY3:
-    source_encoding = _source_encoding_py3
-else:
-    source_encoding = _source_encoding_py2
-
-
-@contract(source='unicode')
-def compile_unicode(source, filename, mode):
-    """Just like the `compile` builtin, but works on any Unicode string.
-
-    Python 2's compile() builtin has a stupid restriction: if the source string
-    is Unicode, then it may not have an encoding declaration in it. Why not?
-    Who knows! It also decodes to utf8, and then tries to interpret those utf8
-    bytes according to the encoding declaration. Why? Who knows!
-
-    This function neuters the coding declaration, and compiles it.
-
-    """
-    source = neuter_encoding_declaration(source)
-    if env.PY2 and isinstance(filename, unicode):
-        filename = filename.encode(sys.getfilesystemencoding(), "replace")
-    code = compile(source, filename, mode)
-    return code
-
-
-@contract(source='unicode', returns='unicode')
-def neuter_encoding_declaration(source):
-    """Return `source`, with any encoding declaration neutered."""
-    source = COOKIE_RE.sub("# (deleted declaration)", source, count=2)
-    return source
-
-#
-# eflag: FileType = Python2