# Licensed under the Apache License: http://www.apache.org/licenses/LICENSE-2.0
# For details: https://github.com/nedbat/coveragepy/blob/master/NOTICE.txt

"""Better tokenizing for coverage.py."""

import codecs
import keyword
import re
import sys
import token
import tokenize

from coverage import env
from coverage.backward import iternext, unicode_class
from coverage.misc import contract


def phys_tokens(toks):
    """Return all physical tokens, even line continuations.

    tokenize.generate_tokens() doesn't return a token for the backslash that
    continues lines.  This wrapper provides those tokens so that we can
    re-create a faithful representation of the original source.

    Returns the same values as generate_tokens() does.

    """
    last_line = None
    last_lineno = -1
    last_ttext = None
    for ttype, ttext, (slineno, scol), (elineno, ecol), ltext in toks:
        if last_lineno != elineno:
            if last_line and last_line.endswith("\\\n"):
                # We are at the beginning of a new line, and the last line
                # ended with a backslash.  We probably have to inject a
                # backslash token into the stream.  Unfortunately, there's more
                # to figure out.  This code::
                #
                #   usage = """\
                #   HEY THERE
                #   """
                #
                # triggers this condition, but the token text is::
                #
                #   '"""\\\nHEY THERE\n"""'
                #
                # so we need to figure out if the backslash is already in the
                # string token or not.
                inject_backslash = True
                if last_ttext.endswith("\\"):
                    inject_backslash = False
                elif ttype == token.STRING:
                    if "\n" in ttext and ttext.split('\n', 1)[0].endswith('\\'):
                        # It's a multi-line string and the first line ends with
                        # a backslash, so we don't need to inject another.
                        inject_backslash = False
                if inject_backslash:
                    # Figure out what column the backslash is in.
                    ccol = len(last_line.split("\n")[-2]) - 1
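                    # Worked example (illustrative): if last_line is
                    # "a = 1 + \\\n", then split("\n") gives
                    # ["a = 1 + \\", ""], [-2] is the 9-character
                    # "a = 1 + \\", and the backslash sits at column 8.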
                    # Yield the token, with a fake token type.
                    yield (
                        99999, "\\\n",
                        (slineno, ccol), (slineno, ccol+2),
                        last_line
                    )
            last_line = ltext
            if ttype not in (tokenize.NEWLINE, tokenize.NL):
                last_ttext = ttext
        yield ttype, ttext, (slineno, scol), (elineno, ecol), ltext
        last_lineno = elineno


@contract(source='unicode')
def source_token_lines(source):
    """Generate a series of lines, one for each line in `source`.

    Each line is a list of pairs, each pair is a token::

        [('key', 'def'), ('ws', ' '), ('nam', 'hello'), ('op', '('), ... ]

    Each pair has a token class, and the token text.

    If you concatenate all the token texts, and then join them with newlines,
    you should have your original `source` back, with two differences:
    trailing whitespace is not preserved, and a final line with no newline
    is indistinguishable from a final line with a newline.

    """

    ws_tokens = set([token.INDENT, token.DEDENT, token.NEWLINE, tokenize.NL])
    line = []
    col = 0

    source = source.expandtabs(8).replace('\r\n', '\n')
    tokgen = generate_tokens(source)

    for ttype, ttext, (_, scol), (_, ecol), _ in phys_tokens(tokgen):
        mark_start = True
        for part in re.split('(\n)', ttext):
            if part == '\n':
                yield line
                line = []
                col = 0
                mark_end = False
            elif part == '':
                mark_end = False
            elif ttype in ws_tokens:
                mark_end = False
            else:
                if mark_start and scol > col:
                    line.append(("ws", u" " * (scol - col)))
                    mark_start = False
                tok_class = tokenize.tok_name.get(ttype, 'xx').lower()[:3]
                if ttype == token.NAME and keyword.iskeyword(ttext):
                    tok_class = "key"
                line.append((tok_class, part))
                mark_end = True
            scol = 0
        if mark_end:
            col = ecol

    if line:
        yield line


class CachedTokenizer(object):
    """A one-element cache around tokenize.generate_tokens.

    When reporting, coverage.py tokenizes files twice, once to find the
    structure of the file, and once to syntax-color it.  Tokenizing is
    expensive, and easily cached.

    This is a one-element cache so that our twice-in-a-row tokenizing doesn't
    actually tokenize twice.

    """
    def __init__(self):
        self.last_text = None
        self.last_tokens = None

    @contract(text='unicode')
    def generate_tokens(self, text):
        """A stand-in for `tokenize.generate_tokens`."""
        if text != self.last_text:
            self.last_text = text
            readline = iternext(text.splitlines(True))
            self.last_tokens = list(tokenize.generate_tokens(readline))
        return self.last_tokens

# Create our generate_tokens cache as a callable replacement function.
generate_tokens = CachedTokenizer().generate_tokens
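
# An illustrative use of the cache (my example, not part of the original
# module): two calls in a row with the same text return the very same list
# object, so the second tokenization is free:
#
#     toks1 = generate_tokens(u"x = 1\n")
#     toks2 = generate_tokens(u"x = 1\n")
#     assert toks1 is toks2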


COOKIE_RE = re.compile(r"^[ \t]*#.*coding[:=][ \t]*([-\w.]+)", flags=re.MULTILINE)
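
# For example (illustrative): COOKIE_RE matches PEP 263 cookies such as
# "# -*- coding: utf-8 -*-" or "# coding=latin-1", capturing the encoding
# name ("utf-8", "latin-1") in group 1.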

@contract(source='bytes')
def _source_encoding_py2(source):
    """Determine the encoding for `source`, according to PEP 263.

    `source` is a byte string, the text of the program.

    Returns a string, the name of the encoding.

    """
    assert isinstance(source, bytes)

    # Do this so the detect_encoding code we copied will work.
    readline = iternext(source.splitlines(True))

    # This is mostly code adapted from Py3.2's tokenize module.

    def _get_normal_name(orig_enc):
        """Imitates get_normal_name in tokenizer.c."""
        # Only care about the first 12 characters.
        enc = orig_enc[:12].lower().replace("_", "-")
        if re.match(r"^utf-8($|-)", enc):
            return "utf-8"
        if re.match(r"^(latin-1|iso-8859-1|iso-latin-1)($|-)", enc):
            return "iso-8859-1"
        return orig_enc

    # From detect_encoding():
    # It detects the encoding from the presence of a UTF-8 BOM or an encoding
    # cookie as specified in PEP 263.  If both a BOM and a cookie are present,
    # but disagree, a SyntaxError will be raised.  If the encoding cookie is an
    # invalid charset, raise a SyntaxError.  Note that if a UTF-8 BOM is found,
    # 'utf-8-sig' is returned.

    # If no encoding is specified, then the default will be returned.
    default = 'ascii'

    bom_found = False
    encoding = None

    def read_or_stop():
        """Get the next source line, or ''."""
        try:
            return readline()
        except StopIteration:
            return ''

    def find_cookie(line):
        """Find an encoding cookie in `line`."""
        try:
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            return None

        matches = COOKIE_RE.findall(line_string)
        if not matches:
            return None
        encoding = _get_normal_name(matches[0])
        try:
            codec = codecs.lookup(encoding)
        except LookupError:
            # This behavior mimics the Python interpreter.
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            # codecs in 2.3 were raw tuples of functions, assume the best.
            codec_name = getattr(codec, 'name', encoding)
            if codec_name != 'utf-8':
                # This behavior mimics the Python interpreter.
                raise SyntaxError('encoding problem: utf-8')
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(codecs.BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default

    encoding = find_cookie(first)
    if encoding:
        return encoding

    second = read_or_stop()
    if not second:
        return default

    encoding = find_cookie(second)
    if encoding:
        return encoding

    return default
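
# Hedged examples of the behavior above (my reading of PEP 263, not taken
# from the original source):
#
#     _source_encoding_py2(b"# coding: latin_1\nx = 1\n")    # 'iso-8859-1'
#     _source_encoding_py2(b"x = 1\n")                       # 'ascii'
#     _source_encoding_py2(codecs.BOM_UTF8 + b"x = 1\n")     # 'utf-8-sig'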


@contract(source='bytes')
def _source_encoding_py3(source):
    """Determine the encoding for `source`, according to PEP 263.

    `source` is a byte string, the text of the program.

    Returns a string, the name of the encoding.

    """
    readline = iternext(source.splitlines(True))
    return tokenize.detect_encoding(readline)[0]


if env.PY3:
    source_encoding = _source_encoding_py3
else:
    source_encoding = _source_encoding_py2
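
# Illustrative calls (my assumption, consistent on both major versions):
#
#     source_encoding(b"# -*- coding: utf-8 -*-\nx = 1\n")   # 'utf-8'
#     source_encoding(codecs.BOM_UTF8 + b"x = 1\n")          # 'utf-8-sig'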


@contract(source='unicode')
def compile_unicode(source, filename, mode):
    """Just like the `compile` builtin, but works on any Unicode string.

    Python 2's compile() builtin has a stupid restriction: if the source
    string is Unicode, then it may not have an encoding declaration in it.
    Why not?  Who knows!  It also decodes to utf8, and then tries to
    interpret those utf8 bytes according to the encoding declaration.
    Why?  Who knows!

    This function neuters the coding declaration, and compiles the source.

    """
    source = neuter_encoding_declaration(source)
    if env.PY2 and isinstance(filename, unicode_class):
        filename = filename.encode(sys.getfilesystemencoding(), "replace")
    code = compile(source, filename, mode)
    return code
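
# A minimal usage sketch (my example, not from the original source):
#
#     code = compile_unicode(u"# coding: utf-8\nx = 1\n", "<test>", "exec")
#     exec(code, {})
#
# On Python 2, a bare compile() of that same Unicode string would raise
# "SyntaxError: encoding declaration in Unicode string".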


@contract(source='unicode', returns='unicode')
def neuter_encoding_declaration(source):
    """Return `source`, with any encoding declaration neutered."""
    if COOKIE_RE.search(source):
        source_lines = source.splitlines(True)
        for lineno in range(min(2, len(source_lines))):
            source_lines[lineno] = COOKIE_RE.sub(
                "# (deleted declaration)", source_lines[lineno]
            )
        source = "".join(source_lines)
    return source
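
# An illustrative result (my example, not from the original source):
#
#     neuter_encoding_declaration(u"# -*- coding: utf-8 -*-\nx = 1\n")
#
# returns u"# (deleted declaration) -*-\nx = 1\n".  The substitution keeps
# the line count unchanged, so compiled line numbers still match the
# original source.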