# Licensed under the Apache License: http://www.apache.org/licenses/LICENSE-2.0
# For details: https://bitbucket.org/ned/coveragepy/src/default/NOTICE.txt

"""Better tokenizing for coverage.py."""

import codecs
import keyword
import re
import token
import tokenize

from coverage import env
from coverage.backward import iternext
from coverage.misc import contract


def phys_tokens(toks):
    """Return all physical tokens, even line continuations.

                    # Comments like this \
                    # should never result in a new token.
                    inject_backslash = False
                elif ttype == token.STRING:
                    if "\n" in ttext and ttext.split('\n', 1)[0][-1] == '\\':
                        # It's a multi-line string and the first line ends with
                        # a backslash, so we don't need to inject another.
                        inject_backslash = False
                if inject_backslash:
                    # Figure out what column the backslash is in.
                    ccol = len(last_line.split("\n")[-2]) - 1
                    # Yield the token, with a fake token type.
                    yield (
                        99999, "\\\n",
                        (slineno, ccol), (slineno, ccol+2),
                        last_line
                        )
        last_line = ltext
        last_ttype = ttype
        yield ttype, ttext, (slineno, scol), (elineno, ecol), ltext
        last_lineno = elineno
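

# A hedged illustration (not part of the original module): phys_tokens wraps a
# token stream so that backslash line continuations, which
# tokenize.generate_tokens() never reports, show up as synthetic tokens with a
# fake token type.  For example, tokenizing u"a = 1 + \\\n    2\n" and passing
# the result through phys_tokens should yield one extra token whose text is
# "\\\n", so the physical source can be reconstructed exactly.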


@contract(source='unicode')
def source_token_lines(source):
    """Generate a series of lines, one for each line in `source`.

    Each line is a list of pairs, each pair is a token::

        [('key', 'def'), ('ws', ' '), ('nam', 'hello'), ('op', '('), ... ]

    Each pair has a token class, and the token text.

    If you concatenate all the token texts, and then join them with newlines,
    you should have your original `source` back, with two differences:
    trailing whitespace is not preserved, and a final line with no newline
    is indistinguishable from a final line with a newline.

    """

    ws_tokens = set([token.INDENT, token.DEDENT, token.NEWLINE, tokenize.NL])
    line = []
    col = 0

    # The \f is because of http://bugs.python.org/issue19035
    source = source.expandtabs(8).replace('\r\n', '\n').replace('\f', ' ')
    tokgen = generate_tokens(source)

    for ttype, ttext, (_, scol), (_, ecol), _ in phys_tokens(tokgen):
        mark_start = True
        for part in re.split('(\n)', ttext):
            if part == '\n':
                yield line
                line = []
                col = 0
                mark_end = False
            elif part == '':
                mark_end = False
            elif ttype in ws_tokens:
                mark_end = False
            else:
                if mark_start and scol > col:
                    line.append(("ws", u" " * (scol - col)))
                    mark_start = False
                tok_class = tokenize.tok_name.get(ttype, 'xx').lower()[:3]
                if ttype == token.NAME and keyword.iskeyword(ttext):
                    tok_class = "key"
                line.append((tok_class, part))
                mark_end = True
            scol = 0
            if mark_end:
                col = ecol

    if line:
        yield line
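

# A hedged usage sketch (not from the original source). The token classes come
# from tokenize.tok_name truncated to three characters, with keywords mapped to
# "key" and whitespace runs to "ws":
#
#     for line in source_token_lines(u"if x:\n    y = 1\n"):
#         print(line)
#     # [('key', u'if'), ('ws', u' '), ('nam', u'x'), ('op', u':')]
#     # [('ws', u'    '), ('nam', u'y'), ('ws', u' '), ('op', u'='),
#     #  ('ws', u' '), ('num', u'1')]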


class CachedTokenizer(object):
    """A one-element cache around tokenize.generate_tokens.

    When reporting, coverage.py tokenizes files twice, once to find the
    structure of the file, and once to syntax-color it. Tokenizing is
    expensive, and easily cached.

    This is a one-element cache so that our twice-in-a-row tokenizing doesn't
    actually tokenize twice.

    """
    def __init__(self):
        self.last_text = None
        self.last_tokens = None

    @contract(text='unicode')
    def generate_tokens(self, text):
        """A stand-in for `tokenize.generate_tokens`."""
        if text != self.last_text:
            self.last_text = text
            readline = iternext(text.splitlines(True))
            self.last_tokens = list(tokenize.generate_tokens(readline))
        return self.last_tokens

# Create our generate_tokens cache as a callable replacement function.
generate_tokens = CachedTokenizer().generate_tokens
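
# A hedged note (not in the original file): because the cache is one element
# deep, two consecutive calls with the same unicode text return the identical
# cached list, so the second call skips tokenizing; for instance
# generate_tokens(src) is generate_tokens(src) would be expected to be True.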


COOKIE_RE = re.compile(r"^\s*#.*coding[:=]\s*([-\w.]+)", flags=re.MULTILINE)
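
# A hedged example (added here, not in the original): COOKIE_RE matches PEP 263
# coding declarations such as "# coding: utf-8" or "# -*- coding: latin-1 -*-",
# capturing the encoding name ("utf-8", "latin-1") in group 1.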

@contract(source='bytes')
def _source_encoding_py2(source):
    """Determine the encoding for `source`, according to PEP 263.

    `source` is a byte string, the text of the program.

    Returns a string, the name of the encoding.

    """
    assert isinstance(source, bytes)

    # Do this so the detect_encode code we copied will work.
    readline = iternext(source.splitlines(True))

    # This is mostly code adapted from Py3.2's tokenize module.

    def _get_normal_name(orig_enc):
        """Imitates get_normal_name in tokenizer.c."""
        # Only care about the first 12 characters.
        enc = orig_enc[:12].lower().replace("_", "-")
        if re.match(r"^utf-8($|-)", enc):
            return "utf-8"
        if re.match(r"^(latin-1|iso-8859-1|iso-latin-1)($|-)", enc):
            return "iso-8859-1"
        return orig_enc

    # From detect_encode():
    # It detects the encoding from the presence of a UTF-8 BOM or an encoding
    # cookie as specified in PEP-0263. If both a BOM and a cookie are present,
    # but disagree, a SyntaxError will be raised. If the encoding cookie is an
    # invalid charset, raise a SyntaxError. Note that if a UTF-8 BOM is found,
    # 'utf-8-sig' is returned.

    # If no encoding is specified, then the default will be returned.
    default = 'ascii'

    bom_found = False
    encoding = None

    def read_or_stop():
        """Get the next source line, or ''."""
        try:
            return readline()
        except StopIteration:
            return ''

    def find_cookie(line):
        """Find an encoding cookie in the line."""
        try:
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            return None

        matches = COOKIE_RE.findall(line_string)
        if not matches:
            return None
        encoding = _get_normal_name(matches[0])
        try:
            codec = codecs.lookup(encoding)
        except LookupError:
            # This behavior mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            # codecs in 2.3 were raw tuples of functions, assume the best.
            codec_name = getattr(codec, 'name', encoding)
            if codec_name != 'utf-8':
                # This behavior mimics the Python interpreter
                raise SyntaxError('encoding problem: utf-8')
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(codecs.BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default

    encoding = find_cookie(first)
    if encoding:
        return encoding

    second = read_or_stop()
    if not second:
        return default

    encoding = find_cookie(second)
    if encoding:
        return encoding

    return default


@contract(source='bytes')
def _source_encoding_py3(source):
    """Determine the encoding for `source`, according to PEP 263.

    `source` is a byte string: the text of the program.

    Returns a string, the name of the encoding.

    """
    readline = iternext(source.splitlines(True))
    return tokenize.detect_encoding(readline)[0]


if env.PY3:
    source_encoding = _source_encoding_py3
else:
    source_encoding = _source_encoding_py2
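
# A hedged usage sketch (not part of the original file): given the byte content
# of a file, source_encoding returns the declared or implied encoding, e.g.
# source_encoding(b"# coding: utf-8\nx = 1\n") should return "utf-8", and
# "utf-8-sig" if the bytes start with a UTF-8 BOM.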


@contract(source='unicode')
def compile_unicode(source, filename, mode):
    """Just like the `compile` builtin, but works on any Unicode string.

    Python 2's compile() builtin has a stupid restriction: if the source string
    is Unicode, then it may not have an encoding declaration in it. Why not?
    Who knows!

    This function catches that exception, neuters the coding declaration, and
    compiles it anyway.

    """
    try:
        code = compile(source, filename, mode)
    except SyntaxError as synerr:
        if "coding declaration in unicode string" not in synerr.args[0].lower():
            raise
        source = neuter_encoding_declaration(source)
        code = compile(source, filename, mode)

    return code
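
# A hedged usage sketch (not in the original source): on Python 2,
# compile(u"# coding: utf-8\nx = 1\n", "<src>", "exec") raises SyntaxError
# because of the coding declaration, while compile_unicode with the same
# arguments should return a code object, since the declaration is neutered
# before the retry.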


@contract(source='unicode', returns='unicode')
def neuter_encoding_declaration(source):
    """Return `source`, with any encoding declaration neutered.

    This function will only ever be called on `source` that has an encoding
    declaration, so some edge cases can be ignored.

    """
    source = COOKIE_RE.sub("# (deleted declaration)", source)
    return source
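
# A hedged example (not from the original module): neutering keeps the number
# of lines intact, e.g. neuter_encoding_declaration(u"# coding: utf-8\nx = 1\n")
# should return u"# (deleted declaration)\nx = 1\n".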