DebugClients/Python/coverage/phystokens.py

changeset 4489:d0d6e4ad31bd
parent 3499:f2d4b02c7e88
child 4491:0d8612e24fef
diff -r 456c58fc64b0 -r d0d6e4ad31bd DebugClients/Python/coverage/phystokens.py
--- a/DebugClients/Python/coverage/phystokens.py	Sun Oct 04 13:35:09 2015 +0200
+++ b/DebugClients/Python/coverage/phystokens.py	Sun Oct 04 22:37:56 2015 +0200
@@ -1,8 +1,17 @@
+# Licensed under the Apache License: http://www.apache.org/licenses/LICENSE-2.0
+# For details: https://bitbucket.org/ned/coveragepy/src/default/NOTICE.txt
+
 """Better tokenizing for coverage.py."""
 
-import codecs, keyword, re, sys, token, tokenize
-from .backward import set                       # pylint: disable=W0622
-from .parser import generate_tokens
+import codecs
+import keyword
+import re
+import token
+import tokenize
+
+from coverage import env
+from coverage.backward import iternext
+from coverage.misc import contract
 
 
 def phys_tokens(toks):
@@ -43,7 +52,7 @@
                     inject_backslash = False
                 elif ttype == token.STRING:
                     if "\n" in ttext and ttext.split('\n', 1)[0][-1] == '\\':
-                        # It's a multiline string and the first line ends with
+                        # It's a multi-line string and the first line ends with
                         # a backslash, so we don't need to inject another.
                         inject_backslash = False
                 if inject_backslash:
@@ -61,6 +70,7 @@
         last_lineno = elineno
 
 
+@contract(source='unicode')
 def source_token_lines(source):
     """Generate a series of lines, one for each line in `source`.
 
@@ -76,11 +86,15 @@
     is indistinguishable from a final line with a newline.
 
     """
+
     ws_tokens = set([token.INDENT, token.DEDENT, token.NEWLINE, tokenize.NL])
     line = []
     col = 0
-    source = source.expandtabs(8).replace('\r\n', '\n')
+
+    # The \f is because of http://bugs.python.org/issue19035
+    source = source.expandtabs(8).replace('\r\n', '\n').replace('\f', ' ')
     tokgen = generate_tokens(source)
+
     for ttype, ttext, (_, scol), (_, ecol), _ in phys_tokens(tokgen):
         mark_start = True
         for part in re.split('(\n)', ttext):
@@ -95,7 +109,7 @@
                 mark_end = False
             else:
                 if mark_start and scol > col:
-                    line.append(("ws", " " * (scol - col)))
+                    line.append(("ws", u" " * (scol - col)))
                     mark_start = False
                 tok_class = tokenize.tok_name.get(ttype, 'xx').lower()[:3]
                 if ttype == token.NAME and keyword.iskeyword(ttext):
@@ -109,23 +123,53 @@
     if line:
         yield line
 
-def source_encoding(source):
-    """Determine the encoding for `source` (a string), according to PEP 263.
+
+class CachedTokenizer(object):
+    """A one-element cache around tokenize.generate_tokens.
+
+    When reporting, coverage.py tokenizes files twice, once to find the
+    structure of the file, and once to syntax-color it.  Tokenizing is
+    expensive, and easily cached.
+
+    This is a one-element cache so that our twice-in-a-row tokenizing doesn't
+    actually tokenize twice.
+
+    """
+    def __init__(self):
+        self.last_text = None
+        self.last_tokens = None
+
+    @contract(text='unicode')
+    def generate_tokens(self, text):
+        """A stand-in for `tokenize.generate_tokens`."""
+        if text != self.last_text:
+            self.last_text = text
+            readline = iternext(text.splitlines(True))
+            self.last_tokens = list(tokenize.generate_tokens(readline))
+        return self.last_tokens
+
+# Create our generate_tokens cache as a callable replacement function.
+generate_tokens = CachedTokenizer().generate_tokens
+
+
+COOKIE_RE = re.compile(r"^\s*#.*coding[:=]\s*([-\w.]+)", flags=re.MULTILINE)
+
+@contract(source='bytes')
+def _source_encoding_py2(source):
+    """Determine the encoding for `source`, according to PEP 263.
+
+    `source` is a byte string, the text of the program.
 
     Returns a string, the name of the encoding.
 
     """
-    # Note: this function should never be called on Python 3, since py3 has
-    # built-in tools to do this.
-    assert sys.version_info < (3, 0)
+    assert isinstance(source, bytes)
+
+    # Do this so the detect_encode code we copied will work.
+    readline = iternext(source.splitlines(True))
 
     # This is mostly code adapted from Py3.2's tokenize module.
 
-    cookie_re = re.compile(r"coding[:=]\s*([-\w.]+)")
-
-    # Do this so the detect_encode code we copied will work.
-    readline = iter(source.splitlines(True)).next
-
     def _get_normal_name(orig_enc):
         """Imitates get_normal_name in tokenizer.c."""
         # Only care about the first 12 characters.
@@ -137,19 +181,14 @@
         return orig_enc
 
     # From detect_encode():
-    # It detects the encoding from the presence of a utf-8 bom or an encoding
-    # cookie as specified in pep-0263.  If both a bom and a cookie are present,
+    # It detects the encoding from the presence of a UTF-8 BOM or an encoding
+    # cookie as specified in PEP-0263.  If both a BOM and a cookie are present,
     # but disagree, a SyntaxError will be raised.  If the encoding cookie is an
-    # invalid charset, raise a SyntaxError.  Note that if a utf-8 bom is found,
+    # invalid charset, raise a SyntaxError.  Note that if a UTF-8 BOM is found,
     # 'utf-8-sig' is returned.
 
-    # If no encoding is specified, then the default will be returned.  The
-    # default varied with version.
-
-    if sys.version_info <= (2, 4):
-        default = 'iso-8859-1'
-    else:
-        default = 'ascii'
+    # If no encoding is specified, then the default will be returned.
+    default = 'ascii'
 
     bom_found = False
     encoding = None
@@ -168,21 +207,21 @@
         except UnicodeDecodeError:
             return None
 
-        matches = cookie_re.findall(line_string)
+        matches = COOKIE_RE.findall(line_string)
         if not matches:
             return None
         encoding = _get_normal_name(matches[0])
         try:
             codec = codecs.lookup(encoding)
         except LookupError:
-            # This behaviour mimics the Python interpreter
+            # This behavior mimics the Python interpreter
             raise SyntaxError("unknown encoding: " + encoding)
 
         if bom_found:
             # codecs in 2.3 were raw tuples of functions, assume the best.
             codec_name = getattr(codec, 'name', encoding)
             if codec_name != 'utf-8':
-                # This behaviour mimics the Python interpreter
+                # This behavior mimics the Python interpreter
                 raise SyntaxError('encoding problem: utf-8')
             encoding += '-sig'
         return encoding
@@ -209,5 +248,56 @@
 
     return default
 
-#
-# eflag: FileType = Python2
+
+@contract(source='bytes')
+def _source_encoding_py3(source):
+    """Determine the encoding for `source`, according to PEP 263.
+
+    `source` is a byte string: the text of the program.
+
+    Returns a string, the name of the encoding.
+
+    """
+    readline = iternext(source.splitlines(True))
+    return tokenize.detect_encoding(readline)[0]
+
+
+if env.PY3:
+    source_encoding = _source_encoding_py3
+else:
+    source_encoding = _source_encoding_py2
+
+
+@contract(source='unicode')
+def compile_unicode(source, filename, mode):
+    """Just like the `compile` builtin, but works on any Unicode string.
+
+    Python 2's compile() builtin has a stupid restriction: if the source string
+    is Unicode, then it may not have an encoding declaration in it.  Why not?
+    Who knows!
+
+    This function catches that exception, neuters the coding declaration, and
+    compiles it anyway.
+
+    """
+    try:
+        code = compile(source, filename, mode)
+    except SyntaxError as synerr:
+        if "coding declaration in unicode string" not in synerr.args[0].lower():
+            raise
+        source = neuter_encoding_declaration(source)
+        code = compile(source, filename, mode)
+
+    return code
+
+
+@contract(source='unicode', returns='unicode')
+def neuter_encoding_declaration(source):
+    """Return `source`, with any encoding declaration neutered.
+
+    This function will only ever be called on `source` that has an encoding
+    declaration, so some edge cases can be ignored.
+
+    """
+    source = COOKIE_RE.sub("# (deleted declaration)", source)
+    return source
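
For orientation, a minimal usage sketch (not part of the changeset) of how the refactored source_token_lines is typically driven once this module is importable as coverage.phystokens; the sample source text is made up:

    # Illustrative only; the sample text is invented.  source_token_lines
    # now expects a unicode string, per the @contract decorator above.
    from coverage.phystokens import source_token_lines

    source = u'if x:\n    print("hi")\n'

    # Each yielded item is one physical line, as a list of
    # (token_class, text) pairs, e.g.
    # [('key', u'if'), ('ws', u' '), ('nam', u'x'), ('op', u':')]
    for line in source_token_lines(source):
        print(line)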

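And a similarly hedged sketch of the encoding/compile path that the source_encoding split and compile_unicode exist for, aimed at Python 2; the file name is hypothetical:

    # Illustrative only; "example.py" is a hypothetical file.
    from coverage.phystokens import source_encoding, compile_unicode

    with open("example.py", "rb") as f:
        source_bytes = f.read()               # raw bytes, as @contract(source='bytes') expects

    encoding = source_encoding(source_bytes)  # PEP 263 cookie / BOM detection
    source = source_bytes.decode(encoding)    # 'utf-8-sig' also strips a BOM here

    # Python 2's compile() rejects unicode source that still carries a coding
    # cookie; compile_unicode neuters the declaration and retries.
    code = compile_unicode(source, "example.py", "exec")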