eric7/DebugClients/Python/coverage/phystokens.py

branch:      eric7
changeset:   8775:0802ae193343
parent:      8527:2bd1325d727e
child:       9099:0e511e0e94a3
--- a/eric7/DebugClients/Python/coverage/phystokens.py	Fri Nov 19 19:28:47 2021 +0100
+++ b/eric7/DebugClients/Python/coverage/phystokens.py	Sat Nov 20 16:47:38 2021 +0100
@@ -3,15 +3,13 @@
 
 """Better tokenizing for coverage.py."""
 
-import codecs
+import ast
 import keyword
 import re
-import sys
 import token
 import tokenize
 
 from coverage import env
-from coverage.backward import iternext, unicode_class
 from coverage.misc import contract
 
 
@@ -70,6 +68,21 @@
         last_lineno = elineno
 
 
+class MatchCaseFinder(ast.NodeVisitor):
+    """Helper for finding match/case lines."""
+    def __init__(self, source):
+        # This will be the set of line numbers that start match or case statements.
+        self.match_case_lines = set()
+        self.visit(ast.parse(source))
+
+    def visit_Match(self, node):
+        """Invoked by ast.NodeVisitor.visit"""
+        self.match_case_lines.add(node.lineno)
+        for case in node.cases:
+            self.match_case_lines.add(case.pattern.lineno)
+        self.generic_visit(node)
+
+
 @contract(source='unicode')
 def source_token_lines(source):
     """Generate a series of lines, one for each line in `source`.
@@ -94,7 +107,10 @@
     source = source.expandtabs(8).replace('\r\n', '\n')
     tokgen = generate_tokens(source)
 
-    for ttype, ttext, (_, scol), (_, ecol), _ in phys_tokens(tokgen):
+    if env.PYBEHAVIOR.soft_keywords:
+        match_case_lines = MatchCaseFinder(source).match_case_lines
+
+    for ttype, ttext, (sline, scol), (_, ecol), _ in phys_tokens(tokgen):
         mark_start = True
         for part in re.split('(\n)', ttext):
             if part == '\n':
@@ -108,11 +124,24 @@
                 mark_end = False
             else:
                 if mark_start and scol > col:
-                    line.append(("ws", u" " * (scol - col)))
+                    line.append(("ws", " " * (scol - col)))
                     mark_start = False
                 tok_class = tokenize.tok_name.get(ttype, 'xx').lower()[:3]
-                if ttype == token.NAME and keyword.iskeyword(ttext):
-                    tok_class = "key"
+                if ttype == token.NAME:
+                    if keyword.iskeyword(ttext):
+                        # Hard keywords are always keywords.
+                        tok_class = "key"
+                    elif env.PYBEHAVIOR.soft_keywords and keyword.issoftkeyword(ttext):
+                        # Soft keywords appear at the start of the line, on lines that start
+                        # match or case statements.
+                        if len(line) == 0:
+                            is_start_of_line = True
+                        elif (len(line) == 1) and line[0][0] == "ws":
+                            is_start_of_line = True
+                        else:
+                            is_start_of_line = False
+                        if is_start_of_line and sline in match_case_lines:
+                            tok_class = "key"
                 line.append((tok_class, part))
                 mark_end = True
             scol = 0
@@ -123,7 +152,7 @@
         yield line
 
 
-class CachedTokenizer(object):
+class CachedTokenizer:
     """A one-element cache around tokenize.generate_tokens.
 
     When reporting, coverage.py tokenizes files twice, once to find the
@@ -143,7 +172,7 @@
         """A stand-in for `tokenize.generate_tokens`."""
         if text != self.last_text:
             self.last_text = text
-            readline = iternext(text.splitlines(True))
+            readline = iter(text.splitlines(True)).__next__
             self.last_tokens = list(tokenize.generate_tokens(readline))
         return self.last_tokens
 
@@ -154,102 +183,7 @@
 COOKIE_RE = re.compile(r"^[ \t]*#.*coding[:=][ \t]*([-\w.]+)", flags=re.MULTILINE)
 
 @contract(source='bytes')
-def _source_encoding_py2(source):
-    """Determine the encoding for `source`, according to PEP 263.
-
-    `source` is a byte string, the text of the program.
-
-    Returns a string, the name of the encoding.
-
-    """
-    assert isinstance(source, bytes)
-
-    # Do this so the detect_encode code we copied will work.
-    readline = iternext(source.splitlines(True))
-
-    # This is mostly code adapted from Py3.2's tokenize module.
-
-    def _get_normal_name(orig_enc):
-        """Imitates get_normal_name in tokenizer.c."""
-        # Only care about the first 12 characters.
-        enc = orig_enc[:12].lower().replace("_", "-")
-        if re.match(r"^utf-8($|-)", enc):
-            return "utf-8"
-        if re.match(r"^(latin-1|iso-8859-1|iso-latin-1)($|-)", enc):
-            return "iso-8859-1"
-        return orig_enc
-
-    # From detect_encode():
-    # It detects the encoding from the presence of a UTF-8 BOM or an encoding
-    # cookie as specified in PEP-0263.  If both a BOM and a cookie are present,
-    # but disagree, a SyntaxError will be raised.  If the encoding cookie is an
-    # invalid charset, raise a SyntaxError.  Note that if a UTF-8 BOM is found,
-    # 'utf-8-sig' is returned.
-
-    # If no encoding is specified, then the default will be returned.
-    default = 'ascii'
-
-    bom_found = False
-    encoding = None
-
-    def read_or_stop():
-        """Get the next source line, or ''."""
-        try:
-            return readline()
-        except StopIteration:
-            return ''
-
-    def find_cookie(line):
-        """Find an encoding cookie in `line`."""
-        try:
-            line_string = line.decode('ascii')
-        except UnicodeDecodeError:
-            return None
-
-        matches = COOKIE_RE.findall(line_string)
-        if not matches:
-            return None
-        encoding = _get_normal_name(matches[0])
-        try:
-            codec = codecs.lookup(encoding)
-        except LookupError:
-            # This behavior mimics the Python interpreter
-            raise SyntaxError("unknown encoding: " + encoding)
-
-        if bom_found:
-            # codecs in 2.3 were raw tuples of functions, assume the best.
-            codec_name = getattr(codec, 'name', encoding)
-            if codec_name != 'utf-8':
-                # This behavior mimics the Python interpreter
-                raise SyntaxError('encoding problem: utf-8')
-            encoding += '-sig'
-        return encoding
-
-    first = read_or_stop()
-    if first.startswith(codecs.BOM_UTF8):
-        bom_found = True
-        first = first[3:]
-        default = 'utf-8-sig'
-    if not first:
-        return default
-
-    encoding = find_cookie(first)
-    if encoding:
-        return encoding
-
-    second = read_or_stop()
-    if not second:
-        return default
-
-    encoding = find_cookie(second)
-    if encoding:
-        return encoding
-
-    return default
-
-
-@contract(source='bytes')
-def _source_encoding_py3(source):
+def source_encoding(source):
     """Determine the encoding for `source`, according to PEP 263.
 
     `source` is a byte string: the text of the program.
@@ -257,31 +191,23 @@
     Returns a string, the name of the encoding.
 
     """
-    readline = iternext(source.splitlines(True))
+    readline = iter(source.splitlines(True)).__next__
     return tokenize.detect_encoding(readline)[0]
 
 
-if env.PY3:
-    source_encoding = _source_encoding_py3
-else:
-    source_encoding = _source_encoding_py2
-
-
 @contract(source='unicode')
 def compile_unicode(source, filename, mode):
     """Just like the `compile` builtin, but works on any Unicode string.
 
     Python 2's compile() builtin has a stupid restriction: if the source string
     is Unicode, then it may not have a encoding declaration in it.  Why not?
-    Who knows!  It also decodes to utf8, and then tries to interpret those utf8
-    bytes according to the encoding declaration.  Why? Who knows!
+    Who knows!  It also decodes to utf-8, and then tries to interpret those
+    utf-8 bytes according to the encoding declaration.  Why? Who knows!
 
     This function neuters the coding declaration, and compiles it.
 
     """
     source = neuter_encoding_declaration(source)
-    if env.PY2 and isinstance(filename, unicode_class):
-        filename = filename.encode(sys.getfilesystemencoding(), "replace")
     code = compile(source, filename, mode)
     return code
 
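
For reference, the MatchCaseFinder class introduced above can be exercised on its own. The sketch below is a minimal, self-contained version of the same idea; it needs Python 3.10 or later so that ast.parse accepts match/case syntax, and the SOURCE sample is made up purely for illustration.

import ast

class MatchCaseFinder(ast.NodeVisitor):
    """Collect the line numbers that start match or case statements."""
    def __init__(self, source):
        self.match_case_lines = set()
        self.visit(ast.parse(source))

    def visit_Match(self, node):
        # Record the match statement itself and the line of each case pattern.
        self.match_case_lines.add(node.lineno)
        for case in node.cases:
            self.match_case_lines.add(case.pattern.lineno)
        self.generic_visit(node)

SOURCE = """\
match command:
    case "go":
        move()
    case _:
        wait()
"""

print(sorted(MatchCaseFinder(SOURCE).match_case_lines))  # [1, 2, 4]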

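The new handling of NAME tokens leans on the hard/soft keyword split in the standard keyword module (keyword.issoftkeyword is available from Python 3.9 on). A quick illustration of why both checks are needed, using token texts picked for the example:

import keyword

print(keyword.iskeyword("match"))      # False: "match" is only a soft keyword
print(keyword.issoftkeyword("match"))  # True
print(keyword.iskeyword("if"))         # True: hard keywords are always keywords
print(keyword.issoftkeyword("for"))    # False

Only NAME tokens that are soft keywords, sit at the start of a line, and fall on a line recorded by MatchCaseFinder are classified as "key".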
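The simplified source_encoding() above delegates PEP 263 detection entirely to the standard library; the removed Python 2 path re-implemented the same BOM and coding-cookie handling by hand. A small sketch of the new call pattern, assuming a made-up byte string that carries a coding cookie:

import tokenize

source = b"# -*- coding: iso-8859-1 -*-\nx = 1\n"
readline = iter(source.splitlines(True)).__next__
print(tokenize.detect_encoding(readline)[0])  # iso-8859-1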