--- a/eric7/DebugClients/Python/coverage/phystokens.py	Fri Nov 19 19:28:47 2021 +0100
+++ b/eric7/DebugClients/Python/coverage/phystokens.py	Sat Nov 20 16:47:38 2021 +0100
@@ -3,15 +3,13 @@
 
 """Better tokenizing for coverage.py."""
 
-import codecs
+import ast
 import keyword
 import re
-import sys
 import token
 import tokenize
 
 from coverage import env
-from coverage.backward import iternext, unicode_class
 from coverage.misc import contract
 
 
@@ -70,6 +68,21 @@
         last_lineno = elineno
 
 
+class MatchCaseFinder(ast.NodeVisitor):
+    """Helper for finding match/case lines."""
+    def __init__(self, source):
+        # This will be the set of line numbers that start match or case statements.
+        self.match_case_lines = set()
+        self.visit(ast.parse(source))
+
+    def visit_Match(self, node):
+        """Invoked by ast.NodeVisitor.visit"""
+        self.match_case_lines.add(node.lineno)
+        for case in node.cases:
+            self.match_case_lines.add(case.pattern.lineno)
+        self.generic_visit(node)
+
+
 @contract(source='unicode')
 def source_token_lines(source):
     """Generate a series of lines, one for each line in `source`.
@@ -94,7 +107,10 @@
     source = source.expandtabs(8).replace('\r\n', '\n')
     tokgen = generate_tokens(source)
 
-    for ttype, ttext, (_, scol), (_, ecol), _ in phys_tokens(tokgen):
+    if env.PYBEHAVIOR.soft_keywords:
+        match_case_lines = MatchCaseFinder(source).match_case_lines
+
+    for ttype, ttext, (sline, scol), (_, ecol), _ in phys_tokens(tokgen):
         mark_start = True
         for part in re.split('(\n)', ttext):
             if part == '\n':
@@ -108,11 +124,24 @@
                 mark_end = False
             else:
                 if mark_start and scol > col:
-                    line.append(("ws", u" " * (scol - col)))
+                    line.append(("ws", " " * (scol - col)))
                     mark_start = False
                 tok_class = tokenize.tok_name.get(ttype, 'xx').lower()[:3]
-                if ttype == token.NAME and keyword.iskeyword(ttext):
-                    tok_class = "key"
+                if ttype == token.NAME:
+                    if keyword.iskeyword(ttext):
+                        # Hard keywords are always keywords.
+                        tok_class = "key"
+                    elif env.PYBEHAVIOR.soft_keywords and keyword.issoftkeyword(ttext):
+                        # Soft keywords appear at the start of the line, on lines that start
+                        # match or case statements.
+                        if len(line) == 0:
+                            is_start_of_line = True
+                        elif (len(line) == 1) and line[0][0] == "ws":
+                            is_start_of_line = True
+                        else:
+                            is_start_of_line = False
+                        if is_start_of_line and sline in match_case_lines:
+                            tok_class = "key"
                 line.append((tok_class, part))
                 mark_end = True
             scol = 0
@@ -123,7 +152,7 @@
         yield line
 
 
-class CachedTokenizer(object):
+class CachedTokenizer:
     """A one-element cache around tokenize.generate_tokens.
 
     When reporting, coverage.py tokenizes files twice, once to find the
@@ -143,7 +172,7 @@
         """A stand-in for `tokenize.generate_tokens`."""
         if text != self.last_text:
             self.last_text = text
-            readline = iternext(text.splitlines(True))
+            readline = iter(text.splitlines(True)).__next__
            self.last_tokens = list(tokenize.generate_tokens(readline))
         return self.last_tokens
 
@@ -154,102 +183,7 @@
 COOKIE_RE = re.compile(r"^[ \t]*#.*coding[:=][ \t]*([-\w.]+)", flags=re.MULTILINE)
 
 @contract(source='bytes')
-def _source_encoding_py2(source):
-    """Determine the encoding for `source`, according to PEP 263.
-
-    `source` is a byte string, the text of the program.
-
-    Returns a string, the name of the encoding.
-
-    """
-    assert isinstance(source, bytes)
-
-    # Do this so the detect_encode code we copied will work.
-    readline = iternext(source.splitlines(True))
-
-    # This is mostly code adapted from Py3.2's tokenize module.
-
-    def _get_normal_name(orig_enc):
-        """Imitates get_normal_name in tokenizer.c."""
-        # Only care about the first 12 characters.
-        enc = orig_enc[:12].lower().replace("_", "-")
-        if re.match(r"^utf-8($|-)", enc):
-            return "utf-8"
-        if re.match(r"^(latin-1|iso-8859-1|iso-latin-1)($|-)", enc):
-            return "iso-8859-1"
-        return orig_enc
-
-    # From detect_encode():
-    # It detects the encoding from the presence of a UTF-8 BOM or an encoding
-    # cookie as specified in PEP-0263. If both a BOM and a cookie are present,
-    # but disagree, a SyntaxError will be raised. If the encoding cookie is an
-    # invalid charset, raise a SyntaxError. Note that if a UTF-8 BOM is found,
-    # 'utf-8-sig' is returned.
-
-    # If no encoding is specified, then the default will be returned.
-    default = 'ascii'
-
-    bom_found = False
-    encoding = None
-
-    def read_or_stop():
-        """Get the next source line, or ''."""
-        try:
-            return readline()
-        except StopIteration:
-            return ''
-
-    def find_cookie(line):
-        """Find an encoding cookie in `line`."""
-        try:
-            line_string = line.decode('ascii')
-        except UnicodeDecodeError:
-            return None
-
-        matches = COOKIE_RE.findall(line_string)
-        if not matches:
-            return None
-        encoding = _get_normal_name(matches[0])
-        try:
-            codec = codecs.lookup(encoding)
-        except LookupError:
-            # This behavior mimics the Python interpreter
-            raise SyntaxError("unknown encoding: " + encoding)
-
-        if bom_found:
-            # codecs in 2.3 were raw tuples of functions, assume the best.
-            codec_name = getattr(codec, 'name', encoding)
-            if codec_name != 'utf-8':
-                # This behavior mimics the Python interpreter
-                raise SyntaxError('encoding problem: utf-8')
-            encoding += '-sig'
-        return encoding
-
-    first = read_or_stop()
-    if first.startswith(codecs.BOM_UTF8):
-        bom_found = True
-        first = first[3:]
-        default = 'utf-8-sig'
-    if not first:
-        return default
-
-    encoding = find_cookie(first)
-    if encoding:
-        return encoding
-
-    second = read_or_stop()
-    if not second:
-        return default
-
-    encoding = find_cookie(second)
-    if encoding:
-        return encoding
-
-    return default
-
-
-@contract(source='bytes')
-def _source_encoding_py3(source):
+def source_encoding(source):
     """Determine the encoding for `source`, according to PEP 263.
 
     `source` is a byte string: the text of the program.
@@ -257,31 +191,23 @@
 
     Returns a string, the name of the encoding.
 
     """
-    readline = iternext(source.splitlines(True))
+    readline = iter(source.splitlines(True)).__next__
     return tokenize.detect_encoding(readline)[0]
 
 
-if env.PY3:
-    source_encoding = _source_encoding_py3
-else:
-    source_encoding = _source_encoding_py2
-
-
 @contract(source='unicode')
 def compile_unicode(source, filename, mode):
     """Just like the `compile` builtin, but works on any Unicode string.
 
     Python 2's compile() builtin has a stupid restriction: if the source string
     is Unicode, then it may not have a encoding declaration in it. Why not?
-    Who knows! It also decodes to utf8, and then tries to interpret those utf8
-    bytes according to the encoding declaration. Why? Who knows!
+    Who knows! It also decodes to utf-8, and then tries to interpret those
+    utf-8 bytes according to the encoding declaration. Why? Who knows!
 
     This function neuters the coding declaration, and compiles it.
 
     """
     source = neuter_encoding_declaration(source)
-    if env.PY2 and isinstance(filename, unicode_class):
-        filename = filename.encode(sys.getfilesystemencoding(), "replace")
     code = compile(source, filename, mode)
     return code
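Note on the MatchCaseFinder addition: `match` and `case` are only soft keywords in Python 3.10+, so `keyword.iskeyword()` does not report them; the patch therefore runs an AST pass to find the lines that really start match/case statements, and `source_token_lines()` marks them as keywords only there. A minimal standalone sketch of that behaviour, assuming Python 3.10+ (the SOURCE sample and the driver code around the class are made up for illustration; the class body is taken from the patch):

import ast
import keyword

class MatchCaseFinder(ast.NodeVisitor):
    """Collect the line numbers that start match or case statements."""
    def __init__(self, source):
        self.match_case_lines = set()
        self.visit(ast.parse(source))

    def visit_Match(self, node):
        # The "match" line itself, plus the line of each "case" pattern.
        self.match_case_lines.add(node.lineno)
        for case in node.cases:
            self.match_case_lines.add(case.pattern.lineno)
        self.generic_visit(node)

SOURCE = """\
match = "here match is just a name"
match command:
    case "go":
        pass
    case _:
        pass
"""

print(keyword.iskeyword("match"))                  # False: not a hard keyword
print(keyword.issoftkeyword("match"))              # True (available since 3.9)
print(MatchCaseFinder(SOURCE).match_case_lines)    # {2, 3, 5}; line 1 is excluded

The start-of-line test in `source_token_lines()` additionally guards subjects like `match match:`, where only the first occurrence on the flagged line is the keyword.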
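Note on the `iternext` removals: with Python 2 support gone, `coverage.backward.iternext` is replaced by a bound `__next__`, since `tokenize.generate_tokens()` and `tokenize.detect_encoding()` only need a zero-argument readline callable that returns one line per call. A quick sketch with made-up inputs:

import tokenize

text = "x = 1\nif x:\n    y = 2\n"
readline = iter(text.splitlines(True)).__next__
print(len(list(tokenize.generate_tokens(readline))))   # token count for the snippet

# detect_encoding(), which now backs source_encoding() directly, takes the
# same kind of callable over bytes and applies the PEP 263 cookie rules:
data = b"# -*- coding: iso-8859-1 -*-\nx = 1\n"
print(tokenize.detect_encoding(iter(data.splitlines(True)).__next__)[0])
# -> 'iso-8859-1'

This is also why the hand-rolled `_source_encoding_py2()` could be deleted outright: `tokenize.detect_encoding()` implements the same BOM and cookie logic in the standard library.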