"""Better tokenizing for coverage.py."""

import keyword, re, token, tokenize
from coverage.backward import StringIO  # pylint: disable-msg=W0622

def phys_tokens(toks):
    """Return all physical tokens, even line continuations.

    tokenize.generate_tokens() doesn't return a token for the backslash that
    continues lines. This wrapper provides those tokens so that we can
    re-create a faithful representation of the original source.

    Returns the same values as generate_tokens().

    """
    last_line = None
    last_lineno = -1
    last_ttype = None
    for ttype, ttext, (slineno, scol), (elineno, ecol), ltext in toks:
        if last_lineno != elineno:
            if last_line and last_line[-2:] == "\\\n":
                # We are at the beginning of a new line, and the last line
                # ended with a backslash. We probably have to inject a
                # backslash token into the stream. Unfortunately, there's more
                # to figure out. This code::
                #
                #   usage = """\
                #       HEY THERE
                #       """
                #
                # triggers this condition, but the token text is::
                #
                #   '"""\\\nHEY THERE\n"""'
                #
                # so we need to figure out if the backslash is already in the
                # string token or not.
                inject_backslash = True
                if last_ttype == tokenize.COMMENT:
                    # Comments like this \
                    # should never result in a new token.
                    inject_backslash = False
                elif ttype == token.STRING:
                    if "\n" in ttext and ttext.split('\n', 1)[0].endswith('\\'):
                        # It's a multiline string and the first line ends with
                        # a backslash, so we don't need to inject another.
                        inject_backslash = False
                if inject_backslash:
                    # Figure out what column the backslash is in.
                    ccol = len(last_line.split("\n")[-2]) - 1
                    # Yield the token, with a fake token type.
                    yield (
                        99999, "\\\n",
                        (slineno, ccol), (slineno, ccol+2),
                        last_line
                        )
        last_line = ltext
        last_ttype = ttype
        yield ttype, ttext, (slineno, scol), (elineno, ecol), ltext
        last_lineno = elineno
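
# An illustrative sketch, not part of the original module: for source like
#
#     a = 1 + \
#         2
#
# tokenize.generate_tokens() yields no token for the trailing backslash, so
# phys_tokens() injects a synthetic (99999, "\\\n", ...) token between the
# two physical lines, letting callers reproduce the source exactly.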
|

def source_token_lines(source):
    """Generate a series of lines, one for each line in `source`.

    Each line is a list of pairs, and each pair is a token::

        [('key', 'def'), ('ws', ' '), ('nam', 'hello'), ('op', '('), ... ]

    Each pair has a token class and the token text.

    If you concatenate all the token texts and then join them with newlines,
    you should have your original `source` back, with two differences:
    trailing whitespace is not preserved, and a final line with no newline
    is indistinguishable from a final line with a newline.

    """
    ws_tokens = [token.INDENT, token.DEDENT, token.NEWLINE, tokenize.NL]
    line = []
    col = 0
    tokgen = tokenize.generate_tokens(StringIO(source.expandtabs(8)).readline)
    for ttype, ttext, (_, scol), (_, ecol), _ in phys_tokens(tokgen):
        mark_start = True
        # Split the token text on newlines, keeping the newlines as parts.
        for part in re.split('(\n)', ttext):
            if part == '\n':
                # End of a physical line: emit it and start a fresh one.
                yield line
                line = []
                col = 0
                mark_end = False
            elif part == '':
                mark_end = False
            elif ttype in ws_tokens:
                mark_end = False
            else:
                if mark_start and scol > col:
                    # Fill the gap since the previous token with whitespace.
                    line.append(("ws", " " * (scol - col)))
                    mark_start = False
                tok_class = tokenize.tok_name.get(ttype, 'xx').lower()[:3]
                if ttype == token.NAME and keyword.iskeyword(ttext):
                    tok_class = "key"
                line.append((tok_class, part))
                mark_end = True
            scol = 0
        if mark_end:
            col = ecol

    if line:
        yield line
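

# A minimal usage sketch, assuming the coverage package is importable; the
# sample source below is an invented example, not part of the original
# module. Running this module directly prints the token-class/text pairs
# that source_token_lines() produces for each line.
if __name__ == '__main__':
    sample = 'def hello():\n    print("hi")\n'
    for token_line in source_token_lines(sample):
        print(token_line)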