Sat, 04 Jul 2015 17:31:46 +0200
Changed the Python debugger backends to evaluate statements entered into the shell in the frame selected in the local variables viewer.
29
391dc0bc4ae5
Updated coverage.py to version 3.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
1 | """Better tokenizing for coverage.py.""" |
391dc0bc4ae5
Updated coverage.py to version 3.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
2 | |
3495
fac17a82b431
updated coverage to 3.7.1
T.Rzepka <Tobias.Rzepka@gmail.com>
parents:
29
diff
changeset
|
3 | import codecs, keyword, re, sys, token, tokenize |
fac17a82b431
updated coverage to 3.7.1
T.Rzepka <Tobias.Rzepka@gmail.com>
parents:
29
diff
changeset
|
4 | from .backward import set # pylint: disable=W0622 |
fac17a82b431
updated coverage to 3.7.1
T.Rzepka <Tobias.Rzepka@gmail.com>
parents:
29
diff
changeset
|
5 | from .parser import generate_tokens |
fac17a82b431
updated coverage to 3.7.1
T.Rzepka <Tobias.Rzepka@gmail.com>
parents:
29
diff
changeset
|
6 | |
29
391dc0bc4ae5
Updated coverage.py to version 3.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
7 | |
391dc0bc4ae5
Updated coverage.py to version 3.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
def phys_tokens(toks):
    """Return all physical tokens, even line continuations.

    tokenize.generate_tokens() doesn't return a token for the backslash that
    continues lines.  This wrapper provides those tokens so that we can
    re-create a faithful representation of the original source.

    `toks` is an iterable of 5-tuples shaped like the ones produced by
    tokenize.generate_tokens(): token type, token text, (start line, start
    column), (end line, end column), and the text of the source line.

    Returns the same values as generate_tokens(), plus one extra 5-tuple for
    every continuation backslash.  An extra tuple has the fake token type
    99999, a text of a backslash followed by a newline, the line number of
    the token that follows it, the column of the backslash, and the text of
    the line that ends with the backslash.

    """
    last_line = None
    last_lineno = -1
    # Type of the most recent token that is not a NEWLINE or NL, i.e. of the
    # last token with real text on the line that was just finished.
    last_ttype = None
    for ttype, ttext, (slineno, scol), (elineno, ecol), ltext in toks:
        if last_lineno != elineno:
            if last_line and last_line.endswith("\\\n"):
                # We are at the beginning of a new line, and the last line
                # ended with a backslash.  We probably have to inject a
                # backslash token into the stream. Unfortunately, there's more
                # to figure out.  This code::
                #
                #   usage = """\
                #   HEY THERE
                #   """
                #
                # triggers this condition, but the token text is::
                #
                #   '"""\\\nHEY THERE\n"""'
                #
                # so we need to figure out if the backslash is already in the
                # string token or not.
                inject_backslash = True
                if last_ttype == tokenize.COMMENT:
                    # Comments like this \
                    # should never result in a new token.  The same is true
                    # for a comment that trails code on the line: the
                    # backslash is part of the comment text.
                    inject_backslash = False
                elif ttype == token.STRING and slineno == last_lineno:
                    # The string starts on the line that ends with the
                    # backslash.  A string that starts on the *next* line
                    # cannot contain that backslash, so it is not looked at
                    # here and the backslash is injected as usual.
                    if "\n" in ttext and ttext.split('\n', 1)[0][-1] == '\\':
                        # It's a multiline string and the first line ends with
                        # a backslash, so we don't need to inject another.
                        inject_backslash = False
                if inject_backslash:
                    # Figure out what column the backslash is in.
                    ccol = len(last_line.split("\n")[-2]) - 1
                    # Yield the token, with a fake token type.
                    yield (
                        99999, "\\\n",
                        (slineno, ccol), (slineno, ccol+2),
                        last_line
                        )
            last_line = ltext
        if ttype not in (tokenize.NEWLINE, tokenize.NL):
            last_ttype = ttype
        yield ttype, ttext, (slineno, scol), (elineno, ecol), ltext
        last_lineno = elineno
391dc0bc4ae5
Updated coverage.py to version 3.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
62 | |
391dc0bc4ae5
Updated coverage.py to version 3.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
63 | |
391dc0bc4ae5
Updated coverage.py to version 3.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
def source_token_lines(source):
    """Yield one list of tokens for every line of `source`.

    A line is a list of (token class, token text) pairs, for example::

        [('key', 'def'), ('ws', ' '), ('nam', 'hello'), ('op', '('), ... ]

    The token class is the first three letters of the lowercased tokenize
    name of the token ('xx' if the type has no name), 'key' for keywords,
    or 'ws' for the blanks inserted between tokens.

    Joining the texts of each line, and then joining the lines with newlines,
    reproduces `source` except that trailing whitespace is dropped and a
    missing final newline cannot be told apart from a present one.

    """
    whitespace_types = set(
        [token.INDENT, token.DEDENT, token.NEWLINE, tokenize.NL]
    )
    # Tabs become spaces and Windows line ends become plain newlines before
    # tokenizing.
    normalized = source.expandtabs(8).replace('\r\n', '\n')
    current = []
    cursor = 0
    stream = phys_tokens(generate_tokens(normalized))
    for kind, text, (_, begin), (_, finish), _ in stream:
        is_keyword = (kind == token.NAME and keyword.iskeyword(text))
        may_pad = True
        for piece in re.split('(\n)', text):
            # Only the last piece of a token decides whether the column
            # cursor moves to the token's end column.
            advance = False
            if piece == '\n':
                # A newline closes the line being collected.
                yield current
                current = []
                cursor = 0
            elif piece != '' and kind not in whitespace_types:
                if may_pad and begin > cursor:
                    # Fill the gap between the previous token and this one.
                    current.append(("ws", " " * (begin - cursor)))
                    may_pad = False
                if is_keyword:
                    label = "key"
                else:
                    label = tokenize.tok_name.get(kind, 'xx').lower()[:3]
                current.append((label, piece))
                advance = True
            # Later pieces of a multi-line token start at column zero.
            begin = 0
        if advance:
            cursor = finish

    if current:
        yield current
fac17a82b431
updated coverage to 3.7.1
T.Rzepka <Tobias.Rzepka@gmail.com>
parents:
29
diff
changeset
|
111 | |
fac17a82b431
updated coverage to 3.7.1
T.Rzepka <Tobias.Rzepka@gmail.com>
parents:
29
diff
changeset
|
def source_encoding(source):
    """Determine the encoding for `source` (a string), according to PEP 263.

    Only the first two lines of `source` are examined: a UTF-8 byte order
    mark at the very start, and a "coding:" or "coding=" cookie on either
    line.

    Returns a string, the name of the encoding.  If no cookie is found, the
    result is 'utf-8-sig' when a byte order mark is present, otherwise
    'iso-8859-1' on Python 2.4 and earlier and 'ascii' on later versions.

    Raises SyntaxError if the cookie names an unknown encoding, or if a byte
    order mark is present and the cookie names anything other than utf-8.

    """
    # Note: this function should never be called on Python 3, since py3 has
    # built-in tools to do this.
    assert sys.version_info < (3, 0)

    # This is mostly code adapted from Py3.2's tokenize module.

    cookie_re = re.compile(r"coding[:=]\s*([-\w.]+)")

    # Do this so the detect_encoding code we copied will work.
    readline = iter(source.splitlines(True)).next

    def _get_normal_name(orig_enc):
        """Imitates get_normal_name in tokenizer.c."""
        # Only care about the first 12 characters.
        enc = orig_enc[:12].lower().replace("_", "-")
        if re.match(r"^utf-8($|-)", enc):
            return "utf-8"
        if re.match(r"^(latin-1|iso-8859-1|iso-latin-1)($|-)", enc):
            return "iso-8859-1"
        return orig_enc

    # From detect_encoding():
    # It detects the encoding from the presence of a utf-8 bom or an encoding
    # cookie as specified in pep-0263.  If both a bom and a cookie are present,
    # but disagree, a SyntaxError will be raised.  If the encoding cookie is an
    # invalid charset, raise a SyntaxError.  Note that if a utf-8 bom is found,
    # 'utf-8-sig' is returned.

    # If no encoding is specified, then the default will be returned.  The
    # default varied with version.

    # sys.version_info is a five-element tuple, so it compares greater than
    # (2, 4) on every 2.4.x release.  Comparing against (2, 5) is what puts
    # all of 2.4 on the old default.
    if sys.version_info < (2, 5):
        default = 'iso-8859-1'
    else:
        default = 'ascii'

    bom_found = False

    def read_or_stop():
        """Get the next source line, or ''."""
        try:
            return readline()
        except StopIteration:
            return ''

    def find_cookie(line):
        """Find an encoding cookie in `line`.

        Returns the encoding name, or None if `line` is not ASCII or has no
        cookie.

        """
        try:
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            return None

        matches = cookie_re.findall(line_string)
        if not matches:
            return None
        encoding = _get_normal_name(matches[0])
        try:
            codec = codecs.lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            # codecs in 2.3 were raw tuples of functions, assume the best.
            codec_name = getattr(codec, 'name', encoding)
            if codec_name != 'utf-8':
                # This behaviour mimics the Python interpreter
                raise SyntaxError('encoding problem: utf-8')
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(codecs.BOM_UTF8):
        bom_found = True
        # Drop the three bytes of the byte order mark.
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default

    encoding = find_cookie(first)
    if encoding:
        return encoding

    second = read_or_stop()
    if not second:
        return default

    encoding = find_cookie(second)
    if encoding:
        return encoding

    return default