    pygments.lexers.special
    ~~~~~~~~~~~~~~~~~~~~~~~

    Special lexers.

    :copyright: Copyright 2006-2021 by the Pygments team, see AUTHORS.
    :license: BSD, see LICENSE for details.
"""

import re
from io import BytesIO
# ...

        self.compress = get_choice_opt(options, 'compress',
                                       ['', 'none', 'gz', 'bz2'], '')
        Lexer.__init__(self, **options)
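
    # ``text`` may be str or bytes.  With the ``compress`` option set, the
    # input is treated as a gzip- or bz2-compressed byte stream and is
    # decompressed first; latin1 is used for the byte/str conversions
    # because it maps the code points 0-255 to bytes losslessly.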
    def get_tokens(self, text):
        if self.compress:
            if isinstance(text, str):
                text = text.encode('latin1')
            if self.compress == 'gz':
                import gzip
                gzipfile = gzip.GzipFile('', 'rb', 9, BytesIO(text))
                text = gzipfile.read()
            elif self.compress == 'bz2':
                import bz2
                text = bz2.decompress(text)
            text = text.decode('latin1')

        # do not call Lexer.get_tokens() because stripping is not optional.
        text = text.strip('\n') + '\n'
        for i, t, v in self.get_tokens_unprocessed(text):
            yield t, v
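
    # Each line of a raw token stream has the form
    #     Token.Literal.Number<TAB>'repr of the token value'
    # as produced by pygments.formatters.RawTokenFormatter.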
    def get_tokens_unprocessed(self, text):
        length = 0
        for match in line_re.finditer(text):
            try:
                ttypestr, val = match.group().rstrip().split('\t', 1)
            except ValueError:
                val = match.group()
                ttype = Error
            else:
                ttype = _ttype_cache.get(ttypestr)
                if not ttype:
                    # resolve the dotted token name attribute by attribute,
                    # starting from the root Token, and cache the result
                    ttype = Token
                    ttypes = ttypestr.split('.')[1:]
                    for ttype_ in ttypes:
                        if not ttype_ or not ttype_[0].isupper():
                            raise ValueError('malformed token name')
                        ttype = getattr(ttype, ttype_)
                    _ttype_cache[ttypestr] = ttype
                # strip the repr quotes and undo the string escaping
                val = val[1:-1].encode().decode('unicode-escape')
            yield length, ttype, val
            length += len(val)
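

if __name__ == '__main__':
    # A minimal round-trip sketch (an illustration, not part of the original
    # module): RawTokenFormatter emits the tab-separated raw token stream
    # that RawTokenLexer parses back into tokens.
    from pygments import highlight
    from pygments.lexers import PythonLexer
    from pygments.formatters import NullFormatter, RawTokenFormatter

    source = 'print("hi")\n'
    raw = highlight(source, PythonLexer(), RawTokenFormatter())  # bytes
    # Re-lexing the raw stream and joining the token values via NullFormatter
    # should reproduce the original source text.
    restored = highlight(raw.decode('latin1'), RawTokenLexer(), NullFormatter())
    assert restored == source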