@@ -3,22 +3,19 @@
     pygments.lexers.special
     ~~~~~~~~~~~~~~~~~~~~~~~
 
     Special lexers.
 
-    :copyright: Copyright 2006-2013 by the Pygments team, see AUTHORS.
+    :copyright: Copyright 2006-2014 by the Pygments team, see AUTHORS.
     :license: BSD, see LICENSE for details.
 """
 
-from __future__ import unicode_literals
-
 import re
-import io
 
 from pygments.lexer import Lexer
 from pygments.token import Token, Error, Text
-from pygments.util import get_choice_opt, b
+from pygments.util import get_choice_opt, text_type, BytesIO
 
 
 __all__ = ['TextLexer', 'RawTokenLexer']
 
 
@@ -60,34 +58,34 @@
         self.compress = get_choice_opt(options, 'compress',
                                        ['', 'none', 'gz', 'bz2'], '')
         Lexer.__init__(self, **options)
 
     def get_tokens(self, text):
-        if isinstance(text, str):
+        if isinstance(text, text_type):
             # raw token stream never has any non-ASCII characters
             text = text.encode('ascii')
         if self.compress == 'gz':
             import gzip
-            gzipfile = gzip.GzipFile('', 'rb', 9, io.StringIO(text))
+            gzipfile = gzip.GzipFile('', 'rb', 9, BytesIO(text))
             text = gzipfile.read()
         elif self.compress == 'bz2':
             import bz2
             text = bz2.decompress(text)
 
         # do not call Lexer.get_tokens() because we do not want Unicode
         # decoding to occur, and stripping is not optional.
-        text = text.strip(b('\n')) + b('\n')
+        text = text.strip(b'\n') + b'\n'
         for i, t, v in self.get_tokens_unprocessed(text):
             yield t, v
 
     def get_tokens_unprocessed(self, text):
         length = 0
         for match in line_re.finditer(text):
             try:
-                ttypestr, val = match.group().split(b('\t'), 1)
+                ttypestr, val = match.group().split(b'\t', 1)
             except ValueError:
-                val = match.group().decode(self.encoding)
+                val = match.group().decode('ascii', 'replace')
                 ttype = Error
             else:
                 ttype = _ttype_cache.get(ttypestr)
                 if not ttype:
                     ttype = Token
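For context, the raw token stream that RawTokenLexer.get_tokens() consumes is the output of Pygments' RawTokenFormatter: one "TokenType<TAB>repr(value)" pair per line, optionally gz- or bz2-compressed. A minimal round-trip sketch, assuming the standard highlight/PythonLexer/RawTokenFormatter APIs; the sample source string is purely illustrative:

from pygments import highlight
from pygments.lexers import PythonLexer
from pygments.lexers.special import RawTokenLexer
from pygments.formatters import RawTokenFormatter

source = 'print("hello")\n'   # illustrative input

# Serialize the token stream; RawTokenFormatter emits bytes,
# here gzip-compressed to exercise the 'gz' branch above.
raw = highlight(source, PythonLexer(), RawTokenFormatter(compress='gz'))

# Feed it back through RawTokenLexer, which decompresses the stream
# and yields (tokentype, value) pairs again.
for ttype, value in RawTokenLexer(compress='gz').get_tokens(raw):
    print(ttype, repr(value))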