|
1 # -*- coding: utf-8 -*- |
|
2 """ |
|
3 pygments.lexers.special |
|
4 ~~~~~~~~~~~~~~~~~~~~~~~ |
|
5 |
|
6 Special lexers. |
|
7 |
|
8 :copyright: Copyright 2006-2009 by the Pygments team, see AUTHORS. |
|
9 :license: BSD, see LICENSE for details. |
|
10 """ |
|
11 |
|
12 import re |
|
13 import cStringIO |
|
14 |
|
15 from pygments.lexer import Lexer |
|
16 from pygments.token import Token, Error, Text |
|
17 from pygments.util import get_choice_opt, b |
|
18 |
|
19 |
|
20 __all__ = ['TextLexer', 'RawTokenLexer'] |
|
21 |
|
22 |
|
class TextLexer(Lexer):
    """
    Pass-through ("null") lexer: the input is emitted unchanged as one
    single `Text` token, so nothing is ever highlighted.
    """

    # Lexer metadata.
    name = 'Text only'
    aliases = ['text']
    filenames = ['*.txt']
    mimetypes = ['text/plain']

    def get_tokens_unprocessed(self, text):
        """
        Yield exactly one ``(index, tokentype, value)`` triple that covers
        the whole of `text`, starting at offset 0.
        """
        whole_input = (0, Text, text)
        yield whole_input
|
34 |
|
35 |
|
# Module-level cache used by RawTokenLexer: maps a token type name exactly as
# it appears in the raw stream to the token type object resolved for it, so
# each distinct name is resolved only once.
_ttype_cache = {}

# Matches one line of the raw token stream, up to and including its
# terminating newline.
line_re = re.compile(b('.*?\n'))
|
39 |
|
class RawTokenLexer(Lexer):
    """
    Recreate a token stream formatted with the `RawTokenFormatter`.

    Each line of the stream is expected to hold a token type name, a tab
    and the token value.  Lines without a tab are yielded as `Error`
    tokens; a token type name with an empty or non-capitalized component
    raises `ValueError`.

    Additional options accepted:

    `compress`
        If set to ``"gz"`` or ``"bz2"``, decompress the token stream with
        the given compression algorithm before lexing (default: ``""``).
    """
    name = 'Raw token data'
    aliases = ['raw']
    filenames = []
    mimetypes = ['application/x-pygments-tokens']

    def __init__(self, **options):
        """
        Read the `compress` option (one of ``''``, ``'none'``, ``'gz'``,
        ``'bz2'``) and hand all options on to `Lexer.__init__`.
        """
        self.compress = get_choice_opt(options, 'compress',
                                       ['', 'none', 'gz', 'bz2'], '')
        Lexer.__init__(self, **options)

    def get_tokens(self, text):
        """
        Yield ``(tokentype, value)`` pairs recreated from the raw token
        stream `text`.

        `text` may be a byte string or a unicode string; a unicode string
        is encoded as ASCII first.  The data is decompressed according to
        the `compress` option before it is parsed.
        """
        if isinstance(text, unicode):
            # raw token stream never has any non-ASCII characters
            text = text.encode('ascii')
        if self.compress == 'gz':
            import gzip
            gzipfile = gzip.GzipFile('', 'rb', 9, cStringIO.StringIO(text))
            try:
                text = gzipfile.read()
            finally:
                # release the decompressor even if the data is corrupt
                gzipfile.close()
        elif self.compress == 'bz2':
            import bz2
            text = bz2.decompress(text)

        # do not call Lexer.get_tokens() because we do not want Unicode
        # decoding to occur, and stripping is not optional.
        # Exactly one trailing newline is kept so that `line_re` also
        # matches the last line.
        text = text.strip(b('\n')) + b('\n')
        for i, t, v in self.get_tokens_unprocessed(text):
            yield t, v

    def get_tokens_unprocessed(self, text):
        """
        Yield ``(index, tokentype, value)`` triples for every line of the
        raw (already decompressed) byte stream `text`.

        `index` is the running total of the lengths of the values yielded
        so far.  A line without a tab separator is yielded whole as an
        `Error` token.

        Raises `ValueError` if a token type name contains an empty
        component or one that does not start with an uppercase letter.
        """
        length = 0
        for match in line_re.finditer(text):
            line = match.group()
            try:
                ttypestr, val = line.split(b('\t'), 1)
            except ValueError:
                # No tab on this line: report it as an Error token.
                # self.encoding may be a pseudo-encoding such as 'guess'
                # or a codec that cannot decode these bytes; fall back to
                # latin1 (which accepts any byte) rather than letting a
                # malformed line abort lexing with a codec error.
                try:
                    val = line.decode(self.encoding)
                except (LookupError, UnicodeError):
                    val = line.decode('latin1')
                ttype = Error
            else:
                ttype = _ttype_cache.get(ttypestr)
                # Compare against None explicitly: the root `Token` type is
                # an empty tuple and therefore falsy, so a truthiness test
                # would treat a cached `Token` as a miss every time.
                if ttype is None:
                    ttype = Token
                    # the first dotted component is skipped; the remaining
                    # ones are resolved as attributes, starting from Token
                    ttypes = ttypestr.split('.')[1:]
                    for ttype_ in ttypes:
                        if not ttype_ or not ttype_[0].isupper():
                            raise ValueError('malformed token name')
                        ttype = getattr(ttype, ttype_)
                    _ttype_cache[ttypestr] = ttype
                # Drop the first two and last two characters of the value
                # field -- presumably the ``u'`` prefix and the closing
                # quote plus newline of the repr written by
                # RawTokenFormatter (TODO confirm against the formatter).
                val = val[2:-2].decode('unicode-escape')
            yield length, ttype, val
            length += len(val)