eric6/ThirdParty/Pygments/pygments/lexers/special.py

changeset 7983:54c5cfbb1e29
parent    7701:25f42e208e08
@@ -3,11 +3,11 @@
     pygments.lexers.special
     ~~~~~~~~~~~~~~~~~~~~~~~
 
     Special lexers.
 
-    :copyright: Copyright 2006-2020 by the Pygments team, see AUTHORS.
+    :copyright: Copyright 2006-2021 by the Pygments team, see AUTHORS.
     :license: BSD, see LICENSE for details.
 """
 
 import re
 from io import BytesIO
@@ -37,11 +37,11 @@
         return TextLexer.priority
 
 
 _ttype_cache = {}
 
-line_re = re.compile(b'.*?\n')
+line_re = re.compile('.*?\n')
 
 
 class RawTokenLexer(Lexer):
     """
     Recreate a token stream formatted with the `RawTokenFormatter`. This
@@ -63,43 +63,43 @@
         self.compress = get_choice_opt(options, 'compress',
                                        ['', 'none', 'gz', 'bz2'], '')
         Lexer.__init__(self, **options)
 
     def get_tokens(self, text):
-        if isinstance(text, str):
-            # raw token stream never has any non-ASCII characters
-            text = text.encode('ascii')
-        if self.compress == 'gz':
-            import gzip
-            gzipfile = gzip.GzipFile('', 'rb', 9, BytesIO(text))
-            text = gzipfile.read()
-        elif self.compress == 'bz2':
-            import bz2
-            text = bz2.decompress(text)
+        if self.compress:
+            if isinstance(text, str):
+                text = text.encode('latin1')
+            if self.compress == 'gz':
+                import gzip
+                gzipfile = gzip.GzipFile('', 'rb', 9, BytesIO(text))
+                text = gzipfile.read()
+            elif self.compress == 'bz2':
+                import bz2
+                text = bz2.decompress(text)
+            text = text.decode('latin1')
 
-        # do not call Lexer.get_tokens() because we do not want Unicode
-        # decoding to occur, and stripping is not optional.
-        text = text.strip(b'\n') + b'\n'
+        # do not call Lexer.get_tokens() because stripping is not optional.
+        text = text.strip('\n') + '\n'
         for i, t, v in self.get_tokens_unprocessed(text):
             yield t, v
 
     def get_tokens_unprocessed(self, text):
         length = 0
         for match in line_re.finditer(text):
             try:
-                ttypestr, val = match.group().split(b'\t', 1)
+                ttypestr, val = match.group().rstrip().split('\t', 1)
             except ValueError:
-                val = match.group().decode('ascii', 'replace')
+                val = match.group()
                 ttype = Error
             else:
                 ttype = _ttype_cache.get(ttypestr)
                 if not ttype:
                     ttype = Token
                     ttypes = ttypestr.split('.')[1:]
                     for ttype_ in ttypes:
                         if not ttype_ or not ttype_[0].isupper():
                             raise ValueError('malformed token name')
                         ttype = getattr(ttype, ttype_)
                     _ttype_cache[ttypestr] = ttype
-                val = val[2:-2].decode('unicode-escape')
+                val = val[1:-1].encode().decode('unicode-escape')
             yield length, ttype, val
             length += len(val)
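
The rewritten get_tokens decompresses on bytes, then decodes with latin1 before tokenizing as str. latin1 maps all 256 byte values one-to-one onto U+0000..U+00FF, so the decode always succeeds and a later encode gives the original bytes back. A minimal sketch of that property (illustration only, not part of the changeset):

    # latin1 round-trips arbitrary bytes losslessly, which is why the new
    # compress branch can hand decompressed data to the str-based tokenizer.
    blob = bytes(range(256))        # every possible byte value
    text = blob.decode('latin1')    # always succeeds: 1:1 byte/codepoint map
    assert text.encode('latin1') == blob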

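For context on the format being parsed: RawTokenFormatter emits one "Token.Xxx<TAB>repr(value)" pair per line, and the slice changes from val[2:-2] to val[1:-1] because the stream now carries str reprs ('...') instead of bytes reprs (b'...'). Below is a hedged round-trip sketch using only public Pygments APIs, assuming the 2.8-era Pygments bundled by this changeset (where formatter and lexer agree on the str-based format); the sample is pure ASCII, so decoding the formatter's bytes output as ascii is safe.

    # Round-trip: dump a snippet to the raw token stream, then recreate
    # the tokens with RawTokenLexer (uncompressed, so plain str input).
    from pygments import highlight, lex
    from pygments.formatters import RawTokenFormatter
    from pygments.lexers.python import PythonLexer
    from pygments.lexers.special import RawTokenLexer

    source = 'print("hello")\n'
    raw = highlight(source, PythonLexer(), RawTokenFormatter())  # returns bytes

    for ttype, value in lex(raw.decode('ascii'), RawTokenLexer()):
        print(ttype, repr(value))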