ThirdParty/Pygments/pygments/lexer.py

changeset:   1705:b0fbc9300f2b
parent:      808:8f85926125ef
child:       2426:da76c71624de
comparison:  1704:02ae6c55b35b -> 1705:b0fbc9300f2b
--- a/ThirdParty/Pygments/pygments/lexer.py
+++ b/ThirdParty/Pygments/pygments/lexer.py
@@ -3,25 +3,32 @@
     pygments.lexer
     ~~~~~~~~~~~~~~
 
     Base lexer classes.
 
-    :copyright: Copyright 2006-2010 by the Pygments team, see AUTHORS.
+    :copyright: Copyright 2006-2012 by the Pygments team, see AUTHORS.
     :license: BSD, see LICENSE for details.
 """
 import re
 
 from pygments.filter import apply_filters, Filter
 from pygments.filters import get_filter_by_name
 from pygments.token import Error, Text, Other, _TokenType
 from pygments.util import get_bool_opt, get_int_opt, get_list_opt, \
      make_analysator
+import collections
 
 
 __all__ = ['Lexer', 'RegexLexer', 'ExtendedRegexLexer', 'DelegatingLexer',
            'LexerContext', 'include', 'bygroups', 'using', 'this']
 
 
+_encoding_map = [('\xef\xbb\xbf', 'utf-8'),
+                 ('\xff\xfe\0\0', 'utf-32'),
+                 ('\0\0\xfe\xff', 'utf-32be'),
+                 ('\xff\xfe', 'utf-16'),
+                 ('\xfe\xff', 'utf-16be')]
+
 _default_analyse = staticmethod(lambda x: 0.0)
 
 
 class LexerMeta(type):
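
One detail worth noting about the new _encoding_map table: it is scanned first-match, and the UTF-16 little-endian signature is a prefix of the UTF-32 little-endian one, so the longer BOMs have to come first. A quick sanity check of that prefix relationship (illustrative only, not part of the change):

    import codecs

    # The UTF-32 LE BOM starts with the UTF-16 LE BOM, so a first-match scan
    # must try the 4-byte signatures before the 2-byte ones.
    assert codecs.BOM_UTF32_LE == b'\xff\xfe\x00\x00'
    assert codecs.BOM_UTF16_LE == b'\xff\xfe'
    assert codecs.BOM_UTF32_LE.startswith(codecs.BOM_UTF16_LE)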
@@ -138,12 +145,23 @@
                     import chardet
                 except ImportError:
                     raise ImportError('To enable chardet encoding guessing, '
                                       'please install the chardet library '
                                       'from http://chardet.feedparser.org/')
-                enc = chardet.detect(text)
-                text = text.decode(enc['encoding'])
+                # check for BOM first
+                decoded = None
+                for bom, encoding in _encoding_map:
+                    if text.startswith(bom):
+                        decoded = str(text[len(bom):], encoding,
+                                      errors='replace')
+                        break
+                # no BOM found, so use chardet
+                if decoded is None:
+                    enc = chardet.detect(text[:1024])  # Guess using first 1KB
+                    decoded = str(text, enc.get('encoding') or 'utf-8',
+                                  errors='replace')
+                text = decoded
             else:
                 text = text.decode(self.encoding)
             # text now *is* a unicode string
         text = text.replace('\r\n', '\n')
         text = text.replace('\r', '\n')
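
Pulled out of the lexer, the new decoding strategy amounts to: trust a byte-order mark if one is present, otherwise let chardet guess from a 1 KB sample, and fall back to UTF-8 if it cannot decide. A minimal standalone sketch of that logic follows; the helper name decode_guess is made up here, chardet is assumed to be installed, and the BOM signatures are written as bytes since detection runs on the raw input:

    import chardet  # optional dependency, only needed for the fallback path

    # Same signatures as _encoding_map above, expressed as bytes.
    _BOMS = [(b'\xef\xbb\xbf', 'utf-8'),
             (b'\xff\xfe\x00\x00', 'utf-32'),
             (b'\x00\x00\xfe\xff', 'utf-32be'),
             (b'\xff\xfe', 'utf-16'),
             (b'\xfe\xff', 'utf-16be')]

    def decode_guess(data):
        """Hypothetical helper mirroring the lexer's 'chardet' mode."""
        # 1. A BOM is authoritative, so check for one first.
        for bom, encoding in _BOMS:
            if data.startswith(bom):
                return data[len(bom):].decode(encoding, errors='replace')
        # 2. No BOM: ask chardet, sampling only the first 1 KB for speed,
        #    and fall back to UTF-8 if it makes no guess.
        enc = chardet.detect(data[:1024])
        return data.decode(enc.get('encoding') or 'utf-8', errors='replace')

Both paths decode with errors='replace', so a wrong guess degrades the highlighted output rather than aborting lexing.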
@@ -270,16 +288,18 @@
                 elif type(action) is _TokenType:
                     data = match.group(i + 1)
                     if data:
                         yield match.start(i + 1), action, data
                 else:
-                    if ctx:
-                        ctx.pos = match.start(i + 1)
-                    for item in action(lexer, _PseudoMatch(match.start(i + 1),
-                                       match.group(i + 1)), ctx):
-                        if item:
-                            yield item
+                    data = match.group(i + 1)
+                    if data is not None:
+                        if ctx:
+                            ctx.pos = match.start(i + 1)
+                        for item in action(lexer, _PseudoMatch(match.start(i + 1),
+                                           data), ctx):
+                            if item:
+                                yield item
         if ctx:
             ctx.pos = match.end()
     return callback
 
 
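The practical effect of the data is not None guard is that a callable group action (for example a using(...) callback) is no longer invoked when its group did not participate in the match. A sketch of the kind of rule this protects, with an entirely made-up lexer and token choices:

    from pygments.lexer import RegexLexer, bygroups, using, this
    from pygments.token import Comment, Text

    class DemoLexer(RegexLexer):
        """Illustrative only; not part of Pygments."""
        name = 'Demo'
        tokens = {
            'root': [
                # For a bare '#', the optional group 2 does not participate
                # and match.group(2) is None.  With the guard above, bygroups
                # now skips the using(this) callback instead of handing the
                # sub-lexer a None match.
                (r'(#)(?: (.*))?', bygroups(Comment, using(this))),
                (r'.+', Text),
                (r'\n', Text),
            ],
        }

With the guard in place, lexing a bare '#' line yields just the Comment token and the callback never runs.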
@@ -351,11 +371,11 @@
         """Preprocess the regular expression component of a token definition."""
         return re.compile(regex, rflags).match
 
     def _process_token(cls, token):
         """Preprocess the token component of a token definition."""
-        assert type(token) is _TokenType or hasattr(token, '__call__'), \
+        assert type(token) is _TokenType or isinstance(token, collections.Callable), \
             'token type must be simple type or callable, not %r' % (token,)
         return token
 
     def _process_new_state(cls, new_state, unprocessed, processed):
         """Preprocess the state transition action of a token definition."""
@@ -435,11 +455,11 @@
             cls._process_state(tokendefs, processed, state)
         return processed
 
     def __call__(cls, *args, **kwds):
         """Instantiate cls after preprocessing its token definitions."""
-        if not hasattr(cls, '_tokens'):
+        if '_tokens' not in cls.__dict__:
             cls._all_tokens = {}
             cls._tmpname = 0
             if hasattr(cls, 'token_variants') and cls.token_variants:
                 # don't process yet
                 pass
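
The switch from hasattr(cls, '_tokens') to '_tokens' not in cls.__dict__ matters because hasattr follows the inheritance chain: a lexer that subclasses another lexer would appear to already have _tokens and its own definitions would never be processed, whereas the __dict__ test only sees attributes set on that class itself. A minimal illustration of the difference (not Pygments code):

    class Base:
        _tokens = {'root': []}      # set on the base class only

    class Sub(Base):
        pass

    # hasattr() follows the MRO, so it is True for the subclass as well ...
    assert hasattr(Sub, '_tokens')
    # ... while a __dict__ lookup only reports attributes defined on Sub
    # itself, which is what the metaclass needs to decide whether to process.
    assert '_tokens' not in Sub.__dict__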
