--- pygments/lexer.py (older version)
+++ pygments/lexer.py (newer version)
@@ -3,25 +3,32 @@
     pygments.lexer
     ~~~~~~~~~~~~~~
 
     Base lexer classes.
 
-    :copyright: Copyright 2006-2010 by the Pygments team, see AUTHORS.
+    :copyright: Copyright 2006-2012 by the Pygments team, see AUTHORS.
     :license: BSD, see LICENSE for details.
 """
 import re
 
 from pygments.filter import apply_filters, Filter
 from pygments.filters import get_filter_by_name
 from pygments.token import Error, Text, Other, _TokenType
 from pygments.util import get_bool_opt, get_int_opt, get_list_opt, \
      make_analysator
+import collections
 
 
 __all__ = ['Lexer', 'RegexLexer', 'ExtendedRegexLexer', 'DelegatingLexer',
            'LexerContext', 'include', 'bygroups', 'using', 'this']
 
 
+
+_encoding_map = [('\xef\xbb\xbf', 'utf-8'),
+                 ('\xff\xfe\0\0', 'utf-32'),
+                 ('\0\0\xfe\xff', 'utf-32be'),
+                 ('\xff\xfe', 'utf-16'),
+                 ('\xfe\xff', 'utf-16be')]
 
 _default_analyse = staticmethod(lambda x: 0.0)
 
 
 class LexerMeta(type):
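
The new _encoding_map table drives BOM sniffing in Lexer.get_tokens (second hunk below): the first byte-order mark that matches picks the codec. A minimal self-contained sketch of the same idea, using bytes literals and a hypothetical decode_with_bom helper rather than the lexer's own code:

# Longer BOMs first: the 4-byte UTF-32 LE BOM starts with the 2-byte
# UTF-16 LE BOM, so ordering matters and a list is used, not a dict.
BOM_MAP = [(b'\xef\xbb\xbf', 'utf-8'),
           (b'\xff\xfe\x00\x00', 'utf-32'),
           (b'\x00\x00\xfe\xff', 'utf-32be'),
           (b'\xff\xfe', 'utf-16'),
           (b'\xfe\xff', 'utf-16be')]

def decode_with_bom(data, fallback='utf-8'):
    """Decode *data*, preferring whatever codec its leading BOM indicates."""
    for bom, encoding in BOM_MAP:
        if data.startswith(bom):
            # Strip the BOM so it does not show up in the decoded text.
            return data[len(bom):].decode(encoding, errors='replace')
    return data.decode(fallback, errors='replace')

print(decode_with_bom(b'\xef\xbb\xbfhello'))  # -> 'hello'
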
@@ -138,12 +145,23 @@
                     import chardet
                 except ImportError:
                     raise ImportError('To enable chardet encoding guessing, '
                                       'please install the chardet library '
                                       'from http://chardet.feedparser.org/')
-                enc = chardet.detect(text)
-                text = text.decode(enc['encoding'])
+                # check for BOM first
+                decoded = None
+                for bom, encoding in _encoding_map:
+                    if text.startswith(bom):
+                        decoded = str(text[len(bom):], encoding,
+                                      errors='replace')
+                        break
+                # no BOM found, so use chardet
+                if decoded is None:
+                    enc = chardet.detect(text[:1024]) # Guess using first 1KB
+                    decoded = str(text, enc.get('encoding') or 'utf-8',
+                                  errors='replace')
+                text = decoded
             else:
                 text = text.decode(self.encoding)
         # text now *is* a unicode string
         text = text.replace('\r\n', '\n')
         text = text.replace('\r', '\n')
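
When no BOM is found, the rewritten branch samples only the first kilobyte for chardet.detect and tolerates a failed guess: the result dict's 'encoding' key may be None, hence the `or 'utf-8'` fallback, and errors='replace' keeps a wrong guess from raising. A hedged usage sketch (requires the optional chardet package; the sample text and variable names are made up):

import chardet

data = 'Grüße aus Köln'.encode('latin-1')   # no BOM, and not valid UTF-8
guess = chardet.detect(data[:1024])         # e.g. {'encoding': 'ISO-8859-1', 'confidence': 0.73}
text = data.decode(guess.get('encoding') or 'utf-8', errors='replace')
print(text)   # decoded text; the exact result depends on chardet's guess
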
@@ -270,16 +288,18 @@
             elif type(action) is _TokenType:
                 data = match.group(i + 1)
                 if data:
                     yield match.start(i + 1), action, data
             else:
-                if ctx:
-                    ctx.pos = match.start(i + 1)
-                for item in action(lexer, _PseudoMatch(match.start(i + 1),
-                                   match.group(i + 1)), ctx):
-                    if item:
-                        yield item
+                data = match.group(i + 1)
+                if data is not None:
+                    if ctx:
+                        ctx.pos = match.start(i + 1)
+                    for item in action(lexer, _PseudoMatch(match.start(i + 1),
+                                       data), ctx):
+                        if item:
+                            yield item
         if ctx:
             ctx.pos = match.end()
     return callback
 
 
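The bygroups() change matters for rules with optional groups handled by a callable action: previously a group that did not participate in the match was still wrapped in a _PseudoMatch as None and handed to the callback; now such groups are skipped. A toy sketch of a rule that relies on this, with a hypothetical lexer and callback that are not part of Pygments:

from pygments.lexer import RegexLexer, bygroups
from pygments.token import Name, Text

def shout(lexer, match, ctx=None):
    # Hypothetical callable action: re-emit the matched group upper-cased.
    yield match.start(), Name.Constant, match.group().upper()

class GreetLexer(RegexLexer):
    """Toy lexer: a word, optionally followed by '!suffix'."""
    name = 'GreetDemo'
    tokens = {
        'root': [
            # Group 2 is None when '!suffix' is absent; with the
            # 'data is not None' guard, bygroups() skips the callback
            # instead of handing it a _PseudoMatch around None.
            (r'(\w+)(?:!(\w+))?', bygroups(Name, shout)),
            (r'\s+', Text),
        ],
    }

for pos, tok, value in GreetLexer().get_tokens_unprocessed('hi!there\nbye\n'):
    print(pos, tok, value)
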
351 """Preprocess the regular expression component of a token definition.""" |
371 """Preprocess the regular expression component of a token definition.""" |
352 return re.compile(regex, rflags).match |
372 return re.compile(regex, rflags).match |
353 |
373 |
354 def _process_token(cls, token): |
374 def _process_token(cls, token): |
355 """Preprocess the token component of a token definition.""" |
375 """Preprocess the token component of a token definition.""" |
356 assert type(token) is _TokenType or hasattr(token, '__call__'), \ |
376 assert type(token) is _TokenType or isinstance(token, collections.Callable), \ |
357 'token type must be simple type or callable, not %r' % (token,) |
377 'token type must be simple type or callable, not %r' % (token,) |
358 return token |
378 return token |
359 |
379 |
360 def _process_new_state(cls, new_state, unprocessed, processed): |
380 def _process_new_state(cls, new_state, unprocessed, processed): |
361 """Preprocess the state transition action of a token definition.""" |
381 """Preprocess the state transition action of a token definition.""" |
@@ -435,11 +455,11 @@
             cls._process_state(tokendefs, processed, state)
         return processed
 
     def __call__(cls, *args, **kwds):
         """Instantiate cls after preprocessing its token definitions."""
-        if not hasattr(cls, '_tokens'):
+        if '_tokens' not in cls.__dict__:
             cls._all_tokens = {}
             cls._tmpname = 0
             if hasattr(cls, 'token_variants') and cls.token_variants:
                 # don't process yet
                 pass
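
hasattr(cls, '_tokens') is also true when _tokens is merely inherited from an already-processed parent lexer class, so a subclass defining its own tokens would keep using the parent's compiled table; checking '_tokens' not in cls.__dict__ restricts the test to the class itself. The distinction in isolation:

class ProcessedBase:
    _tokens = {'root': []}    # stands in for a parent lexer whose tokens were compiled

class Subclass(ProcessedBase):
    pass                      # defines nothing of its own

print(hasattr(Subclass, '_tokens'))       # True  -- found via inheritance
print('_tokens' in Subclass.__dict__)     # False -- not set on the subclass itself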