    pygments.lexer
    ~~~~~~~~~~~~~~

    Base lexer classes.

    :copyright: Copyright 2006-2013 by the Pygments team, see AUTHORS.
    :license: BSD, see LICENSE for details.
"""
import re, itertools

from pygments.filter import apply_filters, Filter
from pygments.filters import get_filter_by_name
from pygments.token import Error, Text, Other, _TokenType
from pygments.util import get_bool_opt, get_int_opt, get_list_opt, \
     make_analysator
import collections


__all__ = ['Lexer', 'RegexLexer', 'ExtendedRegexLexer', 'DelegatingLexer',
           'LexerContext', 'include', 'inherit', 'bygroups', 'using', 'this']

_encoding_map = [(b'\xef\xbb\xbf', 'utf-8'),
                 (b'\xff\xfe\0\0', 'utf-32'),
                 (b'\0\0\xfe\xff', 'utf-32be'),
    name = None

    #: Shortcuts for the lexer
    aliases = []

    #: File name globs
    filenames = []

    #: Secondary file name globs
    alias_filenames = []

    #: MIME types
    mimetypes = []

    #: Priority, used if multiple lexers match and no content is provided
    priority = 0
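
    # Illustrative values for the attributes above (a hypothetical lexer,
    # not part of this module):
    #
    #     class ExampleLexer(RegexLexer):
    #         name = 'Example'
    #         aliases = ['example', 'ex']
    #         filenames = ['*.ex']
    #         alias_filenames = ['*.txt']
    #         mimetypes = ['text/x-example']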

    def __init__(self, **options):
        self.options = options
        self.stripnl = get_bool_opt(options, 'stripnl', True)
        self.stripall = get_bool_opt(options, 'stripall', False)
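
        # Options arrive as plain keyword arguments; for instance (with a
        # hypothetical ExampleLexer):
        #
        #     lexer = ExampleLexer(stripall=True, stripnl=False)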
                    decoded = str(text, enc.get('encoding') or 'utf-8',
                                  errors='replace')
                text = decoded
            else:
                text = text.decode(self.encoding)
        else:
            if text.startswith('\ufeff'):
                text = text[len('\ufeff'):]

        # text now *is* a unicode string
        text = text.replace('\r\n', '\n')
        text = text.replace('\r', '\n')
        if self.stripall:
            text = text.strip()
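
        # Because of the normalization above, '\r\n' and '\r' line endings
        # tokenize exactly like '\n'; e.g. (illustrative):
        #
        #     list(lexer.get_tokens('a\r\nb')) == list(lexer.get_tokens('a\nb'))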
                # it's a state reference
                assert tdef != state, "circular state reference %r" % state
                tokens.extend(cls._process_state(unprocessed, processed,
                                                 str(tdef)))
                continue
            if isinstance(tdef, _inherit):
                # processed already
                continue

            assert type(tdef) is tuple, "wrong rule def %r" % tdef

            try:
                rex = cls._process_regex(tdef[0], rflags)
        processed = cls._all_tokens[name] = {}
        tokendefs = tokendefs or cls.tokens[name]
        for state in list(tokendefs.keys()):
            cls._process_state(tokendefs, processed, state)
        return processed
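
    # Sketch of the result shape (not literal output): each state name maps
    # to a list of processed rules, roughly
    #
    #     {'root': [(compiled_regex.match, action, new_state), ...]}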

    def get_tokendefs(cls):
        """
        Merge tokens from superclasses in MRO order, returning a single
        tokendef dictionary.

        Any state that is not defined by a subclass will be inherited
        automatically. States that *are* defined by subclasses will, by
        default, override that state in the superclass. If a subclass wishes
        to inherit definitions from a superclass, it can use the special
        value "inherit", which will cause the superclass' state definition
        to be included at that point in the state.
        """
        tokens = {}
        inheritable = {}
        # cls.__mro__ already begins with cls itself, so iterating the MRO
        # visits each class exactly once
        for c in cls.__mro__:
            toks = c.__dict__.get('tokens', {})

            for state, items in toks.items():
                curitems = tokens.get(state)
                if curitems is None:
                    tokens[state] = items
                    try:
                        inherit_ndx = items.index(inherit)
                    except ValueError:
                        continue
                    inheritable[state] = inherit_ndx
                    continue

                inherit_ndx = inheritable.pop(state, None)
                if inherit_ndx is None:
                    continue

                # Replace the "inherit" value with the items
                curitems[inherit_ndx:inherit_ndx+1] = items
                try:
                    new_inh_ndx = items.index(inherit)
                except ValueError:
                    pass
                else:
                    inheritable[state] = inherit_ndx + new_inh_ndx

        return tokens
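
    # Illustrative use of "inherit" (hypothetical lexers, not part of this
    # module): the subclass keeps its own rules and splices the superclass'
    # 'root' rules in at the marker:
    #
    #     class BaseLexer(RegexLexer):
    #         tokens = {
    #             'root': [(r'\d+', Number)],
    #         }
    #
    #     class ChildLexer(BaseLexer):
    #         tokens = {
    #             'root': [
    #                 (r'[a-z]+', Name),
    #                 inherit,    # BaseLexer's 'root' rules land here
    #             ],
    #         }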

    def __call__(cls, *args, **kwds):
        """Instantiate cls after preprocessing its token definitions."""
        if '_tokens' not in cls.__dict__:
            cls._all_tokens = {}
            cls._tmpname = 0
            if hasattr(cls, 'token_variants') and cls.token_variants:
                # don't process yet
                pass
            else:
                cls._tokens = cls.process_tokendef('', cls.get_tokendefs())

        return type.__call__(cls, *args, **kwds)


class RegexLexer(Lexer, metaclass=RegexLexerMeta):
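    # A minimal, illustrative subclass (hypothetical; String and Text come
    # from pygments.token):
    #
    #     class StringDemoLexer(RegexLexer):
    #         name = 'StringDemo'
    #         tokens = {
    #             'root': [
    #                 (r'"', String, 'string'),    # push the 'string' state
    #                 (r'\s+', Text),
    #             ],
    #             'string': [
    #                 (r'"', String, '#pop'),      # pop back to 'root'
    #                 (r'[^"]+', String),
    #             ],
    #         }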
                    break
            else:
                try:
                    if text[pos] == '\n':
                        # at EOL, reset state to "root"
                        statestack = ['root']
                        statetokens = tokendefs['root']
                        yield pos, Text, '\n'
                        pos += 1
                        continue
                    yield pos, Error, text[pos]
                    pos += 1
                except IndexError:
                    break
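
        # Note the recovery behaviour above: a character no rule matches is
        # emitted as a one-character Error token and scanning resumes at the
        # next character, so malformed input cannot stall the lexer.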
                                statetokens = tokendefs[ctx.stack[-1]]
                                # CAUTION: callback must set ctx.pos!
                    if new_state is not None:
                        # state transition
                        if isinstance(new_state, tuple):
                            for state in new_state:
                                if state == '#pop':
                                    ctx.stack.pop()
                                elif state == '#push':
                                    # push a copy of the current top state
                                    # ('statestack' is not defined in this
                                    # method; use ctx.stack)
                                    ctx.stack.append(ctx.stack[-1])
                                else:
                                    ctx.stack.append(state)
                        elif isinstance(new_state, int):
                            # pop
                            del ctx.stack[new_state:]
                        elif new_state == '#push':
                            ctx.stack.append(ctx.stack[-1])
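
                        # An illustrative rule using a tuple new_state
                        # (hypothetical state name): '#pop' plus a push
                        # replaces the current state in a single transition:
                        #
                        #     (r'\)', Punctuation, ('#pop', 'afterparen'))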
                try:
                    if ctx.pos >= ctx.end:
                        break
                    if text[ctx.pos] == '\n':
                        # at EOL, reset state to "root"
                        ctx.stack = ['root']
                        statetokens = tokendefs['root']
                        yield ctx.pos, Text, '\n'
                        ctx.pos += 1
                        continue
                    yield ctx.pos, Error, text[ctx.pos]
                    ctx.pos += 1
                except IndexError:
                    break
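
# Illustrative driver (MyExtLexer is a hypothetical ExtendedRegexLexer
# subclass): an explicit LexerContext lets a caller resume lexing from a
# saved position and state stack:
#
#     ctx = LexerContext(text, 0)
#     for index, token, value in MyExtLexer().get_tokens_unprocessed(context=ctx):
#         print(index, token, value)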