--- a/pygments/lexer.py
+++ b/pygments/lexer.py
@@ -3,25 +3,23 @@
     pygments.lexer
     ~~~~~~~~~~~~~~
 
     Base lexer classes.
 
-    :copyright: Copyright 2006-2017 by the Pygments team, see AUTHORS.
+    :copyright: Copyright 2006-2019 by the Pygments team, see AUTHORS.
     :license: BSD, see LICENSE for details.
 """
-
-from __future__ import print_function
 
 import re
 import sys
 import time
 
 from pygments.filter import apply_filters, Filter
 from pygments.filters import get_filter_by_name
 from pygments.token import Error, Text, Other, _TokenType
 from pygments.util import get_bool_opt, get_int_opt, get_list_opt, \
-    make_analysator, text_type, add_metaclass, iteritems, Future, guess_decode
+    make_analysator, Future, guess_decode
 from pygments.regexopt import regex_opt
 
 __all__ = ['Lexer', 'RegexLexer', 'ExtendedRegexLexer', 'DelegatingLexer',
            'LexerContext', 'include', 'inherit', 'bygroups', 'using', 'this',
            'default', 'words']
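
The dropped pygments.util names were Python 2/3 compatibility shims; under
Python 3 they reduce to builtins, so the hunk above simply deletes them. A
minimal sketch of the equivalences (illustrative, not the shim sources):

    text_type = str                    # was: pygments.util.text_type

    d = {'root': []}
    for state, items in d.items():     # was: iteritems(d)
        pass

    class Meta(type):
        pass

    class C(metaclass=Meta):           # was: @add_metaclass(Meta) on class C
        pass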
@@ -46,12 +44,11 @@
         if 'analyse_text' in d:
             d['analyse_text'] = make_analysator(d['analyse_text'])
         return type.__new__(mcs, name, bases, d)
 
 
-@add_metaclass(LexerMeta)
-class Lexer(object):
+class Lexer(metaclass=LexerMeta):
     """
     Lexer for a specific language.
 
     Basic options recognized:
     ``stripnl``
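
For context: `metaclass=LexerMeta` is the Python 3 spelling of what
`add_metaclass` emulated, and the wrapped `analyse_text` hook is, by
convention, written without `self` (make_analysator turns it into a static
scorer clamped to 0.0-1.0). A hedged sketch of that convention, using a
hypothetical lexer name:

    from pygments.lexer import Lexer

    class MyLexer(Lexer):
        name = 'My'

        def analyse_text(text):        # no 'self': LexerMeta wraps this
            if text.startswith('#!'):  # into a static 0.0-1.0 scorer
                return 1.0
            return 0.0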
@@ -143,11 +140,11 @@
         is bypassed even if filters are defined.
 
         Also preprocess the text, i.e. expand tabs and strip it if
         wanted and applies registered filters.
         """
-        if not isinstance(text, text_type):
+        if not isinstance(text, str):
             if self.encoding == 'guess':
                 text, _ = guess_decode(text)
             elif self.encoding == 'chardet':
                 try:
                     import chardet
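
A minimal usage sketch of the decode paths above: with `encoding='guess'`
the lexer BOM-sniffs and falls back via guess_decode, while
`encoding='chardet'` defers to the third-party chardet package (assumed
installed here):

    from pygments.lexers import PythonLexer

    lexer = PythonLexer(encoding='guess')   # or encoding='chardet'
    tokens = list(lexer.get_tokens(b'print("caf\xc3\xa9")'))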
@@ -273,11 +270,11 @@
     def __init__(self, *args):
         # tuple.__init__ doesn't do anything
         pass
 
 
-class _PseudoMatch(object):
+class _PseudoMatch:
     """
     A pseudo match object constructed from a string.
     """
 
     def __init__(self, start, text):
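
_PseudoMatch is an internal helper (not public API) that fakes just enough
of a re.Match for callbacks such as those built by bygroups(). Based on its
definition, it behaves roughly like this sketch:

    from pygments.lexer import _PseudoMatch  # private, shown for illustration

    m = _PseudoMatch(5, 'foo')   # pretend 'foo' matched at offset 5
    assert m.start() == 5
    assert m.end() == 8          # start + len(text)
    assert m.group() == 'foo'    # only group 0 is available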
@@ -534,11 +532,11 @@
         tokens = {}
         inheritable = {}
         for c in cls.__mro__:
             toks = c.__dict__.get('tokens', {})
 
-            for state, items in iteritems(toks):
+            for state, items in toks.items():
                 curitems = tokens.get(state)
                 if curitems is None:
                     # N.b. because this is assigned by reference, sufficiently
                     # deep hierarchies are processed incrementally (e.g. for
                     # A(B), B(C), C(RegexLexer), B will be premodified so X(B)
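
This MRO walk is what makes token inheritance work: a subclass can splice
its parent's rules into a state with the `inherit` marker. A short sketch
with hypothetical lexer names:

    from pygments.lexer import RegexLexer, inherit
    from pygments.token import Comment, Text

    class BaseLexer(RegexLexer):
        tokens = {
            'root': [
                (r'\s+', Text),
            ],
        }

    class DerivedLexer(BaseLexer):
        tokens = {
            'root': [
                (r'#.*', Comment),
                inherit,    # BaseLexer's 'root' rules are inserted here
            ],
        }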
@@ -580,12 +578,11 @@
             cls._tokens = cls.process_tokendef('', cls.get_tokendefs())
 
         return type.__call__(cls, *args, **kwds)
 
 
-@add_metaclass(RegexLexerMeta)
-class RegexLexer(Lexer):
+class RegexLexer(Lexer, metaclass=RegexLexerMeta):
     """
     Base for simple stateful regular expression-based lexers.
     Simplifies the lexing process so that you need only
     provide a list of states and regular expressions.
     """
@@ -637,18 +634,24 @@
                     if new_state is not None:
                         # state transition
                         if isinstance(new_state, tuple):
                             for state in new_state:
                                 if state == '#pop':
-                                    statestack.pop()
+                                    if len(statestack) > 1:
+                                        statestack.pop()
                                 elif state == '#push':
                                     statestack.append(statestack[-1])
                                 else:
                                     statestack.append(state)
                         elif isinstance(new_state, int):
-                            # pop
-                            del statestack[new_state:]
+                            # pop, but keep at least one state on the stack
+                            # (random code leading to unexpected pops should
+                            # not allow exceptions)
+                            if abs(new_state) >= len(statestack):
+                                del statestack[1:]
+                            else:
+                                del statestack[new_state:]
                         elif new_state == '#push':
                             statestack.append(statestack[-1])
                         else:
                             assert False, "wrong state def: %r" % new_state
                         statetokens = tokendefs[statestack[-1]]
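
The guarded pops above matter for rules that transition states. A rule's
third element may push a named state, '#pop', '#push', or a negative int
such as -2 (pop two states); the new checks keep 'root' on the stack even
if malformed input pops too often. A sketch with a hypothetical lexer:

    from pygments.lexer import RegexLexer
    from pygments.token import String, Text

    class QuoteLexer(RegexLexer):
        name = 'Quote'
        tokens = {
            'root': [
                (r'"', String, 'string'),   # push 'string'
                (r'[^"]+', Text),
            ],
            'string': [
                (r'[^"]+', String),
                (r'"', String, '#pop'),     # pop back to 'root'
            ],
        }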
@@ -722,18 +725,22 @@
                     if new_state is not None:
                         # state transition
                         if isinstance(new_state, tuple):
                             for state in new_state:
                                 if state == '#pop':
-                                    ctx.stack.pop()
+                                    if len(ctx.stack) > 1:
+                                        ctx.stack.pop()
                                 elif state == '#push':
                                     ctx.stack.append(ctx.stack[-1])
                                 else:
                                     ctx.stack.append(state)
                         elif isinstance(new_state, int):
-                            # pop
-                            del ctx.stack[new_state:]
+                            # see RegexLexer for why this check is made
+                            if abs(new_state) >= len(ctx.stack):
+                                del ctx.stack[1:]
+                            else:
+                                del ctx.stack[new_state:]
                         elif new_state == '#push':
                             ctx.stack.append(ctx.stack[-1])
                         else:
                             assert False, "wrong state def: %r" % new_state
                         statetokens = tokendefs[ctx.stack[-1]]
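
This second copy of the transition logic lives in
ExtendedRegexLexer.get_tokens_unprocessed, which threads a LexerContext
through callbacks instead of a bare position. A sketch of that callback
style (hypothetical names; callbacks receive the context and must advance
ctx.pos themselves):

    from pygments.lexer import ExtendedRegexLexer
    from pygments.token import Name, Text

    def ident_callback(lexer, match, ctx):
        yield match.start(), Name, match.group()
        ctx.pos = match.end()   # callbacks advance the position by hand
        # ctx.stack may also be edited here to switch states directly

    class CtxLexer(ExtendedRegexLexer):
        name = 'Ctx'
        tokens = {
            'root': [
                (r'[a-zA-Z_]\w*', ident_callback),
                (r'\s+', Text),
            ],
        }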