ThirdParty/Pygments/pygments/lexer.py

changeset: 4172:4f20dba37ab6
parent:    3484:645c12de6b0c
child:     4697:c2e9bf425554

--- pygments/lexer.py (4170:8bc578136279)
+++ pygments/lexer.py (4172:4f20dba37ab6)
@@ -3 +3 @@
     pygments.lexer
     ~~~~~~~~~~~~~~

     Base lexer classes.

-    :copyright: Copyright 2006-2013 by the Pygments team, see AUTHORS.
+    :copyright: Copyright 2006-2014 by the Pygments team, see AUTHORS.
     :license: BSD, see LICENSE for details.
 """
-try:
-    str = unicode
-except NameError:
-    basestring = str
-
-import re, itertools
+
+from __future__ import print_function
+
+import re
+import sys
+import time
+import itertools

 from pygments.filter import apply_filters, Filter
 from pygments.filters import get_filter_by_name
 from pygments.token import Error, Text, Other, _TokenType
 from pygments.util import get_bool_opt, get_int_opt, get_list_opt, \
-    make_analysator
-import collections
-
+    make_analysator, text_type, add_metaclass, iteritems, Future, guess_decode
+from pygments.regexopt import regex_opt

 __all__ = ['Lexer', 'RegexLexer', 'ExtendedRegexLexer', 'DelegatingLexer',
-           'LexerContext', 'include', 'inherit', 'bygroups', 'using', 'this']
+           'LexerContext', 'include', 'inherit', 'bygroups', 'using', 'this',
+           'default', 'words']


-_encoding_map = [('\xef\xbb\xbf', 'utf-8'),
-                 ('\xff\xfe\0\0', 'utf-32'),
-                 ('\0\0\xfe\xff', 'utf-32be'),
-                 ('\xff\xfe', 'utf-16'),
-                 ('\xfe\xff', 'utf-16be')]
+_encoding_map = [(b'\xef\xbb\xbf', 'utf-8'),
+                 (b'\xff\xfe\0\0', 'utf-32'),
+                 (b'\0\0\xfe\xff', 'utf-32be'),
+                 (b'\xff\xfe', 'utf-16'),
+                 (b'\xfe\xff', 'utf-16be')]

 _default_analyse = staticmethod(lambda x: 0.0)
-
-
-def with_metaclass(meta, base=object):
-    """
-    Python independent version to create a base class with a metaclass.
-    Taken from six 1.3.0 (http://pythonhosted.org/six)
-    """
-    return meta("NewBase", (base,), {})


 class LexerMeta(type):
     """
     This metaclass automagically converts ``analyse_text`` methods into
@@ -54 +47 @@
         if 'analyse_text' in d:
             d['analyse_text'] = make_analysator(d['analyse_text'])
         return type.__new__(cls, name, bases, d)


-class Lexer(with_metaclass(LexerMeta, object)):
+@add_metaclass(LexerMeta)
+class Lexer(object):
     """
     Lexer for a specific language.

     Basic options recognized:
     ``stripnl``
@@ -67 +61 @@
         Strip all leading and trailing whitespace from the input
         (default: False).
     ``ensurenl``
         Make sure that the input ends with a newline (default: True). This
         is required for some lexers that consume input linewise.
-        *New in Pygments 1.3.*
+
+        .. versionadded:: 1.3
+
     ``tabsize``
         If given and greater than 0, expand tabs in the input (default: 0).
     ``encoding``
         If given, must be an encoding name. This encoding will be used to
         convert the input string to Unicode, if it is not already a Unicode
-        string (default: ``'latin1'``).
-        Can also be ``'guess'`` to use a simple UTF-8 / Latin1 detection, or
-        ``'chardet'`` to use the chardet library, if it is installed.
+        string (default: ``'guess'``, which uses a simple UTF-8 / Locale /
+        Latin1 detection. Can also be ``'chardet'`` to use the chardet
+        library, if it is installed.
+    ``inencoding``
+        Overrides the ``encoding`` if given.
     """

     #: Name of the lexer
     name = None

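For orientation, a short usage sketch of the options documented above; PythonLexer and the sample inputs are illustrative assumptions, not part of this changeset:

    # Hypothetical sketch: the default encoding is now 'guess'.
    from pygments.lexers import PythonLexer

    lexer = PythonLexer()                                 # encoding='guess' by default
    tokens = list(lexer.get_tokens(b'print("hi")\n'))     # bytes are decoded automatically

    chardet_lexer = PythonLexer(encoding='chardet')       # use chardet, if installed
    forced = PythonLexer(inencoding='latin1')             # inencoding overrides encoding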
@@ -102 +100 @@
         self.options = options
         self.stripnl = get_bool_opt(options, 'stripnl', True)
         self.stripall = get_bool_opt(options, 'stripall', False)
         self.ensurenl = get_bool_opt(options, 'ensurenl', True)
         self.tabsize = get_int_opt(options, 'tabsize', 0)
-        self.encoding = options.get('encoding', 'latin1')
-        # self.encoding = options.get('inencoding', None) or self.encoding
+        self.encoding = options.get('encoding', 'guess')
+        self.encoding = options.get('inencoding') or self.encoding
         self.filters = []
         for filter_ in get_list_opt(options, 'filters', ()):
             self.add_filter(filter_)

     def __repr__(self):
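A hedged sketch of the remaining constructor options handled here; the lexer and filter names are only illustrative:

    # Hypothetical sketch: whitespace/tab handling and filters are plain
    # keyword options consumed by Lexer.__init__.
    from pygments.lexers import PythonLexer

    lexer = PythonLexer(stripall=True, tabsize=4, ensurenl=True)
    lexer.add_filter('keywordcase', case='upper')   # same effect as filters=[...]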
@@ -146 +144 @@
         is bypassed even if filters are defined.

         Also preprocess the text, i.e. expand tabs and strip it if
         wanted and applies registered filters.
         """
-        if not isinstance(text, str):
+        if not isinstance(text, text_type):
             if self.encoding == 'guess':
-                try:
-                    text = text.decode('utf-8')
-                    if text.startswith('\ufeff'):
-                        text = text[len('\ufeff'):]
-                except UnicodeDecodeError:
-                    text = text.decode('latin1')
+                text, _ = guess_decode(text)
             elif self.encoding == 'chardet':
                 try:
                     import chardet
                 except ImportError:
                     raise ImportError('To enable chardet encoding guessing, '
@@ -165 +158 @@
                                       'from http://chardet.feedparser.org/')
                 # check for BOM first
                 decoded = None
                 for bom, encoding in _encoding_map:
                     if text.startswith(bom):
-                        decoded = str(text[len(bom):], encoding,
-                                      errors='replace')
+                        decoded = text[len(bom):].decode(encoding, 'replace')
                         break
                 # no BOM found, so use chardet
                 if decoded is None:
                     enc = chardet.detect(text[:1024])  # Guess using first 1KB
-                    decoded = str(text, enc.get('encoding') or 'utf-8',
-                                  errors='replace')
+                    decoded = text.decode(enc.get('encoding') or 'utf-8',
+                                          'replace')
                 text = decoded
             else:
                 text = text.decode(self.encoding)
+                if text.startswith(u'\ufeff'):
+                    text = text[len(u'\ufeff'):]
         else:
-            if text.startswith('\ufeff'):
-                text = text[len('\ufeff'):]
+            if text.startswith(u'\ufeff'):
+                text = text[len(u'\ufeff'):]

         # text now *is* a unicode string
         text = text.replace('\r\n', '\n')
         text = text.replace('\r', '\n')
         if self.stripall:
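The guess_decode helper called above lives in pygments.util and is not shown in this diff; per the docstring earlier in this changeset it tries UTF-8, then the preferred locale encoding, then Latin-1. A minimal sketch, under the assumption that it returns the decoded text together with the encoding it settled on:

    # Assumed interface of pygments.util.guess_decode: bytes in,
    # (unicode_text, encoding_name) out.
    from pygments.util import guess_decode

    text, used_encoding = guess_decode(b'caf\xc3\xa9\n')
    print(used_encoding)   # 'utf-8' for valid UTF-8 input, otherwise a fallback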
@@ -202 +196 @@
             stream = apply_filters(stream, self.filters, self)
         return stream

     def get_tokens_unprocessed(self, text):
         """
-        Return an iterable of (tokentype, value) pairs.
+        Return an iterable of (index, tokentype, value) pairs where "index"
+        is the starting position of the token within the input text.
+
         In subclasses, implement this method as a generator to
         maximize effectiveness.
         """
         raise NotImplementedError

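To illustrate the clarified contract, a small sketch (lexer and input are illustrative):

    # Each tuple carries the character offset of the token in the input.
    from pygments.lexers import PythonLexer

    for index, tokentype, value in PythonLexer().get_tokens_unprocessed('x = 1\n'):
        print(index, tokentype, repr(value))
    # prints something like: 0 Token.Name 'x', 1 Token.Text ' ', 2 Token.Operator '=', ...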
@@ -243 +239 @@
             insertions.append((len(buffered), lng_buffer))
         return do_insertions(insertions,
                              self.root_lexer.get_tokens_unprocessed(buffered))


-#-------------------------------------------------------------------------------
+# ------------------------------------------------------------------------------
 # RegexLexer and ExtendedRegexLexer
 #


 class include(str):
@@ -389 +385 @@
             if ctx:
                 ctx.pos = match.end()
     return callback


+class default:
+    """
+    Indicates a state or state action (e.g. #pop) to apply.
+    For example default('#pop') is equivalent to ('', Token, '#pop')
+    Note that state tuples may be used as well.
+
+    .. versionadded:: 2.0
+    """
+    def __init__(self, state):
+        self.state = state
+
+
+class words(Future):
+    """
+    Indicates a list of literal words that is transformed into an optimized
+    regex that matches any of the words.
+
+    .. versionadded:: 2.0
+    """
+    def __init__(self, words, prefix='', suffix=''):
+        self.words = words
+        self.prefix = prefix
+        self.suffix = suffix
+
+    def get(self):
+        return regex_opt(self.words, prefix=self.prefix, suffix=self.suffix)
+
+
 class RegexLexerMeta(LexerMeta):
     """
     Metaclass for RegexLexer, creates the self._tokens attribute from
     self.tokens on the first instantiation.
     """

-    def _process_regex(cls, regex, rflags):
+    def _process_regex(cls, regex, rflags, state):
         """Preprocess the regular expression component of a token definition."""
+        if isinstance(regex, Future):
+            regex = regex.get()
         return re.compile(regex, rflags).match

     def _process_token(cls, token):
         """Preprocess the token component of a token definition."""
-        assert type(token) is _TokenType or isinstance(token, collections.Callable), \
+        assert type(token) is _TokenType or callable(token), \
             'token type must be simple type or callable, not %r' % (token,)
         return token

     def _process_new_state(cls, new_state, unprocessed, processed):
         """Preprocess the state transition action of a token definition."""
-        if isinstance(new_state, basestring):
+        if isinstance(new_state, str):
             # an existing state
             if new_state == '#pop':
                 return -1
             elif new_state in unprocessed:
                 return (new_state,)
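The default and words helpers added above are meant to be used inside a lexer's tokens table; a hedged sketch with an invented mini-language:

    # Illustrative only: 'default' falls through to another state without
    # consuming input, 'words' compiles a literal word list via regex_opt.
    from pygments.lexer import RegexLexer, words, default
    from pygments.token import Keyword, Name, Text

    class MiniLexer(RegexLexer):
        name = 'Mini'
        tokens = {
            'root': [
                (words(('if', 'else', 'while'), suffix=r'\b'), Keyword),
                (r'\s+', Text),
                default('ident'),          # no regex needed; just switch state
            ],
            'ident': [
                (r'[A-Za-z_]\w*', Name, '#pop'),
            ],
        }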
@@ -435 +461 @@
         elif isinstance(new_state, tuple):
             # push more than one state
             for istate in new_state:
                 assert (istate in unprocessed or
                         istate in ('#pop', '#push')), \
                     'unknown new state ' + istate
             return new_state
         else:
             assert False, 'unknown new state def %r' % new_state

     def _process_state(cls, unprocessed, processed, state):
         """Preprocess a single state definition."""
-        assert isinstance(state, basestring), "wrong state name %r" % state
+        assert type(state) is str, "wrong state name %r" % state
         assert state[0] != '#', "invalid state name %r" % state
         if state in processed:
             return processed[state]
         tokens = processed[state] = []
         rflags = cls.flags
@@ -456 +482 @@
                 assert tdef != state, "circular state reference %r" % state
                 tokens.extend(cls._process_state(unprocessed, processed,
                                                  str(tdef)))
                 continue
             if isinstance(tdef, _inherit):
-                # processed already
+                # should be processed already, but may not in the case of:
+                # 1. the state has no counterpart in any parent
+                # 2. the state includes more than one 'inherit'
                 continue
+            if isinstance(tdef, default):
+                new_state = cls._process_new_state(tdef.state, unprocessed, processed)
+                tokens.append((re.compile('').match, None, new_state))
+                continue

             assert type(tdef) is tuple, "wrong rule def %r" % tdef

             try:
-                rex = cls._process_regex(tdef[0], rflags)
+                rex = cls._process_regex(tdef[0], rflags, state)
             except Exception as err:
                 raise ValueError("uncompilable regex %r in state %r of %r: %s" %
                                  (tdef[0], state, cls, err))

             token = cls._process_token(tdef[1])
@@ -482 +514 @@

     def process_tokendef(cls, name, tokendefs=None):
         """Preprocess a dictionary of token definitions."""
         processed = cls._all_tokens[name] = {}
         tokendefs = tokendefs or cls.tokens[name]
-        for state in list(tokendefs.keys()):
+        for state in list(tokendefs):
             cls._process_state(tokendefs, processed, state)
         return processed

     def get_tokendefs(cls):
         """
@@ -500 +532 @@
         "inherit", which will cause the superclass' state definition to be
         included at that point in the state.
         """
         tokens = {}
         inheritable = {}
-        for c in itertools.chain((cls,), cls.__mro__):
+        for c in cls.__mro__:
             toks = c.__dict__.get('tokens', {})

-            for state, items in toks.items():
+            for state, items in iteritems(toks):
                 curitems = tokens.get(state)
                 if curitems is None:
+                    # N.b. because this is assigned by reference, sufficiently
+                    # deep hierarchies are processed incrementally (e.g. for
+                    # A(B), B(C), C(RegexLexer), B will be premodified so X(B)
+                    # will not see any inherits in B).
                     tokens[state] = items
                     try:
                         inherit_ndx = items.index(inherit)
                     except ValueError:
                         continue
@@ -521 +557 @@
                         continue

                 # Replace the "inherit" value with the items
                 curitems[inherit_ndx:inherit_ndx+1] = items
                 try:
+                    # N.b. this is the index in items (that is, the superclass
+                    # copy), so offset required when storing below.
                     new_inh_ndx = items.index(inherit)
                 except ValueError:
                     pass
                 else:
                     inheritable[state] = inherit_ndx + new_inh_ndx
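What this merging buys lexer authors is the ability to splice a parent state into a subclass state with the inherit marker; a short sketch with invented rules:

    # Illustrative subclass: 'inherit' marks where the parent's 'root' rules
    # are spliced in by get_tokendefs().
    from pygments.lexer import RegexLexer, inherit
    from pygments.token import Comment, Text

    class BaseLexer(RegexLexer):
        tokens = {
            'root': [
                (r'\s+', Text),
            ],
        }

    class ExtendedLexer(BaseLexer):
        tokens = {
            'root': [
                (r'#.*?$', Comment.Single),   # checked before the inherited rules
                inherit,                      # BaseLexer's 'root' continues here
            ],
        }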
@@ -543 +581 @@
                 cls._tokens = cls.process_tokendef('', cls.get_tokendefs())

         return type.__call__(cls, *args, **kwds)


-class RegexLexer(with_metaclass(RegexLexerMeta, Lexer)):
+@add_metaclass(RegexLexerMeta)
+class RegexLexer(Lexer):
     """
     Base for simple stateful regular expression-based lexers.
     Simplifies the lexing process so that you need only
     provide a list of states and regular expressions.
     """
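As a reminder of the shape such a lexer takes, a minimal sketch with an invented rule set, showing the push/pop transitions the matching loop below dispatches on:

    # Illustrative RegexLexer: a quote pushes the 'string' state, the closing
    # quote pops back to 'root'.
    from pygments.lexer import RegexLexer
    from pygments.token import String, Text

    class TinyStringLexer(RegexLexer):
        name = 'TinyString'
        tokens = {
            'root': [
                (r'"', String, 'string'),    # push 'string'
                (r'\s+', Text),
                (r'[^"\s]+', Text),
            ],
            'string': [
                (r'[^"]+', String),
                (r'"', String, '#pop'),      # pop back to 'root'
            ],
        }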
@@ -587 +626 @@
         statetokens = tokendefs[statestack[-1]]
         while 1:
             for rexmatch, action, new_state in statetokens:
                 m = rexmatch(text, pos)
                 if m:
-                    if type(action) is _TokenType:
-                        yield pos, action, m.group()
-                    else:
-                        for item in action(self, m):
-                            yield item
+                    if action is not None:
+                        if type(action) is _TokenType:
+                            yield pos, action, m.group()
+                        else:
+                            for item in action(self, m):
+                                yield item
                     pos = m.end()
                     if new_state is not None:
                         # state transition
                         if isinstance(new_state, tuple):
                             for state in new_state:
@@ -618 +658 @@
                 try:
                     if text[pos] == '\n':
                         # at EOL, reset state to "root"
                         statestack = ['root']
                         statetokens = tokendefs['root']
-                        yield pos, Text, '\n'
+                        yield pos, Text, u'\n'
                         pos += 1
                         continue
                     yield pos, Error, text[pos]
                     pos += 1
                 except IndexError:
@@ -635 +675 @@
     """

     def __init__(self, text, pos, stack=None, end=None):
         self.text = text
         self.pos = pos
         self.end = end or len(text)  # end=0 not supported ;-)
         self.stack = stack or ['root']

     def __repr__(self):
         return 'LexerContext(%r, %r, %r)' % (
             self.text, self.pos, self.stack)
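A LexerContext can be handed to an ExtendedRegexLexer to start, or resume, lexing from an arbitrary position and state stack; a hedged sketch, assuming the usual get_tokens_unprocessed(text=None, context=None) signature and an invented rule set:

    # Illustrative only: a trivial ExtendedRegexLexer plus a LexerContext that
    # records where lexing stopped.
    from pygments.lexer import ExtendedRegexLexer, LexerContext
    from pygments.token import Name, Text

    class TinyExtLexer(ExtendedRegexLexer):
        tokens = {
            'root': [
                (r'\w+', Name),
                (r'\s+', Text),
            ],
        }

    ctx = LexerContext(u'two words\n', 0)      # pos=0, stack defaults to ['root']
    for index, tokentype, value in TinyExtLexer().get_tokens_unprocessed(context=ctx):
        print(index, tokentype, repr(value))
    print(ctx.pos, ctx.stack)                  # context reflects the final position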
@@ -665 +705 @@
             text = ctx.text
         while 1:
             for rexmatch, action, new_state in statetokens:
                 m = rexmatch(text, ctx.pos, ctx.end)
                 if m:
-                    if type(action) is _TokenType:
-                        yield ctx.pos, action, m.group()
-                        ctx.pos = m.end()
-                    else:
-                        for item in action(self, m, ctx):
-                            yield item
-                        if not new_state:
-                            # altered the state stack?
-                            statetokens = tokendefs[ctx.stack[-1]]
+                    if action is not None:
+                        if type(action) is _TokenType:
+                            yield ctx.pos, action, m.group()
+                            ctx.pos = m.end()
+                        else:
+                            for item in action(self, m, ctx):
+                                yield item
+                            if not new_state:
+                                # altered the state stack?
+                                statetokens = tokendefs[ctx.stack[-1]]
                     # CAUTION: callback must set ctx.pos!
                     if new_state is not None:
                         # state transition
                         if isinstance(new_state, tuple):
                             for state in new_state:
                                 if state == '#pop':
                                     ctx.stack.pop()
                                 elif state == '#push':
-                                    ctx.stack.append(statestack[-1])
+                                    ctx.stack.append(ctx.stack[-1])
                                 else:
                                     ctx.stack.append(state)
                         elif isinstance(new_state, int):
                             # pop
                             del ctx.stack[new_state:]
@@ -702 +743 @@
                         break
                     if text[ctx.pos] == '\n':
                         # at EOL, reset state to "root"
                         ctx.stack = ['root']
                         statetokens = tokendefs['root']
-                        yield ctx.pos, Text, '\n'
+                        yield ctx.pos, Text, u'\n'
                         ctx.pos += 1
                         continue
                     yield ctx.pos, Error, text[ctx.pos]
                     ctx.pos += 1
                 except IndexError:
@@ -772 +813 @@
         try:
             index, itokens = next(insertions)
         except StopIteration:
             insleft = False
             break  # not strictly necessary
+
+
+class ProfilingRegexLexerMeta(RegexLexerMeta):
+    """Metaclass for ProfilingRegexLexer, collects regex timing info."""
+
+    def _process_regex(cls, regex, rflags, state):
+        if isinstance(regex, words):
+            rex = regex_opt(regex.words, prefix=regex.prefix,
+                            suffix=regex.suffix)
+        else:
+            rex = regex
+        compiled = re.compile(rex, rflags)
+
+        def match_func(text, pos, endpos=sys.maxsize):
+            info = cls._prof_data[-1].setdefault((state, rex), [0, 0.0])
+            t0 = time.time()
+            res = compiled.match(text, pos, endpos)
+            t1 = time.time()
+            info[0] += 1
+            info[1] += t1 - t0
+            return res
+        return match_func
+
+
+@add_metaclass(ProfilingRegexLexerMeta)
+class ProfilingRegexLexer(RegexLexer):
+    """Drop-in replacement for RegexLexer that does profiling of its regexes."""
+
+    _prof_data = []
+    _prof_sort_index = 4  # defaults to time per call
+
+    def get_tokens_unprocessed(self, text, stack=('root',)):
+        # this needs to be a stack, since using(this) will produce nested calls
+        self.__class__._prof_data.append({})
+        for tok in RegexLexer.get_tokens_unprocessed(self, text, stack):
+            yield tok
+        rawdata = self.__class__._prof_data.pop()
+        data = sorted(((s, repr(r).strip('u\'').replace('\\\\', '\\')[:65],
+                        n, 1000 * t, 1000 * t / n)
+                       for ((s, r), (n, t)) in rawdata.items()),
+                      key=lambda x: x[self._prof_sort_index],
+                      reverse=True)
+        sum_total = sum(x[3] for x in data)
+
+        print()
+        print('Profiling result for %s lexing %d chars in %.3f ms' %
+              (self.__class__.__name__, len(text), sum_total))
+        print('=' * 110)
+        print('%-20s %-64s ncalls tottime percall' % ('state', 'regex'))
+        print('-' * 110)
+        for d in data:
+            print('%-20s %-65s %5d %8.4f %8.4f' % d)
+        print('=' * 110)
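A hedged sketch of how the profiler is meant to be used: derive a lexer from ProfilingRegexLexer and the table above is printed once the token generator is exhausted; the rule set below is illustrative:

    # Illustrative: swap RegexLexer for ProfilingRegexLexer to get per-regex timings.
    from pygments.lexer import ProfilingRegexLexer
    from pygments.token import Name, Text

    class ProfiledLexer(ProfilingRegexLexer):
        name = 'Profiled'
        tokens = {
            'root': [
                (r'\w+', Name),
                (r'\s+', Text),
            ],
        }

    for _ in ProfiledLexer().get_tokens_unprocessed('some sample input ' * 1000):
        pass   # the profiling table is printed when the generator is exhausted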
