eric6/ThirdParty/Pygments/pygments/lexer.py

changeset 6942:2602857055c5
parent    5713:6762afd9f963
child     7547:21b0534faebc
1 # -*- coding: utf-8 -*-
2 """
3 pygments.lexer
4 ~~~~~~~~~~~~~~
5
6 Base lexer classes.
7
8 :copyright: Copyright 2006-2017 by the Pygments team, see AUTHORS.
9 :license: BSD, see LICENSE for details.
10 """
11
12 from __future__ import print_function
13
14 import re
15 import sys
16 import time
17
18 from pygments.filter import apply_filters, Filter
19 from pygments.filters import get_filter_by_name
20 from pygments.token import Error, Text, Other, _TokenType
21 from pygments.util import get_bool_opt, get_int_opt, get_list_opt, \
22 make_analysator, text_type, add_metaclass, iteritems, Future, guess_decode
23 from pygments.regexopt import regex_opt
24
25 __all__ = ['Lexer', 'RegexLexer', 'ExtendedRegexLexer', 'DelegatingLexer',
26 'LexerContext', 'include', 'inherit', 'bygroups', 'using', 'this',
27 'default', 'words']
28
29
30 _encoding_map = [(b'\xef\xbb\xbf', 'utf-8'),
31 (b'\xff\xfe\0\0', 'utf-32'),
32 (b'\0\0\xfe\xff', 'utf-32be'),
33 (b'\xff\xfe', 'utf-16'),
34 (b'\xfe\xff', 'utf-16be')]
35
36 _default_analyse = staticmethod(lambda x: 0.0)
37
38
39 class LexerMeta(type):
40 """
41 This metaclass automagically converts ``analyse_text`` methods into
42 static methods which always return float values.
43 """
44
45 def __new__(mcs, name, bases, d):
46 if 'analyse_text' in d:
47 d['analyse_text'] = make_analysator(d['analyse_text'])
48 return type.__new__(mcs, name, bases, d)
49
50
51 @add_metaclass(LexerMeta)
52 class Lexer(object):
53 """
54 Lexer for a specific language.
55
56 Basic options recognized:
57 ``stripnl``
58 Strip leading and trailing newlines from the input (default: True).
59 ``stripall``
60 Strip all leading and trailing whitespace from the input
61 (default: False).
62 ``ensurenl``
63 Make sure that the input ends with a newline (default: True). This
64 is required for some lexers that consume input linewise.
65
66 .. versionadded:: 1.3
67
68 ``tabsize``
69 If given and greater than 0, expand tabs in the input (default: 0).
70 ``encoding``
71 If given, must be an encoding name. This encoding will be used to
72 convert the input string to Unicode, if it is not already a Unicode
73 string (default: ``'guess'``, which uses a simple UTF-8 / Locale /
74 Latin1 detection). Can also be ``'chardet'`` to use the chardet
75 library, if it is installed.
76 ``inencoding``
77 Overrides the ``encoding`` if given.
78 """
79
80 #: Name of the lexer
81 name = None
82
83 #: Shortcuts for the lexer
84 aliases = []
85
86 #: File name globs
87 filenames = []
88
89 #: Secondary file name globs
90 alias_filenames = []
91
92 #: MIME types
93 mimetypes = []
94
95 #: Priority, should multiple lexers match and no content is provided
96 priority = 0
97
98 def __init__(self, **options):
99 self.options = options
100 self.stripnl = get_bool_opt(options, 'stripnl', True)
101 self.stripall = get_bool_opt(options, 'stripall', False)
102 self.ensurenl = get_bool_opt(options, 'ensurenl', True)
103 self.tabsize = get_int_opt(options, 'tabsize', 0)
104 self.encoding = options.get('encoding', 'guess')
105 self.encoding = options.get('inencoding') or self.encoding
106 self.filters = []
107 for filter_ in get_list_opt(options, 'filters', ()):
108 self.add_filter(filter_)
109
110 def __repr__(self):
111 if self.options:
112 return '<pygments.lexers.%s with %r>' % (self.__class__.__name__,
113 self.options)
114 else:
115 return '<pygments.lexers.%s>' % self.__class__.__name__
116
117 def add_filter(self, filter_, **options):
118 """
119 Add a new stream filter to this lexer.
120 """
121 if not isinstance(filter_, Filter):
122 filter_ = get_filter_by_name(filter_, **options)
123 self.filters.append(filter_)
124
125 def analyse_text(text):
126 """
127 Has to return a float between ``0`` and ``1`` that indicates
128 if a lexer wants to highlight this text. Used by ``guess_lexer``.
129 If this method returns ``0`` it won't highlight it in any case; if
130 it returns ``1``, highlighting with this lexer is guaranteed.
131
132 The `LexerMeta` metaclass automatically wraps this function so
133 that it works like a static method (no ``self`` or ``cls``
134 parameter) and the return value is automatically converted to
135 `float`. If the return value is an object that is boolean `False`
136 it's the same as if the return value was ``0.0``.
137 """
138
139 def get_tokens(self, text, unfiltered=False):
140 """
141 Return an iterable of (tokentype, value) pairs generated from
142 `text`. If `unfiltered` is set to `True`, the filtering mechanism
143 is bypassed even if filters are defined.
144
145 Also preprocesses the text: expands tabs, strips it if
146 requested, and applies registered filters.
147 """
148 if not isinstance(text, text_type):
149 if self.encoding == 'guess':
150 text, _ = guess_decode(text)
151 elif self.encoding == 'chardet':
152 try:
153 import chardet
154 except ImportError:
155 raise ImportError('To enable chardet encoding guessing, '
156 'please install the chardet library '
157 'from http://chardet.feedparser.org/')
158 # check for BOM first
159 decoded = None
160 for bom, encoding in _encoding_map:
161 if text.startswith(bom):
162 decoded = text[len(bom):].decode(encoding, 'replace')
163 break
164 # no BOM found, so use chardet
165 if decoded is None:
166 enc = chardet.detect(text[:1024]) # Guess using first 1KB
167 decoded = text.decode(enc.get('encoding') or 'utf-8',
168 'replace')
169 text = decoded
170 else:
171 text = text.decode(self.encoding)
172 if text.startswith(u'\ufeff'):
173 text = text[len(u'\ufeff'):]
174 else:
175 if text.startswith(u'\ufeff'):
176 text = text[len(u'\ufeff'):]
177
178 # text now *is* a unicode string
179 text = text.replace('\r\n', '\n')
180 text = text.replace('\r', '\n')
181 if self.stripall:
182 text = text.strip()
183 elif self.stripnl:
184 text = text.strip('\n')
185 if self.tabsize > 0:
186 text = text.expandtabs(self.tabsize)
187 if self.ensurenl and not text.endswith('\n'):
188 text += '\n'
189
190 def streamer():
191 for _, t, v in self.get_tokens_unprocessed(text):
192 yield t, v
193 stream = streamer()
194 if not unfiltered:
195 stream = apply_filters(stream, self.filters, self)
196 return stream
197
198 def get_tokens_unprocessed(self, text):
199 """
200 Return an iterable of (index, tokentype, value) pairs where "index"
201 is the starting position of the token within the input text.
202
203 In subclasses, implement this method as a generator so that
204 tokens are produced lazily instead of being collected in a list first.
205 """
206 raise NotImplementedError
207
208
209 class DelegatingLexer(Lexer):
210 """
211 This lexer takes two lexers as arguments: a root lexer and
212 a language lexer. First everything is scanned using the language
213 lexer, afterwards all ``Other`` tokens are lexed using the root
214 lexer.
215
216 The lexers from the ``template`` lexer package use this base lexer.
217 """
218
219 def __init__(self, _root_lexer, _language_lexer, _needle=Other, **options):
220 self.root_lexer = _root_lexer(**options)
221 self.language_lexer = _language_lexer(**options)
222 self.needle = _needle
223 Lexer.__init__(self, **options)
224
225 def get_tokens_unprocessed(self, text):
226 buffered = ''
227 insertions = []
228 lng_buffer = []
229 for i, t, v in self.language_lexer.get_tokens_unprocessed(text):
230 if t is self.needle:
231 if lng_buffer:
232 insertions.append((len(buffered), lng_buffer))
233 lng_buffer = []
234 buffered += v
235 else:
236 lng_buffer.append((i, t, v))
237 if lng_buffer:
238 insertions.append((len(buffered), lng_buffer))
239 return do_insertions(insertions,
240 self.root_lexer.get_tokens_unprocessed(buffered))
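A sketch of the pattern referred to above: the template lexers combine a root
lexer and a language lexer roughly like this (HtmlLexer and PhpLexer are stock
Pygments lexers; the class name here is illustrative).

    from pygments.lexer import DelegatingLexer
    from pygments.lexers import HtmlLexer, PhpLexer

    class HtmlWithPhpLexer(DelegatingLexer):
        """Illustrative combined lexer: PHP first, leftover Other text as HTML."""
        name = 'HTML+PHP (illustrative)'

        def __init__(self, **options):
            # The language lexer (PhpLexer) runs first; everything it yields
            # as ``Other`` is buffered and re-lexed by the root HtmlLexer.
            DelegatingLexer.__init__(self, HtmlLexer, PhpLexer, **options)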
241
242
243 # ------------------------------------------------------------------------------
244 # RegexLexer and ExtendedRegexLexer
245 #
246
247
248 class include(str): # pylint: disable=invalid-name
249 """
250 Indicates that a state should include rules from another state.
251 """
252 pass
253
254
255 class _inherit(object):
256 """
257 Indicates that a state should inherit from its superclass.
258 """
259 def __repr__(self):
260 return 'inherit'
261
262 inherit = _inherit() # pylint: disable=invalid-name
263
264
265 class combined(tuple): # pylint: disable=invalid-name
266 """
267 Indicates a state combined from multiple states.
268 """
269
270 def __new__(cls, *args):
271 return tuple.__new__(cls, args)
272
273 def __init__(self, *args):
274 # tuple.__init__ doesn't do anything
275 pass
276
277
278 class _PseudoMatch(object):
279 """
280 A pseudo match object constructed from a string.
281 """
282
283 def __init__(self, start, text):
284 self._text = text
285 self._start = start
286
287 def start(self, arg=None):
288 return self._start
289
290 def end(self, arg=None):
291 return self._start + len(self._text)
292
293 def group(self, arg=None):
294 if arg:
295 raise IndexError('No such group')
296 return self._text
297
298 def groups(self):
299 return (self._text,)
300
301 def groupdict(self):
302 return {}
303
304
305 def bygroups(*args):
306 """
307 Callback that yields multiple actions for each group in the match.
308 """
309 def callback(lexer, match, ctx=None):
310 for i, action in enumerate(args):
311 if action is None:
312 continue
313 elif type(action) is _TokenType:
314 data = match.group(i + 1)
315 if data:
316 yield match.start(i + 1), action, data
317 else:
318 data = match.group(i + 1)
319 if data is not None:
320 if ctx:
321 ctx.pos = match.start(i + 1)
322 for item in action(lexer,
323 _PseudoMatch(match.start(i + 1), data), ctx):
324 if item:
325 yield item
326 if ctx:
327 ctx.pos = match.end()
328 return callback
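A sketch of a rule using bygroups(); the lexer and its rules are illustrative:

    from pygments.lexer import RegexLexer, bygroups
    from pygments.token import Keyword, Name, Text

    class DefLexer(RegexLexer):
        """Hypothetical lexer showing one bygroups() rule."""
        name = 'Def (illustrative)'
        tokens = {
            'root': [
                # Three groups, three actions: each group gets its own token type.
                (r'(def)( +)([a-zA-Z_]\w*)',
                 bygroups(Keyword, Text, Name.Function)),
                (r'.+\n?', Text),
            ],
        }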
329
330
331 class _This(object):
332 """
333 Special singleton used for indicating the caller class.
334 Used by ``using``.
335 """
336 this = _This()
337
338
339 def using(_other, **kwargs):
340 """
341 Callback that processes the match with a different lexer.
342
343 The keyword arguments are forwarded to the lexer, except `state` which
344 is handled separately.
345
346 `state` specifies the state that the new lexer will start in, and can
347 be an enumerable such as ('root', 'inline', 'string') or a simple
348 string, which is then pushed on top of the 'root' state.
349
350 Note: For that to work, `_other` must not be an `ExtendedRegexLexer`.
351 """
352 gt_kwargs = {}
353 if 'state' in kwargs:
354 s = kwargs.pop('state')
355 if isinstance(s, (list, tuple)):
356 gt_kwargs['stack'] = s
357 else:
358 gt_kwargs['stack'] = ('root', s)
359
360 if _other is this:
361 def callback(lexer, match, ctx=None):
362 # if keyword arguments are given the callback
363 # function has to create a new lexer instance
364 if kwargs:
365 # XXX: cache that somehow
366 kwargs.update(lexer.options)
367 lx = lexer.__class__(**kwargs)
368 else:
369 lx = lexer
370 s = match.start()
371 for i, t, v in lx.get_tokens_unprocessed(match.group(), **gt_kwargs):
372 yield i + s, t, v
373 if ctx:
374 ctx.pos = match.end()
375 else:
376 def callback(lexer, match, ctx=None):
377 # XXX: cache that somehow
378 kwargs.update(lexer.options)
379 lx = _other(**kwargs)
380
381 s = match.start()
382 for i, t, v in lx.get_tokens_unprocessed(match.group(), **gt_kwargs):
383 yield i + s, t, v
384 if ctx:
385 ctx.pos = match.end()
386 return callback
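A sketch of using() inside a rule, handing one match group to another lexer;
PythonLexer is the stock lexer, everything else here is illustrative:

    from pygments.lexer import RegexLexer, bygroups, using
    from pygments.lexers import PythonLexer
    from pygments.token import String, Text

    class EvalLexer(RegexLexer):
        """Hypothetical lexer: the body of eval"..." is re-lexed as Python."""
        name = 'Eval (illustrative)'
        tokens = {
            'root': [
                # Group 2 is processed by a fresh PythonLexer instance;
                # using(this, state='...') would re-enter the current lexer.
                (r'(eval")([^"]*)(")',
                 bygroups(String, using(PythonLexer), String)),
                (r'.+\n?', Text),
            ],
        }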
387
388
389 class default:
390 """
391 Indicates a state or state action (e.g. #pop) to apply.
392 For example, ``default('#pop')`` is equivalent to ``('', Token, '#pop')``.
393 Note that state tuples may be used as well.
394
395 .. versionadded:: 2.0
396 """
397 def __init__(self, state):
398 self.state = state
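A sketch of default() as a fall-through transition (all names illustrative):

    from pygments.lexer import RegexLexer, default
    from pygments.token import Comment, Text

    class NoteLexer(RegexLexer):
        """Hypothetical lexer: leave 'note' again when nothing else matches."""
        name = 'Note (illustrative)'
        tokens = {
            'root': [
                (r'NOTE:', Comment.Special, 'note'),
                (r'.+\n?', Text),
            ],
            'note': [
                (r' [^\n]*', Comment),
                # Nothing matched: pop back to 'root' without consuming input.
                default('#pop'),
            ],
        }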
399
400
401 class words(Future):
402 """
403 Indicates a list of literal words that is transformed into an optimized
404 regex that matches any of the words.
405
406 .. versionadded:: 2.0
407 """
408 def __init__(self, words, prefix='', suffix=''):
409 self.words = words
410 self.prefix = prefix
411 self.suffix = suffix
412
413 def get(self):
414 return regex_opt(self.words, prefix=self.prefix, suffix=self.suffix)
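A sketch of words() in a token definition; the word list is illustrative:

    from pygments.lexer import RegexLexer, words
    from pygments.token import Keyword, Text

    class KwLexer(RegexLexer):
        """Hypothetical lexer using an optimized keyword regex."""
        name = 'Kw (illustrative)'
        tokens = {
            'root': [
                # words() is a Future; RegexLexerMeta._process_regex() calls
                # get(), which builds the optimized regex via regex_opt().
                (words(('if', 'elif', 'else', 'while'),
                       prefix=r'\b', suffix=r'\b'), Keyword),
                (r'.+\n?', Text),
            ],
        }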
415
416
417 class RegexLexerMeta(LexerMeta):
418 """
419 Metaclass for RegexLexer, creates the self._tokens attribute from
420 self.tokens on the first instantiation.
421 """
422
423 def _process_regex(cls, regex, rflags, state):
424 """Preprocess the regular expression component of a token definition."""
425 if isinstance(regex, Future):
426 regex = regex.get()
427 return re.compile(regex, rflags).match
428
429 def _process_token(cls, token):
430 """Preprocess the token component of a token definition."""
431 assert type(token) is _TokenType or callable(token), \
432 'token type must be simple type or callable, not %r' % (token,)
433 return token
434
435 def _process_new_state(cls, new_state, unprocessed, processed):
436 """Preprocess the state transition action of a token definition."""
437 if isinstance(new_state, str):
438 # an existing state
439 if new_state == '#pop':
440 return -1
441 elif new_state in unprocessed:
442 return (new_state,)
443 elif new_state == '#push':
444 return new_state
445 elif new_state[:5] == '#pop:':
446 return -int(new_state[5:])
447 else:
448 assert False, 'unknown new state %r' % new_state
449 elif isinstance(new_state, combined):
450 # combine a new state from existing ones
451 tmp_state = '_tmp_%d' % cls._tmpname
452 cls._tmpname += 1
453 itokens = []
454 for istate in new_state:
455 assert istate != new_state, 'circular state ref %r' % istate
456 itokens.extend(cls._process_state(unprocessed,
457 processed, istate))
458 processed[tmp_state] = itokens
459 return (tmp_state,)
460 elif isinstance(new_state, tuple):
461 # push more than one state
462 for istate in new_state:
463 assert (istate in unprocessed or
464 istate in ('#pop', '#push')), \
465 'unknown new state ' + istate
466 return new_state
467 else:
468 assert False, 'unknown new state def %r' % new_state
469
470 def _process_state(cls, unprocessed, processed, state):
471 """Preprocess a single state definition."""
472 assert type(state) is str, "wrong state name %r" % state
473 assert state[0] != '#', "invalid state name %r" % state
474 if state in processed:
475 return processed[state]
476 tokens = processed[state] = []
477 rflags = cls.flags
478 for tdef in unprocessed[state]:
479 if isinstance(tdef, include):
480 # it's a state reference
481 assert tdef != state, "circular state reference %r" % state
482 tokens.extend(cls._process_state(unprocessed, processed,
483 str(tdef)))
484 continue
485 if isinstance(tdef, _inherit):
486 # should already have been processed, but may not have been if:
487 # 1. the state has no counterpart in any parent
488 # 2. the state includes more than one 'inherit'
489 continue
490 if isinstance(tdef, default):
491 new_state = cls._process_new_state(tdef.state, unprocessed, processed)
492 tokens.append((re.compile('').match, None, new_state))
493 continue
494
495 assert type(tdef) is tuple, "wrong rule def %r" % tdef
496
497 try:
498 rex = cls._process_regex(tdef[0], rflags, state)
499 except Exception as err:
500 raise ValueError("uncompilable regex %r in state %r of %r: %s" %
501 (tdef[0], state, cls, err))
502
503 token = cls._process_token(tdef[1])
504
505 if len(tdef) == 2:
506 new_state = None
507 else:
508 new_state = cls._process_new_state(tdef[2],
509 unprocessed, processed)
510
511 tokens.append((rex, token, new_state))
512 return tokens
513
514 def process_tokendef(cls, name, tokendefs=None):
515 """Preprocess a dictionary of token definitions."""
516 processed = cls._all_tokens[name] = {}
517 tokendefs = tokendefs or cls.tokens[name]
518 for state in list(tokendefs):
519 cls._process_state(tokendefs, processed, state)
520 return processed
521
522 def get_tokendefs(cls):
523 """
524 Merge tokens from superclasses in MRO order, returning a single tokendef
525 dictionary.
526
527 Any state that is not defined by a subclass will be inherited
528 automatically. States that *are* defined by subclasses will, by
529 default, override that state in the superclass. If a subclass wishes to
530 inherit definitions from a superclass, it can use the special value
531 "inherit", which will cause the superclass' state definition to be
532 included at that point in the state.
533 """
534 tokens = {}
535 inheritable = {}
536 for c in cls.__mro__:
537 toks = c.__dict__.get('tokens', {})
538
539 for state, items in iteritems(toks):
540 curitems = tokens.get(state)
541 if curitems is None:
542 # N.b. because this is assigned by reference, sufficiently
543 # deep hierarchies are processed incrementally (e.g. for
544 # A(B), B(C), C(RegexLexer), B will be premodified so X(B)
545 # will not see any inherits in B).
546 tokens[state] = items
547 try:
548 inherit_ndx = items.index(inherit)
549 except ValueError:
550 continue
551 inheritable[state] = inherit_ndx
552 continue
553
554 inherit_ndx = inheritable.pop(state, None)
555 if inherit_ndx is None:
556 continue
557
558 # Replace the "inherit" value with the items
559 curitems[inherit_ndx:inherit_ndx+1] = items
560 try:
561 # N.b. this is the index in items (that is, the superclass
562 # copy), so an offset is required when storing below.
563 new_inh_ndx = items.index(inherit)
564 except ValueError:
565 pass
566 else:
567 inheritable[state] = inherit_ndx + new_inh_ndx
568
569 return tokens
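A sketch of the "inherit" mechanism described in the docstring above; both
lexer classes are illustrative:

    from pygments.lexer import RegexLexer, inherit
    from pygments.token import Comment, Keyword, Text

    class BaseConfLexer(RegexLexer):
        name = 'BaseConf (illustrative)'
        tokens = {
            'root': [
                (r'#.*\n', Comment),
                (r'.+\n?', Text),
            ],
        }

    class ExtConfLexer(BaseConfLexer):
        # 'root' overrides the parent state, but the parent's rules are
        # spliced back in at the position of ``inherit``.
        tokens = {
            'root': [
                (r'\binclude\b', Keyword),
                inherit,
            ],
        }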
570
571 def __call__(cls, *args, **kwds):
572 """Instantiate cls after preprocessing its token definitions."""
573 if '_tokens' not in cls.__dict__:
574 cls._all_tokens = {}
575 cls._tmpname = 0
576 if hasattr(cls, 'token_variants') and cls.token_variants:
577 # don't process yet
578 pass
579 else:
580 cls._tokens = cls.process_tokendef('', cls.get_tokendefs())
581
582 return type.__call__(cls, *args, **kwds)
583
584
585 @add_metaclass(RegexLexerMeta)
586 class RegexLexer(Lexer):
587 """
588 Base for simple stateful regular expression-based lexers.
589 Simplifies the lexing process so that you need only
590 provide a list of states and regular expressions.
591 """
592
593 #: Flags for compiling the regular expressions.
594 #: Defaults to MULTILINE.
595 flags = re.MULTILINE
596
597 #: Dict of ``{'state': [(regex, tokentype, new_state), ...], ...}``
598 #:
599 #: The initial state is 'root'.
600 #: ``new_state`` can be omitted to signify no state transition.
601 #: If it is a string, the state is pushed on the stack and changed.
602 #: If it is a tuple of strings, all states are pushed on the stack and
603 #: the current state will be the topmost.
604 #: It can also be ``combined('state1', 'state2', ...)``
605 #: to signify a new, anonymous state combined from the rules of two
606 #: or more existing ones.
607 #: Furthermore, it can be '#pop' to signify going back one step in
608 #: the state stack, or '#push' to push the current state on the stack
609 #: again.
610 #:
611 #: The tuple can also be replaced with ``include('state')``, in which
612 #: case the rules from the state named by the string are included in the
613 #: current one.
614 tokens = {}
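A small, self-contained sketch of the tokens table described above, including
one push/pop transition; the language and all names are illustrative:

    from pygments.lexer import RegexLexer
    from pygments.token import Comment, Keyword, Name, Number, String, Text

    class MiniLexer(RegexLexer):
        """Hypothetical lexer exercising the state machine described above."""
        name = 'Mini (illustrative)'
        aliases = ['mini']
        filenames = ['*.mini']

        tokens = {
            'root': [
                (r'\s+', Text),
                (r'//.*?\n', Comment.Single),
                (r'"', String, 'string'),        # push the 'string' state
                (r'\b(let|if|else)\b', Keyword),
                (r'\d+', Number.Integer),
                (r'[a-zA-Z_]\w*', Name),
                (r'.', Text),
            ],
            'string': [
                (r'[^"\\\n]+', String),
                (r'\\.', String.Escape),
                (r'"', String, '#pop'),          # back to the previous state
            ],
        }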
615
616 def get_tokens_unprocessed(self, text, stack=('root',)):
617 """
618 Split ``text`` into (tokentype, text) pairs.
619
620 ``stack`` is the initial stack (default: ``['root']``).
621 """
622 pos = 0
623 tokendefs = self._tokens
624 statestack = list(stack)
625 statetokens = tokendefs[statestack[-1]]
626 while 1:
627 for rexmatch, action, new_state in statetokens:
628 m = rexmatch(text, pos)
629 if m:
630 if action is not None:
631 if type(action) is _TokenType:
632 yield pos, action, m.group()
633 else:
634 for item in action(self, m):
635 yield item
636 pos = m.end()
637 if new_state is not None:
638 # state transition
639 if isinstance(new_state, tuple):
640 for state in new_state:
641 if state == '#pop':
642 statestack.pop()
643 elif state == '#push':
644 statestack.append(statestack[-1])
645 else:
646 statestack.append(state)
647 elif isinstance(new_state, int):
648 # pop
649 del statestack[new_state:]
650 elif new_state == '#push':
651 statestack.append(statestack[-1])
652 else:
653 assert False, "wrong state def: %r" % new_state
654 statetokens = tokendefs[statestack[-1]]
655 break
656 else:
657 # We are here only if all state tokens have been considered
658 # and there was not a match on any of them.
659 try:
660 if text[pos] == '\n':
661 # at EOL, reset state to "root"
662 statestack = ['root']
663 statetokens = tokendefs['root']
664 yield pos, Text, u'\n'
665 pos += 1
666 continue
667 yield pos, Error, text[pos]
668 pos += 1
669 except IndexError:
670 break
671
672
673 class LexerContext(object):
674 """
675 A helper object that holds lexer position data.
676 """
677
678 def __init__(self, text, pos, stack=None, end=None):
679 self.text = text
680 self.pos = pos
681 self.end = end or len(text) # end=0 not supported ;-)
682 self.stack = stack or ['root']
683
684 def __repr__(self):
685 return 'LexerContext(%r, %r, %r)' % (
686 self.text, self.pos, self.stack)
687
688
689 class ExtendedRegexLexer(RegexLexer):
690 """
691 A RegexLexer that uses a context object to store its state.
692 """
693
694 def get_tokens_unprocessed(self, text=None, context=None):
695 """
696 Split ``text`` into (tokentype, text) pairs.
697 If ``context`` is given, use this lexer context instead.
698 """
699 tokendefs = self._tokens
700 if not context:
701 ctx = LexerContext(text, 0)
702 statetokens = tokendefs['root']
703 else:
704 ctx = context
705 statetokens = tokendefs[ctx.stack[-1]]
706 text = ctx.text
707 while 1:
708 for rexmatch, action, new_state in statetokens:
709 m = rexmatch(text, ctx.pos, ctx.end)
710 if m:
711 if action is not None:
712 if type(action) is _TokenType:
713 yield ctx.pos, action, m.group()
714 ctx.pos = m.end()
715 else:
716 for item in action(self, m, ctx):
717 yield item
718 if not new_state:
719 # altered the state stack?
720 statetokens = tokendefs[ctx.stack[-1]]
721 # CAUTION: callback must set ctx.pos!
722 if new_state is not None:
723 # state transition
724 if isinstance(new_state, tuple):
725 for state in new_state:
726 if state == '#pop':
727 ctx.stack.pop()
728 elif state == '#push':
729 ctx.stack.append(ctx.stack[-1])
730 else:
731 ctx.stack.append(state)
732 elif isinstance(new_state, int):
733 # pop
734 del ctx.stack[new_state:]
735 elif new_state == '#push':
736 ctx.stack.append(ctx.stack[-1])
737 else:
738 assert False, "wrong state def: %r" % new_state
739 statetokens = tokendefs[ctx.stack[-1]]
740 break
741 else:
742 try:
743 if ctx.pos >= ctx.end:
744 break
745 if text[ctx.pos] == '\n':
746 # at EOL, reset state to "root"
747 ctx.stack = ['root']
748 statetokens = tokendefs['root']
749 yield ctx.pos, Text, u'\n'
750 ctx.pos += 1
751 continue
752 yield ctx.pos, Error, text[ctx.pos]
753 ctx.pos += 1
754 except IndexError:
755 break
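A sketch of a context-aware callback of the kind this class is designed for;
the lexer and token choices are illustrative:

    from pygments.lexer import ExtendedRegexLexer
    from pygments.token import String, Text

    class HeredocLexer(ExtendedRegexLexer):
        """Hypothetical lexer whose callback advances ctx.pos itself."""
        name = 'Heredoc (illustrative)'

        def heredoc_callback(self, match, ctx):
            # Emit the rest of the line as one token, then set ctx.pos
            # manually (see the CAUTION comment above).
            end = ctx.text.find('\n', match.start())
            if end < 0:
                end = ctx.end
            yield match.start(), String.Heredoc, ctx.text[match.start():end]
            ctx.pos = end

        tokens = {
            'root': [
                (r'<<<', heredoc_callback),
                (r'.+\n?', Text),
            ],
        }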
756
757
758 def do_insertions(insertions, tokens):
759 """
760 Helper for lexers which must combine the results of several
761 sublexers.
762
763 ``insertions`` is a list of ``(index, itokens)`` pairs.
764 Each ``itokens`` iterable should be inserted at position
765 ``index`` into the token stream given by the ``tokens``
766 argument.
767
768 The result is a combined token stream.
769
770 TODO: clean up the code here.
771 """
772 insertions = iter(insertions)
773 try:
774 index, itokens = next(insertions)
775 except StopIteration:
776 # no insertions
777 for item in tokens:
778 yield item
779 return
780
781 realpos = None
782 insleft = True
783
784 # iterate over the token stream where we want to insert
785 # the tokens from the insertion list.
786 for i, t, v in tokens:
787 # first iteration: store the position of the first item
788 if realpos is None:
789 realpos = i
790 oldi = 0
791 while insleft and i + len(v) >= index:
792 tmpval = v[oldi:index - i]
793 yield realpos, t, tmpval
794 realpos += len(tmpval)
795 for it_index, it_token, it_value in itokens:
796 yield realpos, it_token, it_value
797 realpos += len(it_value)
798 oldi = index - i
799 try:
800 index, itokens = next(insertions)
801 except StopIteration:
802 insleft = False
803 break # not strictly necessary
804 yield realpos, t, v[oldi:]
805 realpos += len(v) - oldi
806
807 # leftover tokens
808 while insleft:
809 # no normal tokens, set realpos to zero
810 realpos = realpos or 0
811 for p, t, v in itokens:
812 yield realpos, t, v
813 realpos += len(v)
814 try:
815 index, itokens = next(insertions)
816 except StopIteration:
817 insleft = False
818 break # not strictly necessary
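A sketch of the intended use, modeled on how console lexers interleave prompt
tokens with code handed to a sub-lexer; PythonLexer is the stock lexer, the
class itself is illustrative:

    from pygments.lexer import Lexer, do_insertions
    from pygments.lexers import PythonLexer
    from pygments.token import Generic

    class PyConsoleIsh(Lexer):
        """Hypothetical console lexer: '>>> ' prompts plus Python code."""
        name = 'PyConsole-ish (illustrative)'

        def get_tokens_unprocessed(self, text):
            pylexer = PythonLexer(**self.options)
            insertions = []
            code = ''
            for line in text.splitlines(True):
                if line.startswith('>>> '):
                    # Remember where (in ``code``) the prompt token belongs.
                    insertions.append((len(code),
                                       [(0, Generic.Prompt, line[:4])]))
                    code += line[4:]
                else:
                    code += line
            for item in do_insertions(insertions,
                                      pylexer.get_tokens_unprocessed(code)):
                yield item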
819
820
821 class ProfilingRegexLexerMeta(RegexLexerMeta):
822 """Metaclass for ProfilingRegexLexer, collects regex timing info."""
823
824 def _process_regex(cls, regex, rflags, state):
825 if isinstance(regex, words):
826 rex = regex_opt(regex.words, prefix=regex.prefix,
827 suffix=regex.suffix)
828 else:
829 rex = regex
830 compiled = re.compile(rex, rflags)
831
832 def match_func(text, pos, endpos=sys.maxsize):
833 info = cls._prof_data[-1].setdefault((state, rex), [0, 0.0])
834 t0 = time.time()
835 res = compiled.match(text, pos, endpos)
836 t1 = time.time()
837 info[0] += 1
838 info[1] += t1 - t0
839 return res
840 return match_func
841
842
843 @add_metaclass(ProfilingRegexLexerMeta)
844 class ProfilingRegexLexer(RegexLexer):
845 """Drop-in replacement for RegexLexer that does profiling of its regexes."""
846
847 _prof_data = []
848 _prof_sort_index = 4 # defaults to time per call
849
850 def get_tokens_unprocessed(self, text, stack=('root',)):
851 # this needs to be a stack, since using(this) will produce nested calls
852 self.__class__._prof_data.append({})
853 for tok in RegexLexer.get_tokens_unprocessed(self, text, stack):
854 yield tok
855 rawdata = self.__class__._prof_data.pop()
856 data = sorted(((s, repr(r).strip('u\'').replace('\\\\', '\\')[:65],
857 n, 1000 * t, 1000 * t / n)
858 for ((s, r), (n, t)) in rawdata.items()),
859 key=lambda x: x[self._prof_sort_index],
860 reverse=True)
861 sum_total = sum(x[3] for x in data)
862
863 print()
864 print('Profiling result for %s lexing %d chars in %.3f ms' %
865 (self.__class__.__name__, len(text), sum_total))
866 print('=' * 110)
867 print('%-20s %-64s ncalls tottime percall' % ('state', 'regex'))
868 print('-' * 110)
869 for d in data:
870 print('%-20s %-65s %5d %8.4f %8.4f' % d)
871 print('=' * 110)
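One way the profiler might be used, assuming the stock PythonLexer; the mixin
class is illustrative and relies on ProfilingRegexLexerMeta being a subclass
of RegexLexerMeta:

    from pygments.lexer import ProfilingRegexLexer
    from pygments.lexers import PythonLexer

    class ProfilingPythonLexer(ProfilingRegexLexer, PythonLexer):
        """Hypothetical profiling variant: token definitions come from
        PythonLexer, match functions are wrapped with timing."""
        pass

    # Exhausting the generator prints the profiling table produced above.
    list(ProfilingPythonLexer().get_tokens("for i in range(3):\n    print(i)\n"))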
