eric6/ThirdParty/Pygments/pygments/lexer.py

changeset 8258:82b608e352ec
parent 8257:28146736bbfc
child 8259:2bbec88047dd
1 # -*- coding: utf-8 -*-
2 """
3 pygments.lexer
4 ~~~~~~~~~~~~~~
5
6 Base lexer classes.
7
8 :copyright: Copyright 2006-2021 by the Pygments team, see AUTHORS.
9 :license: BSD, see LICENSE for details.
10 """
11
12 import re
13 import sys
14 import time
15
16 from pygments.filter import apply_filters, Filter
17 from pygments.filters import get_filter_by_name
18 from pygments.token import Error, Text, Other, _TokenType
19 from pygments.util import get_bool_opt, get_int_opt, get_list_opt, \
20 make_analysator, Future, guess_decode
21 from pygments.regexopt import regex_opt
22
23 __all__ = ['Lexer', 'RegexLexer', 'ExtendedRegexLexer', 'DelegatingLexer',
24 'LexerContext', 'include', 'inherit', 'bygroups', 'using', 'this',
25 'default', 'words']
26
27
28 _encoding_map = [(b'\xef\xbb\xbf', 'utf-8'),
29 (b'\xff\xfe\0\0', 'utf-32'),
30 (b'\0\0\xfe\xff', 'utf-32be'),
31 (b'\xff\xfe', 'utf-16'),
32 (b'\xfe\xff', 'utf-16be')]
33
34 _default_analyse = staticmethod(lambda x: 0.0)
35
36
37 class LexerMeta(type):
38 """
39 This metaclass automagically converts ``analyse_text`` methods into
40 static methods which always return float values.
41 """
42
43 def __new__(mcs, name, bases, d):
44 if 'analyse_text' in d:
45 d['analyse_text'] = make_analysator(d['analyse_text'])
46 return type.__new__(mcs, name, bases, d)
47
48
49 class Lexer(metaclass=LexerMeta):
50 """
51 Lexer for a specific language.
52
53 Basic options recognized:
54 ``stripnl``
55 Strip leading and trailing newlines from the input (default: True).
56 ``stripall``
57 Strip all leading and trailing whitespace from the input
58 (default: False).
59 ``ensurenl``
60 Make sure that the input ends with a newline (default: True). This
61 is required for some lexers that consume input linewise.
62
63 .. versionadded:: 1.3
64
65 ``tabsize``
66 If given and greater than 0, expand tabs in the input (default: 0).
67 ``encoding``
68 If given, must be an encoding name. This encoding will be used to
69 convert the input string to Unicode, if it is not already a Unicode
70 string (default: ``'guess'``, which uses a simple UTF-8 / Locale /
71 Latin1 detection). Can also be ``'chardet'`` to use the chardet
72 library, if it is installed.
73 ``inencoding``
74 Overrides the ``encoding`` if given.
75 """
76
77 #: Name of the lexer
78 name = None
79
80 #: Shortcuts for the lexer
81 aliases = []
82
83 #: File name globs
84 filenames = []
85
86 #: Secondary file name globs
87 alias_filenames = []
88
89 #: MIME types
90 mimetypes = []
91
92 #: Priority, should multiple lexers match and no content is provided
93 priority = 0
94
95 def __init__(self, **options):
96 self.options = options
97 self.stripnl = get_bool_opt(options, 'stripnl', True)
98 self.stripall = get_bool_opt(options, 'stripall', False)
99 self.ensurenl = get_bool_opt(options, 'ensurenl', True)
100 self.tabsize = get_int_opt(options, 'tabsize', 0)
101 self.encoding = options.get('encoding', 'guess')
102 self.encoding = options.get('inencoding') or self.encoding
103 self.filters = []
104 for filter_ in get_list_opt(options, 'filters', ()):
105 self.add_filter(filter_)
106
107 def __repr__(self):
108 if self.options:
109 return '<pygments.lexers.%s with %r>' % (self.__class__.__name__,
110 self.options)
111 else:
112 return '<pygments.lexers.%s>' % self.__class__.__name__
113
114 def add_filter(self, filter_, **options):
115 """
116 Add a new stream filter to this lexer.
117 """
118 if not isinstance(filter_, Filter):
119 filter_ = get_filter_by_name(filter_, **options)
120 self.filters.append(filter_)
121
122 def analyse_text(text):
123 """
124 Has to return a float between ``0`` and ``1`` that indicates
125 if a lexer wants to highlight this text. Used by ``guess_lexer``.
126 If this method returns ``0`` it won't highlight it in any case; if
127 it returns ``1``, highlighting with this lexer is guaranteed.
128
129 The `LexerMeta` metaclass automatically wraps this function so
130 that it works like a static method (no ``self`` or ``cls``
131 parameter) and the return value is automatically converted to
132 `float`. If the return value is an object that is boolean `False`
133 it's the same as if the return value was ``0.0``.
134 """
135
136 def get_tokens(self, text, unfiltered=False):
137 """
138 Return an iterable of (tokentype, value) pairs generated from
139 `text`. If `unfiltered` is set to `True`, the filtering mechanism
140 is bypassed even if filters are defined.
141
142 Also preprocess the text, i.e. expand tabs and strip it if
143 wanted, and apply registered filters.
144 """
145 if not isinstance(text, str):
146 if self.encoding == 'guess':
147 text, _ = guess_decode(text)
148 elif self.encoding == 'chardet':
149 try:
150 import chardet
151 except ImportError as e:
152 raise ImportError('To enable chardet encoding guessing, '
153 'please install the chardet library '
154 'from http://chardet.feedparser.org/') from e
155 # check for BOM first
156 decoded = None
157 for bom, encoding in _encoding_map:
158 if text.startswith(bom):
159 decoded = text[len(bom):].decode(encoding, 'replace')
160 break
161 # no BOM found, so use chardet
162 if decoded is None:
163 enc = chardet.detect(text[:1024]) # Guess using first 1KB
164 decoded = text.decode(enc.get('encoding') or 'utf-8',
165 'replace')
166 text = decoded
167 else:
168 text = text.decode(self.encoding)
169 if text.startswith('\ufeff'):
170 text = text[len('\ufeff'):]
171 else:
172 if text.startswith('\ufeff'):
173 text = text[len('\ufeff'):]
174
175 # text now *is* a unicode string
176 text = text.replace('\r\n', '\n')
177 text = text.replace('\r', '\n')
178 if self.stripall:
179 text = text.strip()
180 elif self.stripnl:
181 text = text.strip('\n')
182 if self.tabsize > 0:
183 text = text.expandtabs(self.tabsize)
184 if self.ensurenl and not text.endswith('\n'):
185 text += '\n'
186
187 def streamer():
188 for _, t, v in self.get_tokens_unprocessed(text):
189 yield t, v
190 stream = streamer()
191 if not unfiltered:
192 stream = apply_filters(stream, self.filters, self)
193 return stream
194
195 def get_tokens_unprocessed(self, text):
196 """
197 Return an iterable of (index, tokentype, value) tuples where "index"
198 is the starting position of the token within the input text.
199
200 In subclasses, implement this method as a generator to
201 maximize efficiency.
202 """
203 raise NotImplementedError
204
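# Illustrative usage sketch (added for documentation, not part of the
# upstream module): passing the options documented on ``Lexer`` to a
# concrete lexer and consuming the token stream.  The choice of
# PythonLexer and of the option values is an assumption for this example.

def _example_lexer_usage(source):
    # Imported lazily to avoid a circular import with pygments.lexers.
    from pygments.lexers import PythonLexer
    # 'stripall' strips surrounding whitespace, 'tabsize' expands tabs and
    # 'encoding' controls how byte input is decoded (see the docstring above).
    lexer = PythonLexer(stripall=True, tabsize=4, encoding='utf-8')
    return list(lexer.get_tokens(source))   # (tokentype, value) pairs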
205
206 class DelegatingLexer(Lexer):
207 """
208 This lexer takes two lexers as arguments: a root lexer and
209 a language lexer. First everything is scanned using the language
210 lexer, then all ``Other`` tokens are lexed using the root
211 lexer.
212
213 The lexers from the ``template`` lexer package use this base lexer.
214 """
215
216 def __init__(self, _root_lexer, _language_lexer, _needle=Other, **options):
217 self.root_lexer = _root_lexer(**options)
218 self.language_lexer = _language_lexer(**options)
219 self.needle = _needle
220 Lexer.__init__(self, **options)
221
222 def get_tokens_unprocessed(self, text):
223 buffered = ''
224 insertions = []
225 lng_buffer = []
226 for i, t, v in self.language_lexer.get_tokens_unprocessed(text):
227 if t is self.needle:
228 if lng_buffer:
229 insertions.append((len(buffered), lng_buffer))
230 lng_buffer = []
231 buffered += v
232 else:
233 lng_buffer.append((i, t, v))
234 if lng_buffer:
235 insertions.append((len(buffered), lng_buffer))
236 return do_insertions(insertions,
237 self.root_lexer.get_tokens_unprocessed(buffered))
238
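# Illustrative sketch (not part of the upstream module): template lexers
# usually subclass DelegatingLexer and simply pass the root and language
# lexers to the constructor.  The class below mirrors that pattern; its name
# and the chosen lexers are assumptions for demonstration only.

class _ExampleHtmlPhpLexer(DelegatingLexer):
    name = 'ExampleHTML+PHP'

    def __init__(self, **options):
        # Imported lazily to avoid a circular import with pygments.lexers.
        from pygments.lexers import HtmlLexer, PhpLexer
        # Everything is first lexed with PhpLexer; whatever it emits as
        # ``Other`` tokens is then re-lexed with HtmlLexer.
        super().__init__(HtmlLexer, PhpLexer, **options)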
239
240 # ------------------------------------------------------------------------------
241 # RegexLexer and ExtendedRegexLexer
242 #
243
244
245 class include(str): # pylint: disable=invalid-name
246 """
247 Indicates that a state should include rules from another state.
248 """
249 pass
250
251
252 class _inherit:
253 """
254 Indicates that a state should inherit from its superclass.
255 """
256 def __repr__(self):
257 return 'inherit'
258
259 inherit = _inherit() # pylint: disable=invalid-name
260
261
262 class combined(tuple): # pylint: disable=invalid-name
263 """
264 Indicates a state combined from multiple states.
265 """
266
267 def __new__(cls, *args):
268 return tuple.__new__(cls, args)
269
270 def __init__(self, *args):
271 # tuple.__init__ doesn't do anything
272 pass
273
274
275 class _PseudoMatch:
276 """
277 A pseudo match object constructed from a string.
278 """
279
280 def __init__(self, start, text):
281 self._text = text
282 self._start = start
283
284 def start(self, arg=None):
285 return self._start
286
287 def end(self, arg=None):
288 return self._start + len(self._text)
289
290 def group(self, arg=None):
291 if arg:
292 raise IndexError('No such group')
293 return self._text
294
295 def groups(self):
296 return (self._text,)
297
298 def groupdict(self):
299 return {}
300
301
302 def bygroups(*args):
303 """
304 Callback that yields tokens for the individual groups of the match,
305 applying one action per group.
305 """
306 def callback(lexer, match, ctx=None):
307 for i, action in enumerate(args):
308 if action is None:
309 continue
310 elif type(action) is _TokenType:
311 data = match.group(i + 1)
312 if data:
313 yield match.start(i + 1), action, data
314 else:
315 data = match.group(i + 1)
316 if data is not None:
317 if ctx:
318 ctx.pos = match.start(i + 1)
319 for item in action(lexer,
320 _PseudoMatch(match.start(i + 1), data), ctx):
321 if item:
322 yield item
323 if ctx:
324 ctx.pos = match.end()
325 return callback
326
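# Illustrative sketch (not part of the upstream module): a typical rule that
# uses ``bygroups`` to split a single ``key = "value"`` match into one token
# per regex group.  The regex and the token choices are assumptions.

def _example_bygroups_rule():
    from pygments.token import Name, Operator, String
    return (r'(\w+)(\s*)(=)(\s*)("[^"]*")',
            bygroups(Name.Attribute, Text, Operator, Text, String.Double))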
327
328 class _This:
329 """
330 Special singleton used for indicating the caller class.
331 Used by ``using``.
332 """
333
334 this = _This()
335
336
337 def using(_other, **kwargs):
338 """
339 Callback that processes the match with a different lexer.
340
341 The keyword arguments are forwarded to the lexer, except `state` which
342 is handled separately.
343
344 `state` specifies the state that the new lexer will start in; it can
345 be an iterable such as ``('root', 'inline', 'string')`` or a simple
346 string, which is taken as a state pushed on top of the ``'root'`` state.
347
348 Note: For that to work, `_other` must not be an `ExtendedRegexLexer`.
349 """
350 gt_kwargs = {}
351 if 'state' in kwargs:
352 s = kwargs.pop('state')
353 if isinstance(s, (list, tuple)):
354 gt_kwargs['stack'] = s
355 else:
356 gt_kwargs['stack'] = ('root', s)
357
358 if _other is this:
359 def callback(lexer, match, ctx=None):
360 # if keyword arguments are given the callback
361 # function has to create a new lexer instance
362 if kwargs:
363 # XXX: cache that somehow
364 kwargs.update(lexer.options)
365 lx = lexer.__class__(**kwargs)
366 else:
367 lx = lexer
368 s = match.start()
369 for i, t, v in lx.get_tokens_unprocessed(match.group(), **gt_kwargs):
370 yield i + s, t, v
371 if ctx:
372 ctx.pos = match.end()
373 else:
374 def callback(lexer, match, ctx=None):
375 # XXX: cache that somehow
376 kwargs.update(lexer.options)
377 lx = _other(**kwargs)
378
379 s = match.start()
380 for i, t, v in lx.get_tokens_unprocessed(match.group(), **gt_kwargs):
381 yield i + s, t, v
382 if ctx:
383 ctx.pos = match.end()
384 return callback
385
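# Illustrative sketch (not part of the upstream module): ``using`` is often
# combined with ``bygroups`` to hand part of a match over to another lexer.
# The simplified <script> regex and the choice of JavascriptLexer are
# assumptions; real HTML handling is more involved than this.

def _example_using_rule():
    # Imported lazily to avoid a circular import with pygments.lexers.
    from pygments.lexers import JavascriptLexer
    from pygments.token import Punctuation
    return (r'(<script>)((?:.|\n)*?)(</script>)',
            bygroups(Punctuation, using(JavascriptLexer), Punctuation))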
386
387 class default:
388 """
389 Indicates a state or state action (e.g. ``'#pop'``) to apply.
390 For example, ``default('#pop')`` is equivalent to ``('', Token, '#pop')``.
391 Note that state tuples may be used as well.
392
393 .. versionadded:: 2.0
394 """
395 def __init__(self, state):
396 self.state = state
397
398
399 class words(Future):
400 """
401 Indicates a list of literal words that is transformed into an optimized
402 regex that matches any of the words.
403
404 .. versionadded:: 2.0
405 """
406 def __init__(self, words, prefix='', suffix=''):
407 self.words = words
408 self.prefix = prefix
409 self.suffix = suffix
410
411 def get(self):
412 return regex_opt(self.words, prefix=self.prefix, suffix=self.suffix)
413
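# Illustrative sketch (not part of the upstream module): ``words`` turns a
# literal word list into a single optimized regex; the keywords below are
# arbitrary assumptions.

def _example_words_rule():
    from pygments.token import Keyword
    return (words(('if', 'elif', 'else', 'for', 'while', 'return'),
                  prefix=r'\b', suffix=r'\b'), Keyword)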
414
415 class RegexLexerMeta(LexerMeta):
416 """
417 Metaclass for RegexLexer, creates the self._tokens attribute from
418 self.tokens on the first instantiation.
419 """
420
421 def _process_regex(cls, regex, rflags, state):
422 """Preprocess the regular expression component of a token definition."""
423 if isinstance(regex, Future):
424 regex = regex.get()
425 return re.compile(regex, rflags).match
426
427 def _process_token(cls, token):
428 """Preprocess the token component of a token definition."""
429 assert type(token) is _TokenType or callable(token), \
430 'token type must be simple type or callable, not %r' % (token,)
431 return token
432
433 def _process_new_state(cls, new_state, unprocessed, processed):
434 """Preprocess the state transition action of a token definition."""
435 if isinstance(new_state, str):
436 # an existing state
437 if new_state == '#pop':
438 return -1
439 elif new_state in unprocessed:
440 return (new_state,)
441 elif new_state == '#push':
442 return new_state
443 elif new_state[:5] == '#pop:':
444 return -int(new_state[5:])
445 else:
446 assert False, 'unknown new state %r' % new_state
447 elif isinstance(new_state, combined):
448 # combine a new state from existing ones
449 tmp_state = '_tmp_%d' % cls._tmpname
450 cls._tmpname += 1
451 itokens = []
452 for istate in new_state:
453 assert istate != new_state, 'circular state ref %r' % istate
454 itokens.extend(cls._process_state(unprocessed,
455 processed, istate))
456 processed[tmp_state] = itokens
457 return (tmp_state,)
458 elif isinstance(new_state, tuple):
459 # push more than one state
460 for istate in new_state:
461 assert (istate in unprocessed or
462 istate in ('#pop', '#push')), \
463 'unknown new state ' + istate
464 return new_state
465 else:
466 assert False, 'unknown new state def %r' % new_state
467
468 def _process_state(cls, unprocessed, processed, state):
469 """Preprocess a single state definition."""
470 assert type(state) is str, "wrong state name %r" % state
471 assert state[0] != '#', "invalid state name %r" % state
472 if state in processed:
473 return processed[state]
474 tokens = processed[state] = []
475 rflags = cls.flags
476 for tdef in unprocessed[state]:
477 if isinstance(tdef, include):
478 # it's a state reference
479 assert tdef != state, "circular state reference %r" % state
480 tokens.extend(cls._process_state(unprocessed, processed,
481 str(tdef)))
482 continue
483 if isinstance(tdef, _inherit):
484 # should be processed already, but may not be in the case of:
485 # 1. the state has no counterpart in any parent
486 # 2. the state includes more than one 'inherit'
487 continue
488 if isinstance(tdef, default):
489 new_state = cls._process_new_state(tdef.state, unprocessed, processed)
490 tokens.append((re.compile('').match, None, new_state))
491 continue
492
493 assert type(tdef) is tuple, "wrong rule def %r" % tdef
494
495 try:
496 rex = cls._process_regex(tdef[0], rflags, state)
497 except Exception as err:
498 raise ValueError("uncompilable regex %r in state %r of %r: %s" %
499 (tdef[0], state, cls, err)) from err
500
501 token = cls._process_token(tdef[1])
502
503 if len(tdef) == 2:
504 new_state = None
505 else:
506 new_state = cls._process_new_state(tdef[2],
507 unprocessed, processed)
508
509 tokens.append((rex, token, new_state))
510 return tokens
511
512 def process_tokendef(cls, name, tokendefs=None):
513 """Preprocess a dictionary of token definitions."""
514 processed = cls._all_tokens[name] = {}
515 tokendefs = tokendefs or cls.tokens[name]
516 for state in list(tokendefs):
517 cls._process_state(tokendefs, processed, state)
518 return processed
519
520 def get_tokendefs(cls):
521 """
522 Merge tokens from superclasses in MRO order, returning a single tokendef
523 dictionary.
524
525 Any state that is not defined by a subclass will be inherited
526 automatically. States that *are* defined by subclasses will, by
527 default, override that state in the superclass. If a subclass wishes to
528 inherit definitions from a superclass, it can use the special value
529 "inherit", which will cause the superclass' state definition to be
530 included at that point in the state.
531 """
532 tokens = {}
533 inheritable = {}
534 for c in cls.__mro__:
535 toks = c.__dict__.get('tokens', {})
536
537 for state, items in toks.items():
538 curitems = tokens.get(state)
539 if curitems is None:
540 # N.b. because this is assigned by reference, sufficiently
541 # deep hierarchies are processed incrementally (e.g. for
542 # A(B), B(C), C(RegexLexer), B will be premodified so X(B)
543 # will not see any inherits in B).
544 tokens[state] = items
545 try:
546 inherit_ndx = items.index(inherit)
547 except ValueError:
548 continue
549 inheritable[state] = inherit_ndx
550 continue
551
552 inherit_ndx = inheritable.pop(state, None)
553 if inherit_ndx is None:
554 continue
555
556 # Replace the "inherit" value with the items
557 curitems[inherit_ndx:inherit_ndx+1] = items
558 try:
559 # N.b. this is the index in items (that is, the superclass
560 # copy), so offset required when storing below.
561 new_inh_ndx = items.index(inherit)
562 except ValueError:
563 pass
564 else:
565 inheritable[state] = inherit_ndx + new_inh_ndx
566
567 return tokens
568
569 def __call__(cls, *args, **kwds):
570 """Instantiate cls after preprocessing its token definitions."""
571 if '_tokens' not in cls.__dict__:
572 cls._all_tokens = {}
573 cls._tmpname = 0
574 if hasattr(cls, 'token_variants') and cls.token_variants:
575 # don't process yet
576 pass
577 else:
578 cls._tokens = cls.process_tokendef('', cls.get_tokendefs())
579
580 return type.__call__(cls, *args, **kwds)
581
582
583 class RegexLexer(Lexer, metaclass=RegexLexerMeta):
584 """
585 Base for simple stateful regular expression-based lexers.
586 Simplifies the lexing process so that you need only
587 provide a list of states and regular expressions.
588 """
589
590 #: Flags for compiling the regular expressions.
591 #: Defaults to MULTILINE.
592 flags = re.MULTILINE
593
594 #: Dict of ``{'state': [(regex, tokentype, new_state), ...], ...}``
595 #:
596 #: The initial state is 'root'.
597 #: ``new_state`` can be omitted to signify no state transition.
598 #: If it is a string, the state is pushed on the stack and changed.
599 #: If it is a tuple of strings, all states are pushed on the stack and
600 #: the current state will be the topmost.
601 #: It can also be ``combined('state1', 'state2', ...)``
602 #: to signify a new, anonymous state combined from the rules of two
603 #: or more existing ones.
604 #: Furthermore, it can be '#pop' to signify going back one step in
605 #: the state stack, or '#push' to push the current state on the stack
606 #: again.
607 #:
608 #: The tuple can also be replaced with ``include('state')``, in which
609 #: case the rules from the state named by the string are included in the
610 #: current one.
611 tokens = {}
612
613 def get_tokens_unprocessed(self, text, stack=('root',)):
614 """
615 Split ``text`` into ``(index, tokentype, value)`` tuples.
616
617 ``stack`` is the initial stack (default: ``['root']``)
618 """
619 pos = 0
620 tokendefs = self._tokens
621 statestack = list(stack)
622 statetokens = tokendefs[statestack[-1]]
623 while 1:
624 for rexmatch, action, new_state in statetokens:
625 m = rexmatch(text, pos)
626 if m:
627 if action is not None:
628 if type(action) is _TokenType:
629 yield pos, action, m.group()
630 else:
631 yield from action(self, m)
632 pos = m.end()
633 if new_state is not None:
634 # state transition
635 if isinstance(new_state, tuple):
636 for state in new_state:
637 if state == '#pop':
638 if len(statestack) > 1:
639 statestack.pop()
640 elif state == '#push':
641 statestack.append(statestack[-1])
642 else:
643 statestack.append(state)
644 elif isinstance(new_state, int):
645 # pop, but keep at least one state on the stack
646 # (random code leading to unexpected pops should
647 # not allow exceptions)
648 if abs(new_state) >= len(statestack):
649 del statestack[1:]
650 else:
651 del statestack[new_state:]
652 elif new_state == '#push':
653 statestack.append(statestack[-1])
654 else:
655 assert False, "wrong state def: %r" % new_state
656 statetokens = tokendefs[statestack[-1]]
657 break
658 else:
659 # We are here only if all state tokens have been considered
660 # and there was not a match on any of them.
661 try:
662 if text[pos] == '\n':
663 # at EOL, reset state to "root"
664 statestack = ['root']
665 statetokens = tokendefs['root']
666 yield pos, Text, '\n'
667 pos += 1
668 continue
669 yield pos, Error, text[pos]
670 pos += 1
671 except IndexError:
672 break
673
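# Illustrative sketch (added for documentation, not part of the upstream
# module): a minimal RegexLexer subclass showing the ``tokens`` format
# described above -- multiple states, ``include``, ``bygroups``, ``default``
# and ``'#pop'`` -- plus an ``analyse_text`` heuristic.  The language name,
# the rules and the score are all assumptions made up for this example.

from pygments.token import Comment, Keyword, Name, Operator, String


class _ExampleConfLexer(RegexLexer):
    name = 'ExampleConf'
    aliases = ['exampleconf']
    filenames = ['*.exampleconf']

    tokens = {
        'whitespace': [
            (r'\s+', Text),
        ],
        'root': [
            include('whitespace'),             # pull in the rules above
            (r'#[^\n]*', Comment.Single),      # line comments
            (r'\[\w+\]', Keyword.Namespace),   # [section] headers
            (r'(\w+)(\s*)(=)', bygroups(Name.Attribute, Text, Operator),
             'value'),                         # push the 'value' state
        ],
        'value': [
            (r'[ \t]+', Text),                    # space after the '='
            (r'"[^"]*"', String.Double, '#pop'),  # quoted value, back to root
            (r'[^\s#]+', String, '#pop'),         # bare value, back to root
            default('#pop'),                      # empty value: just pop
        ],
    }

    def analyse_text(text):
        # Return a score between 0.0 and 1.0 as described in
        # Lexer.analyse_text; this particular heuristic is a toy assumption.
        return 0.3 if text.lstrip().startswith('[') else 0.0


# A dialect lexer can reuse the parent's state with ``inherit``; the extra
# keyword rule is again only an illustration.
class _ExampleConfDialectLexer(_ExampleConfLexer):
    name = 'ExampleConfDialect'
    tokens = {
        'root': [
            (r'@include\b', Keyword),  # rule specific to the dialect
            inherit,                   # then all of _ExampleConfLexer's rules
        ],
    }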
674
675 class LexerContext:
676 """
677 A helper object that holds lexer position data.
678 """
679
680 def __init__(self, text, pos, stack=None, end=None):
681 self.text = text
682 self.pos = pos
683 self.end = end or len(text) # end=0 not supported ;-)
684 self.stack = stack or ['root']
685
686 def __repr__(self):
687 return 'LexerContext(%r, %r, %r)' % (
688 self.text, self.pos, self.stack)
689
690
691 class ExtendedRegexLexer(RegexLexer):
692 """
693 A RegexLexer that uses a context object to store its state.
694 """
695
696 def get_tokens_unprocessed(self, text=None, context=None):
697 """
698 Split ``text`` into ``(index, tokentype, value)`` tuples.
699 If ``context`` is given, use this lexer context instead.
700 """
701 tokendefs = self._tokens
702 if not context:
703 ctx = LexerContext(text, 0)
704 statetokens = tokendefs['root']
705 else:
706 ctx = context
707 statetokens = tokendefs[ctx.stack[-1]]
708 text = ctx.text
709 while 1:
710 for rexmatch, action, new_state in statetokens:
711 m = rexmatch(text, ctx.pos, ctx.end)
712 if m:
713 if action is not None:
714 if type(action) is _TokenType:
715 yield ctx.pos, action, m.group()
716 ctx.pos = m.end()
717 else:
718 yield from action(self, m, ctx)
719 if not new_state:
720 # altered the state stack?
721 statetokens = tokendefs[ctx.stack[-1]]
722 # CAUTION: callback must set ctx.pos!
723 if new_state is not None:
724 # state transition
725 if isinstance(new_state, tuple):
726 for state in new_state:
727 if state == '#pop':
728 if len(ctx.stack) > 1:
729 ctx.stack.pop()
730 elif state == '#push':
731 ctx.stack.append(ctx.stack[-1])
732 else:
733 ctx.stack.append(state)
734 elif isinstance(new_state, int):
735 # see RegexLexer for why this check is made
736 if abs(new_state) >= len(ctx.stack):
737 del ctx.stack[1:]
738 else:
739 del ctx.stack[new_state:]
740 elif new_state == '#push':
741 ctx.stack.append(ctx.stack[-1])
742 else:
743 assert False, "wrong state def: %r" % new_state
744 statetokens = tokendefs[ctx.stack[-1]]
745 break
746 else:
747 try:
748 if ctx.pos >= ctx.end:
749 break
750 if text[ctx.pos] == '\n':
751 # at EOL, reset state to "root"
752 ctx.stack = ['root']
753 statetokens = tokendefs['root']
754 yield ctx.pos, Text, '\n'
755 ctx.pos += 1
756 continue
757 yield ctx.pos, Error, text[ctx.pos]
758 ctx.pos += 1
759 except IndexError:
760 break
761
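# Illustrative sketch (not part of the upstream module): an
# ExtendedRegexLexer keeps its position and state stack in a LexerContext,
# which callers may create and inspect themselves.  The one-rule lexer
# below is an assumption made only for this example.

def _example_context_lexing(text):
    class _WordLexer(ExtendedRegexLexer):
        name = 'WordExample'
        tokens = {
            'root': [
                (r'\w+', Text),
                (r'\W+', Text),
            ],
        }

    ctx = LexerContext(text, 0)   # start at position 0 in state 'root'
    tokens = list(_WordLexer().get_tokens_unprocessed(context=ctx))
    # ctx.pos has advanced to ctx.end and ctx.stack holds the final stack.
    return tokens, ctx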
762
763 def do_insertions(insertions, tokens):
764 """
765 Helper for lexers which must combine the results of several
766 sublexers.
767
768 ``insertions`` is a list of ``(index, itokens)`` pairs.
769 Each ``itokens`` iterable should be inserted at position
770 ``index`` into the token stream given by the ``tokens``
771 argument.
772
773 The result is a combined token stream.
774
775 TODO: clean up the code here.
776 """
777 insertions = iter(insertions)
778 try:
779 index, itokens = next(insertions)
780 except StopIteration:
781 # no insertions
782 yield from tokens
783 return
784
785 realpos = None
786 insleft = True
787
788 # iterate over the token stream where we want to insert
789 # the tokens from the insertion list.
790 for i, t, v in tokens:
791 # first iteration: store the position of the first item
792 if realpos is None:
793 realpos = i
794 oldi = 0
795 while insleft and i + len(v) >= index:
796 tmpval = v[oldi:index - i]
797 if tmpval:
798 yield realpos, t, tmpval
799 realpos += len(tmpval)
800 for it_index, it_token, it_value in itokens:
801 yield realpos, it_token, it_value
802 realpos += len(it_value)
803 oldi = index - i
804 try:
805 index, itokens = next(insertions)
806 except StopIteration:
807 insleft = False
808 break # not strictly necessary
809 if oldi < len(v):
810 yield realpos, t, v[oldi:]
811 realpos += len(v) - oldi
812
813 # leftover tokens
814 while insleft:
815 # no normal tokens, set realpos to zero
816 realpos = realpos or 0
817 for p, t, v in itokens:
818 yield realpos, t, v
819 realpos += len(v)
820 try:
821 index, itokens = next(insertions)
822 except StopIteration:
823 insleft = False
824 break # not strictly necessary
825
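# Illustrative sketch (not part of the upstream module): console/session
# style lexers use do_insertions() to splice their own tokens (for example
# a prompt) into the stream produced by a sublexer.  The prompt text and
# the use of PythonLexer are assumptions for this example.

def _example_do_insertions(code):
    # Imported lazily to avoid a circular import with pygments.lexers.
    from pygments.lexers import PythonLexer
    from pygments.token import Generic
    prompt_tokens = [(0, Generic.Prompt, '>>> ')]
    # Insert the prompt tokens at position 0 of the lexed Python code.
    return list(do_insertions([(0, prompt_tokens)],
                              PythonLexer().get_tokens_unprocessed(code)))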
826
827 class ProfilingRegexLexerMeta(RegexLexerMeta):
828 """Metaclass for ProfilingRegexLexer, collects regex timing info."""
829
830 def _process_regex(cls, regex, rflags, state):
831 if isinstance(regex, words):
832 rex = regex_opt(regex.words, prefix=regex.prefix,
833 suffix=regex.suffix)
834 else:
835 rex = regex
836 compiled = re.compile(rex, rflags)
837
838 def match_func(text, pos, endpos=sys.maxsize):
839 info = cls._prof_data[-1].setdefault((state, rex), [0, 0.0])
840 t0 = time.time()
841 res = compiled.match(text, pos, endpos)
842 t1 = time.time()
843 info[0] += 1
844 info[1] += t1 - t0
845 return res
846 return match_func
847
848
849 class ProfilingRegexLexer(RegexLexer, metaclass=ProfilingRegexLexerMeta):
850 """Drop-in replacement for RegexLexer that does profiling of its regexes."""
851
852 _prof_data = []
853 _prof_sort_index = 4 # defaults to time per call
854
855 def get_tokens_unprocessed(self, text, stack=('root',)):
856 # this needs to be a stack, since using(this) will produce nested calls
857 self.__class__._prof_data.append({})
858 yield from RegexLexer.get_tokens_unprocessed(self, text, stack)
859 rawdata = self.__class__._prof_data.pop()
860 data = sorted(((s, repr(r).strip('u\'').replace('\\\\', '\\')[:65],
861 n, 1000 * t, 1000 * t / n)
862 for ((s, r), (n, t)) in rawdata.items()),
863 key=lambda x: x[self._prof_sort_index],
864 reverse=True)
865 sum_total = sum(x[3] for x in data)
866
867 print()
868 print('Profiling result for %s lexing %d chars in %.3f ms' %
869 (self.__class__.__name__, len(text), sum_total))
870 print('=' * 110)
871 print('%-20s %-64s ncalls tottime percall' % ('state', 'regex'))
872 print('-' * 110)
873 for d in data:
874 print('%-20s %-65s %5d %8.4f %8.4f' % d)
875 print('=' * 110)
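
# Illustrative sketch (not part of the upstream module): a tiny
# ProfilingRegexLexer subclass; consuming its token stream prints the
# timing table produced above.  The lexer name and rules are assumptions.

class _ProfilingExampleLexer(ProfilingRegexLexer):
    name = 'ProfilingExample'
    tokens = {
        'root': [
            (r'\s+', Text),
            (r'\w+', Text),
            (r'.', Text),
        ],
    }

# For example:
#     list(_ProfilingExampleLexer().get_tokens_unprocessed('a b c'))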
