@@ -3 +3 @@
     pygments.lexer
     ~~~~~~~~~~~~~~
 
     Base lexer classes.
 
-    :copyright: Copyright 2006-2013 by the Pygments team, see AUTHORS.
+    :copyright: Copyright 2006-2014 by the Pygments team, see AUTHORS.
     :license: BSD, see LICENSE for details.
 """
-try:
-    str = unicode
-except NameError:
-    basestring = str
+
+from __future__ import print_function
 
-import re, itertools
+import re
+import sys
+import time
+import itertools
 
 from pygments.filter import apply_filters, Filter
 from pygments.filters import get_filter_by_name
 from pygments.token import Error, Text, Other, _TokenType
 from pygments.util import get_bool_opt, get_int_opt, get_list_opt, \
-    make_analysator
-import collections
-
+    make_analysator, text_type, add_metaclass, iteritems, Future, guess_decode
+from pygments.regexopt import regex_opt
 
 __all__ = ['Lexer', 'RegexLexer', 'ExtendedRegexLexer', 'DelegatingLexer',
-           'LexerContext', 'include', 'inherit', 'bygroups', 'using', 'this']
+           'LexerContext', 'include', 'inherit', 'bygroups', 'using', 'this',
+           'default', 'words']
 
 
-_encoding_map = [('\xef\xbb\xbf', 'utf-8'),
-                 ('\xff\xfe\0\0', 'utf-32'),
-                 ('\0\0\xfe\xff', 'utf-32be'),
-                 ('\xff\xfe', 'utf-16'),
-                 ('\xfe\xff', 'utf-16be')]
+_encoding_map = [(b'\xef\xbb\xbf', 'utf-8'),
+                 (b'\xff\xfe\0\0', 'utf-32'),
+                 (b'\0\0\xfe\xff', 'utf-32be'),
+                 (b'\xff\xfe', 'utf-16'),
+                 (b'\xfe\xff', 'utf-16be')]
 
 _default_analyse = staticmethod(lambda x: 0.0)
 
 
-def with_metaclass(meta, base=object):
-    """
-    Python independent version to create a base class with a metaclass.
-    Taken from six 1.3.0 (http://pythonhosted.org/six)
-    """
-    return meta("NewBase", (base,), {})
-
-
 class LexerMeta(type):
     """
     This metaclass automagically converts ``analyse_text`` methods into
@@ -54 +47 @@
         if 'analyse_text' in d:
             d['analyse_text'] = make_analysator(d['analyse_text'])
         return type.__new__(cls, name, bases, d)
 
 
-class Lexer(with_metaclass(LexerMeta, object)):
+@add_metaclass(LexerMeta)
+class Lexer(object):
     """
     Lexer for a specific language.
 
     Basic options recognized:
     ``stripnl``
@@ -67 +61 @@
         Strip all leading and trailing whitespace from the input
         (default: False).
     ``ensurenl``
         Make sure that the input ends with a newline (default: True). This
         is required for some lexers that consume input linewise.
-        *New in Pygments 1.3.*
+
+        .. versionadded:: 1.3
+
     ``tabsize``
         If given and greater than 0, expand tabs in the input (default: 0).
     ``encoding``
         If given, must be an encoding name. This encoding will be used to
         convert the input string to Unicode, if it is not already a Unicode
-        string (default: ``'latin1'``).
-        Can also be ``'guess'`` to use a simple UTF-8 / Latin1 detection, or
-        ``'chardet'`` to use the chardet library, if it is installed.
+        string (default: ``'guess'``, which uses a simple UTF-8 / Locale /
+        Latin1 detection. Can also be ``'chardet'`` to use the chardet
+        library, if it is installed.
+    ``inencoding``
+        Overrides the ``encoding`` if given.
     """
 
     #: Name of the lexer
     name = None
 
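The options documented above are plain keyword arguments to the lexer constructor. A minimal usage sketch, assuming Pygments is installed and borrowing PythonLexer purely for illustration:

    from pygments.lexers import PythonLexer

    # stripall, tabsize and encoding are the options documented in the docstring
    lexer = PythonLexer(stripall=True, tabsize=4, encoding='utf-8')
    for tokentype, value in lexer.get_tokens(b"if x:\n\tprint(x)\n"):
        print(tokentype, repr(value))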
@@ -102 +100 @@
         self.options = options
         self.stripnl = get_bool_opt(options, 'stripnl', True)
         self.stripall = get_bool_opt(options, 'stripall', False)
         self.ensurenl = get_bool_opt(options, 'ensurenl', True)
         self.tabsize = get_int_opt(options, 'tabsize', 0)
-        self.encoding = options.get('encoding', 'latin1')
-        # self.encoding = options.get('inencoding', None) or self.encoding
+        self.encoding = options.get('encoding', 'guess')
+        self.encoding = options.get('inencoding') or self.encoding
         self.filters = []
         for filter_ in get_list_opt(options, 'filters', ()):
             self.add_filter(filter_)
 
     def __repr__(self):
@@ -146 +144 @@
         is bypassed even if filters are defined.
 
         Also preprocess the text, i.e. expand tabs and strip it if
         wanted and applies registered filters.
         """
-        if not isinstance(text, str):
+        if not isinstance(text, text_type):
             if self.encoding == 'guess':
-                try:
-                    text = text.decode('utf-8')
-                    if text.startswith('\ufeff'):
-                        text = text[len('\ufeff'):]
-                except UnicodeDecodeError:
-                    text = text.decode('latin1')
+                text, _ = guess_decode(text)
             elif self.encoding == 'chardet':
                 try:
                     import chardet
                 except ImportError:
                     raise ImportError('To enable chardet encoding guessing, '
@@ -165 +158 @@
                                       'from http://chardet.feedparser.org/')
                 # check for BOM first
                 decoded = None
                 for bom, encoding in _encoding_map:
                     if text.startswith(bom):
-                        decoded = str(text[len(bom):], encoding,
-                                      errors='replace')
+                        decoded = text[len(bom):].decode(encoding, 'replace')
                         break
                 # no BOM found, so use chardet
                 if decoded is None:
                     enc = chardet.detect(text[:1024])  # Guess using first 1KB
-                    decoded = str(text, enc.get('encoding') or 'utf-8',
-                                  errors='replace')
+                    decoded = text.decode(enc.get('encoding') or 'utf-8',
+                                          'replace')
                 text = decoded
             else:
                 text = text.decode(self.encoding)
+                if text.startswith(u'\ufeff'):
+                    text = text[len(u'\ufeff'):]
         else:
-            if text.startswith('\ufeff'):
-                text = text[len('\ufeff'):]
+            if text.startswith(u'\ufeff'):
+                text = text[len(u'\ufeff'):]
 
         # text now *is* a unicode string
         text = text.replace('\r\n', '\n')
         text = text.replace('\r', '\n')
         if self.stripall:
@@ -202 +196 @@
             stream = apply_filters(stream, self.filters, self)
         return stream
 
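Filters given via the filters option (or added later with add_filter) are what apply_filters runs over the token stream at the end of get_tokens above. A small sketch, assuming the built-in keywordcase filter name and, again, PythonLexer as a stand-in lexer:

    from pygments.lexers import PythonLexer

    lexer = PythonLexer()
    # filter names are resolved through get_filter_by_name
    lexer.add_filter('keywordcase', case='upper')
    tokens = list(lexer.get_tokens("def f(): return None\n"))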
     def get_tokens_unprocessed(self, text):
         """
-        Return an iterable of (tokentype, value) pairs.
+        Return an iterable of (index, tokentype, value) pairs where "index"
+        is the starting position of the token within the input text.
+
         In subclasses, implement this method as a generator to
         maximize effectiveness.
         """
         raise NotImplementedError
 
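The extra index makes it possible to map every token back to its offset in the input string. A short sketch of consuming the unprocessed stream (the lexer choice is again only an illustration):

    from pygments.lexers import PythonLexer

    code = "x = 1\ny = 2\n"
    for index, tokentype, value in PythonLexer().get_tokens_unprocessed(code):
        print(index, tokentype, repr(value))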
@@ -389 +385 @@
             if ctx:
                 ctx.pos = match.end()
     return callback
 
 
+class default:
+    """
+    Indicates a state or state action (e.g. #pop) to apply.
+    For example default('#pop') is equivalent to ('', Token, '#pop')
+    Note that state tuples may be used as well.
+
+    .. versionadded:: 2.0
+    """
+    def __init__(self, state):
+        self.state = state
+
+
+class words(Future):
+    """
+    Indicates a list of literal words that is transformed into an optimized
+    regex that matches any of the words.
+
+    .. versionadded:: 2.0
+    """
+    def __init__(self, words, prefix='', suffix=''):
+        self.words = words
+        self.prefix = prefix
+        self.suffix = suffix
+
+    def get(self):
+        return regex_opt(self.words, prefix=self.prefix, suffix=self.suffix)
+
+
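Both new helpers are meant to be used inside a RegexLexer token table. A hedged sketch of what that looks like (the MiniLexer below is invented for illustration, not part of Pygments):

    from pygments.lexer import RegexLexer, words, default
    from pygments.token import Keyword, String, Text

    class MiniLexer(RegexLexer):
        name = 'Mini'
        tokens = {
            'root': [
                # words() expands to one optimized regex via regex_opt()
                (words(('if', 'else', 'while'), suffix=r'\b'), Keyword),
                (r'"', String, 'string'),
                (r'\s+', Text),
            ],
            'string': [
                (r'[^"\n]+', String),
                (r'"', String, '#pop'),
                # matches nothing and pops, like ('', Token, '#pop')
                default('#pop'),
            ],
        }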
 class RegexLexerMeta(LexerMeta):
     """
     Metaclass for RegexLexer, creates the self._tokens attribute from
     self.tokens on the first instantiation.
     """
 
-    def _process_regex(cls, regex, rflags):
+    def _process_regex(cls, regex, rflags, state):
         """Preprocess the regular expression component of a token definition."""
+        if isinstance(regex, Future):
+            regex = regex.get()
         return re.compile(regex, rflags).match
 
     def _process_token(cls, token):
         """Preprocess the token component of a token definition."""
-        assert type(token) is _TokenType or isinstance(token, collections.Callable), \
+        assert type(token) is _TokenType or callable(token), \
             'token type must be simple type or callable, not %r' % (token,)
         return token
 
     def _process_new_state(cls, new_state, unprocessed, processed):
         """Preprocess the state transition action of a token definition."""
-        if isinstance(new_state, basestring):
+        if isinstance(new_state, str):
             # an existing state
             if new_state == '#pop':
                 return -1
             elif new_state in unprocessed:
                 return (new_state,)
@@ -435 +461 @@
         elif isinstance(new_state, tuple):
             # push more than one state
             for istate in new_state:
                 assert (istate in unprocessed or
                         istate in ('#pop', '#push')), \
                     'unknown new state ' + istate
             return new_state
         else:
             assert False, 'unknown new state def %r' % new_state
 
     def _process_state(cls, unprocessed, processed, state):
         """Preprocess a single state definition."""
-        assert isinstance(state, basestring), "wrong state name %r" % state
+        assert type(state) is str, "wrong state name %r" % state
         assert state[0] != '#', "invalid state name %r" % state
         if state in processed:
             return processed[state]
         tokens = processed[state] = []
         rflags = cls.flags
@@ -456 +482 @@
                 assert tdef != state, "circular state reference %r" % state
                 tokens.extend(cls._process_state(unprocessed, processed,
                                                  str(tdef)))
                 continue
             if isinstance(tdef, _inherit):
-                # processed already
+                # should be processed already, but may not in the case of:
+                # 1. the state has no counterpart in any parent
+                # 2. the state includes more than one 'inherit'
                 continue
+            if isinstance(tdef, default):
+                new_state = cls._process_new_state(tdef.state, unprocessed, processed)
+                tokens.append((re.compile('').match, None, new_state))
+                continue
 
             assert type(tdef) is tuple, "wrong rule def %r" % tdef
 
             try:
-                rex = cls._process_regex(tdef[0], rflags)
+                rex = cls._process_regex(tdef[0], rflags, state)
             except Exception as err:
                 raise ValueError("uncompilable regex %r in state %r of %r: %s" %
                                  (tdef[0], state, cls, err))
 
             token = cls._process_token(tdef[1])
@@ -482 +514 @@
 
     def process_tokendef(cls, name, tokendefs=None):
         """Preprocess a dictionary of token definitions."""
         processed = cls._all_tokens[name] = {}
         tokendefs = tokendefs or cls.tokens[name]
-        for state in list(tokendefs.keys()):
+        for state in list(tokendefs):
             cls._process_state(tokendefs, processed, state)
         return processed
 
     def get_tokendefs(cls):
         """
@@ -500 +532 @@
         "inherit", which will cause the superclass' state definition to be
         included at that point in the state.
         """
         tokens = {}
         inheritable = {}
-        for c in itertools.chain((cls,), cls.__mro__):
+        for c in cls.__mro__:
             toks = c.__dict__.get('tokens', {})
 
-            for state, items in toks.items():
+            for state, items in iteritems(toks):
                 curitems = tokens.get(state)
                 if curitems is None:
+                    # N.b. because this is assigned by reference, sufficiently
+                    # deep hierarchies are processed incrementally (e.g. for
+                    # A(B), B(C), C(RegexLexer), B will be premodified so X(B)
+                    # will not see any inherits in B).
                     tokens[state] = items
                     try:
                         inherit_ndx = items.index(inherit)
                     except ValueError:
                         continue
@@ -543 +581 @@
             cls._tokens = cls.process_tokendef('', cls.get_tokendefs())
 
         return type.__call__(cls, *args, **kwds)
 
 
-class RegexLexer(with_metaclass(RegexLexerMeta, Lexer)):
+@add_metaclass(RegexLexerMeta)
+class RegexLexer(Lexer):
     """
     Base for simple stateful regular expression-based lexers.
     Simplifies the lexing process so that you need only
     provide a list of states and regular expressions.
     """
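What such a table of states and regular expressions looks like in practice, as a hedged sketch (the toy lexer below is made up; bygroups and the '#pop' action are the real helpers exported by this module):

    from pygments.lexer import RegexLexer, bygroups
    from pygments.token import Comment, Name, Operator, Text

    class IniishLexer(RegexLexer):
        """Toy lexer for "key = value" lines with ; comments."""
        tokens = {
            'root': [
                (r';.*?$', Comment.Single),
                # bygroups() assigns one token type per regex group
                (r'(\w+)(\s*)(=)', bygroups(Name.Attribute, Text, Operator), 'value'),
                (r'\s+', Text),
            ],
            'value': [
                (r'.*?$', Text, '#pop'),   # consume the rest of the line, then pop
            ],
        }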
@@ -587 +626 @@
         statetokens = tokendefs[statestack[-1]]
         while 1:
             for rexmatch, action, new_state in statetokens:
                 m = rexmatch(text, pos)
                 if m:
-                    if type(action) is _TokenType:
-                        yield pos, action, m.group()
-                    else:
-                        for item in action(self, m):
-                            yield item
+                    if action is not None:
+                        if type(action) is _TokenType:
+                            yield pos, action, m.group()
+                        else:
+                            for item in action(self, m):
+                                yield item
                     pos = m.end()
                     if new_state is not None:
                         # state transition
                         if isinstance(new_state, tuple):
                             for state in new_state:
@@ -618 +658 @@
                 try:
                     if text[pos] == '\n':
                         # at EOL, reset state to "root"
                         statestack = ['root']
                         statetokens = tokendefs['root']
-                        yield pos, Text, '\n'
+                        yield pos, Text, u'\n'
                         pos += 1
                         continue
                     yield pos, Error, text[pos]
                     pos += 1
                 except IndexError:
@@ -635 +675 @@
     """
 
     def __init__(self, text, pos, stack=None, end=None):
         self.text = text
         self.pos = pos
         self.end = end or len(text)  # end=0 not supported ;-)
         self.stack = stack or ['root']
 
     def __repr__(self):
         return 'LexerContext(%r, %r, %r)' % (
             self.text, self.pos, self.stack)
@@ -665 +705 @@
             text = ctx.text
         while 1:
             for rexmatch, action, new_state in statetokens:
                 m = rexmatch(text, ctx.pos, ctx.end)
                 if m:
-                    if type(action) is _TokenType:
-                        yield ctx.pos, action, m.group()
-                        ctx.pos = m.end()
-                    else:
-                        for item in action(self, m, ctx):
-                            yield item
-                        if not new_state:
-                            # altered the state stack?
-                            statetokens = tokendefs[ctx.stack[-1]]
+                    if action is not None:
+                        if type(action) is _TokenType:
+                            yield ctx.pos, action, m.group()
+                            ctx.pos = m.end()
+                        else:
+                            for item in action(self, m, ctx):
+                                yield item
+                            if not new_state:
+                                # altered the state stack?
+                                statetokens = tokendefs[ctx.stack[-1]]
                     # CAUTION: callback must set ctx.pos!
                     if new_state is not None:
                         # state transition
                         if isinstance(new_state, tuple):
                             for state in new_state:
                                 if state == '#pop':
                                     ctx.stack.pop()
                                 elif state == '#push':
-                                    ctx.stack.append(statestack[-1])
+                                    ctx.stack.append(ctx.stack[-1])
                                 else:
                                     ctx.stack.append(state)
                         elif isinstance(new_state, int):
                             # pop
                             del ctx.stack[new_state:]
@@ -772 +813 @@
             try:
                 index, itokens = next(insertions)
             except StopIteration:
                 insleft = False
                 break  # not strictly necessary
+
+
+class ProfilingRegexLexerMeta(RegexLexerMeta):
+    """Metaclass for ProfilingRegexLexer, collects regex timing info."""
+
+    def _process_regex(cls, regex, rflags, state):
+        if isinstance(regex, words):
+            rex = regex_opt(regex.words, prefix=regex.prefix,
+                            suffix=regex.suffix)
+        else:
+            rex = regex
+        compiled = re.compile(rex, rflags)
+
+        def match_func(text, pos, endpos=sys.maxsize):
+            info = cls._prof_data[-1].setdefault((state, rex), [0, 0.0])
+            t0 = time.time()
+            res = compiled.match(text, pos, endpos)
+            t1 = time.time()
+            info[0] += 1
+            info[1] += t1 - t0
+            return res
+        return match_func
+
+
+@add_metaclass(ProfilingRegexLexerMeta)
+class ProfilingRegexLexer(RegexLexer):
+    """Drop-in replacement for RegexLexer that does profiling of its regexes."""
+
+    _prof_data = []
+    _prof_sort_index = 4  # defaults to time per call
+
+    def get_tokens_unprocessed(self, text, stack=('root',)):
+        # this needs to be a stack, since using(this) will produce nested calls
+        self.__class__._prof_data.append({})
+        for tok in RegexLexer.get_tokens_unprocessed(self, text, stack):
+            yield tok
+        rawdata = self.__class__._prof_data.pop()
+        data = sorted(((s, repr(r).strip('u\'').replace('\\\\', '\\')[:65],
+                        n, 1000 * t, 1000 * t / n)
+                       for ((s, r), (n, t)) in rawdata.items()),
+                      key=lambda x: x[self._prof_sort_index],
+                      reverse=True)
+        sum_total = sum(x[3] for x in data)
+
+        print()
+        print('Profiling result for %s lexing %d chars in %.3f ms' %
+              (self.__class__.__name__, len(text), sum_total))
+        print('=' * 110)
+        print('%-20s %-64s ncalls tottime percall' % ('state', 'regex'))
+        print('-' * 110)
+        for d in data:
+            print('%-20s %-65s %5d %8.4f %8.4f' % d)
+        print('=' * 110)
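A hedged usage sketch for the profiling lexer: mix it in front of an existing RegexLexer subclass so that its get_tokens_unprocessed and metaclass take precedence (the combination below is an assumption for illustration, not something this module ships):

    from pygments.lexer import ProfilingRegexLexer
    from pygments.lexers import PythonLexer

    class ProfilingPythonLexer(ProfilingRegexLexer, PythonLexer):
        pass

    code = "def f(x):\n    return x * 2\n"
    # prints the per-regex timing table once lexing is finished
    list(ProfilingPythonLexer().get_tokens(code))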