ThirdParty/Pygments/pygments/lexer.py

changeset 0:de9c2efb9d02 (child 12:1d8dd9706f46)
# -*- coding: utf-8 -*-
"""
    pygments.lexer
    ~~~~~~~~~~~~~~

    Base lexer classes.

    :copyright: Copyright 2006-2009 by the Pygments team, see AUTHORS.
    :license: BSD, see LICENSE for details.
"""
import re

try:
    set
except NameError:
    from sets import Set as set

from pygments.filter import apply_filters, Filter
from pygments.filters import get_filter_by_name
from pygments.token import Error, Text, Other, _TokenType
from pygments.util import get_bool_opt, get_int_opt, get_list_opt, \
     make_analysator


__all__ = ['Lexer', 'RegexLexer', 'ExtendedRegexLexer', 'DelegatingLexer',
           'LexerContext', 'include', 'flags', 'bygroups', 'using', 'this']


_default_analyse = staticmethod(lambda x: 0.0)


class LexerMeta(type):
    """
    This metaclass automagically converts ``analyse_text`` methods into
    static methods which always return float values.
    """

    def __new__(cls, name, bases, d):
        if 'analyse_text' in d:
            d['analyse_text'] = make_analysator(d['analyse_text'])
        return type.__new__(cls, name, bases, d)


class Lexer(object):
    """
    Lexer for a specific language.

    Basic options recognized:
    ``stripnl``
        Strip leading and trailing newlines from the input (default: True).
    ``stripall``
        Strip all leading and trailing whitespace from the input
        (default: False).
    ``tabsize``
        If given and greater than 0, expand tabs in the input (default: 0).
    ``encoding``
        If given, must be an encoding name. This encoding will be used to
        convert the input string to Unicode, if it is not already a Unicode
        string (default: ``'latin1'``).
        Can also be ``'guess'`` to use a simple UTF-8 / Latin1 detection, or
        ``'chardet'`` to use the chardet library, if it is installed.
    """

    #: Name of the lexer
    name = None

    #: Shortcuts for the lexer
    aliases = []

    #: File name globs
    filenames = []

    #: Secondary file name globs
    alias_filenames = []

    #: MIME types
    mimetypes = []

    __metaclass__ = LexerMeta

    def __init__(self, **options):
        self.options = options
        self.stripnl = get_bool_opt(options, 'stripnl', True)
        self.stripall = get_bool_opt(options, 'stripall', False)
        self.tabsize = get_int_opt(options, 'tabsize', 0)
        self.encoding = options.get('encoding', 'latin1')
        # self.encoding = options.get('inencoding', None) or self.encoding
        self.filters = []
        for filter_ in get_list_opt(options, 'filters', ()):
            self.add_filter(filter_)

    def __repr__(self):
        if self.options:
            return '<pygments.lexers.%s with %r>' % (self.__class__.__name__,
                                                     self.options)
        else:
            return '<pygments.lexers.%s>' % self.__class__.__name__

    def add_filter(self, filter_, **options):
        """
        Add a new stream filter to this lexer.
        """
        if not isinstance(filter_, Filter):
            filter_ = get_filter_by_name(filter_, **options)
        self.filters.append(filter_)
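        # Example call (assumes the named filter is registered in
        # pygments.filters):
        #
        #   lexer.add_filter('keywordcase', case='upper')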
106
107 def analyse_text(text):
108 """
109 Has to return a float between ``0`` and ``1`` that indicates
110 if a lexer wants to highlight this text. Used by ``guess_lexer``.
111 If this method returns ``0`` it won't highlight it in any case, if
112 it returns ``1`` highlighting with this lexer is guaranteed.
113
114 The `LexerMeta` metaclass automatically wraps this function so
115 that it works like a static method (no ``self`` or ``cls``
116 parameter) and the return value is automatically converted to
117 `float`. If the return value is an object that is boolean `False`
118 it's the same as if the return values was ``0.0``.
119 """

    def get_tokens(self, text, unfiltered=False):
        """
        Return an iterable of (tokentype, value) pairs generated from
        `text`. If `unfiltered` is set to `True`, the filtering mechanism
        is bypassed even if filters are defined.

        Also preprocess the text, i.e. expand tabs and strip it if
        wanted, and apply registered filters.
        """
        if not isinstance(text, unicode):
            if self.encoding == 'guess':
                try:
                    text = text.decode('utf-8')
                    if text.startswith(u'\ufeff'):
                        text = text[len(u'\ufeff'):]
                except UnicodeDecodeError:
                    text = text.decode('latin1')
            elif self.encoding == 'chardet':
                try:
                    import chardet
                except ImportError:
                    raise ImportError('To enable chardet encoding guessing, '
                                      'please install the chardet library '
                                      'from http://chardet.feedparser.org/')
                enc = chardet.detect(text)
                text = text.decode(enc['encoding'])
            else:
                text = text.decode(self.encoding)
        # text now *is* a unicode string
        text = text.replace('\r\n', '\n')
        text = text.replace('\r', '\n')
        if self.stripall:
            text = text.strip()
        elif self.stripnl:
            text = text.strip('\n')
        if self.tabsize > 0:
            text = text.expandtabs(self.tabsize)
        if not text.endswith('\n'):
            text += '\n'

        def streamer():
            for i, t, v in self.get_tokens_unprocessed(text):
                yield t, v
        stream = streamer()
        if not unfiltered:
            stream = apply_filters(stream, self.filters, self)
        return stream

    def get_tokens_unprocessed(self, text):
        """
        Return an iterable of (index, tokentype, value) tuples, where
        ``index`` is the starting position of the token in the input text.
        In subclasses, implement this method as a generator to
        maximize effectiveness.
        """
        raise NotImplementedError


class DelegatingLexer(Lexer):
    """
    This lexer takes two lexers as arguments. A root lexer and
    a language lexer. First everything is scanned using the language
    lexer, afterwards all ``Other`` tokens are lexed using the root
    lexer.

    The lexers from the ``template`` lexer package use this base lexer.
    """

    def __init__(self, _root_lexer, _language_lexer, _needle=Other, **options):
        self.root_lexer = _root_lexer(**options)
        self.language_lexer = _language_lexer(**options)
        self.needle = _needle
        Lexer.__init__(self, **options)

    def get_tokens_unprocessed(self, text):
        buffered = ''
        insertions = []
        lng_buffer = []
        for i, t, v in self.language_lexer.get_tokens_unprocessed(text):
            if t is self.needle:
                if lng_buffer:
                    insertions.append((len(buffered), lng_buffer))
                    lng_buffer = []
                buffered += v
            else:
                lng_buffer.append((i, t, v))
        if lng_buffer:
            insertions.append((len(buffered), lng_buffer))
        return do_insertions(insertions,
                             self.root_lexer.get_tokens_unprocessed(buffered))


#-------------------------------------------------------------------------------
# RegexLexer and ExtendedRegexLexer
#


class include(str):
    """
    Indicates that a state should include rules from another state.
    """
    pass


class combined(tuple):
    """
    Indicates a state combined from multiple states.
    """

    def __new__(cls, *args):
        return tuple.__new__(cls, args)

    def __init__(self, *args):
        # tuple.__init__ doesn't do anything
        pass


class _PseudoMatch(object):
    """
    A pseudo match object constructed from a string.
    """

    def __init__(self, start, text):
        self._text = text
        self._start = start

    def start(self, arg=None):
        return self._start

    def end(self, arg=None):
        return self._start + len(self._text)

    def group(self, arg=None):
        if arg:
            raise IndexError('No such group')
        return self._text

    def groups(self):
        return (self._text,)

    def groupdict(self):
        return {}


def bygroups(*args):
    """
    Callback that yields multiple actions for each group in the match.
    """
    def callback(lexer, match, ctx=None):
        for i, action in enumerate(args):
            if action is None:
                continue
            elif type(action) is _TokenType:
                data = match.group(i + 1)
                if data:
                    yield match.start(i + 1), action, data
            else:
                if ctx:
                    ctx.pos = match.start(i + 1)
                for item in action(lexer, _PseudoMatch(match.start(i + 1),
                                   match.group(i + 1)), ctx):
                    if item:
                        yield item
        if ctx:
            ctx.pos = match.end()
    return callback

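# Illustrative rule using bygroups (token types assumed to be imported from
# pygments.token in the defining module):
#
#   (r'(def)(\s+)([a-zA-Z_][a-zA-Z0-9_]*)',
#    bygroups(Keyword, Text, Name.Function)),
#
# each parenthesized regex group is emitted with the corresponding action.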

class _This(object):
    """
    Special singleton used for indicating the caller class.
    Used by ``using``.
    """
this = _This()


def using(_other, **kwargs):
    """
    Callback that processes the match with a different lexer.

    The keyword arguments are forwarded to the lexer, except `state` which
    is handled separately.

    `state` specifies the state that the new lexer will start in, and can
    be an enumerable such as ('root', 'inline', 'string') or a simple
    string which is assumed to be pushed on top of the root state.

    Note: For that to work, `_other` must not be an `ExtendedRegexLexer`.
    """
    gt_kwargs = {}
    if 'state' in kwargs:
        s = kwargs.pop('state')
        if isinstance(s, (list, tuple)):
            gt_kwargs['stack'] = s
        else:
            gt_kwargs['stack'] = ('root', s)

    if _other is this:
        def callback(lexer, match, ctx=None):
            # if keyword arguments are given the callback
            # function has to create a new lexer instance
            if kwargs:
                # XXX: cache that somehow
                kwargs.update(lexer.options)
                lx = lexer.__class__(**kwargs)
            else:
                lx = lexer
            s = match.start()
            for i, t, v in lx.get_tokens_unprocessed(match.group(), **gt_kwargs):
                yield i + s, t, v
            if ctx:
                ctx.pos = match.end()
    else:
        def callback(lexer, match, ctx=None):
            # XXX: cache that somehow
            kwargs.update(lexer.options)
            lx = _other(**kwargs)

            s = match.start()
            for i, t, v in lx.get_tokens_unprocessed(match.group(), **gt_kwargs):
                yield i + s, t, v
            if ctx:
                ctx.pos = match.end()
    return callback

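# Illustrative rules using ``using`` (lexer and state names are assumed,
# not taken from this file):
#
#   (r'(<script>)(.*?)(</script>)',
#    bygroups(Name.Tag, using(JavascriptLexer), Name.Tag)),
#   (r'\{\{.*?\}\}', using(this, state='expression')),
#
# the first form delegates a group to another lexer, the second re-enters
# the current lexer starting in the given state.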

class RegexLexerMeta(LexerMeta):
    """
    Metaclass for RegexLexer, creates the self._tokens attribute from
    self.tokens on the first instantiation.
    """

    def _process_state(cls, unprocessed, processed, state):
        assert type(state) is str, "wrong state name %r" % state
        assert state[0] != '#', "invalid state name %r" % state
        if state in processed:
            return processed[state]
        tokens = processed[state] = []
        rflags = cls.flags
        for tdef in unprocessed[state]:
            if isinstance(tdef, include):
                # it's a state reference
                assert tdef != state, "circular state reference %r" % state
                tokens.extend(cls._process_state(unprocessed, processed, str(tdef)))
                continue

            assert type(tdef) is tuple, "wrong rule def %r" % tdef

            try:
                rex = re.compile(tdef[0], rflags).match
            except Exception, err:
                raise ValueError("uncompilable regex %r in state %r of %r: %s" %
                                 (tdef[0], state, cls, err))

            assert type(tdef[1]) is _TokenType or callable(tdef[1]), \
                   'token type must be simple type or callable, not %r' % (tdef[1],)

            if len(tdef) == 2:
                new_state = None
            else:
                tdef2 = tdef[2]
                if isinstance(tdef2, str):
                    # an existing state
                    if tdef2 == '#pop':
                        new_state = -1
                    elif tdef2 in unprocessed:
                        new_state = (tdef2,)
                    elif tdef2 == '#push':
                        new_state = tdef2
                    elif tdef2[:5] == '#pop:':
                        new_state = -int(tdef2[5:])
                    else:
                        assert False, 'unknown new state %r' % tdef2
                elif isinstance(tdef2, combined):
                    # combine a new state from existing ones
                    new_state = '_tmp_%d' % cls._tmpname
                    cls._tmpname += 1
                    itokens = []
                    for istate in tdef2:
                        assert istate != state, 'circular state ref %r' % istate
                        itokens.extend(cls._process_state(unprocessed,
                                                          processed, istate))
                    processed[new_state] = itokens
                    new_state = (new_state,)
                elif isinstance(tdef2, tuple):
                    # push more than one state
                    for state in tdef2:
                        assert (state in unprocessed or
                                state in ('#pop', '#push')), \
                               'unknown new state ' + state
                    new_state = tdef2
                else:
                    assert False, 'unknown new state def %r' % tdef2
            tokens.append((rex, tdef[1], new_state))
        return tokens

    def process_tokendef(cls, name, tokendefs=None):
        processed = cls._all_tokens[name] = {}
        tokendefs = tokendefs or cls.tokens[name]
        for state in tokendefs.keys():
            cls._process_state(tokendefs, processed, state)
        return processed

    def __call__(cls, *args, **kwds):
        if not hasattr(cls, '_tokens'):
            cls._all_tokens = {}
            cls._tmpname = 0
            if hasattr(cls, 'token_variants') and cls.token_variants:
                # don't process yet
                pass
            else:
                cls._tokens = cls.process_tokendef('', cls.tokens)

        return type.__call__(cls, *args, **kwds)


class RegexLexer(Lexer):
    """
    Base for simple stateful regular expression-based lexers.
    Simplifies the lexing process so that you need only
    provide a list of states and regular expressions.
    """
    __metaclass__ = RegexLexerMeta

    #: Flags for compiling the regular expressions.
    #: Defaults to MULTILINE.
    flags = re.MULTILINE

    #: Dict of ``{'state': [(regex, tokentype, new_state), ...], ...}``
    #:
    #: The initial state is 'root'.
    #: ``new_state`` can be omitted to signify no state transition.
    #: If it is a string, the state is pushed on the stack and changed.
    #: If it is a tuple of strings, all states are pushed on the stack and
    #: the current state will be the topmost.
    #: It can also be ``combined('state1', 'state2', ...)``
    #: to signify a new, anonymous state combined from the rules of two
    #: or more existing ones.
    #: Furthermore, it can be '#pop' to signify going back one step in
    #: the state stack, or '#push' to push the current state on the stack
    #: again.
    #:
    #: The tuple can also be replaced with ``include('state')``, in which
    #: case the rules from the state named by the string are included in the
    #: current one.
    tokens = {}
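    # Hedged sketch of a subclass's token table (state names, patterns and
    # token types are illustrative; the token types would be imported from
    # pygments.token):
    #
    #   tokens = {
    #       'root': [
    #           (r'#.*?$', Comment),            # no state change
    #           (r'"', String, 'string'),       # push 'string'
    #           (r'\s+', Text),
    #       ],
    #       'string': [
    #           (r'[^"\\]+', String),
    #           (r'\\.', String.Escape),
    #           (r'"', String, '#pop'),         # back to the previous state
    #       ],
    #   }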

    def get_tokens_unprocessed(self, text, stack=('root',)):
        """
        Split ``text`` into (index, tokentype, value) tuples.

        ``stack`` is the initial stack (default: ``['root']``)
        """
        pos = 0
        tokendefs = self._tokens
        statestack = list(stack)
        statetokens = tokendefs[statestack[-1]]
        while 1:
            for rexmatch, action, new_state in statetokens:
                m = rexmatch(text, pos)
                if m:
                    if type(action) is _TokenType:
                        yield pos, action, m.group()
                    else:
                        for item in action(self, m):
                            yield item
                    pos = m.end()
                    if new_state is not None:
                        # state transition
                        if isinstance(new_state, tuple):
                            for state in new_state:
                                if state == '#pop':
                                    statestack.pop()
                                elif state == '#push':
                                    statestack.append(statestack[-1])
                                else:
                                    statestack.append(state)
                        elif isinstance(new_state, int):
                            # pop
                            del statestack[new_state:]
                        elif new_state == '#push':
                            statestack.append(statestack[-1])
                        else:
                            assert False, "wrong state def: %r" % new_state
                        statetokens = tokendefs[statestack[-1]]
                    break
            else:
                try:
                    if text[pos] == '\n':
                        # at EOL, reset state to "root"
                        pos += 1
                        statestack = ['root']
                        statetokens = tokendefs['root']
                        yield pos, Text, u'\n'
                        continue
                    yield pos, Error, text[pos]
                    pos += 1
                except IndexError:
                    break


class LexerContext(object):
    """
    A helper object that holds lexer position data.
    """

    def __init__(self, text, pos, stack=None, end=None):
        self.text = text
        self.pos = pos
        self.end = end or len(text) # end=0 not supported ;-)
        self.stack = stack or ['root']

    def __repr__(self):
        return 'LexerContext(%r, %r, %r)' % (
            self.text, self.pos, self.stack)

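# Assumed usage sketch: ExtendedRegexLexer callbacks receive such a context
# and can resume lexing from an explicit position, e.g.
#
#   ctx = LexerContext(text, 0)
#   for index, tokentype, value in lexer.get_tokens_unprocessed(context=ctx):
#       ...
#
# where ``lexer`` is an ExtendedRegexLexer instance.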

class ExtendedRegexLexer(RegexLexer):
    """
    A RegexLexer that uses a context object to store its state.
    """

    def get_tokens_unprocessed(self, text=None, context=None):
        """
        Split ``text`` into (index, tokentype, value) tuples.
        If ``context`` is given, use this lexer context instead.
        """
        tokendefs = self._tokens
        if not context:
            ctx = LexerContext(text, 0)
            statetokens = tokendefs['root']
        else:
            ctx = context
            statetokens = tokendefs[ctx.stack[-1]]
            text = ctx.text
        while 1:
            for rexmatch, action, new_state in statetokens:
                m = rexmatch(text, ctx.pos, ctx.end)
                if m:
                    if type(action) is _TokenType:
                        yield ctx.pos, action, m.group()
                        ctx.pos = m.end()
                    else:
                        for item in action(self, m, ctx):
                            yield item
                        if not new_state:
                            # altered the state stack?
                            statetokens = tokendefs[ctx.stack[-1]]
                        # CAUTION: callback must set ctx.pos!
                    if new_state is not None:
                        # state transition
                        if isinstance(new_state, tuple):
                            ctx.stack.extend(new_state)
                        elif isinstance(new_state, int):
                            # pop
                            del ctx.stack[new_state:]
                        elif new_state == '#push':
                            ctx.stack.append(ctx.stack[-1])
                        else:
                            assert False, "wrong state def: %r" % new_state
                        statetokens = tokendefs[ctx.stack[-1]]
                    break
            else:
                try:
                    if ctx.pos >= ctx.end:
                        break
                    if text[ctx.pos] == '\n':
                        # at EOL, reset state to "root"
                        ctx.pos += 1
                        ctx.stack = ['root']
                        statetokens = tokendefs['root']
                        yield ctx.pos, Text, u'\n'
                        continue
                    yield ctx.pos, Error, text[ctx.pos]
                    ctx.pos += 1
                except IndexError:
                    break


def do_insertions(insertions, tokens):
    """
    Helper for lexers which must combine the results of several
    sublexers.

    ``insertions`` is a list of ``(index, itokens)`` pairs.
    Each ``itokens`` iterable should be inserted at position
    ``index`` into the token stream given by the ``tokens``
    argument.

    The result is a combined token stream.

    TODO: clean up the code here.
    """
    insertions = iter(insertions)
    try:
        index, itokens = insertions.next()
    except StopIteration:
        # no insertions
        for item in tokens:
            yield item
        return

    realpos = None
    insleft = True

    # iterate over the token stream where we want to insert
    # the tokens from the insertion list.
    for i, t, v in tokens:
        # first iteration: store the position of the first item
        if realpos is None:
            realpos = i
        oldi = 0
        while insleft and i + len(v) >= index:
            tmpval = v[oldi:index - i]
            yield realpos, t, tmpval
            realpos += len(tmpval)
            for it_index, it_token, it_value in itokens:
                yield realpos, it_token, it_value
                realpos += len(it_value)
            oldi = index - i
            try:
                index, itokens = insertions.next()
            except StopIteration:
                insleft = False
                break  # not strictly necessary
        yield realpos, t, v[oldi:]
        realpos += len(v) - oldi

    # leftover tokens
    if insleft:
        # no normal tokens, set realpos to zero
        realpos = realpos or 0
        for p, t, v in itokens:
            yield realpos, t, v
            realpos += len(v)
