ThirdParty/Pygments/pygments/lexer.py

changeset 4172:4f20dba37ab6
parent    3484:645c12de6b0c
child     4697:c2e9bf425554
diff -r 8bc578136279 -r 4f20dba37ab6 ThirdParty/Pygments/pygments/lexer.py
--- a/ThirdParty/Pygments/pygments/lexer.py	Wed Mar 11 18:25:37 2015 +0100
+++ b/ThirdParty/Pygments/pygments/lexer.py	Wed Mar 11 18:32:27 2015 +0100
@@ -5,45 +5,38 @@
 
     Base lexer classes.
 
-    :copyright: Copyright 2006-2013 by the Pygments team, see AUTHORS.
+    :copyright: Copyright 2006-2014 by the Pygments team, see AUTHORS.
     :license: BSD, see LICENSE for details.
 """
-try:
-    str = unicode
-except NameError:
-    basestring = str
+
+from __future__ import print_function
 
-import re, itertools
+import re
+import sys
+import time
+import itertools
 
 from pygments.filter import apply_filters, Filter
 from pygments.filters import get_filter_by_name
 from pygments.token import Error, Text, Other, _TokenType
 from pygments.util import get_bool_opt, get_int_opt, get_list_opt, \
-     make_analysator
-import collections
-
+    make_analysator, text_type, add_metaclass, iteritems, Future, guess_decode
+from pygments.regexopt import regex_opt
 
 __all__ = ['Lexer', 'RegexLexer', 'ExtendedRegexLexer', 'DelegatingLexer',
-           'LexerContext', 'include', 'inherit', 'bygroups', 'using', 'this']
+           'LexerContext', 'include', 'inherit', 'bygroups', 'using', 'this',
+           'default', 'words']
 
 
-_encoding_map = [('\xef\xbb\xbf', 'utf-8'),
-                 ('\xff\xfe\0\0', 'utf-32'),
-                 ('\0\0\xfe\xff', 'utf-32be'),
-                 ('\xff\xfe', 'utf-16'),
-                 ('\xfe\xff', 'utf-16be')]
+_encoding_map = [(b'\xef\xbb\xbf', 'utf-8'),
+                 (b'\xff\xfe\0\0', 'utf-32'),
+                 (b'\0\0\xfe\xff', 'utf-32be'),
+                 (b'\xff\xfe', 'utf-16'),
+                 (b'\xfe\xff', 'utf-16be')]
 
 _default_analyse = staticmethod(lambda x: 0.0)
 
 
-def with_metaclass(meta, base=object):
-    """
-    Python independent version to create a base class with a metaclass.
-    Taken from six 1.3.0 (http://pythonhosted.org/six)
-    """
-    return meta("NewBase", (base,), {})
-
-
 class LexerMeta(type):
     """
     This metaclass automagically converts ``analyse_text`` methods into
@@ -56,7 +49,8 @@
         return type.__new__(cls, name, bases, d)
 
 
-class Lexer(with_metaclass(LexerMeta, object)):
+@add_metaclass(LexerMeta)
+class Lexer(object):
     """
     Lexer for a specific language.
 
@@ -69,15 +63,19 @@
     ``ensurenl``
         Make sure that the input ends with a newline (default: True).  This
         is required for some lexers that consume input linewise.
-        *New in Pygments 1.3.*
+
+        .. versionadded:: 1.3
+
     ``tabsize``
         If given and greater than 0, expand tabs in the input (default: 0).
     ``encoding``
         If given, must be an encoding name. This encoding will be used to
         convert the input string to Unicode, if it is not already a Unicode
-        string (default: ``'latin1'``).
-        Can also be ``'guess'`` to use a simple UTF-8 / Latin1 detection, or
-        ``'chardet'`` to use the chardet library, if it is installed.
+        string (default: ``'guess'``, which uses a simple UTF-8 / Locale /
+        Latin1 detection.  Can also be ``'chardet'`` to use the chardet
+        library, if it is installed.
+    ``inencoding``
+        Overrides the ``encoding`` if given.
     """
 
     #: Name of the lexer
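
The new ``'guess'`` default and the ``inencoding`` override are ordinary constructor options. A minimal sketch of how they might be passed, assuming ``PythonLexer`` purely as a stand-in for any concrete lexer:

    from pygments.lexers import PythonLexer

    # illustrative only: any lexer built on this base class accepts these options
    lx = PythonLexer(tabsize=4,           # expand tabs in the input
                     ensurenl=True,       # guarantee a trailing newline (the default)
                     inencoding='utf-8')  # overrides 'encoding' if given
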
@@ -104,8 +102,8 @@
         self.stripall = get_bool_opt(options, 'stripall', False)
         self.ensurenl = get_bool_opt(options, 'ensurenl', True)
         self.tabsize = get_int_opt(options, 'tabsize', 0)
-        self.encoding = options.get('encoding', 'latin1')
-        # self.encoding = options.get('inencoding', None) or self.encoding
+        self.encoding = options.get('encoding', 'guess')
+        self.encoding = options.get('inencoding') or self.encoding
         self.filters = []
         for filter_ in get_list_opt(options, 'filters', ()):
             self.add_filter(filter_)
@@ -148,14 +146,9 @@
         Also preprocess the text, i.e. expand tabs and strip it if
         wanted and applies registered filters.
         """
-        if not isinstance(text, str):
+        if not isinstance(text, text_type):
             if self.encoding == 'guess':
-                try:
-                    text = text.decode('utf-8')
-                    if text.startswith('\ufeff'):
-                        text = text[len('\ufeff'):]
-                except UnicodeDecodeError:
-                    text = text.decode('latin1')
+                text, _ = guess_decode(text)
             elif self.encoding == 'chardet':
                 try:
                     import chardet
@@ -167,20 +160,21 @@
                 decoded = None
                 for bom, encoding in _encoding_map:
                     if text.startswith(bom):
-                        decoded = str(text[len(bom):], encoding,
-                                          errors='replace')
+                        decoded = text[len(bom):].decode(encoding, 'replace')
                         break
                 # no BOM found, so use chardet
                 if decoded is None:
-                    enc = chardet.detect(text[:1024]) # Guess using first 1KB
-                    decoded = str(text, enc.get('encoding') or 'utf-8',
-                                      errors='replace')
+                    enc = chardet.detect(text[:1024])  # Guess using first 1KB
+                    decoded = text.decode(enc.get('encoding') or 'utf-8',
+                                          'replace')
                 text = decoded
             else:
                 text = text.decode(self.encoding)
+                if text.startswith(u'\ufeff'):
+                    text = text[len(u'\ufeff'):]
         else:
-            if text.startswith('\ufeff'):
-                text = text[len('\ufeff'):]
+            if text.startswith(u'\ufeff'):
+                text = text[len(u'\ufeff'):]
 
         # text now *is* a unicode string
         text = text.replace('\r\n', '\n')
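
The branches above run whenever byte input reaches ``get_tokens``. A hedged sketch (``PythonLexer`` again only for illustration): with an explicit encoding the input is decoded and a leading BOM dropped before lexing.

    from pygments.lexers import PythonLexer

    data = b'\xef\xbb\xbfx = 1\n'         # UTF-8 BOM followed by the source
    lx = PythonLexer(encoding='utf-8')    # decode with the given codec, then strip u'\ufeff'
    tokens = list(lx.get_tokens(data))    # the BOM never reaches the token stream
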
@@ -204,7 +198,9 @@
 
     def get_tokens_unprocessed(self, text):
         """
-        Return an iterable of (tokentype, value) pairs.
+        Return an iterable of (index, tokentype, value) pairs where "index"
+        is the starting position of the token within the input text.
+
         In subclasses, implement this method as a generator to
         maximize effectiveness.
         """
@@ -245,7 +241,7 @@
                              self.root_lexer.get_tokens_unprocessed(buffered))
 
 
-#-------------------------------------------------------------------------------
+# ------------------------------------------------------------------------------
 # RegexLexer and ExtendedRegexLexer
 #
 
@@ -391,25 +387,55 @@
     return callback
 
 
+class default:
+    """
+    Indicates a state or state action (e.g. #pop) to apply.
+    For example default('#pop') is equivalent to ('', Token, '#pop')
+    Note that state tuples may be used as well.
+
+    .. versionadded:: 2.0
+    """
+    def __init__(self, state):
+        self.state = state
+
+
+class words(Future):
+    """
+    Indicates a list of literal words that is transformed into an optimized
+    regex that matches any of the words.
+
+    .. versionadded:: 2.0
+    """
+    def __init__(self, words, prefix='', suffix=''):
+        self.words = words
+        self.prefix = prefix
+        self.suffix = suffix
+
+    def get(self):
+        return regex_opt(self.words, prefix=self.prefix, suffix=self.suffix)
+
+
 class RegexLexerMeta(LexerMeta):
     """
     Metaclass for RegexLexer, creates the self._tokens attribute from
     self.tokens on the first instantiation.
     """
 
-    def _process_regex(cls, regex, rflags):
+    def _process_regex(cls, regex, rflags, state):
         """Preprocess the regular expression component of a token definition."""
+        if isinstance(regex, Future):
+            regex = regex.get()
         return re.compile(regex, rflags).match
 
     def _process_token(cls, token):
         """Preprocess the token component of a token definition."""
-        assert type(token) is _TokenType or isinstance(token, collections.Callable), \
-               'token type must be simple type or callable, not %r' % (token,)
+        assert type(token) is _TokenType or callable(token), \
+            'token type must be simple type or callable, not %r' % (token,)
         return token
 
     def _process_new_state(cls, new_state, unprocessed, processed):
         """Preprocess the state transition action of a token definition."""
-        if isinstance(new_state, basestring):
+        if isinstance(new_state, str):
             # an existing state
             if new_state == '#pop':
                 return -1
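
Both new helpers are meant to appear directly in a lexer's ``tokens`` table. A small hypothetical lexer (``MiniLexer`` and its rules are invented for illustration):

    from pygments.lexer import RegexLexer, words, default
    from pygments.token import Keyword, Name, Text

    class MiniLexer(RegexLexer):
        tokens = {
            'root': [
                # words() is compiled into one optimized alternation regex
                (words(('if', 'else', 'while'), suffix=r'\b'), Keyword),
                (r'\s+', Text),
                # default() emits no token; it only performs the state transition
                default('ident'),
            ],
            'ident': [
                (r'[A-Za-z_]\w*', Name, '#pop'),
            ],
        }
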
@@ -437,14 +463,14 @@
             for istate in new_state:
                 assert (istate in unprocessed or
                         istate in ('#pop', '#push')), \
-                       'unknown new state ' + istate
+                    'unknown new state ' + istate
             return new_state
         else:
             assert False, 'unknown new state def %r' % new_state
 
     def _process_state(cls, unprocessed, processed, state):
         """Preprocess a single state definition."""
-        assert isinstance(state, basestring), "wrong state name %r" % state
+        assert type(state) is str, "wrong state name %r" % state
         assert state[0] != '#', "invalid state name %r" % state
         if state in processed:
             return processed[state]
@@ -458,13 +484,19 @@
                                                  str(tdef)))
                 continue
             if isinstance(tdef, _inherit):
-                # processed already
+                # should be processed already, but may not in the case of:
+                # 1. the state has no counterpart in any parent
+                # 2. the state includes more than one 'inherit'
+                continue
+            if isinstance(tdef, default):
+                new_state = cls._process_new_state(tdef.state, unprocessed, processed)
+                tokens.append((re.compile('').match, None, new_state))
                 continue
 
             assert type(tdef) is tuple, "wrong rule def %r" % tdef
 
             try:
-                rex = cls._process_regex(tdef[0], rflags)
+                rex = cls._process_regex(tdef[0], rflags, state)
             except Exception as err:
                 raise ValueError("uncompilable regex %r in state %r of %r: %s" %
                                  (tdef[0], state, cls, err))
@@ -484,7 +516,7 @@
         """Preprocess a dictionary of token definitions."""
         processed = cls._all_tokens[name] = {}
         tokendefs = tokendefs or cls.tokens[name]
-        for state in list(tokendefs.keys()):
+        for state in list(tokendefs):
             cls._process_state(tokendefs, processed, state)
         return processed
 
@@ -502,12 +534,16 @@
         """
         tokens = {}
         inheritable = {}
-        for c in itertools.chain((cls,), cls.__mro__):
+        for c in cls.__mro__:
             toks = c.__dict__.get('tokens', {})
 
-            for state, items in toks.items():
+            for state, items in iteritems(toks):
                 curitems = tokens.get(state)
                 if curitems is None:
+                    # N.b. because this is assigned by reference, sufficiently
+                    # deep hierarchies are processed incrementally (e.g. for
+                    # A(B), B(C), C(RegexLexer), B will be premodified so X(B)
+                    # will not see any inherits in B).
                     tokens[state] = items
                     try:
                         inherit_ndx = items.index(inherit)
@@ -523,6 +559,8 @@
                 # Replace the "inherit" value with the items
                 curitems[inherit_ndx:inherit_ndx+1] = items
                 try:
+                    # N.b. this is the index in items (that is, the superclass
+                    # copy), so offset required when storing below.
                     new_inh_ndx = items.index(inherit)
                 except ValueError:
                     pass
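
The merge implemented here is what makes ``inherit`` usable from subclasses. A hypothetical sketch of the pattern (class names and rules invented for illustration):

    from pygments.lexer import RegexLexer, inherit
    from pygments.token import Comment, Keyword

    class BaseLang(RegexLexer):
        tokens = {'root': [(r'\b(if|else)\b', Keyword)]}

    class ExtendedLang(BaseLang):
        tokens = {
            'root': [
                (r'#.*$', Comment.Single),
                inherit,  # BaseLang's 'root' rules are spliced in at this position
            ],
        }
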
@@ -545,7 +583,8 @@
         return type.__call__(cls, *args, **kwds)
 
 
-class RegexLexer(with_metaclass(RegexLexerMeta, Lexer)):
+@add_metaclass(RegexLexerMeta)
+class RegexLexer(Lexer):
     """
     Base for simple stateful regular expression-based lexers.
     Simplifies the lexing process so that you need only
@@ -589,11 +628,12 @@
             for rexmatch, action, new_state in statetokens:
                 m = rexmatch(text, pos)
                 if m:
-                    if type(action) is _TokenType:
-                        yield pos, action, m.group()
-                    else:
-                        for item in action(self, m):
-                            yield item
+                    if action is not None:
+                        if type(action) is _TokenType:
+                            yield pos, action, m.group()
+                        else:
+                            for item in action(self, m):
+                                yield item
                     pos = m.end()
                     if new_state is not None:
                         # state transition
@@ -620,7 +660,7 @@
                         # at EOL, reset state to "root"
                         statestack = ['root']
                         statetokens = tokendefs['root']
-                        yield pos, Text, '\n'
+                        yield pos, Text, u'\n'
                         pos += 1
                         continue
                     yield pos, Error, text[pos]
@@ -637,7 +677,7 @@
     def __init__(self, text, pos, stack=None, end=None):
         self.text = text
         self.pos = pos
-        self.end = end or len(text) # end=0 not supported ;-)
+        self.end = end or len(text)  # end=0 not supported ;-)
         self.stack = stack or ['root']
 
     def __repr__(self):
@@ -667,15 +707,16 @@
             for rexmatch, action, new_state in statetokens:
                 m = rexmatch(text, ctx.pos, ctx.end)
                 if m:
-                    if type(action) is _TokenType:
-                        yield ctx.pos, action, m.group()
-                        ctx.pos = m.end()
-                    else:
-                        for item in action(self, m, ctx):
-                            yield item
-                        if not new_state:
-                            # altered the state stack?
-                            statetokens = tokendefs[ctx.stack[-1]]
+                    if action is not None:
+                        if type(action) is _TokenType:
+                            yield ctx.pos, action, m.group()
+                            ctx.pos = m.end()
+                        else:
+                            for item in action(self, m, ctx):
+                                yield item
+                            if not new_state:
+                                # altered the state stack?
+                                statetokens = tokendefs[ctx.stack[-1]]
                     # CAUTION: callback must set ctx.pos!
                     if new_state is not None:
                         # state transition
@@ -684,7 +725,7 @@
                                 if state == '#pop':
                                     ctx.stack.pop()
                                 elif state == '#push':
-                                    ctx.stack.append(statestack[-1])
+                                    ctx.stack.append(ctx.stack[-1])
                                 else:
                                     ctx.stack.append(state)
                         elif isinstance(new_state, int):
@@ -704,7 +745,7 @@
                         # at EOL, reset state to "root"
                         ctx.stack = ['root']
                         statetokens = tokendefs['root']
-                        yield ctx.pos, Text, '\n'
+                        yield ctx.pos, Text, u'\n'
                         ctx.pos += 1
                         continue
                     yield ctx.pos, Error, text[ctx.pos]
@@ -774,3 +815,56 @@
         except StopIteration:
             insleft = False
             break  # not strictly necessary
+
+
+class ProfilingRegexLexerMeta(RegexLexerMeta):
+    """Metaclass for ProfilingRegexLexer, collects regex timing info."""
+
+    def _process_regex(cls, regex, rflags, state):
+        if isinstance(regex, words):
+            rex = regex_opt(regex.words, prefix=regex.prefix,
+                            suffix=regex.suffix)
+        else:
+            rex = regex
+        compiled = re.compile(rex, rflags)
+
+        def match_func(text, pos, endpos=sys.maxsize):
+            info = cls._prof_data[-1].setdefault((state, rex), [0, 0.0])
+            t0 = time.time()
+            res = compiled.match(text, pos, endpos)
+            t1 = time.time()
+            info[0] += 1
+            info[1] += t1 - t0
+            return res
+        return match_func
+
+
+@add_metaclass(ProfilingRegexLexerMeta)
+class ProfilingRegexLexer(RegexLexer):
+    """Drop-in replacement for RegexLexer that does profiling of its regexes."""
+
+    _prof_data = []
+    _prof_sort_index = 4  # defaults to time per call
+
+    def get_tokens_unprocessed(self, text, stack=('root',)):
+        # this needs to be a stack, since using(this) will produce nested calls
+        self.__class__._prof_data.append({})
+        for tok in RegexLexer.get_tokens_unprocessed(self, text, stack):
+            yield tok
+        rawdata = self.__class__._prof_data.pop()
+        data = sorted(((s, repr(r).strip('u\'').replace('\\\\', '\\')[:65],
+                        n, 1000 * t, 1000 * t / n)
+                       for ((s, r), (n, t)) in rawdata.items()),
+                      key=lambda x: x[self._prof_sort_index],
+                      reverse=True)
+        sum_total = sum(x[3] for x in data)
+
+        print()
+        print('Profiling result for %s lexing %d chars in %.3f ms' %
+              (self.__class__.__name__, len(text), sum_total))
+        print('=' * 110)
+        print('%-20s %-64s ncalls  tottime  percall' % ('state', 'regex'))
+        print('-' * 110)
+        for d in data:
+            print('%-20s %-65s %5d %8.4f %8.4f' % d)
+        print('=' * 110)
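
A possible way to exercise the new profiling support (a hedged sketch; the toy rules are invented for illustration). The timing table is printed once the token generator is exhausted:

    from pygments.lexer import ProfilingRegexLexer
    from pygments.token import Name, Text

    class ToyProfilingLexer(ProfilingRegexLexer):
        tokens = {
            'root': [
                (r'[A-Za-z_]\w*', Name),
                (r'\s+', Text),
            ],
        }

    # consuming the generator is what triggers the printed profile
    list(ToyProfilingLexer().get_tokens_unprocessed(u'foo bar baz\n' * 200))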
