diff -r 8bc578136279 -r 4f20dba37ab6 ThirdParty/Pygments/pygments/lexer.py
--- a/ThirdParty/Pygments/pygments/lexer.py	Wed Mar 11 18:25:37 2015 +0100
+++ b/ThirdParty/Pygments/pygments/lexer.py	Wed Mar 11 18:32:27 2015 +0100
@@ -5,45 +5,38 @@
 
     Base lexer classes.
 
-    :copyright: Copyright 2006-2013 by the Pygments team, see AUTHORS.
+    :copyright: Copyright 2006-2014 by the Pygments team, see AUTHORS.
     :license: BSD, see LICENSE for details.
 """
-try:
-    str = unicode
-except NameError:
-    basestring = str
+
+from __future__ import print_function
 
-import re, itertools
+import re
+import sys
+import time
+import itertools
 
 from pygments.filter import apply_filters, Filter
 from pygments.filters import get_filter_by_name
 from pygments.token import Error, Text, Other, _TokenType
 from pygments.util import get_bool_opt, get_int_opt, get_list_opt, \
-     make_analysator
-import collections
-
+    make_analysator, text_type, add_metaclass, iteritems, Future, guess_decode
+from pygments.regexopt import regex_opt
 
 __all__ = ['Lexer', 'RegexLexer', 'ExtendedRegexLexer', 'DelegatingLexer',
-           'LexerContext', 'include', 'inherit', 'bygroups', 'using', 'this']
+           'LexerContext', 'include', 'inherit', 'bygroups', 'using', 'this',
+           'default', 'words']
 
 
-_encoding_map = [('\xef\xbb\xbf', 'utf-8'),
-                 ('\xff\xfe\0\0', 'utf-32'),
-                 ('\0\0\xfe\xff', 'utf-32be'),
-                 ('\xff\xfe', 'utf-16'),
-                 ('\xfe\xff', 'utf-16be')]
+_encoding_map = [(b'\xef\xbb\xbf', 'utf-8'),
+                 (b'\xff\xfe\0\0', 'utf-32'),
+                 (b'\0\0\xfe\xff', 'utf-32be'),
+                 (b'\xff\xfe', 'utf-16'),
+                 (b'\xfe\xff', 'utf-16be')]
 
 _default_analyse = staticmethod(lambda x: 0.0)
 
 
-def with_metaclass(meta, base=object):
-    """
-    Python independent version to create a base class with a metaclass.
-    Taken from six 1.3.0 (http://pythonhosted.org/six)
-    """
-    return meta("NewBase", (base,), {})
-
-
 class LexerMeta(type):
     """
     This metaclass automagically converts ``analyse_text`` methods into
@@ -56,7 +49,8 @@
         return type.__new__(cls, name, bases, d)
 
 
-class Lexer(with_metaclass(LexerMeta, object)):
+@add_metaclass(LexerMeta)
+class Lexer(object):
     """
     Lexer for a specific language.
 
@@ -69,15 +63,19 @@
     ``ensurenl``
         Make sure that the input ends with a newline (default: True).  This
         is required for some lexers that consume input linewise.
-        *New in Pygments 1.3.*
+
+        .. versionadded:: 1.3
+
     ``tabsize``
         If given and greater than 0, expand tabs in the input (default: 0).
     ``encoding``
         If given, must be an encoding name. This encoding will be used to
         convert the input string to Unicode, if it is not already a Unicode
-        string (default: ``'latin1'``).
-        Can also be ``'guess'`` to use a simple UTF-8 / Latin1 detection, or
-        ``'chardet'`` to use the chardet library, if it is installed.
+        string (default: ``'guess'``, which uses a simple UTF-8 / Locale /
+        Latin1 detection.  Can also be ``'chardet'`` to use the chardet
+        library, if it is installed.
+    ``inencoding``
+        Overrides the ``encoding`` if given.
     """
 
     #: Name of the lexer
@@ -104,8 +102,8 @@
         self.stripall = get_bool_opt(options, 'stripall', False)
         self.ensurenl = get_bool_opt(options, 'ensurenl', True)
         self.tabsize = get_int_opt(options, 'tabsize', 0)
-        self.encoding = options.get('encoding', 'latin1')
-        # self.encoding = options.get('inencoding', None) or self.encoding
+        self.encoding = options.get('encoding', 'guess')
+        self.encoding = options.get('inencoding') or self.encoding
         self.filters = []
         for filter_ in get_list_opt(options, 'filters', ()):
            self.add_filter(filter_)
@@ -148,14 +146,9 @@
         Also preprocess the text, i.e. expand tabs and strip it if
         wanted and applies registered filters.
         """
-        if not isinstance(text, str):
+        if not isinstance(text, text_type):
             if self.encoding == 'guess':
-                try:
-                    text = text.decode('utf-8')
-                    if text.startswith('\ufeff'):
-                        text = text[len('\ufeff'):]
-                except UnicodeDecodeError:
-                    text = text.decode('latin1')
+                text, _ = guess_decode(text)
             elif self.encoding == 'chardet':
                 try:
                     import chardet
@@ -167,20 +160,21 @@
                 decoded = None
                 for bom, encoding in _encoding_map:
                     if text.startswith(bom):
-                        decoded = str(text[len(bom):], encoding,
-                                          errors='replace')
+                        decoded = text[len(bom):].decode(encoding, 'replace')
                         break
                 # no BOM found, so use chardet
                 if decoded is None:
-                    enc = chardet.detect(text[:1024]) # Guess using first 1KB
-                    decoded = str(text, enc.get('encoding') or 'utf-8',
-                                      errors='replace')
+                    enc = chardet.detect(text[:1024])  # Guess using first 1KB
+                    decoded = text.decode(enc.get('encoding') or 'utf-8',
+                                          'replace')
                 text = decoded
             else:
                 text = text.decode(self.encoding)
+                if text.startswith(u'\ufeff'):
+                    text = text[len(u'\ufeff'):]
         else:
-            if text.startswith('\ufeff'):
-                text = text[len('\ufeff'):]
+            if text.startswith(u'\ufeff'):
+                text = text[len(u'\ufeff'):]
 
         # text now *is* a unicode string
         text = text.replace('\r\n', '\n')
@@ -204,7 +198,9 @@
 
     def get_tokens_unprocessed(self, text):
         """
-        Return an iterable of (tokentype, value) pairs.
+        Return an iterable of (index, tokentype, value) pairs where "index"
+        is the starting position of the token within the input text.
+
         In subclasses, implement this method as a generator to
         maximize effectiveness.
         """
@@ -245,7 +241,7 @@
                              self.root_lexer.get_tokens_unprocessed(buffered))
 
 
-#-------------------------------------------------------------------------------
+# ------------------------------------------------------------------------------
 # RegexLexer and ExtendedRegexLexer
 #
 
@@ -391,25 +387,55 @@
     return callback
 
 
+class default:
+    """
+    Indicates a state or state action (e.g. #pop) to apply.
+    For example default('#pop') is equivalent to ('', Token, '#pop')
+    Note that state tuples may be used as well.
+
+    .. versionadded:: 2.0
+    """
+    def __init__(self, state):
+        self.state = state
+
+
+class words(Future):
+    """
+    Indicates a list of literal words that is transformed into an optimized
+    regex that matches any of the words.
+
+    .. versionadded:: 2.0
+    """
+    def __init__(self, words, prefix='', suffix=''):
+        self.words = words
+        self.prefix = prefix
+        self.suffix = suffix
+
+    def get(self):
+        return regex_opt(self.words, prefix=self.prefix, suffix=self.suffix)
+
+
 class RegexLexerMeta(LexerMeta):
     """
     Metaclass for RegexLexer, creates the self._tokens attribute from
     self.tokens on the first instantiation.
     """
 
-    def _process_regex(cls, regex, rflags):
+    def _process_regex(cls, regex, rflags, state):
         """Preprocess the regular expression component of a token definition."""
+        if isinstance(regex, Future):
+            regex = regex.get()
         return re.compile(regex, rflags).match
 
     def _process_token(cls, token):
         """Preprocess the token component of a token definition."""
-        assert type(token) is _TokenType or isinstance(token, collections.Callable), \
-            'token type must be simple type or callable, not %r' % (token,)
+        assert type(token) is _TokenType or callable(token), \
+            'token type must be simple type or callable, not %r' % (token,)
         return token
 
     def _process_new_state(cls, new_state, unprocessed, processed):
         """Preprocess the state transition action of a token definition."""
-        if isinstance(new_state, basestring):
+        if isinstance(new_state, str):
             # an existing state
             if new_state == '#pop':
                 return -1
@@ -437,14 +463,14 @@
             for istate in new_state:
                 assert (istate in unprocessed or
                         istate in ('#pop', '#push')), \
-                       'unknown new state ' + istate
+                    'unknown new state ' + istate
             return new_state
         else:
             assert False, 'unknown new state def %r' % new_state
 
     def _process_state(cls, unprocessed, processed, state):
         """Preprocess a single state definition."""
-        assert isinstance(state, basestring), "wrong state name %r" % state
+        assert type(state) is str, "wrong state name %r" % state
         assert state[0] != '#', "invalid state name %r" % state
         if state in processed:
             return processed[state]
@@ -458,13 +484,19 @@
                                                  str(tdef)))
                 continue
             if isinstance(tdef, _inherit):
-                # processed already
+                # should be processed already, but may not in the case of:
+                # 1. the state has no counterpart in any parent
+                # 2. the state includes more than one 'inherit'
+                continue
+            if isinstance(tdef, default):
+                new_state = cls._process_new_state(tdef.state, unprocessed, processed)
+                tokens.append((re.compile('').match, None, new_state))
                 continue
 
             assert type(tdef) is tuple, "wrong rule def %r" % tdef
 
             try:
-                rex = cls._process_regex(tdef[0], rflags)
+                rex = cls._process_regex(tdef[0], rflags, state)
             except Exception as err:
                 raise ValueError("uncompilable regex %r in state %r of %r: %s" %
                                  (tdef[0], state, cls, err))
@@ -484,7 +516,7 @@
         """Preprocess a dictionary of token definitions."""
         processed = cls._all_tokens[name] = {}
         tokendefs = tokendefs or cls.tokens[name]
-        for state in list(tokendefs.keys()):
+        for state in list(tokendefs):
             cls._process_state(tokendefs, processed, state)
         return processed
 
@@ -502,12 +534,16 @@
         """
         tokens = {}
         inheritable = {}
-        for c in itertools.chain((cls,), cls.__mro__):
+        for c in cls.__mro__:
             toks = c.__dict__.get('tokens', {})
 
-            for state, items in toks.items():
+            for state, items in iteritems(toks):
                 curitems = tokens.get(state)
                 if curitems is None:
+                    # N.b. because this is assigned by reference, sufficiently
+                    # deep hierarchies are processed incrementally (e.g. for
+                    # A(B), B(C), C(RegexLexer), B will be premodified so X(B)
+                    # will not see any inherits in B).
                     tokens[state] = items
                     try:
                         inherit_ndx = items.index(inherit)
@@ -523,6 +559,8 @@
                 # Replace the "inherit" value with the items
                 curitems[inherit_ndx:inherit_ndx+1] = items
                 try:
+                    # N.b. this is the index in items (that is, the superclass
+                    # copy), so offset required when storing below.
                     new_inh_ndx = items.index(inherit)
                 except ValueError:
                     pass
@@ -545,7 +583,8 @@
         return type.__call__(cls, *args, **kwds)
 
 
-class RegexLexer(with_metaclass(RegexLexerMeta, Lexer)):
+@add_metaclass(RegexLexerMeta)
+class RegexLexer(Lexer):
     """
     Base for simple stateful regular expression-based lexers.
     Simplifies the lexing process so that you need only
@@ -589,11 +628,12 @@
             for rexmatch, action, new_state in statetokens:
                 m = rexmatch(text, pos)
                 if m:
-                    if type(action) is _TokenType:
-                        yield pos, action, m.group()
-                    else:
-                        for item in action(self, m):
-                            yield item
+                    if action is not None:
+                        if type(action) is _TokenType:
+                            yield pos, action, m.group()
+                        else:
+                            for item in action(self, m):
+                                yield item
                     pos = m.end()
                     if new_state is not None:
                         # state transition
@@ -620,7 +660,7 @@
                         # at EOL, reset state to "root"
                         statestack = ['root']
                         statetokens = tokendefs['root']
-                        yield pos, Text, '\n'
+                        yield pos, Text, u'\n'
                         pos += 1
                         continue
                     yield pos, Error, text[pos]
@@ -637,7 +677,7 @@
     def __init__(self, text, pos, stack=None, end=None):
         self.text = text
         self.pos = pos
-        self.end = end or len(text) # end=0 not supported ;-)
+        self.end = end or len(text)  # end=0 not supported ;-)
         self.stack = stack or ['root']
 
     def __repr__(self):
@@ -667,15 +707,16 @@
             for rexmatch, action, new_state in statetokens:
                 m = rexmatch(text, ctx.pos, ctx.end)
                 if m:
-                    if type(action) is _TokenType:
-                        yield ctx.pos, action, m.group()
-                        ctx.pos = m.end()
-                    else:
-                        for item in action(self, m, ctx):
-                            yield item
-                        if not new_state:
-                            # altered the state stack?
-                            statetokens = tokendefs[ctx.stack[-1]]
+                    if action is not None:
+                        if type(action) is _TokenType:
+                            yield ctx.pos, action, m.group()
+                            ctx.pos = m.end()
+                        else:
+                            for item in action(self, m, ctx):
+                                yield item
+                            if not new_state:
+                                # altered the state stack?
+                                statetokens = tokendefs[ctx.stack[-1]]
                     # CAUTION: callback must set ctx.pos!
                     if new_state is not None:
                         # state transition
@@ -684,7 +725,7 @@
                                 if state == '#pop':
                                     ctx.stack.pop()
                                 elif state == '#push':
-                                    ctx.stack.append(statestack[-1])
+                                    ctx.stack.append(ctx.stack[-1])
                                 else:
                                     ctx.stack.append(state)
                         elif isinstance(new_state, int):
@@ -704,7 +745,7 @@
                         # at EOL, reset state to "root"
                         ctx.stack = ['root']
                         statetokens = tokendefs['root']
-                        yield ctx.pos, Text, u'\n'
+                        yield ctx.pos, Text, u'\n'
                         ctx.pos += 1
                         continue
                     yield ctx.pos, Error, text[ctx.pos]
@@ -774,3 +815,56 @@
         except StopIteration:
             insleft = False
             break  # not strictly necessary
+
+
+class ProfilingRegexLexerMeta(RegexLexerMeta):
+    """Metaclass for ProfilingRegexLexer, collects regex timing info."""
+
+    def _process_regex(cls, regex, rflags, state):
+        if isinstance(regex, words):
+            rex = regex_opt(regex.words, prefix=regex.prefix,
+                            suffix=regex.suffix)
+        else:
+            rex = regex
+        compiled = re.compile(rex, rflags)
+
+        def match_func(text, pos, endpos=sys.maxsize):
+            info = cls._prof_data[-1].setdefault((state, rex), [0, 0.0])
+            t0 = time.time()
+            res = compiled.match(text, pos, endpos)
+            t1 = time.time()
+            info[0] += 1
+            info[1] += t1 - t0
+            return res
+        return match_func
+
+
+@add_metaclass(ProfilingRegexLexerMeta)
+class ProfilingRegexLexer(RegexLexer):
+    """Drop-in replacement for RegexLexer that does profiling of its regexes."""
+
+    _prof_data = []
+    _prof_sort_index = 4  # defaults to time per call
+
+    def get_tokens_unprocessed(self, text, stack=('root',)):
+        # this needs to be a stack, since using(this) will produce nested calls
+        self.__class__._prof_data.append({})
+        for tok in RegexLexer.get_tokens_unprocessed(self, text, stack):
+            yield tok
+        rawdata = self.__class__._prof_data.pop()
+        data = sorted(((s, repr(r).strip('u\'').replace('\\\\', '\\')[:65],
+                        n, 1000 * t, 1000 * t / n)
+                       for ((s, r), (n, t)) in rawdata.items()),
+                      key=lambda x: x[self._prof_sort_index],
+                      reverse=True)
+        sum_total = sum(x[3] for x in data)
+
+        print()
+        print('Profiling result for %s lexing %d chars in %.3f ms' %
+              (self.__class__.__name__, len(text), sum_total))
+        print('=' * 110)
+        print('%-20s %-64s ncalls  tottime  percall' % ('state', 'regex'))
+        print('-' * 110)
+        for d in data:
+            print('%-20s %-65s %5d %8.4f %8.4f' % d)
+        print('=' * 110)
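
Usage note: one of the hunks above changes the default of the ``encoding`` option
from ``'latin1'`` to ``'guess'`` and enables the previously commented-out
``inencoding`` override. A rough sketch of what that means for callers, assuming
the Pygments 2.0 behaviour introduced by this patch:

    from pygments.lexers import PythonLexer

    # 'guess' is now the default: byte input is decoded with a simple
    # UTF-8 / locale / latin-1 heuristic (guess_decode) before lexing.
    tokens = list(PythonLexer().get_tokens(b'print("hi")\n'))

    # Explicit settings still work; 'inencoding' overrides 'encoding'.
    lexer = PythonLexer(encoding='chardet')    # requires the chardet package
    lexer = PythonLexer(inencoding='latin-1')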
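Usage note: the patch also adds two token-definition helpers, ``words`` and
``default``, both exported via ``__all__``. A minimal sketch of how a lexer
written against this version of the module could use them; ``ExampleLexer``
and its rule set are hypothetical and only illustrate the API:

    from pygments.lexer import RegexLexer, words, default
    from pygments.token import Comment, Keyword, Name, Text

    class ExampleLexer(RegexLexer):
        """Hypothetical lexer exercising the new helpers."""
        name = 'Example'
        tokens = {
            'root': [
                (r'\s+', Text),
                # words() is resolved through Future.get() in _process_regex
                # and expands to a single optimized alternation regex.
                (words(('if', 'else', 'while', 'return'), suffix=r'\b'),
                 Keyword),
                (r'#', Comment, 'comment'),
                (r'\w+', Name),
            ],
            'comment': [
                (r'[^\n]+', Comment),
                # default('#pop') replaces the old ('', Token, '#pop') idiom:
                # change state without consuming input or emitting a token.
                default('#pop'),
            ],
        }

    print(list(ExampleLexer().get_tokens('if x # done\n')))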
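Usage note: the new ``ProfilingRegexLexer`` wraps every compiled rule so that
call counts and cumulative match time are recorded per (state, regex) pair and
a summary table is printed once lexing finishes. One way to profile an existing
lexer is to mix the class in ahead of it; the subclass below is a hypothetical
example, not part of the patch:

    from pygments.lexer import ProfilingRegexLexer
    from pygments.lexers import PythonLexer

    class ProfilingPythonLexer(ProfilingRegexLexer, PythonLexer):
        """PythonLexer with per-regex call counts and timings."""

    source = "def f(x):\n    return x * 2\n"
    # Exhausting the generator makes get_tokens_unprocessed() print the
    # profile table (state, regex, ncalls, tottime, percall) to stdout.
    for _ in ProfilingPythonLexer().get_tokens_unprocessed(source):
        pass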