--- a/ThirdParty/Pygments/pygments/lexer.py Mon Mar 12 19:01:48 2012 +0100
+++ b/ThirdParty/Pygments/pygments/lexer.py Mon Mar 12 19:03:42 2012 +0100
@@ -5,7 +5,7 @@
     Base lexer classes.
 
-    :copyright: Copyright 2006-2010 by the Pygments team, see AUTHORS.
+    :copyright: Copyright 2006-2012 by the Pygments team, see AUTHORS.
     :license: BSD, see LICENSE for details.
 """
 
 import re
@@ -15,12 +15,19 @@
 from pygments.token import Error, Text, Other, _TokenType
 from pygments.util import get_bool_opt, get_int_opt, get_list_opt, \
     make_analysator
+import collections
 
 
 __all__ = ['Lexer', 'RegexLexer', 'ExtendedRegexLexer', 'DelegatingLexer',
            'LexerContext', 'include', 'bygroups', 'using', 'this']
 
 
+_encoding_map = [('\xef\xbb\xbf', 'utf-8'),
+                 ('\xff\xfe\0\0', 'utf-32'),
+                 ('\0\0\xfe\xff', 'utf-32be'),
+                 ('\xff\xfe', 'utf-16'),
+                 ('\xfe\xff', 'utf-16be')]
+
 _default_analyse = staticmethod(lambda x: 0.0)
 
 
@@ -140,8 +147,19 @@
                     raise ImportError('To enable chardet encoding guessing, '
                                       'please install the chardet library '
                                       'from http://chardet.feedparser.org/')
-                enc = chardet.detect(text)
-                text = text.decode(enc['encoding'])
+                # check for BOM first
+                decoded = None
+                for bom, encoding in _encoding_map:
+                    if text.startswith(bom):
+                        decoded = str(text[len(bom):], encoding,
+                                      errors='replace')
+                        break
+                # no BOM found, so use chardet
+                if decoded is None:
+                    enc = chardet.detect(text[:1024])  # Guess using first 1KB
+                    decoded = str(text, enc.get('encoding') or 'utf-8',
+                                  errors='replace')
+                text = decoded
             else:
                 text = text.decode(self.encoding)
         # text now *is* a unicode string
@@ -272,12 +290,14 @@
                 if data:
                     yield match.start(i + 1), action, data
             else:
-                if ctx:
-                    ctx.pos = match.start(i + 1)
-                for item in action(lexer, _PseudoMatch(match.start(i + 1),
-                                   match.group(i + 1)), ctx):
-                    if item:
-                        yield item
+                data = match.group(i + 1)
+                if data is not None:
+                    if ctx:
+                        ctx.pos = match.start(i + 1)
+                    for item in action(lexer, _PseudoMatch(match.start(i + 1),
+                                       data), ctx):
+                        if item:
+                            yield item
         if ctx:
             ctx.pos = match.end()
     return callback
@@ -353,7 +373,7 @@
 
     def _process_token(cls, token):
         """Preprocess the token component of a token definition."""
-        assert type(token) is _TokenType or hasattr(token, '__call__'), \
+        assert type(token) is _TokenType or isinstance(token, collections.Callable), \
             'token type must be simple type or callable, not %r' % (token,)
         return token
 
@@ -437,7 +457,7 @@
 
     def __call__(cls, *args, **kwds):
         """Instantiate cls after preprocessing its token definitions."""
-        if not hasattr(cls, '_tokens'):
+        if '_tokens' not in cls.__dict__:
            cls._all_tokens = {}
            cls._tmpname = 0
            if hasattr(cls, 'token_variants') and cls.token_variants:
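For context, the encoding hunks replace a bare chardet.detect() call with a BOM check followed by a chardet fallback that only samples the first kilobyte. Below is a minimal standalone sketch of that strategy; the function name guess_decode is hypothetical, and the BOMs are written as bytes literals on the assumption that the input is a bytes object (the patched code constructs str from bytes, so the comparison keys must be bytes as well):

import chardet  # optional dependency, as in the patch

_encoding_map = [(b'\xef\xbb\xbf', 'utf-8'),
                 (b'\xff\xfe\0\0', 'utf-32'),
                 (b'\0\0\xfe\xff', 'utf-32be'),
                 (b'\xff\xfe', 'utf-16'),
                 (b'\xfe\xff', 'utf-16be')]

def guess_decode(data):
    """Decode bytes to str: trust an explicit BOM, else ask chardet."""
    # The order of _encoding_map matters: the longer UTF-32 signatures
    # must be tested before their UTF-16 prefixes.
    for bom, encoding in _encoding_map:
        if data.startswith(bom):
            return str(data[len(bom):], encoding, errors='replace')
    # No BOM found: guess from the first 1KB only, to bound the cost on
    # large inputs; fall back to UTF-8 if chardet returns no encoding.
    enc = chardet.detect(data[:1024])
    return str(data, enc.get('encoding') or 'utf-8', errors='replace')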
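The bygroups() hunk adds a "data is not None" guard before invoking a callback action. The reason is visible with a plain re example: a group in a non-matching branch of an alternation does not participate in the match at all, so match.group() returns None and match.start() returns -1, which would otherwise feed a bogus position and text into _PseudoMatch:

import re

m = re.match('(a)|(b)', 'b')
print(m.group(1), m.start(1))  # -> None -1  (group 1 did not participate)
print(m.group(2), m.start(2))  # -> b 0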
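The last hunk changes the metaclass guard from hasattr(cls, '_tokens') to '_tokens' not in cls.__dict__. The difference matters for subclassing: hasattr() also finds attributes inherited from a base class, so once a base lexer had been instantiated, a subclass would appear to have _tokens already and its own token definitions would never be processed. A toy illustration (not Pygments code) of the distinction:

class Meta(type):
    def __call__(cls, *args, **kwds):
        # The __dict__ test is per-class; hasattr(cls, '_tokens') would
        # already be true for Child as soon as Base() has run once.
        if '_tokens' not in cls.__dict__:
            cls._tokens = 'processed for %s' % cls.__name__
        return type.__call__(cls, *args, **kwds)

class Base(metaclass=Meta):
    pass

class Child(Base):
    pass

Base()
Child()
print(Base._tokens)   # -> processed for Base
print(Child._tokens)  # -> processed for Child (skipped under hasattr())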