ThirdParty/Pygments/pygments/lexer.py

changeset 1705:b0fbc9300f2b
parent     808:8f85926125ef
child     2426:da76c71624de
--- a/ThirdParty/Pygments/pygments/lexer.py	Mon Mar 12 19:01:48 2012 +0100
+++ b/ThirdParty/Pygments/pygments/lexer.py	Mon Mar 12 19:03:42 2012 +0100
@@ -5,7 +5,7 @@
 
     Base lexer classes.
 
-    :copyright: Copyright 2006-2010 by the Pygments team, see AUTHORS.
+    :copyright: Copyright 2006-2012 by the Pygments team, see AUTHORS.
     :license: BSD, see LICENSE for details.
 """
 import re
@@ -15,12 +15,19 @@
 from pygments.token import Error, Text, Other, _TokenType
 from pygments.util import get_bool_opt, get_int_opt, get_list_opt, \
      make_analysator
+import collections
 
 
 __all__ = ['Lexer', 'RegexLexer', 'ExtendedRegexLexer', 'DelegatingLexer',
            'LexerContext', 'include', 'bygroups', 'using', 'this']
 
 
+_encoding_map = [(b'\xef\xbb\xbf', 'utf-8'),
+                 (b'\xff\xfe\0\0', 'utf-32'),
+                 (b'\0\0\xfe\xff', 'utf-32be'),
+                 (b'\xff\xfe', 'utf-16'),
+                 (b'\xfe\xff', 'utf-16be')]
+
 _default_analyse = staticmethod(lambda x: 0.0)
 
 
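The order of this table is significant: the UTF-32LE signature
b'\xff\xfe\x00\x00' begins with the UTF-16LE signature b'\xff\xfe', so the
four-byte BOMs must be tried before the two-byte ones, otherwise a first-match
scan would misreport UTF-32LE input as UTF-16. A minimal standalone sketch of
the lookup this table supports (detect_bom is an illustrative name, not part
of the patch):

    # Longest signatures first, for the reason given above.
    _encoding_map = [(b'\xef\xbb\xbf', 'utf-8'),
                     (b'\xff\xfe\x00\x00', 'utf-32'),
                     (b'\x00\x00\xfe\xff', 'utf-32be'),
                     (b'\xff\xfe', 'utf-16'),
                     (b'\xfe\xff', 'utf-16be')]

    def detect_bom(raw):
        """Return (encoding, bom_length), or (None, 0) if no BOM matches."""
        for bom, encoding in _encoding_map:
            if raw.startswith(bom):
                return encoding, len(bom)
        return None, 0

    assert detect_bom(b'\xff\xfe\x00\x00data') == ('utf-32', 4)  # not utf-16
    assert detect_bom(b'\xff\xfedata') == ('utf-16', 2)
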
@@ -140,8 +147,19 @@
                     raise ImportError('To enable chardet encoding guessing, '
                                       'please install the chardet library '
                                       'from http://chardet.feedparser.org/')
-                enc = chardet.detect(text)
-                text = text.decode(enc['encoding'])
+                # check for BOM first
+                decoded = None
+                for bom, encoding in _encoding_map:
+                    if text.startswith(bom):
+                        decoded = str(text[len(bom):], encoding,
+                                      errors='replace')
+                        break
+                # no BOM found, so use chardet
+                if decoded is None:
+                    enc = chardet.detect(text[:1024])  # Guess using first 1KB
+                    decoded = str(text, enc.get('encoding') or 'utf-8',
+                                  errors='replace')
+                text = decoded
             else:
                 text = text.decode(self.encoding)
         # text now *is* a unicode string
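
Taken together, the new branch prefers an unambiguous BOM over statistical
guessing, strips the BOM so it cannot leak into the token stream, samples only
the first kilobyte so chardet stays cheap on large inputs, and falls back to
UTF-8 when chardet returns no verdict (chardet.detect() may report
{'encoding': None}). The same flow as a standalone helper, under the
assumption that chardet is importable; guess_decode is a hypothetical name:

    def guess_decode(raw):
        """Decode raw bytes: BOM table first, then chardet, then UTF-8."""
        for bom, encoding in _encoding_map:
            if raw.startswith(bom):
                # The BOM pins down the encoding; drop it from the output.
                return str(raw[len(bom):], encoding, errors='replace')
        import chardet
        enc = chardet.detect(raw[:1024])        # sample the first 1 KB only
        return str(raw, enc.get('encoding') or 'utf-8', errors='replace')
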
@@ -272,12 +290,14 @@
                 if data:
                     yield match.start(i + 1), action, data
             else:
-                if ctx:
-                    ctx.pos = match.start(i + 1)
-                for item in action(lexer, _PseudoMatch(match.start(i + 1),
-                                   match.group(i + 1)), ctx):
-                    if item:
-                        yield item
+                data = match.group(i + 1)
+                if data is not None:
+                    if ctx:
+                        ctx.pos = match.start(i + 1)
+                    for item in action(lexer, _PseudoMatch(match.start(i + 1),
+                                       data), ctx):
+                        if item:
+                            yield item
         if ctx:
             ctx.pos = match.end()
     return callback
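
This brings the callback branch in line with the plain-token branch above it:
match.group(i + 1) returns None whenever the group sits in an optional part of
the pattern that did not participate in the match, and wrapping None in a
_PseudoMatch would hand the nested action a bogus match object. A two-line
illustration of the underlying re behaviour:

    import re

    m = re.match(r'(\w+)(=\w+)?', 'key')   # second group is optional
    print(m.group(1))   # 'key'
    print(m.group(2))   # None -- the case the new guard now skips
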
@@ -353,7 +373,7 @@
 
     def _process_token(cls, token):
         """Preprocess the token component of a token definition."""
-        assert type(token) is _TokenType or hasattr(token, '__call__'), \
+        assert type(token) is _TokenType or isinstance(token, collections.Callable), \
                'token type must be simple type or callable, not %r' % (token,)
         return token
 
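Some background on this hunk: the built-in callable() was removed in Python
3.0 and only reinstated in 3.2, so Python 3 ports of this era tested
hasattr(x, '__call__') or the Callable ABC instead. The ABC moved to
collections.abc in Python 3.3 and the plain collections alias was dropped in
3.10, so a version-tolerant spelling of the same check would be:

    try:
        from collections.abc import Callable   # Python 3.3+
    except ImportError:
        from collections import Callable       # the location this patch uses

    assert isinstance(len, Callable)
    assert not isinstance(42, Callable)
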
@@ -437,7 +457,7 @@
 
     def __call__(cls, *args, **kwds):
         """Instantiate cls after preprocessing its token definitions."""
-        if not hasattr(cls, '_tokens'):
+        if '_tokens' not in cls.__dict__:
             cls._all_tokens = {}
             cls._tmpname = 0
             if hasattr(cls, 'token_variants') and cls.token_variants:
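
The final hunk matters for lexer subclasses: hasattr(cls, '_tokens') searches
the whole MRO, so it is already true for a subclass once a base lexer has been
instantiated, and the subclass's own token definitions would never be
compiled. Testing cls.__dict__ looks only at the class itself. A toy
demonstration, independent of the RegexLexerMeta machinery:

    class Base:
        _tokens = {'root': []}            # filled in when Base was processed

    class Sub(Base):
        pass                              # its own tokens not yet processed

    print(hasattr(Sub, '_tokens'))        # True  -- found on Base via the MRO
    print('_tokens' in Sub.__dict__)      # False -- Sub itself has no _tokens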
