ThirdParty/Pygments/pygments/util.py

changeset 4172
4f20dba37ab6
parent 3484
645c12de6b0c
child 4697
c2e9bf425554
equal deleted inserted replaced
4170:8bc578136279 4172:4f20dba37ab6
3 pygments.util 3 pygments.util
4 ~~~~~~~~~~~~~ 4 ~~~~~~~~~~~~~
5 5
6 Utility functions. 6 Utility functions.
7 7
8 :copyright: Copyright 2006-2013 by the Pygments team, see AUTHORS. 8 :copyright: Copyright 2006-2014 by the Pygments team, see AUTHORS.
9 :license: BSD, see LICENSE for details. 9 :license: BSD, see LICENSE for details.
10 """ 10 """
11 11
12 from __future__ import unicode_literals
13 try:
14 chr = unichr
15 except NameError:
16 pass
17
18 import re 12 import re
19 import sys 13 import sys
20 import codecs
21 14
22 15
23 split_path_re = re.compile(r'[/\\ ]') 16 split_path_re = re.compile(r'[/\\ ]')
24 doctype_lookup_re = re.compile(r'''(?smx) 17 doctype_lookup_re = re.compile(r'''(?smx)
25 (<\?.*?\?>)?\s* 18 (<\?.*?\?>)?\s*
26 <!DOCTYPE\s+( 19 <!DOCTYPE\s+(
20 [a-zA-Z_][a-zA-Z0-9]*
21 (?: \s+ # optional in HTML5
27 [a-zA-Z_][a-zA-Z0-9]*\s+ 22 [a-zA-Z_][a-zA-Z0-9]*\s+
28 [a-zA-Z_][a-zA-Z0-9]*\s+ 23 "[^"]*")?
29 "[^"]*") 24 )
30 [^>]*> 25 [^>]*>
31 ''') 26 ''')
32 tag_re = re.compile(r'<(.+?)(\s.*?)?>.*?</.+?>(?uism)') 27 tag_re = re.compile(r'<(.+?)(\s.*?)?>.*?</.+?>(?uism)')
28 xml_decl_re = re.compile(r'\s*<\?xml[^>]*\?>', re.I)
33 29
34 30
35 class ClassNotFound(ValueError): 31 class ClassNotFound(ValueError):
36 """ 32 """Raised if one of the lookup functions didn't find a matching class."""
37 If one of the get_*_by_* functions didn't find a matching class.
38 """
39 33
40 34
41 class OptionError(Exception): 35 class OptionError(Exception):
42 pass 36 pass
43 37
56 string = options.get(optname, default) 50 string = options.get(optname, default)
57 if isinstance(string, bool): 51 if isinstance(string, bool):
58 return string 52 return string
59 elif isinstance(string, int): 53 elif isinstance(string, int):
60 return bool(string) 54 return bool(string)
61 elif not isinstance(string, str): 55 elif not isinstance(string, string_types):
62 raise OptionError('Invalid type %r for option %s; use ' 56 raise OptionError('Invalid type %r for option %s; use '
63 '1/0, yes/no, true/false, on/off' % ( 57 '1/0, yes/no, true/false, on/off' % (
64 string, optname)) 58 string, optname))
65 elif string.lower() in ('1', 'yes', 'true', 'on'): 59 elif string.lower() in ('1', 'yes', 'true', 'on'):
66 return True 60 return True
67 elif string.lower() in ('0', 'no', 'false', 'off'): 61 elif string.lower() in ('0', 'no', 'false', 'off'):
68 return False 62 return False
69 else: 63 else:
70 raise OptionError('Invalid value %r for option %s; use ' 64 raise OptionError('Invalid value %r for option %s; use '
71 '1/0, yes/no, true/false, on/off' % ( 65 '1/0, yes/no, true/false, on/off' % (
72 string, optname)) 66 string, optname))
73 67
74 68
75 def get_int_opt(options, optname, default=None): 69 def get_int_opt(options, optname, default=None):
76 string = options.get(optname, default) 70 string = options.get(optname, default)
77 try: 71 try:
78 return int(string) 72 return int(string)
79 except TypeError: 73 except TypeError:
80 raise OptionError('Invalid type %r for option %s; you ' 74 raise OptionError('Invalid type %r for option %s; you '
81 'must give an integer value' % ( 75 'must give an integer value' % (
82 string, optname)) 76 string, optname))
83 except ValueError: 77 except ValueError:
84 raise OptionError('Invalid value %r for option %s; you ' 78 raise OptionError('Invalid value %r for option %s; you '
85 'must give an integer value' % ( 79 'must give an integer value' % (
86 string, optname)) 80 string, optname))
87 81
88 82
89 def get_list_opt(options, optname, default=None): 83 def get_list_opt(options, optname, default=None):
90 val = options.get(optname, default) 84 val = options.get(optname, default)
91 if isinstance(val, str): 85 if isinstance(val, string_types):
92 return val.split() 86 return val.split()
93 elif isinstance(val, (list, tuple)): 87 elif isinstance(val, (list, tuple)):
94 return list(val) 88 return list(val)
95 else: 89 else:
96 raise OptionError('Invalid type %r for option %s; you ' 90 raise OptionError('Invalid type %r for option %s; you '
97 'must give a list value' % ( 91 'must give a list value' % (
98 val, optname)) 92 val, optname))
99 93
100 94
101 def docstring_headline(obj): 95 def docstring_headline(obj):
102 if not obj.__doc__: 96 if not obj.__doc__:
103 return '' 97 return ''
109 break 103 break
110 return ''.join(res).lstrip() 104 return ''.join(res).lstrip()
111 105
112 106
113 def make_analysator(f): 107 def make_analysator(f):
114 """ 108 """Return a static text analyser function that returns float values."""
115 Return a static text analysation function that
116 returns float values.
117 """
118 def text_analyse(text): 109 def text_analyse(text):
119 try: 110 try:
120 rv = f(text) 111 rv = f(text)
121 except Exception: 112 except Exception:
122 return 0.0 113 return 0.0
129 text_analyse.__doc__ = f.__doc__ 120 text_analyse.__doc__ = f.__doc__
130 return staticmethod(text_analyse) 121 return staticmethod(text_analyse)
131 122
132 123
133 def shebang_matches(text, regex): 124 def shebang_matches(text, regex):
134 """ 125 """Check if the given regular expression matches the last part of the
135 Check if the given regular expression matches the last part of the
136 shebang if one exists. 126 shebang if one exists.
137 127
138 >>> from pygments.util import shebang_matches 128 >>> from pygments.util import shebang_matches
139 >>> shebang_matches('#!/usr/bin/env python', r'python(2\.\d)?') 129 >>> shebang_matches('#!/usr/bin/env python', r'python(2\.\d)?')
140 True 130 True
175 return True 165 return True
176 return False 166 return False
177 167
178 168
179 def doctype_matches(text, regex): 169 def doctype_matches(text, regex):
180 """ 170 """Check if the doctype matches a regular expression (if present).
181 Check if the doctype matches a regular expression (if present). 171
182 Note that this method only checks the first part of a DOCTYPE. 172 Note that this method only checks the first part of a DOCTYPE.
183 eg: 'html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"' 173 eg: 'html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"'
184 """ 174 """
185 m = doctype_lookup_re.match(text) 175 m = doctype_lookup_re.match(text)
186 if m is None: 176 if m is None:
187 return False 177 return False
188 doctype = m.group(2) 178 doctype = m.group(2)
189 return re.compile(regex).match(doctype.strip()) is not None 179 return re.compile(regex, re.I).match(doctype.strip()) is not None
190 180
191 181
192 def html_doctype_matches(text): 182 def html_doctype_matches(text):
193 """ 183 """Check if the file looks like it has a html doctype."""
194 Check if the file looks like it has a html doctype. 184 return doctype_matches(text, r'html')
195 """
196 return doctype_matches(text, r'html\s+PUBLIC\s+"-//W3C//DTD X?HTML.*')
197 185
198 186
199 _looks_like_xml_cache = {} 187 _looks_like_xml_cache = {}
188
189
200 def looks_like_xml(text): 190 def looks_like_xml(text):
201 """ 191 """Check if a doctype exists or if we have some tags."""
202 Check if a doctype exists or if we have some tags. 192 if xml_decl_re.match(text):
203 """ 193 return True
204 key = hash(text) 194 key = hash(text)
205 try: 195 try:
206 return _looks_like_xml_cache[key] 196 return _looks_like_xml_cache[key]
207 except KeyError: 197 except KeyError:
208 m = doctype_lookup_re.match(text) 198 m = doctype_lookup_re.match(text)
210 return True 200 return True
211 rv = tag_re.search(text[:1000]) is not None 201 rv = tag_re.search(text[:1000]) is not None
212 _looks_like_xml_cache[key] = rv 202 _looks_like_xml_cache[key] = rv
213 return rv 203 return rv
214 204
205
215 # Python narrow build compatibility 206 # Python narrow build compatibility
216 207
217 def _surrogatepair(c): 208 def _surrogatepair(c):
209 # Given a unicode character code
210 # with length greater than 16 bits,
211 # return the two 16 bit surrogate pair.
212 # From example D28 of:
213 # http://www.unicode.org/book/ch03.pdf
218 return (0xd7c0 + (c >> 10), (0xdc00 + (c & 0x3ff))) 214 return (0xd7c0 + (c >> 10), (0xdc00 + (c & 0x3ff)))
219 215
216
220 def unirange(a, b): 217 def unirange(a, b):
221 """ 218 """Returns a regular expression string to match the given non-BMP range."""
222 Returns a regular expression string to match the given non-BMP range.
223 """
224 if b < a: 219 if b < a:
225 raise ValueError("Bad character range") 220 raise ValueError("Bad character range")
226 if a < 0x10000 or b < 0x10000: 221 if a < 0x10000 or b < 0x10000:
227 raise ValueError("unirange is only defined for non-BMP ranges") 222 raise ValueError("unirange is only defined for non-BMP ranges")
228 223
229 if sys.maxunicode > 0xffff: 224 if sys.maxunicode > 0xffff:
230 # wide build 225 # wide build
231 return '[%s-%s]' % (chr(a), chr(b)) 226 return u'[%s-%s]' % (unichr(a), unichr(b))
232 else: 227 else:
233 # narrow build stores surrogates, and the 're' module handles them 228 # narrow build stores surrogates, and the 're' module handles them
234 # (incorrectly) as characters. Since there is still ordering among 229 # (incorrectly) as characters. Since there is still ordering among
235 # these characters, expand the range to one that it understands. Some 230 # these characters, expand the range to one that it understands. Some
236 # background in http://bugs.python.org/issue3665 and 231 # background in http://bugs.python.org/issue3665 and
240 # literals because jython [which uses the wide path] can't load this 235 # literals because jython [which uses the wide path] can't load this
241 # file if they are literals. 236 # file if they are literals.
242 ah, al = _surrogatepair(a) 237 ah, al = _surrogatepair(a)
243 bh, bl = _surrogatepair(b) 238 bh, bl = _surrogatepair(b)
244 if ah == bh: 239 if ah == bh:
245 return '(?:%s[%s-%s])' % (chr(ah), chr(al), chr(bl)) 240 return u'(?:%s[%s-%s])' % (unichr(ah), unichr(al), unichr(bl))
246 else: 241 else:
247 buf = [] 242 buf = []
248 buf.append('%s[%s-%s]' % 243 buf.append(u'%s[%s-%s]' %
249 (chr(ah), chr(al), 244 (unichr(ah), unichr(al),
250 ah == bh and chr(bl) or chr(0xdfff))) 245 ah == bh and unichr(bl) or unichr(0xdfff)))
251 if ah - bh > 1: 246 if ah - bh > 1:
252 buf.append('[%s-%s][%s-%s]' % 247 buf.append(u'[%s-%s][%s-%s]' %
253 chr(ah+1), chr(bh-1), chr(0xdc00), chr(0xdfff)) 248 unichr(ah+1), unichr(bh-1), unichr(0xdc00), unichr(0xdfff))
254 if ah != bh: 249 if ah != bh:
255 buf.append('%s[%s-%s]' % 250 buf.append(u'%s[%s-%s]' %
256 (chr(bh), chr(0xdc00), chr(bl))) 251 (unichr(bh), unichr(0xdc00), unichr(bl)))
257 252
258 return '(?:' + '|'.join(buf) + ')' 253 return u'(?:' + u'|'.join(buf) + u')'
254
255
def format_lines(var_name, seq, raw=False, indent_level=0):
    """Render *seq* as source text assigning a tuple to *var_name*.

    The result is a multi-line string of the form::

        var_name = (
            item,
            item,
        )

    indented by ``indent_level`` steps of four spaces.  With ``raw`` set,
    every item is emitted verbatim (it is expected to already be source
    text, e.g. the repr of a tuple); otherwise every item is emitted as a
    single-quoted string literal.
    """
    outer = ' ' * indent_level * 4
    inner = ' ' * (indent_level + 1) * 4

    if raw:
        body = [inner + item + ',' for item in seq]
    else:
        body = []
        for item in seq:
            # Appending a double quote makes repr() pick single quotes as
            # the delimiter; the appended character is then cut out again.
            quoted = repr(item + '"')
            body.append(inner + quoted[:-2] + quoted[-1] + ',')

    return '\n'.join([outer + var_name + ' = ('] + body + [outer + ')'])
273
274
def duplicates_removed(it, already_seen=()):
    """Return the items of *it* as a list, keeping first occurrences only.

    Items that are contained in *already_seen* are dropped as well.  The
    relative order of the surviving items is the order in which they first
    appear in *it*.
    """
    unique = []
    emitted = set()
    for item in it:
        if item not in emitted and item not in already_seen:
            unique.append(item)
            emitted.add(item)
    return unique
289
290
class Future(object):
    """Base class for work whose result is produced on demand.

    According to the original note, RegexLexerMeta handles instances
    specially so that regex strings can be constructed at first use.
    """

    def get(self):
        """Return the deferred result; concrete subclasses must override."""
        raise NotImplementedError
299
300
def guess_decode(text):
    """Decode the byte string *text* with a guessed encoding.

    The candidates are tried in this order:

    1. UTF-8, which fails for most data in other encodings.
    2. The preferred locale encoding.
    3. latin-1, which accepts any byte sequence.

    Returns a ``(decoded_text, encoding_name)`` tuple naming the encoding
    that was actually used.
    """
    try:
        return text.decode('utf-8'), 'utf-8'
    except UnicodeDecodeError:
        pass

    try:
        import locale
        prefencoding = locale.getpreferredencoding()
        # Decode explicitly with the locale encoding: a bare ``decode()``
        # uses the interpreter default instead, so the locale encoding
        # would never be tried although it is reported as the result.
        return text.decode(prefencoding), prefencoding
    except (UnicodeDecodeError, LookupError):
        # LookupError: the locale reported a codec Python does not know.
        return text.decode('latin1'), 'latin1'
320
321
def guess_decode_from_terminal(text, term):
    """Decode the byte string *text* that was read from terminal *term*.

    If *term* has a non-empty ``encoding`` attribute, that encoding is
    tried first.  When it is missing, cannot decode the data, or names a
    codec unknown to Python, the decision is delegated to
    ``guess_decode()``.

    Returns a ``(decoded_text, encoding_name)`` tuple.
    """
    encoding = getattr(term, 'encoding', None)
    if encoding:
        try:
            return text.decode(encoding), encoding
        except (UnicodeDecodeError, LookupError):
            # An unusable terminal encoding is not fatal; fall back to
            # guessing, as guess_decode() does for the locale encoding.
            pass
    return guess_decode(text)
337
338
def terminal_encoding(term):
    """Return the best available guess for the encoding used by *term*.

    A non-empty ``encoding`` attribute on *term* wins; otherwise the
    preferred locale encoding is returned.
    """
    encoding = getattr(term, 'encoding', None)
    if not encoding:
        import locale
        encoding = locale.getpreferredencoding()
    return encoding
345
259 346
260 # Python 2/3 compatibility 347 # Python 2/3 compatibility
261 348
262 if sys.version_info < (3,0): 349 if sys.version_info < (3, 0):
263 b = bytes = str 350 unichr = unichr
351 xrange = xrange
352 string_types = (str, unicode)
353 text_type = unicode
264 u_prefix = 'u' 354 u_prefix = 'u'
265 import StringIO, cStringIO 355 iteritems = dict.iteritems
356 itervalues = dict.itervalues
357 import StringIO
358 import cStringIO
359 # unfortunately, io.StringIO in Python 2 doesn't accept str at all
360 StringIO = StringIO.StringIO
266 BytesIO = cStringIO.StringIO 361 BytesIO = cStringIO.StringIO
267 StringIO = StringIO.StringIO
268 uni_open = codecs.open
269 else: 362 else:
270 import builtins 363 unichr = chr
271 bytes = builtins.bytes 364 xrange = range
365 string_types = (str,)
366 text_type = str
272 u_prefix = '' 367 u_prefix = ''
273 def b(s): 368 iteritems = dict.items
274 if isinstance(s, str): 369 itervalues = dict.values
275 return bytes(list(map(ord, s))) 370 from io import StringIO, BytesIO, TextIOWrapper
276 elif isinstance(s, bytes): 371
277 return s 372 class UnclosingTextIOWrapper(TextIOWrapper):
278 else: 373 # Don't close underlying buffer on destruction.
279 raise TypeError("Invalid argument %r for b()" % (s,)) 374 def close(self):
280 import io 375 pass
281 BytesIO = io.BytesIO 376
282 StringIO = io.StringIO 377
283 uni_open = builtins.open 378 def add_metaclass(metaclass):
379 """Class decorator for creating a class with a metaclass."""
380 def wrapper(cls):
381 orig_vars = cls.__dict__.copy()
382 orig_vars.pop('__dict__', None)
383 orig_vars.pop('__weakref__', None)
384 for slots_var in orig_vars.get('__slots__', ()):
385 orig_vars.pop(slots_var)
386 return metaclass(cls.__name__, cls.__bases__, orig_vars)
387 return wrapper

eric ide

mercurial