|
1 # -*- coding: utf-8 -*- |
|
2 """ |
|
3 pygments.util |
|
4 ~~~~~~~~~~~~~ |
|
5 |
|
6 Utility functions. |
|
7 |
|
8 :copyright: Copyright 2006-2017 by the Pygments team, see AUTHORS. |
|
9 :license: BSD, see LICENSE for details. |
|
10 """ |
|
11 |
|
12 import re |
|
13 import sys |
|
14 |
|
15 |
|
# Separators used to break a shebang line into path components and
# arguments: forward slash, backslash and space.
split_path_re = re.compile(r'[/\\ ]')
# Matches an optional leading ``<? ... ?>`` construct followed by a DOCTYPE
# declaration.  Group 1 is the optional ``<? ... ?>`` part; group 2 is the
# doctype name together with an optional "keyword + quoted string" pair.
doctype_lookup_re = re.compile(r'''
    (<\?.*?\?>)?\s*
    <!DOCTYPE\s+(
     [a-zA-Z_][a-zA-Z0-9]*
     (?: \s+ # optional in HTML5
     [a-zA-Z_][a-zA-Z0-9]*\s+
     "[^"]*")?
     )
     [^>]*>
''', re.DOTALL | re.MULTILINE | re.VERBOSE)
# Matches an opening tag (with optional attributes), arbitrary content and
# some closing tag; the closing tag name is not required to equal the
# opening one.
tag_re = re.compile(r'<(.+?)(\s.*?)?>.*?</.+?>',
                    re.UNICODE | re.IGNORECASE | re.DOTALL | re.MULTILINE)
# Matches an ``<?xml ... ?>`` declaration, allowing leading whitespace.
xml_decl_re = re.compile(r'\s*<\?xml[^>]*\?>', re.I)
|
30 |
|
31 |
|
class ClassNotFound(ValueError):
    """Signals that a lookup function could not find a matching class."""
|
34 |
|
35 |
|
class OptionError(Exception):
    """Signals an option whose value has an unusable type or content."""
|
38 |
|
39 |
|
def get_choice_opt(options, optname, allowed, default=None, normcase=False):
    """Return the value of option *optname*, restricted to *allowed*.

    :param options: mapping of option names to values.
    :param optname: name of the option to look up.
    :param allowed: container of accepted values.
    :param default: value used when *optname* is missing from *options*.
    :param normcase: if true, lower-case the value before checking it.
    :returns: the (possibly lower-cased) option value.
    :raises OptionError: if the value is not contained in *allowed*.
    """
    value = options.get(optname, default)
    # Only values that can actually be lower-cased are normalised.  Anything
    # else (for instance a ``None`` default) goes straight to the membership
    # check, so an unacceptable value is reported as an OptionError instead
    # of leaking an AttributeError from ``.lower()``.
    if normcase and hasattr(value, 'lower'):
        value = value.lower()
    if value not in allowed:
        raise OptionError('Value for option %s must be one of %s' %
                          (optname, ', '.join(map(str, allowed))))
    return value
|
48 |
|
49 |
|
def get_bool_opt(options, optname, default=None):
    """Interpret option *optname* as a boolean.

    Booleans and integers are converted with ``bool()``.  Strings are
    compared case-insensitively against ``1/yes/true/on`` (true) and
    ``0/no/false/off`` (false).

    :raises OptionError: for any other type or any other string.
    """
    value = options.get(optname, default)
    # bool is a subclass of int, so a single conversion covers both.
    if isinstance(value, (bool, int)):
        return bool(value)
    if not isinstance(value, string_types):
        raise OptionError('Invalid type %r for option %s; use '
                          '1/0, yes/no, true/false, on/off' % (
                              value, optname))
    lowered = value.lower()
    if lowered in ('1', 'yes', 'true', 'on'):
        return True
    if lowered in ('0', 'no', 'false', 'off'):
        return False
    raise OptionError('Invalid value %r for option %s; use '
                      '1/0, yes/no, true/false, on/off' % (
                          value, optname))
|
68 |
|
69 |
|
def get_int_opt(options, optname, default=None):
    """Interpret option *optname* as an integer.

    The value (or *default* when the option is missing) is passed to
    ``int()``.

    :raises OptionError: if ``int()`` rejects the value's type or content.
    """
    raw = options.get(optname, default)
    try:
        number = int(raw)
    except TypeError:
        raise OptionError('Invalid type %r for option %s; you '
                          'must give an integer value' % (
                              raw, optname))
    except ValueError:
        raise OptionError('Invalid value %r for option %s; you '
                          'must give an integer value' % (
                              raw, optname))
    else:
        return number
|
82 |
|
83 |
|
def get_list_opt(options, optname, default=None):
    """Interpret option *optname* as a list.

    A string is split on whitespace; a list or tuple is copied into a new
    list.

    :raises OptionError: for a value of any other type.
    """
    value = options.get(optname, default)
    if isinstance(value, string_types):
        return value.split()
    if isinstance(value, (list, tuple)):
        return list(value)
    raise OptionError('Invalid type %r for option %s; you '
                      'must give a list value' % (
                          value, optname))
|
94 |
|
95 |
|
def docstring_headline(obj):
    """Return the first paragraph of *obj*'s docstring as a single line.

    Lines up to the first blank line are stripped and joined with single
    spaces.  An object without a docstring yields the empty string.
    """
    doc = obj.__doc__
    if not doc:
        return ''
    pieces = []
    for raw_line in doc.strip().splitlines():
        stripped = raw_line.strip()
        if not stripped:
            # A blank line ends the headline paragraph.
            break
        pieces.append(stripped)
    return ' '.join(pieces)
|
106 |
|
107 |
|
def make_analysator(f):
    """Wrap *f* as a static text-analysis function returning a float.

    The wrapper returns ``0.0`` when *f* raises, returns a false value, or
    returns something ``float()`` cannot convert; otherwise the result is
    clamped to the range 0.0 - 1.0.
    """
    def text_analyse(text):
        try:
            score = f(text)
        except Exception:
            return 0.0
        if score:
            try:
                return min(1.0, max(0.0, float(score)))
            except (ValueError, TypeError):
                pass
        return 0.0
    text_analyse.__doc__ = f.__doc__
    return staticmethod(text_analyse)
|
123 |
|
124 |
|
def shebang_matches(text, regex):
    r"""Tell whether *regex* matches the interpreter named in the shebang.

    Only the first line of *text* is inspected, and only if it starts with
    ``#!``.  The line is lower-cased and split on slashes, backslashes and
    spaces; the last piece that is neither empty nor starts with ``-`` is
    matched against *regex*.

    >>> from pygments.util import shebang_matches
    >>> shebang_matches('#!/usr/bin/env python', r'python(2\.\d)?')
    True
    >>> shebang_matches('#!/usr/bin/python2.4', r'python(2\.\d)?')
    True
    >>> shebang_matches('#!/usr/bin/python-ruby', r'python(2\.\d)?')
    False
    >>> shebang_matches('#!/usr/bin/python/ruby', r'python(2\.\d)?')
    False
    >>> shebang_matches('#!/usr/bin/startsomethingwith python',
    ...                 r'python(2\.\d)?')
    True

    Common Windows executable extensions are accepted after the name::

        >>> shebang_matches('#!C:\\Python2.4\\Python.exe', r'python(2\.\d)?')
        True

    Pieces starting with a dash (``'-f'`` or ``'--foo'``) are skipped, so
    ``'perl'`` is treated the same as ``'perl -e'``.

    The expression is anchored with ``^`` and ``$``, so it has to match the
    whole interpreter name.
    """
    first_line = text.partition('\n')[0].lower()
    if not first_line.startswith('#!'):
        return False
    candidates = [piece
                  for piece in split_path_re.split(first_line[2:].strip())
                  if piece and not piece.startswith('-')]
    if not candidates:
        return False
    pattern = re.compile(r'^%s(\.(exe|cmd|bat|bin))?$' % regex, re.IGNORECASE)
    return pattern.search(candidates[-1]) is not None
|
168 |
|
169 |
|
def doctype_matches(text, regex):
    """Tell whether the DOCTYPE at the start of *text* matches *regex*.

    Returns ``False`` when no DOCTYPE declaration is found.  Only the first
    part of the declaration is compared (case-insensitively), for example
    'html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"'.
    """
    found = doctype_lookup_re.match(text)
    if found is None:
        return False
    declared = found.group(2).strip()
    return re.compile(regex, re.I).match(declared) is not None
|
181 |
|
182 |
|
def html_doctype_matches(text):
    """Tell whether *text* carries a DOCTYPE that starts with ``html``."""
    pattern = r'html'
    return doctype_matches(text, pattern)
|
186 |
|
187 |
|
# Results of the tag search in looks_like_xml(), keyed by hash(text).
_looks_like_xml_cache = {}


def looks_like_xml(text):
    """Tell whether *text* has an XML declaration, a DOCTYPE or some tags.

    Only the outcome of the tag search (run on the first 1000 characters)
    is stored in the module-level cache.
    """
    if xml_decl_re.match(text):
        return True
    key = hash(text)
    if key in _looks_like_xml_cache:
        return _looks_like_xml_cache[key]
    if doctype_lookup_re.match(text) is not None:
        return True
    has_tags = tag_re.search(text[:1000]) is not None
    _looks_like_xml_cache[key] = has_tags
    return has_tags
|
205 |
|
206 |
|
207 # Python narrow build compatibility |
|
208 |
|
def _surrogatepair(c):
    """Return the ``(high, low)`` 16 bit surrogate pair for code point *c*.

    *c* is a unicode character code with length greater than 16 bits.
    From example D28 of: http://www.unicode.org/book/ch03.pdf
    """
    return (0xd7c0 + (c >> 10), (0xdc00 + (c & 0x3ff)))


def unirange(a, b):
    """Returns a regular expression string to match the given non-BMP range.

    :param a: first code point of the range (inclusive), at least 0x10000.
    :param b: last code point of the range (inclusive), at least *a*.
    :returns: a unicode string holding a regular expression fragment.
    :raises ValueError: if ``b < a`` or either bound lies inside the BMP.
    """
    if b < a:
        raise ValueError("Bad character range")
    if a < 0x10000 or b < 0x10000:
        raise ValueError("unirange is only defined for non-BMP ranges")

    if sys.maxunicode > 0xffff:
        # wide build
        return u'[%s-%s]' % (unichr(a), unichr(b))

    # narrow build stores surrogates, and the 're' module handles them
    # (incorrectly) as characters.  Since there is still ordering among
    # these characters, expand the range to one that it understands.  Some
    # background in http://bugs.python.org/issue3665 and
    # http://bugs.python.org/issue12749
    #
    # Additionally, the lower constants are using unichr rather than
    # literals because jython [which uses the wide path] can't load this
    # file if they are literals.
    ah, al = _surrogatepair(a)
    bh, bl = _surrogatepair(b)
    if ah == bh:
        # Both ends share the high surrogate: a single low-surrogate class.
        return u'(?:%s[%s-%s])' % (unichr(ah), unichr(al), unichr(bl))

    # First high surrogate: from the starting low surrogate to the last one.
    buf = [u'%s[%s-%s]' % (unichr(ah), unichr(al), unichr(0xdfff))]
    # High surrogates strictly between the two ends take every low
    # surrogate.  (b >= a implies bh >= ah, so the gap is bh - ah.)
    if bh - ah > 1:
        buf.append(u'[%s-%s][%s-%s]' %
                   (unichr(ah + 1), unichr(bh - 1),
                    unichr(0xdc00), unichr(0xdfff)))
    # Last high surrogate: from the first low surrogate to the ending one.
    buf.append(u'%s[%s-%s]' % (unichr(bh), unichr(0xdc00), unichr(bl)))

    return u'(?:' + u'|'.join(buf) + u')'
|
255 |
|
256 |
|
def format_lines(var_name, seq, raw=False, indent_level=0):
    """Render *seq* as source text assigning a tuple to *var_name*.

    Every item is placed on its own line, one level deeper than the
    assignment, followed by a comma.  With *raw* set the items are emitted
    unchanged (they should be preformatted reprs of, say, tuples);
    otherwise each item is written as a single-quoted string literal.
    Indentation is four spaces per *indent_level*.
    """
    outer = ' ' * indent_level * 4
    inner = ' ' * (indent_level + 1) * 4
    header = outer + var_name + ' = ('
    if raw:
        body = [inner + item + ',' for item in seq]
    else:
        body = []
        for item in seq:
            # Appending a double quote makes repr() pick single quotes; the
            # extra character is cut out again right before the closing one.
            quoted = repr(item + '"')
            body.append(inner + quoted[:-2] + quoted[-1] + ',')
    return '\n'.join([header] + body + [outer + ')'])
|
274 |
|
275 |
|
def duplicates_removed(it, already_seen=()):
    """
    Return the items of the iterable `it` as a list without repeats.

    Items contained in `already_seen` are left out as well.  The order of
    first occurrence is kept.
    """
    unique = []
    emitted = set()
    for item in it:
        if item not in emitted and item not in already_seen:
            unique.append(item)
            emitted.add(item)
    return unique
|
290 |
|
291 |
|
class Future(object):
    """Base class for work that is deferred until it is needed.

    RegexLexerMeta treats instances specially so that regex strings can be
    constructed at first use.
    """

    def get(self):
        """Produce the deferred value; this base version always raises."""
        raise NotImplementedError()
|
300 |
|
301 |
|
def guess_decode(text):
    """Decode *text* with guessed encoding.

    First try UTF-8; this should fail for non-UTF-8 encodings.
    Then try the preferred locale encoding.
    Fall back to latin-1, which always works.

    :param text: the byte string to decode.
    :returns: a tuple ``(decoded_text, encoding_name)`` naming the encoding
              that was actually used.
    """
    try:
        return text.decode('utf-8'), 'utf-8'
    except UnicodeDecodeError:
        pass
    try:
        import locale
        prefencoding = locale.getpreferredencoding()
        # Pass the locale encoding explicitly: a bare ``decode()`` uses the
        # interpreter's default codec, so the locale would never be tried
        # and the reported encoding could differ from the one used.
        return text.decode(prefencoding), prefencoding
    except (UnicodeDecodeError, LookupError):
        # LookupError covers a locale encoding Python has no codec for.
        return text.decode('latin1'), 'latin1'
|
321 |
|
322 |
|
def guess_decode_from_terminal(text, term):
    """Decode *text* that was read from terminal *term*.

    If *term* has a non-empty ``encoding`` attribute, that encoding is tried
    first.  When it is missing or decoding fails, guess_decode() decides:
    UTF-8, then the preferred locale encoding, then latin-1.
    """
    encoding = getattr(term, 'encoding', None)
    if encoding:
        try:
            return text.decode(encoding), encoding
        except UnicodeDecodeError:
            # Fall through to the generic guessing below.
            pass
    return guess_decode(text)
|
338 |
|
339 |
|
def terminal_encoding(term):
    """Return the most plausible encoding for the given *term*.

    A non-empty ``encoding`` attribute on *term* is used when present;
    otherwise the preferred locale encoding is returned.
    """
    encoding = getattr(term, 'encoding', None)
    if not encoding:
        import locale
        encoding = locale.getpreferredencoding()
    return encoding
|
346 |
|
347 |
|
348 # Python 2/3 compatibility |
|
349 |
|
if sys.version_info >= (3, 0):
    from io import StringIO, BytesIO, TextIOWrapper

    unichr = chr
    xrange = range
    string_types = (str,)
    text_type = str
    u_prefix = ''
    iteritems = dict.items
    itervalues = dict.values

    class UnclosingTextIOWrapper(TextIOWrapper):
        """Text wrapper whose close() only flushes.

        The underlying buffer is not closed when the wrapper is destroyed.
        """

        def close(self):
            self.flush()
else:
    import StringIO
    import cStringIO

    unichr = unichr
    xrange = xrange
    string_types = (str, unicode)
    text_type = unicode
    u_prefix = 'u'
    iteritems = dict.iteritems
    itervalues = dict.itervalues
    # io.StringIO in Python 2 does not accept str at all, so the classic
    # StringIO/cStringIO implementations are used instead.
    StringIO = StringIO.StringIO
    BytesIO = cStringIO.StringIO
|
377 |
|
378 |
|
def add_metaclass(metaclass):
    """Class decorator for creating a class with a metaclass.

    The decorated class is rebuilt by calling *metaclass* with the original
    name, the original bases and a cleaned copy of the class namespace.

    :param metaclass: the metaclass used to rebuild the decorated class.
    :returns: a decorator that takes a class and returns the rebuilt class.
    """
    def wrapper(cls):
        orig_vars = cls.__dict__.copy()
        slots = orig_vars.get('__slots__')
        if slots is not None:
            # A bare string declares exactly one slot; iterating it would
            # yield single characters instead of the slot name.
            if isinstance(slots, (str, type(u''))):
                slots = [slots]
            # Remove the descriptors generated for the slots; they are
            # created again from ``__slots__`` when the class is rebuilt.
            for slots_var in slots:
                orig_vars.pop(slots_var)
        # Popped after the slot loop and with a default, so that a class
        # listing '__dict__' or '__weakref__' in ``__slots__`` does not
        # fail on a second removal.
        orig_vars.pop('__dict__', None)
        orig_vars.pop('__weakref__', None)
        return metaclass(cls.__name__, cls.__bases__, orig_vars)
    return wrapper