1 # -*- coding: utf-8 -*- |
|
2 """ |
|
3 pygments.util |
|
4 ~~~~~~~~~~~~~ |
|
5 |
|
6 Utility functions. |
|
7 |
|
8 :copyright: Copyright 2006-2021 by the Pygments team, see AUTHORS. |
|
9 :license: BSD, see LICENSE for details. |
|
10 """ |
|
11 |
|
12 import re |
|
13 from io import TextIOWrapper |
|
14 |
|
15 |
|
# Separators used to break a shebang line into pieces: forward slash,
# backslash and space (see ``shebang_matches``).
split_path_re = re.compile(r'[/\\ ]')
# Finds a ``<!DOCTYPE ...>`` declaration.  Group 1 captures the first
# identifier plus, when present, a second identifier followed by a
# double-quoted string; anything else up to the closing ``>`` is skipped.
doctype_lookup_re = re.compile(r'''
    <!DOCTYPE\s+(
     [a-zA-Z_][a-zA-Z0-9]*
     (?: \s+      # optional in HTML5
     [a-zA-Z_][a-zA-Z0-9]*\s+
     "[^"]*")?
     )
     [^>]*>
''', re.DOTALL | re.MULTILINE | re.VERBOSE)
# Matches an opening tag (optionally with attributes) that is eventually
# followed by some closing tag; the two tag names are not required to agree.
tag_re = re.compile(r'<(.+?)(\s.*?)?>.*?</.+?>',
                    re.UNICODE | re.IGNORECASE | re.DOTALL | re.MULTILINE)
# Matches an ``<?xml ... ?>`` declaration, allowing leading whitespace.
xml_decl_re = re.compile(r'\s*<\?xml[^>]*\?>', re.I)
|
29 |
|
30 |
|
class ClassNotFound(ValueError):
    """Raised if one of the lookup functions didn't find a matching class.

    Derives from ``ValueError``, so handlers written for ``ValueError``
    also catch it.
    """
|
33 |
|
34 |
|
class OptionError(Exception):
    """Raised by the ``get_*_opt`` helpers when an option value has an
    unusable type or is not an accepted value.
    """
|
37 |
|
38 |
|
def get_choice_opt(options, optname, allowed, default=None, normcase=False):
    """Return the value of option *optname*, restricted to *allowed*.

    The value is read from the ``options`` mapping, falling back to
    *default* when the key is absent.  When *normcase* is true the value is
    lower-cased before the membership check; values that have no ``lower``
    method (for instance a ``None`` default or an integer) are checked
    unchanged rather than failing with ``AttributeError``.

    Raises ``OptionError`` if the resulting value is not in *allowed*.
    """
    value = options.get(optname, default)
    # Only normalise values that actually support it, so a missing option
    # with a non-string default reaches the membership check below.
    if normcase and hasattr(value, 'lower'):
        value = value.lower()
    if value not in allowed:
        raise OptionError('Value for option %s must be one of %s' %
                          (optname, ', '.join(map(str, allowed))))
    return value
|
47 |
|
48 |
|
def get_bool_opt(options, optname, default=None):
    """Interpret option *optname* as a boolean.

    Booleans are returned as they are and integers are converted with
    ``bool()``.  Strings are compared case-insensitively against
    ``1/yes/true/on`` and ``0/no/false/off``.  Any other type, or an
    unrecognised string, raises ``OptionError``.
    """
    value = options.get(optname, default)
    # bool is a subclass of int, so one check covers both branches.
    if isinstance(value, (bool, int)):
        return bool(value)
    if not isinstance(value, str):
        raise OptionError('Invalid type %r for option %s; use '
                          '1/0, yes/no, true/false, on/off' % (
                              value, optname))
    lowered = value.lower()
    if lowered in ('1', 'yes', 'true', 'on'):
        return True
    if lowered in ('0', 'no', 'false', 'off'):
        return False
    raise OptionError('Invalid value %r for option %s; use '
                      '1/0, yes/no, true/false, on/off' % (
                          value, optname))
|
67 |
|
68 |
|
def get_int_opt(options, optname, default=None):
    """Interpret option *optname* as an integer.

    The value (or *default* when the key is absent) is passed through
    ``int()``.  A ``TypeError`` or ``ValueError`` from that conversion is
    reported as ``OptionError``.
    """
    raw = options.get(optname, default)
    try:
        return int(raw)
    except (TypeError, ValueError) as exc:
        # Keep the two historical messages: one for a bad type, one for a
        # bad value.
        if isinstance(exc, TypeError):
            template = ('Invalid type %r for option %s; you '
                        'must give an integer value')
        else:
            template = ('Invalid value %r for option %s; you '
                        'must give an integer value')
        raise OptionError(template % (raw, optname))
|
81 |
|
82 |
|
def get_list_opt(options, optname, default=None):
    """Interpret option *optname* as a list.

    A string is split on whitespace; a list or tuple is copied into a new
    list.  Any other type raises ``OptionError``.
    """
    value = options.get(optname, default)
    if isinstance(value, (list, tuple)):
        return list(value)
    if isinstance(value, str):
        return value.split()
    raise OptionError('Invalid type %r for option %s; you '
                      'must give a list value' % (
                          value, optname))
|
93 |
|
94 |
|
def docstring_headline(obj):
    """Return the first paragraph of *obj*'s docstring as a single line.

    Lines are stripped and joined with single spaces, stopping at the
    first blank line.  An empty string is returned when there is no
    docstring.
    """
    doc = obj.__doc__
    if not doc:
        return ''
    pieces = []
    for raw_line in doc.strip().splitlines():
        stripped = raw_line.strip()
        if not stripped:
            # A blank line ends the headline paragraph.
            break
        pieces.append(stripped)
    return ' '.join(pieces)
|
105 |
|
106 |
|
def make_analysator(f):
    """Wrap *f* as a static method that always yields a float in [0.0, 1.0].

    The wrapper returns ``0.0`` when *f* raises, returns a falsy value, or
    returns something ``float()`` cannot convert; otherwise the converted
    result is clamped to the range 0.0 to 1.0.
    """
    def text_analyse(text):
        try:
            score = f(text)
        except Exception:
            return 0.0
        if score:
            try:
                number = float(score)
            except (ValueError, TypeError):
                return 0.0
            return min(1.0, max(0.0, number))
        return 0.0
    text_analyse.__doc__ = f.__doc__
    return staticmethod(text_analyse)
|
122 |
|
123 |
|
def shebang_matches(text, regex):
    r"""Tell whether *regex* matches the interpreter named in the shebang.

    Only the first line of *text* is inspected, and only if it starts with
    ``#!``.  The line is lower-cased and split on slashes, backslashes and
    spaces; the last piece that is not an option is compared with *regex*.

        >>> from pygments.util import shebang_matches
        >>> shebang_matches('#!/usr/bin/env python', r'python(2\.\d)?')
        True
        >>> shebang_matches('#!/usr/bin/python2.4', r'python(2\.\d)?')
        True
        >>> shebang_matches('#!/usr/bin/python-ruby', r'python(2\.\d)?')
        False
        >>> shebang_matches('#!/usr/bin/python/ruby', r'python(2\.\d)?')
        False
        >>> shebang_matches('#!/usr/bin/startsomethingwith python',
        ...                 r'python(2\.\d)?')
        True

    Common Windows executable extensions are accepted as well::

        >>> shebang_matches('#!C:\\Python2.4\\Python.exe', r'python(2\.\d)?')
        True

    Pieces starting with ``-`` (such as ``'-f'`` or ``'--foo'``) are
    skipped, so ``'perl'`` behaves the same as ``'perl -e'``.

    The expression is anchored automatically: it is wrapped in ``'^$'`` and
    therefore has to match the whole interpreter name.
    """
    first_line = text.split('\n', 1)[0].lower()
    if not first_line.startswith('#!'):
        return False
    candidates = [piece
                  for piece in split_path_re.split(first_line[2:].strip())
                  if piece and not piece.startswith('-')]
    if not candidates:
        return False
    pattern = re.compile(r'^%s(\.(exe|cmd|bat|bin))?$' % regex, re.IGNORECASE)
    return pattern.search(candidates[-1]) is not None
|
167 |
|
168 |
|
def doctype_matches(text, regex):
    """Tell whether the DOCTYPE found in *text* matches *regex*.

    Returns ``False`` when no DOCTYPE declaration is found.  Only the
    leading part of the declaration is compared, for example
    ``html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"``; the comparison is
    case-insensitive and anchored at its start.
    """
    found = doctype_lookup_re.search(text)
    if found is None:
        return False
    declared = found.group(1).strip()
    return re.match(regex, declared, re.I) is not None
|
180 |
|
181 |
|
def html_doctype_matches(text):
    """Check if the file looks like it has a html doctype.

    Delegates to ``doctype_matches`` with the pattern ``html``.
    """
    return doctype_matches(text, r'html')
|
185 |
|
186 |
|
# Maps hash(text) to the result of the tag search, so the same text is not
# scanned with tag_re twice.
_looks_like_xml_cache = {}


def looks_like_xml(text):
    """Tell whether *text* looks like XML.

    An XML declaration or a DOCTYPE is taken as proof.  Otherwise the
    first 1000 characters are searched for an opening/closing tag pair,
    and that result is cached per text hash.
    """
    if xml_decl_re.match(text):
        return True
    digest = hash(text)
    if digest in _looks_like_xml_cache:
        return _looks_like_xml_cache[digest]
    # A DOCTYPE hit is returned directly and never stored in the cache.
    if doctype_lookup_re.search(text) is not None:
        return True
    verdict = tag_re.search(text[:1000]) is not None
    _looks_like_xml_cache[digest] = verdict
    return verdict
|
204 |
|
205 |
|
def surrogatepair(c):
    """Split code point *c* into its UTF-16 surrogate pair.

    *c* is expected to need more than 16 bits.  The result is a
    ``(high, low)`` tuple of integers.
    """
    # From example D28 of:
    # http://www.unicode.org/book/ch03.pdf
    high = 0xd7c0 + (c >> 10)
    low = 0xdc00 + (c & 0x3ff)
    return (high, low)
|
213 |
|
214 |
|
def format_lines(var_name, seq, raw=False, indent_level=0):
    """Render *seq* as the source text of a tuple assigned to *var_name*.

    Each item goes on its own line, indented one level (four spaces)
    deeper than the assignment.  With *raw* set, items are written as
    given; otherwise each item is written as a single-quoted literal.
    """
    outer_pad = ' ' * indent_level * 4
    inner_pad = ' ' * (indent_level + 1) * 4
    out = [outer_pad + var_name + ' = (']
    if raw:
        # These should be preformatted reprs of, say, tuples.
        out.extend(inner_pad + entry + ',' for entry in seq)
    else:
        for entry in seq:
            # Appending a double quote makes repr() choose single quotes;
            # the extra character is then cut out again.
            quoted = repr(entry + '"')
            out.append(inner_pad + quoted[:-2] + quoted[-1] + ',')
    out.append(outer_pad + ')')
    return '\n'.join(out)
|
232 |
|
233 |
|
def duplicates_removed(it, already_seen=()):
    """
    Return the items of the iterable `it` as a list without repeats.

    Items contained in `already_seen` are left out as well.  The first
    occurrence of each remaining item keeps its position.
    """
    unique = []
    emitted = set()
    for item in it:
        if item not in emitted and item not in already_seen:
            unique.append(item)
            emitted.add(item)
    return unique
|
248 |
|
249 |
|
class Future:
    """Generic class to defer some work.

    Handled specially in RegexLexerMeta, to support regex string construction at
    first use.
    """
    def get(self):
        """Subclasses must override this; the base version always raises
        ``NotImplementedError``.
        """
        raise NotImplementedError
|
258 |
|
259 |
|
def guess_decode(text):
    """Decode *text* with guessed encoding.

    First try UTF-8; this should fail for non-UTF-8 encodings.
    Then try the preferred locale encoding.
    Fall back to latin-1, which always works.

    *text* is a bytes object.  Returns a ``(decoded_text, encoding_name)``
    tuple naming the encoding that was actually used.
    """
    try:
        return text.decode('utf-8'), 'utf-8'
    except UnicodeDecodeError:
        pass
    try:
        import locale
        prefencoding = locale.getpreferredencoding()
        # Decode with the locale encoding explicitly: a bare ``decode()``
        # would just retry UTF-8, which has already failed above.
        return text.decode(prefencoding), prefencoding
    except (UnicodeDecodeError, LookupError):
        # latin-1 maps every byte to a character, so this cannot fail.
        return text.decode('latin1'), 'latin1'
|
279 |
|
280 |
|
def guess_decode_from_terminal(text, term):
    """Decode *text* that was read from the terminal *term*.

    If *term* has a non-empty ``encoding`` attribute, that encoding is
    tried first.  When it is missing, or decoding with it fails, the work
    is handed to ``guess_decode``.  Returns a ``(decoded_text,
    encoding_name)`` tuple.
    """
    term_encoding = getattr(term, 'encoding', None)
    if term_encoding:
        try:
            decoded = text.decode(term_encoding)
        except UnicodeDecodeError:
            # Fall through to the generic guessing below.
            pass
        else:
            return decoded, term_encoding
    return guess_decode(text)
|
296 |
|
297 |
|
def terminal_encoding(term):
    """Return the encoding to assume for *term*.

    A non-empty ``encoding`` attribute on *term* is used when present;
    otherwise the locale's preferred encoding is returned.
    """
    declared = getattr(term, 'encoding', None)
    if not declared:
        import locale
        return locale.getpreferredencoding()
    return declared
|
304 |
|
305 |
|
class UnclosingTextIOWrapper(TextIOWrapper):
    """A ``TextIOWrapper`` whose ``close()`` only flushes."""
    # Don't close underlying buffer on destruction.
    def close(self):
        # The inherited close() is deliberately not called; pending output
        # is flushed and nothing else happens.
        self.flush()
|