ThirdParty/Pygments/pygments/lexer.py

changeset 0:de9c2efb9d02 (child 12:1d8dd9706f46)
# -*- coding: utf-8 -*-
"""
    pygments.lexer
    ~~~~~~~~~~~~~~

    Base lexer classes.

    :copyright: Copyright 2006-2009 by the Pygments team, see AUTHORS.
    :license: BSD, see LICENSE for details.
"""
import re

try:
    set
except NameError:
    from sets import Set as set

from pygments.filter import apply_filters, Filter
from pygments.filters import get_filter_by_name
from pygments.token import Error, Text, Other, _TokenType
from pygments.util import get_bool_opt, get_int_opt, get_list_opt, \
     make_analysator


__all__ = ['Lexer', 'RegexLexer', 'ExtendedRegexLexer', 'DelegatingLexer',
           'LexerContext', 'include', 'flags', 'bygroups', 'using', 'this']


_default_analyse = staticmethod(lambda x: 0.0)


class LexerMeta(type):
    """
    This metaclass automagically converts ``analyse_text`` methods into
    static methods which always return float values.
    """

    def __new__(cls, name, bases, d):
        if 'analyse_text' in d:
            d['analyse_text'] = make_analysator(d['analyse_text'])
        return type.__new__(cls, name, bases, d)


class Lexer(object):
    """
    Lexer for a specific language.

    Basic options recognized:
    ``stripnl``
        Strip leading and trailing newlines from the input (default: True).
    ``stripall``
        Strip all leading and trailing whitespace from the input
        (default: False).
    ``tabsize``
        If given and greater than 0, expand tabs in the input (default: 0).
    ``encoding``
        If given, must be an encoding name. This encoding will be used to
        convert the input string to Unicode, if it is not already a Unicode
        string (default: ``'latin1'``).
        Can also be ``'guess'`` to use a simple UTF-8 / Latin1 detection, or
        ``'chardet'`` to use the chardet library, if it is installed.
    """

    #: Name of the lexer
    name = None

    #: Shortcuts for the lexer
    aliases = []

    #: File name globs
    filenames = []

    #: Secondary file name globs
    alias_filenames = []

    #: MIME types
    mimetypes = []

    __metaclass__ = LexerMeta

    def __init__(self, **options):
        self.options = options
        self.stripnl = get_bool_opt(options, 'stripnl', True)
        self.stripall = get_bool_opt(options, 'stripall', False)
        self.tabsize = get_int_opt(options, 'tabsize', 0)
        self.encoding = options.get('encoding', 'latin1')
        # self.encoding = options.get('inencoding', None) or self.encoding
        self.filters = []
        for filter_ in get_list_opt(options, 'filters', ()):
            self.add_filter(filter_)

    def __repr__(self):
        if self.options:
            return '<pygments.lexers.%s with %r>' % (self.__class__.__name__,
                                                     self.options)
        else:
            return '<pygments.lexers.%s>' % self.__class__.__name__

    def add_filter(self, filter_, **options):
        """
        Add a new stream filter to this lexer.
        """
        if not isinstance(filter_, Filter):
            filter_ = get_filter_by_name(filter_, **options)
        self.filters.append(filter_)
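        # Example call (assumes the named filter is registered in
        # pygments.filters):
        #
        #   lexer.add_filter('keywordcase', case='upper')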
106
107 def analyse_text(text):
108 """
109 Has to return a float between ``0`` and ``1`` that indicates
110 if a lexer wants to highlight this text. Used by ``guess_lexer``.
111 If this method returns ``0`` it won't highlight it in any case, if
112 it returns ``1`` highlighting with this lexer is guaranteed.
113
114 The `LexerMeta` metaclass automatically wraps this function so
115 that it works like a static method (no ``self`` or ``cls``
116 parameter) and the return value is automatically converted to
117 `float`. If the return value is an object that is boolean `False`
118 it's the same as if the return values was ``0.0``.
119 """

    def get_tokens(self, text, unfiltered=False):
        """
        Return an iterable of (tokentype, value) pairs generated from
        `text`. If `unfiltered` is set to `True`, the filtering mechanism
        is bypassed even if filters are defined.

        Also preprocess the text, i.e. expand tabs and strip it if
        wanted, and apply registered filters.
        """
        if not isinstance(text, unicode):
            if self.encoding == 'guess':
                try:
                    text = text.decode('utf-8')
                    if text.startswith(u'\ufeff'):
                        text = text[len(u'\ufeff'):]
                except UnicodeDecodeError:
                    text = text.decode('latin1')
            elif self.encoding == 'chardet':
                try:
                    import chardet
                except ImportError:
                    raise ImportError('To enable chardet encoding guessing, '
                                      'please install the chardet library '
                                      'from http://chardet.feedparser.org/')
                enc = chardet.detect(text)
                text = text.decode(enc['encoding'])
            else:
                text = text.decode(self.encoding)
        # text now *is* a unicode string
        text = text.replace('\r\n', '\n')
        text = text.replace('\r', '\n')
        if self.stripall:
            text = text.strip()
        elif self.stripnl:
            text = text.strip('\n')
        if self.tabsize > 0:
            text = text.expandtabs(self.tabsize)
        if not text.endswith('\n'):
            text += '\n'

        def streamer():
            for i, t, v in self.get_tokens_unprocessed(text):
                yield t, v
        stream = streamer()
        if not unfiltered:
            stream = apply_filters(stream, self.filters, self)
        return stream

    def get_tokens_unprocessed(self, text):
        """
        Return an iterable of (index, tokentype, value) tuples, where
        ``index`` is the starting position of the token in the input text.
        In subclasses, implement this method as a generator to
        maximize effectiveness.
        """
        raise NotImplementedError


class DelegatingLexer(Lexer):
    """
    This lexer takes two lexers as arguments. A root lexer and
    a language lexer. First everything is scanned using the language
    lexer, afterwards all ``Other`` tokens are lexed using the root
    lexer.

    The lexers from the ``template`` lexer package use this base lexer.
    """

    def __init__(self, _root_lexer, _language_lexer, _needle=Other, **options):
        self.root_lexer = _root_lexer(**options)
        self.language_lexer = _language_lexer(**options)
        self.needle = _needle
        Lexer.__init__(self, **options)

    def get_tokens_unprocessed(self, text):
        buffered = ''
        insertions = []
        lng_buffer = []
        for i, t, v in self.language_lexer.get_tokens_unprocessed(text):
            if t is self.needle:
                if lng_buffer:
                    insertions.append((len(buffered), lng_buffer))
                    lng_buffer = []
                buffered += v
            else:
                lng_buffer.append((i, t, v))
        if lng_buffer:
            insertions.append((len(buffered), lng_buffer))
        return do_insertions(insertions,
                             self.root_lexer.get_tokens_unprocessed(buffered))


#-------------------------------------------------------------------------------
# RegexLexer and ExtendedRegexLexer
#


class include(str):
    """
    Indicates that a state should include rules from another state.
    """
    pass


class combined(tuple):
    """
    Indicates a state combined from multiple states.
    """

    def __new__(cls, *args):
        return tuple.__new__(cls, args)

    def __init__(self, *args):
        # tuple.__init__ doesn't do anything
        pass


class _PseudoMatch(object):
    """
    A pseudo match object constructed from a string.
    """

    def __init__(self, start, text):
        self._text = text
        self._start = start

    def start(self, arg=None):
        return self._start

    def end(self, arg=None):
        return self._start + len(self._text)

    def group(self, arg=None):
        if arg:
            raise IndexError('No such group')
        return self._text

    def groups(self):
        return (self._text,)

    def groupdict(self):
        return {}


def bygroups(*args):
    """
    Callback that yields multiple actions for each group in the match.
    """
    def callback(lexer, match, ctx=None):
        for i, action in enumerate(args):
            if action is None:
                continue
            elif type(action) is _TokenType:
                data = match.group(i + 1)
                if data:
                    yield match.start(i + 1), action, data
            else:
                if ctx:
                    ctx.pos = match.start(i + 1)
                for item in action(lexer, _PseudoMatch(match.start(i + 1),
                                   match.group(i + 1)), ctx):
                    if item:
                        yield item
        if ctx:
            ctx.pos = match.end()
    return callback

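# Illustrative rule using bygroups (token types assumed to be imported from
# pygments.token in the defining module):
#
#   (r'(def)(\s+)([a-zA-Z_][a-zA-Z0-9_]*)',
#    bygroups(Keyword, Text, Name.Function)),
#
# each parenthesized regex group is emitted with the corresponding action.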

class _This(object):
    """
    Special singleton used for indicating the caller class.
    Used by ``using``.
    """
this = _This()


def using(_other, **kwargs):
    """
    Callback that processes the match with a different lexer.

    The keyword arguments are forwarded to the lexer, except `state` which
    is handled separately.

    `state` specifies the state that the new lexer will start in, and can
    be an enumerable such as ('root', 'inline', 'string') or a simple
    string which is assumed to be pushed on top of the root state.

    Note: For that to work, `_other` must not be an `ExtendedRegexLexer`.
    """
    gt_kwargs = {}
    if 'state' in kwargs:
        s = kwargs.pop('state')
        if isinstance(s, (list, tuple)):
            gt_kwargs['stack'] = s
        else:
            gt_kwargs['stack'] = ('root', s)

    if _other is this:
        def callback(lexer, match, ctx=None):
            # if keyword arguments are given the callback
            # function has to create a new lexer instance
            if kwargs:
                # XXX: cache that somehow
                kwargs.update(lexer.options)
                lx = lexer.__class__(**kwargs)
            else:
                lx = lexer
            s = match.start()
            for i, t, v in lx.get_tokens_unprocessed(match.group(), **gt_kwargs):
                yield i + s, t, v
            if ctx:
                ctx.pos = match.end()
    else:
        def callback(lexer, match, ctx=None):
            # XXX: cache that somehow
            kwargs.update(lexer.options)
            lx = _other(**kwargs)

            s = match.start()
            for i, t, v in lx.get_tokens_unprocessed(match.group(), **gt_kwargs):
                yield i + s, t, v
            if ctx:
                ctx.pos = match.end()
    return callback

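# Illustrative rules using ``using`` (lexer and state names are assumed,
# not taken from this file):
#
#   (r'(<script>)(.*?)(</script>)',
#    bygroups(Name.Tag, using(JavascriptLexer), Name.Tag)),
#   (r'\{\{.*?\}\}', using(this, state='expression')),
#
# the first form delegates a group to another lexer, the second re-enters
# the current lexer starting in the given state.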

class RegexLexerMeta(LexerMeta):
    """
    Metaclass for RegexLexer, creates the self._tokens attribute from
    self.tokens on the first instantiation.
    """

    def _process_state(cls, unprocessed, processed, state):
        assert type(state) is str, "wrong state name %r" % state
        assert state[0] != '#', "invalid state name %r" % state
        if state in processed:
            return processed[state]
        tokens = processed[state] = []
        rflags = cls.flags
        for tdef in unprocessed[state]:
            if isinstance(tdef, include):
                # it's a state reference
                assert tdef != state, "circular state reference %r" % state
                tokens.extend(cls._process_state(unprocessed, processed, str(tdef)))
                continue

            assert type(tdef) is tuple, "wrong rule def %r" % tdef

            try:
                rex = re.compile(tdef[0], rflags).match
            except Exception, err:
                raise ValueError("uncompilable regex %r in state %r of %r: %s" %
                                 (tdef[0], state, cls, err))

            assert type(tdef[1]) is _TokenType or callable(tdef[1]), \
                   'token type must be simple type or callable, not %r' % (tdef[1],)

            if len(tdef) == 2:
                new_state = None
            else:
                tdef2 = tdef[2]
                if isinstance(tdef2, str):
                    # an existing state
                    if tdef2 == '#pop':
                        new_state = -1
                    elif tdef2 in unprocessed:
                        new_state = (tdef2,)
                    elif tdef2 == '#push':
                        new_state = tdef2
                    elif tdef2[:5] == '#pop:':
                        new_state = -int(tdef2[5:])
                    else:
                        assert False, 'unknown new state %r' % tdef2
                elif isinstance(tdef2, combined):
                    # combine a new state from existing ones
                    new_state = '_tmp_%d' % cls._tmpname
                    cls._tmpname += 1
                    itokens = []
                    for istate in tdef2:
                        assert istate != state, 'circular state ref %r' % istate
                        itokens.extend(cls._process_state(unprocessed,
                                                          processed, istate))
                    processed[new_state] = itokens
                    new_state = (new_state,)
                elif isinstance(tdef2, tuple):
                    # push more than one state
                    for state in tdef2:
                        assert (state in unprocessed or
                                state in ('#pop', '#push')), \
                               'unknown new state ' + state
                    new_state = tdef2
                else:
                    assert False, 'unknown new state def %r' % tdef2
            tokens.append((rex, tdef[1], new_state))
        return tokens

    def process_tokendef(cls, name, tokendefs=None):
        processed = cls._all_tokens[name] = {}
        tokendefs = tokendefs or cls.tokens[name]
        for state in tokendefs.keys():
            cls._process_state(tokendefs, processed, state)
        return processed

    def __call__(cls, *args, **kwds):
        if not hasattr(cls, '_tokens'):
            cls._all_tokens = {}
            cls._tmpname = 0
            if hasattr(cls, 'token_variants') and cls.token_variants:
                # don't process yet
                pass
            else:
                cls._tokens = cls.process_tokendef('', cls.tokens)

        return type.__call__(cls, *args, **kwds)


class RegexLexer(Lexer):
    """
    Base for simple stateful regular expression-based lexers.
    Simplifies the lexing process so that you need only
    provide a list of states and regular expressions.
    """
    __metaclass__ = RegexLexerMeta

    #: Flags for compiling the regular expressions.
    #: Defaults to MULTILINE.
    flags = re.MULTILINE

    #: Dict of ``{'state': [(regex, tokentype, new_state), ...], ...}``
    #:
    #: The initial state is 'root'.
    #: ``new_state`` can be omitted to signify no state transition.
    #: If it is a string, the state is pushed on the stack and changed.
    #: If it is a tuple of strings, all states are pushed on the stack and
    #: the current state will be the topmost.
    #: It can also be ``combined('state1', 'state2', ...)``
    #: to signify a new, anonymous state combined from the rules of two
    #: or more existing ones.
    #: Furthermore, it can be '#pop' to signify going back one step in
    #: the state stack, or '#push' to push the current state on the stack
    #: again.
    #:
    #: The tuple can also be replaced with ``include('state')``, in which
    #: case the rules from the state named by the string are included in the
    #: current one.
    tokens = {}
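    # Hedged sketch of a subclass's token table (state names, patterns and
    # token types are illustrative; the token types would be imported from
    # pygments.token):
    #
    #   tokens = {
    #       'root': [
    #           (r'#.*?$', Comment),            # no state change
    #           (r'"', String, 'string'),       # push 'string'
    #           (r'\s+', Text),
    #       ],
    #       'string': [
    #           (r'[^"\\]+', String),
    #           (r'\\.', String.Escape),
    #           (r'"', String, '#pop'),         # back to the previous state
    #       ],
    #   }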

    def get_tokens_unprocessed(self, text, stack=('root',)):
        """
        Split ``text`` into (index, tokentype, value) tuples.

        ``stack`` is the initial stack (default: ``['root']``)
        """
        pos = 0
        tokendefs = self._tokens
        statestack = list(stack)
        statetokens = tokendefs[statestack[-1]]
        while 1:
            for rexmatch, action, new_state in statetokens:
                m = rexmatch(text, pos)
                if m:
                    if type(action) is _TokenType:
                        yield pos, action, m.group()
                    else:
                        for item in action(self, m):
                            yield item
                    pos = m.end()
                    if new_state is not None:
                        # state transition
                        if isinstance(new_state, tuple):
                            for state in new_state:
                                if state == '#pop':
                                    statestack.pop()
                                elif state == '#push':
                                    statestack.append(statestack[-1])
                                else:
                                    statestack.append(state)
                        elif isinstance(new_state, int):
                            # pop
                            del statestack[new_state:]
                        elif new_state == '#push':
                            statestack.append(statestack[-1])
                        else:
                            assert False, "wrong state def: %r" % new_state
                        statetokens = tokendefs[statestack[-1]]
                    break
            else:
                try:
                    if text[pos] == '\n':
                        # at EOL, reset state to "root"
                        pos += 1
                        statestack = ['root']
                        statetokens = tokendefs['root']
                        yield pos, Text, u'\n'
                        continue
                    yield pos, Error, text[pos]
                    pos += 1
                except IndexError:
                    break


class LexerContext(object):
    """
    A helper object that holds lexer position data.
    """

    def __init__(self, text, pos, stack=None, end=None):
        self.text = text
        self.pos = pos
        self.end = end or len(text) # end=0 not supported ;-)
        self.stack = stack or ['root']

    def __repr__(self):
        return 'LexerContext(%r, %r, %r)' % (
            self.text, self.pos, self.stack)

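# Assumed usage sketch: ExtendedRegexLexer callbacks receive such a context
# and can resume lexing from an explicit position, e.g.
#
#   ctx = LexerContext(text, 0)
#   for index, tokentype, value in lexer.get_tokens_unprocessed(context=ctx):
#       ...
#
# where ``lexer`` is an ExtendedRegexLexer instance.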

class ExtendedRegexLexer(RegexLexer):
    """
    A RegexLexer that uses a context object to store its state.
    """

    def get_tokens_unprocessed(self, text=None, context=None):
        """
        Split ``text`` into (index, tokentype, value) tuples.
        If ``context`` is given, use this lexer context instead.
        """
        tokendefs = self._tokens
        if not context:
            ctx = LexerContext(text, 0)
            statetokens = tokendefs['root']
        else:
            ctx = context
            statetokens = tokendefs[ctx.stack[-1]]
            text = ctx.text
        while 1:
            for rexmatch, action, new_state in statetokens:
                m = rexmatch(text, ctx.pos, ctx.end)
                if m:
                    if type(action) is _TokenType:
                        yield ctx.pos, action, m.group()
                        ctx.pos = m.end()
                    else:
                        for item in action(self, m, ctx):
                            yield item
                        if not new_state:
                            # altered the state stack?
                            statetokens = tokendefs[ctx.stack[-1]]
                        # CAUTION: callback must set ctx.pos!
                    if new_state is not None:
                        # state transition
                        if isinstance(new_state, tuple):
                            ctx.stack.extend(new_state)
                        elif isinstance(new_state, int):
                            # pop
                            del ctx.stack[new_state:]
                        elif new_state == '#push':
                            ctx.stack.append(ctx.stack[-1])
                        else:
                            assert False, "wrong state def: %r" % new_state
                        statetokens = tokendefs[ctx.stack[-1]]
                    break
            else:
                try:
                    if ctx.pos >= ctx.end:
                        break
                    if text[ctx.pos] == '\n':
                        # at EOL, reset state to "root"
                        ctx.pos += 1
                        ctx.stack = ['root']
                        statetokens = tokendefs['root']
                        yield ctx.pos, Text, u'\n'
                        continue
                    yield ctx.pos, Error, text[ctx.pos]
                    ctx.pos += 1
                except IndexError:
                    break


def do_insertions(insertions, tokens):
    """
    Helper for lexers which must combine the results of several
    sublexers.

    ``insertions`` is a list of ``(index, itokens)`` pairs.
    Each ``itokens`` iterable should be inserted at position
    ``index`` into the token stream given by the ``tokens``
    argument.

    The result is a combined token stream.

    TODO: clean up the code here.
    """
    insertions = iter(insertions)
    try:
        index, itokens = insertions.next()
    except StopIteration:
        # no insertions
        for item in tokens:
            yield item
        return

    realpos = None
    insleft = True

    # iterate over the token stream where we want to insert
    # the tokens from the insertion list.
    for i, t, v in tokens:
        # first iteration: store the position of the first item
        if realpos is None:
            realpos = i
        oldi = 0
        while insleft and i + len(v) >= index:
            tmpval = v[oldi:index - i]
            yield realpos, t, tmpval
            realpos += len(tmpval)
            for it_index, it_token, it_value in itokens:
                yield realpos, it_token, it_value
                realpos += len(it_value)
            oldi = index - i
            try:
                index, itokens = insertions.next()
            except StopIteration:
                insleft = False
                break  # not strictly necessary
        yield realpos, t, v[oldi:]
        realpos += len(v) - oldi

    # leftover tokens
    if insleft:
        # no normal tokens, set realpos to zero
        realpos = realpos or 0
        for p, t, v in itokens:
            yield realpos, t, v
            realpos += len(v)
