ThirdParty/Pygments/pygments/lexer.py

changeset 12:1d8dd9706f46
parent    0:de9c2efb9d02
child     684:2f29a0b6e1c7
comparing 11:b0996e4a289e with 12:1d8dd9706f46
@@ -39,11 +39,11 @@
         if 'analyse_text' in d:
             d['analyse_text'] = make_analysator(d['analyse_text'])
         return type.__new__(cls, name, bases, d)
 
 
-class Lexer(object):
+class Lexer(object, metaclass=LexerMeta):
     """
     Lexer for a specific language.
 
     Basic options recognized:
     ``stripnl``
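
The change to the class header above is the Python 3 spelling of a metaclass: the ``metaclass=`` keyword replaces the Python 2 ``__metaclass__`` class attribute, which is dropped in the next hunk. A minimal sketch of the two spellings, using hypothetical names (``Meta``, ``Base``) that are not part of Pygments:

```python
# Minimal sketch of the metaclass spelling change; Meta and Base are
# made-up names, not from this changeset.
class Meta(type):
    def __new__(cls, name, bases, d):
        d.setdefault('registered', True)  # illustrative attribute injection
        return type.__new__(cls, name, bases, d)

# Python 2 (silently ignored by Python 3):
#   class Base(object):
#       __metaclass__ = Meta
# Python 3, as in the new revision:
class Base(object, metaclass=Meta):
    pass

assert Base.registered  # the metaclass ran at class-creation time
```
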
@@ -74,12 +74,10 @@
     alias_filenames = []
 
     #: mime types
     mimetypes = []
 
-    __metaclass__ = LexerMeta
-
     def __init__(self, **options):
         self.options = options
         self.stripnl = get_bool_opt(options, 'stripnl', True)
         self.stripall = get_bool_opt(options, 'stripall', False)
         self.tabsize = get_int_opt(options, 'tabsize', 0)
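
The ``__init__`` shown here reads the documented basic options (``stripnl``, ``stripall``, ``tabsize``) via ``get_bool_opt``/``get_int_opt``. A small usage sketch with the public API; ``TextLexer`` is only a convenient concrete subclass here, and the printed output is illustrative:

```python
from pygments.lexers import TextLexer

# Any Lexer subclass accepts the basic options parsed in __init__ above.
lx = TextLexer(stripnl=False, tabsize=4)    # keep surrounding newlines, expand tabs
print(lx.stripnl, lx.stripall, lx.tabsize)  # False False 4
print(list(lx.get_tokens('\thello\n')))     # tabs expanded before tokenizing
```
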
@@ -125,16 +123,16 @@
         is bypassed even if filters are defined.
 
         Also preprocess the text, i.e. expand tabs and strip it if
         wanted and applies registered filters.
         """
-        if not isinstance(text, unicode):
+        if not isinstance(text, str):
             if self.encoding == 'guess':
                 try:
                     text = text.decode('utf-8')
-                    if text.startswith(u'\ufeff'):
-                        text = text[len(u'\ufeff'):]
+                    if text.startswith('\ufeff'):
+                        text = text[len('\ufeff'):]
                 except UnicodeDecodeError:
                     text = text.decode('latin1')
             elif self.encoding == 'chardet':
                 try:
                     import chardet
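
On Python 3, ``str`` is the unicode type, so the ``isinstance(text, unicode)`` guard becomes ``isinstance(text, str)`` and only ``bytes`` input still needs decoding; the BOM literals lose their now-redundant ``u`` prefix. A minimal sketch of the same decode-and-strip-BOM idea, assuming a hypothetical ``to_text`` helper rather than the lexer's own method:

```python
def to_text(data):
    """Return str; decode bytes as UTF-8 with a latin-1 fallback."""
    if not isinstance(data, str):
        try:
            data = data.decode('utf-8')
        except UnicodeDecodeError:
            data = data.decode('latin1')   # latin-1 never fails to decode
    if data.startswith('\ufeff'):          # drop a leading UTF-8 BOM
        data = data[len('\ufeff'):]
    return data

print(to_text(b'\xef\xbb\xbfspam'))        # -> spam
```
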
@@ -365,15 +363,15 @@
 
             assert type(tdef) is tuple, "wrong rule def %r" % tdef
 
             try:
                 rex = re.compile(tdef[0], rflags).match
-            except Exception, err:
+            except Exception as err:
                 raise ValueError("uncompilable regex %r in state %r of %r: %s" %
                                  (tdef[0], state, cls, err))
 
-            assert type(tdef[1]) is _TokenType or callable(tdef[1]), \
+            assert type(tdef[1]) is _TokenType or hasattr(tdef[1], '__call__'), \
                    'token type must be simple type or callable, not %r' % (tdef[1],)
 
             if len(tdef) == 2:
                 new_state = None
             else:
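
Two separate Python 3 fixes appear in this hunk: ``except Exception, err`` is a syntax error on Python 3 and becomes ``except Exception as err``, and ``callable()`` was removed in Python 3.0/3.1 (it only returned in 3.2), so the port tests for a ``__call__`` attribute instead. A small sketch of both patterns, with a hypothetical ``compile_rule`` helper:

```python
import re

def compile_rule(pattern, flags=re.MULTILINE):
    try:
        return re.compile(pattern, flags).match
    except Exception as err:                       # Python 3 "as" syntax
        raise ValueError("uncompilable regex %r: %s" % (pattern, err))

matcher = compile_rule(r'\w+')
assert hasattr(matcher, '__call__')                # stands in for callable()
print(matcher('hello world').group())              # -> hello
```
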
@@ -414,11 +412,11 @@
         return tokens
 
     def process_tokendef(cls, name, tokendefs=None):
         processed = cls._all_tokens[name] = {}
         tokendefs = tokendefs or cls.tokens[name]
-        for state in tokendefs.keys():
+        for state in list(tokendefs.keys()):
             cls._process_state(tokendefs, processed, state)
         return processed
 
     def __call__(cls, *args, **kwds):
         if not hasattr(cls, '_tokens'):
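
``dict.keys()`` returned a list copy on Python 2 but returns a live view on Python 3, so ``list(tokendefs.keys())`` restores the old snapshot behaviour; here it is largely a mechanical translation, but it also keeps the loop safe if the mapping is ever mutated while iterating. A tiny illustration with a toy dict, not the lexer's own tables:

```python
tokens = {'root': [], 'string': []}

# Python 3: keys() is a live view; list() takes a stable snapshot,
# matching what Python 2's keys() used to return.
for state in list(tokens.keys()):
    tokens[state + '/copy'] = []       # mutating while iterating is now safe

print(sorted(tokens))                  # ['root', 'root/copy', 'string', 'string/copy']
```
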
@@ -431,17 +429,16 @@
                 cls._tokens = cls.process_tokendef('', cls.tokens)
 
         return type.__call__(cls, *args, **kwds)
 
 
-class RegexLexer(Lexer):
+class RegexLexer(Lexer, metaclass=RegexLexerMeta):
     """
     Base for simple stateful regular expression-based lexers.
     Simplifies the lexing process so that you need only
     provide a list of states and regular expressions.
     """
-    __metaclass__ = RegexLexerMeta
 
     #: Flags for compiling the regular expressions.
     #: Defaults to MULTILINE.
     flags = re.MULTILINE
 
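
Taken together, ``process_tokendef`` and ``__call__`` above compile a subclass's declarative ``tokens`` table into ``_tokens`` the first time the class is instantiated, which is what lets a ``RegexLexer`` subclass be nothing more than states of ``(regex, token[, new state])`` rules. A toy sketch using the public API; ``IniishLexer`` is a made-up example, not part of Pygments:

```python
from pygments.lexer import RegexLexer
from pygments.token import Comment, Number, Text

class IniishLexer(RegexLexer):
    """Hypothetical mini-lexer: digits, ;-comments, everything else plain text."""
    tokens = {
        'root': [
            (r';.*?$', Comment),   # (regex, token type) rule
            (r'\d+', Number),
            (r'\s+', Text),
            (r'.', Text),
        ],
    }

# The tokens table above is compiled lazily on this first instantiation.
for index, token, value in IniishLexer().get_tokens_unprocessed('42 ; note\n'):
    print(index, token, repr(value))
```
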
@@ -508,11 +505,11 @@
                     if text[pos] == '\n':
                         # at EOL, reset state to "root"
                         pos += 1
                         statestack = ['root']
                         statetokens = tokendefs['root']
-                        yield pos, Text, u'\n'
+                        yield pos, Text, '\n'
                         continue
                     yield pos, Error, text[pos]
                     pos += 1
                 except IndexError:
                     break
@@ -586,11 +583,11 @@
                     if text[ctx.pos] == '\n':
                         # at EOL, reset state to "root"
                         ctx.pos += 1
                         ctx.stack = ['root']
                         statetokens = tokendefs['root']
-                        yield ctx.pos, Text, u'\n'
+                        yield ctx.pos, Text, '\n'
                         continue
                     yield ctx.pos, Error, text[ctx.pos]
                     ctx.pos += 1
                 except IndexError:
                     break
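
Both error-recovery hunks (RegexLexer above and the ExtendedRegexLexer context variant here) only drop the ``u''`` prefix, since every literal is already text on Python 3; the logic itself is unchanged: a character no rule matches is emitted as an ``Error`` token, and at a newline the state stack is reset to ``'root'``. A hedged example of that fallback path with a made-up lexer:

```python
from pygments.lexer import RegexLexer
from pygments.token import Number

class DigitsOnlyLexer(RegexLexer):   # hypothetical: only digits are recognised
    tokens = {'root': [(r'\d+', Number)]}

for index, token, value in DigitsOnlyLexer().get_tokens_unprocessed('12x\n'):
    print(index, token, repr(value))
# '12' -> Number, 'x' -> Error (no rule matched), '\n' -> Text (the reset branch above)
```
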
@@ -610,11 +607,11 @@
 
     TODO: clean up the code here.
     """
     insertions = iter(insertions)
     try:
-        index, itokens = insertions.next()
+        index, itokens = next(insertions)
     except StopIteration:
         # no insertions
         for item in tokens:
             yield item
         return
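
Python 3 renamed the iterator method ``next()`` to ``__next__()`` and added the ``next()`` builtin, so ``insertions.next()`` becomes ``next(insertions)`` here and in the second occurrence below. A one-line illustration with throwaway data:

```python
insertions = iter([(0, [(0, None, '>>> ')]), (8, [(0, None, '... ')])])

index, itokens = next(insertions)     # Python 3 builtin; Python 2 was insertions.next()
print(index, itokens)                 # 0 [(0, None, '>>> ')]
```
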
@@ -636,11 +633,11 @@
             for it_index, it_token, it_value in itokens:
                 yield realpos, it_token, it_value
                 realpos += len(it_value)
             oldi = index - i
             try:
-                index, itokens = insertions.next()
+                index, itokens = next(insertions)
             except StopIteration:
                 insleft = False
                 break  # not strictly necessary
         yield realpos, t, v[oldi:]
         realpos += len(v) - oldi
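
These last two hunks patch ``do_insertions()``, which splices extra token streams (prompts, for example) into a base token stream at given character positions while keeping the running offsets consistent; console-session style lexers are built on it. A small usage sketch with hand-written token tuples:

```python
from pygments.lexer import do_insertions
from pygments.token import Generic, Text

# Base stream for the text 'echo hi'; insert a prompt token at position 0.
base = [(0, Text, 'echo hi')]
ins = [(0, [(0, Generic.Prompt, '$ ')])]

for pos, tok, val in do_insertions(ins, iter(base)):
    if val:                            # split points can yield empty slices
        print(pos, tok, repr(val))
# 0 Token.Generic.Prompt '$ '
# 2 Token.Text 'echo hi'
```
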
