@@ -39,11 +39,11 @@
         if 'analyse_text' in d:
             d['analyse_text'] = make_analysator(d['analyse_text'])
         return type.__new__(cls, name, bases, d)


-class Lexer(object):
+class Lexer(object, metaclass=LexerMeta):
     """
     Lexer for a specific language.

     Basic options recognized:
     ``stripnl``
@@ -74,12 +74,10 @@
     alias_filenames = []

     #: mime types
     mimetypes = []

-    __metaclass__ = LexerMeta
-
     def __init__(self, **options):
        self.options = options
        self.stripnl = get_bool_opt(options, 'stripnl', True)
        self.stripall = get_bool_opt(options, 'stripall', False)
        self.tabsize = get_int_opt(options, 'tabsize', 0)
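The two hunks above replace Python 2's `__metaclass__` class attribute with the Python 3 `metaclass=` keyword in the class header; both spellings hand class creation to `LexerMeta`. A minimal sketch of the new syntax (the metaclass body here is illustrative, not the one from the patch):

    # Python 2:
    #   class Lexer(object):
    #       __metaclass__ = LexerMeta
    #
    # Python 3: the metaclass is passed alongside the bases.
    class Meta(type):
        def __new__(mcs, name, bases, namespace):
            namespace.setdefault('registered', True)  # illustrative metaclass work
            return type.__new__(mcs, name, bases, namespace)

    class Lexer(object, metaclass=Meta):
        pass

    print(Lexer.registered)  # True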
@@ -125,16 +123,16 @@
         is bypassed even if filters are defined.

         Also preprocess the text, i.e. expand tabs and strip it if
         wanted and applies registered filters.
         """
-        if not isinstance(text, unicode):
+        if not isinstance(text, str):
             if self.encoding == 'guess':
                 try:
                     text = text.decode('utf-8')
-                    if text.startswith(u'\ufeff'):
-                        text = text[len(u'\ufeff'):]
+                    if text.startswith('\ufeff'):
+                        text = text[len('\ufeff'):]
                 except UnicodeDecodeError:
                     text = text.decode('latin1')
             elif self.encoding == 'chardet':
                 try:
                     import chardet
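This hunk adapts the input normalisation to Python 3, where the `unicode` type and `u''` literals are gone: the text-type check becomes `isinstance(text, str)` and the BOM constant loses its `u` prefix. A standalone sketch of the same guess-and-decode idea (simplified; the BOM strip here runs after either decode, unlike the patched method):

    def to_text(data):
        # Bytes in, text out: try UTF-8 first, fall back to Latin-1,
        # and drop a leading byte order mark if one survives decoding.
        if not isinstance(data, str):
            try:
                data = data.decode('utf-8')
            except UnicodeDecodeError:
                data = data.decode('latin1')
            if data.startswith('\ufeff'):
                data = data[len('\ufeff'):]
        return data

    print(to_text(b'\xef\xbb\xbfhello'))  # hello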
@@ -365,15 +363,15 @@

             assert type(tdef) is tuple, "wrong rule def %r" % tdef

             try:
                 rex = re.compile(tdef[0], rflags).match
-            except Exception, err:
+            except Exception as err:
                 raise ValueError("uncompilable regex %r in state %r of %r: %s" %
                                  (tdef[0], state, cls, err))

-            assert type(tdef[1]) is _TokenType or callable(tdef[1]), \
+            assert type(tdef[1]) is _TokenType or hasattr(tdef[1], '__call__'), \
                    'token type must be simple type or callable, not %r' % (tdef[1],)

             if len(tdef) == 2:
                 new_state = None
             else:
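Two incompatibilities are fixed in this hunk: `except Exception, err` is a syntax error in Python 3 and must be written `except Exception as err`, and the `callable()` builtin was removed in Python 3.0 (it only returned in 3.2), so the check is respelled as `hasattr(obj, '__call__')`. A tiny standalone illustration of both spellings:

    import re

    def compile_rule(pattern):
        try:
            return re.compile(pattern).match
        except Exception as err:   # Python 3 syntax; "except Exception, err" no longer parses
            raise ValueError("uncompilable regex %r: %s" % (pattern, err))

    def is_callable(obj):
        # Same result as callable(obj), but also works on 3.0/3.1 where callable() is missing.
        return hasattr(obj, '__call__')

    print(is_callable(compile_rule(r'\w+')))  # True
    print(is_callable(42))                    # False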
@@ -414,11 +412,11 @@
         return tokens

     def process_tokendef(cls, name, tokendefs=None):
         processed = cls._all_tokens[name] = {}
         tokendefs = tokendefs or cls.tokens[name]
-        for state in tokendefs.keys():
+        for state in list(tokendefs.keys()):
             cls._process_state(tokendefs, processed, state)
         return processed

     def __call__(cls, *args, **kwds):
         if not hasattr(cls, '_tokens'):
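In Python 3, `dict.keys()` returns a live view rather than a list, and iterating a view raises `RuntimeError` if the dictionary changes size underneath it; wrapping the call in `list()` snapshots the keys first, the usual defensive rewrite when porting. A minimal demonstration, unrelated to the lexer tables themselves:

    d = {'a': 1, 'b': 2}

    # Iterating the live view while inserting keys fails in Python 3:
    try:
        for k in d.keys():
            d[k + '_copy'] = d[k]
    except RuntimeError as err:
        print('view iteration failed:', err)

    # Snapshotting the keys first makes the loop safe:
    d = {'a': 1, 'b': 2}
    for k in list(d.keys()):
        d[k + '_copy'] = d[k]
    print(sorted(d))  # ['a', 'a_copy', 'b', 'b_copy']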
@@ -431,17 +429,16 @@
                 cls._tokens = cls.process_tokendef('', cls.tokens)

         return type.__call__(cls, *args, **kwds)


-class RegexLexer(Lexer):
+class RegexLexer(Lexer, metaclass=RegexLexerMeta):
     """
     Base for simple stateful regular expression-based lexers.
     Simplifies the lexing process so that you need only
     provide a list of states and regular expressions.
     """
-    __metaclass__ = RegexLexerMeta

     #: Flags for compiling the regular expressions.
     #: Defaults to MULTILINE.
     flags = re.MULTILINE

@@ -508,11 +505,11 @@
                     if text[pos] == '\n':
                         # at EOL, reset state to "root"
                         pos += 1
                         statestack = ['root']
                         statetokens = tokendefs['root']
-                        yield pos, Text, u'\n'
+                        yield pos, Text, '\n'
                         continue
                     yield pos, Error, text[pos]
                     pos += 1
                 except IndexError:
                     break
@@ -586,11 +583,11 @@
                     if text[ctx.pos] == '\n':
                         # at EOL, reset state to "root"
                         ctx.pos += 1
                         ctx.stack = ['root']
                         statetokens = tokendefs['root']
-                        yield ctx.pos, Text, u'\n'
+                        yield ctx.pos, Text, '\n'
                         continue
                     yield ctx.pos, Error, text[ctx.pos]
                     ctx.pos += 1
                 except IndexError:
                     break
@@ -636,11 +633,11 @@
             for it_index, it_token, it_value in itokens:
                 yield realpos, it_token, it_value
                 realpos += len(it_value)
             oldi = index - i
             try:
-                index, itokens = insertions.next()
+                index, itokens = next(insertions)
             except StopIteration:
                 insleft = False
                 break  # not strictly necessary
         yield realpos, t, v[oldi:]
         realpos += len(v) - oldi
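Iterators lost their `.next()` method in Python 3; the portable spelling is the `next()` builtin (available since 2.6), which calls `__next__()` underneath, hence `insertions.next()` becomes `next(insertions)`. For example:

    insertions = iter([(0, ['prompt ']), (5, ['more '])])

    index, itokens = next(insertions)   # instead of insertions.next()
    print(index, itokens)               # 0 ['prompt ']

    # next() also takes a default instead of raising StopIteration:
    print(next(insertions, None))       # (5, ['more '])
    print(next(insertions, None))       # None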