|
1 # -*- coding: utf-8 -*- |
|
2 """ |
|
3 pygments.scanner |
|
4 ~~~~~~~~~~~~~~~~ |
|
5 |
|
6 This library implements a regex based scanner. Some languages |
|
7 like Pascal are easy to parse but have some keywords that |
|
8 depend on the context. Because of this it's impossible to lex |
|
9 that just by using a regular expression lexer like the |
|
10 `RegexLexer`. |
|
11 |
|
12 Have a look at the `DelphiLexer` to get an idea of how to use |
|
13 this scanner. |
|
14 |
|
15 :copyright: Copyright 2006-2009 by the Pygments team, see AUTHORS. |
|
16 :license: BSD, see LICENSE for details. |
|
17 """ |
|
18 import re |
|
19 |
|
20 |
|
class EndOfText(RuntimeError):
    """
    Signals that a match function was called on a scanner
    whose position had already reached the end of its text.
    """
|
26 |
|
27 |
|
class Scanner(object):
    """
    Simple regular expression based scanner over a fixed text.

    All method patterns are regular expression strings (not
    compiled expressions!). Each pattern is compiled once with the
    scanner's default ``flags`` and cached on the instance.
    """

    def __init__(self, text, flags=0):
        """
        :param text: The text which should be scanned
        :param flags: default regular expression flags
        """
        self.data = text
        self.data_length = len(text)
        # start/end offsets of the most recent successful ``scan``
        self.start_pos = 0
        self.pos = 0
        self.flags = flags
        # ``match`` is the text of the latest successful scan, ``last``
        # the value ``match`` had before the latest scan attempt
        self.last = None
        self.match = None
        # pattern string -> compiled regular expression
        self._re_cache = {}

    # NOTE: this used to be ``property(eos, eos.__doc__)``, which passed
    # the docstring as the *setter* (second positional argument of
    # ``property``). The decorator form keeps the property read-only
    # and still exposes the getter's docstring.
    @property
    def eos(self):
        """`True` if the scanner reached the end of text."""
        return self.pos >= self.data_length

    def _compile(self, pattern):
        """Return the compiled `pattern`, compiling and caching it on first use."""
        try:
            return self._re_cache[pattern]
        except KeyError:
            regex = self._re_cache[pattern] = re.compile(pattern, self.flags)
            return regex

    def check(self, pattern):
        """
        Apply `pattern` on the current position and return
        the match object, or `None` if it does not match.
        (Doesn't touch pos). Use this for lookahead.

        Raises `EndOfText` if the scanner is already at the end
        of the text.
        """
        if self.eos:
            raise EndOfText()
        return self._compile(pattern).match(self.data, self.pos)

    def test(self, pattern):
        """
        Apply a pattern on the current position and check
        if it matches. Doesn't touch pos.

        Raises `EndOfText` if the scanner is already at the end
        of the text.
        """
        return self.check(pattern) is not None

    def scan(self, pattern):
        """
        Scan the text for the given pattern and update pos/match
        and related fields. The return value is a boolean that
        indicates if the pattern matched. The matched value is
        stored on the instance as ``match``, the last value is
        stored as ``last``. ``start_pos`` is the position of the
        pointer before the pattern was matched, ``pos`` is the
        end position.

        Note that ``last`` is overwritten with the current ``match``
        on every call, even if the pattern does not match; all
        other fields are only updated on a successful match.

        Raises `EndOfText` if the scanner is already at the end
        of the text.
        """
        if self.eos:
            raise EndOfText()
        regex = self._compile(pattern)
        # updated before the match attempt, so also on failure
        self.last = self.match
        m = regex.match(self.data, self.pos)
        if m is None:
            return False
        self.start_pos = m.start()
        self.pos = m.end()
        self.match = m.group()
        return True

    def get_char(self):
        """
        Scan exactly one char.

        This scans the pattern ``'.'`` with the scanner's flags, so a
        newline is only consumed if the flags include ``re.DOTALL``;
        otherwise the position is left unchanged on a newline.
        Returns nothing; inspect ``match``/``pos`` for the result.
        """
        self.scan('.')

    def __repr__(self):
        return '<%s %d/%d>' % (
            self.__class__.__name__,
            self.pos,
            self.data_length
        )