Sun, 24 Jan 2016 19:28:37 +0100
Updated Pygments to 2.1.
4172
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
1 | # -*- coding: utf-8 -*- |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
2 | """ |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
3 | pygments.regexopt |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
4 | ~~~~~~~~~~~~~~~~~ |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
5 | |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
6 | An algorithm that generates optimized regexes for matching long lists of |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
7 | literal strings. |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
8 | |
4697
c2e9bf425554
Updated Pygments to 2.1.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
4172
diff
changeset
|
9 | :copyright: Copyright 2006-2015 by the Pygments team, see AUTHORS. |
4172
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
10 | :license: BSD, see LICENSE for details. |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
11 | """ |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
12 | |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
13 | import re |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
14 | from re import escape |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
15 | from os.path import commonprefix |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
16 | from itertools import groupby |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
17 | from operator import itemgetter |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
18 | |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
19 | CS_ESCAPE = re.compile(r'[\^\\\-\]]') |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
20 | FIRST_ELEMENT = itemgetter(0) |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
21 | |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
22 | |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
23 | def make_charset(letters): |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
24 | return '[' + CS_ESCAPE.sub(lambda m: '\\' + m.group(), ''.join(letters)) + ']' |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
25 | |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
26 | |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
27 | def regex_opt_inner(strings, open_paren): |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
28 | """Return a regex that matches any string in the sorted list of strings.""" |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
29 | close_paren = open_paren and ')' or '' |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
30 | # print strings, repr(open_paren) |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
31 | if not strings: |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
32 | # print '-> nothing left' |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
33 | return '' |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
34 | first = strings[0] |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
35 | if len(strings) == 1: |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
36 | # print '-> only 1 string' |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
37 | return open_paren + escape(first) + close_paren |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
38 | if not first: |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
39 | # print '-> first string empty' |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
40 | return open_paren + regex_opt_inner(strings[1:], '(?:') \ |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
41 | + '?' + close_paren |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
42 | if len(first) == 1: |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
43 | # multiple one-char strings? make a charset |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
44 | oneletter = [] |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
45 | rest = [] |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
46 | for s in strings: |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
47 | if len(s) == 1: |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
48 | oneletter.append(s) |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
49 | else: |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
50 | rest.append(s) |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
51 | if len(oneletter) > 1: # do we have more than one oneletter string? |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
52 | if rest: |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
53 | # print '-> 1-character + rest' |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
54 | return open_paren + regex_opt_inner(rest, '') + '|' \ |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
55 | + make_charset(oneletter) + close_paren |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
56 | # print '-> only 1-character' |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
57 | return make_charset(oneletter) |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
58 | prefix = commonprefix(strings) |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
59 | if prefix: |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
60 | plen = len(prefix) |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
61 | # we have a prefix for all strings |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
62 | # print '-> prefix:', prefix |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
63 | return open_paren + escape(prefix) \ |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
64 | + regex_opt_inner([s[plen:] for s in strings], '(?:') \ |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
65 | + close_paren |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
66 | # is there a suffix? |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
67 | strings_rev = [s[::-1] for s in strings] |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
68 | suffix = commonprefix(strings_rev) |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
69 | if suffix: |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
70 | slen = len(suffix) |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
71 | # print '-> suffix:', suffix[::-1] |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
72 | return open_paren \ |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
73 | + regex_opt_inner(sorted(s[:-slen] for s in strings), '(?:') \ |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
74 | + escape(suffix[::-1]) + close_paren |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
75 | # recurse on common 1-string prefixes |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
76 | # print '-> last resort' |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
77 | return open_paren + \ |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
78 | '|'.join(regex_opt_inner(list(group[1]), '') |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
79 | for group in groupby(strings, lambda s: s[0] == first[0])) \ |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
80 | + close_paren |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
81 | |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
82 | |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
83 | def regex_opt(strings, prefix='', suffix=''): |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
84 | """Return a compiled regex that matches any string in the given list. |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
85 | |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
86 | The strings to match must be literal strings, not regexes. They will be |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
87 | regex-escaped. |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
88 | |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
89 | *prefix* and *suffix* are pre- and appended to the final regex. |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
90 | """ |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
91 | strings = sorted(strings) |
4f20dba37ab6
Updated Pygments to 2.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
92 | return prefix + regex_opt_inner(strings, '(') + suffix |