|
1 # -*- coding: utf-8 -*- |
|
2 """ |
|
3 pygments.regexopt |
|
4 ~~~~~~~~~~~~~~~~~ |
|
5 |
|
6 An algorithm that generates optimized regexes for matching long lists of |
|
7 literal strings. |
|
8 |
|
9 :copyright: Copyright 2006-2014 by the Pygments team, see AUTHORS. |
|
10 :license: BSD, see LICENSE for details. |
|
11 """ |
|
12 |
|
13 import re |
|
14 from re import escape |
|
15 from os.path import commonprefix |
|
16 from itertools import groupby |
|
17 from operator import itemgetter |
|
18 |
|
19 CS_ESCAPE = re.compile(r'[\^\\\-\]]') |
|
20 FIRST_ELEMENT = itemgetter(0) |
|
21 |
|
22 |
|
23 def make_charset(letters): |
|
24 return '[' + CS_ESCAPE.sub(lambda m: '\\' + m.group(), ''.join(letters)) + ']' |
|
25 |
|
26 |
|
27 def regex_opt_inner(strings, open_paren): |
|
28 """Return a regex that matches any string in the sorted list of strings.""" |
|
29 close_paren = open_paren and ')' or '' |
|
30 # print strings, repr(open_paren) |
|
31 if not strings: |
|
32 # print '-> nothing left' |
|
33 return '' |
|
34 first = strings[0] |
|
35 if len(strings) == 1: |
|
36 # print '-> only 1 string' |
|
37 return open_paren + escape(first) + close_paren |
|
38 if not first: |
|
39 # print '-> first string empty' |
|
40 return open_paren + regex_opt_inner(strings[1:], '(?:') \ |
|
41 + '?' + close_paren |
|
42 if len(first) == 1: |
|
43 # multiple one-char strings? make a charset |
|
44 oneletter = [] |
|
45 rest = [] |
|
46 for s in strings: |
|
47 if len(s) == 1: |
|
48 oneletter.append(s) |
|
49 else: |
|
50 rest.append(s) |
|
51 if len(oneletter) > 1: # do we have more than one oneletter string? |
|
52 if rest: |
|
53 # print '-> 1-character + rest' |
|
54 return open_paren + regex_opt_inner(rest, '') + '|' \ |
|
55 + make_charset(oneletter) + close_paren |
|
56 # print '-> only 1-character' |
|
57 return make_charset(oneletter) |
|
58 prefix = commonprefix(strings) |
|
59 if prefix: |
|
60 plen = len(prefix) |
|
61 # we have a prefix for all strings |
|
62 # print '-> prefix:', prefix |
|
63 return open_paren + escape(prefix) \ |
|
64 + regex_opt_inner([s[plen:] for s in strings], '(?:') \ |
|
65 + close_paren |
|
66 # is there a suffix? |
|
67 strings_rev = [s[::-1] for s in strings] |
|
68 suffix = commonprefix(strings_rev) |
|
69 if suffix: |
|
70 slen = len(suffix) |
|
71 # print '-> suffix:', suffix[::-1] |
|
72 return open_paren \ |
|
73 + regex_opt_inner(sorted(s[:-slen] for s in strings), '(?:') \ |
|
74 + escape(suffix[::-1]) + close_paren |
|
75 # recurse on common 1-string prefixes |
|
76 # print '-> last resort' |
|
77 return open_paren + \ |
|
78 '|'.join(regex_opt_inner(list(group[1]), '') |
|
79 for group in groupby(strings, lambda s: s[0] == first[0])) \ |
|
80 + close_paren |
|
81 |
|
82 |
|
83 def regex_opt(strings, prefix='', suffix=''): |
|
84 """Return a compiled regex that matches any string in the given list. |
|
85 |
|
86 The strings to match must be literal strings, not regexes. They will be |
|
87 regex-escaped. |
|
88 |
|
89 *prefix* and *suffix* are pre- and appended to the final regex. |
|
90 """ |
|
91 strings = sorted(strings) |
|
92 return prefix + regex_opt_inner(strings, '(') + suffix |