ThirdParty/Pygments/pygments/regexopt.py

changeset 4172
4f20dba37ab6
child 4697
c2e9bf425554
equal deleted inserted replaced
4170:8bc578136279 4172:4f20dba37ab6
1 # -*- coding: utf-8 -*-
2 """
3 pygments.regexopt
4 ~~~~~~~~~~~~~~~~~
5
6 An algorithm that generates optimized regexes for matching long lists of
7 literal strings.
8
9 :copyright: Copyright 2006-2014 by the Pygments team, see AUTHORS.
10 :license: BSD, see LICENSE for details.
11 """
12
13 import re
14 from re import escape
15 from os.path import commonprefix
16 from itertools import groupby
17 from operator import itemgetter
18
19 CS_ESCAPE = re.compile(r'[\^\\\-\]]')
20 FIRST_ELEMENT = itemgetter(0)
21
22
23 def make_charset(letters):
24 return '[' + CS_ESCAPE.sub(lambda m: '\\' + m.group(), ''.join(letters)) + ']'
25
26
27 def regex_opt_inner(strings, open_paren):
28 """Return a regex that matches any string in the sorted list of strings."""
29 close_paren = open_paren and ')' or ''
30 # print strings, repr(open_paren)
31 if not strings:
32 # print '-> nothing left'
33 return ''
34 first = strings[0]
35 if len(strings) == 1:
36 # print '-> only 1 string'
37 return open_paren + escape(first) + close_paren
38 if not first:
39 # print '-> first string empty'
40 return open_paren + regex_opt_inner(strings[1:], '(?:') \
41 + '?' + close_paren
42 if len(first) == 1:
43 # multiple one-char strings? make a charset
44 oneletter = []
45 rest = []
46 for s in strings:
47 if len(s) == 1:
48 oneletter.append(s)
49 else:
50 rest.append(s)
51 if len(oneletter) > 1: # do we have more than one oneletter string?
52 if rest:
53 # print '-> 1-character + rest'
54 return open_paren + regex_opt_inner(rest, '') + '|' \
55 + make_charset(oneletter) + close_paren
56 # print '-> only 1-character'
57 return make_charset(oneletter)
58 prefix = commonprefix(strings)
59 if prefix:
60 plen = len(prefix)
61 # we have a prefix for all strings
62 # print '-> prefix:', prefix
63 return open_paren + escape(prefix) \
64 + regex_opt_inner([s[plen:] for s in strings], '(?:') \
65 + close_paren
66 # is there a suffix?
67 strings_rev = [s[::-1] for s in strings]
68 suffix = commonprefix(strings_rev)
69 if suffix:
70 slen = len(suffix)
71 # print '-> suffix:', suffix[::-1]
72 return open_paren \
73 + regex_opt_inner(sorted(s[:-slen] for s in strings), '(?:') \
74 + escape(suffix[::-1]) + close_paren
75 # recurse on common 1-string prefixes
76 # print '-> last resort'
77 return open_paren + \
78 '|'.join(regex_opt_inner(list(group[1]), '')
79 for group in groupby(strings, lambda s: s[0] == first[0])) \
80 + close_paren
81
82
83 def regex_opt(strings, prefix='', suffix=''):
84 """Return a compiled regex that matches any string in the given list.
85
86 The strings to match must be literal strings, not regexes. They will be
87 regex-escaped.
88
89 *prefix* and *suffix* are pre- and appended to the final regex.
90 """
91 strings = sorted(strings)
92 return prefix + regex_opt_inner(strings, '(') + suffix

eric ide

mercurial