eric6/ThirdParty/Pygments/pygments/regexopt.py

changeset 6942
2602857055c5
parent 5713
6762afd9f963
child 7547
21b0534faebc
equal deleted inserted replaced
6941:f99d60d6b59b 6942:2602857055c5
1 # -*- coding: utf-8 -*-
2 """
3 pygments.regexopt
4 ~~~~~~~~~~~~~~~~~
5
6 An algorithm that generates optimized regexes for matching long lists of
7 literal strings.
8
9 :copyright: Copyright 2006-2017 by the Pygments team, see AUTHORS.
10 :license: BSD, see LICENSE for details.
11 """
12
13 import re
14 from re import escape
15 from os.path import commonprefix
16 from itertools import groupby
17 from operator import itemgetter
18
19 CS_ESCAPE = re.compile(r'[\^\\\-\]]')
20 FIRST_ELEMENT = itemgetter(0)
21
22
23 def make_charset(letters):
24 return '[' + CS_ESCAPE.sub(lambda m: '\\' + m.group(), ''.join(letters)) + ']'
25
26
27 def regex_opt_inner(strings, open_paren):
28 """Return a regex that matches any string in the sorted list of strings."""
29 close_paren = open_paren and ')' or ''
30 # print strings, repr(open_paren)
31 if not strings:
32 # print '-> nothing left'
33 return ''
34 first = strings[0]
35 if len(strings) == 1:
36 # print '-> only 1 string'
37 return open_paren + escape(first) + close_paren
38 if not first:
39 # print '-> first string empty'
40 return open_paren + regex_opt_inner(strings[1:], '(?:') \
41 + '?' + close_paren
42 if len(first) == 1:
43 # multiple one-char strings? make a charset
44 oneletter = []
45 rest = []
46 for s in strings:
47 if len(s) == 1:
48 oneletter.append(s)
49 else:
50 rest.append(s)
51 if len(oneletter) > 1: # do we have more than one oneletter string?
52 if rest:
53 # print '-> 1-character + rest'
54 return open_paren + regex_opt_inner(rest, '') + '|' \
55 + make_charset(oneletter) + close_paren
56 # print '-> only 1-character'
57 return open_paren + make_charset(oneletter) + close_paren
58 prefix = commonprefix(strings)
59 if prefix:
60 plen = len(prefix)
61 # we have a prefix for all strings
62 # print '-> prefix:', prefix
63 return open_paren + escape(prefix) \
64 + regex_opt_inner([s[plen:] for s in strings], '(?:') \
65 + close_paren
66 # is there a suffix?
67 strings_rev = [s[::-1] for s in strings]
68 suffix = commonprefix(strings_rev)
69 if suffix:
70 slen = len(suffix)
71 # print '-> suffix:', suffix[::-1]
72 return open_paren \
73 + regex_opt_inner(sorted(s[:-slen] for s in strings), '(?:') \
74 + escape(suffix[::-1]) + close_paren
75 # recurse on common 1-string prefixes
76 # print '-> last resort'
77 return open_paren + \
78 '|'.join(regex_opt_inner(list(group[1]), '')
79 for group in groupby(strings, lambda s: s[0] == first[0])) \
80 + close_paren
81
82
83 def regex_opt(strings, prefix='', suffix=''):
84 """Return a compiled regex that matches any string in the given list.
85
86 The strings to match must be literal strings, not regexes. They will be
87 regex-escaped.
88
89 *prefix* and *suffix* are pre- and appended to the final regex.
90 """
91 strings = sorted(strings)
92 return prefix + regex_opt_inner(strings, '(') + suffix

eric ide

mercurial