eric6/ThirdParty/Pygments/pygments/regexopt.py

Tue, 15 Sep 2020 19:09:05 +0200

author
Detlev Offenbach <detlev@die-offenbachs.de>
date
Tue, 15 Sep 2020 19:09:05 +0200
changeset 7701
25f42e208e08
parent 7547
21b0534faebc
child 7983
54c5cfbb1e29
permissions
-rw-r--r--

Pygments: updated to 2.7.0.

7701
25f42e208e08 Pygments: updated to 2.7.0.
Detlev Offenbach <detlev@die-offenbachs.de>
parents: 7547
diff changeset
1 # -*- coding: utf-8 -*-
25f42e208e08 Pygments: updated to 2.7.0.
Detlev Offenbach <detlev@die-offenbachs.de>
parents: 7547
diff changeset
2 """
25f42e208e08 Pygments: updated to 2.7.0.
Detlev Offenbach <detlev@die-offenbachs.de>
parents: 7547
diff changeset
3 pygments.regexopt
25f42e208e08 Pygments: updated to 2.7.0.
Detlev Offenbach <detlev@die-offenbachs.de>
parents: 7547
diff changeset
4 ~~~~~~~~~~~~~~~~~
25f42e208e08 Pygments: updated to 2.7.0.
Detlev Offenbach <detlev@die-offenbachs.de>
parents: 7547
diff changeset
5
25f42e208e08 Pygments: updated to 2.7.0.
Detlev Offenbach <detlev@die-offenbachs.de>
parents: 7547
diff changeset
6 An algorithm that generates optimized regexes for matching long lists of
25f42e208e08 Pygments: updated to 2.7.0.
Detlev Offenbach <detlev@die-offenbachs.de>
parents: 7547
diff changeset
7 literal strings.
25f42e208e08 Pygments: updated to 2.7.0.
Detlev Offenbach <detlev@die-offenbachs.de>
parents: 7547
diff changeset
8
25f42e208e08 Pygments: updated to 2.7.0.
Detlev Offenbach <detlev@die-offenbachs.de>
parents: 7547
diff changeset
9 :copyright: Copyright 2006-2020 by the Pygments team, see AUTHORS.
25f42e208e08 Pygments: updated to 2.7.0.
Detlev Offenbach <detlev@die-offenbachs.de>
parents: 7547
diff changeset
10 :license: BSD, see LICENSE for details.
25f42e208e08 Pygments: updated to 2.7.0.
Detlev Offenbach <detlev@die-offenbachs.de>
parents: 7547
diff changeset
11 """
25f42e208e08 Pygments: updated to 2.7.0.
Detlev Offenbach <detlev@die-offenbachs.de>
parents: 7547
diff changeset
12
25f42e208e08 Pygments: updated to 2.7.0.
Detlev Offenbach <detlev@die-offenbachs.de>
parents: 7547
diff changeset
13 import re
25f42e208e08 Pygments: updated to 2.7.0.
Detlev Offenbach <detlev@die-offenbachs.de>
parents: 7547
diff changeset
14 from re import escape
25f42e208e08 Pygments: updated to 2.7.0.
Detlev Offenbach <detlev@die-offenbachs.de>
parents: 7547
diff changeset
15 from os.path import commonprefix
25f42e208e08 Pygments: updated to 2.7.0.
Detlev Offenbach <detlev@die-offenbachs.de>
parents: 7547
diff changeset
16 from itertools import groupby
25f42e208e08 Pygments: updated to 2.7.0.
Detlev Offenbach <detlev@die-offenbachs.de>
parents: 7547
diff changeset
17 from operator import itemgetter
25f42e208e08 Pygments: updated to 2.7.0.
Detlev Offenbach <detlev@die-offenbachs.de>
parents: 7547
diff changeset
18
25f42e208e08 Pygments: updated to 2.7.0.
Detlev Offenbach <detlev@die-offenbachs.de>
parents: 7547
diff changeset
# Characters that must be backslash-escaped inside a regex character class.
# '[' is included as well: an unescaped '[' inside a set is reported by
# Python 3.7+ as a "possible nested set" (FutureWarning).
CS_ESCAPE = re.compile(r'[\[\^\\\-\]]')
# Key function returning the first item of a sequence.
FIRST_ELEMENT = itemgetter(0)
25f42e208e08 Pygments: updated to 2.7.0.
Detlev Offenbach <detlev@die-offenbachs.de>
parents: 7547
diff changeset
21
25f42e208e08 Pygments: updated to 2.7.0.
Detlev Offenbach <detlev@die-offenbachs.de>
parents: 7547
diff changeset
22
25f42e208e08 Pygments: updated to 2.7.0.
Detlev Offenbach <detlev@die-offenbachs.de>
parents: 7547
diff changeset
def make_charset(letters):
    """Return a regex character class matching any one of *letters*.

    *letters* is an iterable of strings; they are concatenated and every
    character matched by ``CS_ESCAPE`` is prefixed with a backslash before
    the result is wrapped in ``[...]``.
    """
    def _backslash(match):
        # Keep the matched character, just put a backslash in front of it.
        return '\\' + match.group()

    members = ''.join(letters)
    return '[' + CS_ESCAPE.sub(_backslash, members) + ']'
25f42e208e08 Pygments: updated to 2.7.0.
Detlev Offenbach <detlev@die-offenbachs.de>
parents: 7547
diff changeset
25
25f42e208e08 Pygments: updated to 2.7.0.
Detlev Offenbach <detlev@die-offenbachs.de>
parents: 7547
diff changeset
26
25f42e208e08 Pygments: updated to 2.7.0.
Detlev Offenbach <detlev@die-offenbachs.de>
parents: 7547
diff changeset
def regex_opt_inner(strings, open_paren):
    """Return a regex that matches any string in the sorted list of strings.

    *strings* must already be sorted (so that an empty string, if present,
    comes first); every string is treated as a literal and regex-escaped.

    *open_paren* is the group opener the result is wrapped in, e.g. ``'('``
    or ``'(?:'``.  Pass ``''`` to get the bare alternatives without a
    surrounding group.  An empty *strings* list yields ``''``.
    """
    close_paren = ')' if open_paren else ''
    if not strings:
        return ''
    first = strings[0]
    if len(strings) == 1:
        return open_paren + escape(first) + close_paren
    if not first:
        # The empty string is one of the alternatives: make the regex for
        # the remaining strings optional.
        return (open_paren + regex_opt_inner(strings[1:], '(?:')
                + '?' + close_paren)
    if len(first) == 1:
        # multiple one-char strings? make a charset
        oneletter = [s for s in strings if len(s) == 1]
        rest = [s for s in strings if len(s) != 1]
        if len(oneletter) > 1:  # do we have more than one oneletter string?
            if rest:
                # Longer strings are tried before the charset.
                return (open_paren + regex_opt_inner(rest, '') + '|'
                        + make_charset(oneletter) + close_paren)
            return open_paren + make_charset(oneletter) + close_paren
    prefix = commonprefix(strings)
    if prefix:
        # All strings share a prefix: emit it once and recurse on the tails.
        plen = len(prefix)
        return (open_paren + escape(prefix)
                + regex_opt_inner([s[plen:] for s in strings], '(?:')
                + close_paren)
    # is there a suffix?
    strings_rev = [s[::-1] for s in strings]
    suffix = commonprefix(strings_rev)
    if suffix:
        slen = len(suffix)
        # Stripping the suffix can change the ordering, hence the re-sort.
        return (open_paren
                + regex_opt_inner(sorted(s[:-slen] for s in strings), '(?:')
                + escape(suffix[::-1]) + close_paren)
    # Last resort: split into the strings starting with the same character
    # as the first string and all the others, and recurse on each group.
    return (open_paren
            + '|'.join(regex_opt_inner(list(group[1]), '')
                       for group in groupby(strings,
                                            lambda s: s[0] == first[0]))
            + close_paren)
25f42e208e08 Pygments: updated to 2.7.0.
Detlev Offenbach <detlev@die-offenbachs.de>
parents: 7547
diff changeset
81
25f42e208e08 Pygments: updated to 2.7.0.
Detlev Offenbach <detlev@die-offenbachs.de>
parents: 7547
diff changeset
82
25f42e208e08 Pygments: updated to 2.7.0.
Detlev Offenbach <detlev@die-offenbachs.de>
parents: 7547
diff changeset
def regex_opt(strings, prefix='', suffix=''):
    """Build a regex pattern string matching any string in *strings*.

    The entries of *strings* are literal strings, not regexes; they are
    sorted and regex-escaped while the pattern is generated, and the
    alternatives are wrapped in one capturing group.

    *prefix* and *suffix* are put verbatim in front of and behind the
    generated group.
    """
    ordered = list(strings)
    ordered.sort()
    body = regex_opt_inner(ordered, '(')
    return prefix + body + suffix

eric ide

mercurial