eric6/ThirdParty/Pygments/pygments/lexers/mime.py

# -*- coding: utf-8 -*-
"""
    pygments.lexers.mime
    ~~~~~~~~~~~~~~~~~~~~

    Lexer for Multipurpose Internet Mail Extensions (MIME) data.

    :copyright: Copyright 2006-2019 by the Pygments team, see AUTHORS.
    :license: BSD, see LICENSE for details.
"""

import re

from pygments.lexer import RegexLexer, include
from pygments.lexers import get_lexer_for_mimetype
from pygments.token import Text, Name, String, Operator, Comment, Other
from pygments.util import get_int_opt, ClassNotFound

__all__ = ["MIMELexer"]


class MIMELexer(RegexLexer):
    """
    Lexer for Multipurpose Internet Mail Extensions (MIME) data. This lexer
    is designed to process nested multipart data.

    It assumes that the given data contains both header and body (separated
    by an empty line). If no valid header is found, the entire data is
    treated as body.

    Additional options accepted:

    `MIME-max-level`
        Maximum recursion level for nested MIME structures. Any negative
        number is treated as unlimited. (default: -1)

    `Content-Type`
        Treat the data as the given content type. Useful when the header is
        missing; otherwise this lexer parses the content type from the
        header. (default: `text/plain`)

    `Multipart-Boundary`
        Set the default multipart boundary delimiter. This option is only
        used when `Content-Type` is `multipart` and the header is missing;
        otherwise this lexer parses the boundary from the header.
        (default: None)

    `Content-Transfer-Encoding`
        Treat the data as the given encoding; otherwise this lexer parses
        the encoding from the header. (default: None)

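    A minimal usage sketch (``message.eml`` here stands for any MIME
    message on disk; note that the hyphenated option names have to be
    passed via ``**`` unpacking, since they are not valid Python
    identifiers)::

        from pygments import highlight
        from pygments.formatters import TerminalFormatter
        from pygments.lexers.mime import MIMELexer

        with open("message.eml") as f:
            data = f.read()

        lexer = MIMELexer(**{"MIME-max-level": 2})
        print(highlight(data, lexer, TerminalFormatter()))
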
    .. versionadded:: 2.5
    """

    name = "MIME"
    aliases = ["mime"]
    mimetypes = ["multipart/mixed",
                 "multipart/related",
                 "multipart/alternative"]

    def __init__(self, **options):
        super(MIMELexer, self).__init__(**options)
        self.boundary = options.get("Multipart-Boundary")
        # the option keys use the hyphenated names documented in the class
        # docstring; the attribute names use underscores
        self.content_transfer_encoding = options.get("Content-Transfer-Encoding")
        self.content_type = options.get("Content-Type", "text/plain")
        self.max_nested_level = get_int_opt(options, "MIME-max-level", -1)

    def analyse_text(text):
        try:
            header, body = text.strip().split("\n\n", 1)
            if not body.strip():
                return 0.1

            # strip every well-formed "Field: value" header line, folded
            # continuation lines included; anything left over is not a
            # valid header
            header_re = re.compile(r"^[\w-]+: *[\s\S]*?\n(?![ \t])",
                                   re.MULTILINE)
            invalid_headers = header_re.sub("", header + "\n")
            if invalid_headers.strip():
                return 0.1
            else:
                return 1

        except ValueError:
            return 0.1

    def get_header_tokens(self, match):
        field = match.group(1)

        if field.lower() in self.attention_headers:
            yield match.start(1), Name.Tag, field + ":"
            yield match.start(2), Text.Whitespace, match.group(2)

            # lex the field body in the state named after the field, so
            # that e.g. "content-type" values are parsed further
            pos = match.end(2)
            body = match.group(3)
            for i, t, v in self.get_tokens_unprocessed(body, ("root", field.lower())):
                yield pos + i, t, v

        else:
            yield match.start(), Comment, match.group()

    def get_body_tokens(self, match):
        pos_body_start = match.start()
        entire_body = match.group()

        # skip first newline
        if entire_body[0] == '\n':
            yield pos_body_start, Text.Whitespace, u'\n'
            pos_body_start = pos_body_start + 1
            entire_body = entire_body[1:]

        # if it is not a multipart
        if not self.content_type.startswith("multipart") or not self.boundary:
            for i, t, v in self.get_bodypart_tokens(entire_body):
                yield pos_body_start + i, t, v
            return

        # find boundary
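        # e.g. for boundary "SEP" the delimiter lines look like
        #     --SEP       between two body parts
        #     --SEP--     closing the multipart body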
        bdry_pattern = r"^--%s(--)?\n" % re.escape(self.boundary)
        bdry_matcher = re.compile(bdry_pattern, re.MULTILINE)

        # some data has prefix text before first boundary
        m = bdry_matcher.search(entire_body)
        if m:
            pos_part_start = pos_body_start + m.end()
            pos_iter_start = m.end()
            yield pos_body_start, Text, entire_body[:m.start()]
            # the delimiter token starts at m.start(), not m.end()
            yield pos_body_start + m.start(), String.Delimiter, m.group()
        else:
            pos_part_start = pos_body_start
            pos_iter_start = 0

        # process tokens of each body part
        for m in bdry_matcher.finditer(entire_body, pos_iter_start):
            # bodypart
            lpos_start = pos_part_start - pos_body_start
            lpos_end = m.start()
            part = entire_body[lpos_start:lpos_end]
            for i, t, v in self.get_bodypart_tokens(part):
                yield pos_part_start + i, t, v

            # boundary
            yield pos_body_start + lpos_end, String.Delimiter, m.group()
            pos_part_start = pos_body_start + m.end()

        # some data has suffix text after last boundary
        lpos_start = pos_part_start - pos_body_start
        if lpos_start != len(entire_body):
            yield pos_part_start, Text, entire_body[lpos_start:]

    def get_bodypart_tokens(self, text):
        # return the text unhighlighted if:
        # * there is no content
        # * no content type is specified
        # * the content encoding is not readable
        # * the maximum recursion depth is exceeded
        if not text.strip() or not self.content_type:
            return [(0, Other, text)]

        cte = self.content_transfer_encoding
        if cte and cte not in {"8bit", "7bit", "quoted-printable"}:
            return [(0, Other, text)]

        if self.max_nested_level == 0:
            return [(0, Other, text)]

        # get lexer
        try:
            lexer = get_lexer_for_mimetype(self.content_type)
        except ClassNotFound:
            return [(0, Other, text)]

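        # a nested multipart body is handled by a fresh MIMELexer instance;
        # hand it one less level of nesting (negative values never reach 0,
        # i.e. unlimited)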
        if isinstance(lexer, type(self)):
            lexer.max_nested_level = self.max_nested_level - 1

        return lexer.get_tokens_unprocessed(text)

    def store_content_type(self, match):
        self.content_type = match.group(1)

        prefix_len = match.start(1) - match.start(0)
        yield match.start(0), Text.Whitespace, match.group(0)[:prefix_len]
        yield match.start(2), Name.Label, match.group(2)
        yield match.end(2), String.Delimiter, u"/"
        yield match.start(3), Name.Label, match.group(3)

    def get_content_type_subtokens(self, match):
        yield match.start(1), Text, match.group(1)
        yield match.start(2), Text.Whitespace, match.group(2)
        yield match.start(3), Name.Attribute, match.group(3)
        yield match.start(4), Operator, match.group(4)
        yield match.start(5), String, match.group(5)

        if match.group(3).lower() == "boundary":
            # remember the boundary parameter, with surrounding quotes
            # stripped, for splitting the body later
            boundary = match.group(5).strip()
            if boundary[0] == '"' and boundary[-1] == '"':
                boundary = boundary[1:-1]
            self.boundary = boundary

    def store_content_transfer_encoding(self, match):
        self.content_transfer_encoding = match.group(0).lower()
        yield match.start(0), Name.Constant, match.group(0)

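    # headers whose values get parsed further by get_header_tokens; all
    # other header lines are emitted wholesale as Comment tokens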
    attention_headers = {"content-type", "content-transfer-encoding"}

    tokens = {
        "root": [
            (r"^([\w-]+):( *)([\s\S]*?\n)(?![ \t])", get_header_tokens),
            (r"^$[\s\S]+", get_body_tokens),
        ],
        "header": [
            # folding
            (r"\n[ \t]", Text.Whitespace),
            (r"\n(?![ \t])", Text.Whitespace, "#pop"),
        ],
        "content-type": [
            include("header"),
            (
                r"^\s*((multipart|application|audio|font|image|model|text|video"
                r"|message)/([\w-]+))",
                store_content_type,
            ),
            (r'(;)((?:[ \t]|\n[ \t])*)([\w:-]+)(=)([\s\S]*?)(?=;|\n(?![ \t]))',
             get_content_type_subtokens),
            (r';[ \t]*\n(?![ \t])', Text, '#pop'),
        ],
        "content-transfer-encoding": [
            include("header"),
            (r"([\w-]+)", store_content_transfer_encoding),
        ],
    }
