# -*- coding: utf-8 -*-
"""
    pygments.lexers.mime
    ~~~~~~~~~~~~~~~~~~~~

    Lexer for Multipurpose Internet Mail Extensions (MIME) data.

    :copyright: Copyright 2006-2019 by the Pygments team, see AUTHORS.
    :license: BSD, see LICENSE for details.
"""

import re

from pygments.lexer import RegexLexer, include
from pygments.lexers import get_lexer_for_mimetype
from pygments.token import Text, Name, String, Operator, Comment, Other
from pygments.util import get_int_opt, ClassNotFound

__all__ = ["MIMELexer"]


class MIMELexer(RegexLexer):
    """
    Lexer for Multipurpose Internet Mail Extensions (MIME) data. This lexer
    is designed to process nested multipart data.

    It assumes that the given data contains both a header and a body
    (separated by an empty line). If no valid header is found, the entire
    data is treated as body.

    Additional options accepted:

    `MIME-max-level`
        Max recursion level for nested MIME structure. Any negative number
        is treated as unlimited. (default: -1)

    `Content-Type`
        Treat the data as the given content type. Useful when the header is
        missing; otherwise this lexer tries to parse the content type from
        the header. (default: `text/plain`)

    `Multipart-Boundary`
        Set the default multipart boundary delimiter. This option is only
        used when `Content-Type` is `multipart` and the header is missing;
        otherwise this lexer tries to parse the boundary from the header.
        (default: None)

    `Content-Transfer-Encoding`
        Treat the data as the given encoding; otherwise this lexer tries to
        parse the encoding from the header. (default: None)
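
    Usage sketch (``message.eml`` is a placeholder path; ``highlight`` and
    ``HtmlFormatter`` are standard Pygments helpers)::

        from pygments import highlight
        from pygments.formatters import HtmlFormatter
        from pygments.lexers.mime import MIMELexer

        with open("message.eml") as f:
            print(highlight(f.read(), MIMELexer(), HtmlFormatter()))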

    .. versionadded:: 2.5
    """

    name = "MIME"
    aliases = ["mime"]
    mimetypes = ["multipart/mixed",
                 "multipart/related",
                 "multipart/alternative"]

    def __init__(self, **options):
        super(MIMELexer, self).__init__(**options)
        self.boundary = options.get("Multipart-Boundary")
        self.content_transfer_encoding = options.get(
            "Content-Transfer-Encoding")
        self.content_type = options.get("Content-Type", "text/plain")
        self.max_nested_level = get_int_opt(options, "MIME-max-level", -1)
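
    # Options use their documented (hyphenated) names; they can be passed
    # through keyword dict unpacking, e.g. (illustrative values):
    #   MIMELexer(**{"Content-Type": "multipart/mixed",
    #                "Multipart-Boundary": "===boundary==="})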

    def analyse_text(text):
        try:
            header, body = text.strip().split("\n\n", 1)
            if not body.strip():
                return 0.1

            # strip every well-formed header field (same shape as the
            # header rule in "root"); any leftover text means the header
            # block is not valid MIME
            header_re = re.compile(r"^[\w-]+: *[\s\S]*?\n(?![ \t])",
                                   re.MULTILINE)
            invalid_headers = header_re.sub("", header + "\n")
            if invalid_headers.strip():
                return 0.1
            else:
                return 1

        except ValueError:
            return 0.1
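
    # Callback for the header rule in "root". Fields listed in
    # ``attention_headers`` are re-lexed in a state of the same name so
    # their parameters get fine-grained tokens; a typical field handled
    # here looks like (values are illustrative):
    #
    #   Content-Type: multipart/mixed; boundary="----=_Part_1"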
    def get_header_tokens(self, match):
        field = match.group(1)

        if field.lower() in self.attention_headers:
            yield match.start(1), Name.Tag, field + ":"
            yield match.start(2), Text.Whitespace, match.group(2)

            pos = match.end(2)
            body = match.group(3)
            for i, t, v in self.get_tokens_unprocessed(
                    body, ("root", field.lower())):
                yield pos + i, t, v

        else:
            yield match.start(), Comment, match.group()
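
    # A multipart body has the following shape (the boundary value is
    # illustrative; RFC 2046 allows text before the first delimiter and
    # after the closing one):
    #
    #   preamble text
    #   --BOUNDARY
    #   ...body part...
    #   --BOUNDARY
    #   ...body part...
    #   --BOUNDARY--
    #   epilogue text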
    def get_body_tokens(self, match):
        pos_body_start = match.start()
        entire_body = match.group()

        # skip first newline
        if entire_body[0] == '\n':
            yield pos_body_start, Text.Whitespace, u'\n'
            pos_body_start = pos_body_start + 1
            entire_body = entire_body[1:]

        # if it is not multipart, lex the whole body as a single part
        if not self.content_type.startswith("multipart") or not self.boundary:
            for i, t, v in self.get_bodypart_tokens(entire_body):
                yield pos_body_start + i, t, v
            return

        # find boundary
        bdry_pattern = r"^--%s(--)?\n" % re.escape(self.boundary)
        bdry_matcher = re.compile(bdry_pattern, re.MULTILINE)

        # some data has prefix text before the first boundary
        m = bdry_matcher.search(entire_body)
        if m:
            pos_part_start = pos_body_start + m.end()
            pos_iter_start = m.end()
            yield pos_body_start, Text, entire_body[:m.start()]
            yield pos_body_start + m.start(), String.Delimiter, m.group()
        else:
            pos_part_start = pos_body_start
            pos_iter_start = 0

        # process tokens of each body part
        for m in bdry_matcher.finditer(entire_body, pos_iter_start):
            # body part
            lpos_start = pos_part_start - pos_body_start
            lpos_end = m.start()
            part = entire_body[lpos_start:lpos_end]
            for i, t, v in self.get_bodypart_tokens(part):
                yield pos_part_start + i, t, v

            # boundary
            yield pos_body_start + lpos_end, String.Delimiter, m.group()
            pos_part_start = pos_body_start + m.end()

        # some data has suffix text after the last boundary
        lpos_start = pos_part_start - pos_body_start
        if lpos_start != len(entire_body):
            yield pos_part_start, Text, entire_body[lpos_start:]

    def get_bodypart_tokens(self, text):
        # return the text as-is if:
        # * there is no content
        # * no content type is specified
        # * the content encoding is not readable
        # * the max recursion level is exceeded
        if not text.strip() or not self.content_type:
            return [(0, Other, text)]

        cte = self.content_transfer_encoding
        if cte and cte not in {"8bit", "7bit", "quoted-printable"}:
            return [(0, Other, text)]

        if self.max_nested_level == 0:
            return [(0, Other, text)]

        # get lexer
        try:
            lexer = get_lexer_for_mimetype(self.content_type)
        except ClassNotFound:
            return [(0, Other, text)]

        if isinstance(lexer, type(self)):
            lexer.max_nested_level = self.max_nested_level - 1

        return lexer.get_tokens_unprocessed(text)
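
    # Callback for the media-type rule in "content-type". For a match such
    # as "text/html" (illustrative), this stores the full type and yields
    # any leading whitespace, Name.Label("text"), String.Delimiter("/") and
    # Name.Label("html").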
    def store_content_type(self, match):
        self.content_type = match.group(1)

        prefix_len = match.start(1) - match.start(0)
        yield match.start(0), Text.Whitespace, match.group(0)[:prefix_len]
        yield match.start(1), Name.Label, match.group(2)
        yield match.end(2), String.Delimiter, u"/"
        yield match.start(3), Name.Label, match.group(3)

    def get_content_type_subtokens(self, match):
        yield match.start(1), Text, match.group(1)
        yield match.start(2), Text.Whitespace, match.group(2)
        yield match.start(3), Name.Attribute, match.group(3)
        yield match.start(4), Operator, match.group(4)
        yield match.start(5), String, match.group(5)

        if match.group(3).lower() == "boundary":
            boundary = match.group(5).strip()
            if boundary[0] == '"' and boundary[-1] == '"':
                boundary = boundary[1:-1]
            self.boundary = boundary

    def store_content_transfer_encoding(self, match):
        self.content_transfer_encoding = match.group(0).lower()
        yield match.start(0), Name.Constant, match.group(0)
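
    # header fields whose value is re-lexed in the dedicated states below;
    # every other field is emitted as a single Comment token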
    attention_headers = {"content-type", "content-transfer-encoding"}
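
    # "root" consumes header fields line by line; the first empty line
    # matches ^$ and hands the remainder of the input to get_body_tokens
    # in a single match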
    tokens = {
        "root": [
            (r"^([\w-]+):( *)([\s\S]*?\n)(?![ \t])", get_header_tokens),
            (r"^$[\s\S]+", get_body_tokens),
        ],
        "header": [
            # folding
            (r"\n[ \t]", Text.Whitespace),
            (r"\n(?![ \t])", Text.Whitespace, "#pop"),
        ],
        "content-type": [
            include("header"),
            (
                r"^\s*((multipart|application|audio|font|image|model|text|video"
                r"|message)/([\w-]+))",
                store_content_type,
            ),
            (r'(;)((?:[ \t]|\n[ \t])*)([\w:-]+)(=)([\s\S]*?)(?=;|\n(?![ \t]))',
             get_content_type_subtokens),
            (r';[ \t]*\n(?![ \t])', Text, '#pop'),
        ],
        "content-transfer-encoding": [
            include("header"),
            (r"([\w-]+)", store_content_transfer_encoding),
        ],
    }