|
1 # -*- coding: utf-8 -*- |
|
2 |
|
3 # Copyright (c) 2015 - 2021 Detlev Offenbach <detlev@die-offenbachs.de> |
|
4 # |
|
5 |
|
6 """ |
|
7 Module implementing a Content-Disposition parser iaw. RFC 6266. |
|
8 """ |
|
9 |
|
10 # |
|
11 # This code is adapted from the rfc6266.py module of qutebrowser. |
|
12 # Original copyright 2014-2015 Florian Bruhin (The Compiler) |
|
13 # <mail@qutebrowser.org> |
|
14 # |
|
15 |
|
16 import urllib.parse as parse |
|
17 import collections |
|
18 import string |
|
19 import re |
|
20 |
|
21 try: |
|
22 import pypeg2 as peg |
|
23 |
|
class UniqueNamespace(peg.Namespace):
    """
    Namespace variant for pyPEG2 that refuses to set the same key twice.
    """
    def __setitem__(self, key, value):
        """
        Special method to store an item under a key that is not used yet.

        @param key key for the item
        @param value value of the item
        @exception DuplicateParamError raised if the key is already present
        """
        key_taken = key in self
        if key_taken:
            raise DuplicateParamError(key)
        super().__setitem__(key, value)
|
38 |
|
# Character classes taken from RFC 2616
separator_chars = "()<>@,;:\\\"/[]?={} \t"    # __IGNORE_WARNING_M613__
ctl_chars = ''.join(map(chr, range(32))) + chr(127)
nontoken_chars = separator_chars + ctl_chars

# Character classes taken from RFC 5987
attr_chars_nonalnum = '!#$&+-.^_`|~'
attr_chars = ''.join(
    [string.ascii_letters, string.digits, attr_chars_nonalnum])

# RFC 5987 gives this alternative construction of the token character class
token_chars = attr_chars + "*'%"    # __IGNORE_WARNING_M601__

# Definitions from https://tools.ietf.org/html/rfc2616#section-2.2
# The token is built from attr_chars (a positive character class) instead
# of AnyBut, because the latter might include non-ascii octets.
token_re = '[' + re.escape(token_chars) + ']+'
|
55 |
|
class Token(str):
    """
    String type matching a token as defined in RFC 2616, Section 2.2.
    """
    # one or more characters of the token character class
    grammar = re.compile(token_re)
|
61 |
|
62 # RFC 2616 says some linear whitespace (LWS) is in fact allowed in text |
|
63 # and qdtext; however it also mentions folding that whitespace into |
|
64 # a single SP (which isn't in CTL) before interpretation. |
|
# Assume the caller already did that folding when parsing headers.
|
66 |
|
67 # Note: qdtext also allows non-ascii, which we choose to parse |
|
68 # as ISO-8859-1; rejecting it entirely would also be permitted. |
|
69 # Some broken browsers attempt encoding-sniffing, which is broken |
|
70 # because the spec only allows iso, and because encoding-sniffing |
|
71 # can mangle valid values. |
|
72 # Everything else in this grammar (including RFC 5987 ext values) |
|
73 # is in an ascii-safe encoding. |
|
74 |
|
# One character of quoted text: anything but '"' and the control characters.
qdtext_re = '[^"' + re.escape(ctl_chars) + ']'
# A backslash followed by any one of the 128 7-bit characters.
quoted_pair_re = r'\\[' + re.escape(''.join(map(chr, range(128)))) + ']'
|
78 |
|
class QuotedString(str):
    """
    String type matching a quoted string (RFC 2616, Section 2.2).
    """
    # a double-quoted run of quoted pairs and/or quoted text characters
    grammar = re.compile('"(' + quoted_pair_re + '|' + qdtext_re + ')+"')

    def __str__(self):
        """
        Special method to get the string content without quoting.

        @return text with the enclosing quotes removed and each
            backslash escape replaced by the escaped character
        @rtype str
        """
        unquoted = super().__str__()[1:-1]
        return re.sub(r'\\(.)', r'\1', unquoted)
|
90 |
|
class Value(str):
    """
    String type for a parameter value (RFC 2616, Section 3.6).
    """
    # alternatives: a bare token or a quoted string
    grammar = [re.compile(token_re), QuotedString]
|
96 |
|
class Charset(str):
    """
    String type for a charset name (RFC5987, Section 3.2.1).
    """
    # Only these two charsets are accepted (matched regardless of case);
    # the spec reserves all others for future evolutions.
    grammar = re.compile('UTF-8|ISO-8859-1', re.IGNORECASE)
|
104 |
|
class Language(str):
    """
    String type for a language-tag (RFC 5646, Section 2.1).

    Fixme: The grammar is only an approximation of the RFC so far.
    https://github.com/The-Compiler/qutebrowser/issues/105
    """
    # any non-empty run of ASCII letters, digits and hyphens
    grammar = re.compile('[A-Za-z0-9-]+')
|
113 |
|
# a single attribute character
attr_char_re = '[' + re.escape(attr_chars) + ']'
# a percent sign followed by exactly two hexadecimal digits
hex_digit_re = '%[{0}]{{2}}'.format(string.hexdigits)
|
116 |
|
class ValueChars(str):
    """
    String type for the character data of an attribute value.

    Fixme: Could this be merged with Value?
    https://github.com/The-Compiler/qutebrowser/issues/105
    """
    # possibly empty run of attribute characters and percent escapes
    grammar = re.compile('(' + attr_char_re + '|' + hex_digit_re + ')*')
|
125 |
|
class ExtValue(peg.List):
    """
    List type for the ext-value of an attribute (RFC 5987, Section 3.2).
    """
    # charset ' [language] ' value-chars
    grammar = peg.contiguous(
        Charset,
        "'",
        peg.optional(Language),
        "'",
        ValueChars,
    )
|
132 |
|
class ExtToken(peg.Symbol):
    """
    Symbol for a token introducing an extended value (RFC 6266,
    Section 4.1).
    """
    # a token whose final character is an asterisk
    regex = re.compile(token_re + r'\*')

    def __str__(self):
        """
        Special method to get the lower-cased symbol text.

        @return symbol text converted to lower case
        @rtype str
        """
        text = super().__str__()
        return text.lower()
|
141 |
|
class NoExtToken(peg.Symbol):
    """
    Symbol for a token introducing a normal value (RFC 6266, Section 4.1).
    """
    # a token whose final character is not an asterisk
    regex = re.compile(token_re + r'(?<!\*)')

    def __str__(self):
        """
        Special method to get the lower-cased symbol text.

        @return symbol text converted to lower case
        @rtype str
        """
        text = super().__str__()
        return text.lower()
|
150 |
|
class DispositionParm(str):
    """
    String type for a regular parameter of the Disposition-Type header
    (RFC6266, Section 4.1).
    """
    # name (kept in the 'name' attribute), '=' and the value
    grammar = (peg.attr('name', NoExtToken), '=', Value)
|
156 |
|
class ExtDispositionParm:
    """
    Class for an extended parameter (RFC6266, Section 4.1).
    """
    # extended name (kept in the 'name' attribute), '=' and the ext-value
    grammar = (peg.attr('name', ExtToken), '=', ExtValue)

    def __init__(self, value, name=None):
        """
        Constructor

        @param value value of the parameter
        @param name name of the parameter
        """
        self.value = value
        self.name = name
|
166 |
|
class DispositionType(peg.List):
    """
    List type for the disposition type (RFC6266, Section 4.1).
    """
    # alternatives: one of the two standard types (any case) or any token
    grammar = [re.compile('(inline|attachment)', re.IGNORECASE), Token]
|
172 |
|
class DispositionParmList(UniqueNamespace):
    """
    Namespace for the disposition parameters (RFC6266, Section 4.1).

    Being a UniqueNamespace, it rejects a parameter name given twice.
    """
    # ';' followed by an extended or a regular parameter, repeated
    grammar = peg.maybe_some(
        ';', [ExtDispositionParm, DispositionParm])
|
178 |
|
class ContentDispositionValue:
    """
    Class for a complete Content-Disposition value (RFC 6266, Section 4.1).

    The parse result has the disposition type in 'dtype' and the
    parameters in 'params'.
    """
    # The optional trailing semicolon is nonconformant, but it has been
    # seen in the wild and browsers accept it.
    # http://greenbytes.de/tech/tc2231/#attwithasciifilenamenqs
    grammar = (
        peg.attr('dtype', DispositionType),
        peg.attr('params', DispositionParmList),
        peg.optional(';'),
    )
|
189 |
|
# Pair of a decoded string and its language tag (the tag may be None).
LangTagged = collections.namedtuple('LangTagged', 'string langtag')
|
191 |
|
class DuplicateParamError(Exception):
    """
    Exception signaling that a parameter was given more than once.
    """
|
196 |
|
class InvalidISO8859Error(Exception):
    """
    Exception signaling a byte that is not valid in ISO-8859-1.
    """
|
201 |
|
class ContentDisposition:
    """
    Records various indications and hints about content disposition.

    These can be used to know if a file should be downloaded or
    displayed directly, and to hint what filename it should have
    in the download case.
    """
    def __init__(self, disposition='inline', assocs=None):
        """
        Used internally after parsing the header.

        Instances should generally be created from a factory
        function, such as parse_headers and its variants.

        @param disposition disposition type, either as a plain string or
            as a one-element sequence (as delivered by the parser); anything
            else selects 'inline'
        @type str or list of str
        @param assocs mapping of parameter names to their values
        @type dict
        """
        if disposition is None:
            self.disposition = 'inline'
        elif isinstance(disposition, str):
            # A plain string names the type directly. Testing only
            # len(...) == 1 would turn e.g. 'attachment' into 'inline'.
            self.disposition = disposition or 'inline'
        elif len(disposition) == 1:
            self.disposition = disposition[0]
        else:
            self.disposition = 'inline'

        # work on a copy so values can be replaced without touching the
        # caller's mapping
        self.assocs = {} if assocs is None else dict(assocs)
        if 'filename*' in self.assocs:
            param = self.assocs['filename*']
            if isinstance(param, ExtDispositionParm):
                # replace the parsed parameter by its decoded text
                self.assocs['filename*'] = (
                    parse_ext_value(param.value).string
                )

    def filename(self):
        """
        The filename from the Content-Disposition header or None.

        The extended 'filename*' parameter takes precedence over the
        plain 'filename' parameter.

        On safety:
            This property records the intent of the sender.

            You shouldn't use this sender-controlled value as a filesystem
            path, it can be insecure. Serving files with this filename can be
            dangerous as well, due to a certain browser using the part after
            the dot for mime-sniffing. Saving it to a database is fine by
            itself though.

        @return file name given in the header or None
        """
        # XXX Reject non-ascii (parsed via qdtext) for 'filename' here?
        for key in ('filename*', 'filename'):
            if key in self.assocs:
                return self.assocs[key]
        return None

    def is_inline(self):
        """
        Return if the file should be handled inline.

        If not, and unless your application supports other dispositions
        than the standard inline and attachment, it should be handled
        as an attachment.

        @return flag indicating an inline disposition (compared
            case-insensitively)
        @rtype bool
        """
        return self.disposition.lower() == 'inline'
|
262 |
|
def normalize_ws(text):
    """
    Do LWS (linear whitespace) folding.

    @param text text to be folded
    @type str
    @return text without leading and trailing whitespace and with each
        inner whitespace run replaced by one space
    @rtype str
    """
    words = text.split()
    return ' '.join(words)
|
268 |
|
def parse_headers(content_disposition):
    """
    Build a ContentDisposition from header values.

    A header that cannot be parsed, repeats a parameter or carries an
    undecodable 'filename*' value results in the default ContentDisposition
    (inline, no parameters) instead of an exception.

    @param content_disposition contents of the disposition header
    @type bytes
    @return content disposition object built from the header
    @rtype ContentDisposition
    """
    # We allow non-ascii here (it will only be parsed inside of qdtext, and
    # rejected by the grammar if it appears in other places), although
    # parsing it can be ambiguous. Parsing it ensures that a non-ambiguous
    # filename* value won't get dismissed because of an unrelated ambiguity
    # in the filename parameter. But it does mean we occasionally give
    # less-than-certain values for some legacy senders.
    content_disposition = content_disposition.decode('iso-8859-1')

    # Our parsing is relaxed in these regards:
    # - The grammar allows a final ';' in the header;
    # - We do LWS-folding, and possibly normalise other broken
    #   whitespace, instead of rejecting non-lws-safe text.
    # XXX Would prefer to accept only the quoted whitespace
    # case, rather than normalising everything.
    content_disposition = normalize_ws(content_disposition)
    try:
        parsed = peg.parse(content_disposition, ContentDispositionValue)
        # The constructor decodes a 'filename*' parameter through
        # parse_ext_value(), which raises InvalidISO8859Error or (because
        # it unquotes with errors='strict') UnicodeDecodeError. It must
        # therefore be created inside the guarded region.
        return ContentDisposition(disposition=parsed.dtype,
                                  assocs=parsed.params)
    except (SyntaxError, DuplicateParamError, InvalidISO8859Error,
            UnicodeDecodeError):
        return ContentDisposition()
|
298 |
|
def parse_ext_value(val):
    """
    Parse the value of an extended attribute.

    @param val parsed ext-value consisting of either (charset, language
        tag, percent-encoded text) or (charset, percent-encoded text)
    @return decoded text together with its language tag (None, if no
        language tag was given)
    @rtype LangTagged
    @exception InvalidISO8859Error raised if an ISO-8859-1 value decodes
        to a character in the range 0x7F to 0x9F
    @exception UnicodeDecodeError raised if the percent-encoded octets are
        not valid in the given charset
    """
    if len(val) == 3:
        charset, langtag, coded = val
    else:
        charset, coded = val
        langtag = None
    decoded = parse.unquote(coded, charset, errors='strict')
    # The Charset grammar matches regardless of case, so the comparison
    # must do so as well; otherwise 'ISO-8859-1' would skip the check.
    if charset.lower() == 'iso-8859-1':
        # Fail if the filename contains an invalid ISO-8859-1 char
        for c in decoded:
            if 0x7F <= ord(c) <= 0x9F:
                raise InvalidISO8859Error(c)
    return LangTagged(decoded, langtag)
|
315 |
|
316 except ImportError: |
|
class ContentDisposition:
    """
    Records various indications and hints about content disposition.

    These can be used to know if a file should be downloaded or
    displayed directly, and to hint what filename it should have
    in the download case.

    This is the surrogate used when pypeg2 is not available.
    """
    def __init__(self, filename, disposition='inline'):
        """
        Constructor

        @param filename file name to be stored in this surrogate class
        @type str
        @param disposition disposition type given in the header
        @type str
        """
        self.__filename = filename
        self.__disposition = disposition or 'inline'

    def filename(self):
        """
        Public method to get the stored file name

        @return file name
        @rtype str
        """
        return self.__filename

    def is_inline(self):
        """
        Public method to check, if the file should be handled inline.

        @return flag indicating an inline disposition (compared
            case-insensitively)
        @rtype bool
        """
        return self.__disposition.lower() == 'inline'


def parse_headers(content_disposition):
    """
    Build a ContentDisposition from header values.

    Only the disposition type and the plain 'filename' parameter are
    evaluated. The file name is an empty string, if the header does not
    contain such a parameter. Backslashes inside a quoted file name are
    kept verbatim.

    @param content_disposition contents of the disposition header
    @type bytes
    @return content disposition object built from the header
    @rtype ContentDisposition
    """
    try:
        header = content_disposition.decode()
    except UnicodeDecodeError:
        # Not valid UTF-8; ISO-8859-1 maps every byte, so this decode
        # cannot fail.
        header = content_disposition.decode('iso-8859-1')

    # The type is the text before the first ';'. If it is missing (empty
    # or already a 'name=value' parameter) default to 'inline'.
    dtype = header.partition(';')[0].strip()
    if not dtype or '=' in dtype:
        dtype = 'inline'

    # The value is either a complete quoted string (which may contain ';'
    # and backslash escapes) or everything up to the next ';'.
    match = re.search(
        r'filename\s*=\s*(?:"((?:\\.|[^"\\])*)"|([^;]*))',
        header, re.IGNORECASE)
    if match is None:
        return ContentDisposition("", dtype)

    quoted, bare = match.groups()
    path = quoted if quoted is not None else bare.strip()
    return ContentDisposition(path, dtype)