|
1 # -*- coding: utf-8 -*- |
|
2 |
|
3 # Copyright (c) 2015 Detlev Offenbach <detlev@die-offenbachs.de> |
|
4 # |
|
5 |
|
6 """ |
|
7 Module implementing a Content-Disposition parser iaw. RFC 6266. |
|
8 """ |
|
9 |
|
10 # |
|
11 # This code is adapted from the rfc6266.py module of qutebrowser. |
|
12 # Original copyright 2014-2015 Florian Bruhin (The Compiler) |
|
13 # <mail@qutebrowser.org> |
|
14 # |
|
15 |
|
16 from __future__ import unicode_literals |
|
17 |
|
18 try: # Py3 |
|
19 import urllib.parse as parse |
|
20 except (ImportError): |
|
21 import urlparse as parse # __IGNORE_WARNING__ |
|
22 import collections |
|
23 import string |
|
24 import re |
|
25 |
|
26 try: |
|
27 import pypeg2 as peg |
|
28 |
|
class UniqueNamespace(peg.Namespace):
    """
    Class implementing a pyPEG2 namespace refusing to overwrite entries.
    """
    def __setitem__(self, key, value):
        """
        Special method to store an item under a key not used so far.

        @param key key for the item
        @param value value of the item
        @exception DuplicateParamError raised if the key is already
            contained in the namespace
        """
        already_present = key in self
        if already_present:
            raise DuplicateParamError(key)
        super().__setitem__(key, value)
|
43 |
|
# Character classes defined by RFC 2616
separator_chars = "()<>@,;:\\\"/[]?={} \t"
ctl_chars = ''.join(map(chr, list(range(32)) + [127]))
nontoken_chars = ''.join((separator_chars, ctl_chars))

# Character classes defined by RFC 5987
attr_chars_nonalnum = '!#$&+-.^_`|~'
attr_chars = ''.join(
    (string.ascii_letters, string.digits, attr_chars_nonalnum))

# Alternative construction of the token character class given by RFC 5987
token_chars = attr_chars + "*'%"

# See https://tools.ietf.org/html/rfc2616#section-2.2 for the definitions.
# The token is built from an explicit list of allowed characters instead
# of AnyBut, because the latter might let non-ascii octets through.
token_re = '[' + re.escape(token_chars) + ']+'
|
60 |
|
class Token(str):
    """
    Class representing a token as defined by RFC 2616, Section 2.2.
    """
    # One or more characters of the token character class
    grammar = re.compile(token_re)
|
66 |
|
67 # RFC 2616 says some linear whitespace (LWS) is in fact allowed in text |
|
68 # and qdtext; however it also mentions folding that whitespace into |
|
69 # a single SP (which isn't in CTL) before interpretation. |
|
70 # Assume the caller has already done that folding when parsing headers. |
|
71 |
|
72 # Note: qdtext also allows non-ascii, which we choose to parse |
|
73 # as ISO-8859-1; rejecting it entirely would also be permitted. |
|
74 # Some broken browsers attempt encoding-sniffing, which is broken |
|
75 # because the spec only allows iso, and because encoding-sniffing |
|
76 # can mangle valid values. |
|
77 # Everything else in this grammar (including RFC 5987 ext values) |
|
78 # is in an ascii-safe encoding. |
|
79 |
|
# qdtext: any single character except the double quote and control characters
qdtext_re = '[^"' + re.escape(ctl_chars) + ']'
# quoted-pair: a backslash followed by any single 7-bit character
quoted_pair_re = r'\\[' + re.escape(''.join(map(chr, range(128)))) + ']'
|
83 |
|
class QuotedString(str):
    """
    Class representing a quoted string (RFC 2616, Section 2.2).
    """
    grammar = re.compile(r'"({}|{})+"'.format(quoted_pair_re, qdtext_re))

    def __str__(self):
        """
        Special method to get the text without quoting artefacts.

        @return text with the enclosing quotes removed and each
            backslash escape replaced by the escaped character
        @rtype str
        """
        raw = super().__str__()
        # Cut off the enclosing quotes, then resolve the quoted pairs.
        inner = raw[1:-1]
        return re.sub(r'\\(.)', r'\1', inner)
|
95 |
|
class Value(str):
    """
    Class representing a parameter value (RFC 2616, Section 3.6).

    A value is either a plain token or a quoted string.
    """
    grammar = [
        re.compile(token_re),
        QuotedString,
    ]
|
101 |
|
class Charset(str):
    """
    Class representing a charset (RFC 5987, Section 3.2.1).
    """
    # Only these two charsets are accepted (in any letter case); the
    # specification reserves all others for future evolutions.
    grammar = re.compile('UTF-8|ISO-8859-1', re.IGNORECASE)
|
109 |
|
class Language(str):
    """
    Class representing a language-tag (RFC 5646, Section 2.1).

    Fixme: The grammar is a simplification and not 100% correct yet.
    https://github.com/The-Compiler/qutebrowser/issues/105
    """
    # One or more ASCII letters, digits or hyphens
    grammar = re.compile('[A-Za-z0-9-]+')
|
118 |
|
# A single attribute character
attr_char_re = '[' + re.escape(attr_chars) + ']'
# A percent sign followed by exactly two hexadecimal digits
hex_digit_re = '%[{}]{{2}}'.format(string.hexdigits)
|
121 |
|
class ValueChars(str):
    """
    Class representing the value characters of an attribute.

    Fixme: Can this be merged with Value?
    https://github.com/The-Compiler/qutebrowser/issues/105
    """
    # Any number of attribute characters and percent-encoded octets
    grammar = re.compile('(' + attr_char_re + '|' + hex_digit_re + ')*')
|
130 |
|
class ExtValue(peg.List):
    """
    Class representing an ext-value of an attribute (RFC 5987, Section 3.2).

    The grammar is a charset, an optional language tag and the value
    characters, separated by single quotes.
    """
    grammar = peg.contiguous(
        Charset,
        "'",
        peg.optional(Language),
        "'",
        ValueChars,
    )
|
137 |
|
class ExtToken(peg.Symbol):
    """
    Class representing a token that introduces an extended value
    (RFC 6266, Section 4.1).
    """
    # A token followed by a literal asterisk
    regex = re.compile(r'{}\*'.format(token_re))

    def __str__(self):
        """
        Special method to get the token in lower case.

        @return lower-cased token text
        @rtype str
        """
        text = super().__str__()
        return text.lower()
|
146 |
|
class NoExtToken(peg.Symbol):
    """
    Class representing a token that introduces a normal value
    (RFC 6266, Section 4.1).
    """
    # A token whose last character is not an asterisk
    regex = re.compile(r'{}(?<!\*)'.format(token_re))

    def __str__(self):
        """
        Special method to get the token in lower case.

        @return lower-cased token text
        @rtype str
        """
        text = super().__str__()
        return text.lower()
|
155 |
|
class DispositionParm(str):
    """
    Class representing a parameter of the Disposition-Type header
    (RFC 6266, Section 4.1).
    """
    # name '=' value, with the name stored in the 'name' attribute
    grammar = (peg.attr('name', NoExtToken), '=', Value)
|
161 |
|
class ExtDispositionParm:
    """
    Class representing an extended parameter (RFC 6266, Section 4.1).
    """
    # name* '=' ext-value, with the name stored in the 'name' attribute
    grammar = (peg.attr('name', ExtToken), '=', ExtValue)

    def __init__(self, value, name=None):
        """
        Constructor

        @param value value of the parameter
        @param name name of the parameter
        """
        self.value = value
        self.name = name
|
171 |
|
class DispositionType(peg.List):
    """
    Class representing the disposition type (RFC 6266, Section 4.1).

    The two standard types are matched in any letter case; every other
    type has to be a token.
    """
    grammar = [
        re.compile('(inline|attachment)', re.IGNORECASE),
        Token,
    ]
|
177 |
|
class DispositionParmList(UniqueNamespace):
    """
    Class representing a list of disposition parameters
    (RFC 6266, Section 4.1).
    """
    # Any number of parameters, each introduced by a semicolon; the
    # extended form is tried before the plain one.
    grammar = peg.maybe_some(
        ';',
        [ExtDispositionParm, DispositionParm],
    )
|
183 |
|
class ContentDispositionValue:
    """
    Class representing a complete Content-Disposition value
    (RFC 6266, Section 4.1).
    """
    # The trailing optional ';' is nonconformant but is accepted here
    # because it occurs in the wild and browsers accept it, see
    # http://greenbytes.de/tech/tc2231/#attwithasciifilenamenqs
    grammar = (
        peg.attr('dtype', DispositionType),
        peg.attr('params', DispositionParmList),
        peg.optional(';'),
    )
|
194 |
|
# Pair of a decoded string and its language tag
LangTagged = collections.namedtuple('LangTagged', 'string langtag')
|
196 |
|
class DuplicateParamError(Exception):
    """
    Class defining the exception raised for a parameter given more than once.
    """
|
201 |
|
class InvalidISO8859Error(Exception):
    """
    Class defining the exception raised for a byte not valid in ISO-8859-1.
    """
|
206 |
|
class ContentDisposition:
    """
    Class recording indications and hints about the content disposition.

    The data tells whether a file is meant to be displayed directly or
    downloaded and, for a download, which file name is suggested.
    """
    def __init__(self, disposition='inline', assocs=None):
        """
        Constructor

        It is used internally after parsing the header. Instances should
        generally be created by a factory function like parse_headers.

        @param disposition sequence containing the disposition type; a
            value not containing exactly one item results in 'inline'
        @param assocs mapping of parameter names to parameter values
        """
        self.disposition = (
            disposition[0] if len(disposition) == 1 else 'inline'
        )
        # A copy is stored, so the entry below can be replaced safely.
        self.assocs = {} if assocs is None else dict(assocs)
        if 'filename*' in self.assocs:
            ext_param = self.assocs['filename*']
            assert isinstance(ext_param, ExtDispositionParm)
            # Keep just the decoded string of the extended value.
            self.assocs['filename*'] = \
                parse_ext_value(ext_param.value).string

    def filename(self):
        """
        Public method to get the file name given in the header.

        The extended 'filename*' parameter is preferred over the plain
        'filename' parameter.

        On safety: the value is controlled by the sender. It should not
        be used as a filesystem path, because that can be insecure.
        Serving files under this name can be dangerous as well, due to a
        certain browser using the part after the dot for mime-sniffing.
        Saving it to a database is fine by itself though.

        @return file name or None, if the header did not contain one
        """
        # XXX Reject non-ascii (parsed via qdtext) for 'filename' here?
        for key in ('filename*', 'filename'):
            if key in self.assocs:
                return self.assocs[key]
        return None

    def is_inline(self):
        """
        Public method to check, if the file should be handled inline.

        If it should not, and unless the application supports
        dispositions other than the standard inline and attachment, the
        file should be handled as an attachment.

        @return flag indicating the 'inline' disposition (compared
            ignoring the letter case)
        @rtype bool
        """
        return 'inline' == self.disposition.lower()
|
264 |
|
def normalize_ws(text):
    """
    Function to do LWS (linear whitespace) folding.

    @param text text to be folded
    @type str
    @return text with each run of whitespace replaced by a single space
        and without leading or trailing whitespace
    @rtype str
    """
    words = text.split()
    return ' '.join(words)
|
270 |
|
def parse_headers(content_disposition):
    """
    Build a ContentDisposition from header values.

    A header which cannot be parsed, or whose extended file name cannot
    be decoded, results in a default ContentDisposition.

    @param content_disposition contents of the disposition header
    @type bytes
    @return content disposition built from the header
    @rtype ContentDisposition
    """
    # We allow non-ascii here (it will only be parsed inside of qdtext, and
    # rejected by the grammar if it appears in other places), although
    # parsing it can be ambiguous. Parsing it ensures that a non-ambiguous
    # filename* value won't get dismissed because of an unrelated ambiguity
    # in the filename parameter. But it does mean we occasionally give
    # less-than-certain values for some legacy senders.
    content_disposition = content_disposition.decode('iso-8859-1')

    # Our parsing is relaxed in these regards:
    # - The grammar allows a final ';' in the header;
    # - We do LWS-folding, and possibly normalise other broken
    #   whitespace, instead of rejecting non-lws-safe text.
    # XXX Would prefer to accept only the quoted whitespace
    # case, rather than normalising everything.
    content_disposition = normalize_ws(content_disposition)
    try:
        parsed = peg.parse(content_disposition, ContentDispositionValue)
        # The constructor decodes the 'filename*' value and may raise
        # InvalidISO8859Error or UnicodeDecodeError while doing so. It
        # therefore has to be inside the try block to be covered by the
        # handler below.
        return ContentDisposition(disposition=parsed.dtype,
                                  assocs=parsed.params)
    except (SyntaxError, DuplicateParamError, InvalidISO8859Error,
            UnicodeDecodeError):
        return ContentDisposition()
|
300 |
|
def parse_ext_value(val):
    """
    Parse the value of an extended attribute.

    @param val parsed ext-value consisting of the charset, an optional
        language tag and the percent-encoded text (two or three items)
    @return decoded text and its language tag (None, if not given)
    @rtype LangTagged
    @exception InvalidISO8859Error raised if the charset is ISO-8859-1
        and the decoded text contains a character in the range 0x7F to
        0x9F
    """
    if len(val) == 3:
        charset, langtag, coded = val
    else:
        charset, coded = val
        langtag = None
    decoded = parse.unquote(coded, charset, errors='strict')
    # The charset grammar matches ignoring the letter case, so the name
    # has to be compared that way as well; otherwise e.g. 'ISO-8859-1'
    # would bypass the check below.
    if charset.lower() == 'iso-8859-1':
        # Fail if the filename contains an invalid ISO-8859-1 char
        for c in decoded:
            if 0x7F <= ord(c) <= 0x9F:
                raise InvalidISO8859Error(c)
    return LangTagged(decoded, langtag)
|
317 |
|
318 except ImportError: |
|
class ContentDisposition:
    """
    Class recording indications and hints about the content disposition.

    This surrogate variant stores just the suggested file name.
    """
    def __init__(self, filename):
        """
        Constructor

        @param filename file name to be stored in this surrogate class
        @type str
        """
        self.__filename = filename

    def filename(self):
        """
        Public method to get the file name given to the constructor.

        @return file name
        @rtype str
        """
        return self.__filename
|
344 |
|
def parse_headers(content_disposition):
    """
    Build a ContentDisposition from header values.

    This simplified variant just extracts the value of the plain
    'filename' parameter. The value ends at its closing quote or, if it
    is not quoted, at the next ';'.

    @param content_disposition contents of the disposition header
    @type bytes
    @return content disposition containing the extracted file name (an
        empty string, if the header does not contain one)
    @rtype ContentDisposition
    """
    try:
        header = content_disposition.decode()
    except UnicodeDecodeError:
        # Not valid in the default codec; ISO-8859-1 maps every byte to
        # a character, so this decoding cannot fail.
        header = content_disposition.decode('iso-8859-1')

    marker = "filename="
    pos = header.find(marker)
    if pos == -1:
        return ContentDisposition("")

    value = header[pos + len(marker):].strip()
    # Quoted value: take everything up to the closing quote (skipping
    # backslash escaped characters); further parameters may follow it.
    quoted = re.match(r'"((?:[^"\\]|\\.)*)"', value)
    if quoted:
        return ContentDisposition(quoted.group(1))

    # Unquoted value: it ends at the next parameter separator.
    path = value.split(';', 1)[0].strip()
    if path.startswith('"') and path.endswith('"'):
        path = path[1:-1]
    return ContentDisposition(path)