eric6/E5Network/E5RFC6266.py

changeset 6942
2602857055c5
parent 6891
93f82da09f22
child 7192
a22eee00b052
equal deleted inserted replaced
6941:f99d60d6b59b 6942:2602857055c5
1 # -*- coding: utf-8 -*-
2
3 # Copyright (c) 2015 - 2019 Detlev Offenbach <detlev@die-offenbachs.de>
4 #
5
6 """
7 Module implementing a Content-Disposition parser iaw. RFC 6266.
8 """
9
10 #
11 # This code is adapted from the rfc6266.py module of qutebrowser.
12 # Original copyright 2014-2015 Florian Bruhin (The Compiler)
13 # <mail@qutebrowser.org>
14 #
15
16 from __future__ import unicode_literals
17
18 try: # Py3
19 import urllib.parse as parse
20 except (ImportError):
21 import urlparse as parse # __IGNORE_WARNING__
22 import collections
23 import string
24 import re
25
26 try:
27 import pypeg2 as peg
28
class UniqueNamespace(peg.Namespace):
    """
    Class implementing a pyPEG2 namespace rejecting duplicate keys.
    """
    def __setitem__(self, key, value):
        """
        Special method to store an item unless its key is already present.

        @param key key for the item
        @param value value of the item
        @exception DuplicateParamError raised if the key is contained in
            the namespace already
        """
        if key not in self:
            super(UniqueNamespace, self).__setitem__(key, value)
        else:
            raise DuplicateParamError(key)
43
# Character classes defined by RFC 2616
separator_chars = "()<>@,;:\\\"/[]?={} \t"    # __IGNORE_WARNING_M613__
ctl_chars = ''.join(map(chr, range(32))) + chr(127)
nontoken_chars = separator_chars + ctl_chars

# Character classes defined by RFC 5987
attr_chars_nonalnum = '!#$&+-.^_`|~'
attr_chars = string.ascii_letters + string.digits + attr_chars_nonalnum

# The token character class, built the alternative way given by RFC 5987
token_chars = attr_chars + "*'%"    # __IGNORE_WARNING_M601__

# Definitions from https://tools.ietf.org/html/rfc2616#section-2.2
# The token expression is built from the explicit list of token characters
# instead of excluding the non-token ones (AnyBut), because an exclusion
# might let non-ascii octets through.
token_re = '[' + re.escape(token_chars) + ']+'
60
class Token(str):
    """
    Class representing a token (RFC 2616, Section 2.2).

    The grammar is the compiled token regular expression.
    """
    grammar = re.compile(token_re)
66
# According to RFC 2616 some linear whitespace (LWS) is allowed in text
# and qdtext. The RFC also mentions that such whitespace is folded into
# a single SP (which is not a CTL) before interpretation.
# It is assumed that the caller already did that folding when parsing
# the headers.

# Note: qdtext allows non-ascii characters as well. They are parsed as
# ISO-8859-1 here; rejecting them entirely would be permitted as well.
# Some broken browsers attempt encoding-sniffing, which is broken
# because the spec only allows iso and because encoding-sniffing
# can mangle valid values.
# Everything else in this grammar (including RFC 5987 ext values)
# is in an ascii-safe encoding.

# one character being neither a double quote nor a control character
qdtext_re = r'[^"' + re.escape(ctl_chars) + ']'
# a backslash followed by any one of the 128 ASCII characters
quoted_pair_re = r'\\[' + re.escape(''.join(map(chr, range(128)))) + ']'
83
class QuotedString(str):
    """
    Class representing a quoted string (RFC 2616, Section 2.2).
    """
    grammar = re.compile(r'"({0}|{1})+"'.format(quoted_pair_re, qdtext_re))

    def __str__(self):
        """
        Special method to get the string without quotes and escapes.

        @return text with the enclosing quotes removed and each backslash
            escape replaced by the escaped character
        @rtype str
        """
        quoted = super(QuotedString, self).__str__()
        # strip the enclosing quotes first, then drop the backslashes
        return re.sub(r'\\(.)', r'\1', quoted[1:-1])
95
class Value(str):
    """
    Class representing a value (RFC 2616, Section 3.6).

    A value is either a token or a quoted string.
    """
    grammar = [
        re.compile(token_re),
        QuotedString,
    ]
101
class Charset(str):
    """
    Class representing a charset (RFC5987, Section 3.2.1).
    """
    # Only these two charsets are accepted (case insensitively). Others
    # are forbidden; the spec reserves them for future evolutions.
    grammar = re.compile('UTF-8|ISO-8859-1', re.IGNORECASE)
109
class Language(str):
    """
    Class representing a language-tag (RFC 5646, Section 2.1).

    Fixme: The grammar does not implement the RFC completely yet.
    https://github.com/The-Compiler/qutebrowser/issues/105
    """
    # one or more ASCII letters, digits or hyphens
    grammar = re.compile('[A-Za-z0-9-]+')
118
# one attribute character (RFC 5987)
attr_char_re = '[' + re.escape(attr_chars) + ']'
# a percent sign followed by exactly two hexadecimal digits
hex_digit_re = '%[{0}]{{2}}'.format(string.hexdigits)
121
class ValueChars(str):
    """
    Class representing the value characters of an attribute.

    The grammar accepts any (possibly empty) sequence of attribute
    characters and percent encoded octets.

    Fixme: Can we merge this with Value?
    https://github.com/The-Compiler/qutebrowser/issues/105
    """
    grammar = re.compile('({0}|{1})*'.format(attr_char_re, hex_digit_re))
130
class ExtValue(peg.List):
    """
    Class representing an ext-value of an attribute (RFC 5987, Section 3.2).

    The grammar is charset, "'", optional language, "'" and the value
    characters.
    """
    grammar = peg.contiguous(
        Charset, "'", peg.optional(Language), "'", ValueChars)
137
class ExtToken(peg.Symbol):
    """
    Class representing a token introducing an extended value (RFC 6266,
    Section 4.1).
    """
    # a token immediately followed by an asterisk
    regex = re.compile(token_re + r'\*')

    def __str__(self):
        """
        Special method to get the lower case string representation.

        @return string representation converted to lower case
        @rtype str
        """
        text = super(ExtToken, self).__str__()
        return text.lower()
146
class NoExtToken(peg.Symbol):
    """
    Class representing a token introducing a normal value (RFC 6266,
    Section 4.1).
    """
    # a token whose last character is not an asterisk
    regex = re.compile(token_re + r'(?<!\*)')

    def __str__(self):
        """
        Special method to get the lower case string representation.

        @return string representation converted to lower case
        @rtype str
        """
        text = super(NoExtToken, self).__str__()
        return text.lower()
155
class DispositionParm(str):
    """
    Class representing a parameter for the Disposition-Type header
    (RFC6266, Section 4.1).

    The grammar is a name (not ending in an asterisk), '=' and a value.
    """
    grammar = (peg.attr('name', NoExtToken), '=', Value)
161
class ExtDispositionParm:
    """
    Class representing an extended parameter (RFC6266, Section 4.1).

    The grammar is an extended name, '=' and an extended value.
    """
    grammar = (peg.attr('name', ExtToken), '=', ExtValue)

    def __init__(self, value, name=None):
        """
        Constructor

        @param value value of the parameter
        @param name name of the parameter
        """
        self.value = value
        self.name = name
171
class DispositionType(peg.List):
    """
    Class representing the disposition type (RFC6266, Section 4.1).

    The grammar is either 'inline' or 'attachment' (case insensitively)
    or any other token.
    """
    grammar = [
        re.compile('(inline|attachment)', re.IGNORECASE),
        Token,
    ]
177
class DispositionParmList(UniqueNamespace):
    """
    Class representing a list of disposition parameters (RFC6266,
    Section 4.1).

    The grammar is any number of parameters (extended or normal), each
    introduced by a ';'.
    """
    grammar = peg.maybe_some(
        ';', [ExtDispositionParm, DispositionParm])
183
class ContentDispositionValue:
    """
    Class representing a complete Content-Disposition value (RFC 6266,
    Section 4.1).
    """
    # The optional ';' at the end accepts a nonconformant final semicolon.
    # It has been seen in the wild and browsers accept it.
    # http://greenbytes.de/tech/tc2231/#attwithasciifilenamenqs
    grammar = (
        peg.attr('dtype', DispositionType),
        peg.attr('params', DispositionParmList),
        peg.optional(';'),
    )
194
# tuple of a decoded string and its language tag
LangTagged = collections.namedtuple('LangTagged', 'string langtag')
196
class DuplicateParamError(Exception):
    """
    Class defining the exception raised for a parameter given twice.
    """
    pass
201
class InvalidISO8859Error(Exception):
    """
    Class defining the exception raised for a byte being invalid in
    ISO-8859-1.
    """
    pass
206
class ContentDisposition:
    """
    Class recording various indications and hints about the content
    disposition.

    They tell whether a file should be downloaded or displayed directly
    and hint at the file name to be used in the download case.
    """
    def __init__(self, disposition='inline', assocs=None):
        """
        Constructor

        It is used internally after the header was parsed. Instances
        should generally be created by a factory function like
        parse_headers and its variants.

        @param disposition disposition type; anything not consisting of
            exactly one element is recorded as 'inline'
        @param assocs mapping of parameter names to parameter values
        """
        self.disposition = (
            disposition[0] if len(disposition) == 1 else 'inline'
        )
        # work on a copy of the mapping so values can be changed
        self.assocs = {} if assocs is None else dict(assocs)
        if 'filename*' in self.assocs:
            # replace the parsed extended parameter by its decoded string
            ext_param = self.assocs['filename*']
            assert isinstance(ext_param, ExtDispositionParm)
            self.assocs['filename*'] = \
                parse_ext_value(ext_param.value).string

    def filename(self):
        """
        Public method to get the file name of the Content-Disposition
        header.

        The 'filename*' parameter takes precedence over 'filename'.

        On safety:
            The value records the intent of the sender.

            This sender-controlled value should not be used as a filesystem
            path because that can be insecure. Serving files with this
            file name can be dangerous as well, due to a certain browser
            using the part after the dot for mime-sniffing. Saving it to
            a database is fine by itself though.

        @return file name or None, if the header did not contain one
        """
        # XXX Reject non-ascii (parsed via qdtext) for 'filename' here?
        for key in ('filename*', 'filename'):
            if key in self.assocs:
                return self.assocs[key]
        return None

    def is_inline(self):
        """
        Public method to check, if the file should be handled inline.

        If it should not, and unless the application supports other
        dispositions than the standard inline and attachment, the file
        should be handled as an attachment.

        @return flag indicating the 'inline' disposition (compared case
            insensitively)
        @rtype bool
        """
        disposition = self.disposition.lower()
        return disposition == 'inline'
266
def normalize_ws(text):
    """
    Function to do the LWS (linear whitespace) folding.

    @param text text to be folded
    @return text with each run of whitespace replaced by a single space
        and without leading or trailing whitespace
    """
    words = text.split()
    return ' '.join(words)
272
def parse_headers(content_disposition):
    """
    Build a ContentDisposition from header values.

    A header which cannot be parsed, contains a parameter twice or carries
    an undecodable extended value yields a default ContentDisposition
    (inline, no parameters).

    @param content_disposition contents of the disposition header
    @type bytes
    @return object recording the parsed content disposition
    @rtype ContentDisposition
    """
    # We allow non-ascii here (it will only be parsed inside of qdtext, and
    # rejected by the grammar if it appears in other places), although
    # parsing it can be ambiguous. Parsing it ensures that a non-ambiguous
    # filename* value won't get dismissed because of an unrelated ambiguity
    # in the filename parameter. But it does mean we occasionally give
    # less-than-certain values for some legacy senders.
    content_disposition = content_disposition.decode('iso-8859-1')

    # Our parsing is relaxed in these regards:
    # - The grammar allows a final ';' in the header;
    # - We do LWS-folding, and possibly normalise other broken
    #   whitespace, instead of rejecting non-lws-safe text.
    # XXX Would prefer to accept only the quoted whitespace
    # case, rather than normalising everything.
    content_disposition = normalize_ws(content_disposition)
    try:
        parsed = peg.parse(content_disposition, ContentDispositionValue)
        # The extended 'filename*' value is decoded while the object is
        # constructed. This has to happen inside of the try block because
        # InvalidISO8859Error (and UnicodeDecodeError for malformed percent
        # encoded octets) is raised there and not by the parser.
        result = ContentDisposition(disposition=parsed.dtype,
                                    assocs=parsed.params)
    except (SyntaxError, DuplicateParamError, InvalidISO8859Error,
            UnicodeDecodeError):
        result = ContentDisposition()
    return result
302
def parse_ext_value(val):
    """
    Parse the value of an extended attribute.

    @param val sequence of either (charset, language tag, coded value) or
        (charset, coded value)
    @return tuple containing the decoded string and the language tag (None
        if the value did not contain one)
    @rtype LangTagged
    @exception InvalidISO8859Error raised if the value is declared as
        ISO-8859-1 and decodes to a character in the range 0x7F to 0x9F
    """
    if len(val) == 3:
        charset, langtag, coded = val
    else:
        charset, coded = val
        langtag = None
    decoded = parse.unquote(coded, charset, errors='strict')
    # The charset is matched case insensitively by the grammar, so it has
    # to be compared case insensitively here as well. Otherwise the usual
    # spelling 'ISO-8859-1' would bypass the check.
    if charset.lower() == 'iso-8859-1':
        # Fail if the filename contains an invalid ISO-8859-1 char
        for c in decoded:
            if 0x7F <= ord(c) <= 0x9F:
                raise InvalidISO8859Error(c)
    return LangTagged(decoded, langtag)
319
320 except ImportError:
class ContentDisposition:
    """
    Surrogate class recording the file name hint of a content disposition.

    It replaces the full implementation, if pypeg2 cannot be imported,
    and just stores the file name to be used in the download case.
    """
    def __init__(self, filename):
        """
        Constructor

        @param filename file name to be recorded by this surrogate object
        @type str
        """
        self.__filename = filename

    def filename(self):
        """
        Public method to get the recorded file name.

        @return file name as given to the constructor
        @rtype str
        """
        return self.__filename
346
def parse_headers(content_disposition):
    """
    Build a ContentDisposition from header values.

    This simplified variant just looks for a 'filename=' parameter. A
    quoted value ends at its closing quote and an unquoted value ends at
    the next ';', so parameters following the file name are not made part
    of it. Backslash escapes inside a quoted value are kept as they are.

    @param content_disposition contents of the disposition header
    @type bytes
    @return object recording the file name (empty string, if the header
        does not contain one)
    @rtype ContentDisposition
    """
    try:
        header = content_disposition.decode()
    except UnicodeDecodeError:
        # Not valid UTF-8; ISO-8859-1 maps every byte, so this can't fail.
        header = content_disposition.decode('iso-8859-1')

    pos = header.find("filename=")
    if pos == -1:
        return ContentDisposition("")

    path = header[pos + 9:].strip()
    if path.startswith('"'):
        # take everything up to the first quote not escaped by a backslash
        match = re.match(r'"((?:\\.|[^"\\])*)"', path)
        if match is not None:
            path = match.group(1)
        elif path.endswith('"'):
            path = path[1:-1]
    else:
        path = path.split(';', 1)[0].rstrip()
    return ContentDisposition(path)

eric ide

mercurial