eric7/E5Network/E5RFC6266.py

branch
eric7
changeset 8312
800c432b34c8
parent 8218
7c09585bd960
equal deleted inserted replaced
8311:4e8b98454baa 8312:800c432b34c8
1 # -*- coding: utf-8 -*-
2
3 # Copyright (c) 2015 - 2021 Detlev Offenbach <detlev@die-offenbachs.de>
4 #
5
6 """
7 Module implementing a Content-Disposition parser iaw. RFC 6266.
8 """
9
10 #
11 # This code is adapted from the rfc6266.py module of qutebrowser.
12 # Original copyright 2014-2015 Florian Bruhin (The Compiler)
13 # <mail@qutebrowser.org>
14 #
15
16 import urllib.parse as parse
17 import collections
18 import string
19 import re
20
21 try:
22 import pypeg2 as peg
23
class UniqueNamespace(peg.Namespace):
    """
    A pyPEG2 namespace which refuses to store a second value under a key
    that is already present.
    """
    def __setitem__(self, key, value):
        """
        Special method to store an item, rejecting duplicate keys.

        @param key key for the item
        @param value value of the item
        @exception DuplicateParamError raised if the key is already present
        """
        if key not in self:
            super().__setitem__(key, value)
            return
        raise DuplicateParamError(key)
38
# RFC 2616: separators and control characters (0-31 and DEL)
separator_chars = "()<>@,;:\\\"/[]?={} \t"    # __IGNORE_WARNING_M613__
ctl_chars = ''.join(map(chr, range(32))) + chr(127)
nontoken_chars = separator_chars + ctl_chars

# RFC 5987: characters allowed unencoded in an attribute value
attr_chars_nonalnum = '!#$&+-.^_`|~'
attr_chars = string.ascii_letters + string.digits + attr_chars_nonalnum

# RFC 5987 gives this alternative construction of the token character class:
# the attribute characters plus '*', "'" and '%'.
token_chars = attr_chars + "*'%"            # __IGNORE_WARNING_M601__

# Definitions from https://tools.ietf.org/html/rfc2616#section-2.2
# The token is built as a positive character class from token_chars instead
# of an "anything but" class (AnyBut), which might include non-ascii octets.
token_re = '[{0}]+'.format(re.escape(token_chars))
55
class Token(str):
    """
    A token (RFC 2616, Section 2.2).
    """
    # One or more characters out of the token character class (token_re).
    grammar = re.compile(token_re)
61
62 # RFC 2616 says some linear whitespace (LWS) is in fact allowed in text
63 # and qdtext; however it also mentions folding that whitespace into
64 # a single SP (which isn't in CTL) before interpretation.
65 # Assume the caller already did that folding when parsing headers.
66
67 # Note: qdtext also allows non-ascii, which we choose to parse
68 # as ISO-8859-1; rejecting it entirely would also be permitted.
69 # Some broken browsers attempt encoding-sniffing, which is broken
70 # because the spec only allows iso, and because encoding-sniffing
71 # can mangle valid values.
72 # Everything else in this grammar (including RFC 5987 ext values)
73 # is in an ascii-safe encoding.
74
# qdtext: any single character except the double quote and control characters
qdtext_re = r'[^"{0}]'.format(re.escape(ctl_chars))
# quoted-pair: a backslash followed by any one of the 128 ASCII characters
ascii_chars = ''.join(map(chr, range(128)))
quoted_pair_re = r'\\[{0}]'.format(re.escape(ascii_chars))
78
class QuotedString(str):
    """
    A quoted string (RFC 2616, Section 2.2).
    """
    # Opening quote, at least one quoted-pair or qdtext character,
    # closing quote.
    grammar = re.compile(r'"({0}|{1})+"'.format(quoted_pair_re, qdtext_re))

    def __str__(self):
        """
        Special method to get the string without its quoting.

        @return text with the surrounding quotes removed and each
            backslash escape replaced by the escaped character
        @rtype str
        """
        unquoted = super().__str__()[1:-1]
        return re.sub(r'\\(.)', r'\1', unquoted)
90
class Value(str):
    """
    A value. (RFC 2616, Section 3.6).
    """
    # Two alternatives: a bare token or a quoted string.
    grammar = [re.compile(token_re), QuotedString]
96
class Charset(str):
    """
    A charset (RFC5987, Section 3.2.1).
    """
    # Other charsets are forbidden, the spec reserves them
    # for future evolutions.
    # The match ignores case (re.I), so the matched text may be in any case.
    grammar = re.compile('UTF-8|ISO-8859-1', re.I)
104
class Language(str):
    """
    A language-tag (RFC 5646, Section 2.1).

    Fixme: This grammar is not 100% correct yet.
    https://github.com/The-Compiler/qutebrowser/issues/105
    """
    # Simplified: any non-empty run of ASCII letters, digits and hyphens.
    grammar = re.compile('[A-Za-z0-9-]+')
113
# A single character out of attr_chars
attr_char_re = '[{0}]'.format(re.escape(attr_chars))
# A percent sign followed by exactly two hexadecimal digits
hex_digit_re = '%[' + string.hexdigits + ']{2}'
116
class ValueChars(str):
    """
    A value of an attribute.

    Fixme: Can we merge this with Value?
    https://github.com/The-Compiler/qutebrowser/issues/105
    """
    # Any number (including none) of attribute characters and
    # percent-encoded octets.
    grammar = re.compile('({0}|{1})*'.format(attr_char_re, hex_digit_re))
125
class ExtValue(peg.List):
    """
    An ext-value of an attribute (RFC 5987, Section 3.2).
    """
    # charset ' [language] ' value-chars
    # The language is optional, so the resulting list holds either two or
    # three items (parse_ext_value distinguishes them by length).
    grammar = peg.contiguous(Charset, "'", peg.optional(Language), "'",
                             ValueChars)
132
class ExtToken(peg.Symbol):
    """
    A token introducing an extended value (RFC 6266, Section 4.1).
    """
    # A token which is terminated by an asterisk.
    regex = re.compile(token_re + r'\*')

    def __str__(self):
        """
        Special method to get the token name in lower case.

        @return lower cased token
        @rtype str
        """
        name = super().__str__()
        return name.lower()
141
class NoExtToken(peg.Symbol):
    """
    A token introducing a normal value (RFC 6266, Section 4.1).
    """
    # A token whose last character is not an asterisk.
    regex = re.compile(token_re + r'(?<!\*)')

    def __str__(self):
        """
        Special method to get the token name in lower case.

        @return lower cased token
        @rtype str
        """
        name = super().__str__()
        return name.lower()
150
class DispositionParm(str):
    """
    A parameter for the Disposition-Type header (RFC6266, Section 4.1).
    """
    # name "=" value, with the name being a token not ending in an asterisk
    grammar = peg.attr('name', NoExtToken), '=', Value
156
class ExtDispositionParm:
    """
    An extended parameter (RFC6266, Section 4.1).
    """
    # name* "=" ext-value
    grammar = peg.attr('name', ExtToken), '=', ExtValue

    def __init__(self, value, name=None):
        """
        Constructor

        @param value parsed extended value of the parameter
        @param name name of the parameter
        """
        self.value = value
        self.name = name
166
class DispositionType(peg.List):
    """
    The disposition type (RFC6266, Section 4.1).
    """
    # Either one of the well known types (matched ignoring case) or any
    # other token.
    grammar = [re.compile('(inline|attachment)', re.I), Token]
172
class DispositionParmList(UniqueNamespace):
    """
    A list of disposition parameters (RFC6266, Section 4.1).
    """
    # Zero or more parameters, each introduced by a semicolon; the extended
    # form is tried before the plain one. Being a UniqueNamespace, a repeated
    # parameter name raises DuplicateParamError.
    grammar = peg.maybe_some(';', [ExtDispositionParm, DispositionParm])
178
class ContentDispositionValue:
    """
    A complete Content-Disposition value (RFC 6266, Section 4.1).

    The parsed disposition type is stored in the attribute 'dtype' and the
    parameters in the attribute 'params'.
    """
    # Allows nonconformant final semicolon
    # I've seen it in the wild, and browsers accept it
    # http://greenbytes.de/tech/tc2231/#attwithasciifilenamenqs
    grammar = (peg.attr('dtype', DispositionType),
               peg.attr('params', DispositionParmList),
               peg.optional(';'))
189
# Result type of parse_ext_value: the decoded string together with its
# language tag (None if the ext-value carried no language).
LangTagged = collections.namedtuple('LangTagged', ['string', 'langtag'])
191
class DuplicateParamError(Exception):
    """
    Exception raised when a parameter has been given twice.

    It is raised by UniqueNamespace.__setitem__ with the offending key as
    its argument.
    """
196
class InvalidISO8859Error(Exception):
    """
    Exception raised when a byte is invalid in ISO-8859-1.

    It is raised by parse_ext_value with the offending character as its
    argument.
    """
201
class ContentDisposition:
    """
    Records various indications and hints about content disposition.

    These can be used to know if a file should be downloaded or
    displayed directly, and to hint what filename it should have
    in the download case.
    """
    def __init__(self, disposition='inline', assocs=None):
        """
        Used internally after parsing the header.

        Instances should generally be created from a factory
        function, such as parse_headers and its variants.

        @param disposition parsed disposition type; anything that does not
            have exactly one item is treated as 'inline'
        @param assocs mapping of parameter names to their parsed values
        """
        self.disposition = (
            disposition[0] if len(disposition) == 1 else 'inline'
        )
        # Work on a private copy so values can be replaced below.
        self.assocs = {} if assocs is None else dict(assocs)

        if 'filename*' not in self.assocs:
            return
        ext_param = self.assocs['filename*']
        if isinstance(ext_param, ExtDispositionParm):
            # Replace the parsed parameter by its decoded string.
            self.assocs['filename*'] = (
                parse_ext_value(ext_param.value).string
            )

    def filename(self):
        """
        The filename from the Content-Disposition header or None.

        The extended parameter 'filename*' takes precedence over the plain
        'filename' parameter.

        On safety:
            This method records the intent of the sender.

            You shouldn't use this sender-controlled value as a filesystem
            path, it can be insecure. Serving files with this filename can be
            dangerous as well, due to a certain browser using the part after
            the dot for mime-sniffing. Saving it to a database is fine by
            itself though.

        @return file name given in the header or None
        """
        # XXX Reject non-ascii (parsed via qdtext) for 'filename' here?
        for key in ('filename*', 'filename'):
            if key in self.assocs:
                return self.assocs[key]
        return None

    def is_inline(self):
        """
        Return if the file should be handled inline.

        If not, and unless your application supports other dispositions
        than the standard inline and attachment, it should be handled
        as an attachment.

        @return flag indicating an inline disposition
        @rtype bool
        """
        kind = self.disposition.lower()
        return kind == 'inline'
262
def normalize_ws(text):
    """
    Do LWS (linear whitespace) folding.

    @param text text to be normalised
    @type str
    @return text with leading and trailing whitespace removed and every
        inner run of whitespace replaced by one space
    @rtype str
    """
    words = text.split()
    return ' '.join(words)
268
def parse_headers(content_disposition):
    """
    Build a ContentDisposition from header values.

    A header that cannot be parsed or decoded yields a default (inline)
    ContentDisposition without a file name.

    @param content_disposition contents of the disposition header
    @type bytes
    @return object describing the parsed header
    @rtype ContentDisposition
    """
    # We allow non-ascii here (it will only be parsed inside of qdtext, and
    # rejected by the grammar if it appears in other places), although
    # parsing it can be ambiguous. Parsing it ensures that a non-ambiguous
    # filename* value won't get dismissed because of an unrelated ambiguity
    # in the filename parameter. But it does mean we occasionally give
    # less-than-certain values for some legacy senders.
    content_disposition = content_disposition.decode('iso-8859-1')

    # Our parsing is relaxed in these regards:
    # - The grammar allows a final ';' in the header;
    # - We do LWS-folding, and possibly normalise other broken
    #   whitespace, instead of rejecting non-lws-safe text.
    # XXX Would prefer to accept only the quoted whitespace
    # case, rather than normalising everything.
    content_disposition = normalize_ws(content_disposition)
    try:
        parsed = peg.parse(content_disposition, ContentDispositionValue)
        # The result is built inside the 'try' on purpose: decoding the
        # 'filename*' value happens in the ContentDisposition constructor
        # and may raise InvalidISO8859Error or, for octets that are invalid
        # in the declared charset, UnicodeDecodeError. An 'else' branch
        # would not be covered by the handler below.
        return ContentDisposition(disposition=parsed.dtype,
                                  assocs=parsed.params)
    except (SyntaxError, DuplicateParamError, InvalidISO8859Error,
            UnicodeDecodeError):
        return ContentDisposition()
298
def parse_ext_value(val):
    """
    Parse the value of an extended attribute.

    @param val parsed ext-value consisting of the charset, an optional
        language tag and the percent-encoded value
    @return decoded value together with its language tag (None if no
        language was given)
    @rtype LangTagged
    @exception InvalidISO8859Error raised if an ISO-8859-1 encoded value
        contains a character in the range 0x7F to 0x9F
    @exception UnicodeDecodeError raised if the percent-encoded octets are
        not valid in the given charset
    """
    if len(val) == 3:
        charset, langtag, coded = val
    else:
        charset, coded = val
        langtag = None
    decoded = parse.unquote(coded, charset, errors='strict')
    # The charset is matched ignoring case by the grammar, so it has to be
    # compared ignoring case as well; otherwise e.g. "ISO-8859-1" would
    # bypass the check below.
    if charset.lower() == 'iso-8859-1':
        # Fail if the filename contains an invalid ISO-8859-1 char
        for c in decoded:
            if 0x7F <= ord(c) <= 0x9F:
                raise InvalidISO8859Error(c)
    return LangTagged(decoded, langtag)
315
316 except ImportError:
class ContentDisposition:
    """
    Records various indications and hints about content disposition.

    These can be used to know if a file should be downloaded or
    displayed directly, and to hint what filename it should have
    in the download case.

    This is the surrogate used when pypeg2 is not available; it only
    stores the file name.
    """
    def __init__(self, filename):
        """
        Constructor

        @param filename file name to be stored in this surrogate class
        @type str
        """
        self.__filename = filename

    def filename(self):
        """
        Public method to get the stored file name

        @return file name
        @rtype str
        """
        return self.__filename

def parse_headers(content_disposition):
    """
    Build a ContentDisposition from header values.

    This simplified parser only extracts the 'filename' parameter. The
    parameter name is matched ignoring case, a quoted value ends at its
    closing quote and an unquoted value ends at the next semicolon, so
    following parameters or a trailing semicolon are not made part of the
    file name.

    @param content_disposition contents of the disposition header
    @type bytes
    @return object containing the extracted file name (empty string if the
        header contains no 'filename' parameter)
    @rtype ContentDisposition
    """
    # Header octets are decoded as ISO-8859-1, which accepts every byte
    # value; a strict UTF-8 decode would raise on legacy senders.
    header = content_disposition.decode('iso-8859-1')
    match = re.search(
        r'(?:^|[;\s])\s*filename\s*=\s*'
        r'(?:"((?:\\.|[^"\\])*)"|([^;]*))',
        header, re.IGNORECASE)
    if match is None:
        return ContentDisposition("")

    quoted, bare = match.groups()
    if quoted is not None:
        # drop the backslashes of quoted pairs
        path = re.sub(r'\\(.)', r'\1', quoted)
    else:
        path = bare.strip()
    return ContentDisposition(path)

eric ide

mercurial