E5Network/E5RFC6266.py

changeset 4318
c2f374ca452b
child 4541
e8ddd9d76414
equal deleted inserted replaced
4317:0de465a93200 4318:c2f374ca452b
1 # -*- coding: utf-8 -*-
2
3 # Copyright (c) 2015 Detlev Offenbach <detlev@die-offenbachs.de>
4 #
5
6 """
7 Module implementing a Content-Disposition parser in accordance with RFC 6266.
8 """
9
10 #
11 # This code is adapted from the rfc6266.py module of qutebrowser.
12 # Original copyright 2014-2015 Florian Bruhin (The Compiler)
13 # <mail@qutebrowser.org>
14 #
15
16 from __future__ import unicode_literals
17
18 try: # Py3
19 import urllib.parse as parse
20 except (ImportError):
21 import urlparse as parse # __IGNORE_WARNING__
22 import collections
23 import string
24 import re
25
26 try:
27 import pypeg2 as peg
28
class UniqueNamespace(peg.Namespace):
    """
    Class implementing a pyPEG2 namespace that rejects duplicate keys.
    """
    def __setitem__(self, key, value):
        """
        Special method to store an item unless its key is already present.

        @param key key for the item
        @param value value of the item
        @exception DuplicateParamError raised if the key has been set
            before
        """
        if key not in self:
            super().__setitem__(key, value)
            return

        # A second assignment to the same key is treated as an error.
        raise DuplicateParamError(key)
43
# RFC 2616: separators and control characters
separator_chars = "()<>@,;:\\\"/[]?={} \t"
ctl_chars = ''.join(map(chr, range(32))) + chr(127)
nontoken_chars = separator_chars + ctl_chars

# RFC 5987: characters allowed in an attribute value
attr_chars_nonalnum = '!#$&+-.^_`|~'
attr_chars = ''.join(
    (string.ascii_letters, string.digits, attr_chars_nonalnum))

# RFC 5987 gives this alternative construction of the token character class
token_chars = attr_chars + "*'%"

# Definitions from https://tools.ietf.org/html/rfc2616#section-2.2
# The token expression is built from attr_chars instead of using AnyBut,
# which might include non-ascii octets.
token_re = '[' + re.escape(token_chars) + ']+'
60
class Token(str):
    """
    Class representing a token as defined by RFC 2616, Section 2.2.
    """
    # one or more characters of the token character class
    grammar = re.compile(token_re)
66
67 # RFC 2616 says some linear whitespace (LWS) is in fact allowed in text
68 # and qdtext; however it also mentions folding that whitespace into
69 # a single SP (which isn't in CTL) before interpretation.
70 # Assume the caller already did that folding when parsing headers.
71
72 # Note: qdtext also allows non-ascii, which we choose to parse
73 # as ISO-8859-1; rejecting it entirely would also be permitted.
74 # Some broken browsers attempt encoding-sniffing, which is broken
75 # because the spec only allows iso, and because encoding-sniffing
76 # can mangle valid values.
77 # Everything else in this grammar (including RFC 5987 ext values)
78 # is in an ascii-safe encoding.
79
# qdtext: any single character except the double quote and the control
# characters
qdtext_re = '[^"' + re.escape(ctl_chars) + ']'
# quoted-pair: a backslash followed by any one of the 128 ASCII characters
quoted_pair_re = r'\\[' + re.escape(''.join(map(chr, range(128)))) + ']'
83
class QuotedString(str):
    """
    Class representing a quoted string (RFC 2616, Section 2.2).
    """
    grammar = re.compile(
        r'"(' + quoted_pair_re + '|' + qdtext_re + r')+"')

    def __str__(self):
        """
        Special method to get the string with the quoting removed.

        @return text between the first and last character with each
            backslash escape replaced by the escaped character
        @rtype str
        """
        # strip the surrounding quotes first, then drop the backslashes
        unquoted = super().__str__()[1:-1]
        return re.sub(r'\\(.)', r'\1', unquoted)
95
class Value(str):
    """
    Class representing a value (RFC 2616, Section 3.6).
    """
    # alternatives: a bare token or a quoted string
    grammar = [
        re.compile(token_re),
        QuotedString,
    ]
101
class Charset(str):
    """
    Class representing a charset (RFC 5987, Section 3.2.1).
    """
    # Only these two charsets are accepted (matched case-insensitively);
    # the spec reserves all others for future evolutions.
    grammar = re.compile('UTF-8|ISO-8859-1', re.IGNORECASE)
109
class Language(str):
    """
    Class representing a language-tag (RFC 5646, Section 2.1).

    Fixme: This grammar is not 100% correct yet.
    https://github.com/The-Compiler/qutebrowser/issues/105
    """
    # simplified: one or more ASCII letters, digits or hyphens
    grammar = re.compile('[A-Za-z0-9-]+')
118
# a single attribute character
attr_char_re = '[' + re.escape(attr_chars) + ']'
# a percent sign followed by exactly two hexadecimal digits
hex_digit_re = '%[{}]{{2}}'.format(string.hexdigits)
121
class ValueChars(str):
    """
    Class representing the value characters of an attribute.

    Fixme: Can we merge this with Value?
    https://github.com/The-Compiler/qutebrowser/issues/105
    """
    # any number of attribute characters or percent-encoded octets
    grammar = re.compile('(' + attr_char_re + '|' + hex_digit_re + ')*')
130
class ExtValue(peg.List):
    """
    Class representing an ext-value of an attribute (RFC 5987,
    Section 3.2).
    """
    # charset ' [language] ' value-chars
    grammar = peg.contiguous(
        Charset, "'", peg.optional(Language), "'", ValueChars)
137
class ExtToken(peg.Symbol):
    """
    Class representing a token that introduces an extended value
    (RFC 6266, Section 4.1).
    """
    # a token immediately followed by an asterisk
    regex = re.compile(token_re + r'\*')

    def __str__(self):
        """
        Special method to get the token in lower case.

        @return lower case string representation of the token
        @rtype str
        """
        text = super().__str__()
        return text.lower()
146
class NoExtToken(peg.Symbol):
    """
    Class representing a token that introduces a normal value
    (RFC 6266, Section 4.1).
    """
    # a token whose last character is not an asterisk
    regex = re.compile(token_re + r'(?<!\*)')

    def __str__(self):
        """
        Special method to get the token in lower case.

        @return lower case string representation of the token
        @rtype str
        """
        text = super().__str__()
        return text.lower()
155
class DispositionParm(str):
    """
    Class representing a parameter of the Disposition-Type header
    (RFC 6266, Section 4.1).
    """
    # name "=" value, with the name stored in the 'name' attribute
    grammar = (peg.attr('name', NoExtToken), '=', Value)
161
class ExtDispositionParm:
    """
    Class representing an extended parameter (RFC 6266, Section 4.1).
    """
    # name* "=" ext-value, with the name stored in the 'name' attribute
    grammar = (peg.attr('name', ExtToken), '=', ExtValue)

    def __init__(self, value, name=None):
        """
        Constructor

        @param value value of the parameter
        @param name name of the parameter
        """
        self.value = value
        self.name = name
171
class DispositionType(peg.List):
    """
    Class representing the disposition type (RFC 6266, Section 4.1).
    """
    # alternatives: one of the two standard types (case-insensitive) or
    # any other token
    grammar = [
        re.compile('(inline|attachment)', re.IGNORECASE),
        Token,
    ]
177
class DispositionParmList(UniqueNamespace):
    """
    Class representing a list of disposition parameters (RFC 6266,
    Section 4.1).
    """
    # any number of ';'-introduced parameters, extended or plain
    grammar = peg.maybe_some(
        ';', [ExtDispositionParm, DispositionParm])
183
class ContentDispositionValue:
    """
    Class representing a complete Content-Disposition value (RFC 6266,
    Section 4.1).
    """
    # The trailing optional ';' is nonconformant, but it has been seen in
    # the wild and browsers accept it.
    # http://greenbytes.de/tech/tc2231/#attwithasciifilenamenqs
    grammar = (
        peg.attr('dtype', DispositionType),
        peg.attr('params', DispositionParmList),
        peg.optional(';'),
    )
194
# result of decoding an extended value: the decoded text and its language
# tag
LangTagged = collections.namedtuple('LangTagged', 'string langtag')
196
class DuplicateParamError(Exception):
    """
    Class defining the exception raised for a parameter given twice.
    """
201
class InvalidISO8859Error(Exception):
    """
    Class defining the exception raised for a byte that is invalid in
    ISO-8859-1.
    """
206
class ContentDisposition:
    """
    Class recording the indications and hints about content disposition.

    They tell whether a file should be downloaded or displayed directly
    and hint at the file name to use in the download case.
    """
    def __init__(self, disposition='inline', assocs=None):
        """
        Constructor

        Used internally after parsing the header. Instances should
        generally be created by a factory function like parse_headers.

        @param disposition sequence whose single element is the disposition
            type; anything that is not of length one results in 'inline'
        @param assocs parameters of the header by name (a copy is stored)
        """
        self.disposition = (
            disposition[0] if len(disposition) == 1 else 'inline'
        )
        # Store a plain dict copy so that values can be replaced below.
        self.assocs = {} if assocs is None else dict(assocs)

        if 'filename*' not in self.assocs:
            return

        # Replace the extended parameter by its decoded text.
        ext_param = self.assocs['filename*']
        assert isinstance(ext_param, ExtDispositionParm)
        decoded = parse_ext_value(ext_param.value)
        self.assocs['filename*'] = decoded.string

    def filename(self):
        """
        Public method to get the file name from the header.

        On safety:
            The value records the intent of the sender.

            You shouldn't use this sender-controlled value as a filesystem
            path, it can be insecure. Serving files with this filename can
            be dangerous as well, due to a certain browser using the part
            after the dot for mime-sniffing. Saving it to a database is
            fine by itself though.

        @return value of 'filename*' if present, else the value of
            'filename', else None
        """
        # XXX Reject non-ascii 'filename' values (parsed via qdtext) here?
        for key in ('filename*', 'filename'):
            if key in self.assocs:
                return self.assocs[key]
        return None

    def is_inline(self):
        """
        Public method to check, if the file should be handled inline.

        If not, and unless your application supports other dispositions
        than the standard inline and attachment, it should be handled
        as an attachment.

        @return flag indicating the 'inline' disposition (compared
            case-insensitively)
        @rtype bool
        """
        return 'inline' == self.disposition.lower()
264
def normalize_ws(text):
    """
    Do LWS (linear whitespace) folding.

    @param text text to be folded
    @type str
    @return text with each run of whitespace replaced by one space and
        without leading and trailing whitespace
    @rtype str
    """
    words = text.split()
    return ' '.join(words)
270
def parse_headers(content_disposition):
    """
    Build a ContentDisposition from header values.

    A header that cannot be parsed, that gives a parameter twice or whose
    extended file name cannot be decoded results in a default (inline,
    no file name) object instead of an exception.

    @param content_disposition contents of the disposition header
    @type bytes
    @return object describing the content disposition
    @rtype ContentDisposition
    """
    # We allow non-ascii here (it will only be parsed inside of qdtext, and
    # rejected by the grammar if it appears in other places), although
    # parsing it can be ambiguous. Parsing it ensures that a non-ambiguous
    # filename* value won't get dismissed because of an unrelated ambiguity
    # in the filename parameter. But it does mean we occasionally give
    # less-than-certain values for some legacy senders.
    content_disposition = content_disposition.decode('iso-8859-1')

    # Our parsing is relaxed in these regards:
    # - The grammar allows a final ';' in the header;
    # - We do LWS-folding, and possibly normalise other broken
    #   whitespace, instead of rejecting non-lws-safe text.
    # XXX Would prefer to accept only the quoted whitespace
    # case, rather than normalising everything.
    content_disposition = normalize_ws(content_disposition)
    try:
        parsed = peg.parse(content_disposition, ContentDispositionValue)
        # The constructor decodes the 'filename*' value. That step raises
        # InvalidISO8859Error or UnicodeDecodeError for bad input, so it
        # has to be covered by the exception handler as well.
        return ContentDisposition(disposition=parsed.dtype,
                                  assocs=parsed.params)
    except (SyntaxError, DuplicateParamError, InvalidISO8859Error,
            UnicodeDecodeError):
        return ContentDisposition()
300
def parse_ext_value(val):
    """
    Parse the value of an extended attribute.

    @param val sequence of either (charset, language tag, coded value) or
        (charset, coded value)
    @return decoded value and its language tag (None, if not given)
    @rtype LangTagged
    @exception InvalidISO8859Error raised if an ISO-8859-1 value decodes
        to a character in the range 0x7F to 0x9F
    @exception UnicodeDecodeError raised if the percent-encoded octets are
        not valid in the given charset
    """
    if len(val) == 3:
        charset, langtag, coded = val
    else:
        charset, coded = val
        langtag = None
    decoded = parse.unquote(coded, charset, errors='strict')
    # The charset is matched case-insensitively by the grammar, so it has
    # to be compared case-insensitively here as well. Otherwise the check
    # is skipped for the upper case spelling 'ISO-8859-1'.
    if charset.lower() == 'iso-8859-1':
        # Fail if the filename contains an invalid ISO-8859-1 char
        for c in decoded:
            if 0x7F <= ord(c) <= 0x9F:
                raise InvalidISO8859Error(c)
    return LangTagged(decoded, langtag)
317
318 except ImportError:
class ContentDisposition:
    """
    Class serving as a surrogate that only stores a file name.

    It is used to hint at the file name a download should get.
    """
    def __init__(self, filename):
        """
        Constructor

        @param filename file name to be stored in this surrogate class
        @type str
        """
        self.__filename = filename

    def filename(self):
        """
        Public method to get the stored file name.

        @return file name as given to the constructor
        @rtype str
        """
        return self.__filename
344
def parse_headers(content_disposition):
    """
    Build a ContentDisposition from header values.

    This variant only extracts the value of the 'filename' parameter.

    @param content_disposition contents of the disposition header
    @type bytes
    @return object carrying the extracted file name (empty string, if the
        header contains no 'filename' parameter)
    @rtype ContentDisposition
    """
    try:
        header = content_disposition.decode()
    except UnicodeDecodeError:
        # Legacy senders put raw ISO-8859-1 bytes into the header. Every
        # byte is valid in that encoding, so this decode cannot fail.
        header = content_disposition.decode('iso-8859-1')

    # Parameter names are case-insensitive. A quoted value ends at its
    # closing quote (backslash escaped characters are skipped over), an
    # unquoted value ends at the next ';'. That keeps any parameters
    # following the file name out of the result.
    match = re.search(
        r'(?:^|[;\s])filename\s*=\s*(?:"((?:[^"\\]|\\.)*)"|([^;]*))',
        header, re.IGNORECASE)
    if match is None:
        return ContentDisposition("")

    quoted, unquoted = match.groups()
    path = quoted if quoted is not None else unquoted.strip()
    return ContentDisposition(path)

eric ide

mercurial