--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/E5Network/E5RFC6266.py Sat Jul 18 16:50:24 2015 +0200 @@ -0,0 +1,360 @@ +# -*- coding: utf-8 -*- + +# Copyright (c) 2015 Detlev Offenbach <detlev@die-offenbachs.de> +# + +""" +Module implementing a Content-Disposition parser iaw. RFC 6266. +""" + +# +# This code is adapted from the rfc6266.py module of qutebrowser. +# Original copyright 2014-2015 Florian Bruhin (The Compiler) +# <mail@qutebrowser.org> +# + +from __future__ import unicode_literals + +try: # Py3 + import urllib.parse as parse +except (ImportError): + import urlparse as parse # __IGNORE_WARNING__ +import collections +import string +import re + +try: + import pypeg2 as peg + + class UniqueNamespace(peg.Namespace): + """ + A pyPEG2 namespace which prevents setting a value twice. + """ + def __setitem__(self, key, value): + """ + Special method to set an item. + + @param key key for the item + @param value value of the item + """ + if key in self: + raise DuplicateParamError(key) + super().__setitem__(key, value) + + # RFC 2616 + separator_chars = "()<>@,;:\\\"/[]?={} \t" + ctl_chars = ''.join(chr(i) for i in range(32)) + chr(127) + nontoken_chars = separator_chars + ctl_chars + + # RFC 5987 + attr_chars_nonalnum = '!#$&+-.^_`|~' + attr_chars = string.ascii_letters + string.digits + attr_chars_nonalnum + + # RFC 5987 gives this alternative construction of the token character class + token_chars = attr_chars + "*'%" + + # Definitions from https://tools.ietf.org/html/rfc2616#section-2.2 + # token was redefined from attr_chars to avoid using AnyBut, + # which might include non-ascii octets. + token_re = '[{}]+'.format(re.escape(token_chars)) + + class Token(str): + """ + A token (RFC 2616, Section 2.2). + """ + grammar = re.compile(token_re) + + # RFC 2616 says some linear whitespace (LWS) is in fact allowed in text + # and qdtext; however it also mentions folding that whitespace into + # a single SP (which isn't in CTL) before interpretation. + # Assume the caller already that folding when parsing headers. + + # Note: qdtext also allows non-ascii, which we choose to parse + # as ISO-8859-1; rejecting it entirely would also be permitted. + # Some broken browsers attempt encoding-sniffing, which is broken + # because the spec only allows iso, and because encoding-sniffing + # can mangle valid values. + # Everything else in this grammar (including RFC 5987 ext values) + # is in an ascii-safe encoding. + + qdtext_re = r'[^"{}]'.format(re.escape(ctl_chars)) + quoted_pair_re = r'\\[{}]'.format(re.escape( + ''.join(chr(i) for i in range(128)))) + + class QuotedString(str): + """ + A quoted string (RFC 2616, Section 2.2). + """ + grammar = re.compile(r'"({}|{})+"'.format(quoted_pair_re, qdtext_re)) + + def __str__(self): + s = super().__str__() + s = s[1:-1] # remove quotes + s = re.sub(r'\\(.)', r'\1', s) # drop backslashes + return s + + class Value(str): + """ + A value. (RFC 2616, Section 3.6). + """ + grammar = [re.compile(token_re), QuotedString] + + class Charset(str): + """ + A charset (RFC5987, Section 3.2.1). + """ + # Other charsets are forbidden, the spec reserves them + # for future evolutions. + grammar = re.compile('UTF-8|ISO-8859-1', re.I) + + class Language(str): + """ + A language-tag (RFC 5646, Section 2.1). + + Fixme: This grammar is not 100% correct yet. + https://github.com/The-Compiler/qutebrowser/issues/105 + """ + grammar = re.compile('[A-Za-z0-9-]+') + + attr_char_re = '[{}]'.format(re.escape(attr_chars)) + hex_digit_re = '%[' + string.hexdigits + ']{2}' + + class ValueChars(str): + """ + A value of an attribute. + + Fixme: Can we merge this with Value? + https://github.com/The-Compiler/qutebrowser/issues/105 + """ + grammar = re.compile('({}|{})*'.format(attr_char_re, hex_digit_re)) + + class ExtValue(peg.List): + """ + An ext-value of an attribute (RFC 5987, Section 3.2). + """ + grammar = peg.contiguous(Charset, "'", peg.optional(Language), "'", + ValueChars) + + class ExtToken(peg.Symbol): + """ + A token introducing an extended value (RFC 6266, Section 4.1). + """ + regex = re.compile(token_re + r'\*') + + def __str__(self): + return super().__str__().lower() + + class NoExtToken(peg.Symbol): + """ + A token introducing a normal value (RFC 6266, Section 4.1). + """ + regex = re.compile(token_re + r'(?<!\*)') + + def __str__(self): + return super().__str__().lower() + + class DispositionParm(str): + """ + A parameter for the Disposition-Type header (RFC6266, Section 4.1). + """ + grammar = peg.attr('name', NoExtToken), '=', Value + + class ExtDispositionParm: + """ + An extended parameter (RFC6266, Section 4.1). + """ + grammar = peg.attr('name', ExtToken), '=', ExtValue + + def __init__(self, value, name=None): + self.name = name + self.value = value + + class DispositionType(peg.List): + """ + The disposition type (RFC6266, Section 4.1). + """ + grammar = [re.compile('(inline|attachment)', re.I), Token] + + class DispositionParmList(UniqueNamespace): + """ + A list of disposition parameters (RFC6266, Section 4.1). + """ + grammar = peg.maybe_some(';', [ExtDispositionParm, DispositionParm]) + + class ContentDispositionValue: + """ + A complete Content-Disposition value (RFC 6266, Section 4.1). + """ + # Allows nonconformant final semicolon + # I've seen it in the wild, and browsers accept it + # http://greenbytes.de/tech/tc2231/#attwithasciifilenamenqs + grammar = (peg.attr('dtype', DispositionType), + peg.attr('params', DispositionParmList), + peg.optional(';')) + + LangTagged = collections.namedtuple('LangTagged', ['string', 'langtag']) + + class DuplicateParamError(Exception): + """ + Exception raised when a parameter has been given twice. + """ + + class InvalidISO8859Error(Exception): + """ + Exception raised when a byte is invalid in ISO-8859-1. + """ + + class ContentDisposition: + """ + Records various indications and hints about content disposition. + + These can be used to know if a file should be downloaded or + displayed directly, and to hint what filename it should have + in the download case. + """ + def __init__(self, disposition='inline', assocs=None): + """ + Used internally after parsing the header. + + Instances should generally be created from a factory + function, such as parse_headers and its variants. + """ + if len(disposition) != 1: + self.disposition = 'inline' + else: + self.disposition = disposition[0] + if assocs is None: + self.assocs = {} + else: + self.assocs = dict(assocs) # So we can change values + if 'filename*' in self.assocs: + param = self.assocs['filename*'] + assert isinstance(param, ExtDispositionParm) + self.assocs['filename*'] = \ + parse_ext_value(param.value).string + + def filename(self): + """ + The filename from the Content-Disposition header or None. + + On safety: + This property records the intent of the sender. + + You shouldn't use this sender-controlled value as a filesystem + path, it can be insecure. Serving files with this filename can be + dangerous as well, due to a certain browser using the part after + the dot for mime-sniffing. Saving it to a database is fine by + itself though. + """ + if 'filename*' in self.assocs: + return self.assocs['filename*'] + elif 'filename' in self.assocs: + # XXX Reject non-ascii (parsed via qdtext) here? + return self.assocs['filename'] + + def is_inline(self): + """ + Return if the file should be handled inline. + + If not, and unless your application supports other dispositions + than the standard inline and attachment, it should be handled + as an attachment. + """ + return self.disposition.lower() == 'inline' + + def normalize_ws(text): + """ + Do LWS (linear whitespace) folding. + """ + return ' '.join(text.split()) + + def parse_headers(content_disposition): + """ + Build a ContentDisposition from header values. + + @param content_disposition contents of the disposition header + @type bytes + """ + # We allow non-ascii here (it will only be parsed inside of qdtext, and + # rejected by the grammar if it appears in other places), although + # parsing it can be ambiguous. Parsing it ensures that a non-ambiguous + # filename* value won't get dismissed because of an unrelated ambiguity + # in the filename parameter. But it does mean we occasionally give + # less-than-certain values for some legacy senders. + content_disposition = content_disposition.decode('iso-8859-1') + + # Our parsing is relaxed in these regards: + # - The grammar allows a final ';' in the header; + # - We do LWS-folding, and possibly normalise other broken + # whitespace, instead of rejecting non-lws-safe text. + # XXX Would prefer to accept only the quoted whitespace + # case, rather than normalising everything. + content_disposition = normalize_ws(content_disposition) + try: + parsed = peg.parse(content_disposition, ContentDispositionValue) + except (SyntaxError, DuplicateParamError, InvalidISO8859Error): + return ContentDisposition() + else: + return ContentDisposition(disposition=parsed.dtype, + assocs=parsed.params) + + def parse_ext_value(val): + """ + Parse the value of an extended attribute. + """ + if len(val) == 3: + charset, langtag, coded = val + else: + charset, coded = val + langtag = None + decoded = parse.unquote(coded, charset, errors='strict') + if charset == 'iso-8859-1': + # Fail if the filename contains an invalid ISO-8859-1 char + for c in decoded: + if 0x7F <= ord(c) <= 0x9F: + raise InvalidISO8859Error(c) + return LangTagged(decoded, langtag) + +except ImportError: + class ContentDisposition: + """ + Records various indications and hints about content disposition. + + These can be used to know if a file should be downloaded or + displayed directly, and to hint what filename it should have + in the download case. + """ + def __init__(self, filename): + """ + Constructor + + @param filename file name to be stored in this surrogate class + @type str + """ + self.__filename = filename + + def filename(self): + """ + Public method to get the stored file name + + @return file name + @rtype str + """ + return self.__filename + + def parse_headers(content_disposition): + """ + Build a ContentDisposition from header values. + + @param content_disposition contents of the disposition header + @type bytes + """ + header = content_disposition.decode() + if header: + pos = header.find("filename=") + if pos != -1: + path = header[pos + 9:] + if path.startswith('"') and path.endswith('"'): + path = path[1:-1] + return ContentDisposition(path) + return ContentDisposition("")