--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/WebBrowser/SafeBrowsing/SafeBrowsingUrl.py Sun Jul 16 19:34:54 2017 +0200 @@ -0,0 +1,134 @@ +# -*- coding: utf-8 -*- + +# Copyright (c) 2017 Detlev Offenbach <detlev@die-offenbachs.de> +# + +""" +Module implementing an URL representation suitable for Google Safe Browsing. +""" + +from __future__ import unicode_literals + +try: + import urlparse # Py2 + import urllib # Py2 +except ImportError: + import urllib.parse as urllib + from urllib import parse as urlparse + +import re +import posixpath +import socket +import struct + +import Preferences + + +class SafeBrowsingUrl(object): + """ + Class implementing an URL representation suitable for Google Safe Browsing. + """ + # + # Modeled after the URL class of the gglsbl package. + # https://github.com/afilipovich/gglsbl + # + def __init__(self, url): + """ + Constructor + + @param url URL to be embedded + @type str + """ + self.__url = url + + def hashes(self): + """ + Public method to get the hashes of all possible permutations of the URL + in canonical form. + + @return generator for the URL hashes + @rtype generator of str + """ + for variant in self.permutations(self.canonical()): + urlHash = self.digest(variant) + yield urlHash + + def canonical(self): + """ + Public method to convert the URL to the canonical form. + + @return canonical form of the URL + @rtype str + """ + def fullUnescape(u): + """ + Method to recursively unescape an URL. + + @param u URL string to unescape + @type str + @return unescaped URL string + @rtype str + """ + uu = urllib.unquote(u) + if uu == u: + return uu + else: + return fullUnescape(uu) + + def quote(s): + """ + Method to quote a string. + + @param string to be quoted + @type str + @return quoted string + @rtype str + """ + safeChars = '!"$&\'()*+,-./:;<=>?@[\\]^_`{|}~' + return urllib.quote(s, safe=safeChars) + + url = self.__url.strip() + url = url.replace('\n', '').replace('\r', '').replace('\t', '') + url = url.split('#', 1)[0] + if url.startswith('//'): + url = Preferences.getWebBrowser("DefaultScheme")[:-3] + url + if len(url.split('://')) <= 1: + url = Preferences.getWebBrowser("DefaultScheme") + url + url = quote(fullUnescape(url)) + urlParts = urlparse.urlsplit(url) + if not urlParts[0]: + url = Preferences.getWebBrowser("DefaultScheme") + url + urlParts = urlparse.urlsplit(url) + protocol = urlParts.scheme + host = fullUnescape(urlParts.hostname) + path = fullUnescape(urlParts.path) + query = urlParts.query + if not query and '?' not in url: + query = None + if not path: + path = '/' + hasTrailingSlash = (path[-1] == '/') + path = posixpath.normpath(path).replace('//', '/') + if hasTrailingSlash and path[-1] != '/': + path += '/' + port = urlParts.port + host = host.strip('.') + host = re.sub(r'\.+', '.', host).lower() + if host.isdigit(): + try: + host = socket.inet_ntoa(struct.pack("!I", int(host))) + except Exception: + pass + if host.startswith('0x') and '.' not in host: + try: + host = socket.inet_ntoa(struct.pack("!I", int(host, 16))) + except Exception: + pass + quotedPath = quote(path) + quotedHost = quote(host) + if port is not None: + quotedHost = '{0}:{1}'.format(quotedHost, port) + canonicalUrl = '{0}://{1}{2}'.format(protocol, quotedHost, quotedPath) + if query is not None: + canonicalUrl = '{0}?{1}'.format(canonicalUrl, query) + return canonicalUrl