Sun, 16 Jul 2017 19:34:54 +0200
Started implementing the SafeBrowsingUrl class.
# -*- coding: utf-8 -*- # Copyright (c) 2017 Detlev Offenbach <detlev@die-offenbachs.de> # """ Module implementing an URL representation suitable for Google Safe Browsing. """ from __future__ import unicode_literals try: import urlparse # Py2 import urllib # Py2 except ImportError: import urllib.parse as urllib from urllib import parse as urlparse import re import posixpath import socket import struct import Preferences class SafeBrowsingUrl(object): """ Class implementing an URL representation suitable for Google Safe Browsing. """ # # Modeled after the URL class of the gglsbl package. # https://github.com/afilipovich/gglsbl # def __init__(self, url): """ Constructor @param url URL to be embedded @type str """ self.__url = url def hashes(self): """ Public method to get the hashes of all possible permutations of the URL in canonical form. @return generator for the URL hashes @rtype generator of str """ for variant in self.permutations(self.canonical()): urlHash = self.digest(variant) yield urlHash def canonical(self): """ Public method to convert the URL to the canonical form. @return canonical form of the URL @rtype str """ def fullUnescape(u): """ Method to recursively unescape an URL. @param u URL string to unescape @type str @return unescaped URL string @rtype str """ uu = urllib.unquote(u) if uu == u: return uu else: return fullUnescape(uu) def quote(s): """ Method to quote a string. @param string to be quoted @type str @return quoted string @rtype str """ safeChars = '!"$&\'()*+,-./:;<=>?@[\\]^_`{|}~' return urllib.quote(s, safe=safeChars) url = self.__url.strip() url = url.replace('\n', '').replace('\r', '').replace('\t', '') url = url.split('#', 1)[0] if url.startswith('//'): url = Preferences.getWebBrowser("DefaultScheme")[:-3] + url if len(url.split('://')) <= 1: url = Preferences.getWebBrowser("DefaultScheme") + url url = quote(fullUnescape(url)) urlParts = urlparse.urlsplit(url) if not urlParts[0]: url = Preferences.getWebBrowser("DefaultScheme") + url urlParts = urlparse.urlsplit(url) protocol = urlParts.scheme host = fullUnescape(urlParts.hostname) path = fullUnescape(urlParts.path) query = urlParts.query if not query and '?' not in url: query = None if not path: path = '/' hasTrailingSlash = (path[-1] == '/') path = posixpath.normpath(path).replace('//', '/') if hasTrailingSlash and path[-1] != '/': path += '/' port = urlParts.port host = host.strip('.') host = re.sub(r'\.+', '.', host).lower() if host.isdigit(): try: host = socket.inet_ntoa(struct.pack("!I", int(host))) except Exception: pass if host.startswith('0x') and '.' not in host: try: host = socket.inet_ntoa(struct.pack("!I", int(host, 16))) except Exception: pass quotedPath = quote(path) quotedHost = quote(host) if port is not None: quotedHost = '{0}:{1}'.format(quotedHost, port) canonicalUrl = '{0}://{1}{2}'.format(protocol, quotedHost, quotedPath) if query is not None: canonicalUrl = '{0}?{1}'.format(canonicalUrl, query) return canonicalUrl