src/eric7/WebBrowser/SafeBrowsing/SafeBrowsingUrl.py

branch
eric7
changeset 9209
b99e7fd55fd3
parent 8881
54e42bc2437a
child 9221
bf71ee032bb4
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/eric7/WebBrowser/SafeBrowsing/SafeBrowsingUrl.py	Thu Jul 07 11:23:56 2022 +0200
@@ -0,0 +1,199 @@
+# -*- coding: utf-8 -*-
+
+# Copyright (c) 2017 - 2022 Detlev Offenbach <detlev@die-offenbachs.de>
+#
+
+"""
+Module implementing an URL representation suitable for Google Safe Browsing.
+"""
+
+import re
+import posixpath
+import socket
+import struct
+import hashlib
+import urllib.parse
+import contextlib
+
+import Preferences
+
+
class SafeBrowsingUrl:
    """
    Class implementing a URL representation suitable for Google Safe Browsing.

    The URL is brought into its canonical form and expanded into the
    host suffix / path prefix expressions, whose SHA256 digests are the
    values looked up in the threat lists.
    """
    #
    # Modeled after the URL class of the gglsbl package.
    # https://github.com/afilipovich/gglsbl
    #
    def __init__(self, url):
        """
        Constructor

        @param url URL to be embedded
        @type str
        """
        self.__url = url

    def hashes(self):
        """
        Public method to get the hashes of all possible permutations of the URL
        in canonical form.

        @yield URL hashes
        @ytype bytes
        """
        for variant in self.permutations(self.canonical()):
            yield self.digest(variant)

    def canonical(self):
        """
        Public method to convert the URL to the canonical form.

        A URL without a host part (e.g. 'file:///path') is canonicalized
        with an empty host and a port that cannot be parsed is dropped.

        @return canonical form of the URL
        @rtype str
        """
        def fullUnescape(u):
            """
            Method to repeatedly unescape an URL until it does not change
            anymore.

            @param u URL string to unescape
            @type str
            @return unescaped URL string
            @rtype str
            """
            # Iterate instead of recursing: the number of escape layers is
            # controlled by the (untrusted) URL and must not be able to
            # exhaust the recursion limit.
            while True:
                uu = urllib.parse.unquote(u)
                if uu == u:
                    return uu
                u = uu

        def quote(s):
            """
            Method to quote a string.

            @param s string to be quoted
            @type str
            @return quoted string
            @rtype str
            """
            safeChars = '!"$&\'()*+,-./:;<=>?@[\\]^_`{|}~'
            return urllib.parse.quote(s, safe=safeChars)

        def dottedQuad(number):
            """
            Method to convert a 32 bit number to a dotted IPv4 address.

            @param number numeric IPv4 address
            @type int
            @return IPv4 address in dotted notation
            @rtype str
            """
            return socket.inet_ntoa(struct.pack("!I", number))

        # errors signaling that a host is not a numeric IPv4 address
        conversionErrors = (
            ValueError, OverflowError, OSError, struct.error
        )

        url = self.__url.strip()
        url = url.replace('\n', '').replace('\r', '').replace('\t', '')
        # the fragment is not part of the canonical URL
        url = url.split('#', 1)[0]
        if url.startswith('//'):
            # Protocol relative URL: the default scheme is stored with a
            # trailing '://', so drop only the '//' and keep the colon.
            url = Preferences.getWebBrowser("DefaultScheme")[:-2] + url
        if '://' not in url:
            url = Preferences.getWebBrowser("DefaultScheme") + url
        url = quote(fullUnescape(url))
        urlParts = urllib.parse.urlsplit(url)
        if not urlParts.scheme:
            url = Preferences.getWebBrowser("DefaultScheme") + url
            urlParts = urllib.parse.urlsplit(url)
        protocol = urlParts.scheme
        # hostname is None for URLs without a network location
        host = fullUnescape(urlParts.hostname or '')
        path = fullUnescape(urlParts.path) or '/'
        query = urlParts.query
        if not query and '?' not in url:
            # distinguish 'no query' from an empty query ('...?')
            query = None
        # normpath() strips a trailing slash; restore it only if the
        # original path had one, so '/foo' and '/foo/' stay distinct.
        hasTrailingSlash = path.endswith('/')
        path = posixpath.normpath(path).replace('//', '/')
        if hasTrailingSlash and not path.endswith('/'):
            path += '/'
        try:
            port = urlParts.port
        except ValueError:
            # non-numeric or out of range port
            port = None
        host = host.strip('.')
        host = re.sub(r'\.+', '.', host).lower()
        if host.isdigit():
            with contextlib.suppress(*conversionErrors):
                host = dottedQuad(int(host))
        if host.startswith('0x') and '.' not in host:
            with contextlib.suppress(*conversionErrors):
                host = dottedQuad(int(host, 16))
        quotedPath = quote(path)
        quotedHost = quote(host)
        if port is not None:
            quotedHost = '{0}:{1}'.format(quotedHost, port)
        canonicalUrl = '{0}://{1}{2}'.format(protocol, quotedHost, quotedPath)
        if query is not None:
            canonicalUrl = '{0}?{1}'.format(canonicalUrl, query)
        return canonicalUrl

    @staticmethod
    def permutations(url):
        """
        Static method to determine all permutations of host name and path
        which can be applied to blacklisted URLs.

        User info and port are not part of the generated expressions. An URL
        without a host part yields nothing.

        @param url URL string to be permuted
        @type str
        @yield permutated URL strings
        @ytype str
        """
        def hostPermutations(host):
            """
            Method to generate the permutations of the host name.

            These are the exact host name followed by up to four suffixes
            built from its last five components (the bare top level domain
            is skipped). An IPv4 address is never split.

            @param host host name
            @type str
            @yield permutated host names
            @ytype str
            """
            yield host
            # the complete host must be an address; a prefix match would
            # treat e.g. '1.2.3.4.example.com' as an IP address
            if re.fullmatch(r'\d+\.\d+\.\d+\.\d+', host):
                return
            parts = host.split('.')
            for start in range(max(len(parts) - 5, 0), len(parts) - 1):
                suffix = '.'.join(parts[start:])
                if suffix != host:
                    yield suffix

        def pathPermutations(path):
            """
            Method to generate the permutations of the path.

            These are the path with query, the path without query and the
            first (up to) four directory prefixes starting at the root.

            @param path path to be processed
            @type str
            @yield permutated paths
            @ytype str
            """
            yield path
            if '?' in path:
                path = path.split('?', 1)[0]
                yield path
            curPath = ''
            # the last element is the file name (or empty), not a directory
            for part in path.split('/')[:-1][:4]:
                curPath += part + '/'
                yield curPath

        # strip the leading 'scheme:'
        match = re.match(r'[^/:]+:(.*)', url, re.DOTALL)
        address = match.group(1) if match else url
        # split '//host[:port]' from the remainder (path and query)
        match = re.match(r'//([^/#?]*)(.*)', address, re.DOTALL)
        if match is None:
            return
        host, path = match.groups()
        if path and not path.startswith('/'):
            path = '/' + path
        # drop 'user[:password]@'
        host = host.rpartition('@')[2]
        # drop ':port'
        match = re.fullmatch(r'(.*):([0-9]+)', host, re.DOTALL)
        if match:
            host = match.group(1)
        host = host.strip('/')
        if not host:
            return

        seenPermutations = set()
        for h in hostPermutations(host):
            for p in pathPermutations(path):
                u = '{0}{1}'.format(h, p)
                if u not in seenPermutations:
                    seenPermutations.add(u)
                    yield u

    @staticmethod
    def digest(url):
        """
        Static method to calculate the SHA256 digest of an URL string.

        @param url URL string
        @type str
        @return SHA256 digest of the URL string
        @rtype bytes
        """
        return hashlib.sha256(url.encode('utf-8')).digest()

eric ide

mercurial