|
1 # -*- coding: utf-8 -*- |
|
2 |
|
3 # Copyright (c) 2017 - 2022 Detlev Offenbach <detlev@die-offenbachs.de> |
|
4 # |
|
5 |
|
6 """ |
|
7 Module implementing a URL representation suitable for Google Safe Browsing. |
|
8 """ |
|
9 |
|
10 import re |
|
11 import posixpath |
|
12 import socket |
|
13 import struct |
|
14 import hashlib |
|
15 import urllib.parse |
|
16 import contextlib |
|
17 |
|
18 import Preferences |
|
19 |
|
20 |
|
class SafeBrowsingUrl:
    """
    Class implementing a URL representation suitable for Google Safe Browsing.

    The URL is converted to its canonical form, expanded into the host
    suffix / path prefix combinations to be looked up and each of these
    combinations is hashed with SHA256.
    """

    #
    # Modeled after the URL class of the gglsbl package.
    # https://github.com/afilipovich/gglsbl
    #
    def __init__(self, url):
        """
        Constructor

        @param url URL to be embedded
        @type str
        """
        self.__url = url

    def hashes(self):
        """
        Public method to get the hashes of all possible permutations of the URL
        in canonical form.

        @yield URL hashes
        @ytype bytes
        """
        for variant in self.permutations(self.canonical()):
            yield self.digest(variant)

    def canonical(self):
        """
        Public method to convert the URL to the canonical form.

        The fragment and embedded tab, CR and LF characters are removed, the
        URL is unescaped repeatedly and escaped exactly once, the path is
        normalized (a trailing slash is kept only if the original path had
        one), the host is lower cased and stripped of surplus dots and hosts
        given as a single decimal or hexadecimal number are converted to
        dotted-quad notation.

        @return canonical form of the URL
        @rtype str
        @exception ValueError raised by urllib.parse, if the URL contains an
            invalid port
        """

        def fullUnescape(u):
            """
            Method to unescape a URL until it does not change anymore.

            This is done iteratively because the nesting depth of the percent
            escapes is controlled by the (untrusted) URL.

            @param u URL string to unescape
            @type str
            @return unescaped URL string
            @rtype str
            """
            while True:
                unescaped = urllib.parse.unquote(u)
                if unescaped == u:
                    return unescaped
                u = unescaped

        def quote(s):
            """
            Method to quote a string.

            @param s string to be quoted
            @type str
            @return quoted string
            @rtype str
            """
            safeChars = '!"$&\'()*+,-./:;<=>?@[\\]^_`{|}~'
            return urllib.parse.quote(s, safe=safeChars)

        url = self.__url.strip()
        url = url.replace("\n", "").replace("\r", "").replace("\t", "")
        url = url.split("#", 1)[0]
        if url.startswith("//"):
            # Protocol relative URL: only the slashes of the default scheme
            # prefix are dropped, the colon must stay ("https:" + "//host").
            # The preference is used as a complete prefix below, so it is
            # expected to end with "://".
            url = Preferences.getWebBrowser("DefaultScheme").rstrip("/") + url
        if "://" not in url:
            url = Preferences.getWebBrowser("DefaultScheme") + url
        url = quote(fullUnescape(url))
        urlParts = urllib.parse.urlsplit(url)
        if not urlParts.scheme:
            url = Preferences.getWebBrowser("DefaultScheme") + url
            urlParts = urllib.parse.urlsplit(url)
        protocol = urlParts.scheme
        # hostname is None for URLs without an authority (e.g. file:///...)
        host = fullUnescape(urlParts.hostname or "")
        path = fullUnescape(urlParts.path) or "/"
        query = urlParts.query
        if not query and "?" not in url:
            # distinguish "no query" from an empty query ("...?")
            query = None

        # normpath() drops a trailing slash; restore it only if the original
        # path had one. Appending it unconditionally would change the URL.
        hasTrailingSlash = path.endswith("/")
        path = posixpath.normpath(path).replace("//", "/")
        if hasTrailingSlash and not path.endswith("/"):
            path += "/"

        port = urlParts.port
        host = re.sub(r"\.+", ".", host.strip(".")).lower()
        # Hosts given as one integer are converted to dotted-quad notation;
        # values that do not fit into 32 bit are left untouched.
        if host.isdigit():
            with contextlib.suppress(ValueError, OverflowError, struct.error):
                host = socket.inet_ntoa(struct.pack("!I", int(host)))
        elif host.startswith("0x") and "." not in host:
            with contextlib.suppress(ValueError, OverflowError, struct.error):
                host = socket.inet_ntoa(struct.pack("!I", int(host, 16)))

        quotedPath = quote(path)
        quotedHost = quote(host)
        if port is not None:
            quotedHost = "{0}:{1}".format(quotedHost, port)
        canonicalUrl = "{0}://{1}{2}".format(protocol, quotedHost, quotedPath)
        if query is not None:
            canonicalUrl = "{0}?{1}".format(canonicalUrl, query)
        return canonicalUrl

    @staticmethod
    def permutations(url):
        """
        Static method to determine all permutations of host name and path
        which can be applied to blacklisted URLs.

        The scheme, user info and port are not part of the permutations. A
        URL without an authority part or with an empty host yields nothing.

        @param url URL string to be permuted
        @type str
        @yield permutated URL strings
        @ytype str
        """

        def hostPermutations(host):
            """
            Method to generate the permutations of the host name.

            @param host host name
            @type str
            @yield permutated host names
            @ytype str
            """
            # Only a host consisting entirely of a dotted quad is an IP
            # address; "1.2.3.4.example.com" is an ordinary host name.
            if re.fullmatch(r"\d+\.\d+\.\d+\.\d+", host):
                yield host
                return
            parts = host.split(".")
            partsLen = min(len(parts), 5)
            if partsLen > 4:
                yield host
            # suffixes made of the last partsLen ... 2 components
            for i in range(partsLen - 1):
                yield ".".join(parts[i - partsLen:])

        def pathPermutations(path):
            """
            Method to generate the permutations of the path.

            @param path path to be processed
            @type str
            @yield permutated paths
            @ytype str
            """
            yield path
            if "?" in path:
                path = path.split("?", 1)[0]
                yield path
            # up to four directory prefixes starting at the root
            curPath = ""
            for part in path.split("/")[:-1][:4]:
                curPath += part + "/"
                yield curPath

        # Split "[scheme:]//authority<rest>"; the authority ends at the first
        # "/", "#" or "?". This replaces the deprecated urllib.parse.split*()
        # helper functions.
        match = re.match(r"(?:[^/:]+:)?//([^/#?]*)(.*)", url, re.DOTALL)
        if match is None:
            return
        host, path = match.groups()
        if path and not path.startswith("/"):
            path = "/" + path
        # strip the user info and the port
        host = host.rpartition("@")[2]
        portMatch = re.fullmatch(r"(.*):[0-9]*", host, re.DOTALL)
        if portMatch:
            host = portMatch.group(1)

        seenPermutations = set()
        for h in hostPermutations(host):
            for p in pathPermutations(path):
                u = "{0}{1}".format(h, p)
                if u not in seenPermutations:
                    seenPermutations.add(u)
                    yield u

    @staticmethod
    def digest(url):
        """
        Static method to calculate the SHA256 digest of a URL string.

        @param url URL string
        @type str
        @return SHA256 digest of the URL string
        @rtype bytes
        """
        return hashlib.sha256(url.encode("utf-8")).digest()