|
1 # -*- coding: utf-8 -*- |
|
2 |
|
3 # Copyright (c) 2017 - 2019 Detlev Offenbach <detlev@die-offenbachs.de> |
|
4 # |
|
5 |
|
6 """ |
|
7 Module implementing a URL representation suitable for Google Safe Browsing. |
|
8 """ |
|
9 |
|
10 from __future__ import unicode_literals |
|
11 |
|
12 try: |
|
13 import urlparse # Py2 |
|
14 import urllib # Py2 |
|
15 except ImportError: |
|
16 import urllib.parse as urllib |
|
17 from urllib import parse as urlparse |
|
18 |
|
19 import re |
|
20 import posixpath |
|
21 import socket |
|
22 import struct |
|
23 import hashlib |
|
24 |
|
25 import Preferences |
|
26 |
|
27 |
|
class SafeBrowsingUrl(object):
    """
    Class implementing a URL representation suitable for Google Safe Browsing.

    It canonicalizes a URL, derives the host suffix / path prefix
    expressions of the canonical form and calculates their SHA256 digests.
    """
    #
    # Modeled after the URL class of the gglsbl package.
    # https://github.com/afilipovich/gglsbl
    #
    def __init__(self, url):
        """
        Constructor

        @param url URL to be embedded
        @type str
        """
        self.__url = url

    def hashes(self):
        """
        Public method to get the hashes of all possible permutations of the URL
        in canonical form.

        @return generator for the URL hashes
        @rtype generator of bytes
        """
        for variant in self.permutations(self.canonical()):
            yield self.digest(variant)

    def canonical(self):
        """
        Public method to convert the URL to the canonical form.

        The fragment and embedded CR/LF/TAB characters are removed, percent
        escapes are resolved repeatedly and re-applied exactly once, the
        path is normalized, the host is lower cased and purely numeric
        (decimal or hexadecimal) hosts are converted to dotted IPv4
        notation.

        @return canonical form of the URL
        @rtype str
        """
        def fullUnescape(u):
            """
            Method to repeatedly unescape a URL until it is stable.

            @param u URL string to unescape
            @type str
            @return unescaped URL string
            @rtype str
            """
            # iterative to stay clear of the recursion limit for deeply
            # nested escapes
            while True:
                unescaped = urllib.unquote(u)
                if unescaped == u:
                    return unescaped
                u = unescaped

        def quote(s):
            """
            Method to quote a string.

            @param s string to be quoted
            @type str
            @return quoted string
            @rtype str
            """
            safeChars = '!"$&\'()*+,-./:;<=>?@[\\]^_`{|}~'
            return urllib.quote(s, safe=safeChars)

        url = self.__url.strip()
        url = url.replace('\n', '').replace('\r', '').replace('\t', '')
        # the fragment is not part of the canonical form
        url = url.split('#', 1)[0]
        if url.startswith('//'):
            # Scheme relative URL: prepend the default scheme including the
            # colon.
            # NOTE(review): assumes the "DefaultScheme" preference ends with
            # '://' (as implied by its use below); stripping three characters
            # would drop the colon and yield e.g. 'https//host'.
            url = Preferences.getWebBrowser("DefaultScheme")[:-2] + url
        if len(url.split('://')) <= 1:
            url = Preferences.getWebBrowser("DefaultScheme") + url
        url = quote(fullUnescape(url))
        urlParts = urlparse.urlsplit(url)
        if not urlParts[0]:
            url = Preferences.getWebBrowser("DefaultScheme") + url
            urlParts = urlparse.urlsplit(url)
        protocol = urlParts.scheme
        # hostname is None for URLs without a network location
        host = fullUnescape(urlParts.hostname or '')
        path = fullUnescape(urlParts.path)
        query = urlParts.query
        if not query and '?' not in url:
            # distinguish 'no query' from an empty query ('...?')
            query = None
        if not path:
            path = '/'
        # normpath() strips a trailing slash; restore it only if the path
        # had one, otherwise file paths would be turned into directories
        # and the exact path would never be hashed
        hasTrailingSlash = path.endswith('/')
        path = posixpath.normpath(path).replace('//', '/')
        if hasTrailingSlash and not path.endswith('/'):
            path += '/'
        port = urlParts.port
        host = host.strip('.')
        host = re.sub(r'\.+', '.', host).lower()
        if host.isdigit():
            # decimal IPv4 address, e.g. 3279880203
            try:
                host = socket.inet_ntoa(struct.pack("!I", int(host)))
            except (struct.error, ValueError, OverflowError):
                # not a 32 bit value; keep the host as is
                pass
        if host.startswith('0x') and '.' not in host:
            # hexadecimal IPv4 address, e.g. 0xc37f000b
            try:
                host = socket.inet_ntoa(struct.pack("!I", int(host, 16)))
            except (struct.error, ValueError, OverflowError):
                # not a 32 bit hexadecimal value; keep the host as is
                pass
        quotedPath = quote(path)
        quotedHost = quote(host)
        if port is not None:
            quotedHost = '{0}:{1}'.format(quotedHost, port)
        canonicalUrl = '{0}://{1}{2}'.format(protocol, quotedHost, quotedPath)
        if query is not None:
            canonicalUrl = '{0}?{1}'.format(canonicalUrl, query)
        return canonicalUrl

    @staticmethod
    def permutations(url):
        """
        Static method to determine all permutations of host name and path
        which can be applied to blacklisted URLs.

        Each permutation is yielded only once; scheme, user info and port
        are not part of the generated strings.

        @param url URL string to be permuted
        @type str
        @return generator of permuted URL strings
        @rtype generator of str
        """
        def hostPermutations(host):
            """
            Method to generate the permutations of the host name.

            The exact host is always generated first, followed by the
            suffixes built from at most the last five components. An IPv4
            address is only generated as is.

            @param host host name
            @type str
            @return generator of permuted host names
            @rtype generator of str
            """
            yield host
            # anchored, so that names merely starting with four numeric
            # labels (e.g. 1.2.3.4.example.com) are not taken for an IP
            if re.match(r'\d+\.\d+\.\d+\.\d+$', host):
                return
            parts = host.split('.')
            partsLen = min(len(parts), 5)
            for i in range(partsLen - 1):
                candidate = '.'.join(parts[i - partsLen:])
                if candidate != host:
                    yield candidate

        def pathPermutations(path):
            """
            Method to generate the permutations of the path.

            These are the path with query, the path without query and up
            to four leading directory prefixes starting with the root.

            @param path path to be processed
            @type str
            @return generator of permuted paths
            @rtype generator of str
            """
            yield path
            if '?' in path:
                path = path.split('?', 1)[0]
                yield path
            curPath = ''
            # the last element is the file name (or empty) and is skipped
            for part in path.split('/')[:-1][:4]:
                curPath = curPath + part + '/'
                yield curPath

        # NOTE: the split*() helpers are deprecated in Python 3 but are
        # available for both Python 2 (urllib) and Python 3 (urllib.parse).
        addressStr = urllib.splittype(url)[1]
        host, path = urllib.splithost(addressStr)
        host = urllib.splituser(host)[1]
        host = urllib.splitport(host)[0]
        host = host.strip('/')
        seenPermutations = set()
        for h in hostPermutations(host):
            for p in pathPermutations(path):
                u = '{0}{1}'.format(h, p)
                if u not in seenPermutations:
                    seenPermutations.add(u)
                    yield u

    @staticmethod
    def digest(url):
        """
        Static method to calculate the SHA256 digest of a URL string.

        @param url URL string
        @type str
        @return SHA256 digest of the UTF-8 encoded URL string
        @rtype bytes
        """
        return hashlib.sha256(url.encode('utf-8')).digest()