Wed, 30 Dec 2020 11:00:05 +0100
Updated copyright for 2021.
5808
7bf90dcae4e1
Started implementing the SafeBrowsingUrl class.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
1 | # -*- coding: utf-8 -*- |
7bf90dcae4e1
Started implementing the SafeBrowsingUrl class.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
2 | |
7923
91e843545d9a
Updated copyright for 2021.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
7781
diff
changeset
|
3 | # Copyright (c) 2017 - 2021 Detlev Offenbach <detlev@die-offenbachs.de> |
5808
7bf90dcae4e1
Started implementing the SafeBrowsingUrl class.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
4 | # |
7bf90dcae4e1
Started implementing the SafeBrowsingUrl class.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
5 | |
7bf90dcae4e1
Started implementing the SafeBrowsingUrl class.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
6 | """ |
7bf90dcae4e1
Started implementing the SafeBrowsingUrl class.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
7 | Module implementing an URL representation suitable for Google Safe Browsing. |
7bf90dcae4e1
Started implementing the SafeBrowsingUrl class.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
8 | """ |
7bf90dcae4e1
Started implementing the SafeBrowsingUrl class.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
9 | |
7bf90dcae4e1
Started implementing the SafeBrowsingUrl class.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
10 | import re |
7bf90dcae4e1
Started implementing the SafeBrowsingUrl class.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
11 | import posixpath |
7bf90dcae4e1
Started implementing the SafeBrowsingUrl class.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
12 | import socket |
7bf90dcae4e1
Started implementing the SafeBrowsingUrl class.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
13 | import struct |
5809
5b53c17b7d93
Done implementing the SafeBrowsingUrl class.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
5808
diff
changeset
|
14 | import hashlib |
7192
a22eee00b052
Started removing runtime support for Python2 and PyQt4.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
6942
diff
changeset
|
15 | import urllib.parse |
5808
7bf90dcae4e1
Started implementing the SafeBrowsingUrl class.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
16 | |
7bf90dcae4e1
Started implementing the SafeBrowsingUrl class.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
17 | import Preferences |
7bf90dcae4e1
Started implementing the SafeBrowsingUrl class.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
18 | |
7bf90dcae4e1
Started implementing the SafeBrowsingUrl class.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
19 | |
7bf90dcae4e1
Started implementing the SafeBrowsingUrl class.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
class SafeBrowsingUrl(object):
    """
    Class implementing an URL representation suitable for Google Safe Browsing.

    The class converts an URL to its canonical form, derives the host
    suffix / path prefix expressions to be looked up and calculates their
    SHA256 digests.
    """
    #
    # Modeled after the URL class of the gglsbl package.
    # https://github.com/afilipovich/gglsbl
    #
    def __init__(self, url):
        """
        Constructor

        @param url URL to be embedded
        @type str
        """
        self.__url = url

    def hashes(self):
        """
        Public method to get the hashes of all possible permutations of the URL
        in canonical form.

        @return generator for the URL hashes
        @rtype generator of bytes
        """
        for variant in self.permutations(self.canonical()):
            yield self.digest(variant)

    def canonical(self):
        """
        Public method to convert the URL to the canonical form.

        @return canonical form of the URL
        @rtype str
        @exception ValueError raised by the URL parser for URLs it cannot
            handle (e.g. a non-numeric port)
        """
        def fullUnescape(u):
            """
            Method to repeatedly unescape an URL until it does not change
            anymore.

            @param u URL string to unescape
            @type str
            @return unescaped URL string
            @rtype str
            """
            # iterate instead of recursing so that deeply nested escapes
            # (e.g. %252525...) cannot exhaust the recursion limit
            while True:
                unescaped = urllib.parse.unquote(u)
                if unescaped == u:
                    return unescaped
                u = unescaped

        def quote(s):
            """
            Method to quote a string.

            @param s string to be quoted
            @type str
            @return quoted string
            @rtype str
            """
            safeChars = '!"$&\'()*+,-./:;<=>?@[\\]^_`{|}~'
            return urllib.parse.quote(s, safe=safeChars)

        # remove surrounding white space, embedded control characters and
        # the fragment
        url = self.__url.strip()
        url = url.replace('\n', '').replace('\r', '').replace('\t', '')
        url = url.split('#', 1)[0]

        # make sure the URL starts with a scheme
        if url.startswith('//'):
            # scheme relative URL: keep the colon of the default scheme and
            # drop just its '//' (assumes the "DefaultScheme" setting ends
            # with '://' like its other uses below imply)
            url = Preferences.getWebBrowser("DefaultScheme")[:-2] + url
        if len(url.split('://')) <= 1:
            url = Preferences.getWebBrowser("DefaultScheme") + url

        url = quote(fullUnescape(url))
        urlParts = urllib.parse.urlsplit(url)
        if not urlParts[0]:
            url = Preferences.getWebBrowser("DefaultScheme") + url
            urlParts = urllib.parse.urlsplit(url)

        protocol = urlParts.scheme
        # hostname is None for URLs without a network location
        host = fullUnescape(urlParts.hostname or '')
        path = fullUnescape(urlParts.path)
        query = urlParts.query
        if not query and '?' not in url:
            # distinguish "no query" from an empty query ("...?")
            query = None

        if not path:
            path = '/'
        # normpath() drops a trailing slash; remember it to restore it but
        # do not add one to paths that never had it
        hasTrailingSlash = path[-1] == '/'
        path = posixpath.normpath(path).replace('//', '/')
        if hasTrailingSlash and path[-1] != '/':
            path += '/'

        port = urlParts.port

        host = host.strip('.')
        host = re.sub(r'\.+', '.', host).lower()
        # convert a plain decimal or hexadecimal number to dotted quad
        # notation; leave the host unchanged if that is not possible
        if host.isdigit():
            try:
                host = socket.inet_ntoa(struct.pack("!I", int(host)))
            except (ValueError, struct.error):      # secok
                pass
        if host.startswith('0x') and '.' not in host:
            try:
                host = socket.inet_ntoa(struct.pack("!I", int(host, 16)))
            except (ValueError, struct.error):      # secok
                pass

        quotedPath = quote(path)
        quotedHost = quote(host)
        if port is not None:
            quotedHost = '{0}:{1}'.format(quotedHost, port)
        canonicalUrl = '{0}://{1}{2}'.format(protocol, quotedHost, quotedPath)
        if query is not None:
            canonicalUrl = '{0}?{1}'.format(canonicalUrl, query)
        return canonicalUrl

    @staticmethod
    def permutations(url):
        """
        Static method to determine all permutations of host name and path
        which can be applied to blacklisted URLs.

        @param url URL string to be permuted
        @type str
        @return generator of permuted URL strings
        @rtype generator of str
        @exception ValueError raised to indicate an URL without a network
            location part (i.e. without '//')
        """
        def hostPermutations(host):
            """
            Method to generate the permutations of the host name.

            @param host host name
            @type str
            @return generator of permuted host names
            @rtype generator of str
            """
            # an IPv4 address is used as is; the whole host must match so
            # that names like '1.2.3.4.example.com' are handled as names
            if re.fullmatch(r'\d+\.\d+\.\d+\.\d+', host):
                yield host
                return

            parts = host.split('.')
            partsLen = min(len(parts), 5)
            if partsLen > 4:
                # the exact host name is not among the suffixes below
                yield host
            # suffixes built of the last 'partsLen' down to the last two
            # components
            for i in range(partsLen - 1):
                yield '.'.join(parts[i - partsLen:])

        def pathPermutations(path):
            """
            Method to generate the permutations of the path.

            @param path path to be processed
            @type str
            @return generator of permuted paths
            @rtype generator of str
            """
            # the exact path including the query
            yield path
            query = None
            if '?' in path:
                path, query = path.split('?', 1)
            if query is not None:
                # the exact path without the query
                yield path
            # up to four leading directories starting with the root
            pathParts = path.split('/')[0:-1]
            curPath = ''
            for i in range(min(4, len(pathParts))):
                curPath = curPath + pathParts[i] + '/'
                yield curPath

        # split '[scheme:]//[user@]host[:port][path]' without using the
        # deprecated urllib.parse.split...() helper functions
        match = re.match(r'(?:[^/:]+:)?//([^/#?]*)(.*)', url, re.DOTALL)
        if match is None:
            raise ValueError(
                "URL without network location: {0!r}".format(url))
        host, path = match.groups()
        if path and not path.startswith('/'):
            path = '/' + path
        # strip the user info
        host = host.rpartition('@')[2]
        # strip a (possibly empty) numeric port
        hostOnly, colon, port = host.rpartition(':')
        if colon and re.fullmatch(r'[0-9]*', port):
            host = hostOnly

        seenPermutations = set()
        for h in hostPermutations(host):
            for p in pathPermutations(path):
                u = '{0}{1}'.format(h, p)
                if u not in seenPermutations:
                    yield u
                    seenPermutations.add(u)

    @staticmethod
    def digest(url):
        """
        Static method to calculate the SHA256 digest of an URL string.

        @param url URL string
        @type str
        @return SHA256 digest of the URL string
        @rtype bytes
        """
        return hashlib.sha256(url.encode('utf-8')).digest()