src/eric7/WebBrowser/SafeBrowsing/SafeBrowsingUrl.py

branch
eric7
changeset 9209
b99e7fd55fd3
parent 8881
54e42bc2437a
child 9221
bf71ee032bb4
equal deleted inserted replaced
9208:3fc8dfeb6ebe 9209:b99e7fd55fd3
1 # -*- coding: utf-8 -*-
2
3 # Copyright (c) 2017 - 2022 Detlev Offenbach <detlev@die-offenbachs.de>
4 #
5
6 """
7 Module implementing an URL representation suitable for Google Safe Browsing.
8 """
9
10 import re
11 import posixpath
12 import socket
13 import struct
14 import hashlib
15 import urllib.parse
16 import contextlib
17
18 import Preferences
19
20
class SafeBrowsingUrl:
    """
    Class implementing an URL representation suitable for Google Safe Browsing.

    The embedded URL can be converted to its canonical form, expanded into
    the host suffix / path prefix expressions and hashed with SHA256.
    """
    #
    # Modeled after the URL class of the gglsbl package.
    # https://github.com/afilipovich/gglsbl
    #
    def __init__(self, url):
        """
        Constructor

        @param url URL to be embedded
        @type str
        """
        self.__url = url

    def hashes(self):
        """
        Public method to get the hashes of all possible permutations of the URL
        in canonical form.

        @yield URL hashes
        @ytype bytes
        """
        for variant in self.permutations(self.canonical()):
            yield self.digest(variant)

    def canonical(self):
        """
        Public method to convert the URL to the canonical form.

        The fragment and embedded tab, CR and LF characters are removed, the
        URL is unescaped until it is stable and re-escaped exactly once, the
        host name is lower cased and stripped of superfluous dots, purely
        numeric hosts are converted to dotted IPv4 notation and the path is
        normalized. A trailing slash of the path is preserved but never
        invented.

        @return canonical form of the URL
        @rtype str
        """
        def fullUnescape(u):
            """
            Method to repeatedly unescape an URL until it is stable.

            @param u URL string to unescape
            @type str
            @return unescaped URL string
            @rtype str
            """
            # Iterate instead of recursing: the nesting depth of the
            # percent-escapes is controlled by the (untrusted) URL and must
            # not be able to exhaust the interpreter's recursion limit.
            while True:
                uu = urllib.parse.unquote(u)
                if uu == u:
                    return uu
                u = uu

        def quote(s):
            """
            Method to quote a string.

            @param s string to be quoted
            @type str
            @return quoted string
            @rtype str
            """
            # all ASCII punctuation except '#' and '%' stays unescaped
            safeChars = '!"$&\'()*+,-./:;<=>?@[\\]^_`{|}~'
            return urllib.parse.quote(s, safe=safeChars)

        url = self.__url.strip()
        url = url.replace('\n', '').replace('\r', '').replace('\t', '')
        url = url.split('#', 1)[0]          # drop the fragment
        if url.startswith('//'):
            # scheme relative URL; the configured default scheme is assumed
            # to end with '://' of which only the ':' is needed here
            url = Preferences.getWebBrowser("DefaultScheme")[:-3] + url
        if len(url.split('://')) <= 1:
            url = Preferences.getWebBrowser("DefaultScheme") + url
        url = quote(fullUnescape(url))
        urlParts = urllib.parse.urlsplit(url)
        if not urlParts[0]:
            url = Preferences.getWebBrowser("DefaultScheme") + url
            urlParts = urllib.parse.urlsplit(url)
        protocol = urlParts.scheme
        # hostname is None for URLs without a network location
        host = fullUnescape(urlParts.hostname or '')
        path = fullUnescape(urlParts.path)
        query = urlParts.query
        if not query and '?' not in url:
            # distinguish 'no query' from an empty query ("...?")
            query = None

        if not path:
            path = '/'
        # normpath() strips a trailing slash; remember whether there was one
        # so that file paths ("/a/b.html") are not turned into directories
        hasTrailingSlash = path.endswith('/')
        path = posixpath.normpath(path).replace('//', '/')
        if hasTrailingSlash and not path.endswith('/'):
            path += '/'

        port = urlParts.port
        host = host.strip('.')
        host = re.sub(r'\.+', '.', host).lower()
        # Hosts given as a single decimal or hexadecimal number are converted
        # to dotted IPv4 notation; values that do not fit are left untouched.
        conversionErrors = (ValueError, OverflowError, OSError, struct.error)
        if host.isdigit():
            with contextlib.suppress(*conversionErrors):
                host = socket.inet_ntoa(struct.pack("!I", int(host)))
        if host.startswith('0x') and '.' not in host:
            with contextlib.suppress(*conversionErrors):
                host = socket.inet_ntoa(struct.pack("!I", int(host, 16)))

        quotedPath = quote(path)
        quotedHost = quote(host)
        if port is not None:
            quotedHost = '{0}:{1}'.format(quotedHost, port)
        canonicalUrl = '{0}://{1}{2}'.format(protocol, quotedHost, quotedPath)
        if query is not None:
            canonicalUrl = '{0}?{1}'.format(canonicalUrl, query)
        return canonicalUrl

    @staticmethod
    def permutations(url):
        """
        Static method to determine all permutations of host name and path
        which can be applied to blacklisted URLs.

        Each permutation is a host suffix concatenated with a path prefix;
        scheme, user info and port are not part of the result. Every
        permutation is reported once.

        @param url URL string to be permuted
        @type str
        @yield permutated URL strings
        @ytype str
        """
        def hostPermutations(host):
            """
            Method to generate the permutations of the host name.

            The exact host name is always reported first. For non IP
            addresses it is followed by up to four names built from the last
            five components by successively removing the leading one (the
            top level domain alone is never reported).

            @param host host name
            @type str
            @yield permutated host names
            @ytype str
            """
            if not host:
                return
            yield host
            # fullmatch() ensures that names merely starting like an IP
            # address (e.g. 1.2.3.4.example.com) are treated as host names
            if re.fullmatch(r'\d+\.\d+\.\d+\.\d+', host):
                return
            parts = host.split('.')
            partsLen = min(len(parts), 5)
            for i in range(partsLen - 1):
                candidate = '.'.join(parts[i - partsLen:])
                if candidate != host:
                    yield candidate

        def pathPermutations(path):
            """
            Method to generate the permutations of the path.

            @param path path to be processed (may include a query string)
            @type str
            @yield permutated paths
            @ytype str
            """
            yield path
            if '?' in path:
                path = path.split('?', 1)[0]
                yield path
            # up to four directory prefixes starting at the root
            pathParts = path.split('/')[0:-1]
            curPath = ''
            for part in pathParts[:4]:
                curPath = curPath + part + '/'
                yield curPath

        # urlsplit() replaces the deprecated splittype(), splithost(),
        # splituser() and splitport() helper functions
        urlParts = urllib.parse.urlsplit(url)
        host = urlParts.hostname or ''
        path = urlParts.path or '/'
        if urlParts.query or '?' in url.split('#', 1)[0]:
            # keep the query (even an empty one) as part of the full path
            path = '{0}?{1}'.format(path, urlParts.query)

        seenPermutations = set()
        for h in hostPermutations(host):
            for p in pathPermutations(path):
                u = '{0}{1}'.format(h, p)
                if u not in seenPermutations:
                    yield u
                    seenPermutations.add(u)

    @staticmethod
    def digest(url):
        """
        Static method to calculate the SHA256 digest of an URL string.

        @param url URL string
        @type str
        @return SHA256 digest of the UTF-8 encoded URL string
        @rtype bytes
        """
        return hashlib.sha256(url.encode('utf-8')).digest()

eric ide

mercurial