eric6/WebBrowser/SafeBrowsing/SafeBrowsingUrl.py

changeset 6942
2602857055c5
parent 6645
ad476851d7e0
child 7192
a22eee00b052
equal deleted inserted replaced
6941:f99d60d6b59b 6942:2602857055c5
1 # -*- coding: utf-8 -*-
2
3 # Copyright (c) 2017 - 2019 Detlev Offenbach <detlev@die-offenbachs.de>
4 #
5
6 """
7 Module implementing an URL representation suitable for Google Safe Browsing.
8 """
9
10 from __future__ import unicode_literals
11
12 try:
13 import urlparse # Py2
14 import urllib # Py2
15 except ImportError:
16 import urllib.parse as urllib
17 from urllib import parse as urlparse
18
19 import re
20 import posixpath
21 import socket
22 import struct
23 import hashlib
24
25 import Preferences
26
27
class SafeBrowsingUrl(object):
    """
    Class implementing a URL representation suitable for Google Safe Browsing.

    An instance wraps a raw URL string and offers its canonical form, the
    host/path permutations derived from that canonical form and the SHA256
    digests of these permutations.
    """
    #
    # Modeled after the URL class of the gglsbl package.
    # https://github.com/afilipovich/gglsbl
    #
    def __init__(self, url):
        """
        Constructor

        @param url URL to be embedded
        @type str
        """
        self.__url = url

    def hashes(self):
        """
        Public method to get the hashes of all possible permutations of the URL
        in canonical form.

        @return generator for the URL hashes
        @rtype generator of bytes
        """
        for variant in self.permutations(self.canonical()):
            yield self.digest(variant)

    def canonical(self):
        """
        Public method to convert the URL to the canonical form.

        The URL is stripped of whitespace, control characters and its
        fragment, gets a scheme if it has none, is repeatedly unescaped and
        escaped exactly once again. The path is normalized (a trailing slash
        is kept only if the original path had one) and the host is lower
        cased with integer forms converted to dotted quad notation.

        @return canonical form of the URL
        @rtype str
        """
        def fullUnescape(u):
            """
            Method to repeatedly unescape an URL until it does not change
            anymore.

            This is done iteratively because every pass only removes one
            escaping level and the input is not under our control (a deeply
            nested escape sequence must not exhaust the recursion limit).

            @param u URL string to unescape
            @type str
            @return unescaped URL string
            @rtype str
            """
            uu = urllib.unquote(u)
            while uu != u:
                u = uu
                uu = urllib.unquote(u)
            return uu

        def quote(s):
            """
            Method to quote a string.

            @param s string to be quoted
            @type str
            @return quoted string
            @rtype str
            """
            safeChars = '!"$&\'()*+,-./:;<=>?@[\\]^_`{|}~'
            return urllib.quote(s, safe=safeChars)

        url = self.__url.strip()
        url = url.replace('\n', '').replace('\r', '').replace('\t', '')
        # the fragment is not part of the canonical form
        url = url.split('#', 1)[0]
        if url.startswith('//'):
            # Scheme relative URL: the default scheme is used as a complete
            # 'scheme://' prefix below, so only the trailing '//' must be
            # removed here in order to keep the colon.
            url = Preferences.getWebBrowser("DefaultScheme")[:-2] + url
        if len(url.split('://')) <= 1:
            url = Preferences.getWebBrowser("DefaultScheme") + url
        url = quote(fullUnescape(url))
        urlParts = urlparse.urlsplit(url)
        if not urlParts[0]:
            url = Preferences.getWebBrowser("DefaultScheme") + url
            urlParts = urlparse.urlsplit(url)
        protocol = urlParts.scheme
        # hostname is None for URLs without a network location
        host = fullUnescape(urlParts.hostname or '')
        path = fullUnescape(urlParts.path)
        query = urlParts.query
        if not query and '?' not in url:
            # distinguish 'no query' from 'empty query' (i.e. trailing '?')
            query = None
        if not path:
            path = '/'
        # normpath() drops a trailing slash; restore it only if the original
        # path had one, otherwise 'host/page' would turn into 'host/page/'
        hasTrailingSlash = path.endswith('/')
        path = posixpath.normpath(path).replace('//', '/')
        if hasTrailingSlash and not path.endswith('/'):
            path += '/'
        port = urlParts.port
        host = host.strip('.')
        host = re.sub(r'\.+', '.', host).lower()
        if host.isdigit():
            # host given as a decimal integer
            try:
                host = socket.inet_ntoa(struct.pack("!I", int(host)))
            except (ValueError, OverflowError, struct.error):
                # not representable as an IPv4 address; keep the host as is
                pass
        if host.startswith('0x') and '.' not in host:
            # host given as a hexadecimal integer
            try:
                host = socket.inet_ntoa(struct.pack("!I", int(host, 16)))
            except (ValueError, OverflowError, struct.error):
                # not representable as an IPv4 address; keep the host as is
                pass
        quotedPath = quote(path)
        quotedHost = quote(host)
        if port is not None:
            quotedHost = '{0}:{1}'.format(quotedHost, port)
        canonicalUrl = '{0}://{1}{2}'.format(protocol, quotedHost, quotedPath)
        if query is not None:
            canonicalUrl = '{0}?{1}'.format(canonicalUrl, query)
        return canonicalUrl

    @staticmethod
    def permutations(url):
        """
        Static method to determine all permutations of host name and path
        which can be applied to blacklisted URLs.

        Each permutation is the concatenation of a host variant and a path
        variant (without scheme, user info and port). Duplicates are
        suppressed.

        @param url URL string to be permuted
        @type str
        @return generator of permuted URL strings
        @rtype generator of str
        """
        def hostPermutations(host):
            """
            Method to generate the permutations of the host name.

            The exact host name is always generated first. For a dotted quad
            IP address this is the only variant. Otherwise up to four
            suffixes built from the last five host name components follow
            (the top level domain alone is never generated).

            @param host host name
            @type str
            @return generator of permuted host names
            @rtype generator of str
            """
            yield host
            # the pattern must cover the complete host; otherwise a name
            # like '1.2.3.4.example.com' would be treated as an IP address
            if re.match(r'^\d+\.\d+\.\d+\.\d+$', host):
                return
            lastParts = host.split('.')[-5:]
            for i in range(len(lastParts) - 1):
                candidate = '.'.join(lastParts[i:])
                if candidate != host:
                    yield candidate

        def pathPermutations(path):
            """
            Method to generate the permutations of the path.

            Generated are the path including the query, the path without the
            query (if there is one) and up to four leading directories
            starting at the root.

            @param path path to be processed
            @type str
            @return generator of permuted paths
            @rtype generator of str
            """
            yield path
            if '?' in path:
                path = path.split('?', 1)[0]
                yield path
            # the last element is the file name (or empty); skip it
            pathParts = path.split('/')[:-1]
            curPath = ''
            for part in pathParts[:4]:
                curPath = curPath + part + '/'
                yield curPath

        # The split*() helpers are kept because they are available under
        # Python 2 and Python 3 (deprecated there in favour of urlsplit()).
        protocol, addressStr = urllib.splittype(url)
        host, path = urllib.splithost(addressStr)
        user, host = urllib.splituser(host)
        host, port = urllib.splitport(host)
        host = host.strip('/')
        seenPermutations = set()
        for h in hostPermutations(host):
            for p in pathPermutations(path):
                u = '{0}{1}'.format(h, p)
                if u not in seenPermutations:
                    seenPermutations.add(u)
                    yield u

    @staticmethod
    def digest(url):
        """
        Static method to calculate the SHA256 digest of an URL string.

        @param url URL string
        @type str
        @return SHA256 digest of the URL string
        @rtype bytes
        """
        return hashlib.sha256(url.encode('utf-8')).digest()

eric ide

mercurial