WebBrowser/SafeBrowsing/SafeBrowsingUrl.py

branch
safe_browsing
changeset 5809
5b53c17b7d93
parent 5808
7bf90dcae4e1
child 5811
5358a3c7995f
equal deleted inserted replaced
5808:7bf90dcae4e1 5809:5b53c17b7d93
18 18
19 import re 19 import re
20 import posixpath 20 import posixpath
21 import socket 21 import socket
22 import struct 22 import struct
23 import hashlib
23 24
24 import Preferences 25 import Preferences
25 26
26 27
27 class SafeBrowsingUrl(object): 28 class SafeBrowsingUrl(object):
45 """ 46 """
46 Public method to get the hashes of all possible permutations of the URL 47 Public method to get the hashes of all possible permutations of the URL
47 in canonical form. 48 in canonical form.
48 49
49 @return generator for the URL hashes 50 @return generator for the URL hashes
50 @rtype generator of str 51 @rtype generator of str (Python2) or bytes (Python3)
51 """ 52 """
52 for variant in self.permutations(self.canonical()): 53 for variant in self.permutations(self.canonical()):
53 urlHash = self.digest(variant) 54 urlHash = self.digest(variant)
54 yield urlHash 55 yield urlHash
55 56
130 quotedHost = '{0}:{1}'.format(quotedHost, port) 131 quotedHost = '{0}:{1}'.format(quotedHost, port)
131 canonicalUrl = '{0}://{1}{2}'.format(protocol, quotedHost, quotedPath) 132 canonicalUrl = '{0}://{1}{2}'.format(protocol, quotedHost, quotedPath)
132 if query is not None: 133 if query is not None:
133 canonicalUrl = '{0}?{1}'.format(canonicalUrl, query) 134 canonicalUrl = '{0}?{1}'.format(canonicalUrl, query)
134 return canonicalUrl 135 return canonicalUrl
136
@staticmethod
def permutations(url):
    """
    Static method to determine all permutations of host name and path
    which can be applied to blacklisted URLs.

    Each permutation is the concatenation of one host variant and one
    path variant (without protocol, user info or port). Duplicates are
    suppressed, the first occurrence wins.

    @param url URL string to be permuted
    @type str
    @return generator of permuted URL strings
    @rtype generator of str
    """
    def hostPermutations(host):
        """
        Method to generate the permutations of the host name.

        The exact host name is always generated first. For non-IP hosts
        it is followed by up to four suffixes built from the last five
        components by successively removing the leading one (the bare
        top level domain is never generated).

        @param host host name
        @type str
        @return generator of permuted host names
        @rtype generator of str
        """
        if not host:
            return

        # The exact host is always a candidate; this also covers
        # single-label hosts, which have no suffix permutations.
        yield host

        # Anchor the pattern at both ends: a name like
        # '1.2.3.4.example.com' merely starts with four numeric labels
        # and must still get its suffix permutations.
        if re.match(r'\d+\.\d+\.\d+\.\d+\Z', host):
            return

        parts = host.split('.')
        count = min(len(parts), 5)
        for i in range(count - 1):
            candidate = '.'.join(parts[i - count:])
            if candidate != host:
                yield candidate

    def pathPermutations(path):
        """
        Method to generate the permutations of the path.

        Generated are the path as given, the path without its query
        string (if it has one) and up to four leading directory
        prefixes starting at the root.

        @param path path to be processed
        @type str
        @return generator of permuted paths
        @rtype generator of str
        """
        yield path
        if '?' in path:
            path = path.split('?', 1)[0]
            yield path
        # drop the last component (file name or empty string after a
        # trailing slash); only directory prefixes are of interest
        pathParts = path.split('/')[0:-1]
        curPath = ''
        for part in pathParts[:4]:
            curPath = curPath + part + '/'
            yield curPath

    # NOTE(review): 'urllib' is bound outside of this block (import not
    # visible here); the split*() helpers are deprecated in Python 3.8+.
    protocol, addressStr = urllib.splittype(url)
    host, path = urllib.splithost(addressStr)
    user, host = urllib.splituser(host)
    host, port = urllib.splitport(host)
    host = host.strip('/')

    seenPermutations = set()
    for h in hostPermutations(host):
        for p in pathPermutations(path):
            u = '{0}{1}'.format(h, p)
            if u not in seenPermutations:
                seenPermutations.add(u)
                yield u
200
@staticmethod
def digest(url):
    """
    Static method to calculate the SHA256 digest of an URL string.

    The string is encoded as UTF-8 before it is hashed; the raw (binary)
    digest is returned, not its hexadecimal representation.

    @param url URL string
    @type str
    @return SHA256 digest of the URL string
    @rtype str (Python2) or bytes (Python3)
    """
    hasher = hashlib.sha256()
    encodedUrl = url.encode('utf-8')
    hasher.update(encodedUrl)
    return hasher.digest()

eric ide

mercurial