130 quotedHost = '{0}:{1}'.format(quotedHost, port) |
131 quotedHost = '{0}:{1}'.format(quotedHost, port) |
131 canonicalUrl = '{0}://{1}{2}'.format(protocol, quotedHost, quotedPath) |
132 canonicalUrl = '{0}://{1}{2}'.format(protocol, quotedHost, quotedPath) |
132 if query is not None: |
133 if query is not None: |
133 canonicalUrl = '{0}?{1}'.format(canonicalUrl, query) |
134 canonicalUrl = '{0}?{1}'.format(canonicalUrl, query) |
134 return canonicalUrl |
135 return canonicalUrl |
|
136 |
|
@staticmethod
def permutations(url):
    """
    Static method to determine all permutations of host name and path
    which can be applied to blacklisted URLs.

    Every host permutation is combined with every path permutation; each
    resulting string is yielded only once, in order of first occurrence.

    @param url URL string to be permuted
    @type str
    @return generator of permuted URL strings
    @rtype generator of str
    """
    def hostPermutations(host):
        """
        Method to generate the permutations of the host name.

        A dotted-quad IPv4 address is yielded unchanged. For any other
        host the suffixes built from at most its last five components
        are yielded, down to the last two components; a host with more
        than four components is additionally yielded in full first.

        @param host host name
        @type str
        @return generator of permuted host names
        @rtype generator of str
        """
        # The pattern is anchored at the end on purpose: without the
        # anchor a name that merely starts with four dotted numbers
        # (e.g. '1.2.3.4.example.com') would be taken for an IP address
        # and its domain suffixes would never be generated.
        if re.match(r'\d+\.\d+\.\d+\.\d+\Z', host):
            yield host
            return
        parts = host.split('.')
        count = min(len(parts), 5)
        if count > 4:
            yield host
        for i in range(count - 1):
            # negative start index: keep the trailing (count - i) parts
            yield '.'.join(parts[i - count:])

    def pathPermutations(path):
        """
        Method to generate the permutations of the path.

        Yields the path as given, the path without its query string (if
        it has one) and then up to four leading directory prefixes, each
        ending in a slash.

        @param path path to be processed
        @type str
        @return generator of permuted paths
        @rtype generator of str
        """
        yield path
        if '?' in path:
            path = path.split('?', 1)[0]
            yield path
        # drop the last segment (file name or empty string after the
        # final slash) so only directory components remain
        pathParts = path.split('/')[0:-1]
        curPath = ''
        for part in pathParts[:4]:
            curPath = curPath + part + '/'
            yield curPath

    # NOTE(review): the urllib split* helpers are deprecated in
    # Python 3.8+; kept because the binding of 'urllib' is defined
    # elsewhere in this file.
    addressStr = urllib.splittype(url)[1]
    host, path = urllib.splithost(addressStr)
    host = urllib.splituser(host)[1]
    host = urllib.splitport(host)[0]
    host = host.strip('/')
    seenPermutations = set()
    for h in hostPermutations(host):
        for p in pathPermutations(path):
            u = '{0}{1}'.format(h, p)
            if u not in seenPermutations:
                yield u
                seenPermutations.add(u)
|
200 |
|
@staticmethod
def digest(url):
    """
    Static method to compute the SHA256 hash of an URL string.

    The URL is encoded as UTF-8 before it is hashed and the raw
    (binary) digest is returned, not its hexadecimal representation.

    @param url URL string to be hashed
    @type str
    @return SHA256 digest of the URL string
    @rtype str (Python2) or bytes (Python3)
    """
    hasher = hashlib.sha256()
    hasher.update(url.encode('utf-8'))
    return hasher.digest()