|
1 # -*- coding: utf-8 -*- |
|
2 |
|
3 # Copyright (c) 2017 Detlev Offenbach <detlev@die-offenbachs.de> |
|
4 # |
|
5 |
|
6 """ |
|
7 Module implementing an URL representation suitable for Google Safe Browsing. |
|
8 """ |
|
9 |
|
10 from __future__ import unicode_literals |
|
11 |
|
12 try: |
|
13 import urlparse # Py2 |
|
14 import urllib # Py2 |
|
15 except ImportError: |
|
16 import urllib.parse as urllib |
|
17 from urllib import parse as urlparse |
|
18 |
|
19 import re |
|
20 import posixpath |
|
21 import socket |
|
22 import struct |
|
23 |
|
24 import Preferences |
|
25 |
|
26 |
|
27 class SafeBrowsingUrl(object): |
|
28 """ |
|
29 Class implementing an URL representation suitable for Google Safe Browsing. |
|
30 """ |
|
31 # |
|
32 # Modeled after the URL class of the gglsbl package. |
|
33 # https://github.com/afilipovich/gglsbl |
|
34 # |
|
def __init__(self, url):
    """
    Constructor

    The URL is stored exactly as given; no validation or conversion is
    done at construction time.

    @param url URL to be embedded
    @type str
    """
    self.__url = url
|
43 |
|
def hashes(self):
    """
    Public method to generate the hashes for all permutations of the
    canonical form of the URL.

    The canonical URL is determined once; each of its permutations is
    then digested and yielded lazily, one at a time.

    @return generator for the URL hashes
    @rtype generator of str
    """
    canonicalUrl = self.canonical()
    for candidate in self.permutations(canonicalUrl):
        yield self.digest(candidate)
|
55 |
|
def canonical(self):
    """
    Public method to convert the URL to the canonical form.

    The conversion performs these steps:
    <ul>
    <li>strip surrounding whitespace, remove embedded tab, CR and LF
        characters and cut off the fragment</li>
    <li>complete URLs lacking a scheme with the configured default
        scheme</li>
    <li>percent-unescape repeatedly until the URL is stable and
        percent-escape it again exactly once</li>
    <li>lower-case the host, remove leading, trailing and consecutive
        dots and rewrite a host given as a single decimal or hexadecimal
        number as a dotted IPv4 address</li>
    <li>normalize the path ('.', '..' and duplicate slashes) while
        keeping a trailing slash</li>
    </ul>

    @return canonical form of the URL
    @rtype str
    """
    def fullUnescape(u):
        """
        Method to repeatedly unescape an URL until it no longer changes.

        This is done iteratively because the number of encoding layers is
        controlled by the (untrusted) URL; a recursive implementation
        could exceed the interpreter's recursion limit.

        @param u URL string to unescape
        @type str
        @return unescaped URL string
        @rtype str
        """
        while True:
            uu = urllib.unquote(u)
            if uu == u:
                return uu
            u = uu

    def quote(s):
        """
        Method to quote a string.

        @param s string to be quoted
        @type str
        @return quoted string
        @rtype str
        """
        safeChars = '!"$&\'()*+,-./:;<=>?@[\\]^_`{|}~'
        return urllib.quote(s, safe=safeChars)

    url = self.__url.strip()
    url = url.replace('\n', '').replace('\r', '').replace('\t', '')
    url = url.split('#', 1)[0]
    if url.startswith('//'):
        # Protocol-relative URL: it already brings the '//' itself, so
        # only the trailing '//' of the default scheme (e.g. 'https://')
        # is removed; the colon must be kept to get 'https://host'.
        url = Preferences.getWebBrowser("DefaultScheme")[:-2] + url
    if '://' not in url:
        url = Preferences.getWebBrowser("DefaultScheme") + url
    url = quote(fullUnescape(url))
    urlParts = urlparse.urlsplit(url)
    if not urlParts.scheme:
        url = Preferences.getWebBrowser("DefaultScheme") + url
        urlParts = urlparse.urlsplit(url)
    protocol = urlParts.scheme
    # hostname is None for URLs without a network location
    # (e.g. 'file:///tmp/x'); treat that as an empty host
    host = fullUnescape(urlParts.hostname or '')
    path = fullUnescape(urlParts.path)
    query = urlParts.query
    if not query and '?' not in url:
        # distinguish 'no query at all' from an empty query ('...?')
        query = None
    if not path:
        path = '/'
    hasTrailingSlash = path.endswith('/')
    # normpath() keeps exactly two leading slashes, hence the replace()
    path = posixpath.normpath(path).replace('//', '/')
    if hasTrailingSlash and not path.endswith('/'):
        path += '/'
    try:
        port = urlParts.port
    except ValueError:
        # non-numeric or out-of-range port: treat it as not given
        port = None
    host = host.strip('.')
    host = re.sub(r'\.+', '.', host).lower()
    if host.isdigit():
        # host given as a single decimal number: convert to dotted IPv4
        try:
            host = socket.inet_ntoa(struct.pack("!I", int(host)))
        except (ValueError, OverflowError, struct.error):
            # not a valid 32 bit number; keep the host as it is
            pass
    if host.startswith('0x') and '.' not in host:
        # host given as a single hexadecimal number: convert to dotted IPv4
        try:
            host = socket.inet_ntoa(struct.pack("!I", int(host, 16)))
        except (ValueError, OverflowError, struct.error):
            # not a valid 32 bit number; keep the host as it is
            pass
    quotedPath = quote(path)
    quotedHost = quote(host)
    if port is not None:
        quotedHost = '{0}:{1}'.format(quotedHost, port)
    canonicalUrl = '{0}://{1}{2}'.format(protocol, quotedHost, quotedPath)
    if query is not None:
        canonicalUrl = '{0}?{1}'.format(canonicalUrl, query)
    return canonicalUrl