WebBrowser/SafeBrowsing/SafeBrowsingUrl.py

branch
safe_browsing
changeset 5808
7bf90dcae4e1
child 5809
5b53c17b7d93
equal deleted inserted replaced
5807:d2eb934fa6b4 5808:7bf90dcae4e1
1 # -*- coding: utf-8 -*-
2
3 # Copyright (c) 2017 Detlev Offenbach <detlev@die-offenbachs.de>
4 #
5
6 """
7 Module implementing a URL representation suitable for Google Safe Browsing.
8 """
9
10 from __future__ import unicode_literals
11
12 try:
13 import urlparse # Py2
14 import urllib # Py2
15 except ImportError:
16 import urllib.parse as urllib
17 from urllib import parse as urlparse
18
19 import re
20 import posixpath
21 import socket
22 import struct
23
24 import Preferences
25
26
class SafeBrowsingUrl(object):
    """
    Class implementing a URL representation suitable for Google Safe Browsing.
    """
    #
    # Modeled after the URL class of the gglsbl package.
    # https://github.com/afilipovich/gglsbl
    #
    def __init__(self, url):
        """
        Constructor

        @param url URL to be embedded
        @type str
        """
        self.__url = url

    def hashes(self):
        """
        Public method to get the hashes of all possible permutations of the URL
        in canonical form.

        Note: this relies on the permutations() and digest() methods of this
        class; each yielded element is whatever digest() returns.

        @return generator for the URL hashes
        @rtype generator
        """
        for variant in self.permutations(self.canonical()):
            urlHash = self.digest(variant)
            yield urlHash

    def canonical(self):
        """
        Public method to convert the URL to the canonical form.

        The conversion strips surrounding whitespace, embedded CR/LF/TAB
        characters and the fragment, adds the configured default scheme if
        none is given, repeatedly percent-unescapes the URL and re-escapes it
        once, normalizes the path (resolving '.' and '..' and collapsing
        duplicate slashes while keeping a trailing slash), lower-cases the
        host, removes leading, trailing and consecutive dots from it and
        converts purely decimal or hexadecimal hosts to dotted IPv4 notation.

        @return canonical form of the URL
        @rtype str
        """
        def fullUnescape(u):
            """
            Method to repeatedly unescape an URL string until it does not
            change anymore.

            @param u URL string to unescape
            @type str
            @return unescaped URL string
            @rtype str
            """
            # iterate instead of recursing, so that deeply nested escaping
            # (e.g. '%25252525...') cannot exhaust the recursion limit
            while True:
                uu = urllib.unquote(u)
                if uu == u:
                    return uu
                u = uu

        def quote(s):
            """
            Method to quote a string.

            @param s string to be quoted
            @type str
            @return quoted string
            @rtype str
            """
            safeChars = '!"$&\'()*+,-./:;<=>?@[\\]^_`{|}~'
            return urllib.quote(s, safe=safeChars)

        url = self.__url.strip()
        url = url.replace('\n', '').replace('\r', '').replace('\t', '')
        # drop the fragment
        url = url.split('#', 1)[0]
        if url.startswith('//'):
            # Protocol-relative URL: prepend '<scheme>:' only. The default
            # scheme preference is prepended verbatim to scheme-less URLs
            # below, i.e. it ends with '://'; only the slashes must be
            # removed here because the URL brings its own '//'.
            url = Preferences.getWebBrowser("DefaultScheme").rstrip('/') + url
        if '://' not in url:
            url = Preferences.getWebBrowser("DefaultScheme") + url
        url = quote(fullUnescape(url))
        urlParts = urlparse.urlsplit(url)
        if not urlParts[0]:
            url = Preferences.getWebBrowser("DefaultScheme") + url
            urlParts = urlparse.urlsplit(url)
        protocol = urlParts.scheme
        # hostname is None for URLs without a network location
        # (e.g. 'http:///path'); treat that as an empty host
        host = fullUnescape(urlParts.hostname or '')
        path = fullUnescape(urlParts.path)
        query = urlParts.query
        if not query and '?' not in url:
            # distinguish 'no query at all' from an empty query ('...?')
            query = None
        if not path:
            path = '/'
        hasTrailingSlash = (path[-1] == '/')
        # normpath() keeps exactly two leading slashes, hence the replace()
        path = posixpath.normpath(path).replace('//', '/')
        if hasTrailingSlash and path[-1] != '/':
            path += '/'
        port = urlParts.port
        host = host.strip('.')
        host = re.sub(r'\.+', '.', host).lower()
        if host.isdigit():
            # host given as a single decimal number => dotted IPv4 address
            try:
                host = socket.inet_ntoa(struct.pack("!I", int(host)))
            except (ValueError, OverflowError, struct.error):
                # not convertible to a 32 bit address; keep the host as is
                pass
        if host.startswith('0x') and '.' not in host:
            # host given as a single hexadecimal number => dotted IPv4 address
            try:
                host = socket.inet_ntoa(struct.pack("!I", int(host, 16)))
            except (ValueError, OverflowError, struct.error):
                # not convertible to a 32 bit address; keep the host as is
                pass
        quotedPath = quote(path)
        quotedHost = quote(host)
        if port is not None:
            quotedHost = '{0}:{1}'.format(quotedHost, port)
        canonicalUrl = '{0}://{1}{2}'.format(protocol, quotedHost, quotedPath)
        if query is not None:
            canonicalUrl = '{0}?{1}'.format(canonicalUrl, query)
        return canonicalUrl

eric ide

mercurial