36 |
37 |
37 |
38 |
38 class EricTldExtractor(QObject): |
39 class EricTldExtractor(QObject): |
39 """ |
40 """ |
40 Class implementing the TLD Extractor. |
41 Class implementing the TLD Extractor. |
41 |
42 |
42 Note: The module function instance() should be used to get a reference |
43 Note: The module function instance() should be used to get a reference |
43 to a global object to avoid overhead. |
44 to a global object to avoid overhead. |
44 """ |
45 """ |
|
46 |
def __init__(self, withPrivate=False, parent=None):
    """
    Constructor

    Sets up an empty rule table and the default data search paths. The
    TLD data file itself is not read here.

    @param withPrivate flag indicating to load private TLDs as well
    @type bool
    @param parent reference to the parent object
    @type QObject
    """
    super().__init__(parent)

    # rule table: dict with list of str as values
    self.__tldDict = collections.defaultdict(list)

    self.__dataSearchPaths = []
    self.__dataFileName = ""
    self.__withPrivate = withPrivate

    # no argument given: only the default search paths are configured
    self.setDataSearchPaths()
64 |
66 |
def isDataLoaded(self):
    """
    Public method to check, if the TLD data is already loaded.

    The data counts as loaded as soon as the rule table has at least one
    entry.

    @return flag indicating data is loaded
    @rtype bool
    """
    return len(self.__tldDict) > 0
73 |
75 |
def tld(self, host):
    """
    Public method to get the top level domain for a host.

    The candidate rules are those stored under the last label of the
    host. A rule starting with "!" that matches the host ends the search
    at once. Among the other rules the matching one with the most labels
    wins; a rule starting with "*" additionally takes in the host label
    directly in front of it.

    @param host host name to get TLD for
    @type str
    @return TLD for host (empty string for an empty host or a host
        starting with a dot)
    @rtype str
    """
    if not host or host.startswith("."):
        return ""

    normalized = self.__normalizedHost(host)

    # the last label is taken before the ACE conversion of the host
    suffix = normalized.rpartition(".")[2]
    aceHost = bytes(QUrl.toAce(normalized)).decode("utf-8")

    self.__loadData()

    if suffix not in self.__tldDict:
        # no rule stored for this last label; it is returned as is
        return suffix

    candidates = list(self.__tldDict[suffix])
    if suffix not in candidates:
        candidates.append(suffix)

    # leading dot makes the suffix test below match whole labels only
    dottedHost = "." + aceHost
    bestLabelCount = 0

    for rule in candidates:
        # counted on the raw rule, i.e. before "!" or "*." is stripped
        labelCount = rule.count(".") + 1

        if rule.startswith("!"):
            rule = bytes(QUrl.toAce(rule[1:])).decode("utf-8")

            if aceHost.endswith(rule):
                # matches with exception TLD
                suffix = rule[rule.find(".") + 1 :]
                break

        wildcard = rule.startswith("*")
        if wildcard:
            rule = rule[1:]
            if rule.startswith("."):
                rule = rule[1:]

        rule = bytes(QUrl.toAce(rule)).decode("utf-8")

        if labelCount <= bestLabelCount or not dottedHost.endswith("." + rule):
            continue

        bestLabelCount = labelCount
        suffix = rule

        if wildcard:
            # prepend the host label standing in for the "*"
            head = aceHost[: aceHost.rfind(rule)]
            if head.endswith("."):
                head = head[:-1]

            label = head.rpartition(".")[2]
            if label:
                suffix = label + "." + rule

    # return the matching labels as they appear in the normalized host
    normalized = self.__normalizedHost(host)
    firstLabel = normalized.count(".") - suffix.count(".")
    return ".".join(normalized.split(".")[firstLabel:])
154 |
155 |
def domain(self, host):
    """
    Public method to get the domain for a host.

    The TLD of the host is determined first and handed to the domain
    helper together with the host.

    @param host host name to get the domain for
    @type str
    @return domain for host
    @rtype str
    """
    return self.__domainHelper(host, self.tld(host))
167 |
168 |
def registrableDomain(self, host):
    """
    Public method to get the registrable domain for a host.

    @param host host name to get the registrable domain for
    @type str
    @return registrable domain for host
    @rtype str
    """
    suffix = self.tld(host)
    name = self.__domainHelper(host, suffix)

    return self.__registrableDomainHelper(name, suffix)
|
183 |
def subdomain(self, host):
    """
    Public method to get the subdomain for a host.

    @param host host name to get the subdomain for
    @type str
    @return subdomain for host
    @rtype str
    """
    registrable = self.registrableDomain(host)

    return self.__subdomainHelper(host, registrable)
192 |
194 |
def splitParts(self, host):
    """
    Public method to split a host address into its parts.

    The TLD is determined once and reused for the domain, the registrable
    domain and the subdomain.

    @param host host address to be split
    @type str
    @return split host address
    @rtype EricTldHostParts
    """
    hostParts = EricTldHostParts()

    tldPart = self.tld(host)
    domainPart = self.__domainHelper(host, tldPart)
    registrablePart = self.__registrableDomainHelper(domainPart, tldPart)

    hostParts.host = host
    hostParts.tld = tldPart
    hostParts.domain = domainPart
    hostParts.registrableDomain = registrablePart
    hostParts.subdomain = self.__subdomainHelper(host, registrablePart)

    return hostParts
212 |
214 |
def dataSearchPaths(self):
    """
    Public method to get the search paths for the TLD data file.

    A copy is handed out, so changing the returned list does not change
    the configured search paths.

    @return search paths for the TLD data file
    @rtype list of str
    """
    return list(self.__dataSearchPaths)
221 |
223 |
def setDataSearchPaths(self, searchPaths=None):
    """
    Public method to set the search paths for the TLD data file.

    The default search paths are always appended after the given ones.
    Duplicate entries are dropped, keeping the first occurrence.

    @param searchPaths search paths for the TLD data file or None,
        if the default search paths shall be set
    @type list of str
    """
    combined = searchPaths[:] if searchPaths else []
    combined.extend(self.__defaultDataSearchPaths())

    # remove duplicates
    unique = []
    for path in combined:
        if path not in unique:
            unique.append(path)

    self.__dataSearchPaths = unique
242 |
244 |
def __defaultDataSearchPaths(self):
    """
    Private method to get the default search paths for the TLD data file.

    The only default is the 'data' directory next to this module.

    @return default search paths for the TLD data file
    @rtype list of str
    """
    moduleDir = os.path.dirname(__file__)

    return [os.path.join(moduleDir, "data")]
251 |
253 |
def getTldDownloadUrl(self):
    """
    Public method to get the TLD data file download URL.

    @return download URL
    @rtype QUrl
    """
    downloadLink = (
        "http://mxr.mozilla.org/mozilla-central/source/netwerk/dns/"
        "effective_tld_names.dat?raw=1"
    )

    return QUrl(downloadLink)
|
265 |
def __loadData(self):
    """
    Private method to load the TLD data.

    Nothing is done, if the data is loaded already. Otherwise the file
    'effective_tld_names.dat' is looked up in the configured search paths
    and the first existing one is parsed. If no such file exists, a
    message box tells the user where to download it from and where to
    put it.
    """
    if self.isDataLoaded():
        return

    # first existing data file along the search paths, empty if none
    dataFileName = ""
    for searchPath in self.__dataSearchPaths:
        candidate = os.path.abspath(
            os.path.join(searchPath, "effective_tld_names.dat")
        )
        if os.path.exists(candidate):
            dataFileName = candidate
            break

    if not dataFileName:
        EricMessageBox.information(
            None,
            self.tr("TLD Data File not found"),
            self.tr(
                """<p>The file 'effective_tld_names.dat' was not"""
                """ found!<br/>You can download it from """
                """'<a href="{0}"><b>here</b></a>' to one of the"""
                """ following paths:</p><ul>{1}</ul>"""
            ).format(
                # single source for the link instead of a second literal
                self.getTldDownloadUrl().toString(),
                "".join("<li>{0}</li>".format(p) for p in self.__dataSearchPaths),
            ),
        )
        return

    self.__dataFileName = dataFileName
    if not self.__parseData(dataFileName, loadPrivateDomains=self.__withPrivate):
        qWarning(
            "EricTldExtractor: There are some parse errors for file: {0}".format(
                dataFileName
            )
        )
|
311 |
def __parseData(self, dataFile, loadPrivateDomains=False):
    """
    Private method to parse TLD data.

    Each rule is stored in the rule table under its last label. Lines
    starting with "//" are comments; the comment lines containing
    "===BEGIN PRIVATE DOMAINS===" and "===END PRIVATE DOMAINS===" delimit
    the section that is skipped unless loadPrivateDomains is set. Blank
    lines and lines without any rule text are ignored.

    @param dataFile name of the file containing the TLD data
    @type str
    @param loadPrivateDomains flag indicating to load private domains
    @type bool
    @return flag indicating success (False, if the file could not be read
        or contained no rule)
    @rtype bool
    """
    # start with a fresh dictionary
    self.__tldDict = collections.defaultdict(list)

    skipPrivateSection = False

    try:
        with open(dataFile, "r", encoding="utf-8") as f:
            for rawLine in f:
                # lines keep their newline, so strip before testing them
                line = rawLine.strip()

                if line.startswith("."):
                    line = line[1:]

                if line.startswith("//"):
                    if "===END PRIVATE DOMAINS===" in line:
                        skipPrivateSection = False

                    if (
                        not loadPrivateDomains
                        and "===BEGIN PRIVATE DOMAINS===" in line
                    ):
                        skipPrivateSection = True

                    continue

                if skipPrivateSection:
                    continue

                # only data up to the first whitespace is used
                fields = line.split(None, 1)
                if not fields:
                    # blank line or a lone "."; indexing would raise
                    continue

                rule = fields[0]

                # a rule without a dot is its own key
                self.__tldDict[rule.rpartition(".")[2]].append(rule)

        return self.isDataLoaded()
    except OSError:
        return False
358 |
364 |
def __domainHelper(self, host, tldPart):
    """
    Private method to get the domain name without TLD.

    The result is the label of the normalized host that stands directly
    in front of the last occurrence of the TLD part.

    @param host host address
    @type str
    @param tldPart TLD part of the host address
    @type str
    @return domain name (empty string, if host or TLD part is empty)
    @rtype str
    """
    if not (host and tldPart):
        return ""

    normalized = self.__normalizedHost(host)

    # everything in front of the TLD, without the separating dot
    prefix = normalized[: normalized.rfind(tldPart)]
    if prefix.endswith("."):
        prefix = prefix[:-1]

    return prefix.rpartition(".")[2]
380 |
386 |
381 def __registrableDomainHelper(self, domainPart, tldPart): |
387 def __registrableDomainHelper(self, domainPart, tldPart): |
382 """ |
388 """ |
383 Private method to get the registrable domain (i.e. domain plus TLD). |
389 Private method to get the registrable domain (i.e. domain plus TLD). |
384 |
390 |
385 @param domainPart domain part of a host address |
391 @param domainPart domain part of a host address |
386 @type str |
392 @type str |
387 @param tldPart TLD part of a host address |
393 @param tldPart TLD part of a host address |
388 @type str |
394 @type str |
389 @return registrable domain name |
395 @return registrable domain name |