|
1 # -*- coding: utf-8 -*- |
|
2 |
|
3 # Copyright (c) 2016 Detlev Offenbach <detlev@die-offenbachs.de> |
|
4 # |
|
5 |
|
6 """ |
|
7 Module implementing the TLD Extractor. |
|
8 """ |
|
9 |
|
10 # |
|
11 # This is a Python port of the TLDExtractor of Qupzilla |
|
12 # Copyright (C) 2014 Razi Alavizadeh <s.r.alavizadeh@gmail.com> |
|
13 # |
|
14 |
|
15 from __future__ import unicode_literals |
|
16 |
|
17 import collections |
|
18 |
|
19 from PyQt5.QtCore import QObject, QUrl, QFile, QFileInfo, QRegExp, qWarning |
|
20 |
|
21 from E5Gui import E5MessageBox |
|
22 |
|
23 from .data import tld_rc # __IGNORE_WARNING__ |
|
24 |
|
25 |
|
class E5TldHostParts(object):
    """
    Class acting as a simple container for the parts of a host name.

    Instances are filled by E5TldExtractor.splitParts(); all attributes are
    plain strings.
    """
    def __init__(self):
        """
        Constructor

        Creates the container with every part set to the empty string.
        """
        # host: the host name as given by the caller
        # tld: the top level domain (public suffix) of the host
        # domain: the label directly in front of the TLD
        self.host = self.tld = self.domain = ""
        # registrableDomain: domain and TLD joined by a dot
        # subdomain: everything in front of the registrable domain
        self.registrableDomain = self.subdomain = ""
|
39 |
|
40 |
|
class E5TldExtractor(QObject):
    """
    Class implementing the TLD Extractor.

    The extractor splits host names into top level domain, domain,
    registrable domain and subdomain using the rules of a public suffix
    list data file ('effective_tld_names.dat'). The data file is searched
    in the configured search paths and loaded lazily on first use.

    Note: The module function instance() should be used to get a reference
    to a global object to avoid overhead.
    """
    def __init__(self, withPrivate=False, parent=None):
        """
        Constructor

        @param withPrivate flag indicating to load private TLDs as well
        @type bool
        @param parent reference to the parent object
        @type QObject
        """
        super(E5TldExtractor, self).__init__(parent)

        self.__withPrivate = withPrivate
        self.__dataFileName = ""
        self.__dataSearchPaths = []

        self.__tldDict = collections.defaultdict(list)
        # dict with list of str as values; the key is the last label of
        # a rule and the value the list of all rules ending in that label

        self.setDataSearchPaths()

    def isDataLoaded(self):
        """
        Public method to check, if the TLD data is already loaded.

        @return flag indicating data is loaded
        @rtype bool
        """
        return bool(self.__tldDict)

    def tld(self, host):
        """
        Public method to get the top level domain for a host.

        The TLD data is loaded on demand. If the last label of the host is
        not covered by any rule, that label is returned as is.

        @param host host name to get TLD for
        @type str
        @return TLD for host
        @rtype str
        """
        if not host or host.startswith("."):
            return ""

        cleanHost = self.__normalizedHost(host)

        tldPart = cleanHost[cleanHost.rfind(".") + 1:]
        cleanHost = self.__toAce(cleanHost)

        self.__loadData()

        if tldPart not in self.__tldDict:
            return tldPart

        # work on a copy in order to not modify the loaded rules
        tldRules = self.__tldDict[tldPart][:]
        if tldPart not in tldRules:
            tldRules.append(tldPart)

        maxLabelCount = 0
        # A leading dot on both sides makes 'endswith' match whole labels
        # only; this is invariant for all rules.
        testUrl = "." + cleanHost

        for rule in tldRules:
            labelCount = rule.count(".") + 1

            if rule.startswith("!"):
                rule = self.__toAce(rule[1:])

                # An exception rule overrides everything else; the TLD is
                # the rule without its leftmost label. It is matched on a
                # label boundary, so that e.g. 'foocity.kobe.jp' is not
                # taken for a match of '!city.kobe.jp'.
                if testUrl.endswith("." + rule):
                    tldPart = rule[rule.find(".") + 1:]
                    break

                # a non matching exception rule cannot match as a normal
                # rule either
                continue

            isWildcardTLD = rule.startswith("*")
            if isWildcardTLD:
                rule = rule[1:]
                if rule.startswith("."):
                    rule = rule[1:]

            rule = self.__toAce(rule)

            # the matching rule with the most labels wins
            if labelCount > maxLabelCount and testUrl.endswith("." + rule):
                tldPart = rule
                maxLabelCount = labelCount

                if isWildcardTLD:
                    # the wildcard stands for the label directly in front
                    # of the rule; it becomes part of the TLD, if the host
                    # has such a label
                    temp = cleanHost[:cleanHost.rfind(tldPart)]
                    if temp.endswith("."):
                        temp = temp[:-1]
                    temp = temp[temp.rfind(".") + 1:]

                    if temp:
                        tldPart = temp + "." + rule

        # The rules were matched in ACE form; return the same number of
        # trailing labels taken from the normalized host instead.
        temp = self.__normalizedHost(host)
        tldPart = ".".join(
            temp.split(".")[temp.count(".") - tldPart.count("."):])

        return tldPart

    def domain(self, host):
        """
        Public method to get the domain for a host.

        @param host host name to get the domain for
        @type str
        @return domain for host
        @rtype str
        """
        tldPart = self.tld(host)

        return self.__domainHelper(host, tldPart)

    def registrableDomain(self, host):
        """
        Public method to get the registrable domain for a host.

        @param host host name to get the registrable domain for
        @type str
        @return registrable domain for host
        @rtype str
        """
        tldPart = self.tld(host)

        return self.__registrableDomainHelper(
            self.__domainHelper(host, tldPart), tldPart)

    def subdomain(self, host):
        """
        Public method to get the subdomain for a host.

        @param host host name to get the subdomain for
        @type str
        @return subdomain for host
        @rtype str
        """
        return self.__subdomainHelper(host, self.registrableDomain(host))

    def splitParts(self, host):
        """
        Public method to split a host address into its parts.

        @param host host address to be split
        @type str
        @return splitted host address
        @rtype E5TldHostParts
        """
        hostParts = E5TldHostParts()
        hostParts.host = host
        hostParts.tld = self.tld(host)
        hostParts.domain = self.__domainHelper(host, hostParts.tld)
        hostParts.registrableDomain = self.__registrableDomainHelper(
            hostParts.domain, hostParts.tld)
        hostParts.subdomain = self.__subdomainHelper(
            host, hostParts.registrableDomain)

        return hostParts

    def dataSearchPaths(self):
        """
        Public method to get the search paths for the TLD data file.

        @return copy of the search paths for the TLD data file
        @rtype list of str
        """
        return self.__dataSearchPaths[:]

    def setDataSearchPaths(self, searchPaths=None):
        """
        Public method to set the search paths for the TLD data file.

        The default search paths are always appended to the given ones and
        duplicate entries are removed keeping the first occurrence.

        @param searchPaths search paths for the TLD data file or None,
            if the default search paths shall be set
        @type list of str
        """
        # list() accepts any iterable (e.g. a tuple) and makes a copy
        candidates = list(searchPaths) if searchPaths else []
        candidates.extend(self.__defaultDataSearchPaths())

        # remove duplicates while preserving the order
        paths = []
        for path in candidates:
            if path not in paths:
                paths.append(path)
        self.__dataSearchPaths = paths

    def __defaultDataSearchPaths(self):
        """
        Private method to get the default search paths for the TLD data file.

        @return default search paths for the TLD data file
        @rtype list of str
        """
        return [":"]

    def getTldDownloadUrl(self):
        """
        Public method to get the TLD data file download URL.

        @return download URL
        @rtype QUrl
        """
        # NOTE(review): confirm that this download location is still served
        return QUrl(
            "http://mxr.mozilla.org/mozilla-central/source/netwerk/dns/"
            "effective_tld_names.dat?raw=1")

    @staticmethod
    def __toAce(name):
        """
        Private static method to convert a name to its ACE representation.

        @param name host name or rule to be converted
        @type str
        @return ACE encoded name
        @rtype str
        """
        return bytes(QUrl.toAce(name)).decode("utf-8")

    def __findDataFile(self, fileName):
        """
        Private method to search a file in the data search paths.

        @param fileName name of the file to search for
        @type str
        @return absolute path of the first existing file or an empty
            string, if the file was not found
        @rtype str
        """
        for path in self.__dataSearchPaths:
            candidate = QFileInfo(path + "/" + fileName).absoluteFilePath()
            if QFileInfo(candidate).exists():
                return candidate

        return ""

    def __searchPathsAsHtml(self):
        """
        Private method to format the search paths as HTML list items.

        @return search paths formatted as HTML list items
        @rtype str
        """
        return "".join(
            "<li>{0}</li>".format(p) for p in self.__dataSearchPaths)

    def __loadData(self):
        """
        Private method to load the TLD data.

        Nothing is done, if the data is loaded already. If the data file
        cannot be found, the user is informed and no data is loaded.
        """
        if self.isDataLoaded():
            return

        dataFileName = self.__findDataFile("effective_tld_names.dat")

        if not dataFileName:
            E5MessageBox.information(
                None,
                self.tr("TLD Data File not found"),
                self.tr("""<p>The file 'effective_tld_names.dat' was not"""
                        """ found!<br/>You can download it from """
                        """'<a href="{0}"><b>here</b></a>' to one of the"""
                        """ following paths:</p><ul>{1}</ul>""").format(
                    self.getTldDownloadUrl().toString(),
                    self.__searchPathsAsHtml())
            )
            return

        self.__dataFileName = dataFileName
        if not self.__parseData(dataFileName,
                                loadPrivateDomains=self.__withPrivate):
            qWarning(
                "E5TldExtractor: There are some parse errors for file: {0}"
                .format(dataFileName))

    def __parseData(self, dataFile, loadPrivateDomains=False):
        """
        Private method to parse TLD data.

        @param dataFile name of the file containing the TLD data
        @type str
        @param loadPrivateDomains flag indicating to load private domains
        @type bool
        @return flag indicating success
        @rtype bool
        """
        # start with a fresh dictionary
        self.__tldDict = collections.defaultdict(list)

        tldFile = QFile(dataFile)

        if not tldFile.open(QFile.ReadOnly | QFile.Text):
            return False

        seekToEndOfPrivateDomains = False

        try:
            while not tldFile.atEnd():
                line = bytes(tldFile.readLine()).decode("utf-8").strip()

                if line.startswith("."):
                    line = line[1:]

                # skip empty lines (including a line of just a dot, which
                # would otherwise break the split below)
                if not line:
                    continue

                if line.startswith("//"):
                    # comment lines carry the private section markers
                    if "===END PRIVATE DOMAINS===" in line:
                        seekToEndOfPrivateDomains = False

                    if (
                        not loadPrivateDomains and
                        "===BEGIN PRIVATE DOMAINS===" in line
                    ):
                        seekToEndOfPrivateDomains = True

                    continue

                if seekToEndOfPrivateDomains:
                    continue

                # only data up to the first whitespace is used
                line = line.split(None, 1)[0]

                # rules are grouped by their last label; for a rule without
                # a dot rfind() yields -1, so the key is the rule itself
                key = line[line.rfind(".") + 1:]
                self.__tldDict[key].append(line)
        finally:
            tldFile.close()

        return self.isDataLoaded()

    def __domainHelper(self, host, tldPart):
        """
        Private method to get the domain name without TLD.

        @param host host address
        @type str
        @param tldPart TLD part of the host address
        @type str
        @return domain name
        @rtype str
        """
        if not host or not tldPart:
            return ""

        temp = self.__normalizedHost(host)
        temp = temp[:temp.rfind(tldPart)]

        if temp.endswith("."):
            temp = temp[:-1]

        return temp[temp.rfind(".") + 1:]

    def __registrableDomainHelper(self, domainPart, tldPart):
        """
        Private method to get the registrable domain (i.e. domain plus TLD).

        @param domainPart domain part of a host address
        @type str
        @param tldPart TLD part of a host address
        @type str
        @return registrable domain name
        @rtype str
        """
        if not tldPart or not domainPart:
            return ""
        else:
            return "{0}.{1}".format(domainPart, tldPart)

    def __subdomainHelper(self, host, registrablePart):
        """
        Private method to get the subdomain of a host address (i.e. domain part
        without the registrable domain name).

        @param host host address
        @type str
        @param registrablePart registrable domain part of the host address
        @type str
        @return subdomain name
        @rtype str
        """
        if not host or not registrablePart:
            return ""

        subdomain = self.__normalizedHost(host)

        subdomain = subdomain[:subdomain.rfind(registrablePart)]

        if subdomain.endswith("."):
            subdomain = subdomain[:-1]

        return subdomain

    def __normalizedHost(self, host):
        """
        Private method to get the normalized host for a host address.

        @param host host address to be normalized
        @type str
        @return normalized (i.e. lower case) host address
        @rtype str
        """
        return host.lower()

    #################################################################
    ## Methods below are for testing purposes
    #################################################################

    def test(self):
        """
        Public method to execute the tests.

        The tests are run against TLD data including the private domains.
        Any cached data is discarded beforehand to guarantee this. When the
        tests are done, the configured private domains setting is restored
        and the TLD data is cleared, so that it gets reloaded for normal
        use on the next request.

        @return flag indicating the test result
        @rtype bool
        """
        savedWithPrivate = self.__withPrivate
        self.__withPrivate = True
        # drop cached data, which may have been loaded without the private
        # domains
        self.__tldDict = collections.defaultdict(list)

        try:
            self.__loadData()
            if not self.__tldDict:
                return False

            testDataFileName = self.__findDataFile("test_psl.txt")

            if not testDataFileName:
                testFileDownloadLink = \
                    "http://mxr.mozilla.org/mozilla-central/source/netwerk/test/" \
                    "unit/data/test_psl.txt?raw=1"
                E5MessageBox.information(
                    None,
                    self.tr("TLD Data File not found"),
                    self.tr("""<p>The file 'test_psl.txt' was not found!"""
                            """<br/>You can download it from '<a href="{0}">"""
                            """<b>here</b></a>' to one of the following"""
                            """ paths:</p><ul>{1}</ul>""").format(
                        testFileDownloadLink,
                        self.__searchPathsAsHtml())
                )
                return False

            testFile = QFile(testDataFileName)

            if not testFile.open(QFile.ReadOnly | QFile.Text):
                return False

            testRegExp = QRegExp(
                "checkPublicSuffix\\(('([^']+)'|null), ('([^']+)'|null)\\);")
            allTestSuccess = True

            try:
                while not testFile.atEnd():
                    line = bytes(testFile.readLine()).decode("utf-8").strip()
                    if not line or line.startswith("//"):
                        continue

                    if testRegExp.indexIn(line) == -1:
                        # a line, that is not a test statement, counts as
                        # a failure
                        allTestSuccess = False
                    else:
                        # captures 2 and 4 are empty for 'null' arguments
                        hostName = testRegExp.cap(2)
                        registrableName = testRegExp.cap(4)

                        if not self.__checkPublicSuffix(hostName,
                                                        registrableName):
                            allTestSuccess = False
            finally:
                testFile.close()

            if allTestSuccess:
                qWarning("E5TldExtractor: Test passed successfully.")
            else:
                qWarning("E5TldExtractor: Test finished with some errors!")

            return allTestSuccess
        finally:
            # restore the configured setting and reset the TLD dictionary
            self.__withPrivate = savedWithPrivate
            self.__tldDict = collections.defaultdict(list)

    def __checkPublicSuffix(self, host, registrableName):
        """
        Private method to test a host name against a registrable name.

        @param host host name to test
        @type str
        @param registrableName registrable domain name to test against
        @type str
        @return flag indicating the check result
        @rtype bool
        """
        regName = self.registrableDomain(host)
        if regName != registrableName:
            qWarning(
                "E5TldExtractor Test Error: hostName: {0}\n"
                "       Correct registrableName:    {1}\n"
                "       Calculated registrableName: {2}".format(
                    host, registrableName, regName))
            return False

        return True
|
530 |
|
531 |
|
# module wide TLD extractor object; created by the first call of instance()
_TLDExtractor = None


def instance(withPrivate=False):
    """
    Global function to get a reference to the shared TLD extractor object.

    The object is created by the first call. As an existing object is
    returned unchanged, the withPrivate parameter has an effect for that
    first call only.

    @param withPrivate flag indicating to load private TLDs as well
    @type bool
    @return reference to the TLD extractor object
    @rtype E5TldExtractor
    """
    global _TLDExtractor

    if _TLDExtractor is not None:
        return _TLDExtractor

    _TLDExtractor = E5TldExtractor(withPrivate=withPrivate)
    return _TLDExtractor