|
# -*- coding: utf-8 -*-

# Copyright (c) 2016 - 2021 Detlev Offenbach <detlev@die-offenbachs.de>
#

"""
Module implementing the TLD Extractor.
"""

#
# This is a Python port of the TLDExtractor of Qupzilla
# Copyright (C) 2014 Razi Alavizadeh <s.r.alavizadeh@gmail.com>
#
|
14 |
|
15 import collections |
|
16 import os |
|
17 import re |
|
18 |
|
19 from PyQt6.QtCore import QObject, QUrl, QFile, QFileInfo, qWarning |
|
20 |
|
21 from E5Gui import E5MessageBox |
|
22 |
|
23 |
|
class EricTldHostParts:
    """
    Class implementing the host parts helper.

    It holds the host, TLD, domain, registrable domain and subdomain
    strings of a host address. All of them start out as empty strings.
    """
    def __init__(self):
        """
        Constructor
        """
        # strings are immutable, so sharing one initial value is safe
        self.host = self.tld = self.domain = ""
        self.registrableDomain = self.subdomain = ""
|
37 |
|
38 |
|
class EricTldExtractor(QObject):
    """
    Class implementing the TLD Extractor.

    Note: The module function instance() should be used to get a reference
    to a global object to avoid overhead.
    """
    # Single source for the data file download location; it is used for
    # getTldDownloadUrl() and for the 'file not found' message.
    # NOTE(review): the MXR service behind this URL appears to be retired;
    # the Public Suffix List is published at https://publicsuffix.org/list/
    # - confirm and update the URL if so.
    __tldDataDownloadUrl = (
        "http://mxr.mozilla.org/mozilla-central/source/netwerk/dns/"
        "effective_tld_names.dat?raw=1"
    )

    def __init__(self, withPrivate=False, parent=None):
        """
        Constructor

        @param withPrivate flag indicating to load private TLDs as well
        @type bool
        @param parent reference to the parent object
        @type QObject
        """
        super().__init__(parent)

        self.__withPrivate = withPrivate
        self.__dataFileName = ""
        self.__dataSearchPaths = []

        self.__tldDict = collections.defaultdict(list)
        # dict with list of str as values

        self.setDataSearchPaths()

    def isDataLoaded(self):
        """
        Public method to check, if the TLD data is already loaded.

        @return flag indicating data is loaded
        @rtype bool
        """
        return bool(self.__tldDict)

    def tld(self, host):
        """
        Public method to get the top level domain for a host.

        The TLD data is loaded on demand. If the last label of the host is
        not contained in the TLD data, that label is returned unchanged.

        @param host host name to get TLD for
        @type str
        @return TLD for host
        @rtype str
        """
        if not host or host.startswith("."):
            return ""

        cleanHost = self.__normalizedHost(host)

        # The rule dictionary is keyed by the last label as written in the
        # data file, so the key is taken before the ACE conversion.
        tldPart = cleanHost[cleanHost.rfind(".") + 1:]
        cleanHost = bytes(QUrl.toAce(cleanHost)).decode("utf-8")

        self.__loadData()

        if tldPart not in self.__tldDict:
            return tldPart

        # work on a copy in order to not modify the loaded data
        tldRules = self.__tldDict[tldPart][:]
        if tldPart not in tldRules:
            tldRules.append(tldPart)

        maxLabelCount = 0
        # A leading dot on host and rule makes the suffix tests match whole
        # labels only.
        testUrl = "." + cleanHost

        for rule in tldRules:
            labelCount = rule.count(".") + 1

            if rule.startswith("!"):
                # Exception rule: it wins over all other rules and the TLD
                # is the rule without its leftmost label.
                rule = bytes(QUrl.toAce(rule[1:])).decode("utf-8")
                if testUrl.endswith("." + rule):
                    tldPart = rule[rule.find(".") + 1:]
                    break

                # an exception rule never acts as an ordinary rule
                continue

            isWildcardTLD = rule.startswith("*")
            if isWildcardTLD:
                rule = rule[1:]
                if rule.startswith("."):
                    rule = rule[1:]

            rule = bytes(QUrl.toAce(rule)).decode("utf-8")

            # the matching rule with the most labels prevails
            if labelCount > maxLabelCount and testUrl.endswith("." + rule):
                tldPart = rule
                maxLabelCount = labelCount

                if isWildcardTLD:
                    # the wildcard stands for the host label directly in
                    # front of the rule
                    temp = cleanHost[:cleanHost.rfind(tldPart)]
                    if temp.endswith("."):
                        temp = temp[:-1]
                    temp = temp[temp.rfind(".") + 1:]

                    if temp:
                        tldPart = temp + "." + rule
                    else:
                        tldPart = rule

        # Return the labels as given in the (normalized) host; only the
        # number of labels of the calculated TLD is used.
        temp = self.__normalizedHost(host)
        tldPart = ".".join(
            temp.split(".")[temp.count(".") - tldPart.count("."):])

        return tldPart

    def domain(self, host):
        """
        Public method to get the domain for a host.

        @param host host name to get the domain for
        @type str
        @return domain for host
        @rtype str
        """
        tldPart = self.tld(host)

        return self.__domainHelper(host, tldPart)

    def registrableDomain(self, host):
        """
        Public method to get the registrable domain for a host.

        @param host host name to get the registrable domain for
        @type str
        @return registrable domain for host
        @rtype str
        """
        tldPart = self.tld(host)

        return self.__registrableDomainHelper(
            self.__domainHelper(host, tldPart), tldPart)

    def subdomain(self, host):
        """
        Public method to get the subdomain for a host.

        @param host host name to get the subdomain for
        @type str
        @return subdomain for host
        @rtype str
        """
        return self.__subdomainHelper(host, self.registrableDomain(host))

    def splitParts(self, host):
        """
        Public method to split a host address into its parts.

        @param host host address to be split
        @type str
        @return splitted host address
        @rtype EricTldHostParts
        """
        hostParts = EricTldHostParts()
        hostParts.host = host
        hostParts.tld = self.tld(host)
        hostParts.domain = self.__domainHelper(host, hostParts.tld)
        hostParts.registrableDomain = self.__registrableDomainHelper(
            hostParts.domain, hostParts.tld)
        hostParts.subdomain = self.__subdomainHelper(
            host, hostParts.registrableDomain)

        return hostParts

    def dataSearchPaths(self):
        """
        Public method to get the search paths for the TLD data file.

        @return copy of the search paths for the TLD data file
        @rtype list of str
        """
        return self.__dataSearchPaths[:]

    def setDataSearchPaths(self, searchPaths=None):
        """
        Public method to set the search paths for the TLD data file.

        The default search paths are always appended to the given ones and
        duplicate entries are removed keeping the first occurrence.

        @param searchPaths search paths for the TLD data file or None,
            if the default search paths shall be set
        @type list of str
        """
        paths = list(searchPaths) if searchPaths else []
        paths.extend(self.__defaultDataSearchPaths())

        # remove duplicates while preserving the order
        self.__dataSearchPaths = list(dict.fromkeys(paths))

    def __defaultDataSearchPaths(self):
        """
        Private method to get the default search paths for the TLD data file.

        @return default search paths for the TLD data file
        @rtype list of str
        """
        return [os.path.join(os.path.dirname(__file__), "data")]

    def getTldDownloadUrl(self):
        """
        Public method to get the TLD data file download URL.

        @return download URL
        @rtype QUrl
        """
        return QUrl(self.__tldDataDownloadUrl)

    def __findDataFile(self, fileName):
        """
        Private method to look for a file in the data search paths.

        @param fileName name of the file to look for
        @type str
        @return absolute path of the first existing file or an empty
            string, if the file was not found
        @rtype str
        """
        for path in self.__dataSearchPaths:
            fileInfo = QFileInfo(path + "/" + fileName)
            if fileInfo.exists():
                return fileInfo.absoluteFilePath()

        return ""

    def __loadData(self):
        """
        Private method to load the TLD data.

        Nothing is done, if the data is loaded already. If the data file
        cannot be found in the search paths, an information message is
        shown and no data is loaded.
        """
        if self.isDataLoaded():
            return

        dataFileName = self.__findDataFile("effective_tld_names.dat")
        if not dataFileName:
            E5MessageBox.information(
                None,
                self.tr("TLD Data File not found"),
                self.tr("""<p>The file 'effective_tld_names.dat' was not"""
                        """ found!<br/>You can download it from """
                        """'<a href="{0}"><b>here</b></a>' to one of the"""
                        """ following paths:</p><ul>{1}</ul>""").format(
                    self.__tldDataDownloadUrl,
                    "".join(["<li>{0}</li>".format(p)
                             for p in self.__dataSearchPaths]))
            )
            return

        self.__dataFileName = dataFileName
        if not self.__parseData(dataFileName,
                                loadPrivateDomains=self.__withPrivate):
            qWarning(
                "EricTldExtractor: There are some parse errors for file: {0}"
                .format(dataFileName))

    def __parseData(self, dataFile, loadPrivateDomains=False):
        """
        Private method to parse TLD data.

        Each rule is stored in the TLD dictionary under its last label.

        @param dataFile name of the file containing the TLD data
        @type str
        @param loadPrivateDomains flag indicating to load private domains
        @type bool
        @return flag indicating success (i.e. at least one rule was read)
        @rtype bool
        """
        # start with a fresh dictionary
        self.__tldDict = collections.defaultdict(list)

        tldFile = QFile(dataFile)

        # PyQt6 exposes enum members through their enum type only
        if not tldFile.open(
            QFile.OpenModeFlag.ReadOnly | QFile.OpenModeFlag.Text
        ):
            return False

        seekToEndOfPrivateDomains = False

        try:
            while not tldFile.atEnd():
                line = bytes(tldFile.readLine()).decode("utf-8").strip()
                if not line:
                    continue

                if line.startswith("."):
                    line = line[1:]

                if line.startswith("//"):
                    # comment lines carry the private domains markers
                    if "===END PRIVATE DOMAINS===" in line:
                        seekToEndOfPrivateDomains = False

                    if (
                        not loadPrivateDomains and
                        "===BEGIN PRIVATE DOMAINS===" in line
                    ):
                        seekToEndOfPrivateDomains = True

                    continue

                if seekToEndOfPrivateDomains:
                    continue

                # only data up to the first whitespace is used
                fields = line.split(None, 1)
                if not fields:
                    # nothing left, e.g. the line consisted of a dot only
                    continue

                rule = fields[0]
                # rfind() yields -1 for a rule without a dot, which makes
                # the whole rule the key
                self.__tldDict[rule[rule.rfind(".") + 1:]].append(rule)
        finally:
            tldFile.close()

        return self.isDataLoaded()

    def __domainHelper(self, host, tldPart):
        """
        Private method to get the domain name without TLD.

        @param host host address
        @type str
        @param tldPart TLD part of the host address
        @type str
        @return domain name
        @rtype str
        """
        if not host or not tldPart:
            return ""

        temp = self.__normalizedHost(host)
        temp = temp[:temp.rfind(tldPart)]

        if temp.endswith("."):
            temp = temp[:-1]

        return temp[temp.rfind(".") + 1:]

    def __registrableDomainHelper(self, domainPart, tldPart):
        """
        Private method to get the registrable domain (i.e. domain plus TLD).

        @param domainPart domain part of a host address
        @type str
        @param tldPart TLD part of a host address
        @type str
        @return registrable domain name
        @rtype str
        """
        if not tldPart or not domainPart:
            return ""

        return "{0}.{1}".format(domainPart, tldPart)

    def __subdomainHelper(self, host, registrablePart):
        """
        Private method to get the subdomain of a host address (i.e. domain part
        without the registrable domain name).

        @param host host address
        @type str
        @param registrablePart registrable domain part of the host address
        @type str
        @return subdomain name
        @rtype str
        """
        if not host or not registrablePart:
            return ""

        subdomain = self.__normalizedHost(host)
        subdomain = subdomain[:subdomain.rfind(registrablePart)]

        if subdomain.endswith("."):
            subdomain = subdomain[:-1]

        return subdomain

    def __normalizedHost(self, host):
        """
        Private method to get the normalized host for a host address.

        @param host host address to be normalized
        @type str
        @return normalized (i.e. lower case) host address
        @rtype str
        """
        return host.lower()

    #################################################################
    ## Methods below are for testing purposes
    #################################################################

    def test(self):
        """
        Public method to execute the tests.

        The tests are run against data including the private domains.
        Afterwards the private domains setting of this object is restored
        and the TLD data is discarded, so it gets reloaded on next use.

        @return flag indicating the test result
        @rtype bool
        """
        savedWithPrivate = self.__withPrivate
        self.__withPrivate = True
        # discard data that may have been loaded without private domains
        self.__tldDict = collections.defaultdict(list)

        try:
            return self.__runTest()
        finally:
            self.__withPrivate = savedWithPrivate
            # reset the TLD dictionary
            self.__tldDict = collections.defaultdict(list)

    def __runTest(self):
        """
        Private method to check all entries of the file 'test_psl.txt'.

        @return flag indicating the test result
        @rtype bool
        """
        self.__loadData()
        if not self.__tldDict:
            return False

        testDataFileName = self.__findDataFile("test_psl.txt")
        if not testDataFileName:
            testFileDownloadLink = (
                "http://mxr.mozilla.org/mozilla-central/source/netwerk/test/"
                "unit/data/test_psl.txt?raw=1"
            )
            E5MessageBox.information(
                None,
                self.tr("TLD Data File not found"),
                self.tr("""<p>The file 'test_psl.txt' was not found!"""
                        """<br/>You can download it from '<a href="{0}">"""
                        """<b>here</b></a>' to one of the following"""
                        """ paths:</p><ul>{1}</ul>""").format(
                    testFileDownloadLink,
                    "".join(["<li>{0}</li>".format(p)
                             for p in self.__dataSearchPaths]))
            )
            return False

        testFile = QFile(testDataFileName)

        # PyQt6 exposes enum members through their enum type only
        if not testFile.open(
            QFile.OpenModeFlag.ReadOnly | QFile.OpenModeFlag.Text
        ):
            return False

        testRegExp = re.compile(
            r"checkPublicSuffix\(('([^']+)'|null), ('([^']+)'|null)\);")
        allTestSuccess = True

        try:
            while not testFile.atEnd():
                line = bytes(testFile.readLine()).decode("utf-8").strip()
                if not line or line.startswith("//"):
                    continue

                match = testRegExp.search(line)
                if match is None:
                    allTestSuccess = False
                    continue

                # a 'null' argument leaves its group unset (None); treat
                # it as the empty string
                hostName = match.group(2) or ""
                registrableName = match.group(4) or ""

                if not self.__checkPublicSuffix(hostName, registrableName):
                    allTestSuccess = False
        finally:
            testFile.close()

        if allTestSuccess:
            qWarning("EricTldExtractor: Test passed successfully.")
        else:
            qWarning("EricTldExtractor: Test finished with some errors!")

        return allTestSuccess

    def __checkPublicSuffix(self, host, registrableName):
        """
        Private method to test a host name against a registrable name.

        @param host host name to test (None is treated as empty)
        @type str
        @param registrableName registrable domain name to test against
            (None is treated as empty)
        @type str
        @return flag indicating the check result
        @rtype bool
        """
        host = host or ""
        registrableName = registrableName or ""

        regName = self.registrableDomain(host)
        if regName != registrableName:
            qWarning(
                "EricTldExtractor Test Error: hostName: {0}\n"
                "    Correct registrableName:    {1}\n"
                "    Calculated registrableName: {2}".format(
                    host, registrableName, regName))
            return False

        return True
|
534 |
|
535 |
|
# module wide TLD extractor object; created on first call of instance()
_TLDExtractor = None


def instance(withPrivate=False):
    """
    Global function to get a reference to the TLD extractor and create it, if
    it hasn't been yet.

    Note: The withPrivate flag is only used when the object is created by
    this call. An already existing object is returned unchanged.

    @param withPrivate flag indicating to load private TLDs as well
    @type bool
    @return reference to the TLD extractor object
    @rtype EricTldExtractor
    """
    global _TLDExtractor

    if _TLDExtractor is None:
        _TLDExtractor = EricTldExtractor(withPrivate=withPrivate)

    return _TLDExtractor