eric7/EricNetwork/EricTldExtractor.py

branch
eric7
changeset 8354
12ebd3934fef
parent 8318
962bce857696
child 8356
68ec9c3d4de5
equal deleted inserted replaced
8353:799196d0b05d 8354:12ebd3934fef
1 # -*- coding: utf-8 -*-
2
3 # Copyright (c) 2016 - 2021 Detlev Offenbach <detlev@die-offenbachs.de>
4 #
5
6 """
7 Module implementing the TLD Extractor.
8 """
9
10 #
11 # This is a Python port of the TLDExtractor of Qupzilla
12 # Copyright (C) 2014 Razi Alavizadeh <s.r.alavizadeh@gmail.com>
13 #
14
15 import collections
16 import os
17 import re
18
19 from PyQt6.QtCore import QObject, QUrl, QFile, QFileInfo, qWarning
20
21 from E5Gui import E5MessageBox
22
23
class EricTldHostParts:
    """
    Class implementing the host parts helper.

    It is a plain record for the components of a host name. All attributes
    (host, tld, domain, registrableDomain and subdomain) start out as empty
    strings and are meant to be filled in by the code creating the object.
    """
    def __init__(self):
        """
        Constructor
        """
        # every part is a str attribute that is initially empty
        partNames = (
            "host", "tld", "domain", "registrableDomain", "subdomain",
        )
        for partName in partNames:
            setattr(self, partName, "")
37
38
class EricTldExtractor(QObject):
    """
    Class implementing the TLD Extractor.

    The extractor splits host names into TLD, domain, registrable domain
    and subdomain using the rules of an 'effective_tld_names.dat' file,
    which is searched for in a configurable list of directories and loaded
    lazily on first use.

    Note: The module function instance() should be used to get a reference
    to a global object to avoid overhead.
    """
    def __init__(self, withPrivate=False, parent=None):
        """
        Constructor

        @param withPrivate flag indicating to load private TLDs as well
        @type bool
        @param parent reference to the parent object
        @type QObject
        """
        super().__init__(parent)

        self.__withPrivate = withPrivate
        self.__dataFileName = ""
        self.__dataSearchPaths = []

        self.__tldDict = collections.defaultdict(list)
        # dict with list of str as values

        self.setDataSearchPaths()

    def isDataLoaded(self):
        """
        Public method to check, if the TLD data is already loaded.

        @return flag indicating data is loaded
        @rtype bool
        """
        return bool(self.__tldDict)

    def tld(self, host):
        """
        Public method to get the top level domain for a host.

        @param host host name to get TLD for
        @type str
        @return TLD for host (empty for an empty host or a host starting
            with a dot)
        @rtype str
        """
        if not host or host.startswith("."):
            return ""

        cleanHost = self.__normalizedHost(host)

        # the last label is the key into the rules dictionary; the host
        # itself is compared in its ACE form
        tldPart = cleanHost[cleanHost.rfind(".") + 1:]
        cleanHost = bytes(QUrl.toAce(cleanHost)).decode("utf-8")

        self.__loadData()

        if tldPart not in self.__tldDict:
            return tldPart

        # work on a copy in order to not modify the loaded rules
        tldRules = self.__tldDict[tldPart][:]

        if tldPart not in tldRules:
            tldRules.append(tldPart)

        maxLabelCount = 0
        isWildcardTLD = False

        for rule in tldRules:
            labelCount = rule.count(".") + 1

            if rule.startswith("!"):
                rule = rule[1:]

                rule = bytes(QUrl.toAce(rule)).decode("utf-8")

                # matches with exception TLD
                if cleanHost.endswith(rule):
                    tldPart = rule[rule.find(".") + 1:]
                    break

            if rule.startswith("*"):
                rule = rule[1:]

                if rule.startswith("."):
                    rule = rule[1:]

                isWildcardTLD = True
            else:
                isWildcardTLD = False

            rule = bytes(QUrl.toAce(rule)).decode("utf-8")
            # the leading dot makes sure only complete labels are matched
            testRule = "." + rule
            testUrl = "." + cleanHost

            # the matching rule with the most labels wins
            if labelCount > maxLabelCount and testUrl.endswith(testRule):
                tldPart = rule
                maxLabelCount = labelCount

                if isWildcardTLD:
                    # a wildcard rule includes the label in front of it
                    temp = cleanHost
                    temp = temp[:temp.rfind(tldPart)]

                    if temp.endswith("."):
                        temp = temp[:-1]

                    temp = temp[temp.rfind(".") + 1:]

                    if temp:
                        tldPart = temp + "." + rule
                    else:
                        tldPart = rule

        # take the same number of trailing labels from the normalized host,
        # so the result is given in the notation of the host (not in ACE)
        temp = self.__normalizedHost(host)
        tldPart = ".".join(
            temp.split(".")[temp.count(".") - tldPart.count("."):])

        return tldPart

    def domain(self, host):
        """
        Public method to get the domain for a host.

        @param host host name to get the domain for
        @type str
        @return domain for host
        @rtype str
        """
        tldPart = self.tld(host)

        return self.__domainHelper(host, tldPart)

    def registrableDomain(self, host):
        """
        Public method to get the registrable domain for a host.

        @param host host name to get the registrable domain for
        @type str
        @return registrable domain for host
        @rtype str
        """
        tldPart = self.tld(host)

        return self.__registrableDomainHelper(
            self.__domainHelper(host, tldPart), tldPart)

    def subdomain(self, host):
        """
        Public method to get the subdomain for a host.

        @param host host name to get the subdomain for
        @type str
        @return subdomain for host
        @rtype str
        """
        return self.__subdomainHelper(host, self.registrableDomain(host))

    def splitParts(self, host):
        """
        Public method to split a host address into its parts.

        @param host host address to be split
        @type str
        @return split host address
        @rtype EricTldHostParts
        """
        hostParts = EricTldHostParts()
        hostParts.host = host
        hostParts.tld = self.tld(host)
        hostParts.domain = self.__domainHelper(host, hostParts.tld)
        hostParts.registrableDomain = self.__registrableDomainHelper(
            hostParts.domain, hostParts.tld)
        hostParts.subdomain = self.__subdomainHelper(
            host, hostParts.registrableDomain)

        return hostParts

    def dataSearchPaths(self):
        """
        Public method to get the search paths for the TLD data file.

        @return copy of the search paths for the TLD data file
        @rtype list of str
        """
        return self.__dataSearchPaths[:]

    def setDataSearchPaths(self, searchPaths=None):
        """
        Public method to set the search paths for the TLD data file.

        The default search paths are always appended to the given ones.

        @param searchPaths search paths for the TLD data file or None,
            if the default search paths shall be set
        @type list of str
        """
        paths = self.__defaultDataSearchPaths()
        if searchPaths:
            paths = list(searchPaths) + paths

        # remove duplicates while keeping the search order
        self.__dataSearchPaths = list(dict.fromkeys(paths))

    def __defaultDataSearchPaths(self):
        """
        Private method to get the default search paths for the TLD data file.

        @return default search paths for the TLD data file
        @rtype list of str
        """
        return [os.path.join(os.path.dirname(__file__), "data")]

    def getTldDownloadUrl(self):
        """
        Public method to get the TLD data file download URL.

        @return download URL
        @rtype QUrl
        """
        return QUrl(
            "http://mxr.mozilla.org/mozilla-central/source/netwerk/dns/"
            "effective_tld_names.dat?raw=1")

    def __loadData(self):
        """
        Private method to load the TLD data.

        Nothing is done, if the data is loaded already. If the data file
        cannot be found in any of the search paths, the user is informed
        and the data stays unloaded.
        """
        if self.isDataLoaded():
            return

        dataFileName = ""
        parsedDataFileExist = False

        for path in self.__dataSearchPaths:
            dataFileName = (
                QFileInfo(path + "/effective_tld_names.dat").absoluteFilePath()
            )
            if QFileInfo(dataFileName).exists():
                parsedDataFileExist = True
                break

        if not parsedDataFileExist:
            # use the one and only definition of the download URL
            tldDataFileDownloadLink = self.getTldDownloadUrl().toString()
            E5MessageBox.information(
                None,
                self.tr("TLD Data File not found"),
                self.tr("""<p>The file 'effective_tld_names.dat' was not"""
                        """ found!<br/>You can download it from """
                        """'<a href="{0}"><b>here</b></a>' to one of the"""
                        """ following paths:</p><ul>{1}</ul>""").format(
                    tldDataFileDownloadLink,
                    "".join(["<li>{0}</li>".format(p)
                             for p in self.__dataSearchPaths]))
            )
            return

        self.__dataFileName = dataFileName
        if not self.__parseData(dataFileName,
                                loadPrivateDomains=self.__withPrivate):
            qWarning(
                "EricTldExtractor: There are some parse errors for file: {0}"
                .format(dataFileName))

    def __parseData(self, dataFile, loadPrivateDomains=False):
        """
        Private method to parse TLD data.

        The rules are stored in a dictionary keyed by their last label.

        @param dataFile name of the file containing the TLD data
        @type str
        @param loadPrivateDomains flag indicating to load private domains
        @type bool
        @return flag indicating success
        @rtype bool
        """
        # start with a fresh dictionary
        self.__tldDict = collections.defaultdict(list)

        file = QFile(dataFile)

        # PyQt6 requires the fully scoped enum names
        if not file.open(
            QFile.OpenModeFlag.ReadOnly | QFile.OpenModeFlag.Text
        ):
            return False

        seekToEndOfPrivateDomains = False

        try:
            while not file.atEnd():
                line = bytes(file.readLine()).decode("utf-8").strip()
                if not line:
                    continue

                if line.startswith("."):
                    line = line[1:]

                if line.startswith("//"):
                    # comment lines carry the private section markers
                    if "===END PRIVATE DOMAINS===" in line:
                        seekToEndOfPrivateDomains = False

                    if (
                        not loadPrivateDomains and
                        "===BEGIN PRIVATE DOMAINS===" in line
                    ):
                        seekToEndOfPrivateDomains = True

                    continue

                if seekToEndOfPrivateDomains:
                    continue

                # only data up to the first whitespace is used
                fields = line.split(None, 1)
                if not fields:
                    # nothing left (e.g. the line was just a dot)
                    continue
                line = fields[0]

                # the last label (the whole line, if it contains no dot)
                # is the key of the rule
                key = line[line.rfind(".") + 1:]
                self.__tldDict[key].append(line)
        finally:
            file.close()

        return self.isDataLoaded()

    def __domainHelper(self, host, tldPart):
        """
        Private method to get the domain name without TLD.

        @param host host address
        @type str
        @param tldPart TLD part of the host address
        @type str
        @return domain name
        @rtype str
        """
        if not host or not tldPart:
            return ""

        # cut off the TLD and the dot in front of it
        temp = self.__normalizedHost(host)
        temp = temp[:temp.rfind(tldPart)]

        if temp.endswith("."):
            temp = temp[:-1]

        # the domain is the last label of the remainder
        return temp[temp.rfind(".") + 1:]

    def __registrableDomainHelper(self, domainPart, tldPart):
        """
        Private method to get the registrable domain (i.e. domain plus TLD).

        @param domainPart domain part of a host address
        @type str
        @param tldPart TLD part of a host address
        @type str
        @return registrable domain name (empty, if one of the parts is
            empty)
        @rtype str
        """
        if not tldPart or not domainPart:
            return ""

        return "{0}.{1}".format(domainPart, tldPart)

    def __subdomainHelper(self, host, registrablePart):
        """
        Private method to get the subdomain of a host address (i.e. domain part
        without the registrable domain name).

        @param host host address
        @type str
        @param registrablePart registrable domain part of the host address
        @type str
        @return subdomain name
        @rtype str
        """
        if not host or not registrablePart:
            return ""

        subdomain = self.__normalizedHost(host)

        # cut off the registrable domain and the dot in front of it
        subdomain = subdomain[:subdomain.rfind(registrablePart)]

        if subdomain.endswith("."):
            subdomain = subdomain[:-1]

        return subdomain

    def __normalizedHost(self, host):
        """
        Private method to get the normalized host for a host address.

        @param host host address to be normalized
        @type str
        @return normalized (i.e. lower case) host address
        @rtype str
        """
        return host.lower()

    #################################################################
    ## Methods below are for testing purposes
    #################################################################

    def test(self):
        """
        Public method to execute the tests.

        The TLD data is reloaded including the private domains for the
        test run. Afterwards the original private domains setting is
        restored and the TLD dictionary is reset, so the data gets loaded
        again on the next use.

        @return flag indicating the test result
        @rtype bool
        """
        withPrivate = self.__withPrivate
        self.__withPrivate = True
        # force a reload; already loaded data may lack the private domains
        self.__tldDict = collections.defaultdict(list)
        try:
            return self.__runTest()
        finally:
            self.__withPrivate = withPrivate
            # reset the TLD dictionary
            self.__tldDict = collections.defaultdict(list)

    def __runTest(self):
        """
        Private method to check the extractor against the 'test_psl.txt' file.

        @return flag indicating the test result
        @rtype bool
        """
        self.__loadData()
        if not self.__tldDict:
            return False

        testDataFileName = ""
        testDataFileExist = False

        for path in self.__dataSearchPaths:
            testDataFileName = (
                QFileInfo(path + "/test_psl.txt").absoluteFilePath()
            )
            if QFileInfo(testDataFileName).exists():
                testDataFileExist = True
                break

        if not testDataFileExist:
            testFileDownloadLink = (
                "http://mxr.mozilla.org/mozilla-central/source/netwerk/test/"
                "unit/data/test_psl.txt?raw=1"
            )
            E5MessageBox.information(
                None,
                self.tr("TLD Data File not found"),
                self.tr("""<p>The file 'test_psl.txt' was not found!"""
                        """<br/>You can download it from '<a href="{0}">"""
                        """<b>here</b></a>' to one of the following"""
                        """ paths:</p><ul>{1}</ul>""").format(
                    testFileDownloadLink,
                    "".join(["<li>{0}</li>".format(p)
                             for p in self.__dataSearchPaths]))
            )
            return False

        file = QFile(testDataFileName)

        # PyQt6 requires the fully scoped enum names
        if not file.open(
            QFile.OpenModeFlag.ReadOnly | QFile.OpenModeFlag.Text
        ):
            return False

        testRegExp = re.compile(
            r"checkPublicSuffix\(('([^']+)'|null), ('([^']+)'|null)\);")
        allTestSuccess = True

        try:
            while not file.atEnd():
                line = bytes(file.readLine()).decode("utf-8").strip()
                if not line or line.startswith("//"):
                    continue

                match = testRegExp.search(line)
                if match is None:
                    allTestSuccess = False
                else:
                    # a 'null' entry yields None for its group; the
                    # extractor reports such cases as an empty string
                    hostName = match.group(2) or ""
                    registrableName = match.group(4) or ""

                    if not self.__checkPublicSuffix(hostName,
                                                    registrableName):
                        allTestSuccess = False
        finally:
            file.close()

        if allTestSuccess:
            qWarning("EricTldExtractor: Test passed successfully.")
        else:
            qWarning("EricTldExtractor: Test finished with some errors!")

        return allTestSuccess

    def __checkPublicSuffix(self, host, registrableName):
        """
        Private method to test a host name against a registrable name.

        A mismatch is reported via qWarning().

        @param host host name to test
        @type str
        @param registrableName registrable domain name to test against
        @type str
        @return flag indicating the check result
        @rtype bool
        """
        regName = self.registrableDomain(host)
        if regName != registrableName:
            qWarning(
                "EricTldExtractor Test Error: hostName: {0}\n"
                " Correct registrableName: {1}\n"
                " Calculated registrableName: {2}".format(
                    host, registrableName, regName))
            return False

        return True
534
535
_TLDExtractor = None


def instance(withPrivate=False):
    """
    Global function to get a reference to the shared TLD extractor.

    The extractor is created by the first call. Later calls return that
    same object, so their withPrivate argument has no effect.

    @param withPrivate flag indicating to load private TLDs as well
    @type bool
    @return reference to the TLD extractor object
    @rtype EricTldExtractor
    """
    global _TLDExtractor

    if _TLDExtractor is not None:
        return _TLDExtractor

    _TLDExtractor = EricTldExtractor(withPrivate=withPrivate)
    return _TLDExtractor

eric ide

mercurial