E5Network/E5TldExtractor.py

changeset 4971
0f21662c0d2d
child 5389
9b1c800daff3
equal deleted inserted replaced
4970:dcbb14191a3b 4971:0f21662c0d2d
1 # -*- coding: utf-8 -*-
2
3 # Copyright (c) 2016 Detlev Offenbach <detlev@die-offenbachs.de>
4 #
5
6 """
7 Module implementing the TLD Extractor.
8 """
9
10 #
11 # This is a Python port of the TLDExtractor of Qupzilla
12 # Copyright (C) 2014 Razi Alavizadeh <s.r.alavizadeh@gmail.com>
13 #
14
15 from __future__ import unicode_literals
16
17 import collections
18
19 from PyQt5.QtCore import QObject, QUrl, QFile, QFileInfo, QRegExp, qWarning
20
21 from E5Gui import E5MessageBox
22
23 from .data import tld_rc # __IGNORE_WARNING__
24
25
class E5TldHostParts(object):
    """
    Class implementing a simple container for the parts of a host name.

    A new object starts out with all parts set to the empty string.
    """
    def __init__(self):
        """
        Constructor
        """
        # strings are immutable, so sharing one initial value is safe
        self.host = self.tld = self.domain = ""
        self.registrableDomain = self.subdomain = ""
39
40
class E5TldExtractor(QObject):
    """
    Class implementing the TLD Extractor.

    The extractor splits host names into top level domain, domain,
    registrable domain and subdomain using the rules read from a file
    named 'effective_tld_names.dat'. This file is searched for in the
    configured data search paths and is loaded on the first lookup.

    Note: The module function instance() should be used to get a reference
    to a global object to avoid overhead.
    """
    def __init__(self, withPrivate=False, parent=None):
        """
        Constructor

        @param withPrivate flag indicating to load private TLDs as well
        @type bool
        @param parent reference to the parent object
        @type QObject
        """
        super(E5TldExtractor, self).__init__(parent)

        self.__withPrivate = withPrivate
        self.__dataFileName = ""
        self.__dataSearchPaths = []

        self.__tldDict = collections.defaultdict(list)
        # dict with list of str as values; the key is the last label of
        # a rule, the value is the list of all rules ending in that label

        self.setDataSearchPaths()

    def isDataLoaded(self):
        """
        Public method to check, if the TLD data is already loaded.

        @return flag indicating data is loaded
        @rtype bool
        """
        return bool(self.__tldDict)

    def tld(self, host):
        """
        Public method to get the top level domain for a host.

        The TLD data is loaded on demand. If the last label of the host is
        not contained in the loaded data, this last label is returned.

        @param host host name to get TLD for
        @type str
        @return TLD for host
        @rtype str
        """
        if not host or host.startswith("."):
            return ""

        cleanHost = self.__normalizedHost(host)

        # the last label is the fallback result and the key of the rules
        tldPart = cleanHost[cleanHost.rfind(".") + 1:]
        cleanHost = bytes(QUrl.toAce(cleanHost)).decode("utf-8")

        self.__loadData()

        if tldPart not in self.__tldDict:
            return tldPart

        tldRules = self.__tldDict[tldPart][:]

        if tldPart not in tldRules:
            tldRules.append(tldPart)

        maxLabelCount = 0
        # a leading dot makes sure that only complete labels are matched
        testUrl = "." + cleanHost

        for rule in tldRules:
            labelCount = rule.count(".") + 1

            if rule.startswith("!"):
                rule = bytes(QUrl.toAce(rule[1:])).decode("utf-8")

                # matches with exception TLD; compare on label boundaries
                # so that e.g. 'foocity.kawasaki.jp' is not taken as a
                # match for the exception rule '!city.kawasaki.jp'
                if testUrl.endswith("." + rule):
                    tldPart = rule[rule.find(".") + 1:]
                    break

                # an exception rule not matching the host cannot match as
                # a normal rule either
                continue

            isWildcardTLD = rule.startswith("*")
            if isWildcardTLD:
                rule = rule[1:]

                if rule.startswith("."):
                    rule = rule[1:]

            rule = bytes(QUrl.toAce(rule)).decode("utf-8")
            testRule = "." + rule

            # the matching rule with the most labels wins
            if labelCount > maxLabelCount and testUrl.endswith(testRule):
                tldPart = rule
                maxLabelCount = labelCount

                if isWildcardTLD:
                    # the wildcard stands for the label directly in
                    # front of the rule (if the host has one)
                    temp = cleanHost[:cleanHost.rfind(tldPart)]

                    if temp.endswith("."):
                        temp = temp[:-1]

                    temp = temp[temp.rfind(".") + 1:]

                    if temp:
                        tldPart = temp + "." + rule

        # take the labels from the normalized (not ACE encoded) host name;
        # only the number of labels of the determined TLD is used
        temp = self.__normalizedHost(host)
        tldPart = ".".join(
            temp.split(".")[temp.count(".") - tldPart.count("."):])

        return tldPart

    def domain(self, host):
        """
        Public method to get the domain for a host.

        @param host host name to get the domain for
        @type str
        @return domain for host
        @rtype str
        """
        tldPart = self.tld(host)

        return self.__domainHelper(host, tldPart)

    def registrableDomain(self, host):
        """
        Public method to get the registrable domain for a host.

        @param host host name to get the registrable domain for
        @type str
        @return registrable domain for host
        @rtype str
        """
        tldPart = self.tld(host)

        return self.__registrableDomainHelper(
            self.__domainHelper(host, tldPart), tldPart)

    def subdomain(self, host):
        """
        Public method to get the subdomain for a host.

        @param host host name to get the subdomain for
        @type str
        @return subdomain for host
        @rtype str
        """
        return self.__subdomainHelper(host, self.registrableDomain(host))

    def splitParts(self, host):
        """
        Public method to split a host address into its parts.

        @param host host address to be split
        @type str
        @return split host address
        @rtype E5TldHostParts
        """
        hostParts = E5TldHostParts()
        hostParts.host = host
        hostParts.tld = self.tld(host)
        hostParts.domain = self.__domainHelper(host, hostParts.tld)
        hostParts.registrableDomain = self.__registrableDomainHelper(
            hostParts.domain, hostParts.tld)
        hostParts.subdomain = self.__subdomainHelper(
            host, hostParts.registrableDomain)

        return hostParts

    def dataSearchPaths(self):
        """
        Public method to get the search paths for the TLD data file.

        @return copy of the search paths for the TLD data file
        @rtype list of str
        """
        return self.__dataSearchPaths[:]

    def setDataSearchPaths(self, searchPaths=None):
        """
        Public method to set the search paths for the TLD data file.

        The default search paths are always appended to the given ones.

        @param searchPaths search paths for the TLD data file or None,
            if the default search paths shall be set
        @type list of str
        """
        if searchPaths:
            self.__dataSearchPaths = searchPaths[:]
            self.__dataSearchPaths.extend(self.__defaultDataSearchPaths())
        else:
            self.__dataSearchPaths = self.__defaultDataSearchPaths()[:]

        # remove duplicates while keeping the order of the entries
        paths = []
        for p in self.__dataSearchPaths:
            if p not in paths:
                paths.append(p)
        self.__dataSearchPaths = paths

    def __defaultDataSearchPaths(self):
        """
        Private method to get the default search paths for the TLD data file.

        @return default search paths for the TLD data file
        @rtype list of str
        """
        # NOTE(review): ":" is presumably the root of the Qt resource
        # system populated by the imported tld_rc module - confirm
        return [":"]

    def getTldDownloadUrl(self):
        """
        Public method to get the TLD data file download URL.

        @return download URL
        @rtype QUrl
        """
        return QUrl(
            "http://mxr.mozilla.org/mozilla-central/source/netwerk/dns/"
            "effective_tld_names.dat?raw=1")

    def __loadData(self):
        """
        Private method to load the TLD data.

        Nothing is done, if the data is loaded already. If the data file
        cannot be found in any of the search paths, the user is informed
        through a message box and no data is loaded.
        """
        if self.isDataLoaded():
            return

        for path in self.__dataSearchPaths:
            dataFileName = QFileInfo(path + "/effective_tld_names.dat")\
                .absoluteFilePath()
            if QFileInfo(dataFileName).exists():
                break
        else:
            # no search path contains the data file
            E5MessageBox.information(
                None,
                self.tr("TLD Data File not found"),
                self.tr("""<p>The file 'effective_tld_names.dat' was not"""
                        """ found!<br/>You can download it from """
                        """'<a href="{0}"><b>here</b></a>' to one of the"""
                        """ following paths:</p><ul>{1}</ul>""").format(
                    # use the one place defining the download URL
                    self.getTldDownloadUrl().toString(),
                    "".join(["<li>{0}</li>".format(p)
                             for p in self.__dataSearchPaths]))
            )
            return

        self.__dataFileName = dataFileName
        if not self.__parseData(dataFileName,
                                loadPrivateDomains=self.__withPrivate):
            qWarning(
                "E5TldExtractor: There are some parse errors for file: {0}"
                .format(dataFileName))

    def __parseData(self, dataFile, loadPrivateDomains=False):
        """
        Private method to parse TLD data.

        Any previously loaded data is discarded. Lines starting with '//'
        are treated as comments; the comment lines containing
        '===BEGIN PRIVATE DOMAINS===' and '===END PRIVATE DOMAINS==='
        delimit the section that is skipped unless private domains are
        requested.

        @param dataFile name of the file containing the TLD data
        @type str
        @param loadPrivateDomains flag indicating to load private domains
        @type bool
        @return flag indicating success
        @rtype bool
        """
        # start with a fresh dictionary
        self.__tldDict = collections.defaultdict(list)

        dataFileObj = QFile(dataFile)

        if not dataFileObj.open(QFile.ReadOnly | QFile.Text):
            return False

        seekToEndOfPrivateDomains = False

        try:
            while not dataFileObj.atEnd():
                line = bytes(dataFileObj.readLine()).decode("utf-8").strip()
                if not line:
                    continue

                if line.startswith("."):
                    line = line[1:]

                if line.startswith("//"):
                    if "===END PRIVATE DOMAINS===" in line:
                        seekToEndOfPrivateDomains = False

                    if not loadPrivateDomains and \
                            "===BEGIN PRIVATE DOMAINS===" in line:
                        seekToEndOfPrivateDomains = True

                    continue

                if seekToEndOfPrivateDomains:
                    continue

                # only data up to the first whitespace is used
                fields = line.split(None, 1)
                if not fields:
                    # nothing left, e.g. the line was just a single dot
                    continue
                rule = fields[0]

                # the last label of the rule is used as the key; a rule
                # without a dot is its own key
                self.__tldDict[rule[rule.rfind(".") + 1:]].append(rule)
        finally:
            # release the file handle even if decoding a line failed
            dataFileObj.close()

        return self.isDataLoaded()

    def __domainHelper(self, host, tldPart):
        """
        Private method to get the domain name without TLD.

        @param host host address
        @type str
        @param tldPart TLD part of the host address
        @type str
        @return domain name
        @rtype str
        """
        if not host or not tldPart:
            return ""

        # cut off the TLD and the dot in front of it, then keep the last
        # remaining label
        temp = self.__normalizedHost(host)
        temp = temp[:temp.rfind(tldPart)]

        if temp.endswith("."):
            temp = temp[:-1]

        return temp[temp.rfind(".") + 1:]

    def __registrableDomainHelper(self, domainPart, tldPart):
        """
        Private method to get the registrable domain (i.e. domain plus TLD).

        @param domainPart domain part of a host address
        @type str
        @param tldPart TLD part of a host address
        @type str
        @return registrable domain name (empty, if one of the parts is
            empty)
        @rtype str
        """
        if not tldPart or not domainPart:
            return ""
        else:
            return "{0}.{1}".format(domainPart, tldPart)

    def __subdomainHelper(self, host, registrablePart):
        """
        Private method to get the subdomain of a host address (i.e. domain part
        without the registrable domain name).

        @param host host address
        @type str
        @param registrablePart registrable domain part of the host address
        @type str
        @return subdomain name
        @rtype str
        """
        if not host or not registrablePart:
            return ""

        subdomain = self.__normalizedHost(host)

        # cut off the registrable domain and the dot in front of it
        subdomain = subdomain[:subdomain.rfind(registrablePart)]

        if subdomain.endswith("."):
            subdomain = subdomain[:-1]

        return subdomain

    def __normalizedHost(self, host):
        """
        Private method to get the normalized host for a host address.

        @param host host address to be normalized
        @type str
        @return normalized (i.e. lower case) host address
        @rtype str
        """
        return host.lower()

    #################################################################
    ## Methods below are for testing purposes
    #################################################################

    def test(self):
        """
        Public method to execute the tests.

        The tests are run with the private domains loaded. When the tests
        are done, the loaded TLD data is discarded and the original
        'private domains' setting is restored, so that the next lookup
        reloads the data as configured.

        @return flag indicating the test result
        @rtype bool
        """
        savedWithPrivate = self.__withPrivate
        self.__withPrivate = True
        # drop data loaded before to enforce a reload including the
        # private domains
        self.__tldDict = collections.defaultdict(list)

        try:
            return self.__runTests()
        finally:
            # reset the TLD dictionary and the private domains flag
            self.__tldDict = collections.defaultdict(list)
            self.__withPrivate = savedWithPrivate

    def __runTests(self):
        """
        Private method to check the extractor against the file 'test_psl.txt'.

        @return flag indicating the test result
        @rtype bool
        """
        self.__loadData()
        if not self.__tldDict:
            return False

        for path in self.__dataSearchPaths:
            testDataFileName = QFileInfo(path + "/test_psl.txt")\
                .absoluteFilePath()
            if QFileInfo(testDataFileName).exists():
                break
        else:
            # no search path contains the test data file
            testFileDownloadLink = \
                "http://mxr.mozilla.org/mozilla-central/source/netwerk/test/" \
                "unit/data/test_psl.txt?raw=1"
            E5MessageBox.information(
                None,
                self.tr("TLD Data File not found"),
                self.tr("""<p>The file 'test_psl.txt' was not found!"""
                        """<br/>You can download it from '<a href="{0}">"""
                        """<b>here</b></a>' to one of the following"""
                        """ paths:</p><ul>{1}</ul>""").format(
                    testFileDownloadLink,
                    "".join(["<li>{0}</li>".format(p)
                             for p in self.__dataSearchPaths]))
            )
            return False

        testFile = QFile(testDataFileName)

        if not testFile.open(QFile.ReadOnly | QFile.Text):
            return False

        # capture group 2 is the host name, group 4 the expected
        # registrable name; both are empty for a 'null' argument
        testRegExp = QRegExp(
            "checkPublicSuffix\\(('([^']+)'|null), ('([^']+)'|null)\\);")
        allTestSuccess = True

        try:
            while not testFile.atEnd():
                line = bytes(testFile.readLine()).decode("utf-8").strip()
                if not line or line.startswith("//"):
                    continue

                if testRegExp.indexIn(line) == -1:
                    # a line not understood counts as a failure
                    allTestSuccess = False
                else:
                    hostName = testRegExp.cap(2)
                    registrableName = testRegExp.cap(4)

                    if not self.__checkPublicSuffix(hostName,
                                                    registrableName):
                        allTestSuccess = False
        finally:
            testFile.close()

        if allTestSuccess:
            qWarning("E5TldExtractor: Test passed successfully.")
        else:
            qWarning("E5TldExtractor: Test finished with some errors!")

        return allTestSuccess

    def __checkPublicSuffix(self, host, registrableName):
        """
        Private method to test a host name against a registrable name.

        A mismatch is reported via qWarning().

        @param host host name to test
        @type str
        @param registrableName registrable domain name to test against
        @type str
        @return flag indicating the check result
        @rtype bool
        """
        regName = self.registrableDomain(host)
        if regName != registrableName:
            qWarning(
                "E5TldExtractor Test Error: hostName: {0}\n"
                " Correct registrableName: {1}\n"
                " Calculated registrableName: {2}".format(
                    host, registrableName, regName))
            return False

        return True
530
531
_TLDExtractor = None


def instance(withPrivate=False):
    """
    Global function to get a reference to the TLD extractor and create it, if
    it hasn't been yet.

    Note: The withPrivate flag is only used when the extractor object is
    created, i.e. on the first call.

    @param withPrivate flag indicating to load private TLDs as well
    @type bool
    @return reference to the TLD extractor object
    @rtype E5TldExtractor
    """
    global _TLDExtractor

    if _TLDExtractor is not None:
        # hand out the object created by an earlier call
        return _TLDExtractor

    _TLDExtractor = E5TldExtractor(withPrivate=withPrivate)
    return _TLDExtractor

eric ide

mercurial