Sun, 16 May 2021 20:07:24 +0200
Replaced all imports of PyQt5 with PyQt6 and started to replace code using obsolete methods and to adapt to the PyQt6 enum usage.
# -*- coding: utf-8 -*-

# Copyright (c) 2016 - 2021 Detlev Offenbach <detlev@die-offenbachs.de>
#

"""
Module implementing the TLD Extractor.
"""

#
# This is a Python port of the TLDExtractor of Qupzilla
# Copyright (C) 2014 Razi Alavizadeh <s.r.alavizadeh@gmail.com>
#

import collections
import os
import re

from PyQt6.QtCore import (
    QObject, QUrl, QFile, QFileInfo, QIODevice, qWarning
)

from E5Gui import E5MessageBox


class E5TldHostParts:
    """
    Class implementing the host parts helper.

    It is a plain container for the parts a host name is split into by
    E5TldExtractor.splitParts(); all parts start out as empty strings.
    """
    def __init__(self):
        """
        Constructor
        """
        self.host = ""
        self.tld = ""
        self.domain = ""
        self.registrableDomain = ""
        self.subdomain = ""


class E5TldExtractor(QObject):
    """
    Class implementing the TLD Extractor.

    Note: The module function instance() should be used to get a reference
    to a global object to avoid overhead.
    """
    def __init__(self, withPrivate=False, parent=None):
        """
        Constructor

        @param withPrivate flag indicating to load private TLDs as well
        @type bool
        @param parent reference to the parent object
        @type QObject
        """
        super().__init__(parent)

        self.__withPrivate = withPrivate
        self.__dataFileName = ""
        self.__dataSearchPaths = []

        self.__tldDict = collections.defaultdict(list)
        # dict with list of str as values

        self.setDataSearchPaths()

    def isDataLoaded(self):
        """
        Public method to check, if the TLD data is already loaded.

        @return flag indicating data is loaded
        @rtype bool
        """
        return bool(self.__tldDict)

    def tld(self, host):
        """
        Public method to get the top level domain for a host.

        The TLD data is loaded on first use. If the last label of the host
        is not contained in the loaded data, that label is returned as is.

        @param host host name to get TLD for
        @type str
        @return TLD for host
        @rtype str
        """
        if not host or host.startswith("."):
            return ""

        cleanHost = self.__normalizedHost(host)

        tldPart = cleanHost[cleanHost.rfind(".") + 1:]
        cleanHost = bytes(QUrl.toAce(cleanHost)).decode("utf-8")

        self.__loadData()

        if tldPart not in self.__tldDict:
            return tldPart

        # work on a copy in order to not modify the loaded data
        tldRules = self.__tldDict[tldPart][:]

        if tldPart not in tldRules:
            tldRules.append(tldPart)

        maxLabelCount = 0
        isWildcardTLD = False

        for rule in tldRules:
            # the label count is taken before any prefix is stripped
            labelCount = rule.count(".") + 1

            if rule.startswith("!"):
                rule = rule[1:]

                rule = bytes(QUrl.toAce(rule)).decode("utf-8")

                # matches with exception TLD
                if cleanHost.endswith(rule):
                    tldPart = rule[rule.find(".") + 1:]
                    break

            if rule.startswith("*"):
                rule = rule[1:]

                if rule.startswith("."):
                    rule = rule[1:]

                isWildcardTLD = True
            else:
                isWildcardTLD = False

            rule = bytes(QUrl.toAce(rule)).decode("utf-8")
            # prepend a dot to both sides so that only whole labels match
            testRule = "." + rule
            testUrl = "." + cleanHost

            # the matching rule with the most labels wins
            if labelCount > maxLabelCount and testUrl.endswith(testRule):
                tldPart = rule
                maxLabelCount = labelCount

                if isWildcardTLD:
                    # a wildcard rule additionally consumes the label
                    # directly in front of the matched rule
                    temp = cleanHost
                    temp = temp[:temp.rfind(tldPart)]

                    if temp.endswith("."):
                        temp = temp[:-1]

                    temp = temp[temp.rfind(".") + 1:]

                    if temp:
                        tldPart = temp + "." + rule
                    else:
                        tldPart = rule

        # take the result from the normalized (not ACE encoded) host using
        # the number of labels determined above
        temp = self.__normalizedHost(host)
        tldPart = ".".join(
            temp.split(".")[temp.count(".") - tldPart.count("."):])

        return tldPart

    def domain(self, host):
        """
        Public method to get the domain for a host.

        @param host host name to get the domain for
        @type str
        @return domain for host
        @rtype str
        """
        tldPart = self.tld(host)

        return self.__domainHelper(host, tldPart)

    def registrableDomain(self, host):
        """
        Public method to get the registrable domain for a host.

        @param host host name to get the registrable domain for
        @type str
        @return registrable domain for host
        @rtype str
        """
        tldPart = self.tld(host)

        return self.__registrableDomainHelper(
            self.__domainHelper(host, tldPart), tldPart)

    def subdomain(self, host):
        """
        Public method to get the subdomain for a host.

        @param host host name to get the subdomain for
        @type str
        @return subdomain for host
        @rtype str
        """
        return self.__subdomainHelper(host, self.registrableDomain(host))

    def splitParts(self, host):
        """
        Public method to split a host address into its parts.

        @param host host address to be split
        @type str
        @return splitted host address
        @rtype E5TldHostParts
        """
        hostParts = E5TldHostParts()

        hostParts.host = host
        hostParts.tld = self.tld(host)
        hostParts.domain = self.__domainHelper(host, hostParts.tld)
        hostParts.registrableDomain = self.__registrableDomainHelper(
            hostParts.domain, hostParts.tld)
        hostParts.subdomain = self.__subdomainHelper(
            host, hostParts.registrableDomain)

        return hostParts

    def dataSearchPaths(self):
        """
        Public method to get the search paths for the TLD data file.

        @return search paths for the TLD data file
        @rtype list of str
        """
        return self.__dataSearchPaths[:]

    def setDataSearchPaths(self, searchPaths=None):
        """
        Public method to set the search paths for the TLD data file.

        The default search paths are always appended to the given ones.

        @param searchPaths search paths for the TLD data file or None,
            if the default search paths shall be set
        @type list of str
        """
        if searchPaths:
            paths = searchPaths[:]
            paths.extend(self.__defaultDataSearchPaths())
        else:
            paths = self.__defaultDataSearchPaths()[:]

        # remove duplicates while keeping the first occurrence of each path
        self.__dataSearchPaths = list(dict.fromkeys(paths))

    def __defaultDataSearchPaths(self):
        """
        Private method to get the default search paths for the TLD data file.

        @return default search paths for the TLD data file
        @rtype list of str
        """
        return [os.path.join(os.path.dirname(__file__), "data")]

    def getTldDownloadUrl(self):
        """
        Public method to get the TLD data file download URL.

        @return download URL
        @rtype QUrl
        """
        return QUrl(
            "http://mxr.mozilla.org/mozilla-central/source/netwerk/dns/"
            "effective_tld_names.dat?raw=1")

    def __loadData(self):
        """
        Private method to load the TLD data.

        The data file is searched for in the configured search paths. If it
        cannot be found, the user is informed via a message box and no data
        is loaded.
        """
        if self.isDataLoaded():
            return

        dataFileName = ""
        parsedDataFileExist = False

        for path in self.__dataSearchPaths:
            dataFileName = (
                QFileInfo(path + "/effective_tld_names.dat").absoluteFilePath()
            )
            if QFileInfo(dataFileName).exists():
                parsedDataFileExist = True
                break

        if not parsedDataFileExist:
            tldDataFileDownloadLink = (
                "http://mxr.mozilla.org/mozilla-central/source/netwerk/dns/"
                "effective_tld_names.dat?raw=1"
            )
            E5MessageBox.information(
                None,
                self.tr("TLD Data File not found"),
                self.tr("""<p>The file 'effective_tld_names.dat' was not"""
                        """ found!<br/>You can download it from """
                        """'<a href="{0}"><b>here</b></a>' to one of the"""
                        """ following paths:</p><ul>{1}</ul>""").format(
                    tldDataFileDownloadLink,
                    "".join(["<li>{0}</li>".format(p)
                             for p in self.__dataSearchPaths]))
            )
            return

        self.__dataFileName = dataFileName
        if not self.__parseData(dataFileName,
                                loadPrivateDomains=self.__withPrivate):
            qWarning(
                "E5TldExtractor: There are some parse errors for file: {0}"
                .format(dataFileName))

    def __parseData(self, dataFile, loadPrivateDomains=False):
        """
        Private method to parse TLD data.

        Each rule is stored in the TLD dictionary under its last label.

        @param dataFile name of the file containing the TLD data
        @type str
        @param loadPrivateDomains flag indicating to load private domains
        @type bool
        @return flag indicating success
        @rtype bool
        """
        # start with a fresh dictionary
        self.__tldDict = collections.defaultdict(list)

        file = QFile(dataFile)

        # PyQt6 only provides fully scoped enum members
        if not file.open(
            QIODevice.OpenModeFlag.ReadOnly | QIODevice.OpenModeFlag.Text
        ):
            return False

        seekToEndOfPrivateDomains = False

        try:
            while not file.atEnd():
                line = bytes(file.readLine()).decode("utf-8").strip()
                if not line:
                    continue

                if line.startswith("."):
                    line = line[1:]

                if line.startswith("//"):
                    # comment lines carry the private domains markers
                    if "===END PRIVATE DOMAINS===" in line:
                        seekToEndOfPrivateDomains = False

                    if (
                        not loadPrivateDomains and
                        "===BEGIN PRIVATE DOMAINS===" in line
                    ):
                        seekToEndOfPrivateDomains = True

                    continue

                if seekToEndOfPrivateDomains:
                    continue

                # only data up to the first whitespace is used
                fields = line.split(None, 1)
                if not fields:
                    # nothing left after stripping the leading dot
                    continue
                line = fields[0]

                if "." not in line:
                    self.__tldDict[line].append(line)
                else:
                    key = line[line.rfind(".") + 1:]
                    self.__tldDict[key].append(line)
        finally:
            file.close()

        return self.isDataLoaded()

    def __domainHelper(self, host, tldPart):
        """
        Private method to get the domain name without TLD.

        @param host host address
        @type str
        @param tldPart TLD part of the host address
        @type str
        @return domain name
        @rtype str
        """
        if not host or not tldPart:
            return ""

        temp = self.__normalizedHost(host)
        temp = temp[:temp.rfind(tldPart)]

        if temp.endswith("."):
            temp = temp[:-1]

        return temp[temp.rfind(".") + 1:]

    def __registrableDomainHelper(self, domainPart, tldPart):
        """
        Private method to get the registrable domain (i.e. domain plus TLD).

        @param domainPart domain part of a host address
        @type str
        @param tldPart TLD part of a host address
        @type str
        @return registrable domain name
        @rtype str
        """
        if not tldPart or not domainPart:
            return ""
        else:
            return "{0}.{1}".format(domainPart, tldPart)

    def __subdomainHelper(self, host, registrablePart):
        """
        Private method to get the subdomain of a host address (i.e. domain
        part without the registrable domain name).

        @param host host address
        @type str
        @param registrablePart registrable domain part of the host address
        @type str
        @return subdomain name
        @rtype str
        """
        if not host or not registrablePart:
            return ""

        subdomain = self.__normalizedHost(host)

        subdomain = subdomain[:subdomain.rfind(registrablePart)]

        if subdomain.endswith("."):
            subdomain = subdomain[:-1]

        return subdomain

    def __normalizedHost(self, host):
        """
        Private method to get the normalized host for a host address.

        @param host host address to be normalized
        @type str
        @return normalized host address
        @rtype str
        """
        return host.lower()

    #################################################################
    ## Methods below are for testing purposes
    #################################################################

    def test(self):
        """
        Public method to execute the tests.

        The test cases are read from the file 'test_psl.txt', which is
        searched for in the configured search paths. The TLD dictionary is
        reset once the test cases have been processed.

        @return flag indicating the test result
        @rtype bool
        """
        self.__withPrivate = True
        self.__loadData()
        if not self.__tldDict:
            return False

        testDataFileName = ""
        testDataFileExist = False

        for path in self.__dataSearchPaths:
            testDataFileName = (
                QFileInfo(path + "/test_psl.txt").absoluteFilePath()
            )
            if QFileInfo(testDataFileName).exists():
                testDataFileExist = True
                break

        if not testDataFileExist:
            testFileDownloadLink = (
                "http://mxr.mozilla.org/mozilla-central/source/netwerk/test/"
                "unit/data/test_psl.txt?raw=1"
            )
            E5MessageBox.information(
                None,
                self.tr("TLD Data File not found"),
                self.tr("""<p>The file 'test_psl.txt' was not found!"""
                        """<br/>You can download it from '<a href="{0}">"""
                        """<b>here</b></a>' to one of the following"""
                        """ paths:</p><ul>{1}</ul>""").format(
                    testFileDownloadLink,
                    "".join(["<li>{0}</li>".format(p)
                             for p in self.__dataSearchPaths]))
            )
            return False

        file = QFile(testDataFileName)

        # PyQt6 only provides fully scoped enum members
        if not file.open(
            QIODevice.OpenModeFlag.ReadOnly | QIODevice.OpenModeFlag.Text
        ):
            return False

        testRegExp = re.compile(
            "checkPublicSuffix\\(('([^']+)'|null), ('([^']+)'|null)\\);")
        allTestSuccess = True

        try:
            while not file.atEnd():
                line = bytes(file.readLine()).decode("utf-8").strip()
                if not line or line.startswith("//"):
                    continue

                match = testRegExp.search(line)
                if match is None:
                    allTestSuccess = False
                else:
                    # a 'null' argument leaves its group unmatched (None);
                    # map it to the empty string, which is what the
                    # extractor returns when there is no result
                    hostName = match.group(2) or ""
                    registrableName = match.group(4) or ""

                    if not self.__checkPublicSuffix(hostName,
                                                    registrableName):
                        allTestSuccess = False
        finally:
            file.close()

        if allTestSuccess:
            qWarning("E5TldExtractor: Test passed successfully.")
        else:
            qWarning("E5TldExtractor: Test finished with some errors!")

        # reset the TLD dictionary
        self.__tldDict = collections.defaultdict(list)

        return allTestSuccess

    def __checkPublicSuffix(self, host, registrableName):
        """
        Private method to test a host name against a registrable name.

        A mismatch is reported via qWarning().

        @param host host name to test
        @type str
        @param registrableName registrable domain name to test against
        @type str
        @return flag indicating the check result
        @rtype bool
        """
        regName = self.registrableDomain(host)
        if regName != registrableName:
            qWarning(
                "E5TldExtractor Test Error: hostName: {0}\n"
                " Correct registrableName: {1}\n"
                " Calculated registrableName: {2}".format(
                    host, registrableName, regName))
            return False

        return True


_TLDExtractor = None


def instance(withPrivate=False):
    """
    Global function to get a reference to the TLD extractor and create it,
    if it hasn't been yet.

    Note: withPrivate is only used when the extractor object is created by
    this call; an already existing object is returned unchanged.

    @param withPrivate flag indicating to load private TLDs as well
    @type bool
    @return reference to the TLD extractor object
    @rtype E5TldExtractor
    """
    global _TLDExtractor

    if _TLDExtractor is None:
        _TLDExtractor = E5TldExtractor(withPrivate=withPrivate)

    return _TLDExtractor