src/eric7/EricNetwork/EricTldExtractor.py

branch
eric7
changeset 9209
b99e7fd55fd3
parent 9162
8b75b1668583
child 9221
bf71ee032bb4
equal deleted inserted replaced
9208:3fc8dfeb6ebe 9209:b99e7fd55fd3
1 # -*- coding: utf-8 -*-
2
3 # Copyright (c) 2016 - 2022 Detlev Offenbach <detlev@die-offenbachs.de>
4 #
5
6 """
7 Module implementing the TLD Extractor.
8 """
9
10 #
11 # This is a Python port of the TLDExtractor of Qupzilla
12 # Copyright (C) 2014 Razi Alavizadeh <s.r.alavizadeh@gmail.com>
13 #
14
15 import collections
16 import os
17
18 from PyQt6.QtCore import QObject, QUrl, qWarning
19
20 from EricWidgets import EricMessageBox
21
22
class EricTldHostParts:
    """
    Class implementing a simple container for the parts of a host name.

    Instances carry the attributes 'host', 'tld', 'domain',
    'registrableDomain' and 'subdomain'. All of them are strings, which are
    empty right after creation.
    """
    def __init__(self):
        """
        Constructor
        """
        # strings are immutable, so sharing the one empty string is safe
        self.host = self.tld = self.domain = ""
        self.registrableDomain = self.subdomain = ""
36
37
class EricTldExtractor(QObject):
    """
    Class implementing the TLD Extractor.

    The extractor splits host names into their top level domain, domain,
    registrable domain and subdomain parts using the rules contained in a
    file named 'effective_tld_names.dat'. This file is looked up in the
    data search paths and is loaded on first use.

    Note: The module function instance() should be used to get a reference
    to a global object to avoid overhead.
    """
    # download location of the TLD data file (shared by the public getter
    # and the 'file not found' message)
    __tldDownloadUrl = (
        "http://mxr.mozilla.org/mozilla-central/source/netwerk/dns/"
        "effective_tld_names.dat?raw=1"
    )

    def __init__(self, withPrivate=False, parent=None):
        """
        Constructor

        @param withPrivate flag indicating to load private TLDs as well
        @type bool
        @param parent reference to the parent object
        @type QObject
        """
        super().__init__(parent)

        self.__withPrivate = withPrivate
        self.__dataFileName = ""
        self.__dataSearchPaths = []

        self.__tldDict = collections.defaultdict(list)
        # dict with list of str as values

        self.setDataSearchPaths()

    def isDataLoaded(self):
        """
        Public method to check, if the TLD data is already loaded.

        @return flag indicating data is loaded
        @rtype bool
        """
        return bool(self.__tldDict)

    def tld(self, host):
        """
        Public method to get the top level domain for a host.

        @param host host name to get TLD for
        @type str
        @return TLD for host
        @rtype str
        """
        if not host or host.startswith("."):
            return ""

        cleanHost = self.__normalizedHost(host)

        # the last label is the key into the rules dictionary
        tldPart = cleanHost[cleanHost.rfind(".") + 1:]
        cleanHost = bytes(QUrl.toAce(cleanHost)).decode("utf-8")

        self.__loadData()

        if tldPart not in self.__tldDict:
            return tldPart

        # work on a copy in order to not modify the loaded data
        tldRules = self.__tldDict[tldPart][:]

        if tldPart not in tldRules:
            tldRules.append(tldPart)

        # The leading dot anchors all suffix tests on a label boundary.
        testUrl = "." + cleanHost
        maxLabelCount = 0

        for rule in tldRules:
            labelCount = rule.count(".") + 1

            if rule.startswith("!"):
                # exception rule
                rule = bytes(QUrl.toAce(rule[1:])).decode("utf-8")

                # Match whole labels only, so that e.g. '!www.ck' applies
                # to 'www.ck' and 'a.www.ck' but not to 'awww.ck'.
                if testUrl.endswith("." + rule):
                    # matches with exception TLD
                    tldPart = rule[rule.find(".") + 1:]
                    break

                # An exception rule, that does not apply, has no further
                # effect (the generic test below would fail as well).
                continue

            isWildcardTLD = rule.startswith("*")
            if isWildcardTLD:
                rule = rule[1:]

                if rule.startswith("."):
                    rule = rule[1:]

            rule = bytes(QUrl.toAce(rule)).decode("utf-8")
            testRule = "." + rule

            # the matching rule with the most labels wins
            if labelCount > maxLabelCount and testUrl.endswith(testRule):
                tldPart = rule
                maxLabelCount = labelCount

                if isWildcardTLD:
                    # the wildcard covers the label left of the rule
                    temp = cleanHost[:cleanHost.rfind(tldPart)]

                    if temp.endswith("."):
                        temp = temp[:-1]

                    temp = temp[temp.rfind(".") + 1:]

                    tldPart = (temp + "." + rule) if temp else rule

        # Take the result from the normalized (not ACE encoded) host using
        # the number of labels determined above.
        temp = self.__normalizedHost(host)
        tldPart = ".".join(
            temp.split(".")[temp.count(".") - tldPart.count("."):])

        return tldPart

    def domain(self, host):
        """
        Public method to get the domain for a host.

        @param host host name to get the domain for
        @type str
        @return domain for host
        @rtype str
        """
        tldPart = self.tld(host)

        return self.__domainHelper(host, tldPart)

    def registrableDomain(self, host):
        """
        Public method to get the registrable domain for a host.

        @param host host name to get the registrable domain for
        @type str
        @return registrable domain for host
        @rtype str
        """
        tldPart = self.tld(host)

        return self.__registrableDomainHelper(
            self.__domainHelper(host, tldPart), tldPart)

    def subdomain(self, host):
        """
        Public method to get the subdomain for a host.

        @param host host name to get the subdomain for
        @type str
        @return subdomain for host
        @rtype str
        """
        return self.__subdomainHelper(host, self.registrableDomain(host))

    def splitParts(self, host):
        """
        Public method to split a host address into its parts.

        @param host host address to be split
        @type str
        @return split host address
        @rtype EricTldHostParts
        """
        hostParts = EricTldHostParts()
        hostParts.host = host
        hostParts.tld = self.tld(host)
        hostParts.domain = self.__domainHelper(host, hostParts.tld)
        hostParts.registrableDomain = self.__registrableDomainHelper(
            hostParts.domain, hostParts.tld)
        hostParts.subdomain = self.__subdomainHelper(
            host, hostParts.registrableDomain)

        return hostParts

    def dataSearchPaths(self):
        """
        Public method to get the search paths for the TLD data file.

        @return search paths for the TLD data file (a copy of the internal
            list)
        @rtype list of str
        """
        return self.__dataSearchPaths[:]

    def setDataSearchPaths(self, searchPaths=None):
        """
        Public method to set the search paths for the TLD data file.

        The default search paths are always appended to the given ones.
        Duplicate entries are removed keeping the first occurrence.

        @param searchPaths search paths for the TLD data file or None,
            if the default search paths shall be set
        @type list of str
        """
        paths = list(searchPaths) if searchPaths else []
        paths.extend(self.__defaultDataSearchPaths())

        # remove duplicates (dict keys preserve the insertion order)
        self.__dataSearchPaths = list(dict.fromkeys(paths))

    def __defaultDataSearchPaths(self):
        """
        Private method to get the default search paths for the TLD data file.

        @return default search paths for the TLD data file
        @rtype list of str
        """
        return [os.path.join(os.path.dirname(__file__), "data")]

    def getTldDownloadUrl(self):
        """
        Public method to get the TLD data file download URL.

        @return download URL
        @rtype QUrl
        """
        return QUrl(self.__tldDownloadUrl)

    def __loadData(self):
        """
        Private method to load the TLD data.

        The data file is searched in the data search paths and the first
        one found is parsed. If no file is found, the user is informed via
        a message box and no data is loaded.
        """
        if self.isDataLoaded():
            return

        dataFileName = ""
        parsedDataFileExist = False

        for searchPath in self.__dataSearchPaths:
            dataFileName = os.path.abspath(
                os.path.join(searchPath, "effective_tld_names.dat")
            )
            if os.path.exists(dataFileName):
                parsedDataFileExist = True
                break

        if not parsedDataFileExist:
            EricMessageBox.information(
                None,
                self.tr("TLD Data File not found"),
                self.tr("""<p>The file 'effective_tld_names.dat' was not"""
                        """ found!<br/>You can download it from """
                        """'<a href="{0}"><b>here</b></a>' to one of the"""
                        """ following paths:</p><ul>{1}</ul>""").format(
                    self.__tldDownloadUrl,
                    "".join("<li>{0}</li>".format(p)
                            for p in self.__dataSearchPaths))
            )
            return

        self.__dataFileName = dataFileName
        if not self.__parseData(dataFileName,
                                loadPrivateDomains=self.__withPrivate):
            qWarning(
                "EricTldExtractor: There are some parse errors for file: {0}"
                .format(dataFileName))

    def __parseData(self, dataFile, loadPrivateDomains=False):
        """
        Private method to parse TLD data.

        Each rule is stored in the rules dictionary under its last label.
        Empty lines and comment lines are skipped. Rules between the
        private domains markers are skipped as well unless loading of
        private domains was requested.

        @param dataFile name of the file containing the TLD data
        @type str
        @param loadPrivateDomains flag indicating to load private domains
        @type bool
        @return flag indicating success (i.e. the file could be read and
            contained at least one rule)
        @rtype bool
        """
        # start with a fresh dictionary
        self.__tldDict = collections.defaultdict(list)

        seekToEndOfPrivateDomains = False

        try:
            with open(dataFile, "r", encoding="utf-8") as f:
                for line in f:
                    # Lines read from a file include the line terminator;
                    # strip it so that blank lines can be detected.
                    line = line.strip()
                    if not line:
                        continue

                    if line.startswith("."):
                        line = line[1:]

                    if line.startswith("//"):
                        if "===END PRIVATE DOMAINS===" in line:
                            seekToEndOfPrivateDomains = False

                        if (
                            not loadPrivateDomains and
                            "===BEGIN PRIVATE DOMAINS===" in line
                        ):
                            seekToEndOfPrivateDomains = True

                        continue

                    if seekToEndOfPrivateDomains:
                        continue

                    # only data up to the first whitespace is used
                    fields = line.split(None, 1)
                    if not fields:
                        # nothing left (e.g. the line was just a dot)
                        continue
                    rule = fields[0]

                    # the last label of the rule serves as the key
                    key = rule[rule.rfind(".") + 1:]
                    self.__tldDict[key].append(rule)

            return self.isDataLoaded()
        except OSError:
            return False

    def __domainHelper(self, host, tldPart):
        """
        Private method to get the domain name without TLD.

        @param host host address
        @type str
        @param tldPart TLD part of the host address
        @type str
        @return domain name
        @rtype str
        """
        if not host or not tldPart:
            return ""

        # cut off the TLD part and the dot in front of it
        temp = self.__normalizedHost(host)
        temp = temp[:temp.rfind(tldPart)]

        if temp.endswith("."):
            temp = temp[:-1]

        # the domain is the last label of the remainder
        return temp[temp.rfind(".") + 1:]

    def __registrableDomainHelper(self, domainPart, tldPart):
        """
        Private method to get the registrable domain (i.e. domain plus TLD).

        @param domainPart domain part of a host address
        @type str
        @param tldPart TLD part of a host address
        @type str
        @return registrable domain name
        @rtype str
        """
        if not tldPart or not domainPart:
            return ""

        return "{0}.{1}".format(domainPart, tldPart)

    def __subdomainHelper(self, host, registrablePart):
        """
        Private method to get the subdomain of a host address (i.e. domain part
        without the registrable domain name).

        @param host host address
        @type str
        @param registrablePart registrable domain part of the host address
        @type str
        @return subdomain name
        @rtype str
        """
        if not host or not registrablePart:
            return ""

        subdomain = self.__normalizedHost(host)

        # cut off the registrable part and the dot in front of it
        subdomain = subdomain[:subdomain.rfind(registrablePart)]

        if subdomain.endswith("."):
            subdomain = subdomain[:-1]

        return subdomain

    def __normalizedHost(self, host):
        """
        Private method to get the normalized host for a host address.

        @param host host address to be normalized
        @type str
        @return normalized host address (i.e. converted to lower case)
        @rtype str
        """
        return host.lower()
431
432
# module wide extractor object shared by all callers of instance()
_TLDExtractor = None


def instance(withPrivate=False):
    """
    Global function to get a reference to the TLD extractor and create it, if
    it hasn't been yet.

    Note: The extractor is created by the first call only. The 'withPrivate'
    flag of later calls is not evaluated.

    @param withPrivate flag indicating to load private TLDs as well
    @type bool
    @return reference to the TLD extractor object
    @rtype EricTldExtractor
    """
    global _TLDExtractor

    if _TLDExtractor is not None:
        return _TLDExtractor

    _TLDExtractor = EricTldExtractor(withPrivate=withPrivate)
    return _TLDExtractor

eric ide

mercurial