|
1 # -*- coding: utf-8 -*- |
|
2 |
|
3 # Copyright (c) 2016 - 2022 Detlev Offenbach <detlev@die-offenbachs.de> |
|
4 # |
|
5 |
|
6 """ |
|
7 Module implementing the TLD Extractor. |
|
8 """ |
|
9 |
|
10 # |
|
11 # This is a Python port of the TLDExtractor of Qupzilla |
|
12 # Copyright (C) 2014 Razi Alavizadeh <s.r.alavizadeh@gmail.com> |
|
13 # |
|
14 |
|
15 import collections |
|
16 import os |
|
17 |
|
18 from PyQt6.QtCore import QObject, QUrl, qWarning |
|
19 |
|
20 from EricWidgets import EricMessageBox |
|
21 |
|
22 |
|
class EricTldHostParts:
    """
    Class implementing the host parts helper.

    It is a plain container for the individual parts of a host name. All
    parts are initialized to the empty string and are meant to be filled
    in by the code creating the object.
    """
    def __init__(self):
        """
        Constructor
        """
        # host: the host name as given
        # tld: top level domain part
        # domain: domain name part
        # registrableDomain: registrable domain part
        # subdomain: subdomain part
        self.host = self.tld = self.domain = ""
        self.registrableDomain = self.subdomain = ""
|
36 |
|
37 |
|
class EricTldExtractor(QObject):
    """
    Class implementing the TLD Extractor.

    The rules are read lazily from a file named 'effective_tld_names.dat'
    that is searched for in the configured data search paths.

    Note: The module function instance() should be used to get a reference
    to a global object to avoid overhead.
    """
    # Download location of the TLD data file. It is kept in one place because
    # it is used by getTldDownloadUrl() and by the 'file not found' message.
    __tldDownloadUrl = (
        "http://mxr.mozilla.org/mozilla-central/source/netwerk/dns/"
        "effective_tld_names.dat?raw=1"
    )

    def __init__(self, withPrivate=False, parent=None):
        """
        Constructor

        @param withPrivate flag indicating to load private TLDs as well
        @type bool
        @param parent reference to the parent object
        @type QObject
        """
        super().__init__(parent)

        self.__withPrivate = withPrivate
        self.__dataFileName = ""
        self.__dataSearchPaths = []

        self.__tldDict = collections.defaultdict(list)
        # dict with list of str as values

        self.setDataSearchPaths()

    def isDataLoaded(self):
        """
        Public method to check, if the TLD data is already loaded.

        @return flag indicating data is loaded
        @rtype bool
        """
        return bool(self.__tldDict)

    def tld(self, host):
        """
        Public method to get the top level domain for a host.

        @param host host name to get TLD for
        @type str
        @return TLD for host (empty, if the host is empty or starts with
            a dot)
        @rtype str
        """
        if not host or host.startswith("."):
            return ""

        cleanHost = self.__normalizedHost(host)

        # the dictionary key is the last label of the (non ACE) host name
        tldPart = cleanHost[cleanHost.rfind(".") + 1:]
        cleanHost = bytes(QUrl.toAce(cleanHost)).decode("utf-8")

        self.__loadData()

        if tldPart not in self.__tldDict:
            return tldPart

        # work on a copy in order to not modify the loaded rules
        tldRules = self.__tldDict[tldPart][:]

        if tldPart not in tldRules:
            tldRules.append(tldPart)

        maxLabelCount = 0
        # the leading dot makes 'endswith' match on label boundaries only
        testUrl = "." + cleanHost

        for rule in tldRules:
            labelCount = rule.count(".") + 1

            if rule.startswith("!"):
                rule = bytes(QUrl.toAce(rule[1:])).decode("utf-8")

                # matches with exception TLD; the comparison has to respect
                # label boundaries (rule 'www.ck' must not match 'awww.ck')
                if testUrl.endswith("." + rule):
                    tldPart = rule[rule.find(".") + 1:]
                    break

                # an exception rule that does not match cannot match as a
                # normal rule either
                continue

            isWildcardTLD = rule.startswith("*")
            if isWildcardTLD:
                rule = rule[1:]

                if rule.startswith("."):
                    rule = rule[1:]

            rule = bytes(QUrl.toAce(rule)).decode("utf-8")

            # the matching rule with the most labels prevails
            if labelCount > maxLabelCount and testUrl.endswith("." + rule):
                tldPart = rule
                maxLabelCount = labelCount

                if isWildcardTLD:
                    # the wildcard stands for the label preceding the rule
                    temp = cleanHost[:cleanHost.rfind(tldPart)]

                    if temp.endswith("."):
                        temp = temp[:-1]

                    temp = temp[temp.rfind(".") + 1:]

                    if temp:
                        tldPart = temp + "." + rule
                    else:
                        tldPart = rule

        # tldPart is ACE encoded; return the same number of trailing labels
        # of the normalized host instead
        temp = self.__normalizedHost(host)
        tldPart = ".".join(
            temp.split(".")[temp.count(".") - tldPart.count("."):])

        return tldPart

    def domain(self, host):
        """
        Public method to get the domain for a host.

        @param host host name to get the domain for
        @type str
        @return domain for host
        @rtype str
        """
        tldPart = self.tld(host)

        return self.__domainHelper(host, tldPart)

    def registrableDomain(self, host):
        """
        Public method to get the registrable domain for a host.

        @param host host name to get the registrable domain for
        @type str
        @return registrable domain for host
        @rtype str
        """
        tldPart = self.tld(host)

        return self.__registrableDomainHelper(
            self.__domainHelper(host, tldPart), tldPart)

    def subdomain(self, host):
        """
        Public method to get the subdomain for a host.

        @param host host name to get the subdomain for
        @type str
        @return subdomain for host
        @rtype str
        """
        return self.__subdomainHelper(host, self.registrableDomain(host))

    def splitParts(self, host):
        """
        Public method to split a host address into its parts.

        @param host host address to be split
        @type str
        @return split host address
        @rtype EricTldHostParts
        """
        hostParts = EricTldHostParts()
        hostParts.host = host
        hostParts.tld = self.tld(host)
        hostParts.domain = self.__domainHelper(host, hostParts.tld)
        hostParts.registrableDomain = self.__registrableDomainHelper(
            hostParts.domain, hostParts.tld)
        hostParts.subdomain = self.__subdomainHelper(
            host, hostParts.registrableDomain)

        return hostParts

    def dataSearchPaths(self):
        """
        Public method to get the search paths for the TLD data file.

        @return copy of the search paths for the TLD data file
        @rtype list of str
        """
        return self.__dataSearchPaths[:]

    def setDataSearchPaths(self, searchPaths=None):
        """
        Public method to set the search paths for the TLD data file.

        The default search paths are always appended to the given ones and
        duplicate entries are removed keeping the first occurrence.

        @param searchPaths search paths for the TLD data file or None,
            if the default search paths shall be set
        @type list of str
        """
        paths = list(searchPaths) if searchPaths else []
        paths.extend(self.__defaultDataSearchPaths())

        # dict.fromkeys() removes duplicates and keeps the original order
        self.__dataSearchPaths = list(dict.fromkeys(paths))

    def __defaultDataSearchPaths(self):
        """
        Private method to get the default search paths for the TLD data file.

        @return default search paths for the TLD data file (the 'data'
            directory next to this module)
        @rtype list of str
        """
        return [os.path.join(os.path.dirname(__file__), "data")]

    def getTldDownloadUrl(self):
        """
        Public method to get the TLD data file download URL.

        @return download URL
        @rtype QUrl
        """
        return QUrl(self.__tldDownloadUrl)

    def __loadData(self):
        """
        Private method to load the TLD data.

        The data is loaded only once. If the data file cannot be found in
        any of the search paths, the user is informed and nothing is loaded.
        """
        if self.isDataLoaded():
            return

        dataFileName = ""
        parsedDataFileExist = False

        for searchPath in self.__dataSearchPaths:
            dataFileName = os.path.abspath(
                os.path.join(searchPath, "effective_tld_names.dat")
            )
            if os.path.exists(dataFileName):
                parsedDataFileExist = True
                break

        if not parsedDataFileExist:
            EricMessageBox.information(
                None,
                self.tr("TLD Data File not found"),
                self.tr("""<p>The file 'effective_tld_names.dat' was not"""
                        """ found!<br/>You can download it from """
                        """'<a href="{0}"><b>here</b></a>' to one of the"""
                        """ following paths:</p><ul>{1}</ul>""").format(
                    self.__tldDownloadUrl,
                    "".join(["<li>{0}</li>".format(p)
                             for p in self.__dataSearchPaths]))
            )
            return

        self.__dataFileName = dataFileName
        if not self.__parseData(dataFileName,
                                loadPrivateDomains=self.__withPrivate):
            qWarning(
                "EricTldExtractor: There are some parse errors for file: {0}"
                .format(dataFileName))

    def __parseData(self, dataFile, loadPrivateDomains=False):
        """
        Private method to parse TLD data.

        Each rule is stored in a dictionary keyed by the last label of
        the rule. Comment lines (starting with '//') and empty lines are
        skipped.

        @param dataFile name of the file containing the TLD data
        @type str
        @param loadPrivateDomains flag indicating to load private domains
        @type bool
        @return flag indicating success
        @rtype bool
        """
        # start with a fresh dictionary
        self.__tldDict = collections.defaultdict(list)

        seekToEndOfPrivateDomains = False

        try:
            with open(dataFile, "r", encoding="utf-8") as f:
                for line in f:
                    # remove the line terminator and surrounding whitespace;
                    # otherwise blank lines would not be recognized as empty
                    line = line.strip()
                    if not line:
                        continue

                    if line.startswith("."):
                        line = line[1:]

                    if line.startswith("//"):
                        if "===END PRIVATE DOMAINS===" in line:
                            seekToEndOfPrivateDomains = False

                        if (
                            not loadPrivateDomains and
                            "===BEGIN PRIVATE DOMAINS===" in line
                        ):
                            seekToEndOfPrivateDomains = True

                        continue

                    if seekToEndOfPrivateDomains:
                        continue

                    # only data up to the first whitespace is used
                    fields = line.split(None, 1)
                    if not fields:
                        # nothing left (e.g. the line was just a dot)
                        continue
                    rule = fields[0]

                    if "." not in rule:
                        self.__tldDict[rule].append(rule)
                    else:
                        key = rule[rule.rfind(".") + 1:]
                        self.__tldDict[key].append(rule)

            return self.isDataLoaded()
        except OSError:
            return False

    def __domainHelper(self, host, tldPart):
        """
        Private method to get the domain name without TLD.

        @param host host address
        @type str
        @param tldPart TLD part of the host address
        @type str
        @return domain name (empty, if host or TLD part are empty)
        @rtype str
        """
        if not host or not tldPart:
            return ""

        temp = self.__normalizedHost(host)
        # cut off the TLD and the dot separating it from the domain
        temp = temp[:temp.rfind(tldPart)]

        if temp.endswith("."):
            temp = temp[:-1]

        # the domain is the last label of the remainder
        return temp[temp.rfind(".") + 1:]

    def __registrableDomainHelper(self, domainPart, tldPart):
        """
        Private method to get the registrable domain (i.e. domain plus TLD).

        @param domainPart domain part of a host address
        @type str
        @param tldPart TLD part of a host address
        @type str
        @return registrable domain name (empty, if one of the parts is
            empty)
        @rtype str
        """
        if not tldPart or not domainPart:
            return ""
        else:
            return "{0}.{1}".format(domainPart, tldPart)

    def __subdomainHelper(self, host, registrablePart):
        """
        Private method to get the subdomain of a host address (i.e. domain part
        without the registrable domain name).

        @param host host address
        @type str
        @param registrablePart registrable domain part of the host address
        @type str
        @return subdomain name (empty, if host or registrable part are
            empty)
        @rtype str
        """
        if not host or not registrablePart:
            return ""

        subdomain = self.__normalizedHost(host)

        # cut off the registrable domain and the dot in front of it
        subdomain = subdomain[:subdomain.rfind(registrablePart)]

        if subdomain.endswith("."):
            subdomain = subdomain[:-1]

        return subdomain

    def __normalizedHost(self, host):
        """
        Private method to get the normalized host for a host address.

        @param host host address to be normalized
        @type str
        @return normalized (i.e. lower case) host address
        @rtype str
        """
        return host.lower()
|
431 |
|
432 |
|
# module wide TLD extractor object; created by the first call of instance()
_TLDExtractor = None


def instance(withPrivate=False):
    """
    Global function to get a reference to the TLD extractor and create it, if
    it hasn't been yet.

    Note: The withPrivate flag is evaluated only by the call that creates
    the object; later calls return the existing object unchanged.

    @param withPrivate flag indicating to load private TLDs as well
    @type bool
    @return reference to the TLD extractor object
    @rtype EricTldExtractor
    """
    global _TLDExtractor

    if _TLDExtractor is not None:
        return _TLDExtractor

    _TLDExtractor = EricTldExtractor(withPrivate=withPrivate)
    return _TLDExtractor