src/eric7/EricNetwork/EricTldExtractor.py

branch
eric7
changeset 9221
bf71ee032bb4
parent 9209
b99e7fd55fd3
child 9413
80c06d472826
equal deleted inserted replaced
9220:e9e7eca7efee 9221:bf71ee032bb4
22 22
23 class EricTldHostParts: 23 class EricTldHostParts:
24 """ 24 """
25 Class implementing the host parts helper. 25 Class implementing the host parts helper.
26 """ 26 """
27
27 def __init__(self): 28 def __init__(self):
28 """ 29 """
29 Constructor 30 Constructor
30 """ 31 """
31 self.host = "" 32 self.host = ""
36 37
37 38
38 class EricTldExtractor(QObject): 39 class EricTldExtractor(QObject):
39 """ 40 """
40 Class implementing the TLD Extractor. 41 Class implementing the TLD Extractor.
41 42
42 Note: The module function instance() should be used to get a reference 43 Note: The module function instance() should be used to get a reference
43 to a global object to avoid overhead. 44 to a global object to avoid overhead.
44 """ 45 """
46
45 def __init__(self, withPrivate=False, parent=None): 47 def __init__(self, withPrivate=False, parent=None):
46 """ 48 """
47 Constructor 49 Constructor
48 50
49 @param withPrivate flag indicating to load private TLDs as well 51 @param withPrivate flag indicating to load private TLDs as well
50 @type bool 52 @type bool
51 @param parent reference to the parent object 53 @param parent reference to the parent object
52 @type QObject 54 @type QObject
53 """ 55 """
54 super().__init__(parent) 56 super().__init__(parent)
55 57
56 self.__withPrivate = withPrivate 58 self.__withPrivate = withPrivate
57 self.__dataFileName = "" 59 self.__dataFileName = ""
58 self.__dataSearchPaths = [] 60 self.__dataSearchPaths = []
59 61
60 self.__tldDict = collections.defaultdict(list) 62 self.__tldDict = collections.defaultdict(list)
61 # dict with list of str as values 63 # dict with list of str as values
62 64
63 self.setDataSearchPaths() 65 self.setDataSearchPaths()
64 66
65 def isDataLoaded(self): 67 def isDataLoaded(self):
66 """ 68 """
67 Public method to check, if the TLD data is already loaded. 69 Public method to check, if the TLD data is already loaded.
68 70
69 @return flag indicating data is loaded 71 @return flag indicating data is loaded
70 @rtype bool 72 @rtype bool
71 """ 73 """
72 return bool(self.__tldDict) 74 return bool(self.__tldDict)
73 75
74 def tld(self, host): 76 def tld(self, host):
75 """ 77 """
76 Public method to get the top level domain for a host. 78 Public method to get the top level domain for a host.
77 79
78 @param host host name to get TLD for 80 @param host host name to get TLD for
79 @type str 81 @type str
80 @return TLD for host 82 @return TLD for host
81 @rtype str 83 @rtype str
82 """ 84 """
83 if not host or host.startswith("."): 85 if not host or host.startswith("."):
84 return "" 86 return ""
85 87
86 cleanHost = self.__normalizedHost(host) 88 cleanHost = self.__normalizedHost(host)
87 89
88 tldPart = cleanHost[cleanHost.rfind(".") + 1:] 90 tldPart = cleanHost[cleanHost.rfind(".") + 1 :]
89 cleanHost = bytes(QUrl.toAce(cleanHost)).decode("utf-8") 91 cleanHost = bytes(QUrl.toAce(cleanHost)).decode("utf-8")
90 92
91 self.__loadData() 93 self.__loadData()
92 94
93 if tldPart not in self.__tldDict: 95 if tldPart not in self.__tldDict:
94 return tldPart 96 return tldPart
95 97
96 tldRules = self.__tldDict[tldPart][:] 98 tldRules = self.__tldDict[tldPart][:]
97 99
98 if tldPart not in tldRules: 100 if tldPart not in tldRules:
99 tldRules.append(tldPart) 101 tldRules.append(tldPart)
100 102
101 maxLabelCount = 0 103 maxLabelCount = 0
102 isWildcardTLD = False 104 isWildcardTLD = False
103 105
104 for rule in tldRules: 106 for rule in tldRules:
105 labelCount = rule.count(".") + 1 107 labelCount = rule.count(".") + 1
106 108
107 if rule.startswith("!"): 109 if rule.startswith("!"):
108 rule = rule[1:] 110 rule = rule[1:]
109 111
110 rule = bytes(QUrl.toAce(rule)).decode("utf-8") 112 rule = bytes(QUrl.toAce(rule)).decode("utf-8")
111 113
112 # matches with exception TLD 114 # matches with exception TLD
113 if cleanHost.endswith(rule): 115 if cleanHost.endswith(rule):
114 tldPart = rule[rule.find(".") + 1:] 116 tldPart = rule[rule.find(".") + 1 :]
115 break 117 break
116 118
117 if rule.startswith("*"): 119 if rule.startswith("*"):
118 rule = rule[1:] 120 rule = rule[1:]
119 121
120 if rule.startswith("."): 122 if rule.startswith("."):
121 rule = rule[1:] 123 rule = rule[1:]
122 124
123 isWildcardTLD = True 125 isWildcardTLD = True
124 else: 126 else:
125 isWildcardTLD = False 127 isWildcardTLD = False
126 128
127 rule = bytes(QUrl.toAce(rule)).decode("utf-8") 129 rule = bytes(QUrl.toAce(rule)).decode("utf-8")
128 testRule = "." + rule 130 testRule = "." + rule
129 testUrl = "." + cleanHost 131 testUrl = "." + cleanHost
130 132
131 if labelCount > maxLabelCount and testUrl.endswith(testRule): 133 if labelCount > maxLabelCount and testUrl.endswith(testRule):
132 tldPart = rule 134 tldPart = rule
133 maxLabelCount = labelCount 135 maxLabelCount = labelCount
134 136
135 if isWildcardTLD: 137 if isWildcardTLD:
136 temp = cleanHost 138 temp = cleanHost
137 temp = temp[:temp.rfind(tldPart)] 139 temp = temp[: temp.rfind(tldPart)]
138 140
139 if temp.endswith("."): 141 if temp.endswith("."):
140 temp = temp[:-1] 142 temp = temp[:-1]
141 143
142 temp = temp[temp.rfind(".") + 1:] 144 temp = temp[temp.rfind(".") + 1 :]
143 145
144 if temp: 146 if temp:
145 tldPart = temp + "." + rule 147 tldPart = temp + "." + rule
146 else: 148 else:
147 tldPart = rule 149 tldPart = rule
148 150
149 temp = self.__normalizedHost(host) 151 temp = self.__normalizedHost(host)
150 tldPart = ".".join( 152 tldPart = ".".join(temp.split(".")[temp.count(".") - tldPart.count(".") :])
151 temp.split(".")[temp.count(".") - tldPart.count("."):]) 153
152
153 return tldPart 154 return tldPart
154 155
155 def domain(self, host): 156 def domain(self, host):
156 """ 157 """
157 Public method to get the domain for a host. 158 Public method to get the domain for a host.
158 159
159 @param host host name to get the domain for 160 @param host host name to get the domain for
160 @type str 161 @type str
161 @return domain for host 162 @return domain for host
162 @rtype str 163 @rtype str
163 """ 164 """
164 tldPart = self.tld(host) 165 tldPart = self.tld(host)
165 166
166 return self.__domainHelper(host, tldPart) 167 return self.__domainHelper(host, tldPart)
167 168
168 def registrableDomain(self, host): 169 def registrableDomain(self, host):
169 """ 170 """
170 Public method to get the registrable domain for a host. 171 Public method to get the registrable domain for a host.
171 172
172 @param host host name to get the registrable domain for 173 @param host host name to get the registrable domain for
173 @type str 174 @type str
174 @return registrable domain for host 175 @return registrable domain for host
175 @rtype str 176 @rtype str
176 """ 177 """
177 tldPart = self.tld(host) 178 tldPart = self.tld(host)
178 179
179 return self.__registrableDomainHelper( 180 return self.__registrableDomainHelper(
180 self.__domainHelper(host, tldPart), tldPart) 181 self.__domainHelper(host, tldPart), tldPart
181 182 )
183
182 def subdomain(self, host): 184 def subdomain(self, host):
183 """ 185 """
184 Public method to get the subdomain for a host. 186 Public method to get the subdomain for a host.
185 187
186 @param host host name to get the subdomain for 188 @param host host name to get the subdomain for
187 @type str 189 @type str
188 @return subdomain for host 190 @return subdomain for host
189 @rtype str 191 @rtype str
190 """ 192 """
191 return self.__subdomainHelper(host, self.registrableDomain(host)) 193 return self.__subdomainHelper(host, self.registrableDomain(host))
192 194
193 def splitParts(self, host): 195 def splitParts(self, host):
194 """ 196 """
195 Public method to split a host address into its parts. 197 Public method to split a host address into its parts.
196 198
197 @param host host address to be split 199 @param host host address to be split
198 @type str 200 @type str
199 @return split host address 201 @return split host address
200 @rtype EricTldHostParts 202 @rtype EricTldHostParts
201 """ 203 """
202 hostParts = EricTldHostParts() 204 hostParts = EricTldHostParts()
203 hostParts.host = host 205 hostParts.host = host
204 hostParts.tld = self.tld(host) 206 hostParts.tld = self.tld(host)
205 hostParts.domain = self.__domainHelper(host, hostParts.tld) 207 hostParts.domain = self.__domainHelper(host, hostParts.tld)
206 hostParts.registrableDomain = self.__registrableDomainHelper( 208 hostParts.registrableDomain = self.__registrableDomainHelper(
207 hostParts.domain, hostParts.tld) 209 hostParts.domain, hostParts.tld
208 hostParts.subdomain = self.__subdomainHelper( 210 )
209 host, hostParts.registrableDomain) 211 hostParts.subdomain = self.__subdomainHelper(host, hostParts.registrableDomain)
210 212
211 return hostParts 213 return hostParts
212 214
213 def dataSearchPaths(self): 215 def dataSearchPaths(self):
214 """ 216 """
215 Public method to get the search paths for the TLD data file. 217 Public method to get the search paths for the TLD data file.
216 218
217 @return search paths for the TLD data file 219 @return search paths for the TLD data file
218 @rtype list of str 220 @rtype list of str
219 """ 221 """
220 return self.__dataSearchPaths[:] 222 return self.__dataSearchPaths[:]
221 223
222 def setDataSearchPaths(self, searchPaths=None): 224 def setDataSearchPaths(self, searchPaths=None):
223 """ 225 """
224 Public method to set the search paths for the TLD data file. 226 Public method to set the search paths for the TLD data file.
225 227
226 @param searchPaths search paths for the TLD data file or None, 228 @param searchPaths search paths for the TLD data file or None,
227 if the default search paths shall be set 229 if the default search paths shall be set
228 @type list of str 230 @type list of str
229 """ 231 """
230 if searchPaths: 232 if searchPaths:
231 self.__dataSearchPaths = searchPaths[:] 233 self.__dataSearchPaths = searchPaths[:]
232 self.__dataSearchPaths.extend(self.__defaultDataSearchPaths()) 234 self.__dataSearchPaths.extend(self.__defaultDataSearchPaths())
233 else: 235 else:
234 self.__dataSearchPaths = self.__defaultDataSearchPaths()[:] 236 self.__dataSearchPaths = self.__defaultDataSearchPaths()[:]
235 237
236 # remove duplicates 238 # remove duplicates
237 paths = [] 239 paths = []
238 for p in self.__dataSearchPaths: 240 for p in self.__dataSearchPaths:
239 if p not in paths: 241 if p not in paths:
240 paths.append(p) 242 paths.append(p)
241 self.__dataSearchPaths = paths 243 self.__dataSearchPaths = paths
242 244
243 def __defaultDataSearchPaths(self): 245 def __defaultDataSearchPaths(self):
244 """ 246 """
245 Private method to get the default search paths for the TLD data file. 247 Private method to get the default search paths for the TLD data file.
246 248
247 @return default search paths for the TLD data file 249 @return default search paths for the TLD data file
248 @rtype list of str 250 @rtype list of str
249 """ 251 """
250 return [os.path.join(os.path.dirname(__file__), "data")] 252 return [os.path.join(os.path.dirname(__file__), "data")]
251 253
252 def getTldDownloadUrl(self): 254 def getTldDownloadUrl(self):
253 """ 255 """
254 Public method to get the TLD data file download URL. 256 Public method to get the TLD data file download URL.
255 257
256 @return download URL 258 @return download URL
257 @rtype QUrl 259 @rtype QUrl
258 """ 260 """
259 return QUrl( 261 return QUrl(
260 "http://mxr.mozilla.org/mozilla-central/source/netwerk/dns/" 262 "http://mxr.mozilla.org/mozilla-central/source/netwerk/dns/"
261 "effective_tld_names.dat?raw=1") 263 "effective_tld_names.dat?raw=1"
262 264 )
265
263 def __loadData(self): 266 def __loadData(self):
264 """ 267 """
265 Private method to load the TLD data. 268 Private method to load the TLD data.
266 """ 269 """
267 if self.isDataLoaded(): 270 if self.isDataLoaded():
268 return 271 return
269 272
270 dataFileName = "" 273 dataFileName = ""
271 parsedDataFileExist = False 274 parsedDataFileExist = False
272 275
273 for searchPath in self.__dataSearchPaths: 276 for searchPath in self.__dataSearchPaths:
274 dataFileName = os.path.abspath( 277 dataFileName = os.path.abspath(
275 os.path.join(searchPath, "effective_tld_names.dat") 278 os.path.join(searchPath, "effective_tld_names.dat")
276 ) 279 )
277 if os.path.exists(dataFileName): 280 if os.path.exists(dataFileName):
278 parsedDataFileExist = True 281 parsedDataFileExist = True
279 break 282 break
280 283
281 if not parsedDataFileExist: 284 if not parsedDataFileExist:
282 tldDataFileDownloadLink = ( 285 tldDataFileDownloadLink = (
283 "http://mxr.mozilla.org/mozilla-central/source/netwerk/dns/" 286 "http://mxr.mozilla.org/mozilla-central/source/netwerk/dns/"
284 "effective_tld_names.dat?raw=1" 287 "effective_tld_names.dat?raw=1"
285 ) 288 )
286 EricMessageBox.information( 289 EricMessageBox.information(
287 None, 290 None,
288 self.tr("TLD Data File not found"), 291 self.tr("TLD Data File not found"),
289 self.tr("""<p>The file 'effective_tld_names.dat' was not""" 292 self.tr(
290 """ found!<br/>You can download it from """ 293 """<p>The file 'effective_tld_names.dat' was not"""
291 """'<a href="{0}"><b>here</b></a>' to one of the""" 294 """ found!<br/>You can download it from """
292 """ following paths:</p><ul>{1}</ul>""").format( 295 """'<a href="{0}"><b>here</b></a>' to one of the"""
296 """ following paths:</p><ul>{1}</ul>"""
297 ).format(
293 tldDataFileDownloadLink, 298 tldDataFileDownloadLink,
294 "".join(["<li>{0}</li>".format(p) 299 "".join(["<li>{0}</li>".format(p) for p in self.__dataSearchPaths]),
295 for p in self.__dataSearchPaths])) 300 ),
296 ) 301 )
297 return 302 return
298 303
299 self.__dataFileName = dataFileName 304 self.__dataFileName = dataFileName
300 if not self.__parseData(dataFileName, 305 if not self.__parseData(dataFileName, loadPrivateDomains=self.__withPrivate):
301 loadPrivateDomains=self.__withPrivate):
302 qWarning( 306 qWarning(
303 "EricTldExtractor: There are some parse errors for file: {0}" 307 "EricTldExtractor: There are some parse errors for file: {0}".format(
304 .format(dataFileName)) 308 dataFileName
305 309 )
310 )
311
306 def __parseData(self, dataFile, loadPrivateDomains=False): 312 def __parseData(self, dataFile, loadPrivateDomains=False):
307 """ 313 """
308 Private method to parse TLD data. 314 Private method to parse TLD data.
309 315
310 @param dataFile name of the file containing the TLD data 316 @param dataFile name of the file containing the TLD data
311 @type str 317 @type str
312 @param loadPrivateDomains flag indicating to load private domains 318 @param loadPrivateDomains flag indicating to load private domains
313 @type bool 319 @type bool
314 @return flag indicating success 320 @return flag indicating success
315 @rtype bool 321 @rtype bool
316 """ 322 """
317 # start with a fresh dictionary 323 # start with a fresh dictionary
318 self.__tldDict = collections.defaultdict(list) 324 self.__tldDict = collections.defaultdict(list)
319 325
320 seekToEndOfPrivateDomains = False 326 seekToEndOfPrivateDomains = False
321 327
322 try: 328 try:
323 with open(dataFile, "r", encoding="utf-8") as f: 329 with open(dataFile, "r", encoding="utf-8") as f:
324 for line in f.readlines(): 330 for line in f.readlines():
325 if not line: 331 if not line:
326 continue 332 continue
327 333
328 if line.startswith("."): 334 if line.startswith("."):
329 line = line[1:] 335 line = line[1:]
330 336
331 if line.startswith("//"): 337 if line.startswith("//"):
332 if "===END PRIVATE DOMAINS===" in line: 338 if "===END PRIVATE DOMAINS===" in line:
333 seekToEndOfPrivateDomains = False 339 seekToEndOfPrivateDomains = False
334 340
335 if ( 341 if (
336 not loadPrivateDomains and 342 not loadPrivateDomains
337 "===BEGIN PRIVATE DOMAINS===" in line 343 and "===BEGIN PRIVATE DOMAINS===" in line
338 ): 344 ):
339 seekToEndOfPrivateDomains = True 345 seekToEndOfPrivateDomains = True
340 346
341 continue 347 continue
342 348
343 if seekToEndOfPrivateDomains: 349 if seekToEndOfPrivateDomains:
344 continue 350 continue
345 351
346 # only data up to the first whitespace is used 352 # only data up to the first whitespace is used
347 line = line.split(None, 1)[0] 353 line = line.split(None, 1)[0]
348 354
349 if "." not in line: 355 if "." not in line:
350 self.__tldDict[line].append(line) 356 self.__tldDict[line].append(line)
351 else: 357 else:
352 key = line[line.rfind(".") + 1:] 358 key = line[line.rfind(".") + 1 :]
353 self.__tldDict[key].append(line) 359 self.__tldDict[key].append(line)
354 360
355 return self.isDataLoaded() 361 return self.isDataLoaded()
356 except OSError: 362 except OSError:
357 return False 363 return False
358 364
359 def __domainHelper(self, host, tldPart): 365 def __domainHelper(self, host, tldPart):
360 """ 366 """
361 Private method to get the domain name without TLD. 367 Private method to get the domain name without TLD.
362 368
363 @param host host address 369 @param host host address
364 @type str 370 @type str
365 @param tldPart TLD part of the host address 371 @param tldPart TLD part of the host address
366 @type str 372 @type str
367 @return domain name 373 @return domain name
368 @rtype str 374 @rtype str
369 """ 375 """
370 if not host or not tldPart: 376 if not host or not tldPart:
371 return "" 377 return ""
372 378
373 temp = self.__normalizedHost(host) 379 temp = self.__normalizedHost(host)
374 temp = temp[:temp.rfind(tldPart)] 380 temp = temp[: temp.rfind(tldPart)]
375 381
376 if temp.endswith("."): 382 if temp.endswith("."):
377 temp = temp[:-1] 383 temp = temp[:-1]
378 384
379 return temp[temp.rfind(".") + 1:] 385 return temp[temp.rfind(".") + 1 :]
380 386
381 def __registrableDomainHelper(self, domainPart, tldPart): 387 def __registrableDomainHelper(self, domainPart, tldPart):
382 """ 388 """
383 Private method to get the registrable domain (i.e. domain plus TLD). 389 Private method to get the registrable domain (i.e. domain plus TLD).
384 390
385 @param domainPart domain part of a host address 391 @param domainPart domain part of a host address
386 @type str 392 @type str
387 @param tldPart TLD part of a host address 393 @param tldPart TLD part of a host address
388 @type str 394 @type str
389 @return registrable domain name 395 @return registrable domain name
391 """ 397 """
392 if not tldPart or not domainPart: 398 if not tldPart or not domainPart:
393 return "" 399 return ""
394 else: 400 else:
395 return "{0}.{1}".format(domainPart, tldPart) 401 return "{0}.{1}".format(domainPart, tldPart)
396 402
397 def __subdomainHelper(self, host, registrablePart): 403 def __subdomainHelper(self, host, registrablePart):
398 """ 404 """
399 Private method to get the subdomain of a host address (i.e. domain part 405 Private method to get the subdomain of a host address (i.e. domain part
400 without the registrable domain name). 406 without the registrable domain name).
401 407
402 @param host host address 408 @param host host address
403 @type str 409 @type str
404 @param registrablePart registrable domain part of the host address 410 @param registrablePart registrable domain part of the host address
405 @type str 411 @type str
406 @return subdomain name 412 @return subdomain name
407 @rtype str 413 @rtype str
408 """ 414 """
409 if not host or not registrablePart: 415 if not host or not registrablePart:
410 return "" 416 return ""
411 417
412 subdomain = self.__normalizedHost(host) 418 subdomain = self.__normalizedHost(host)
413 419
414 subdomain = subdomain[:subdomain.rfind(registrablePart)] 420 subdomain = subdomain[: subdomain.rfind(registrablePart)]
415 421
416 if subdomain.endswith("."): 422 if subdomain.endswith("."):
417 subdomain = subdomain[:-1] 423 subdomain = subdomain[:-1]
418 424
419 return subdomain 425 return subdomain
420 426
421 def __normalizedHost(self, host): 427 def __normalizedHost(self, host):
422 """ 428 """
423 Private method to get the normalized host for a host address. 429 Private method to get the normalized host for a host address.
424 430
425 @param host host address to be normalized 431 @param host host address to be normalized
426 @type str 432 @type str
427 @return normalized host address 433 @return normalized host address
428 @rtype str 434 @rtype str
429 """ 435 """
435 441
436 def instance(withPrivate=False): 442 def instance(withPrivate=False):
437 """ 443 """
438 Global function to get a reference to the TLD extractor and create it, if 444 Global function to get a reference to the TLD extractor and create it, if
439 it hasn't been yet. 445 it hasn't been yet.
440 446
441 @param withPrivate flag indicating to load private TLDs as well 447 @param withPrivate flag indicating to load private TLDs as well
442 @type bool 448 @type bool
443 @return reference to the TLD extractor object 449 @return reference to the TLD extractor object
444 @rtype EricTldExtractor 450 @rtype EricTldExtractor
445 """ 451 """
446 global _TLDExtractor 452 global _TLDExtractor
447 453
448 if _TLDExtractor is None: 454 if _TLDExtractor is None:
449 _TLDExtractor = EricTldExtractor(withPrivate=withPrivate) 455 _TLDExtractor = EricTldExtractor(withPrivate=withPrivate)
450 456
451 return _TLDExtractor 457 return _TLDExtractor

eric ide

mercurial