eric6/Helpviewer/AdBlock/AdBlockRule.py

changeset 6942
2602857055c5
parent 6645
ad476851d7e0
equal deleted inserted replaced
6941:f99d60d6b59b 6942:2602857055c5
1 # -*- coding: utf-8 -*-
2
3 # Copyright (c) 2009 - 2019 Detlev Offenbach <detlev@die-offenbachs.de>
4 #
5
6 """
7 Module implementing the AdBlock rule class.
8 """
9
10 from __future__ import unicode_literals
11
12 import re
13
14 from PyQt5.QtCore import Qt, QRegExp, QUrl
15 from PyQt5.QtNetwork import QNetworkRequest
16
17 from Globals import qVersionTuple
18
19
20 # Qt version < 4.8 has an issue; it will wrongly
21 # count .co.uk (and others) as second-level domains
def toSecondLevelDomain(url):
    """
    Module function to get a second level domain from the given URL.

    With Qt 4.8.0 or newer the top level domain reported by the URL object
    is used to find the label directly in front of it. Older Qt versions
    fall back to returning the last two labels of the host name.

    @param url URL to extract domain from (QUrl)
    @return name of second level domain (string)
    """
    host = url.host()

    if qVersionTuple() < (4, 8, 0):
        # no usable topLevelDomain(); keep the last two host labels
        labels = host.split(".")
        if len(labels) < 2:
            return ""
        return ".".join(labels[-2:])

    tld = url.topLevelDomain()
    if not (tld and host):
        return ""

    # part of the host in front of the top level domain
    prefix = host[:len(host) - len(tld)]
    if "." not in prefix:
        # the host already consists of one label plus the TLD
        return host

    # keep only the label directly in front of the top level domain
    return prefix.rsplit(".", 1)[1] + tld
54
55
class AdBlockRule(object):
    """
    Class implementing the AdBlock rule.

    A rule is built from one line of a filter list. The line is parsed once
    into a matching strategy (regular expression, domain match, ends-with
    match or plain substring match) plus a set of option flags that are
    evaluated by networkMatch().
    """
    def __init__(self, filterRule="", subscription=None):
        """
        Constructor

        @param filterRule filter string of the rule (string)
        @param subscription reference to the subscription object
            (AdBlockSubscription)
        """
        self.__subscription = subscription

        # make sure the attribute exists before the first parse
        self.__filter = ""

        self.setFilter(filterRule)

    def __resetParseState(self):
        """
        Private method to reset everything that is derived from the filter.

        It is called at the start of every parse run so that parsing a new
        filter string never inherits flags or domain lists of the old one.
        """
        self.__regExp = QRegExp()
        self.__matchString = ""
        self.__cssSelector = ""
        self.__options = []
        self.__blockedDomains = []
        self.__allowedDomains = []

        self.__enabled = True
        self.__cssRule = False
        self.__exception = False
        self.__internalDisabled = False
        self.__domainRestricted = False
        self.__useRegExp = False
        self.__useDomainMatch = False
        self.__useEndsMatch = False
        self.__thirdParty = False
        self.__thirdPartyException = False
        self.__object = False
        self.__objectException = False
        self.__subdocument = False
        self.__subdocumentException = False
        self.__xmlhttprequest = False
        self.__xmlhttprequestException = False
        self.__document = False
        self.__elemhide = False
        self.__caseSensitivity = Qt.CaseInsensitive

    def subscription(self):
        """
        Public method to get the subscription this rule belongs to.

        @return subscription of the rule (AdBlockSubscription)
        """
        return self.__subscription

    def filter(self):
        """
        Public method to get the rule filter string.

        @return rule filter string (string)
        """
        return self.__filter

    def setFilter(self, filterRule):
        """
        Public method to set the rule filter string.

        The rule is parsed again; all flags derived from the previous
        filter string are discarded.

        @param filterRule rule filter string (string)
        """
        self.__filter = filterRule
        self.__parseFilter()

    def __parseFilter(self):
        """
        Private method to parse the filter pattern.
        """
        self.__resetParseState()

        parsedLine = self.__filter

        # empty rule or just a comment
        if not parsedLine.strip() or parsedLine.startswith(("!", "[Adblock")):
            self.__enabled = False
            return

        # CSS element hiding rule
        if "##" in parsedLine:
            self.__cssRule = True
            pos = parsedLine.find("##")

            # domain restricted rule
            if not parsedLine.startswith("##"):
                domains = parsedLine[:pos]
                self.__parseDomains(domains, ",")

            self.__cssSelector = parsedLine[pos + 2:]
            # CSS rule cannot have more options -> stop parsing
            return

        # Exception always starts with @@
        if parsedLine.startswith("@@"):
            self.__exception = True
            parsedLine = parsedLine[2:]

        # Parse all options following '$' character
        optionsIndex = parsedLine.find("$")
        if optionsIndex >= 0:
            options = parsedLine[optionsIndex + 1:].split(",")

            handledOptions = 0
            for option in options:
                if option.startswith("domain="):
                    self.__parseDomains(option[7:], "|")
                    handledOptions += 1
                elif option == "match-case":
                    self.__caseSensitivity = Qt.CaseSensitive
                    handledOptions += 1
                elif option.endswith("third-party"):
                    self.__thirdParty = True
                    self.__thirdPartyException = option.startswith("~")
                    handledOptions += 1
                elif option.endswith("object"):
                    self.__object = True
                    self.__objectException = option.startswith("~")
                    handledOptions += 1
                elif option.endswith("subdocument"):
                    self.__subdocument = True
                    self.__subdocumentException = option.startswith("~")
                    handledOptions += 1
                elif option.endswith("xmlhttprequest"):
                    self.__xmlhttprequest = True
                    self.__xmlhttprequestException = option.startswith("~")
                    handledOptions += 1
                elif option == "document" and self.__exception:
                    self.__document = True
                    handledOptions += 1
                elif option == "elemhide" and self.__exception:
                    self.__elemhide = True
                    handledOptions += 1
                elif option == "collapse":
                    # Hiding placeholders of blocked elements
                    handledOptions += 1

            # If we don't handle all options, it's safer to just disable
            # this rule
            if handledOptions != len(options):
                self.__internalDisabled = True
                return

            parsedLine = parsedLine[:optionsIndex]

        # Rule is classic regexp
        if parsedLine.startswith("/") and parsedLine.endswith("/"):
            parsedLine = parsedLine[1:-1]
            self.__useRegExp = True
            self.__regExp = QRegExp(parsedLine, self.__caseSensitivity,
                                    QRegExp.RegExp)
            return

        # Remove starting / ending wildcards
        if parsedLine.startswith("*"):
            parsedLine = parsedLine[1:]
        if parsedLine.endswith("*"):
            parsedLine = parsedLine[:-1]

        # Fast string matching for domain can be used
        if parsedLine.startswith("||") and \
           parsedLine.endswith("^") and \
           QRegExp("[/:?=&\\*]").indexIn(parsedLine) == -1:
            parsedLine = parsedLine[2:-1]
            self.__useDomainMatch = True
            self.__matchString = parsedLine
            return

        # If rule contains '|' only at the end, string matching can be used
        if parsedLine.endswith("|") and \
           QRegExp("[\\^\\*]").indexIn(parsedLine) == -1 and \
           parsedLine.count("|") == 1:
            parsedLine = parsedLine[:-1]
            self.__useEndsMatch = True
            self.__matchString = parsedLine
            return

        # If there is still a wildcard (*) or separator (^) or (|),
        # the rule must be modified to comply with QRegExp.
        if "*" in parsedLine or "^" in parsedLine or "|" in parsedLine:
            pattern = self.__convertPatternToRegExp(parsedLine)
            self.__useRegExp = True
            self.__regExp = QRegExp(pattern, self.__caseSensitivity,
                                    QRegExp.RegExp)
            return

        # no regexp required
        self.__useRegExp = False
        self.__matchString = parsedLine

    def __parseDomains(self, domains, separator):
        """
        Private method to parse a string with a domain list.

        Entries starting with '~' are stored as blocked domains (without
        the '~'), all other non-empty entries as allowed domains.

        @param domains list of domains (string)
        @param separator separator character used by the list (string)
        """
        for domain in domains.split(separator):
            if not domain:
                continue
            if domain.startswith("~"):
                self.__blockedDomains.append(domain[1:])
            else:
                self.__allowedDomains.append(domain)

        self.__domainRestricted = \
            bool(self.__blockedDomains) or bool(self.__allowedDomains)

    def __isMatchingDomain(self, domain, ruleDomain):
        """
        Private method to check a domain against a domain of the rule.

        The domain matches, if it is equal to the rule domain or is one of
        its sub-domains. A bare suffix test is not sufficient because it
        would accept e.g. 'notexample.com' for the rule domain
        'example.com'.

        @param domain domain name to be checked (string)
        @param ruleDomain domain given by the rule (string)
        @return flag indicating a match (boolean)
        """
        return domain == ruleDomain or domain.endswith("." + ruleDomain)

    def networkMatch(self, request, domain, encodedUrl):
        """
        Public method to check the rule for a match.

        The URL is checked with the matching strategy chosen while parsing.
        A hit is reported only, if all restrictions given by the rule
        options are fulfilled as well.

        @param request reference to the network request (QNetworkRequest)
        @param domain domain name (string)
        @param encodedUrl string encoded URL to be checked (string)
        @return flag indicating a match (boolean)
        """
        if self.__cssRule or not self.__enabled or self.__internalDisabled:
            return False

        matched = False

        if self.__useRegExp:
            matched = self.__regExp.indexIn(encodedUrl) != -1
        elif self.__useDomainMatch:
            matched = self.__isMatchingDomain(domain, self.__matchString)
        elif self.__useEndsMatch:
            if self.__caseSensitivity == Qt.CaseInsensitive:
                matched = encodedUrl.lower().endswith(
                    self.__matchString.lower())
            else:
                matched = encodedUrl.endswith(self.__matchString)
        else:
            if self.__caseSensitivity == Qt.CaseInsensitive:
                matched = self.__matchString.lower() in encodedUrl.lower()
            else:
                matched = self.__matchString in encodedUrl

        if matched:
            # check domain restrictions
            if self.__domainRestricted and not self.matchDomain(domain):
                return False

            # check third-party restrictions
            if self.__thirdParty and not self.matchThirdParty(request):
                return False

            # check object restrictions
            if self.__object and not self.matchObject(request):
                return False

            # check subdocument restrictions
            if self.__subdocument and not self.matchSubdocument(request):
                return False

            # check xmlhttprequest restriction
            if self.__xmlhttprequest and \
               not self.matchXmlHttpRequest(request):
                return False

        return matched

    def urlMatch(self, url):
        """
        Public method to check an URL against the rule.

        Only rules carrying the 'document' or 'elemhide' option can match.

        @param url URL to check (QUrl)
        @return flag indicating a match (boolean)
        """
        if not self.__document and not self.__elemhide:
            return False

        encodedUrl = bytes(url.toEncoded()).decode()
        domain = url.host()
        return self.networkMatch(QNetworkRequest(url), domain, encodedUrl)

    def matchDomain(self, domain):
        """
        Public method to match a domain.

        A blocked domain always wins. If the rule lists allowed domains,
        the given domain must be one of them (or a sub-domain thereof).

        @param domain domain name to check (string)
        @return flag indicating a match (boolean)
        """
        if not self.__enabled:
            return False

        if not self.__domainRestricted:
            return True

        for dom in self.__blockedDomains:
            if self.__isMatchingDomain(domain, dom):
                return False

        if not self.__allowedDomains:
            # only exclusions were given and none of them applied
            return True

        for dom in self.__allowedDomains:
            if self.__isMatchingDomain(domain, dom):
                return True

        return False

    def matchThirdParty(self, req):
        """
        Public slot to match a third-party rule.

        @param req request object to check (QNetworkRequest)
        @return flag indicating a match (boolean)
        """
        referer = \
            bytes(req.attribute(QNetworkRequest.User + 200, b"")).decode()
        if referer == "":
            return False

        # Third-party matching should be performed on second-level domains
        refererHost = toSecondLevelDomain(QUrl(referer))
        host = toSecondLevelDomain(req.url())

        match = refererHost != host

        if self.__thirdPartyException:
            return not match
        else:
            return match

    def matchObject(self, req):
        """
        Public slot to match an object rule.

        @param req request object to check (QNetworkRequest)
        @return flag indicating a match (boolean)
        """
        # NOTE(review): this reads the same request attribute that
        # matchThirdParty() decodes as the referer - confirm against the
        # code that populates the attribute.
        match = req.attribute(QNetworkRequest.User + 200) == "object"

        if self.__objectException:
            return not match
        else:
            return match

    def matchSubdocument(self, req):
        """
        Public slot to match a sub-document rule.

        @param req request object to check (QNetworkRequest)
        @return flag indicating a match (boolean)
        """
        originatingFrame = req.originatingObject()
        if originatingFrame is None:
            return False

        page = originatingFrame.page()
        if page is None:
            return False

        match = originatingFrame != page.mainFrame()

        if self.__subdocumentException:
            return not match
        else:
            return match

    def matchXmlHttpRequest(self, req):
        """
        Public slot to match a XmlHttpRequest rule.

        @param req request object to check (QNetworkRequest)
        @return flag indicating a match (boolean)
        """
        # The header is called "X-Requested-With"; its raw value is a byte
        # array, so it is compared as bytes.
        match = \
            bytes(req.rawHeader(b"X-Requested-With")) == b"XMLHttpRequest"

        if self.__xmlhttprequestException:
            return not match
        else:
            return match

    def isException(self):
        """
        Public method to check, if the rule defines an exception.

        @return flag indicating an exception (boolean)
        """
        return self.__exception

    def setException(self, exception):
        """
        Public method to set the rule's exception flag.

        @param exception flag indicating an exception rule (boolean)
        """
        self.__exception = exception

    def isEnabled(self):
        """
        Public method to check, if the rule is enabled.

        @return flag indicating enabled state (boolean)
        """
        return self.__enabled

    def setEnabled(self, enabled):
        """
        Public method to set the rule's enabled state.

        A disabled rule is stored as a comment, i.e. its filter string is
        prefixed with '!'. The prefix is added or removed only when needed,
        so repeated calls with the same value leave the filter unchanged.
        Removing the prefix parses the filter again because a rule created
        from a commented-out line has never been parsed; all flags are then
        derived from the filter text and the rule stays disabled, if the
        remaining text is still empty, a comment or a header.

        @param enabled flag indicating the new enabled state (boolean)
        """
        if not enabled:
            self.__enabled = False
            if not self.__filter.startswith("!"):
                self.__filter = "!" + self.__filter
        elif self.__filter.startswith("!"):
            self.__filter = self.__filter[1:]
            self.__parseFilter()
        elif self.__filter.strip() and not self.isHeader():
            self.__enabled = True

    def isCSSRule(self):
        """
        Public method to check, if the rule is a CSS rule.

        @return flag indicating a CSS rule (boolean)
        """
        return self.__cssRule

    def cssSelector(self):
        """
        Public method to get the CSS selector of the rule.

        @return CSS selector; empty for rules that are not CSS rules
            (string)
        """
        return self.__cssSelector

    def isDocument(self):
        """
        Public method to check, if this is a document rule.

        @return flag indicating a document rule (boolean)
        """
        return self.__document

    def isElementHiding(self):
        """
        Public method to check, if this is an element hiding rule.

        @return flag indicating an element hiding rule (boolean)
        """
        return self.__elemhide

    def isDomainRestricted(self):
        """
        Public method to check, if this rule is restricted by domain.

        @return flag indicating a domain restriction (boolean)
        """
        return self.__domainRestricted

    def isComment(self):
        """
        Public method to check, if this is a comment.

        @return flag indicating a comment (boolean)
        """
        return self.__filter.startswith("!")

    def isHeader(self):
        """
        Public method to check, if this is a header.

        @return flag indicating a header (boolean)
        """
        return self.__filter.startswith("[Adblock")

    def isSlow(self):
        """
        Public method to check, if this is a slow rule.

        @return flag indicating a slow rule (boolean)
        """
        return self.__useRegExp

    def isInternalDisabled(self):
        """
        Public method to check, if this rule was disabled internally.

        @return flag indicating an internally disabled rule (boolean)
        """
        return self.__internalDisabled

    def __convertPatternToRegExp(self, wildcardPattern):
        """
        Private method to convert a wildcard pattern to a regular expression.

        @param wildcardPattern string containing the wildcard pattern (string)
        @return string containing a regular expression (string)
        """
        # Regular expression text inserted for the extended anchor ("||")
        # and the separator placeholder ("^"). It is handed to re.sub()
        # through a callable because a replacement *string* is parsed as
        # a template in which "\w" and "\d" are rejected as bad escapes
        # by Python 3.7 and newer.
        extendedAnchor = r"^[\w\-]+:\/+(?!\/)(?:[^\/]+\.)?"
        separator = r"(?:[^\w\d\-.%]|$)"

        pattern = wildcardPattern

        # remove multiple wildcards
        pattern = re.sub(r"\*+", "*", pattern)
        # remove anchors following separator placeholder
        pattern = re.sub(r"\^\|$", "^", pattern)
        # remove leading wildcards
        pattern = re.sub(r"^(\*)", "", pattern)
        # remove trailing wildcards
        pattern = re.sub(r"(\*)$", "", pattern)
        # escape special symbols
        pattern = re.sub(r"(\W)", r"\\\1", pattern)
        # process extended anchor at expression start
        pattern = re.sub(r"^\\\|\\\|", lambda m: extendedAnchor, pattern)
        # process separator placeholders
        pattern = re.sub(r"\\\^", lambda m: separator, pattern)
        # process anchor at expression start
        pattern = re.sub(r"^\\\|", "^", pattern)
        # process anchor at expression end
        pattern = re.sub(r"\\\|$", "$", pattern)
        # replace wildcards by .*
        pattern = re.sub(r"\\\*", ".*", pattern)

        return pattern

eric ide

mercurial