WebBrowser/AdBlock/AdBlockRule.py

branch
QtWebEngine
changeset 4858
19dff9c9cf26
parent 4631
5c1a96925da4
child 4860
0a44aff88bfa
equal deleted inserted replaced
4857:8dba5fb92f12 4858:19dff9c9cf26
1 # -*- coding: utf-8 -*-
2
3 # Copyright (c) 2009 - 2016 Detlev Offenbach <detlev@die-offenbachs.de>
4 #
5
6 """
7 Module implementing the AdBlock rule class.
8 """
9
10 from __future__ import unicode_literals
11
12 import re
13
14 from PyQt5.QtCore import Qt, QRegExp, QUrl
15 from PyQt5.QtNetwork import QNetworkRequest
16 from PyQt5.QtWebEngineCore import QWebEngineUrlRequestInfo
17
18
def toSecondLevelDomain(url):
    """
    Module function to get a second level domain from the given URL.

    The result is the last host label in front of the top level domain
    joined with the top level domain itself.

    @param url URL to extract domain from (QUrl)
    @return name of second level domain; empty if the URL has no host or
        no top level domain (string)
    """
    host = url.host()
    tld = url.topLevelDomain()
    if not (tld and host):
        return ""

    # part of the host name in front of the top level domain
    prefix = host[:len(host) - len(tld)]
    if "." not in prefix:
        # the host already is a second level domain
        return host

    # keep only the label directly in front of the top level domain
    return prefix.rsplit(".", 1)[-1] + tld
40
41
class AdBlockRule(object):
    """
    Class implementing the AdBlock rule.

    A rule is built from one line of a filter list. Parsing the line
    decides how the rule is matched (regular expression, domain match,
    'ends with' match or plain substring match) and which restrictions
    (domains, third-party, resource types) have to hold for a match.
    """
    def __init__(self, filter="", subscription=None):
        """
        Constructor

        @param filter filter string of the rule (string)
        @param subscription reference to the subscription object
            (AdBlockSubscription)
        """
        self.__subscription = subscription

        self.__filter = ""
        self.__regExp = QRegExp()
        # None means that parsing did not produce a string to match
        # against (comment, header, CSS or regular expression rule)
        self.__matchString = None
        self.__cssSelector = ""
        self.__options = []
        self.__blockedDomains = []
        self.__allowedDomains = []

        self.__enabled = True
        self.__cssRule = False
        self.__exception = False
        self.__internalDisabled = False
        self.__domainRestricted = False
        self.__useRegExp = False
        self.__useDomainMatch = False
        self.__useEndsMatch = False
        self.__thirdParty = False
        self.__thirdPartyException = False
        self.__object = False
        self.__objectException = False
        self.__subdocument = False
        self.__subdocumentException = False
        self.__xmlhttprequest = False
        self.__xmlhttprequestException = False
        self.__document = False
        self.__elemhide = False
        self.__caseSensitivity = Qt.CaseInsensitive
        self.__image = False
        self.__imageException = False
        self.__script = False
        self.__scriptException = False
        self.__stylesheet = False
        self.__stylesheetException = False
        self.__objectSubrequest = False
        self.__objectSubrequestException = False

        self.setFilter(filter)

    def subscription(self):
        """
        Public method to get the subscription this rule belongs to.

        @return subscription of the rule (AdBlockSubscription)
        """
        return self.__subscription

    def filter(self):
        """
        Public method to get the rule filter string.

        @return rule filter string (string)
        """
        return self.__filter

    def setFilter(self, filter):
        """
        Public method to set the rule filter string.

        The new filter string is parsed immediately.

        @param filter rule filter string (string)
        """
        self.__filter = filter
        self.__parseFilter()

    def __parseFilter(self):
        """
        Private method to parse the filter pattern.
        """
        parsedLine = self.__filter

        # empty rule or just a comment
        if not parsedLine.strip() or \
                parsedLine.startswith(("!", "[Adblock")):
            self.__enabled = False
            return

        # CSS element hiding rule
        if "##" in parsedLine or "#@#" in parsedLine:
            self.__parseCssRule(parsedLine)
            # CSS rule cannot have more options -> stop parsing
            return

        # Exception always starts with @@
        if parsedLine.startswith("@@"):
            self.__exception = True
            parsedLine = parsedLine[2:]

        # Parse all options following '$' character
        optionsIndex = parsedLine.find("$")
        if optionsIndex >= 0:
            options = parsedLine[optionsIndex + 1:].split(",")
            # evaluate every option (no short-circuit) so that all
            # recognized ones are recorded
            handled = [self.__parseOption(option) for option in options]

            # If we don't handle all options, it's safer to just disable
            # this rule
            if not all(handled):
                self.__internalDisabled = True
                return

            parsedLine = parsedLine[:optionsIndex]

        # Rule is classic regexp
        if parsedLine.startswith("/") and parsedLine.endswith("/"):
            parsedLine = parsedLine[1:-1]
            self.__useRegExp = True
            self.__regExp = QRegExp(parsedLine, self.__caseSensitivity,
                                    QRegExp.RegExp)
            return

        # Remove starting / ending wildcards
        if parsedLine.startswith("*"):
            parsedLine = parsedLine[1:]
        if parsedLine.endswith("*"):
            parsedLine = parsedLine[:-1]

        # Fast string matching for domain can be used
        if parsedLine.startswith("||") and \
                parsedLine.endswith("^") and \
                not any(char in parsedLine for char in "/:?=&*"):
            self.__useDomainMatch = True
            self.__matchString = parsedLine[2:-1]
            return

        # If rule contains '|' only at the end, string matching can be used
        if parsedLine.endswith("|") and \
                "^" not in parsedLine and \
                "*" not in parsedLine and \
                parsedLine.count("|") == 1:
            self.__useEndsMatch = True
            self.__matchString = parsedLine[:-1]
            return

        # If there is still a wildcard (*) or separator (^) or (|),
        # the rule must be modified to comply with QRegExp.
        if "*" in parsedLine or "^" in parsedLine or "|" in parsedLine:
            pattern = self.__convertPatternToRegExp(parsedLine)
            self.__useRegExp = True
            self.__regExp = QRegExp(pattern, self.__caseSensitivity,
                                    QRegExp.RegExp)
            return

        # no regexp required
        self.__useRegExp = False
        self.__matchString = parsedLine

    def __parseCssRule(self, line):
        """
        Private method to parse a CSS element hiding (exception) rule.

        @param line filter line containing '##' or '#@#' (string)
        """
        self.__cssRule = True

        # locate the separator itself; a single '#' in front of it must
        # not be mistaken for the separator
        positions = [pos for pos in (line.find("##"), line.find("#@#"))
                     if pos >= 0]
        pos = min(positions)

        # domain restricted rule
        if pos > 0:
            self.__parseDomains(line[:pos], ",")

        self.__exception = line[pos + 1] == "@"
        # skip '#@#' for exceptions and '##' for hiding rules
        self.__cssSelector = line[pos + 3:] if self.__exception \
            else line[pos + 2:]

    def __parseOption(self, option):
        """
        Private method to evaluate a single rule option.

        @param option option as given after the '$' character (string)
        @return flag indicating that the option was recognized (boolean)
        """
        if option.startswith("domain="):
            self.__parseDomains(option[7:], "|")
            return True
        if option == "match-case":
            self.__caseSensitivity = Qt.CaseSensitive
            return True
        if option == "document":
            if self.__exception:
                self.__document = True
            return self.__exception
        if option == "elemhide":
            if self.__exception:
                self.__elemhide = True
            return self.__exception
        if option == "collapse":
            # Hiding placeholders of blocked elements
            return True

        # remaining options may be inverted by a leading '~'
        inverted = option.startswith("~")
        name = option[1:] if inverted else option
        if name == "third-party":
            self.__thirdParty = True
            self.__thirdPartyException = inverted
        elif name == "object":
            self.__object = True
            self.__objectException = inverted
        elif name == "subdocument":
            self.__subdocument = True
            self.__subdocumentException = inverted
        elif name == "xmlhttprequest":
            self.__xmlhttprequest = True
            self.__xmlhttprequestException = inverted
        elif name == "image":
            self.__image = True
            self.__imageException = inverted
        elif name == "script":
            self.__script = True
            self.__scriptException = inverted
        elif name == "stylesheet":
            self.__stylesheet = True
            self.__stylesheetException = inverted
        elif name == "object-subrequest":
            self.__objectSubrequest = True
            self.__objectSubrequestException = inverted
        else:
            return False

        return True

    def __parseDomains(self, domains, separator):
        """
        Private method to parse a string with a domain list.

        Entries starting with '~' are recorded as blocked domains, all
        other non-empty entries as allowed domains.

        @param domains list of domains (string)
        @param separator separator character used by the list (string)
        """
        for domain in domains.split(separator):
            if not domain:
                continue
            if domain.startswith("~"):
                self.__blockedDomains.append(domain[1:])
            else:
                self.__allowedDomains.append(domain)

        self.__domainRestricted = \
            bool(self.__blockedDomains) or bool(self.__allowedDomains)

    @staticmethod
    def __isMatchingDomain(domain, suffix):
        """
        Private method to check, if a domain equals a given domain or is
        one of its sub-domains.

        @param domain domain name to be checked (string)
        @param suffix domain name taken from the rule (string)
        @return flag indicating a match (boolean)
        """
        if domain == suffix:
            return True
        if suffix.startswith("."):
            return domain.endswith(suffix)
        # require a label boundary, so that 'notexample.com' does not
        # match a rule for 'example.com'
        return domain.endswith("." + suffix)

    def __stringMatch(self, domain, encodedUrl):
        """
        Private method to match the URL against the parsed pattern without
        looking at any restriction.

        @param domain domain name (string)
        @param encodedUrl string encoded URL to be checked (string)
        @return flag indicating a match (boolean)
        """
        if self.__useRegExp:
            return self.__regExp.indexIn(encodedUrl) != -1

        needle = self.__matchString
        if needle is None:
            # nothing was parsed to match against
            return False

        if self.__useDomainMatch:
            return self.__isMatchingDomain(domain, needle)

        if self.__caseSensitivity == Qt.CaseInsensitive:
            needle = needle.lower()
            encodedUrl = encodedUrl.lower()

        if self.__useEndsMatch:
            return encodedUrl.endswith(needle)
        return needle in encodedUrl

    def networkMatch(self, request, domain, encodedUrl):
        """
        Public method to check the rule for a match.

        @param request reference to the network request
        @type QWebEngineUrlRequestInfo
        @param domain domain name
        @type str
        @param encodedUrl string encoded URL to be checked
        @type str
        @return flag indicating a match
        @rtype bool
        """
        if self.__cssRule or not self.__enabled or self.__internalDisabled:
            return False

        if not self.__stringMatch(domain, encodedUrl):
            return False

        # check domain restrictions
        if self.__domainRestricted and \
                not self.matchDomain(request.firstPartyUrl().host()):
            return False

        # every restriction given by the rule has to be fulfilled
        restrictions = (
            (self.__thirdParty, self.matchThirdParty),
            (self.__object, self.matchObject),
            (self.__subdocument, self.matchSubdocument),
            (self.__xmlhttprequest, self.matchXmlHttpRequest),
            (self.__image, self.matchImage),
            (self.__script, self.matchScript),
            (self.__stylesheet, self.matchStyleSheet),
            (self.__objectSubrequest, self.matchObjectSubrequest),
        )
        return all(matcher(request)
                   for active, matcher in restrictions if active)

    def urlMatch(self, url):
        """
        Public method to check an URL against the rule.

        Only rules with the 'document' or 'elemhide' option can match.
        There is no network request for this check, so only the pattern
        and the domain restriction are evaluated; the URL itself is taken
        as the first party.

        @param url URL to check (QUrl)
        @return flag indicating a match (boolean)
        """
        if not self.__document and not self.__elemhide:
            return False

        if self.__cssRule or not self.__enabled or self.__internalDisabled:
            return False

        encodedUrl = bytes(url.toEncoded()).decode()
        domain = url.host()
        if not self.__stringMatch(domain, encodedUrl):
            return False

        return not self.__domainRestricted or self.matchDomain(domain)

    def matchDomain(self, domain):
        """
        Public method to match a domain.

        A blocked domain always wins. If only blocked domains are given,
        every other domain matches; otherwise the domain has to be one of
        the allowed domains (or a sub-domain thereof).

        @param domain domain name to check (string)
        @return flag indicating a match (boolean)
        """
        if not self.__enabled:
            return False

        if not self.__domainRestricted:
            return True

        if any(self.__isMatchingDomain(domain, dom)
               for dom in self.__blockedDomains):
            return False

        if not self.__allowedDomains:
            return True

        return any(self.__isMatchingDomain(domain, dom)
                   for dom in self.__allowedDomains)

    def matchThirdParty(self, req):
        """
        Public slot to match a third-party rule.

        @param req request object to check (QWebEngineUrlRequestInfo)
        @return flag indicating a match (boolean)
        """
        # Third-party matching should be performed on second-level domains
        firstPartyHost = toSecondLevelDomain(req.firstPartyUrl())
        host = toSecondLevelDomain(req.requestUrl())

        match = firstPartyHost != host
        # an inverted option (~third-party) flips the result
        return match != self.__thirdPartyException

    def matchObject(self, req):
        """
        Public slot to match an object rule.

        @param req request object to check (QWebEngineUrlRequestInfo)
        @return flag indicating a match (boolean)
        """
        match = (
            req.resourceType() == QWebEngineUrlRequestInfo.ResourceTypeObject)
        return match != self.__objectException

    def matchSubdocument(self, req):
        """
        Public slot to match a sub-document rule.

        @param req request object to check (QWebEngineUrlRequestInfo)
        @return flag indicating a match (boolean)
        """
        match = (
            req.resourceType() ==
            QWebEngineUrlRequestInfo.ResourceTypeSubFrame)
        return match != self.__subdocumentException

    def matchXmlHttpRequest(self, req):
        """
        Public slot to match a XmlHttpRequest rule.

        @param req request object to check (QWebEngineUrlRequestInfo)
        @return flag indicating a match (boolean)
        """
        match = (
            req.resourceType() == QWebEngineUrlRequestInfo.ResourceTypeXhr)
        return match != self.__xmlhttprequestException

    def matchImage(self, req):
        """
        Public slot to match an Image rule.

        @param req request object to check (QWebEngineUrlRequestInfo)
        @return flag indicating a match (boolean)
        """
        match = (
            req.resourceType() == QWebEngineUrlRequestInfo.ResourceTypeImage)
        return match != self.__imageException

    def matchScript(self, req):
        """
        Public slot to match a Script rule.

        @param req request object to check (QWebEngineUrlRequestInfo)
        @return flag indicating a match (boolean)
        """
        match = (
            req.resourceType() == QWebEngineUrlRequestInfo.ResourceTypeScript)
        return match != self.__scriptException

    def matchStyleSheet(self, req):
        """
        Public slot to match a StyleSheet rule.

        @param req request object to check (QWebEngineUrlRequestInfo)
        @return flag indicating a match (boolean)
        """
        match = (
            req.resourceType() ==
            QWebEngineUrlRequestInfo.ResourceTypeStylesheet)
        return match != self.__stylesheetException

    def matchObjectSubrequest(self, req):
        """
        Public slot to match an Object Subrequest rule.

        @param req request object to check (QWebEngineUrlRequestInfo)
        @return flag indicating a match (boolean)
        """
        match = (
            req.resourceType() ==
            QWebEngineUrlRequestInfo.ResourceTypeSubResource)
        return match != self.__objectSubrequestException

    def isException(self):
        """
        Public method to check, if the rule defines an exception.

        @return flag indicating an exception (boolean)
        """
        return self.__exception

    def setException(self, exception):
        """
        Public method to set the rule's exception flag.

        @param exception flag indicating an exception rule (boolean)
        """
        self.__exception = exception

    def isEnabled(self):
        """
        Public method to check, if the rule is enabled.

        @return flag indicating enabled state (boolean)
        """
        return self.__enabled

    def setEnabled(self, enabled):
        """
        Public method to set the rule's enabled state.

        Disabling puts a '!' in front of the filter string; enabling
        removes a leading '!' again. The filter is not parsed again.

        @param enabled flag indicating the new enabled state (boolean)
        """
        self.__enabled = enabled
        if not enabled:
            self.__filter = "!" + self.__filter
        elif self.__filter.startswith("!"):
            # only strip the marker; never eat a character of the pattern
            self.__filter = self.__filter[1:]

    def isCSSRule(self):
        """
        Public method to check, if the rule is a CSS rule.

        @return flag indicating a CSS rule (boolean)
        """
        return self.__cssRule

    def cssSelector(self):
        """
        Public method to get the CSS selector of the rule.

        @return CSS selector; empty if this is not a CSS rule (string)
        """
        return self.__cssSelector

    def isDocument(self):
        """
        Public method to check, if this is a document rule.

        @return flag indicating a document rule (boolean)
        """
        return self.__document

    def isElementHiding(self):
        """
        Public method to check, if this is an element hiding rule.

        @return flag indicating an element hiding rule (boolean)
        """
        return self.__elemhide

    def isDomainRestricted(self):
        """
        Public method to check, if this rule is restricted by domain.

        @return flag indicating a domain restriction (boolean)
        """
        return self.__domainRestricted

    def isComment(self):
        """
        Public method to check, if this is a comment.

        @return flag indicating a comment (boolean)
        """
        return self.__filter.startswith("!")

    def isHeader(self):
        """
        Public method to check, if this is a header.

        @return flag indicating a header (boolean)
        """
        return self.__filter.startswith("[Adblock")

    def isSlow(self):
        """
        Public method to check, if this is a slow rule.

        @return flag indicating a slow rule, i.e. one that is matched
            with a regular expression (boolean)
        """
        return self.__useRegExp

    def isInternalDisabled(self):
        """
        Public method to check, if this rule was disabled internally.

        @return flag indicating an internally disabled rule (boolean)
        """
        return self.__internalDisabled

    def __convertPatternToRegExp(self, wildcardPattern):
        """
        Private method to convert a wildcard pattern to a regular expression.

        @param wildcardPattern string containing the wildcard pattern (string)
        @return string containing a regular expression (string)
        """
        pattern = wildcardPattern

        # remove multiple wildcards
        pattern = re.sub(r"\*+", "*", pattern)
        # remove anchors following separator placeholder
        pattern = re.sub(r"\^\|$", "^", pattern)
        # remove leading wildcards
        pattern = re.sub(r"^(\*)", "", pattern)
        # remove trailing wildcards
        pattern = re.sub(r"(\*)$", "", pattern)
        # escape special symbols
        pattern = re.sub(r"(\W)", r"\\\1", pattern)
        # The next two replacements are regular expression fragments
        # containing sequences like \w and \d. Given as a template string
        # re.sub() rejects them as bad escapes, so they are returned
        # literally by a function.
        # process extended anchor at expression start
        pattern = re.sub(
            r"^\\\|\\\|",
            lambda m: r"^[\w\-]+:\/+(?!\/)(?:[^\/]+\.)?", pattern)
        # process separator placeholders
        pattern = re.sub(
            r"\\\^", lambda m: r"(?:[^\w\d\-.%]|$)", pattern)
        # process anchor at expression start
        pattern = re.sub(r"^\\\|", "^", pattern)
        # process anchor at expression end
        pattern = re.sub(r"\\\|$", "$", pattern)
        # replace wildcards by .*
        pattern = re.sub(r"\\\*", ".*", pattern)

        return pattern

eric ide

mercurial