|
1 # -*- coding: utf-8 -*- |
|
2 |
|
3 # Copyright (c) 2009 - 2016 Detlev Offenbach <detlev@die-offenbachs.de> |
|
4 # |
|
5 |
|
6 """ |
|
7 Module implementing the AdBlock rule class. |
|
8 """ |
|
9 |
|
10 from __future__ import unicode_literals |
|
11 |
|
12 import re |
|
13 |
|
14 from PyQt5.QtCore import Qt, QRegExp, QUrl |
|
15 from PyQt5.QtNetwork import QNetworkRequest |
|
16 from PyQt5.QtWebEngineCore import QWebEngineUrlRequestInfo |
|
17 |
|
18 |
|
def toSecondLevelDomain(url):
    """
    Module function to get a second level domain from the given URL.

    The result consists of the last host label in front of the top level
    domain followed by the top level domain itself. An empty string is
    returned, if the URL has no host or no top level domain.

    @param url URL to extract domain from (QUrl)
    @return name of second level domain (string)
    """
    tld = url.topLevelDomain()
    host = url.host()

    if not (tld and host):
        return ""

    # everything of the host in front of the top level domain
    prefix = host[:len(host) - len(tld)]
    if "." not in prefix:
        # the host already is a second level domain
        return host

    # keep just the label directly in front of the top level domain
    return prefix.rpartition(".")[2] + tld
|
40 |
|
41 |
|
class AdBlockRule(object):
    """
    Class implementing the AdBlock rule.

    A rule is created from one line of an AdBlock style filter list. The
    line is parsed once into a matching strategy (regular expression,
    domain match, ends-with match or plain substring match) plus a set of
    option flags that restrict the requests the rule applies to.
    """
    def __init__(self, filter="", subscription=None):
        """
        Constructor

        @param filter filter string of the rule (string)
        @param subscription reference to the subscription object
            (AdBlockSubscription)
        """
        self.__subscription = subscription

        # setFilter() initializes the complete parse state
        self.setFilter(filter)

    def __resetState(self):
        """
        Private method to set all data derived from the filter string to
        its default values.
        """
        self.__regExp = QRegExp()
        self.__options = []
        self.__blockedDomains = []
        self.__allowedDomains = []

        # defaults for the attributes assigned by only some parse paths;
        # they must exist for every rule
        self.__matchString = ""
        self.__cssSelector = ""

        self.__enabled = True
        self.__cssRule = False
        self.__exception = False
        self.__internalDisabled = False
        self.__domainRestricted = False
        self.__useRegExp = False
        self.__useDomainMatch = False
        self.__useEndsMatch = False
        self.__thirdParty = False
        self.__thirdPartyException = False
        self.__object = False
        self.__objectException = False
        self.__subdocument = False
        self.__subdocumentException = False
        self.__xmlhttprequest = False
        self.__xmlhttprequestException = False
        self.__document = False
        self.__elemhide = False
        self.__caseSensitivity = Qt.CaseInsensitive
        self.__image = False
        self.__imageException = False
        self.__script = False
        self.__scriptException = False
        self.__stylesheet = False
        self.__stylesheetException = False
        self.__objectSubrequest = False
        self.__objectSubrequestException = False

    def subscription(self):
        """
        Public method to get the subscription this rule belongs to.

        @return subscription of the rule (AdBlockSubscription)
        """
        return self.__subscription

    def filter(self):
        """
        Public method to get the rule filter string.

        @return rule filter string (string)
        """
        return self.__filter

    def setFilter(self, filter):
        """
        Public method to set the rule filter string.

        The state derived from a previously set filter is discarded before
        the new filter is parsed, so that domains and option flags of the
        old filter do not leak into the new rule.

        @param filter rule filter string (string)
        """
        self.__filter = filter
        self.__resetState()
        self.__parseFilter()

    def __parseFilter(self):
        """
        Private method to parse the filter pattern.
        """
        parsedLine = self.__filter

        # empty rule or just a comment
        if not parsedLine.strip() or parsedLine.startswith(("!", "[Adblock")):
            self.__enabled = False
            return

        # CSS element hiding rule
        if "##" in parsedLine or "#@#" in parsedLine:
            self.__cssRule = True
            pos = parsedLine.find("#")

            # domain restricted rule
            if not parsedLine.startswith("##"):
                domains = parsedLine[:pos]
                self.__parseDomains(domains, ",")

            self.__exception = parsedLine[pos + 1] == "@"

            if self.__exception:
                # skip the '#@#' marker
                self.__cssSelector = parsedLine[pos + 3:]
            else:
                # skip the '##' marker
                self.__cssSelector = parsedLine[pos + 2:]
            # CSS rule cannot have more options -> stop parsing
            return

        # Exception always starts with @@
        if parsedLine.startswith("@@"):
            self.__exception = True
            parsedLine = parsedLine[2:]

        # Parse all options following '$' character
        optionsIndex = parsedLine.find("$")
        if optionsIndex >= 0:
            options = parsedLine[optionsIndex + 1:].split(",")

            handledOptions = 0
            for option in options:
                # a leading '~' inverts the meaning of a type option
                inverted = option.startswith("~")

                if option.startswith("domain="):
                    self.__parseDomains(option[7:], "|")
                    handledOptions += 1
                elif option == "match-case":
                    self.__caseSensitivity = Qt.CaseSensitive
                    handledOptions += 1
                elif option.endswith("third-party"):
                    self.__thirdParty = True
                    self.__thirdPartyException = inverted
                    handledOptions += 1
                elif option.endswith("object"):
                    self.__object = True
                    self.__objectException = inverted
                    handledOptions += 1
                elif option.endswith("subdocument"):
                    self.__subdocument = True
                    self.__subdocumentException = inverted
                    handledOptions += 1
                elif option.endswith("xmlhttprequest"):
                    self.__xmlhttprequest = True
                    self.__xmlhttprequestException = inverted
                    handledOptions += 1
                elif option.endswith("image"):
                    self.__image = True
                    self.__imageException = inverted
                    handledOptions += 1
                elif option.endswith("script"):
                    self.__script = True
                    self.__scriptException = inverted
                    handledOptions += 1
                elif option.endswith("stylesheet"):
                    self.__stylesheet = True
                    self.__stylesheetException = inverted
                    handledOptions += 1
                elif option.endswith("object-subrequest"):
                    self.__objectSubrequest = True
                    self.__objectSubrequestException = inverted
                    handledOptions += 1
                elif option == "document" and self.__exception:
                    self.__document = True
                    handledOptions += 1
                elif option == "elemhide" and self.__exception:
                    self.__elemhide = True
                    handledOptions += 1
                elif option == "collapse":
                    # Hiding placeholders of blocked elements
                    handledOptions += 1

            # If we don't handle all options, it's safer to just disable
            # this rule
            if handledOptions != len(options):
                self.__internalDisabled = True
                return

            parsedLine = parsedLine[:optionsIndex]

        # Rule is classic regexp
        if parsedLine.startswith("/") and parsedLine.endswith("/"):
            parsedLine = parsedLine[1:-1]
            self.__useRegExp = True
            self.__regExp = QRegExp(parsedLine, self.__caseSensitivity,
                                    QRegExp.RegExp)
            return

        # Remove starting / ending wildcards
        if parsedLine.startswith("*"):
            parsedLine = parsedLine[1:]
        if parsedLine.endswith("*"):
            parsedLine = parsedLine[:-1]

        # Fast string matching for domain can be used, if the rule is just
        # '||domain^' without any path, port, query or wildcard character
        if parsedLine.startswith("||") and \
           parsedLine.endswith("^") and \
           not any(char in parsedLine for char in "/:?=&*"):
            parsedLine = parsedLine[2:-1]
            self.__useDomainMatch = True
            self.__matchString = parsedLine
            return

        # If rule contains '|' only at the end, string matching can be used
        if parsedLine.endswith("|") and \
           "^" not in parsedLine and \
           "*" not in parsedLine and \
           parsedLine.count("|") == 1:
            parsedLine = parsedLine[:-1]
            self.__useEndsMatch = True
            self.__matchString = parsedLine
            return

        # If there is still a wildcard (*) or separator (^) or (|),
        # the rule must be modified to comply with QRegExp.
        if "*" in parsedLine or "^" in parsedLine or "|" in parsedLine:
            pattern = self.__convertPatternToRegExp(parsedLine)
            self.__useRegExp = True
            self.__regExp = QRegExp(pattern, self.__caseSensitivity,
                                    QRegExp.RegExp)
            return

        # no regexp required
        self.__useRegExp = False
        self.__matchString = parsedLine

    def __parseDomains(self, domains, separator):
        """
        Private method to parse a string with a domain list.

        Entries starting with '~' are recorded as blocked domains, all
        other non-empty entries as allowed domains.

        @param domains list of domains (string)
        @param separator separator character used by the list (string)
        """
        for domain in domains.split(separator):
            if not domain:
                continue
            if domain.startswith("~"):
                self.__blockedDomains.append(domain[1:])
            else:
                self.__allowedDomains.append(domain)

        self.__domainRestricted = \
            bool(self.__blockedDomains) or bool(self.__allowedDomains)

    def __isMatchingDomain(self, domain, pattern):
        """
        Private method to check, if a host name belongs to a domain.

        The host matches, if it is equal to the domain or is a sub-domain
        of it. A plain suffix comparison is not sufficient because it would
        let 'example.com' match 'notexample.com'.

        @param domain host name to be checked (string)
        @param pattern domain name to check against (string)
        @return flag indicating a match (boolean)
        """
        return domain == pattern or domain.endswith("." + pattern)

    def __stringMatch(self, domain, encodedUrl):
        """
        Private method to match the URL data against the rule pattern
        without considering any rule options.

        @param domain domain name (string)
        @param encodedUrl string encoded URL to be checked (string)
        @return flag indicating a match (boolean)
        """
        if self.__useRegExp:
            return self.__regExp.indexIn(encodedUrl) != -1

        if self.__useDomainMatch:
            return self.__isMatchingDomain(domain, self.__matchString)

        needle = self.__matchString
        haystack = encodedUrl
        if self.__caseSensitivity == Qt.CaseInsensitive:
            needle = needle.lower()
            haystack = haystack.lower()

        if self.__useEndsMatch:
            return haystack.endswith(needle)
        return needle in haystack

    def networkMatch(self, request, domain, encodedUrl):
        """
        Public method to check the rule for a match.

        @param request reference to the network request
        @type QWebEngineUrlRequestInfo
        @param domain domain name
        @type str
        @param encodedUrl string encoded URL to be checked
        @type str
        @return flag indicating a match
        @rtype bool
        """
        if self.__cssRule or not self.__enabled or self.__internalDisabled:
            return False

        if not self.__stringMatch(domain, encodedUrl):
            return False

        # check domain restrictions
        if self.__domainRestricted and \
                not self.matchDomain(request.firstPartyUrl().host()):
            return False

        # check third-party restrictions
        if self.__thirdParty and not self.matchThirdParty(request):
            return False

        # check object restrictions
        if self.__object and not self.matchObject(request):
            return False

        # check subdocument restrictions
        if self.__subdocument and not self.matchSubdocument(request):
            return False

        # check xmlhttprequest restriction
        if self.__xmlhttprequest and not self.matchXmlHttpRequest(request):
            return False

        # check image restriction
        if self.__image and not self.matchImage(request):
            return False

        # check script restriction
        if self.__script and not self.matchScript(request):
            return False

        # check stylesheet restriction
        if self.__stylesheet and not self.matchStyleSheet(request):
            return False

        # check object-subrequest restriction
        if self.__objectSubrequest and \
                not self.matchObjectSubrequest(request):
            return False

        return True

    def urlMatch(self, url):
        """
        Public method to check an URL against the rule.

        Only rules carrying the 'document' or 'elemhide' option can match.

        @param url URL to check (QUrl)
        @return flag indicating a match (boolean)
        """
        if not self.__document and not self.__elemhide:
            return False

        if self.__cssRule or not self.__enabled or self.__internalDisabled:
            return False

        encodedUrl = bytes(url.toEncoded()).decode()
        domain = url.host()
        if not self.__stringMatch(domain, encodedUrl):
            return False

        # There is no request object to query here, so the host of the
        # URL itself stands in for the first party when checking a
        # 'domain=' restriction.
        return self.matchDomain(domain)

    def matchDomain(self, domain):
        """
        Public method to match a domain.

        Blocked domains take precedence over allowed domains. If the rule
        lists blocked domains only, every other domain matches.

        @param domain domain name to check (string)
        @return flag indicating a match (boolean)
        """
        if not self.__enabled:
            return False

        if not self.__domainRestricted:
            return True

        for dom in self.__blockedDomains:
            if self.__isMatchingDomain(domain, dom):
                return False

        if not self.__allowedDomains:
            # only blocked domains were given and none of them matched
            return bool(self.__blockedDomains)

        for dom in self.__allowedDomains:
            if self.__isMatchingDomain(domain, dom):
                return True

        return False

    def matchThirdParty(self, req):
        """
        Public method to match a third-party rule.

        @param req request object to check (QWebEngineUrlRequestInfo)
        @return flag indicating a match (boolean)
        """
        # Third-party matching should be performed on second-level domains
        firstPartyHost = toSecondLevelDomain(req.firstPartyUrl())
        host = toSecondLevelDomain(req.requestUrl())

        match = firstPartyHost != host

        if self.__thirdPartyException:
            return not match
        else:
            return match

    def __matchResourceType(self, req, resourceType, exception):
        """
        Private method to compare the resource type of a request.

        @param req request object to check (QWebEngineUrlRequestInfo)
        @param resourceType resource type the rule option refers to
            (QWebEngineUrlRequestInfo.ResourceType)
        @param exception flag indicating an inverted ('~') option
            (boolean)
        @return flag indicating a match (boolean)
        """
        match = req.resourceType() == resourceType
        if exception:
            return not match
        else:
            return match

    def matchObject(self, req):
        """
        Public method to match an object rule.

        @param req request object to check (QWebEngineUrlRequestInfo)
        @return flag indicating a match (boolean)
        """
        return self.__matchResourceType(
            req, QWebEngineUrlRequestInfo.ResourceTypeObject,
            self.__objectException)

    def matchSubdocument(self, req):
        """
        Public method to match a sub-document rule.

        @param req request object to check (QWebEngineUrlRequestInfo)
        @return flag indicating a match (boolean)
        """
        return self.__matchResourceType(
            req, QWebEngineUrlRequestInfo.ResourceTypeSubFrame,
            self.__subdocumentException)

    def matchXmlHttpRequest(self, req):
        """
        Public method to match a XmlHttpRequest rule.

        @param req request object to check (QWebEngineUrlRequestInfo)
        @return flag indicating a match (boolean)
        """
        return self.__matchResourceType(
            req, QWebEngineUrlRequestInfo.ResourceTypeXhr,
            self.__xmlhttprequestException)

    def matchImage(self, req):
        """
        Public method to match an Image rule.

        @param req request object to check (QWebEngineUrlRequestInfo)
        @return flag indicating a match (boolean)
        """
        return self.__matchResourceType(
            req, QWebEngineUrlRequestInfo.ResourceTypeImage,
            self.__imageException)

    def matchScript(self, req):
        """
        Public method to match a Script rule.

        @param req request object to check (QWebEngineUrlRequestInfo)
        @return flag indicating a match (boolean)
        """
        return self.__matchResourceType(
            req, QWebEngineUrlRequestInfo.ResourceTypeScript,
            self.__scriptException)

    def matchStyleSheet(self, req):
        """
        Public method to match a StyleSheet rule.

        @param req request object to check (QWebEngineUrlRequestInfo)
        @return flag indicating a match (boolean)
        """
        return self.__matchResourceType(
            req, QWebEngineUrlRequestInfo.ResourceTypeStylesheet,
            self.__stylesheetException)

    def matchObjectSubrequest(self, req):
        """
        Public method to match an Object Subrequest rule.

        @param req request object to check (QWebEngineUrlRequestInfo)
        @return flag indicating a match (boolean)
        """
        return self.__matchResourceType(
            req, QWebEngineUrlRequestInfo.ResourceTypeSubResource,
            self.__objectSubrequestException)

    def isException(self):
        """
        Public method to check, if the rule defines an exception.

        @return flag indicating an exception (boolean)
        """
        return self.__exception

    def setException(self, exception):
        """
        Public method to set the rule's exception flag.

        @param exception flag indicating an exception rule (boolean)
        """
        self.__exception = exception

    def isEnabled(self):
        """
        Public method to check, if the rule is enabled.

        @return flag indicating enabled state (boolean)
        """
        return self.__enabled

    def setEnabled(self, enabled):
        """
        Public method to set the rule's enabled state.

        A disabled rule is represented by a leading '!' in its filter
        string. The marker is added or removed only if necessary, so
        repeated calls do not alter the filter any further. Enabling
        parses the filter without the marker because a rule created in
        its disabled form has never been parsed. A line that carries no
        rule at all (empty line, comment, header) stays disabled.

        @param enabled flag indicating the new enabled state (boolean)
        """
        if not enabled:
            self.__enabled = False
            if not self.__filter.startswith("!"):
                self.__filter = "!" + self.__filter
        elif self.__filter.startswith("!"):
            # parsing decides about the resulting enabled state
            self.setFilter(self.__filter[1:])

    def isCSSRule(self):
        """
        Public method to check, if the rule is a CSS rule.

        @return flag indicating a CSS rule (boolean)
        """
        return self.__cssRule

    def cssSelector(self):
        """
        Public method to get the CSS selector of the rule.

        @return CSS selector; empty for rules that are no CSS rules
            (string)
        """
        return self.__cssSelector

    def isDocument(self):
        """
        Public method to check, if this is a document rule.

        @return flag indicating a document rule (boolean)
        """
        return self.__document

    def isElementHiding(self):
        """
        Public method to check, if this is an element hiding rule.

        @return flag indicating an element hiding rule (boolean)
        """
        return self.__elemhide

    def isDomainRestricted(self):
        """
        Public method to check, if this rule is restricted by domain.

        @return flag indicating a domain restriction (boolean)
        """
        return self.__domainRestricted

    def isComment(self):
        """
        Public method to check, if this is a comment.

        @return flag indicating a comment (boolean)
        """
        return self.__filter.startswith("!")

    def isHeader(self):
        """
        Public method to check, if this is a header.

        @return flag indicating a header (boolean)
        """
        return self.__filter.startswith("[Adblock")

    def isSlow(self):
        """
        Public method to check, if this is a slow rule.

        @return flag indicating a slow rule (boolean)
        """
        return self.__useRegExp

    def isInternalDisabled(self):
        """
        Public method to check, if this rule was disabled internally.

        @return flag indicating an internally disabled rule (boolean)
        """
        return self.__internalDisabled

    def __convertPatternToRegExp(self, wildcardPattern):
        """
        Private method to convert a wildcard pattern to a regular expression.

        The replacement texts containing '\\w' or '\\d' are supplied through
        callables. Given as replacement templates they are rejected by
        re.sub() of Python 3.7 and newer as bad escapes.

        @param wildcardPattern string containing the wildcard pattern (string)
        @return string containing a regular expression (string)
        """
        pattern = wildcardPattern

        # remove multiple wildcards
        pattern = re.sub(r"\*+", "*", pattern)
        # remove anchors following separator placeholder
        pattern = re.sub(r"\^\|$", "^", pattern)
        # remove leading wildcards
        pattern = re.sub(r"^(\*)", "", pattern)
        # remove trailing wildcards
        pattern = re.sub(r"(\*)$", "", pattern)
        # escape special symbols
        pattern = re.sub(r"(\W)", r"\\\1", pattern)
        # process extended anchor at expression start
        pattern = re.sub(
            r"^\\\|\\\|",
            lambda match: r"^[\w\-]+:\/+(?!\/)(?:[^\/]+\.)?", pattern)
        # process separator placeholders
        pattern = re.sub(
            r"\\\^", lambda match: r"(?:[^\w\d\-.%]|$)", pattern)
        # process anchor at expression start
        pattern = re.sub(r"^\\\|", "^", pattern)
        # process anchor at expression end
        pattern = re.sub(r"\\\|$", "$", pattern)
        # replace wildcards by .*
        pattern = re.sub(r"\\\*", ".*", pattern)

        return pattern