|
1 # -*- coding: utf-8 -*- |
|
2 |
|
3 # Copyright (c) 2009 - 2019 Detlev Offenbach <detlev@die-offenbachs.de> |
|
4 # |
|
5 |
|
6 """ |
|
7 Module implementing the AdBlock rule class. |
|
8 """ |
|
9 |
|
10 from __future__ import unicode_literals |
|
11 |
|
12 import re |
|
13 |
|
14 from PyQt5.QtCore import Qt, QRegExp, QUrl |
|
15 from PyQt5.QtNetwork import QNetworkRequest |
|
16 |
|
17 from Globals import qVersionTuple |
|
18 |
|
19 |
|
20 # Qt version < 4.8 has an issue; it will wrongly |
|
21 # count .co.uk (and others) as second-level domains |
|
def toSecondLevelDomain(url):
    """
    Module function to get a second level domain from the given URL.

    For Qt >= 4.8 the top level domain reported by the URL object is used
    to find the label directly in front of it. For older Qt versions the
    last two dot separated labels of the host name are returned.

    @param url URL to extract domain from (QUrl)
    @return name of second level domain (string)
    """
    if qVersionTuple() < (4, 8, 0):
        # no reliable top level domain support; fall back to taking the
        # last two labels of the host name
        host = url.host()
        if "." not in host:
            return ""
        return ".".join(host.split(".")[-2:])

    tld = url.topLevelDomain()
    host = url.host()
    if not (tld and host):
        return ""

    # part of the host name in front of the top level domain
    prefix = host[:len(host) - len(tld)]
    if "." not in prefix:
        # host consists of a single label plus the top level domain
        return host

    # keep only the label adjacent to the top level domain
    return prefix.rsplit(".", 1)[1] + tld
|
54 |
|
55 |
|
class AdBlockRule(object):
    """
    Class implementing the AdBlock rule.

    A rule is built from one line of a filter list. The line is parsed into
    a set of flags plus either a plain match string or a regular expression,
    which are used afterwards to check URLs and network requests.
    """
    def __init__(self, filterRule="", subscription=None):
        """
        Constructor

        @param filterRule filter string of the rule (string)
        @param subscription reference to the subscription object
            (AdBlockSubscription)
        """
        self.__subscription = subscription

        self.__regExp = QRegExp()
        self.__resetParsedState()

        self.setFilter(filterRule)

    def __resetParsedState(self):
        """
        Private method to reset all state derived from the filter string.

        It is called before each parse run, so that parsing a new filter
        string does not inherit flags or domains of the previous one.
        """
        self.__options = []
        self.__blockedDomains = []
        self.__allowedDomains = []
        # always defined, so the accessors never raise AttributeError
        self.__matchString = ""
        self.__cssSelector = ""

        self.__enabled = True
        self.__cssRule = False
        self.__exception = False
        self.__internalDisabled = False
        self.__domainRestricted = False
        self.__useRegExp = False
        self.__useDomainMatch = False
        self.__useEndsMatch = False
        self.__thirdParty = False
        self.__thirdPartyException = False
        self.__object = False
        self.__objectException = False
        self.__subdocument = False
        self.__subdocumentException = False
        self.__xmlhttprequest = False
        self.__xmlhttprequestException = False
        self.__document = False
        self.__elemhide = False
        self.__caseSensitivity = Qt.CaseInsensitive

    def subscription(self):
        """
        Public method to get the subscription this rule belongs to.

        @return subscription of the rule (AdBlockSubscription)
        """
        return self.__subscription

    def filter(self):
        """
        Public method to get the rule filter string.

        @return rule filter string (string)
        """
        return self.__filter

    def setFilter(self, filterRule):
        """
        Public method to set the rule filter string.

        The new string is parsed immediately.

        @param filterRule rule filter string (string)
        """
        self.__filter = filterRule
        self.__parseFilter()

    def __parseFilter(self):
        """
        Private method to parse the filter pattern.
        """
        # start from a clean slate; a rule may be parsed more than once
        self.__resetParsedState()

        parsedLine = self.__filter

        # empty rule or just a comment
        if not parsedLine.strip() or \
                parsedLine.startswith(("!", "[Adblock")):
            self.__enabled = False
            return

        # CSS element hiding rule
        if "##" in parsedLine:
            self.__cssRule = True
            pos = parsedLine.find("##")

            # domain restricted rule
            if not parsedLine.startswith("##"):
                domains = parsedLine[:pos]
                self.__parseDomains(domains, ",")

            self.__cssSelector = parsedLine[pos + 2:]
            # CSS rule cannot have more options -> stop parsing
            return

        # Exception always starts with @@
        if parsedLine.startswith("@@"):
            self.__exception = True
            parsedLine = parsedLine[2:]

        # Parse all options following '$' character
        optionsIndex = parsedLine.find("$")
        if optionsIndex >= 0:
            options = parsedLine[optionsIndex + 1:].split(",")

            # If we don't handle all options, it's safer to just disable
            # this rule
            if not self.__parseOptions(options):
                self.__internalDisabled = True
                return

            parsedLine = parsedLine[:optionsIndex]

        # Rule is classic regexp
        if parsedLine.startswith("/") and parsedLine.endswith("/"):
            parsedLine = parsedLine[1:-1]
            self.__useRegExp = True
            self.__regExp = QRegExp(parsedLine, self.__caseSensitivity,
                                    QRegExp.RegExp)
            return

        # Remove starting / ending wildcards
        if parsedLine.startswith("*"):
            parsedLine = parsedLine[1:]
        if parsedLine.endswith("*"):
            parsedLine = parsedLine[:-1]

        # Fast string matching for domain can be used
        if parsedLine.startswith("||") and \
           parsedLine.endswith("^") and \
           QRegExp("[/:?=&\\*]").indexIn(parsedLine) == -1:
            parsedLine = parsedLine[2:-1]
            self.__useDomainMatch = True
            self.__matchString = parsedLine
            return

        # If rule contains '|' only at the end, string matching can be used
        if parsedLine.endswith("|") and \
           QRegExp("[\\^\\*]").indexIn(parsedLine) == -1 and \
           parsedLine.count("|") == 1:
            parsedLine = parsedLine[:-1]
            self.__useEndsMatch = True
            self.__matchString = parsedLine
            return

        # If there is still a wildcard (*) or separator (^) or (|),
        # the rule must be modified to comply with QRegExp.
        if "*" in parsedLine or "^" in parsedLine or "|" in parsedLine:
            pattern = self.__convertPatternToRegExp(parsedLine)
            self.__useRegExp = True
            self.__regExp = QRegExp(pattern, self.__caseSensitivity,
                                    QRegExp.RegExp)
            return

        # no regexp required
        self.__useRegExp = False
        self.__matchString = parsedLine

    def __parseOptions(self, options):
        """
        Private method to evaluate the options following the '$' character.

        All options are evaluated, even if an unknown one was encountered.

        @param options list of option strings (list of string)
        @return flag indicating, that all options were handled (boolean)
        """
        allHandled = True

        for option in options:
            inverted = option.startswith("~")
            if option.startswith("domain="):
                self.__parseDomains(option[7:], "|")
            elif option == "match-case":
                self.__caseSensitivity = Qt.CaseSensitive
            elif option.endswith("third-party"):
                self.__thirdParty = True
                self.__thirdPartyException = inverted
            elif option.endswith("object"):
                self.__object = True
                self.__objectException = inverted
            elif option.endswith("subdocument"):
                self.__subdocument = True
                self.__subdocumentException = inverted
            elif option.endswith("xmlhttprequest"):
                self.__xmlhttprequest = True
                self.__xmlhttprequestException = inverted
            elif option == "document" and self.__exception:
                self.__document = True
            elif option == "elemhide" and self.__exception:
                self.__elemhide = True
            elif option == "collapse":
                # Hiding placeholders of blocked elements; accepted but
                # nothing needs to be recorded for it
                pass
            else:
                allHandled = False

        return allHandled

    def __parseDomains(self, domains, separator):
        """
        Private method to parse a string with a domain list.

        Entries starting with '~' are recorded as blocked domains, all
        others as allowed domains. Empty entries are ignored.

        @param domains list of domains (string)
        @param separator separator character used by the list (string)
        """
        for domain in domains.split(separator):
            if not domain:
                continue
            if domain.startswith("~"):
                # a lone '~' would yield an empty name matching anything
                if domain[1:]:
                    self.__blockedDomains.append(domain[1:])
            else:
                self.__allowedDomains.append(domain)

        self.__domainRestricted = \
            bool(self.__blockedDomains) or bool(self.__allowedDomains)

    def __isMatchingDomain(self, domain, pattern):
        """
        Private method to check a host name against a rule domain.

        The host matches, if it is equal to the rule domain or is a sub
        domain of it. A plain suffix test is not sufficient because
        'notexample.com' ends with 'example.com' as well.

        @param domain host name to be checked (string)
        @param pattern domain given by the rule (string)
        @return flag indicating a match (boolean)
        """
        return domain == pattern or domain.endswith("." + pattern)

    def networkMatch(self, request, domain, encodedUrl):
        """
        Public method to check the rule for a match.

        @param request reference to the network request (QNetworkRequest)
        @param domain domain name (string)
        @param encodedUrl string encoded URL to be checked (string)
        @return flag indicating a match (boolean)
        """
        if self.__cssRule or not self.__enabled or self.__internalDisabled:
            return False

        if self.__useRegExp:
            matched = self.__regExp.indexIn(encodedUrl) != -1
        elif self.__useDomainMatch:
            matched = self.__isMatchingDomain(domain, self.__matchString)
        elif self.__useEndsMatch:
            if self.__caseSensitivity == Qt.CaseInsensitive:
                matched = encodedUrl.lower().endswith(
                    self.__matchString.lower())
            else:
                matched = encodedUrl.endswith(self.__matchString)
        else:
            if self.__caseSensitivity == Qt.CaseInsensitive:
                matched = self.__matchString.lower() in encodedUrl.lower()
            else:
                matched = self.__matchString in encodedUrl

        if matched:
            # check domain restrictions
            if self.__domainRestricted and not self.matchDomain(domain):
                return False

            # check third-party restrictions
            if self.__thirdParty and not self.matchThirdParty(request):
                return False

            # check object restrictions
            if self.__object and not self.matchObject(request):
                return False

            # check subdocument restrictions
            if self.__subdocument and not self.matchSubdocument(request):
                return False

            # check xmlhttprequest restriction
            if self.__xmlhttprequest and \
                    not self.matchXmlHttpRequest(request):
                return False

        return matched

    def urlMatch(self, url):
        """
        Public method to check an URL against the rule.

        Only rules carrying the 'document' or 'elemhide' option can match.

        @param url URL to check (QUrl)
        @return flag indicating a match (boolean)
        """
        if not self.__document and not self.__elemhide:
            return False

        encodedUrl = bytes(url.toEncoded()).decode()
        domain = url.host()
        return self.networkMatch(QNetworkRequest(url), domain, encodedUrl)

    def matchDomain(self, domain):
        """
        Public method to match a domain.

        Blocked domains take precedence over allowed ones. If the rule has
        no allowed domains, every domain not blocked is accepted.

        @param domain domain name to check (string)
        @return flag indicating a match (boolean)
        """
        if not self.__enabled:
            return False

        if not self.__domainRestricted:
            return True

        for dom in self.__blockedDomains:
            if self.__isMatchingDomain(domain, dom):
                return False

        if not self.__allowedDomains:
            return True

        for dom in self.__allowedDomains:
            if self.__isMatchingDomain(domain, dom):
                return True

        return False

    def matchThirdParty(self, req):
        """
        Public slot to match a third-party rule.

        @param req request object to check (QNetworkRequest)
        @return flag indicating a match (boolean)
        """
        # the referer is read from a custom request attribute
        referer = \
            bytes(req.attribute(QNetworkRequest.User + 200, b"")).decode()
        if referer == "":
            return False

        # Third-party matching should be performed on second-level domains
        refererHost = toSecondLevelDomain(QUrl(referer))
        host = toSecondLevelDomain(req.url())

        match = refererHost != host

        if self.__thirdPartyException:
            return not match
        else:
            return match

    def matchObject(self, req):
        """
        Public slot to match an object rule.

        @param req request object to check (QNetworkRequest)
        @return flag indicating a match (boolean)
        """
        # NOTE(review): this reads the same attribute slot (User + 200)
        # that matchThirdParty() interprets as the referer - confirm with
        # the code setting these attributes that this is intended
        match = req.attribute(QNetworkRequest.User + 200) == "object"

        if self.__objectException:
            return not match
        else:
            return match

    def matchSubdocument(self, req):
        """
        Public slot to match a sub-document rule.

        @param req request object to check (QNetworkRequest)
        @return flag indicating a match (boolean)
        """
        originatingFrame = req.originatingObject()
        if originatingFrame is None:
            return False

        page = originatingFrame.page()
        if page is None:
            return False

        # a request not issued by the main frame is a sub-document request
        match = originatingFrame != page.mainFrame()

        if self.__subdocumentException:
            return not match
        else:
            return match

    def matchXmlHttpRequest(self, req):
        """
        Public slot to match a XmlHttpRequest rule.

        @param req request object to check (QNetworkRequest)
        @return flag indicating a match (boolean)
        """
        # the header name is 'X-Requested-With'; compare as bytes because
        # rawHeader() delivers a byte array
        match = \
            bytes(req.rawHeader(b"X-Requested-With")) == b"XMLHttpRequest"

        if self.__xmlhttprequestException:
            return not match
        else:
            return match

    def isException(self):
        """
        Public method to check, if the rule defines an exception.

        @return flag indicating an exception (boolean)
        """
        return self.__exception

    def setException(self, exception):
        """
        Public method to set the rule's exception flag.

        @param exception flag indicating an exception rule (boolean)
        """
        self.__exception = exception

    def isEnabled(self):
        """
        Public method to check, if the rule is enabled.

        @return flag indicating enabled state (boolean)
        """
        return self.__enabled

    def setEnabled(self, enabled):
        """
        Public method to set the rule's enabled state.

        A disabled rule is marked by a leading '!' in its filter string.
        The marker is only added or removed if needed, so repeated calls
        with the same state leave the filter string untouched.

        @param enabled flag indicating the new enabled state (boolean)
        """
        self.__enabled = enabled
        isMarked = self.__filter.startswith("!")
        if not enabled:
            if not isMarked:
                self.__filter = "!" + self.__filter
        elif isMarked:
            self.__filter = self.__filter[1:]

    def isCSSRule(self):
        """
        Public method to check, if the rule is a CSS rule.

        @return flag indicating a CSS rule (boolean)
        """
        return self.__cssRule

    def cssSelector(self):
        """
        Public method to get the CSS selector of the rule.

        @return CSS selector; empty for rules without one (string)
        """
        return self.__cssSelector

    def isDocument(self):
        """
        Public method to check, if this is a document rule.

        @return flag indicating a document rule (boolean)
        """
        return self.__document

    def isElementHiding(self):
        """
        Public method to check, if this is an element hiding rule.

        @return flag indicating an element hiding rule (boolean)
        """
        return self.__elemhide

    def isDomainRestricted(self):
        """
        Public method to check, if this rule is restricted by domain.

        @return flag indicating a domain restriction (boolean)
        """
        return self.__domainRestricted

    def isComment(self):
        """
        Public method to check, if this is a comment.

        @return flag indicating a comment (boolean)
        """
        return self.__filter.startswith("!")

    def isHeader(self):
        """
        Public method to check, if this is a header.

        @return flag indicating a header (boolean)
        """
        return self.__filter.startswith("[Adblock")

    def isSlow(self):
        """
        Public method to check, if this is a slow rule.

        Rules requiring a regular expression are regarded as slow.

        @return flag indicating a slow rule (boolean)
        """
        return self.__useRegExp

    def isInternalDisabled(self):
        """
        Public method to check, if this rule was disabled internally.

        @return flag indicating an internally disabled rule (boolean)
        """
        return self.__internalDisabled

    def __convertPatternToRegExp(self, wildcardPattern):
        """
        Private method to convert a wildcard pattern to a regular expression.

        @param wildcardPattern string containing the wildcard pattern (string)
        @return string containing a regular expression (string)
        """
        pattern = wildcardPattern

        # remove multiple wildcards
        pattern = re.sub(r"\*+", "*", pattern)
        # remove anchors following separator placeholder
        pattern = re.sub(r"\^\|$", "^", pattern)
        # remove leading wildcards
        pattern = re.sub(r"^(\*)", "", pattern)
        # remove trailing wildcards
        pattern = re.sub(r"(\*)$", "", pattern)
        # escape special symbols
        pattern = re.sub(r"(\W)", r"\\\1", pattern)
        # The next two replacements insert text containing '\w' and '\d'.
        # Passed as a replacement template this raises re.error for
        # Python >= 3.7 (unknown escape), therefore a callable delivering
        # the literal text is used instead.
        # process extended anchor at expression start
        pattern = re.sub(
            r"^\\\|\\\|",
            lambda _match: r"^[\w\-]+:\/+(?!\/)(?:[^\/]+\.)?", pattern)
        # process separator placeholders
        pattern = re.sub(
            r"\\\^", lambda _match: r"(?:[^\w\d\-.%]|$)", pattern)
        # process anchor at expression start
        pattern = re.sub(r"^\\\|", "^", pattern)
        # process anchor at expression end
        pattern = re.sub(r"\\\|$", "$", pattern)
        # replace wildcards by .*
        pattern = re.sub(r"\\\*", ".*", pattern)

        return pattern