Helpviewer/AdBlock/AdBlockRule.py

changeset 1960
d8c45fe8a1b9
parent 1950
4f004ec74b48
child 1963
9c5b3235abf9
equal deleted inserted replaced
1957:2fed7bc4ad83 1960:d8c45fe8a1b9
7 Module implementing the AdBlock rule class. 7 Module implementing the AdBlock rule class.
8 """ 8 """
9 9
10 import re 10 import re
11 11
12 from PyQt4.QtCore import Qt, QRegExp, QUrl 12 from PyQt4.QtCore import Qt, QRegExp, QUrl, qVersion
13 from PyQt4.QtNetwork import QNetworkRequest
14
15
16 # Qt version < 4.8 has an issue; it will wrongly
17 # count .co.uk (and others) as second-level domains
def toSecondLevelDomain(url):
    """
    Module function to get a second level domain from the given URL.

    With Qt 4.8 or newer the top level domain reported by the URL object
    is used; for older versions the last two labels of the host name are
    returned.

    @param url URL to extract domain from (QUrl)
    @return name of second level domain (string)
    """
    # Compare the version numerically; a plain string comparison would
    # rank e.g. "4.10.0" below "4.8.0".
    try:
        version = tuple(int(part) for part in qVersion().split(".")[:2])
    except ValueError:
        # unparsable version string -> use the manual fallback below
        version = (0, 0)

    if version >= (4, 8):
        topLevelDomain = url.topLevelDomain()
        urlHost = url.host()

        if not topLevelDomain or not urlHost:
            return ""

        # part of the host name in front of the top level domain
        domain = urlHost[:len(urlHost) - len(topLevelDomain)]
        if "." not in domain:
            return urlHost

        # keep only the label directly in front of the top level domain
        return domain.rsplit(".", 1)[-1] + topLevelDomain

    domain = url.host()
    if "." not in domain:
        return ""

    # keep the last two labels of the host name
    return ".".join(domain.split(".")[-2:])
13 50
14 51
15 class AdBlockRule(object): 52 class AdBlockRule(object):
16 """ 53 """
17 Class implementing the AdBlock rule. 54 Class implementing the AdBlock rule.
20 """ 57 """
21 Constructor 58 Constructor
22 """ 59 """
23 self.__regExp = QRegExp() 60 self.__regExp = QRegExp()
24 self.__options = [] 61 self.__options = []
62 self.__blockedDomains = []
63 self.__allowedDomains = []
64
65 self.__enabled = True
66 self.__cssRule = False
67 self.__exception = False
68 self.__internalDisabled = False
69 self.__domainRestricted = False
70 self.__useRegExp = False
71 self.__useDomainMatch = False
72 self.__useEndsMatch = False
73 self.__thirdParty = False
74 self.__thirdPartyException = False
75 self.__object = False
76 self.__objectException = False
77 self.__subdocument = False
78 self.__subdocumentException = False
79 self.__xmlhttprequest = False
80 self.__xmlhttprequestException = False
81 self.__document = False
82 self.__elemhide = False
83 self.__caseSensitivity = Qt.CaseInsensitive
25 84
26 self.setFilter(filter) 85 self.setFilter(filter)
27 86
28 def filter(self): 87 def filter(self):
29 """ 88 """
38 Public method to set the rule filter string. 97 Public method to set the rule filter string.
39 98
40 @param filter rule filter string (string) 99 @param filter rule filter string (string)
41 """ 100 """
42 self.__filter = filter 101 self.__filter = filter
43 102 self.__parseFilter()
44 self.__cssRule = False 103
def __parseFilter(self):
    """
    Private method to parse the filter pattern.

    All parsing results are reset before the filter string is analysed,
    so the outcome does not depend on a previously parsed filter.
    """
    parsedLine = self.__filter

    # Forget everything derived from a previously parsed filter; without
    # this a second call would inherit stale flags and domain lists.
    self.__regExp = QRegExp()
    self.__blockedDomains = []
    self.__allowedDomains = []
    self.__matchString = ""
    self.__cssSelector = ""
    self.__enabled = True
    self.__cssRule = False
    self.__exception = False
    self.__internalDisabled = False
    self.__domainRestricted = False
    self.__useRegExp = False
    self.__useDomainMatch = False
    self.__useEndsMatch = False
    self.__thirdParty = False
    self.__thirdPartyException = False
    self.__object = False
    self.__objectException = False
    self.__subdocument = False
    self.__subdocumentException = False
    self.__xmlhttprequest = False
    self.__xmlhttprequestException = False
    self.__document = False
    self.__elemhide = False
    self.__caseSensitivity = Qt.CaseInsensitive

    # empty rule or just a comment
    if not parsedLine.strip() or parsedLine.startswith("!"):
        self.__enabled = False
        return

    # CSS element hiding rule
    if "##" in parsedLine:
        self.__cssRule = True
        domains, _, selector = parsedLine.partition("##")

        # domain restricted rule
        if domains:
            self.__parseDomains(domains, ",")

        self.__cssSelector = selector
        # CSS rule cannot have more options -> stop parsing
        return

    # Exception always starts with @@
    if parsedLine.startswith("@@"):
        self.__exception = True
        parsedLine = parsedLine[2:]

    # Parse all options following '$' character
    optionsIndex = parsedLine.find("$")
    if optionsIndex >= 0:
        options = parsedLine[optionsIndex + 1:].split(",")

        handledOptions = 0
        for option in options:
            if option.startswith("domain="):
                self.__parseDomains(option[7:], "|")
                handledOptions += 1
            elif option == "match-case":
                self.__caseSensitivity = Qt.CaseSensitive
                handledOptions += 1
            elif option.endswith("third-party"):
                self.__thirdParty = True
                self.__thirdPartyException = option.startswith("~")
                handledOptions += 1
            elif option.endswith("object"):
                self.__object = True
                self.__objectException = option.startswith("~")
                handledOptions += 1
            elif option.endswith("subdocument"):
                self.__subdocument = True
                self.__subdocumentException = option.startswith("~")
                handledOptions += 1
            elif option.endswith("xmlhttprequest"):
                self.__xmlhttprequest = True
                self.__xmlhttprequestException = option.startswith("~")
                handledOptions += 1
            elif option == "document" and self.__exception:
                self.__document = True
                handledOptions += 1
            elif option == "elemhide" and self.__exception:
                self.__elemhide = True
                handledOptions += 1
            elif option == "collapse":
                # Hiding placeholders of blocked elements
                handledOptions += 1

        # If we don't handle all options, it's safer to just disable
        # this rule
        if handledOptions != len(options):
            self.__internalDisabled = True
            return

        parsedLine = parsedLine[:optionsIndex]

    # Rule is classic regexp
    if parsedLine.startswith("/") and parsedLine.endswith("/"):
        parsedLine = parsedLine[1:-1]
        self.__useRegExp = True
        self.__regExp = QRegExp(parsedLine, self.__caseSensitivity,
                                QRegExp.RegExp)
        return

    # Remove starting / ending wildcards
    if parsedLine.startswith("*"):
        parsedLine = parsedLine[1:]
    if parsedLine.endswith("*"):
        parsedLine = parsedLine[:-1]

    # Fast string matching for domain can be used
    if parsedLine.startswith("||") and \
       parsedLine.endswith("^") and \
       not any(char in parsedLine for char in "/:?=&*"):
        self.__useDomainMatch = True
        self.__matchString = parsedLine[2:-1]
        return

    # If rule contains '|' only at the end, string matching can be used
    if parsedLine.endswith("|") and \
       "^" not in parsedLine and \
       "*" not in parsedLine and \
       parsedLine.count("|") == 1:
        self.__useEndsMatch = True
        self.__matchString = parsedLine[:-1]
        return

    # If there is still a wildcard (*) or separator (^) or (|),
    # the rule must be modified to comply with QRegExp.
    if "*" in parsedLine or "^" in parsedLine or "|" in parsedLine:
        pattern = self.__convertPatternToRegExp(parsedLine)
        self.__useRegExp = True
        self.__regExp = QRegExp(pattern, self.__caseSensitivity,
                                QRegExp.RegExp)
        return

    # no regexp required
    self.__useRegExp = False
    self.__matchString = parsedLine
223
def __parseDomains(self, domains, separator):
    """
    Private method to parse a string with a domain list.

    Entries starting with '~' are added to the list of blocked domains,
    all others to the list of allowed domains. Entries that are empty
    once surrounding whitespace and the '~' marker are removed are
    ignored.

    @param domains list of domains (string)
    @param separator separator character used by the list (string)
    """
    for entry in domains.split(separator):
        entry = entry.strip()
        negated = entry.startswith("~")
        name = entry[1:].strip() if negated else entry

        # An empty name would match every host by suffix, so a lone '~'
        # must not end up in the blocked list.
        if not name:
            continue

        if negated:
            self.__blockedDomains.append(name)
        else:
            self.__allowedDomains.append(name)

    self.__domainRestricted = \
        bool(self.__blockedDomains) or bool(self.__allowedDomains)
243
def networkMatch(self, request, domain, encodedUrl):
    """
    Public method to check the rule for a match.

    The URL (or, for pure domain rules, the domain) is checked against
    the rule pattern first. Only if that matches, the domain,
    third-party, object, subdocument and xmlhttprequest restrictions of
    the rule are evaluated.

    @param request reference to the network request (QNetworkRequest)
    @param domain domain name (string)
    @param encodedUrl string encoded URL to be checked (string)
    @return flag indicating a match (boolean)
    """
    if self.__cssRule or not self.__enabled or self.__internalDisabled:
        return False

    if self.__useRegExp:
        matched = self.__regExp.indexIn(encodedUrl) != -1
    elif self.__useDomainMatch:
        # The rule domain must be the host itself or a parent domain of
        # it; a plain suffix test would let "badexample.com" match a
        # rule for "example.com".
        matched = domain == self.__matchString or \
            domain.endswith("." + self.__matchString)
    elif self.__useEndsMatch:
        if self.__caseSensitivity == Qt.CaseInsensitive:
            matched = encodedUrl.lower().endswith(
                self.__matchString.lower())
        else:
            matched = encodedUrl.endswith(self.__matchString)
    else:
        if self.__caseSensitivity == Qt.CaseInsensitive:
            matched = self.__matchString.lower() in encodedUrl.lower()
        else:
            matched = self.__matchString in encodedUrl

    if not matched:
        return False

    # check domain restrictions
    if self.__domainRestricted and not self.matchDomain(domain):
        return False

    # check third-party restrictions
    if self.__thirdParty and not self.matchThirdParty(request):
        return False

    # check object restrictions
    if self.__object and not self.matchObject(request):
        return False

    # check subdocument restrictions
    if self.__subdocument and not self.matchSubdocument(request):
        return False

    # check xmlhttprequest restriction
    if self.__xmlhttprequest and not self.matchXmlHttpRequest(request):
        return False

    return True
295
def urlMatch(self, url):
    """
    Public method to check an URL against the rule.

    Only rules carrying the document or elemhide option can match; for
    those the check is delegated to networkMatch() with a request
    created from the URL.

    @param url URL to check (QUrl)
    @return flag indicating a match (boolean)
    """
    if self.__document or self.__elemhide:
        urlString = bytes(url.toEncoded()).decode()
        host = url.host()
        return self.networkMatch(QNetworkRequest(url), host, urlString)

    return False
309
def matchDomain(self, domain):
    """
    Public method to match a domain.

    A rule domain matches, if it is equal to the given domain or a
    parent domain of it. Blocked domains take precedence over allowed
    ones.

    @param domain domain name to check (string)
    @return flag indicating a match (boolean)
    """
    if not self.__enabled:
        return False

    if not self.__domainRestricted:
        return True

    def coveredBy(ruleDomain):
        # Match on label boundaries only; a plain suffix test would
        # treat "badexample.com" as belonging to "example.com".
        return domain == ruleDomain or domain.endswith("." + ruleDomain)

    if any(coveredBy(dom) for dom in self.__blockedDomains):
        return False

    if self.__allowedDomains:
        return any(coveredBy(dom) for dom in self.__allowedDomains)

    # only blocked domains were given and none of them matched
    return bool(self.__blockedDomains)
341
def matchThirdParty(self, req):
    """
    Public slot to match a third-party rule.

    The referrer is read from the request attribute
    QNetworkRequest.User + 200. A request without this information
    never matches.

    @param req request object to check (QNetworkRequest)
    @return flag indicating a match (boolean)
    """
    referer = req.attribute(QNetworkRequest.User + 200, "")
    if not referer:
        # attribute not set (or empty) -> nothing to compare against
        return False

    # The attribute may come back as a byte sequence or as a string;
    # calling bytes() on a string would raise a TypeError.
    if not isinstance(referer, str):
        referer = bytes(referer).decode()

    # Third-party matching should be performed on second-level domains
    refererHost = toSecondLevelDomain(QUrl(referer))
    host = toSecondLevelDomain(req.url())

    match = refererHost != host

    if self.__thirdPartyException:
        return not match
    else:
        return match
363
def matchObject(self, req):
    """
    Public slot to match an object rule.

    The request matches, if its attribute QNetworkRequest.User + 200
    equals "object"; for an inverted (~object) rule the result is
    negated.

    @param req request object to check (QNetworkRequest)
    @return flag indicating a match (boolean)
    """
    # NOTE(review): matchThirdParty() reads the same attribute id as the
    # referrer - confirm that this id is the intended one here.
    isObject = req.attribute(QNetworkRequest.User + 200) == "object"
    return (not isObject) if self.__objectException else isObject
377
def matchSubdocument(self, req):
    """
    Public slot to match a sub-document rule.

    The request matches, if its originating frame is not the main frame
    of the page it belongs to; for an inverted (~subdocument) rule the
    result is negated. Requests without an originating frame or page
    never match.

    @param req request object to check (QNetworkRequest)
    @return flag indicating a match (boolean)
    """
    frame = req.originatingObject()
    page = frame.page() if frame is not None else None
    if page is None:
        return False

    inSubFrame = frame != page.mainFrame()
    return (not inSubFrame) if self.__subdocumentException else inSubFrame
399
def matchXmlHttpRequest(self, req):
    """
    Public slot to match a XmlHttpRequest rule.

    The request matches, if its raw header "X-Requested-With" equals
    "XMLHttpRequest"; for an inverted (~xmlhttprequest) rule the result
    is negated.

    @param req request object to check (QNetworkRequest)
    @return flag indicating a match (boolean)
    """
    # The header is named "X-Requested-With"; looking up the misspelled
    # "X-Request-With" never finds it.
    match = req.rawHeader("X-Requested-With") == "XMLHttpRequest"

    if self.__xmlhttprequestException:
        return not match
    else:
        return match
114 413
115 def isException(self): 414 def isException(self):
116 """ 415 """
117 Public method to check, if the rule defines an exception. 416 Public method to check, if the rule defines an exception.
118 417
154 453
155 @return flag indicating a CSS rule (boolean) 454 @return flag indicating a CSS rule (boolean)
156 """ 455 """
157 return self.__cssRule 456 return self.__cssRule
158 457
def cssSelector(self):
    """
    Public method to get the CSS selector of the rule.

    @return CSS selector; empty, if the rule is not a CSS rule (string)
    """
    try:
        return self.__cssSelector
    except AttributeError:
        # The selector is only assigned while parsing a CSS element
        # hiding rule; other rules simply have none.
        return ""
465
def isDocument(self):
    """
    Public method to check, if the rule carries the document option.

    @return flag indicating a document rule (boolean)
    """
    documentRule = self.__document
    return documentRule
473
def isElementHiding(self):
    """
    Public method to check, if the rule carries the elemhide option.

    @return flag indicating an element hiding rule (boolean)
    """
    elementHidingRule = self.__elemhide
    return elementHidingRule
481
def isDomainRestricted(self):
    """
    Public method to check, if the rule applies to certain domains only.

    @return flag indicating a domain restriction (boolean)
    """
    restricted = self.__domainRestricted
    return restricted
489
def isComment(self):
    """
    Public method to check, if the filter string is a comment, i.e.
    starts with an exclamation mark.

    @return flag indicating a comment (boolean)
    """
    firstCharacter = self.__filter[:1]
    return firstCharacter == "!"
497
def isSlow(self):
    """
    Public method to check, if this is a slow rule, i.e. one that is
    matched with a regular expression.

    @return flag indicating a slow rule (boolean)
    """
    needsRegExp = self.__useRegExp
    return needsRegExp
505
def isInternalDisabled(self):
    """
    Public method to check, if the rule was switched off by the parser.

    @return flag indicating an internally disabled rule (boolean)
    """
    disabledByParser = self.__internalDisabled
    return disabledByParser
166 513
def __convertPatternToRegExp(self, wildcardPattern):
    """
    Private method to convert a wildcard pattern to a regular expression.

    @param wildcardPattern string containing the wildcard pattern (string)
    @return string containing a regular expression (string)
    """
    # Regular expression fragments that are inserted verbatim. They are
    # passed to re.sub() through a callable, because a replacement
    # template containing escapes like \w or \d is rejected with
    # "bad escape" by newer Python versions.
    extendedAnchor = r"^[\w\-]+:\/+(?!\/)(?:[^\/]+\.)?"
    separator = r"(?:[^\w\d\-.%]|$)"

    pattern = wildcardPattern

    # remove multiple wildcards
    pattern = re.sub(r"\*+", "*", pattern)
    # remove anchors following separator placeholder
    pattern = re.sub(r"\^\|$", "^", pattern)
    # remove leading wildcards
    pattern = re.sub(r"^(\*)", "", pattern)
    # remove trailing wildcards
    pattern = re.sub(r"(\*)$", "", pattern)
    # escape special symbols
    pattern = re.sub(r"(\W)", r"\\\1", pattern)
    # process extended anchor at expression start
    pattern = re.sub(r"^\\\|\\\|", lambda match: extendedAnchor, pattern)
    # process separator placeholders
    pattern = re.sub(r"\\\^", lambda match: separator, pattern)
    # process anchor at expression start
    pattern = re.sub(r"^\\\|", "^", pattern)
    # process anchor at expression end
    pattern = re.sub(r"\\\|$", "$", pattern)
    # replace wildcards by .*
    pattern = re.sub(r"\\\*", ".*", pattern)

    return pattern
190
191 def setPattern(self, pattern, isRegExp):
192 """
193 Public method to set the rule pattern.
194
195 @param pattern string containing the pattern (string)
196 @param isRegExp flag indicating a reg exp pattern (boolean)
197 """
198 if isRegExp:
199 self.__regExp = QRegExp(pattern, Qt.CaseInsensitive, QRegExp.RegExp2)
200 else:
201 self.__regExp = QRegExp(self.__convertPatternToRegExp(pattern),
202 Qt.CaseInsensitive, QRegExp.RegExp2)

eric ide

mercurial