diff -r 2fed7bc4ad83 -r d8c45fe8a1b9 Helpviewer/AdBlock/AdBlockRule.py --- a/Helpviewer/AdBlock/AdBlockRule.py Thu Jul 26 18:38:15 2012 +0200 +++ b/Helpviewer/AdBlock/AdBlockRule.py Sat Jul 28 11:23:12 2012 +0200 @@ -9,7 +9,44 @@ import re -from PyQt4.QtCore import Qt, QRegExp, QUrl +from PyQt4.QtCore import Qt, QRegExp, QUrl, qVersion +from PyQt4.QtNetwork import QNetworkRequest + + +# Qt version < 4.8 has an issue; it will wrongly +# count .co.uk (and others) as second-level domains +def toSecondLevelDomain(url): + """ + Module function to get a second level domain from the given URL. + + @param url URL to extract domain from (QUrl) + @return name of second level domain (string) + """ + if qVersion() >= "4.8.0": + topLevelDomain = url.topLevelDomain() + urlHost = url.host() + + if not topLevelDomain or not urlHost: + return "" + + domain = urlHost[:len(urlHost) - len(topLevelDomain)] + if domain.count(".") == 0: + return urlHost + + while domain.count(".") != 0: + domain = domain[domain.find(".") + 1:] + + return domain + topLevelDomain + else: + domain = url.host() + + if domain.count(".") == 0: + return "" + + while domain.count(".") != 1: + domain = domain[domain.find(".") + 1:] + + return domain class AdBlockRule(object): @@ -22,6 +59,28 @@ """ self.__regExp = QRegExp() self.__options = [] + self.__blockedDomains = [] + self.__allowedDomains = [] + + self.__enabled = True + self.__cssRule = False + self.__exception = False + self.__internalDisabled = False + self.__domainRestricted = False + self.__useRegExp = False + self.__useDomainMatch = False + self.__useEndsMatch = False + self.__thirdParty = False + self.__thirdPartyException = False + self.__object = False + self.__objectException = False + self.__subdocument = False + self.__subdocumentException = False + self.__xmlhttprequest = False + self.__xmlhttprequestException = False + self.__document = False + self.__elemhide = False + self.__caseSensitivity = Qt.CaseInsensitive self.setFilter(filter) @@ -40,77 +99,317 @@ @param filter rule filter string (string) """ self.__filter = filter + self.__parseFilter() + + def __parseFilter(self): + """ + Private method to parse the filter pattern. + """ + parsedLine = self.__filter - self.__cssRule = False - self.__enabled = True - self.__exception = False - regExpRule = False + # empty rule or just a comment + if not parsedLine.strip() or parsedLine.startswith("!"): + self.__enabled = False + return - if filter.startswith("!") or not filter.strip(): - self.__enabled = False + # CSS element hiding rule + if "##" in parsedLine: + self.__cssRule = True + pos = parsedLine.find("##") + + # domain restricted rule + if not parsedLine.startswith("##"): + domains = parsedLine[:pos] + self.__parseDomains(domains, ",") + + self.__cssSelector = parsedLine[pos + 2:] + # CSS rule cannot have more options -> stop parsing + return - if "##" in filter: - self.__cssRule = True - - parsedLine = filter + # Exception always starts with @@ if parsedLine.startswith("@@"): self.__exception = True parsedLine = parsedLine[2:] - if parsedLine.startswith("/"): - if parsedLine.endswith("/"): - parsedLine = parsedLine[1:-1] - regExpRule = True + + # Parse all options following '$' character + optionsIndex = parsedLine.find("$") + if optionsIndex >= 0: + options = parsedLine[optionsIndex + 1:].split(",") + + handledOptions = 0 + for option in options: + if option.startswith("domain="): + self.__parseDomains(option[7:], "|") + handledOptions += 1 + elif option == "match-case": + self.__caseSensitivity = Qt.CaseSensitive + handledOptions += 1 + elif option.endswith("third-party"): + self.__thirdParty = True + self.__thirdPartyException = option.startswith("~") + handledOptions += 1 + elif option.endswith("object"): + self.__object = True + self.__objectException = option.startswith("~") + handledOptions += 1 + elif option.endswith("subdocument"): + self.__subdocument = True + self.__subdocumentException = option.startswith("~") + handledOptions += 1 + elif option.endswith("xmlhttprequest"): + self.__xmlhttprequest = True + self.__xmlhttprequestException = option.startswith("~") + handledOptions += 1 + elif option == "document" and self.__exception: + self.__document = True + handledOptions += 1 + elif option == "elemhide" and self.__exception: + self.__elemhide = True + handledOptions += 1 + elif option == "collapse": + # Hiding placeholders of blocked elements + handledOptions += 1 + + # If we don't handle all options, it's safer to just disable this rule + if handledOptions != len(options): + self.__internalDisabled = True + return + + parsedLine = parsedLine[:optionsIndex] + + # Rule is classic regexp + if parsedLine.startswith("/") and parsedLine.endswith("/"): + parsedLine = parsedLine[1:-1] + self.__useRegExp = True + self.__regExp = QRegExp(parsedLine, self.__caseSensitivity, + QRegExp.RegExp) + return - options = parsedLine.find("$") - if options >= 0: - try: - self.__options = parsedLine[options + 1:].split(",") - except IndexError: - self.__options = [] - parsedLine = parsedLine[:options] + # Remove starting / ending wildcards + if parsedLine.startswith("*"): + parsedLine = parsedLine[1:] + if parsedLine.endswith("*"): + parsedLine = parsedLine[:-1] + + # Fast string matching for domain can be used + if parsedLine.startswith("||") and \ + parsedLine.endswith("^") and \ + QRegExp("[/:?=&\\*]").indexIn(parsedLine) == -1: + parsedLine = parsedLine[2:-1] + self.__useDomainMatch = True + self.__matchString = parsedLine + return + + # If rule contains '|' only at the end, string matching can be used + if parsedLine.endswith("|") and \ + QRegExp("[\\^\\*]").indexIn(parsedLine) == -1 and \ + parsedLine.count("|") == 1: + parsedLine = parsedLine[:-1] + self.__useEndsMatch = True + self.__matchString = parsedLine + return - self.setPattern(parsedLine, regExpRule) + # If there is still a wildcard (*) or separator (^) or (|), + # the rule must be modified to comply with QRegExp. + if "*" in parsedLine or "^" in parsedLine or "|" in parsedLine: + pattern = self.__convertPatternToRegExp(parsedLine) + self.__useRegExp = True + self.__regExp = QRegExp(pattern, self.__caseSensitivity, QRegExp.RegExp) + return + + # no regexp required + self.__useRegExp = False + self.__matchString = parsedLine + + def __parseDomains(self, domains, separator): + """ + Private method to parse a string with a domain list. - if "match-case" in self.__options: - self.__regExp.setCaseSensitivity(Qt.CaseSensitive) - self.__options.remove("match-case") + @param domains list of domains (string) + @param separator separator character used by the list (string) + """ + domainsList = domains.split(separator) + + for domain in domainsList: + if not domain: + continue + if domain.startswith("~"): + self.__blockedDomains.append(domain[1:]) + else: + self.__allowedDomains.append(domain) + + self.__domainRestricted = \ + bool(self.__blockedDomains) or bool(self.__allowedDomains) - def networkMatch(self, encodedUrl): + def networkMatch(self, request, domain, encodedUrl): """ Public method to check the rule for a match. + @param request reference to the network request (QNetworkRequest) + @param domain domain name (string) @param encodedUrl string encoded URL to be checked (string) @return flag indicating a match (boolean) """ - if self.__cssRule: + if self.__cssRule or not self.__enabled or self.__internalDisabled: return False + matched = False + + if self.__useRegExp: + matched = self.__regExp.indexIn(encodedUrl) != -1 + elif self.__useDomainMatch: + matched = domain.endswith(self.__matchString) + elif self.__useEndsMatch: + if self.__caseSensitivity == Qt.CaseInsensitive: + matched = encodedUrl.lower().endswith(self.__matchString.lower()) + else: + matched = encodedUrl.endswith(self.__matchString) + else: + if self.__caseSensitivity == Qt.CaseInsensitive: + matched = self.__matchString.lower() in encodedUrl.lower() + else: + matched = self.__matchString in encodedUrl + + if matched: + # check domain restrictions + if self.__domainRestricted and not self.matchDomain(domain): + return False + + # check third-party restrictions + if self.__thirdParty and not self.matchThirdParty(request): + return False + + # check object restrictions + if self.__object and not self.matchObject(request): + return False + + # check subdocument restrictions + if self.__subdocument and not self.matchSubdocument(request): + return False + + # check xmlhttprequest restriction + if self.__xmlhttprequest and not self.matchXmlHttpRequest(request): + return False + + return matched + + def urlMatch(self, url): + """ + Public method to check an URL against the rule. + + @param url URL to check (QUrl) + @return flag indicating a match (boolean) + """ + if not self.__document and not self.__elemhide: + return False + + encodedUrl = bytes(url.toEncoded()).decode() + domain = url.host() + return self.networkMatch(QNetworkRequest(url), domain, encodedUrl) + + def matchDomain(self, domain): + """ + Public method to match a domain. + + @param domain domain name to check (string) + @return flag indicating a match (boolean) + """ if not self.__enabled: return False - matched = self.__regExp.indexIn(encodedUrl) != -1 + if not self.__domainRestricted: + return True - if matched and not len(self.__options) == 0: - # only domain rules are supported - if len(self.__options) == 1: - for option in self.__options: - if option.startswith("domain="): - url = QUrl.fromEncoded(encodedUrl) - host = url.host() - domainOptions = option[7:].split("|") - for domainOption in domainOptions: - negate = domainOption.startswith("~") - if negate: - domainOption = domainOption[1:] - hostMatched = domainOption == host - if hostMatched and not negate: - return True - if not hostMatched and negate: - return True - + if len(self.__blockedDomains) == 0: + for dom in self.__allowedDomains: + if domain.endswith(dom): + return True + elif len(self.__allowedDomains) == 0: + for dom in self.__blockedDomains: + if domain.endswith(dom): + return False + return True + else: + for dom in self.__blockedDomains: + if domain.endswith(dom): + return False + for dom in self.__allowedDomains: + if domain.endswith(dom): + return True + + return False + + def matchThirdParty(self, req): + """ + Public slot to match a third-party rule. + + @param req request object to check (QNetworkRequest) + @return flag indicating a match (boolean) + """ + referer = bytes(req.attribute(QNetworkRequest.User + 200, "")).decode() + if referer == "": return False - return matched + # Third-party matching should be performed on second-level domains + refererHost = toSecondLevelDomain(QUrl(referer)) + host = toSecondLevelDomain(req.url()) + + match = refererHost != host + + if self.__thirdPartyException: + return not match + else: + return match + + def matchObject(self, req): + """ + Public slot to match an object rule. + + @param req request object to check (QNetworkRequest) + @return flag indicating a match (boolean) + """ + match = req.attribute(QNetworkRequest.User + 200) == "object" + + if self.__objectException: + return not match + else: + return match + + def matchSubdocument(self, req): + """ + Public slot to match a sub-document rule. + + @param req request object to check (QNetworkRequest) + @return flag indicating a match (boolean) + """ + originatingFrame = req.originatingObject() + if originatingFrame is None: + return False + + page = originatingFrame.page() + if page is None: + return False + + match = originatingFrame != page.mainFrame() + + if self.__subdocumentException: + return not match + else: + return match + + def matchXmlHttpRequest(self, req): + """ + Public slot to match a XmlHttpRequest rule. + + @param req request object to check (QNetworkRequest) + @return flag indicating a match (boolean) + """ + match = req.rawHeader("X-Request-With") == "XMLHttpRequest" + + if self.__xmlhttprequestException: + return not match + else: + return match def isException(self): """ @@ -156,13 +455,61 @@ """ return self.__cssRule - def regExpPattern(self): + def cssSelector(self): + """ + Public method to get the CSS selector of the rule. + + @return CSS selector (string) + """ + return self.__cssSelector + + def isDocument(self): + """ + Public method to check, if this is a document rule. + + @return flag indicating a document rule (boolean) """ - Public method to get the regexp pattern of the rule. + return self.__document + + def isElementHiding(self): + """ + Public method to check, if this is an element hiding rule. + + @return flag indicating an element hiding rule (boolean) + """ + return self.__elemhide + + def isDomainRestricted(self): + """ + Public method to check, if this rule is restricted by domain. - @return regexp pattern (QRegExp) + @return flag indicating a domain restriction (boolean) + """ + return self.__domainRestricted + + def isComment(self): + """ + Public method to check, if this is a comment. + + @return flag indicating a comment (boolean) + """ + return self.__filter.startswith("!") + + def isSlow(self): """ - return self.__regExp.pattern() + Public method to check, if this is a slow rule. + + @return flag indicating a slow rule (boolean) + """ + return self.__useRegExp + + def isInternalDisabled(self): + """ + Public method to check, if this rule was disabled internally. + + @return flag indicating an internally disabled rule (boolean) + """ + return self.__internalDisabled def __convertPatternToRegExp(self, wildcardPattern): """ @@ -173,30 +520,19 @@ """ pattern = wildcardPattern - pattern = re.sub(r"\*+", "*", pattern) # remove multiple wildcards - pattern = re.sub(r"\^\|$", "^", pattern) # remove anchors following separator placeholder - pattern = re.sub(r"^(\*)", "", pattern) # remove leading wildcards - pattern = re.sub(r"(\*)$", "", pattern) # remove trailing wildcards - pattern = re.sub(r"(\W)", r"\\\1", pattern) # escape special symbols + pattern = re.sub(r"\*+", "*", pattern) # remove multiple wildcards + pattern = re.sub(r"\^\|$", "^", pattern) # remove anchors following separator + # placeholder + pattern = re.sub(r"^(\*)", "", pattern) # remove leading wildcards + pattern = re.sub(r"(\*)$", "", pattern) # remove trailing wildcards + pattern = re.sub(r"(\W)", r"\\\1", pattern) # escape special symbols pattern = re.sub(r"^\\\|\\\|", - r"^[\w\-]+:\/+(?!\/)(?:[^\/]+\.)?", pattern) # process extended anchor at expression start + r"^[\w\-]+:\/+(?!\/)(?:[^\/]+\.)?", pattern) # process extended anchor at + # expression start pattern = re.sub(r"\\\^", - r"(?:[^\w\d\-.%]|$)", pattern) # process separator placeholders - pattern = re.sub(r"^\\\|", "^", pattern) # process anchor at expression start - pattern = re.sub(r"\\\|$", "$", pattern) # process anchor at expression end - pattern = re.sub(r"\\\*", ".*", pattern) # replace wildcards by .* + r"(?:[^\w\d\-.%]|$)", pattern) # process separator placeholders + pattern = re.sub(r"^\\\|", "^", pattern) # process anchor at expression start + pattern = re.sub(r"\\\|$", "$", pattern) # process anchor at expression end + pattern = re.sub(r"\\\*", ".*", pattern) # replace wildcards by .* return pattern - - def setPattern(self, pattern, isRegExp): - """ - Public method to set the rule pattern. - - @param pattern string containing the pattern (string) - @param isRegExp flag indicating a reg exp pattern (boolean) - """ - if isRegExp: - self.__regExp = QRegExp(pattern, Qt.CaseInsensitive, QRegExp.RegExp2) - else: - self.__regExp = QRegExp(self.__convertPatternToRegExp(pattern), - Qt.CaseInsensitive, QRegExp.RegExp2)