Helpviewer/AdBlock/AdBlockRule.py

changeset 1960
d8c45fe8a1b9
parent 1950
4f004ec74b48
child 1963
9c5b3235abf9
diff -r 2fed7bc4ad83 -r d8c45fe8a1b9 Helpviewer/AdBlock/AdBlockRule.py
--- a/Helpviewer/AdBlock/AdBlockRule.py	Thu Jul 26 18:38:15 2012 +0200
+++ b/Helpviewer/AdBlock/AdBlockRule.py	Sat Jul 28 11:23:12 2012 +0200
@@ -9,7 +9,44 @@
 
 import re
 
-from PyQt4.QtCore import Qt, QRegExp, QUrl
+from PyQt4.QtCore import Qt, QRegExp, QUrl, qVersion
+from PyQt4.QtNetwork import QNetworkRequest
+
+
+# Qt version < 4.8 has an issue; it will wrongly
+# count .co.uk (and others) as second-level domains
+def toSecondLevelDomain(url):
+    """
+    Module function to get a second level domain from the given URL.
+    
+    @param url URL to extract domain from (QUrl)
+    @return second level domain or an empty string if undeterminable (string)
+    """
+    if tuple(int(n) for n in qVersion().split(".")[:2]) >= (4, 8):
+        topLevelDomain = url.topLevelDomain()
+        urlHost = url.host()
+        
+        if not topLevelDomain or not urlHost:
+            return ""
+        
+        domain = urlHost[:len(urlHost) - len(topLevelDomain)]
+        if domain.count(".") == 0:
+            return urlHost
+        
+        while domain.count(".") != 0:
+            domain = domain[domain.find(".") + 1:]
+        
+        return domain + topLevelDomain
+    else:
+        domain = url.host()
+        
+        if domain.count(".") == 0:
+            return ""
+        
+        while domain.count(".") != 1:
+            domain = domain[domain.find(".") + 1:]
+        
+        return domain
 
 
 class AdBlockRule(object):
@@ -22,6 +59,28 @@
         """
         self.__regExp = QRegExp()
         self.__options = []
+        self.__blockedDomains = []
+        self.__allowedDomains = []
+        
+        self.__enabled = True
+        self.__cssRule = False
+        self.__exception = False
+        self.__internalDisabled = False
+        self.__domainRestricted = False
+        self.__useRegExp = False
+        self.__useDomainMatch = False
+        self.__useEndsMatch = False
+        self.__thirdParty = False
+        self.__thirdPartyException = False
+        self.__object = False
+        self.__objectException = False
+        self.__subdocument = False
+        self.__subdocumentException = False
+        self.__xmlhttprequest = False
+        self.__xmlhttprequestException = False
+        self.__document = False
+        self.__elemhide = False
+        self.__caseSensitivity = Qt.CaseInsensitive
         
         self.setFilter(filter)
     
@@ -40,77 +99,317 @@
         @param filter rule filter string (string)
         """
         self.__filter = filter
+        self.__parseFilter()
+    
+    def __parseFilter(self):
+        """
+        Private method to parse the filter pattern.
+        """
+        parsedLine = self.__filter
         
-        self.__cssRule = False
-        self.__enabled = True
-        self.__exception = False
-        regExpRule = False
+        # empty rule or just a comment
+        if not parsedLine.strip() or parsedLine.startswith("!"):
+            self.__enabled = False
+            return
         
-        if filter.startswith("!") or not filter.strip():
-            self.__enabled = False
+        # CSS element hiding rule
+        if "##" in parsedLine:
+            self.__cssRule = True
+            pos = parsedLine.find("##")
+            
+            # domain restricted rule
+            if not parsedLine.startswith("##"):
+                domains = parsedLine[:pos]
+                self.__parseDomains(domains, ",")
+            
+            self.__cssSelector = parsedLine[pos + 2:]
+            # CSS rule cannot have more options -> stop parsing
+            return
         
-        if "##" in filter:
-            self.__cssRule = True
-        
-        parsedLine = filter
+        # Exception always starts with @@
         if parsedLine.startswith("@@"):
             self.__exception = True
             parsedLine = parsedLine[2:]
-        if parsedLine.startswith("/"):
-            if parsedLine.endswith("/"):
-                parsedLine = parsedLine[1:-1]
-                regExpRule = True
+        
+        # Parse all options following '$' character
+        optionsIndex = parsedLine.find("$")
+        if optionsIndex >= 0:
+            options = parsedLine[optionsIndex + 1:].split(",")
+            
+            handledOptions = 0
+            for option in options:
+                if option.startswith("domain="):
+                    self.__parseDomains(option[7:], "|")
+                    handledOptions += 1
+                elif option == "match-case":
+                    self.__caseSensitivity = Qt.CaseSensitive
+                    handledOptions += 1
+                elif option.endswith("third-party"):
+                    self.__thirdParty = True
+                    self.__thirdPartyException = option.startswith("~")
+                    handledOptions += 1
+                elif option.endswith("object"):
+                    self.__object = True
+                    self.__objectException = option.startswith("~")
+                    handledOptions += 1
+                elif option.endswith("subdocument"):
+                    self.__subdocument = True
+                    self.__subdocumentException = option.startswith("~")
+                    handledOptions += 1
+                elif option.endswith("xmlhttprequest"):
+                    self.__xmlhttprequest = True
+                    self.__xmlhttprequestException = option.startswith("~")
+                    handledOptions += 1
+                elif option == "document" and self.__exception:
+                    self.__document = True
+                    handledOptions += 1
+                elif option == "elemhide" and self.__exception:
+                    self.__elemhide = True
+                    handledOptions += 1
+                elif option == "collapse":
+                    # Hiding placeholders of blocked elements
+                    handledOptions += 1
+            
+            # If we don't handle all options, it's safer to just disable this rule
+            if handledOptions != len(options):
+                self.__internalDisabled = True
+                return
+            
+            parsedLine = parsedLine[:optionsIndex]
+        
+        # Rule is classic regexp
+        if parsedLine.startswith("/") and parsedLine.endswith("/"):
+            parsedLine = parsedLine[1:-1]
+            self.__useRegExp = True
+            self.__regExp = QRegExp(parsedLine, self.__caseSensitivity,
+                                     QRegExp.RegExp)
+            return
         
-        options = parsedLine.find("$")
-        if options >= 0:
-            try:
-                self.__options = parsedLine[options + 1:].split(",")
-            except IndexError:
-                self.__options = []
-            parsedLine = parsedLine[:options]
+        # Remove starting / ending wildcards
+        if parsedLine.startswith("*"):
+            parsedLine = parsedLine[1:]
+        if parsedLine.endswith("*"):
+            parsedLine = parsedLine[:-1]
+        
+        # Fast string matching for domain can be used
+        if parsedLine.startswith("||") and \
+           parsedLine.endswith("^") and \
+           QRegExp("[/:?=&\\*]").indexIn(parsedLine) == -1:
+            parsedLine = parsedLine[2:-1]
+            self.__useDomainMatch = True
+            self.__matchString = parsedLine
+            return
+        
+        # If rule contains '|' only at the end, string matching can be used
+        if parsedLine.endswith("|") and \
+           QRegExp("[\\^\\*]").indexIn(parsedLine) == -1 and \
+           parsedLine.count("|") == 1:
+            parsedLine = parsedLine[:-1]
+            self.__useEndsMatch = True
+            self.__matchString = parsedLine
+            return
         
-        self.setPattern(parsedLine, regExpRule)
+        # If there is still a wildcard (*) or separator (^) or (|),
+        # the rule must be modified to comply with QRegExp.
+        if "*" in parsedLine or "^" in parsedLine or "|" in parsedLine:
+            pattern = self.__convertPatternToRegExp(parsedLine)
+            self.__useRegExp = True
+            self.__regExp = QRegExp(pattern, self.__caseSensitivity, QRegExp.RegExp)
+            return
+        
+        # no regexp required
+        self.__useRegExp = False
+        self.__matchString = parsedLine
+    
+    def __parseDomains(self, domains, separator):
+        """
+        Private method to parse a string with a domain list.
         
-        if "match-case" in self.__options:
-            self.__regExp.setCaseSensitivity(Qt.CaseSensitive)
-            self.__options.remove("match-case")
+        @param domains list of domains (string)
+        @param separator separator character used by the list (string)
+        """
+        domainsList = domains.split(separator)
+        
+        for domain in domainsList:
+            if not domain:
+                continue
+            if domain.startswith("~"):
+                self.__blockedDomains.append(domain[1:])
+            else:
+                self.__allowedDomains.append(domain)
+        
+        self.__domainRestricted = \
+            bool(self.__blockedDomains) or bool(self.__allowedDomains)
     
-    def networkMatch(self, encodedUrl):
+    def networkMatch(self, request, domain, encodedUrl):
         """
         Public method to check the rule for a match.
         
+        @param request reference to the network request (QNetworkRequest)
+        @param domain host name, matched on label boundaries for '||' rules (string)
         @param encodedUrl string encoded URL to be checked (string)
         @return flag indicating a match (boolean)
         """
-        if self.__cssRule:
+        if self.__cssRule or not self.__enabled or self.__internalDisabled:
             return False
         
+        matched = False
+        
+        if self.__useRegExp:
+            matched = self.__regExp.indexIn(encodedUrl) != -1
+        elif self.__useDomainMatch:
+            matched = ("." + domain).endswith("." + self.__matchString)
+        elif self.__useEndsMatch:
+            if self.__caseSensitivity == Qt.CaseInsensitive:
+                matched = encodedUrl.lower().endswith(self.__matchString.lower())
+            else:
+                matched = encodedUrl.endswith(self.__matchString)
+        else:
+            if self.__caseSensitivity == Qt.CaseInsensitive:
+                matched = self.__matchString.lower() in encodedUrl.lower()
+            else:
+                matched = self.__matchString in encodedUrl
+        
+        if matched:
+            # check domain restrictions
+            if self.__domainRestricted and not self.matchDomain(domain):
+                return False
+            
+            # check third-party restrictions
+            if self.__thirdParty and not self.matchThirdParty(request):
+                return False
+            
+            # check object restrictions
+            if self.__object and not self.matchObject(request):
+                return False
+            
+            # check subdocument restrictions
+            if self.__subdocument and not self.matchSubdocument(request):
+                return False
+            
+            # check xmlhttprequest restriction
+            if self.__xmlhttprequest and not self.matchXmlHttpRequest(request):
+                return False
+        
+        return matched
+    
+    def urlMatch(self, url):
+        """
+        Public method to check a URL against a document or elemhide rule.
+        
+        @param url URL to check (QUrl)
+        @return flag indicating a match, always False for other rules (boolean)
+        """
+        if not self.__document and not self.__elemhide:
+            return False
+        
+        encodedUrl = bytes(url.toEncoded()).decode()
+        domain = url.host()
+        return self.networkMatch(QNetworkRequest(url), domain, encodedUrl)
+    
+    def matchDomain(self, domain):
+        """
+        Public method to match a domain (exactly or as a subdomain of an entry).
+        
+        @param domain domain name to check (string)
+        @return flag indicating a match (boolean)
+        """
         if not self.__enabled:
             return False
         
-        matched = self.__regExp.indexIn(encodedUrl) != -1
+        if not self.__domainRestricted:
+            return True
         
-        if matched and not len(self.__options) == 0:
-            # only domain rules are supported
-            if len(self.__options) == 1:
-                for option in self.__options:
-                    if option.startswith("domain="):
-                        url = QUrl.fromEncoded(encodedUrl)
-                        host = url.host()
-                        domainOptions = option[7:].split("|")
-                        for domainOption in domainOptions:
-                            negate = domainOption.startswith("~")
-                            if negate:
-                                domainOption = domainOption[1:]
-                            hostMatched = domainOption == host
-                            if hostMatched and not negate:
-                                return True
-                            if not hostMatched and negate:
-                                return True
-            
+        if len(self.__blockedDomains) == 0:
+            for dom in self.__allowedDomains:
+                if domain == dom or domain.endswith("." + dom):
+                    return True
+        elif len(self.__allowedDomains) == 0:
+            for dom in self.__blockedDomains:
+                if domain == dom or domain.endswith("." + dom):
+                    return False
+            return True
+        else:
+            for dom in self.__blockedDomains:
+                if domain == dom or domain.endswith("." + dom):
+                    return False
+            for dom in self.__allowedDomains:
+                if domain == dom or domain.endswith("." + dom):
+                    return True
+        
+        return False
+    
+    def matchThirdParty(self, req):
+        """
+        Public method to match a third-party rule.
+        
+        @param req request object to check (QNetworkRequest)
+        @return flag indicating a match, inverted for '~third-party' (boolean)
+        """
+        referer = bytes(req.attribute(QNetworkRequest.User + 200, b"")).decode()
+        if referer == "":
             return False
         
-        return matched
+        # Third-party matching should be performed on second-level domains
+        refererHost = toSecondLevelDomain(QUrl(referer))
+        host = toSecondLevelDomain(req.url())
+        
+        match = refererHost != host
+        
+        if self.__thirdPartyException:
+            return not match
+        else:
+            return match
+    
+    def matchObject(self, req):
+        """
+        Public method to match an object rule.
+        
+        @param req request object to check (QNetworkRequest)
+        @return flag indicating a match, inverted for '~object' rules (boolean)
+        """
+        match = req.attribute(QNetworkRequest.User + 200) == "object"
+        
+        if self.__objectException:
+            return not match
+        else:
+            return match
+    
+    def matchSubdocument(self, req):
+        """
+        Public method to match a sub-document rule.
+        
+        @param req request object to check (QNetworkRequest)
+        @return flag indicating a match, inverted for '~subdocument' (boolean)
+        """
+        originatingFrame = req.originatingObject()
+        if originatingFrame is None:
+            return False
+        
+        page = originatingFrame.page()
+        if page is None:
+            return False
+        
+        match = originatingFrame != page.mainFrame()
+        
+        if self.__subdocumentException:
+            return not match
+        else:
+            return match
+    
+    def matchXmlHttpRequest(self, req):
+        """
+        Public method to match a XmlHttpRequest rule.
+        
+        @param req request object to check (QNetworkRequest)
+        @return flag indicating a match, inverted for '~xmlhttprequest' (boolean)
+        """
+        match = req.rawHeader("X-Requested-With") == "XMLHttpRequest"
+        
+        if self.__xmlhttprequestException:
+            return not match
+        else:
+            return match
     
     def isException(self):
         """
@@ -156,13 +455,61 @@
         """
         return self.__cssRule
     
-    def regExpPattern(self):
+    def cssSelector(self):
+        """
+        Public method to get the CSS selector of the rule.
+        
+        @return CSS selector (string)
+        """
+        return self.__cssSelector
+    
+    def isDocument(self):
+        """
+        Public method to check, if this is a document rule.
+        
+        @return flag indicating a document rule (boolean)
         """
-        Public method to get the regexp pattern of the rule.
+        return self.__document
+    
+    def isElementHiding(self):
+        """
+        Public method to check, if this is an element hiding rule.
+        
+        @return flag indicating an element hiding rule (boolean)
+        """
+        return self.__elemhide
+    
+    def isDomainRestricted(self):
+        """
+        Public method to check, if this rule is restricted by domain.
         
-        @return regexp pattern (QRegExp)
+        @return flag indicating a domain restriction (boolean)
+        """
+        return self.__domainRestricted
+    
+    def isComment(self):
+        """
+        Public method to check, if this is a comment.
+        
+        @return flag indicating a comment (boolean)
+        """
+        return self.__filter.startswith("!")
+    
+    def isSlow(self):
         """
-        return self.__regExp.pattern()
+        Public method to check, if this is a slow rule.
+        
+        @return flag indicating a slow rule (boolean)
+        """
+        return self.__useRegExp
+    
+    def isInternalDisabled(self):
+        """
+        Public method to check, if this rule was disabled internally.
+        
+        @return flag indicating an internally disabled rule (boolean)
+        """
+        return self.__internalDisabled
     
     def __convertPatternToRegExp(self, wildcardPattern):
         """
@@ -173,30 +520,19 @@
         """
         pattern = wildcardPattern
         
-        pattern = re.sub(r"\*+", "*", pattern)      # remove multiple wildcards
-        pattern = re.sub(r"\^\|$", "^", pattern)    # remove anchors following separator placeholder
-        pattern = re.sub(r"^(\*)", "", pattern)     # remove leading wildcards
-        pattern = re.sub(r"(\*)$", "", pattern)     # remove trailing wildcards
-        pattern = re.sub(r"(\W)", r"\\\1", pattern)      # escape special symbols
+        pattern = re.sub(r"\*+", "*", pattern)       # remove multiple wildcards
+        pattern = re.sub(r"\^\|$", "^", pattern)     # remove anchors following separator
+                                                     # placeholder
+        pattern = re.sub(r"^(\*)", "", pattern)      # remove leading wildcards
+        pattern = re.sub(r"(\*)$", "", pattern)      # remove trailing wildcards
+        pattern = re.sub(r"(\W)", r"\\\1", pattern)  # escape special symbols
         pattern = re.sub(r"^\\\|\\\|",
-            r"^[\w\-]+:\/+(?!\/)(?:[^\/]+\.)?", pattern)  # process extended anchor at expression start
+            r"^[\w\-]+:\/+(?!\/)(?:[^\/]+\.)?", pattern)  # process extended anchor at
+                                                          # expression start
         pattern = re.sub(r"\\\^",
-            r"(?:[^\w\d\-.%]|$)", pattern)          # process separator placeholders
-        pattern = re.sub(r"^\\\|", "^", pattern)    # process anchor at expression start
-        pattern = re.sub(r"\\\|$", "$", pattern)    # process anchor at expression end
-        pattern = re.sub(r"\\\*", ".*", pattern)    # replace wildcards by .*
+            r"(?:[^\w\d\-.%]|$)", pattern)           # process separator placeholders
+        pattern = re.sub(r"^\\\|", "^", pattern)     # process anchor at expression start
+        pattern = re.sub(r"\\\|$", "$", pattern)     # process anchor at expression end
+        pattern = re.sub(r"\\\*", ".*", pattern)     # replace wildcards by .*
         
         return pattern
-    
-    def setPattern(self, pattern, isRegExp):
-        """
-        Public method to set the rule pattern.
-        
-        @param pattern string containing the pattern (string)
-        @param isRegExp flag indicating a reg exp pattern (boolean)
-        """
-        if isRegExp:
-            self.__regExp = QRegExp(pattern, Qt.CaseInsensitive, QRegExp.RegExp2)
-        else:
-            self.__regExp = QRegExp(self.__convertPatternToRegExp(pattern),
-                                    Qt.CaseInsensitive, QRegExp.RegExp2)

eric ide

mercurial