WebBrowser/AdBlock/AdBlockRule.py

branch
QtWebEngine
changeset 4858
19dff9c9cf26
parent 4631
5c1a96925da4
child 4860
0a44aff88bfa
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/WebBrowser/AdBlock/AdBlockRule.py	Sun Mar 13 20:54:42 2016 +0100
@@ -0,0 +1,662 @@
+# -*- coding: utf-8 -*-
+
+# Copyright (c) 2009 - 2016 Detlev Offenbach <detlev@die-offenbachs.de>
+#
+
+"""
+Module implementing the AdBlock rule class.
+"""
+
+from __future__ import unicode_literals
+
+import re
+
+from PyQt5.QtCore import Qt, QRegExp, QUrl
+from PyQt5.QtNetwork import QNetworkRequest
+from PyQt5.QtWebEngineCore import QWebEngineUrlRequestInfo
+
+
def toSecondLevelDomain(url):
    """
    Module function to reduce the host of a URL to its second level domain.
    
    The result consists of the host label directly in front of the top level
    domain followed by the top level domain itself. An empty string is
    returned, if the URL has no host or no top level domain.
    
    @param url URL to extract domain from (QUrl)
    @return name of second level domain (string)
    """
    tld = url.topLevelDomain()
    hostName = url.host()
    
    if not (tld and hostName):
        return ""
    
    # everything in front of the top level domain
    labels = hostName[:len(hostName) - len(tld)]
    if "." not in labels:
        # the host is a second level domain already
        return hostName
    
    # keep just the last label in front of the top level domain
    return labels.rsplit(".", 1)[1] + tld
+
+
class AdBlockRule(object):
    """
    Class implementing the AdBlock rule.
    
    A rule is created from one line of an AdBlock filter list. The line is
    parsed by setFilter(); afterwards the rule can be checked against
    network requests (networkMatch()) or page URLs (urlMatch()).
    """
    def __init__(self, filter="", subscription=None):
        """
        Constructor
        
        @param filter filter string of the rule (string)
        @param subscription reference to the subscription object
            (AdBlockSubscription)
        """
        self.__subscription = subscription
        
        self.__regExp = QRegExp()
        self.__options = []
        self.__blockedDomains = []
        self.__allowedDomains = []
        
        # Defined up front because not every kind of rule assigns them while
        # parsing: cssSelector() must not raise for a non CSS rule and a
        # rule without a match string (None) must simply never match.
        self.__cssSelector = ""
        self.__matchString = None
        
        self.__enabled = True
        self.__cssRule = False
        self.__exception = False
        self.__internalDisabled = False
        self.__domainRestricted = False
        self.__useRegExp = False
        self.__useDomainMatch = False
        self.__useEndsMatch = False
        self.__thirdParty = False
        self.__thirdPartyException = False
        self.__object = False
        self.__objectException = False
        self.__subdocument = False
        self.__subdocumentException = False
        self.__xmlhttprequest = False
        self.__xmlhttprequestException = False
        self.__document = False
        self.__elemhide = False
        self.__caseSensitivity = Qt.CaseInsensitive
        self.__image = False
        self.__imageException = False
        self.__script = False
        self.__scriptException = False
        self.__stylesheet = False
        self.__stylesheetException = False
        self.__objectSubrequest = False
        self.__objectSubrequestException = False
        
        self.setFilter(filter)
    
    def subscription(self):
        """
        Public method to get the subscription this rule belongs to.
        
        @return subscription of the rule (AdBlockSubscription)
        """
        return self.__subscription
    
    def filter(self):
        """
        Public method to get the rule filter string.
        
        @return rule filter string (string)
        """
        return self.__filter
    
    def setFilter(self, filter):
        """
        Public method to set the rule filter string.
        
        The filter string is parsed immediately.
        
        @param filter rule filter string (string)
        """
        self.__filter = filter
        self.__parseFilter()
    
    def __parseFilter(self):
        """
        Private method to parse the filter pattern.
        """
        parsedLine = self.__filter
        
        # empty rule or just a comment
        if not parsedLine.strip() or parsedLine.startswith(("!", "[Adblock")):
            self.__enabled = False
            return
        
        # CSS element hiding rule
        if "##" in parsedLine or "#@#" in parsedLine:
            # CSS rule cannot have more options -> stop parsing
            self.__parseCssRule(parsedLine)
            return
        
        # Exception always starts with @@
        if parsedLine.startswith("@@"):
            self.__exception = True
            parsedLine = parsedLine[2:]
        
        # Parse all options following '$' character
        optionsIndex = parsedLine.find("$")
        if optionsIndex >= 0:
            options = parsedLine[optionsIndex + 1:].split(",")
            
            # every option is parsed, even after an unknown one was seen
            allHandled = True
            for option in options:
                if not self.__parseOption(option):
                    allHandled = False
            
            # If we don't handle all options, it's safer to just disable
            # this rule
            if not allHandled:
                self.__internalDisabled = True
                return
            
            parsedLine = parsedLine[:optionsIndex]
        
        # Rule is classic regexp
        if parsedLine.startswith("/") and parsedLine.endswith("/"):
            parsedLine = parsedLine[1:-1]
            self.__useRegExp = True
            self.__regExp = QRegExp(parsedLine, self.__caseSensitivity,
                                    QRegExp.RegExp)
            return
        
        # Remove starting / ending wildcards
        if parsedLine.startswith("*"):
            parsedLine = parsedLine[1:]
        if parsedLine.endswith("*"):
            parsedLine = parsedLine[:-1]
        
        # Fast string matching for domain can be used
        if parsedLine.startswith("||") and \
           parsedLine.endswith("^") and \
           re.search(r"[/:?=&*]", parsedLine) is None:
            self.__useDomainMatch = True
            self.__matchString = parsedLine[2:-1]
            return
        
        # If rule contains '|' only at the end, string matching can be used
        if parsedLine.endswith("|") and \
           re.search(r"[\^*]", parsedLine) is None and \
           parsedLine.count("|") == 1:
            self.__useEndsMatch = True
            self.__matchString = parsedLine[:-1]
            return
        
        # If there is still a wildcard (*) or separator (^) or (|),
        # the rule must be modified to comply with QRegExp.
        if "*" in parsedLine or "^" in parsedLine or "|" in parsedLine:
            pattern = self.__convertPatternToRegExp(parsedLine)
            self.__useRegExp = True
            self.__regExp = QRegExp(pattern, self.__caseSensitivity,
                                    QRegExp.RegExp)
            return
        
        # no regexp required
        self.__useRegExp = False
        self.__matchString = parsedLine
    
    def __parseCssRule(self, line):
        """
        Private method to parse a CSS element hiding rule.
        
        @param line filter line containing '##' or '#@#' (string)
        """
        self.__cssRule = True
        pos = line.find("#")
        
        # domain restricted rule
        if not line.startswith("##"):
            self.__parseDomains(line[:pos], ",")
        
        # '#@#' introduces an exception, '##' a hiding rule
        self.__exception = line[pos + 1] == "@"
        if self.__exception:
            self.__cssSelector = line[pos + 3:]
        else:
            self.__cssSelector = line[pos + 2:]
    
    def __parseOption(self, option):
        """
        Private method to parse a single option of the '$' options list.
        
        @param option option to be parsed (string)
        @return flag indicating, that the option is supported (boolean)
        """
        if option.startswith("domain="):
            self.__parseDomains(option[7:], "|")
            return True
        
        if option == "match-case":
            self.__caseSensitivity = Qt.CaseSensitive
            return True
        
        # a leading '~' inverts the meaning of a type option
        inverse = option.startswith("~")
        name = option[1:] if inverse else option
        
        if name == "third-party":
            self.__thirdParty = True
            self.__thirdPartyException = inverse
        elif name == "object":
            self.__object = True
            self.__objectException = inverse
        elif name == "subdocument":
            self.__subdocument = True
            self.__subdocumentException = inverse
        elif name == "xmlhttprequest":
            self.__xmlhttprequest = True
            self.__xmlhttprequestException = inverse
        elif name == "image":
            self.__image = True
            self.__imageException = inverse
        elif name == "script":
            self.__script = True
            self.__scriptException = inverse
        elif name == "stylesheet":
            self.__stylesheet = True
            self.__stylesheetException = inverse
        elif name == "object-subrequest":
            self.__objectSubrequest = True
            self.__objectSubrequestException = inverse
        elif option == "document" and self.__exception:
            self.__document = True
        elif option == "elemhide" and self.__exception:
            self.__elemhide = True
        elif option == "collapse":
            # Hiding placeholders of blocked elements
            pass
        else:
            return False
        
        return True
    
    def __parseDomains(self, domains, separator):
        """
        Private method to parse a string with a domain list.
        
        Domains prefixed by '~' are recorded as blocked domains, all others
        as allowed domains.
        
        @param domains list of domains (string)
        @param separator separator character used by the list (string)
        """
        for domain in domains.split(separator):
            if not domain:
                continue
            if domain.startswith("~"):
                self.__blockedDomains.append(domain[1:])
            else:
                self.__allowedDomains.append(domain)
        
        self.__domainRestricted = \
            bool(self.__blockedDomains) or bool(self.__allowedDomains)
    
    @staticmethod
    def __isMatchingDomain(domain, pattern):
        """
        Private method to check, if a domain is equal to or a sub-domain of
        a domain pattern.
        
        A plain suffix test is not sufficient because 'badexample.com' ends
        with 'example.com' without belonging to it.
        
        @param domain domain name to check (string)
        @param pattern domain name to check against (string)
        @return flag indicating a match (boolean)
        """
        if domain == pattern:
            return True
        if not pattern or not domain.endswith(pattern):
            return False
        
        # the suffix has to start at a label boundary
        boundary = len(domain) - len(pattern)
        return pattern.startswith(".") or domain[boundary - 1] == "."
    
    def __stringMatch(self, domain, encodedUrl):
        """
        Private method to match the pattern of the rule.
        
        Options and domain restrictions are not evaluated here.
        
        @param domain domain name (string)
        @param encodedUrl string encoded URL to be checked (string)
        @return flag indicating a match (boolean)
        """
        if self.__useRegExp:
            return self.__regExp.indexIn(encodedUrl) != -1
        
        matchString = self.__matchString
        if matchString is None:
            # the filter was never parsed as a network rule
            return False
        
        if self.__useDomainMatch:
            return self.__isMatchingDomain(domain, matchString)
        
        if self.__caseSensitivity == Qt.CaseInsensitive:
            matchString = matchString.lower()
            encodedUrl = encodedUrl.lower()
        
        if self.__useEndsMatch:
            return encodedUrl.endswith(matchString)
        return matchString in encodedUrl
    
    def networkMatch(self, request, domain, encodedUrl):
        """
        Public method to check the rule for a match.
        
        @param request reference to the network request
        @type QWebEngineUrlRequestInfo
        @param domain domain name
        @type str
        @param encodedUrl string encoded URL to be checked
        @type str
        @return flag indicating a match
        @rtype bool
        """
        if self.__cssRule or not self.__enabled or self.__internalDisabled:
            return False
        
        if not self.__stringMatch(domain, encodedUrl):
            return False
        
        # check domain restrictions
        if self.__domainRestricted and \
                not self.matchDomain(request.firstPartyUrl().host()):
            return False
        
        # check third-party restrictions
        if self.__thirdParty and not self.matchThirdParty(request):
            return False
        
        # check object restrictions
        if self.__object and not self.matchObject(request):
            return False
        
        # check subdocument restrictions
        if self.__subdocument and not self.matchSubdocument(request):
            return False
        
        # check xmlhttprequest restriction
        if self.__xmlhttprequest and not self.matchXmlHttpRequest(request):
            return False
        
        # check image restriction
        if self.__image and not self.matchImage(request):
            return False
        
        # check script restriction
        if self.__script and not self.matchScript(request):
            return False
        
        # check stylesheet restriction
        if self.__stylesheet and not self.matchStyleSheet(request):
            return False
        
        # check object-subrequest restriction
        if self.__objectSubrequest and \
                not self.matchObjectSubrequest(request):
            return False
        
        return True
    
    def urlMatch(self, url):
        """
        Public method to check an URL against the rule.
        
        Only rules carrying the 'document' or 'elemhide' option can match.
        A bare URL has no resource type and no separate first party, so the
        type and third-party options are not evaluated; a domain restriction
        is checked against the host of the URL itself.
        
        @param url URL to check (QUrl)
        @return flag indicating a match (boolean)
        """
        if not self.__document and not self.__elemhide:
            return False
        
        if self.__cssRule or not self.__enabled or self.__internalDisabled:
            return False
        
        encodedUrl = bytes(url.toEncoded()).decode()
        domain = url.host()
        if not self.__stringMatch(domain, encodedUrl):
            return False
        
        return self.matchDomain(domain)
    
    def matchDomain(self, domain):
        """
        Public method to match a domain.
        
        A disabled rule never matches and a rule without domain restriction
        always matches. Otherwise the domain must not belong to one of the
        blocked domains and, if allowed domains are given, has to belong to
        one of them.
        
        @param domain domain name to check (string)
        @return flag indicating a match (boolean)
        """
        if not self.__enabled:
            return False
        
        if not self.__domainRestricted:
            return True
        
        for dom in self.__blockedDomains:
            if self.__isMatchingDomain(domain, dom):
                return False
        
        if not self.__allowedDomains:
            # only blocked domains were given and none of them matched
            return bool(self.__blockedDomains)
        
        return any(self.__isMatchingDomain(domain, dom)
                   for dom in self.__allowedDomains)
    
    def matchThirdParty(self, req):
        """
        Public method to match a third-party rule.
        
        @param req request object to check (QWebEngineUrlRequestInfo)
        @return flag indicating a match (boolean)
        """
        # Third-party matching should be performed on second-level domains
        firstPartyHost = toSecondLevelDomain(req.firstPartyUrl())
        host = toSecondLevelDomain(req.requestUrl())
        
        match = firstPartyHost != host
        
        if self.__thirdPartyException:
            return not match
        else:
            return match
    
    def __matchResourceType(self, req, resourceType, inverted):
        """
        Private method to compare the resource type of a request.
        
        @param req request object to check (QWebEngineUrlRequestInfo)
        @param resourceType resource type to compare with
            (QWebEngineUrlRequestInfo.ResourceType)
        @param inverted flag indicating to invert the result (boolean)
        @return flag indicating a match (boolean)
        """
        match = req.resourceType() == resourceType
        if inverted:
            return not match
        return match
    
    def matchObject(self, req):
        """
        Public method to match an object rule.
        
        @param req request object to check (QWebEngineUrlRequestInfo)
        @return flag indicating a match (boolean)
        """
        return self.__matchResourceType(
            req, QWebEngineUrlRequestInfo.ResourceTypeObject,
            self.__objectException)
    
    def matchSubdocument(self, req):
        """
        Public method to match a sub-document rule.
        
        @param req request object to check (QWebEngineUrlRequestInfo)
        @return flag indicating a match (boolean)
        """
        return self.__matchResourceType(
            req, QWebEngineUrlRequestInfo.ResourceTypeSubFrame,
            self.__subdocumentException)
    
    def matchXmlHttpRequest(self, req):
        """
        Public method to match a XmlHttpRequest rule.
        
        @param req request object to check (QWebEngineUrlRequestInfo)
        @return flag indicating a match (boolean)
        """
        return self.__matchResourceType(
            req, QWebEngineUrlRequestInfo.ResourceTypeXhr,
            self.__xmlhttprequestException)
    
    def matchImage(self, req):
        """
        Public method to match an Image rule.
        
        @param req request object to check (QWebEngineUrlRequestInfo)
        @return flag indicating a match (boolean)
        """
        return self.__matchResourceType(
            req, QWebEngineUrlRequestInfo.ResourceTypeImage,
            self.__imageException)
    
    def matchScript(self, req):
        """
        Public method to match a Script rule.
        
        @param req request object to check (QWebEngineUrlRequestInfo)
        @return flag indicating a match (boolean)
        """
        return self.__matchResourceType(
            req, QWebEngineUrlRequestInfo.ResourceTypeScript,
            self.__scriptException)
    
    def matchStyleSheet(self, req):
        """
        Public method to match a StyleSheet rule.
        
        @param req request object to check (QWebEngineUrlRequestInfo)
        @return flag indicating a match (boolean)
        """
        return self.__matchResourceType(
            req, QWebEngineUrlRequestInfo.ResourceTypeStylesheet,
            self.__stylesheetException)
    
    def matchObjectSubrequest(self, req):
        """
        Public method to match an Object Subrequest rule.
        
        @param req request object to check (QWebEngineUrlRequestInfo)
        @return flag indicating a match (boolean)
        """
        return self.__matchResourceType(
            req, QWebEngineUrlRequestInfo.ResourceTypeSubResource,
            self.__objectSubrequestException)
    
    def isException(self):
        """
        Public method to check, if the rule defines an exception.
        
        @return flag indicating an exception (boolean)
        """
        return self.__exception
    
    def setException(self, exception):
        """
        Public method to set the rule's exception flag.
        
        @param exception flag indicating an exception rule (boolean)
        """
        self.__exception = exception
    
    def isEnabled(self):
        """
        Public method to check, if the rule is enabled.
        
        @return flag indicating enabled state (boolean)
        """
        return self.__enabled
    
    def setEnabled(self, enabled):
        """
        Public method to set the rule's enabled state.
        
        A disabled rule is marked by a leading '!' in its filter string.
        The marker is only added or removed when needed, so repeated calls
        neither stack markers nor cut into the filter text. The filter is
        not parsed again; a rule created from a comment line therefore does
        not match anything until setFilter() is called.
        
        @param enabled flag indicating the new enabled state (boolean)
        """
        self.__enabled = enabled
        isCommented = self.__filter.startswith("!")
        if enabled:
            if isCommented:
                self.__filter = self.__filter[1:]
        elif not isCommented:
            self.__filter = "!" + self.__filter
    
    def isCSSRule(self):
        """
        Public method to check, if the rule is a CSS rule.
        
        @return flag indicating a CSS rule (boolean)
        """
        return self.__cssRule
    
    def cssSelector(self):
        """
        Public method to get the CSS selector of the rule.
        
        @return CSS selector; empty, if no CSS rule was parsed (string)
        """
        return self.__cssSelector
    
    def isDocument(self):
        """
        Public method to check, if this is a document rule.
        
        @return flag indicating a document rule (boolean)
        """
        return self.__document
    
    def isElementHiding(self):
        """
        Public method to check, if this is an element hiding rule.
        
        @return flag indicating an element hiding rule (boolean)
        """
        return self.__elemhide
    
    def isDomainRestricted(self):
        """
        Public method to check, if this rule is restricted by domain.
        
        @return flag indicating a domain restriction (boolean)
        """
        return self.__domainRestricted
    
    def isComment(self):
        """
        Public method to check, if this is a comment.
        
        @return flag indicating a comment (boolean)
        """
        return self.__filter.startswith("!")
    
    def isHeader(self):
        """
        Public method to check, if this is a header.
        
        @return flag indicating a header (boolean)
        """
        return self.__filter.startswith("[Adblock")
    
    def isSlow(self):
        """
        Public method to check, if this is a slow rule.
        
        @return flag indicating a slow rule, i.e. one matched by a regular
            expression (boolean)
        """
        return self.__useRegExp
    
    def isInternalDisabled(self):
        """
        Public method to check, if this rule was disabled internally.
        
        @return flag indicating an internally disabled rule (boolean)
        """
        return self.__internalDisabled
    
    def __convertPatternToRegExp(self, wildcardPattern):
        """
        Private method to convert a wildcard pattern to a regular expression.
        
        @param wildcardPattern string containing the wildcard pattern (string)
        @return string containing a regular expression (string)
        """
        pattern = wildcardPattern
        
        # remove multiple wildcards
        pattern = re.sub(r"\*+", "*", pattern)
        # remove anchors following separator placeholder
        pattern = re.sub(r"\^\|$", "^", pattern)
        # remove leading wildcards
        pattern = re.sub(r"^(\*)", "", pattern)
        # remove trailing wildcards
        pattern = re.sub(r"(\*)$", "", pattern)
        # escape special symbols
        pattern = re.sub(r"(\W)", r"\\\1", pattern)
        
        # The next two replacement texts contain regexp escapes (\w, \d).
        # Given as replacement templates, newer Python versions reject
        # them as bad escapes; a callable inserts the text verbatim.
        
        # process extended anchor at expression start
        pattern = re.sub(
            r"^\\\|\\\|",
            lambda match: r"^[\w\-]+:\/+(?!\/)(?:[^\/]+\.)?", pattern)
        # process separator placeholders
        pattern = re.sub(
            r"\\\^", lambda match: r"(?:[^\w\d\-.%]|$)", pattern)
        # process anchor at expression start
        pattern = re.sub(r"^\\\|", "^", pattern)
        # process anchor at expression end
        pattern = re.sub(r"\\\|$", "$", pattern)
        # replace wildcards by .*
        pattern = re.sub(r"\\\*", ".*", pattern)
        
        return pattern

eric ide

mercurial