ThirdParty/Jasy/jasy/js/tokenize/Tokenizer.py

changeset 6650:1dd52aa8897c
parent    6649:f1b3a73831c9
child     6651:e8f3b5568b21
diff -r f1b3a73831c9 -r 1dd52aa8897c ThirdParty/Jasy/jasy/js/tokenize/Tokenizer.py
--- a/ThirdParty/Jasy/jasy/js/tokenize/Tokenizer.py	Sat Jan 12 11:26:32 2019 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,609 +0,0 @@
-#
-# Jasy - Web Tooling Framework
-# Copyright 2010-2012 Zynga Inc.
-#
-
-#
-# License: MPL 1.1/GPL 2.0/LGPL 2.1
-# Authors: 
-#   - Brendan Eich <brendan@mozilla.org> (Original JavaScript) (2004-2010)
-#   - Sebastian Werner <info@sebastian-werner.net> (Python Port) (2010)
-#
-
-from __future__ import unicode_literals
-
-import copy
-
-import jasy.js.tokenize.Lang as Lang
-import jasy.js.api.Comment as Comment
-import jasy.core.Console as Console
-
-__all__ = [ "Tokenizer" ]
-
-
-# Operator and punctuator mapping from token to tree node type name.
-# NB: because the lexer doesn't backtrack, all token prefixes must themselves
-# be valid tokens (e.g. !== is acceptable because its prefixes are the valid
-# tokens != and !). A self-check of this invariant follows the table.
-operatorNames = {
-    '<'   : 'lt', 
-    '>'   : 'gt', 
-    '<='  : 'le', 
-    '>='  : 'ge', 
-    '!='  : 'ne', 
-    '!'   : 'not', 
-    '=='  : 'eq', 
-    '===' : 'strict_eq', 
-    '!==' : 'strict_ne', 
-
-    '>>'  : 'rsh', 
-    '<<'  : 'lsh',
-    '>>>' : 'ursh', 
-     
-    '+'   : 'plus', 
-    '*'   : 'mul', 
-    '-'   : 'minus', 
-    '/'   : 'div', 
-    '%'   : 'mod', 
-
-    ','   : 'comma', 
-    ';'   : 'semicolon', 
-    ':'   : 'colon', 
-    '='   : 'assign', 
-    '?'   : 'hook', 
-
-    '&&'  : 'and', 
-    '||'  : 'or', 
-
-    '++'  : 'increment', 
-    '--'  : 'decrement', 
-
-    ')'   : 'right_paren', 
-    '('   : 'left_paren', 
-    '['   : 'left_bracket', 
-    ']'   : 'right_bracket', 
-    '{'   : 'left_curly', 
-    '}'   : 'right_curly', 
-
-    '&'   : 'bitwise_and', 
-    '^'   : 'bitwise_xor', 
-    '|'   : 'bitwise_or', 
-    '~'   : 'bitwise_not'
-}
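# --- Annotation (not part of the deleted file): the prefix invariant noted
# above can be verified mechanically. A minimal self-check over the table,
# using a hypothetical helper written for illustration:
def checkPrefixInvariant(names):
    for op in names:
        for i in range(1, len(op)):
            assert op[:i] in names, "prefix %r of %r is not a token" % (op[:i], op)

checkPrefixInvariant(operatorNames)   # passes silently for the mapping above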
-
-
-# Assignment operators
-assignOperators = ["|", "^", "&", "<<", ">>", ">>>", "+", "-", "*", "/", "%"]
-
-
-
-
-#
-# Classes
-#
-
-class Token: 
-    __slots__ = ["type", "start", "line", "assignOp", "end", "value"]
-
-
-class ParseError(Exception):
-    def __init__(self, message, fileId, line):
-        Exception.__init__(self, "Syntax error: %s\n%s:%s" % (message, fileId, line))
-
-
-class Tokenizer(object):
-    def __init__(self, source, fileId="", line=1):
-        # source: JavaScript source
-        # fileId: Filename (for debugging purposes)
-        # line: Line number (for debugging purposes)
-        self.cursor = 0
-        self.source = str(source)
-        self.tokens = {}
-        self.tokenIndex = 0
-        self.lookahead = 0
-        self.scanNewlines = False
-        self.fileId = fileId
-        self.line = line
-        self.comments = []
-
-    input_ = property(lambda self: self.source[self.cursor:])
-    token = property(lambda self: self.tokens.get(self.tokenIndex))
-
-
-    def done(self):
-        # We need to set scanOperand to true here because the first thing
-        # might be a regexp.
-        return self.peek(True) == "end"
-        
-
-    def match(self, tokenType, scanOperand=False):
-        return self.get(scanOperand) == tokenType or self.unget()
-
-
-    def mustMatch(self, tokenType):
-        if not self.match(tokenType):
-            raise ParseError("Missing " + tokenType, self.fileId, self.line)
-            
-        return self.token
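# --- Annotation (not part of the deleted file): match() leans on unget()
# returning None, so a failed match rewinds the stream *and* reads as falsy
# in one expression. A short usage sketch, assuming the module as it stood
# before this removal:
from jasy.js.tokenize.Tokenizer import Tokenizer

t = Tokenizer("foo(", fileId="demo.js")
assert t.match("identifier") is True
assert t.match("left_curly") is None      # wrong type: unget() rewound the stream
assert t.mustMatch("left_paren").type == "left_paren"   # '(' is still available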
-
-
-    def peek(self, scanOperand=False):
-        if self.lookahead:
-            next = self.tokens.get((self.tokenIndex + self.lookahead) & 3)
-            if self.scanNewlines and (getattr(next, "line", None) != getattr(self, "line", None)):
-                tokenType = "newline"
-            else:
-                tokenType = getattr(next, "type", None)
-        else:
-            tokenType = self.get(scanOperand)
-            self.unget()
-            
-        return tokenType
-
-
-    def peekOnSameLine(self, scanOperand=False):
-        self.scanNewlines = True
-        tokenType = self.peek(scanOperand)
-        self.scanNewlines = False
-        return tokenType
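# --- Annotation (not part of the deleted file): tokens live in a 4-slot ring
# buffer indexed modulo 4 ("& 3"), so at most 3 tokens of lookahead fit before
# unget() panics. peek() is get() plus an immediate unget(); a replayed token
# is served from the ring instead of being re-lexed:
from jasy.js.tokenize.Tokenizer import Tokenizer

t = Tokenizer("a + b", fileId="demo.js")
assert t.get() == "identifier"   # lexes 'a' into the next ring slot
assert t.peek() == "plus"        # lexes '+', then rewinds the index
assert t.get() == "plus"         # replayed from the ring, not re-lexed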
-        
-
-    def getComments(self):
-        if self.comments:
-            comments = self.comments
-            self.comments = []
-            return comments
-            
-        return None
-
-
-    def skip(self):
-        """Eats comments and whitespace."""
-        input = self.source
-        startLine = self.line
-
-        # Whether this is the first call, as happens when starting to parse
-        # a file (eat leading comments/whitespace)
-        startOfFile = self.cursor == 0
-        
-        indent = ""
-        
-        while (True):
-            if len(input) > self.cursor:
-                ch = input[self.cursor]
-            else:
-                return
-                
-            self.cursor += 1
-            
-            if len(input) > self.cursor:
-                next = input[self.cursor]
-            else:
-                next = None
-
-            if ch == "\n" and not self.scanNewlines:
-                self.line += 1
-                indent = ""
-                
-            elif ch == "/" and next == "*":
-                self.cursor += 1
-                text = "/*"
-                commentStartLine = self.line
-                if startLine == self.line and not startOfFile:
-                    mode = "inline"
-                elif (self.line-1) > startLine:
-                    # blank line(s) before the comment: it introduces a whole
-                    # section (multiple lines of code)
-                    mode = "section"
-                else:
-                    # comment for the immediately following code, with no
-                    # blank-line divider before it
-                    mode = "block"
-                    
-                while (True):
-                    try:
-                        ch = input[self.cursor]
-                        self.cursor += 1
-                    except IndexError:
-                        raise ParseError("Unterminated comment", self.fileId, self.line)
-                        
-                    if ch == "*":
-                        next = input[self.cursor]
-                        if next == "/":
-                            text += "*/"
-                            self.cursor += 1
-                            break
-                            
-                    elif ch == "\n":
-                        self.line += 1
-                        
-                    text += ch
-                    
-                
-                # Unescape backslash-escaped slash-star combinations in the comment text
-                text = text.replace(r"*\/", "*/")
-                
-                try:
-                    self.comments.append(Comment.Comment(text, mode, commentStartLine, indent, self.fileId))
-                except Comment.CommentException as commentError:
-                    Console.error("Ignoring comment in %s: %s", self.fileId, commentError)
-                    
-                    
-            elif ch == "/" and next == "/":
-                self.cursor += 1
-                text = "//"
-                if startLine == self.line and not startOfFile:
-                    mode = "inline"
-                elif (self.line-1) > startLine:
-                    # blank line(s) before the comment: it introduces a whole
-                    # section (multiple lines of code)
-                    mode = "section"
-                else:
-                    # comment for the immediately following code, with no
-                    # blank-line divider before it
-                    mode = "block"
-                    
-                while (True):
-                    try:
-                        ch = input[self.cursor]
-                        self.cursor += 1
-                    except IndexError:
-                        # end of file etc.
-                        break
-
-                    if ch == "\n":
-                        self.line += 1
-                        break
-                    
-                    text += ch
-                    
-                try:
-                    self.comments.append(Comment.Comment(text, mode, self.line-1, "", self.fileId))
-                except Comment.CommentException as commentError:
-                    Console.error("Ignoring comment in %s: %s", self.fileId, commentError)
-
-            # check for whitespace, including the non-breaking space 0xA0
-            elif ch in "\xA0 \t":
-                indent += ch
-
-            else:
-                self.cursor -= 1
-                return
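# --- Annotation (not part of the deleted file): how skip() classifies each
# comment, where startLine is the line skip() started on (i.e. the end of the
# previous token) and self.line is the line the comment begins on:
#
#   startLine == self.line and not start of file  -> "inline"  (trails code)
#   self.line - 1 > startLine                     -> "section" (blank line(s) before)
#   otherwise                                     -> "block"   (directly above code)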
-
-
-    # Lexes the exponential part of a number, if present. Returns True if an
-    # exponential part was found.
-    def lexExponent(self):
-        input = self.source
-        next = input[self.cursor]
-        if next == "e" or next == "E":
-            self.cursor += 1
-            ch = input[self.cursor]
-            self.cursor += 1
-            if ch == "+" or ch == "-":
-                ch = input[self.cursor]
-                self.cursor += 1
-
-            if ch < "0" or ch > "9":
-                raise ParseError("Missing exponent", self.fileId, self.line)
-
-            while(True):
-                ch = input[self.cursor]
-                self.cursor += 1
-                if not (ch >= "0" and ch <= "9"):
-                    break
-                
-            self.cursor -= 1
-            return True
-
-        return False
-
-
-    def lexZeroNumber(self, ch):
-        token = self.token
-        input = self.source
-        token.type = "number"
-
-        ch = input[self.cursor]
-        self.cursor += 1
-        if ch == ".":
-            while(True):
-                ch = input[self.cursor]
-                self.cursor += 1
-                if not (ch >= "0" and ch <= "9"):
-                    break
-                
-            self.cursor -= 1
-            self.lexExponent()
-            token.value = input[token.start:self.cursor]
-            
-        elif ch == "x" or ch == "X":
-            while(True):
-                ch = input[self.cursor]
-                self.cursor += 1
-                if not ((ch >= "0" and ch <= "9") or (ch >= "a" and ch <= "f") or (ch >= "A" and ch <= "F")):
-                    break
-                    
-            self.cursor -= 1
-            token.value = input[token.start:self.cursor]
-
-        elif ch >= "0" and ch <= "7":
-            while(True):
-                ch = input[self.cursor]
-                self.cursor += 1
-                if not (ch >= "0" and ch <= "7"):
-                    break
-                    
-            self.cursor -= 1
-            token.value = input[token.start:self.cursor]
-
-        else:
-            self.cursor -= 1
-            self.lexExponent()     # 0E1, &c.
-            token.value = 0
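# --- Annotation (not part of the deleted file): hex, octal and fractional
# zero-prefixed literals keep their source text as the token value; a bare 0
# becomes the integer 0. The trailing ';' avoids an IndexError the scanner
# hits on numbers at end-of-input:
from jasy.js.tokenize.Tokenizer import Tokenizer

for src in ("0x1F;", "017;", "0.5;", "0;"):
    t = Tokenizer(src, fileId="demo.js")
    t.get()
    print(t.token.type, repr(t.token.value))
# number '0x1F' / number '017' / number '0.5' / number 0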
-    
-
-    def lexNumber(self, ch):
-        token = self.token
-        input = self.source
-        token.type = "number"
-
-        floating = False
-        while(True):
-            ch = input[self.cursor]
-            self.cursor += 1
-            
-            if ch == "." and not floating:
-                floating = True
-                ch = input[self.cursor]
-                self.cursor += 1
-                
-            if not (ch >= "0" and ch <= "9"):
-                break
-
-        self.cursor -= 1
-
-        exponent = self.lexExponent()
-        segment = input[token.start:self.cursor]
-        
-        # Protect float or exponent numbers
-        if floating or exponent:
-            token.value = segment
-        else:
-            token.value = int(segment)
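# --- Annotation (not part of the deleted file): plain integers are converted
# to int, while floats and exponent forms keep their source text:
from jasy.js.tokenize.Tokenizer import Tokenizer

t = Tokenizer("42 3.14 2e10;", fileId="demo.js")
for expected in (42, "3.14", "2e10"):
    t.get()
    assert t.token.value == expected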
-
-
-    def lexDot(self, ch):
-        token = self.token
-        input = self.source
-        next = input[self.cursor]
-        
-        if next >= "0" and next <= "9":
-            while (True):
-                ch = input[self.cursor]
-                self.cursor += 1
-                if not (ch >= "0" and ch <= "9"):
-                    break
-
-            self.cursor -= 1
-            self.lexExponent()
-
-            token.type = "number"
-            token.value = input[token.start:self.cursor]
-
-        else:
-            token.type = "dot"
-
-
-    def lexString(self, ch):
-        token = self.token
-        input = self.source
-        token.type = "string"
-
-        hasEscapes = False
-        delim = ch
-        ch = input[self.cursor]
-        self.cursor += 1
-        while ch != delim:
-            if ch == "\\":
-                hasEscapes = True
-                self.cursor += 1
-
-            ch = input[self.cursor]
-            self.cursor += 1
-
-        if hasEscapes:
-            token.value = eval(input[token.start:self.cursor])
-        else:
-            token.value = input[token.start+1:self.cursor-1]
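# --- Annotation (not part of the deleted file): the eval() above decodes
# backslash escapes by evaluating the quoted slice as a Python literal. The
# scan loop guarantees the slice is exactly one delimited literal, but a more
# defensive variant could use ast.literal_eval, which rejects anything that
# is not a plain literal (hypothetical substitute; JS and Python escape
# semantics also differ at the margins):
import ast
decoded = ast.literal_eval("'it\\'s'")   # -> "it's"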
-
-
-    def lexRegExp(self, ch):
-        token = self.token
-        input = self.source
-        token.type = "regexp"
-
-        while (True):
-            try:
-                ch = input[self.cursor]
-                self.cursor += 1
-            except IndexError:
-                raise ParseError("Unterminated regex", self.fileId, self.line)
-
-            if ch == "\\":
-                self.cursor += 1
-                
-            elif ch == "[":
-                while (True):
-                    if ch == "\\":
-                        self.cursor += 1
-
-                    try:
-                        ch = input[self.cursor]
-                        self.cursor += 1
-                    except IndexError:
-                        raise ParseError("Unterminated character class", self.fileId, self.line)
-                    
-                    if ch == "]":
-                        break
-                    
-            if ch == "/":
-                break
-
-        while(True):
-            ch = input[self.cursor]
-            self.cursor += 1
-            if not (ch >= "a" and ch <= "z"):
-                break
-
-        self.cursor -= 1
-        token.value = input[token.start:self.cursor]
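# --- Annotation (not part of the deleted file): '/' is ambiguous in
# JavaScript; the parser passes scanOperand=True in operand position so the
# tokenizer reads a regexp literal rather than the division operator:
from jasy.js.tokenize.Tokenizer import Tokenizer

t = Tokenizer("a = /b/g;", fileId="demo.js")
t.get(); t.get()                     # identifier, assign
assert t.get(scanOperand=True) == "regexp"
assert t.token.value == "/b/g"

t = Tokenizer("a / b;", fileId="demo.js")
t.get()
assert t.get() == "div"              # operator position: plain division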
-    
-
-    def lexOp(self, ch):
-        token = self.token
-        input = self.source
-
-        op = ch
-        while(True):
-            try:
-                next = input[self.cursor]
-            except IndexError:
-                break
-                
-            if (op + next) in operatorNames:
-                self.cursor += 1
-                op += next
-            else:
-                break
-        
-        try:
-            next = input[self.cursor]
-        except IndexError:
-            next = None
-
-        if next == "=" and op in assignOperators:
-            self.cursor += 1
-            token.type = "assign"
-            token.assignOp = operatorNames[op]
-            op += "="
-            
-        else:
-            token.type = operatorNames[op]
-            token.assignOp = None
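# --- Annotation (not part of the deleted file): lexOp scans greedily, which
# works because every prefix of a multi-character operator is itself a token;
# a trailing '=' on an assignable operator folds into one "assign" token:
from jasy.js.tokenize.Tokenizer import Tokenizer

t = Tokenizer("x >>>= 1;", fileId="demo.js")
t.get()                              # identifier 'x'
assert t.get() == "assign"
assert t.token.assignOp == "ursh"    # '>>>' scanned first, then the '='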
-
-
-    # FIXME: Unicode escape sequences
-    # FIXME: Unicode identifiers
-    def lexIdent(self, ch):
-        token = self.token
-        input = self.source
-
-        try:
-            while True:
-                ch = input[self.cursor]
-                self.cursor += 1
-            
-                if not ((ch >= "a" and ch <= "z") or (ch >= "A" and ch <= "Z") or (ch >= "0" and ch <= "9") or ch == "$" or ch == "_"):
-                    break
-                    
-        except IndexError:
-            # Hit end of input: bump the cursor so the shared decrement
-            # below leaves it just past the identifier.
-            self.cursor += 1
-        
-        # Put the non-word character back.
-        self.cursor -= 1
-
-        identifier = input[token.start:self.cursor]
-        if identifier in Lang.keywords:
-            token.type = identifier
-        else:
-            token.type = "identifier"
-            token.value = identifier
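# --- Annotation (not part of the deleted file): keywords become the token
# type itself; only plain identifiers carry a value (assuming Lang.keywords
# holds the usual JavaScript reserved words):
from jasy.js.tokenize.Tokenizer import Tokenizer

t = Tokenizer("var varName;", fileId="demo.js")
assert t.get() == "var"              # keyword: the type is the word itself
assert t.get() == "identifier"
assert t.token.value == "varName"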
-
-
-    def get(self, scanOperand=False):
-        """ 
-        It consumes input *only* if there is no lookahead.
-        Dispatches to the appropriate lexing function depending on the input.
-        """
-        while self.lookahead:
-            self.lookahead -= 1
-            self.tokenIndex = (self.tokenIndex + 1) & 3
-            token = self.tokens[self.tokenIndex]
-            if token.type != "newline" or self.scanNewlines:
-                return token.type
-
-        self.skip()
-
-        self.tokenIndex = (self.tokenIndex + 1) & 3
-        self.tokens[self.tokenIndex] = token = Token()
-
-        token.start = self.cursor
-        token.line = self.line
-
-        input = self.source
-        if self.cursor == len(input):
-            token.end = token.start
-            token.type = "end"
-            return token.type
-
-        ch = input[self.cursor]
-        self.cursor += 1
-        
-        if (ch >= "a" and ch <= "z") or (ch >= "A" and ch <= "Z") or ch == "$" or ch == "_":
-            self.lexIdent(ch)
-        
-        elif scanOperand and ch == "/":
-            self.lexRegExp(ch)
-        
-        elif ch == ".":
-            self.lexDot(ch)
-
-        elif self.scanNewlines and ch == "\n":
-            token.type = "newline"
-            self.line += 1
-
-        elif ch in operatorNames:
-            self.lexOp(ch)
-        
-        elif ch >= "1" and ch <= "9":
-            self.lexNumber(ch)
-        
-        elif ch == "0":
-            self.lexZeroNumber(ch)
-        
-        elif ch == '"' or ch == "'":
-            self.lexString(ch)
-        
-        else:
-            raise ParseError("Illegal token: %s (Code: %s)" % (ch, ord(ch)), self.fileId, self.line)
-
-        token.end = self.cursor
-        return token.type
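# --- Annotation (not part of the deleted file): a typical driver loop over
# done()/get(), printing each token as it is consumed:
from jasy.js.tokenize.Tokenizer import Tokenizer

t = Tokenizer("var x = 1;", fileId="demo.js")
while not t.done():
    t.get()
    print(t.token.type, getattr(t.token, "value", ""))
# var / identifier x / assign / number 1 / semicolon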
-        
-
-    def unget(self):
-        """ Match depends on unget returning undefined."""
-        self.lookahead += 1
-        
-        if self.lookahead == 4: 
-            raise ParseError("PANIC: too much lookahead!", self.fileId, self.line)
-        
-        self.tokenIndex = (self.tokenIndex - 1) & 3
-        
-    
-    def save(self):
-        return {
-            "cursor" : self.cursor,
-            "tokenIndex": self.tokenIndex,
-            "tokens": copy.copy(self.tokens),
-            "lookahead": self.lookahead,
-            "scanNewlines": self.scanNewlines,
-            "line": self.line
-        }
-
-    
-    def rewind(self, point):
-        self.cursor = point["cursor"]
-        self.tokenIndex = point["tokenIndex"]
-        self.tokens = copy.copy(point["tokens"])
-        self.lookahead = point["lookahead"]
-        self.scanNewlines = point["scanNewlines"]
-        self.line = point["line"]
