#
# Jasy - Web Tooling Framework
# Copyright 2010-2012 Zynga Inc.
#

#
# License: MPL 1.1/GPL 2.0/LGPL 2.1
# Authors:
# - Brendan Eich <brendan@mozilla.org> (Original JavaScript) (2004-2010)
# - Sebastian Werner <info@sebastian-werner.net> (Python Port) (2010)
#

import copy

import jasy.js.tokenize.Lang as Lang
import jasy.js.api.Comment as Comment
import jasy.core.Console as Console

__all__ = [ "Tokenizer" ]


# Operator and punctuator mapping from token to tree node type name.
# NB: because the lexer doesn't backtrack, all token prefixes must themselves
# be valid tokens (e.g. !== is acceptable because its prefixes are the valid
# tokens != and !).
operatorNames = {
    '<'   : 'lt',
    '>'   : 'gt',
    '<='  : 'le',
    '>='  : 'ge',
    '!='  : 'ne',
    '!'   : 'not',
    '=='  : 'eq',
    '===' : 'strict_eq',
    '!==' : 'strict_ne',

    '>>'  : 'rsh',
    '<<'  : 'lsh',
    '>>>' : 'ursh',

    '+'   : 'plus',
    '*'   : 'mul',
    '-'   : 'minus',
    '/'   : 'div',
    '%'   : 'mod',

    ','   : 'comma',
    ';'   : 'semicolon',
    ':'   : 'colon',
    '='   : 'assign',
    '?'   : 'hook',

    '&&'  : 'and',
    '||'  : 'or',

    '++'  : 'increment',
    '--'  : 'decrement',

    ')'   : 'right_paren',
    '('   : 'left_paren',
    '['   : 'left_bracket',
    ']'   : 'right_bracket',
    '{'   : 'left_curly',
    '}'   : 'right_curly',

    '&'   : 'bitwise_and',
    '^'   : 'bitwise_xor',
    '|'   : 'bitwise_or',
    '~'   : 'bitwise_not'
}
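
# Note: lexOp() grows an operator one character at a time and only continues
# while the longer string is itself a key here, so e.g. "<<=" scans as "<<"
# plus a trailing "=" (folded into one "assign" token by lexOp()).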


# Assignment operators
assignOperators = ["|", "^", "&", "<<", ">>", ">>>", "+", "-", "*", "/", "%"]
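# lexOp() combines each of these with a trailing "=" into a single compound
# assignment token: type "assign", with assignOp set to the base operator
# name (e.g. "+=" yields assignOp "plus").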



#
# Classes
#

class Token:
    __slots__ = ["type", "start", "line", "assignOp", "end", "value"]
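    # Slot semantics: type is the node type name; start/end are cursor
    # offsets into the source; line is the token's source line; value holds
    # literal/identifier text; assignOp names the base operator of a
    # compound assignment (None for plain operators).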


class ParseError(Exception):
    def __init__(self, message, fileId, line):
        Exception.__init__(self, "Syntax error: %s\n%s:%s" % (message, fileId, line))


class Tokenizer(object):
    def __init__(self, source, fileId="", line=1):
        # source: JavaScript source
        # fileId: Filename (for debugging purposes)
        # line: Line number (for debugging purposes)
        self.cursor = 0
        self.source = str(source)
        # self.tokens is a four-slot ring buffer indexed by tokenIndex & 3;
        # lookahead counts tokens pushed back via unget() and not yet re-read.
        self.tokens = {}
        self.tokenIndex = 0
        self.lookahead = 0
        self.scanNewlines = False
        self.fileId = fileId
        self.line = line
        self.comments = []

    input_ = property(lambda self: self.source[self.cursor:])
    token = property(lambda self: self.tokens.get(self.tokenIndex))


    def done(self):
        # We need to set scanOperand to true here because the first thing
        # might be a regexp.
        return self.peek(True) == "end"


    def match(self, tokenType, scanOperand=False):
        return self.get(scanOperand) == tokenType or self.unget()


    def mustMatch(self, tokenType):
        if not self.match(tokenType):
            raise ParseError("Missing " + tokenType, self.fileId, self.line)

        return self.token


    def peek(self, scanOperand=False):
        if self.lookahead:
            next = self.tokens.get((self.tokenIndex + self.lookahead) & 3)
            if self.scanNewlines and (getattr(next, "line", None) != getattr(self, "line", None)):
                tokenType = "newline"
            else:
                tokenType = getattr(next, "type", None)
        else:
            tokenType = self.get(scanOperand)
            self.unget()

        return tokenType


    def peekOnSameLine(self, scanOperand=False):
        self.scanNewlines = True
        tokenType = self.peek(scanOperand)
        self.scanNewlines = False
        return tokenType


    def getComments(self):
        if self.comments:
            comments = self.comments
            self.comments = []
            return comments

        return None


    def skip(self):
        """Eats comments and whitespace."""
        input = self.source
        startLine = self.line

        # Whether this is the first call, as happens when starting to parse a
        # file (eating leading comments/whitespace)
        startOfFile = self.cursor == 0

        indent = ""

        while True:
            if len(input) > self.cursor:
                ch = input[self.cursor]
            else:
                return

            self.cursor += 1

            if len(input) > self.cursor:
                next = input[self.cursor]
            else:
                next = None

            if ch == "\n" and not self.scanNewlines:
                self.line += 1
                indent = ""

            elif ch == "/" and next == "*":
                self.cursor += 1
                text = "/*"
                commentStartLine = self.line
                if startLine == self.line and not startOfFile:
                    mode = "inline"
                elif (self.line - 1) > startLine:
                    # Blank line(s) before this comment mean it documents a
                    # whole section (multiple lines of code)
                    mode = "section"
                else:
                    # Comment for possibly multiple following lines of code,
                    # but less significant (no blank-line divider above)
                    mode = "block"

                while True:
                    try:
                        ch = input[self.cursor]
                        self.cursor += 1
                    except IndexError:
                        raise ParseError("Unterminated comment", self.fileId, self.line)

                    if ch == "*":
                        next = input[self.cursor]
                        if next == "/":
                            text += "*/"
                            self.cursor += 1
                            break

                    elif ch == "\n":
                        self.line += 1

                    text += ch


                # Strip escaping from star-slash sequences in the comment text
                text = text.replace("*\/", "*/")

                try:
                    self.comments.append(Comment.Comment(text, mode, commentStartLine, indent, self.fileId))
                except Comment.CommentException as commentError:
                    Console.error("Ignoring comment in %s: %s", self.fileId, commentError)


            elif ch == "/" and next == "/":
                self.cursor += 1
                text = "//"
                if startLine == self.line and not startOfFile:
                    mode = "inline"
                elif (self.line - 1) > startLine:
                    # Blank line(s) before this comment mean it documents a
                    # whole section (multiple lines of code)
                    mode = "section"
                else:
                    # Comment for possibly multiple following lines of code,
                    # but less significant (no blank-line divider above)
                    mode = "block"

                while True:
                    try:
                        ch = input[self.cursor]
                        self.cursor += 1
                    except IndexError:
                        # End of file etc.
                        break

                    if ch == "\n":
                        self.line += 1
                        break

                    text += ch

                try:
                    self.comments.append(Comment.Comment(text, mode, self.line - 1, "", self.fileId))
                except Comment.CommentException as commentError:
                    Console.error("Ignoring comment in %s: %s", self.fileId, commentError)

            # Check for whitespace, including the non-breaking space 0xA0
            elif ch in "\xA0 \t":
                indent += ch

            else:
                self.cursor -= 1
                return


    # Lexes the exponential part of a number, if present. Returns True if an
    # exponential part was found.
    def lexExponent(self):
        input = self.source
        next = input[self.cursor]
        if next == "e" or next == "E":
            self.cursor += 1
            ch = input[self.cursor]
            self.cursor += 1
            if ch == "+" or ch == "-":
                ch = input[self.cursor]
                self.cursor += 1

            if ch < "0" or ch > "9":
                raise ParseError("Missing exponent", self.fileId, self.line)

            while True:
                ch = input[self.cursor]
                self.cursor += 1
                if not (ch >= "0" and ch <= "9"):
                    break

            self.cursor -= 1
            return True

        return False


    def lexZeroNumber(self, ch):
        token = self.token
        input = self.source
        token.type = "number"

        ch = input[self.cursor]
        self.cursor += 1
        if ch == ".":
            while True:
                ch = input[self.cursor]
                self.cursor += 1
                if not (ch >= "0" and ch <= "9"):
                    break

            self.cursor -= 1
            self.lexExponent()
            token.value = input[token.start:self.cursor]

        elif ch == "x" or ch == "X":
            while True:
                ch = input[self.cursor]
                self.cursor += 1
                if not ((ch >= "0" and ch <= "9") or (ch >= "a" and ch <= "f") or (ch >= "A" and ch <= "F")):
                    break

            self.cursor -= 1
            token.value = input[token.start:self.cursor]

        elif ch >= "0" and ch <= "7":
            while True:
                ch = input[self.cursor]
                self.cursor += 1
                if not (ch >= "0" and ch <= "7"):
                    break

            self.cursor -= 1
            token.value = input[token.start:self.cursor]

        else:
            self.cursor -= 1
            self.lexExponent()  # 0E1, etc.
            token.value = 0


    def lexNumber(self, ch):
        token = self.token
        input = self.source
        token.type = "number"

        floating = False
        while True:
            ch = input[self.cursor]
            self.cursor += 1

            if ch == "." and not floating:
                floating = True
                ch = input[self.cursor]
                self.cursor += 1

            if not (ch >= "0" and ch <= "9"):
                break

        self.cursor -= 1

        exponent = self.lexExponent()
        segment = input[token.start:self.cursor]

        # Keep float and exponent numbers as strings to preserve their notation
        if floating or exponent:
            token.value = segment
        else:
            token.value = int(segment)


    def lexDot(self, ch):
        token = self.token
        input = self.source
        next = input[self.cursor]

        if next >= "0" and next <= "9":
            while True:
                ch = input[self.cursor]
                self.cursor += 1
                if not (ch >= "0" and ch <= "9"):
                    break

            self.cursor -= 1
            self.lexExponent()

            token.type = "number"
            token.value = input[token.start:self.cursor]

        else:
            token.type = "dot"


    def lexString(self, ch):
        token = self.token
        input = self.source
        token.type = "string"

        hasEscapes = False
        delim = ch
        ch = input[self.cursor]
        self.cursor += 1
        while ch != delim:
            if ch == "\\":
                hasEscapes = True
                self.cursor += 1

            ch = input[self.cursor]
            self.cursor += 1

        if hasEscapes:
            # Let Python decode the escape sequences; this assumes the
            # JavaScript escapes used are also valid Python ones.
            token.value = eval(input[token.start:self.cursor])
        else:
            token.value = input[token.start+1:self.cursor-1]


    def lexRegExp(self, ch):
        token = self.token
        input = self.source
        token.type = "regexp"

        while True:
            try:
                ch = input[self.cursor]
                self.cursor += 1
            except IndexError:
                raise ParseError("Unterminated regex", self.fileId, self.line)

            if ch == "\\":
                self.cursor += 1

            elif ch == "[":
                while True:
                    if ch == "\\":
                        self.cursor += 1

                    try:
                        ch = input[self.cursor]
                        self.cursor += 1
                    except IndexError:
                        raise ParseError("Unterminated character class", self.fileId, self.line)

                    if ch == "]":
                        break

            if ch == "/":
                break

        # Scan the regexp flags (e.g. g, i, m)
        while True:
            ch = input[self.cursor]
            self.cursor += 1
            if not (ch >= "a" and ch <= "z"):
                break

        self.cursor -= 1
        token.value = input[token.start:self.cursor]


    def lexOp(self, ch):
        token = self.token
        input = self.source

        # Greedily match the longest known operator (maximal munch)
        op = ch
        while True:
            try:
                next = input[self.cursor]
            except IndexError:
                break

            if (op + next) in operatorNames:
                self.cursor += 1
                op += next
            else:
                break

        try:
            next = input[self.cursor]
        except IndexError:
            next = None

        if next == "=" and op in assignOperators:
            self.cursor += 1
            token.type = "assign"
            token.assignOp = operatorNames[op]
            op += "="

        else:
            token.type = operatorNames[op]
            token.assignOp = None


    # FIXME: Unicode escape sequences
    # FIXME: Unicode identifiers
    def lexIdent(self, ch):
        token = self.token
        input = self.source

        try:
            while True:
                ch = input[self.cursor]
                self.cursor += 1

                if not ((ch >= "a" and ch <= "z") or (ch >= "A" and ch <= "Z") or (ch >= "0" and ch <= "9") or ch == "$" or ch == "_"):
                    break

        except IndexError:
            # Hit end of input; compensate for the cursor decrement below.
            self.cursor += 1

        # Put the non-word character back.
        self.cursor -= 1

        identifier = input[token.start:self.cursor]
        if identifier in Lang.keywords:
            token.type = identifier
        else:
            token.type = "identifier"
            token.value = identifier


    def get(self, scanOperand=False):
        """
        Consumes input *only* if there is no lookahead.
        Dispatches to the appropriate lexing function depending on the input.
        """
        while self.lookahead:
            self.lookahead -= 1
            self.tokenIndex = (self.tokenIndex + 1) & 3
            token = self.tokens[self.tokenIndex]
            if token.type != "newline" or self.scanNewlines:
                return token.type

        self.skip()

        self.tokenIndex = (self.tokenIndex + 1) & 3
        self.tokens[self.tokenIndex] = token = Token()

        token.start = self.cursor
        token.line = self.line

        input = self.source
        if self.cursor == len(input):
            token.end = token.start
            token.type = "end"
            return token.type

        ch = input[self.cursor]
        self.cursor += 1

        if (ch >= "a" and ch <= "z") or (ch >= "A" and ch <= "Z") or ch == "$" or ch == "_":
            self.lexIdent(ch)

        elif scanOperand and ch == "/":
            self.lexRegExp(ch)

        elif ch == ".":
            self.lexDot(ch)

        elif self.scanNewlines and ch == "\n":
            token.type = "newline"
            self.line += 1

        elif ch in operatorNames:
            self.lexOp(ch)

        elif ch >= "1" and ch <= "9":
            self.lexNumber(ch)

        elif ch == "0":
            self.lexZeroNumber(ch)

        elif ch == '"' or ch == "'":
            self.lexString(ch)

        else:
            raise ParseError("Illegal token: %s (Code: %s)" % (ch, ord(ch)), self.fileId, self.line)

        token.end = self.cursor
        return token.type


    def unget(self):
        """match() depends on unget() returning None (i.e. falsy)."""
        self.lookahead += 1

        if self.lookahead == 4:
            raise ParseError("PANIC: too much lookahead!", self.fileId, self.line)

        self.tokenIndex = (self.tokenIndex - 1) & 3


    def save(self):
        return {
            "cursor": self.cursor,
            "tokenIndex": self.tokenIndex,
            "tokens": copy.copy(self.tokens),
            "lookahead": self.lookahead,
            "scanNewlines": self.scanNewlines,
            "line": self.line
        }


    def rewind(self, point):
        self.cursor = point["cursor"]
        self.tokenIndex = point["tokenIndex"]
        self.tokens = copy.copy(point["tokens"])
        self.lookahead = point["lookahead"]
        self.scanNewlines = point["scanNewlines"]
        self.line = point["line"]
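

# Minimal usage sketch (illustrative only; not part of the original module).
# Tokenizes a short snippet and prints each token type plus its value, e.g.
# var, identifier/answer, assign, number/42, semicolon, end.
if __name__ == "__main__":
    tokenizer = Tokenizer("var answer = 42;", fileId="demo.js")
    while not tokenizer.done():
        tokenizer.get(True)
        tok = tokenizer.token
        print(tok.type, getattr(tok, "value", ""))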