1 # |
|
2 # Jasy - Web Tooling Framework |
|
3 # Copyright 2010-2012 Zynga Inc. |
|
4 # |
|
5 |
|
6 # |
|
7 # License: MPL 1.1/GPL 2.0/LGPL 2.1 |
|
8 # Authors: |
|
9 # - Brendan Eich <brendan@mozilla.org> (Original JavaScript) (2004-2010) |
|
10 # - Sebastian Werner <info@sebastian-werner.net> (Python Port) (2010) |
|
11 # |
|
12 |
|
13 from __future__ import unicode_literals |
|
14 |
|
15 import copy |
|
16 |
|
17 import jasy.js.tokenize.Lang as Lang |
|
18 import jasy.js.api.Comment as Comment |
|
19 import jasy.core.Console as Console |
|
20 |
|
# Explicit public API of this module
__all__ = [ "Tokenizer" ]


# Operator and punctuator mapping from token to tree node type name.
# NB: because the lexer doesn't backtrack, all token prefixes must themselves
# be valid tokens (e.g. !== is acceptable because its prefixes are the valid
# tokens != and !).
operatorNames = {
    '<' : 'lt',
    '>' : 'gt',
    '<=' : 'le',
    '>=' : 'ge',
    '!=' : 'ne',
    '!' : 'not',
    '==' : 'eq',
    '===' : 'strict_eq',
    '!==' : 'strict_ne',

    '>>' : 'rsh',
    '<<' : 'lsh',
    '>>>' : 'ursh',

    '+' : 'plus',
    '*' : 'mul',
    '-' : 'minus',
    '/' : 'div',
    '%' : 'mod',

    ',' : 'comma',
    ';' : 'semicolon',
    ':' : 'colon',
    '=' : 'assign',
    '?' : 'hook',

    '&&' : 'and',
    '||' : 'or',

    '++' : 'increment',
    '--' : 'decrement',

    ')' : 'right_paren',
    '(' : 'left_paren',
    '[' : 'left_bracket',
    ']' : 'right_bracket',
    '{' : 'left_curly',
    '}' : 'right_curly',

    '&' : 'bitwise_and',
    '^' : 'bitwise_xor',
    '|' : 'bitwise_or',
    '~' : 'bitwise_not'
}


# Operators that may be combined with a trailing "=" to form a compound
# assignment such as "+=", "<<=" or "|=" (consumed by Tokenizer.lexOp)
assignOperators = ["|", "^", "&", "<<", ">>", ">>>", "+", "-", "*", "/", "%"]
|
77 |
|
78 |
|
79 |
|
80 |
|
81 # |
|
82 # Classes |
|
83 # |
|
84 |
|
class Token:
    """
    Lightweight record describing one scanned token.

    The fixed attribute set (via __slots__) keeps per-instance memory low,
    since a token object is created for every token scanned from the source.
    """

    __slots__ = ("type", "start", "line", "assignOp", "end", "value")
|
87 |
|
88 |
|
class ParseError(Exception):
    """Raised when the tokenizer encounters invalid or unterminated input."""

    def __init__(self, message, fileId, line):
        # Embed the file name and line number so the error message is
        # self-describing wherever it surfaces.
        details = "Syntax error: {0}\n{1}:{2}".format(message, fileId, line)
        Exception.__init__(self, details)
|
92 |
|
93 |
|
class Tokenizer(object):
    """
    JavaScript tokenizer with up to three tokens of lookahead.

    Already scanned tokens are kept in a four slot ring buffer (self.tokens)
    indexed by self.tokenIndex; self.lookahead counts how many of them are
    still waiting to be (re-)delivered by get(). Comments are collected on
    self.comments while scanning and can be fetched via getComments().
    """

    def __init__(self, source, fileId="", line=1):
        # source: JavaScript source
        # fileId: Filename (for debugging purposes)
        # line: Line number (for debugging purposes)
        self.cursor = 0
        self.source = str(source)
        self.tokens = {}
        self.tokenIndex = 0
        self.lookahead = 0
        self.scanNewlines = False
        self.fileId = fileId
        self.line = line
        self.comments = []

    # Remaining (not yet consumed) input
    input_ = property(lambda self: self.source[self.cursor:])

    # Current token (None before the first get())
    token = property(lambda self: self.tokens.get(self.tokenIndex))


    def done(self):
        """Returns True once the complete input has been consumed."""
        # We need to set scanOperand to true here because the first thing
        # might be a regexp.
        return self.peek(True) == "end"


    def match(self, tokenType, scanOperand=False):
        """
        Consumes the next token and returns True when it matches tokenType.
        Otherwise the token is pushed back and a falsy value (None) is returned.
        """
        return self.get(scanOperand) == tokenType or self.unget()


    def mustMatch(self, tokenType):
        """
        Like match(), but raises a ParseError when the next token does not
        match tokenType. Returns the matched token.
        """
        if not self.match(tokenType):
            raise ParseError("Missing " + tokenType, self.fileId, self.line)

        return self.token


    def peek(self, scanOperand=False):
        """Returns the type of the next token without consuming it."""
        if self.lookahead:
            nextToken = self.tokens.get((self.tokenIndex + self.lookahead) & 3)
            if self.scanNewlines and (getattr(nextToken, "line", None) != getattr(self, "line", None)):
                tokenType = "newline"
            else:
                tokenType = getattr(nextToken, "type", None)
        else:
            tokenType = self.get(scanOperand)
            self.unget()

        return tokenType


    def peekOnSameLine(self, scanOperand=False):
        """
        Like peek(), but reports "newline" when the next token starts on a
        following line (used for automatic semicolon insertion rules).
        """
        self.scanNewlines = True
        tokenType = self.peek(scanOperand)
        self.scanNewlines = False
        return tokenType


    def getComments(self):
        """
        Returns the comments collected since the last call (or None when
        there are none) and resets the collector.
        """
        if self.comments:
            comments = self.comments
            self.comments = []
            return comments

        return None


    def skip(self):
        """Eats comments and whitespace, collecting comments on self.comments."""
        source = self.source
        startLine = self.line

        # Whether this is the first call, as happens when starting to parse a
        # file (eat leading comments/white space).
        # Bugfix: was "self.cursor is 0" — identity comparison with an int
        # literal is implementation-dependent; use equality.
        startOfFile = self.cursor == 0

        indent = ""

        while True:
            if len(source) > self.cursor:
                ch = source[self.cursor]
            else:
                return

            self.cursor += 1

            if len(source) > self.cursor:
                nextCh = source[self.cursor]
            else:
                nextCh = None

            if ch == "\n" and not self.scanNewlines:
                self.line += 1
                indent = ""

            elif ch == "/" and nextCh == "*":
                # Multi line comment "/* ... */"
                self.cursor += 1
                text = "/*"
                commentStartLine = self.line
                if startLine == self.line and not startOfFile:
                    mode = "inline"
                elif (self.line-1) > startLine:
                    # distance before this comment means it is a comment block for a whole section (multiple lines of code)
                    mode = "section"
                else:
                    # comment for maybe multiple following lines of code, but not that important (no visual white space divider)
                    mode = "block"

                while True:
                    try:
                        ch = source[self.cursor]
                        self.cursor += 1
                    except IndexError:
                        raise ParseError("Unterminated comment", self.fileId, self.line)

                    if ch == "*":
                        # Guard against "*" being the very last character
                        try:
                            nextCh = source[self.cursor]
                        except IndexError:
                            raise ParseError("Unterminated comment", self.fileId, self.line)

                        if nextCh == "/":
                            text += "*/"
                            self.cursor += 1
                            break

                    elif ch == "\n":
                        self.line += 1

                    text += ch

                # Filter escaping on slash-star combinations in comment text
                # (explicit backslash: the old "*\/" relied on an invalid
                # escape sequence silently keeping the backslash)
                text = text.replace("*\\/", "*/")

                try:
                    self.comments.append(Comment.Comment(text, mode, commentStartLine, indent, self.fileId))
                except Comment.CommentException as commentError:
                    Console.error("Ignoring comment in %s: %s", self.fileId, commentError)

            elif ch == "/" and nextCh == "/":
                # Single line comment "// ..."
                self.cursor += 1
                text = "//"
                if startLine == self.line and not startOfFile:
                    mode = "inline"
                elif (self.line-1) > startLine:
                    # distance before this comment means it is a comment block for a whole section (multiple lines of code)
                    mode = "section"
                else:
                    # comment for maybe multiple following lines of code, but not that important (no visual white space divider)
                    mode = "block"

                while True:
                    try:
                        ch = source[self.cursor]
                        self.cursor += 1
                    except IndexError:
                        # end of file etc.
                        break

                    if ch == "\n":
                        self.line += 1
                        break

                    text += ch

                try:
                    self.comments.append(Comment.Comment(text, mode, self.line-1, "", self.fileId))
                except Comment.CommentException as commentError:
                    # Bugfix: the exception instance was previously not bound
                    # ("except ... :" without "as"), so referencing it below
                    # raised a NameError instead of logging the problem.
                    Console.error("Ignoring comment in %s: %s", self.fileId, commentError)

            # check for whitespace, also for special cases like 0xA0
            elif ch in "\xA0 \t":
                indent += ch

            else:
                # Neither whitespace nor comment: put the character back
                self.cursor -= 1
                return


    def lexExponent(self):
        """
        Lexes the exponential part of a number, if present. Returns True if an
        exponential part was found.
        """
        source = self.source
        if self.cursor >= len(source):
            # Number ends exactly at the end of the input: no exponent
            return False

        nextCh = source[self.cursor]
        if nextCh == "e" or nextCh == "E":
            self.cursor += 1
            try:
                ch = source[self.cursor]
                self.cursor += 1
                if ch == "+" or ch == "-":
                    ch = source[self.cursor]
                    self.cursor += 1
            except IndexError:
                # "1e" or "1e+" at the very end of the input
                raise ParseError("Missing exponent", self.fileId, self.line)

            if ch < "0" or ch > "9":
                raise ParseError("Missing exponent", self.fileId, self.line)

            # Consume the remaining exponent digits (EOF-safe)
            while self.cursor < len(source):
                ch = source[self.cursor]
                if not ("0" <= ch <= "9"):
                    break
                self.cursor += 1

            return True

        return False


    def lexZeroNumber(self, ch):
        """Lexes a number starting with "0": float, hexadecimal, octal or plain zero."""
        token = self.token
        source = self.source
        token.type = "number"

        if self.cursor >= len(source):
            # Lone "0" at the very end of the input
            token.value = 0
            return

        ch = source[self.cursor]
        self.cursor += 1
        if ch == ".":
            # Float such as "0.123": consume fraction digits (EOF-safe)
            while self.cursor < len(source):
                ch = source[self.cursor]
                if not ("0" <= ch <= "9"):
                    break
                self.cursor += 1

            self.lexExponent()
            # Kept as string to protect precision
            token.value = source[token.start:self.cursor]

        elif ch == "x" or ch == "X":
            # Hexadecimal: consume hex digits (EOF-safe)
            while self.cursor < len(source):
                ch = source[self.cursor]
                if not (("0" <= ch <= "9") or ("a" <= ch <= "f") or ("A" <= ch <= "F")):
                    break
                self.cursor += 1

            # Kept as string, e.g. "0xFF"
            token.value = source[token.start:self.cursor]

        elif "0" <= ch <= "7":
            # Octal: consume octal digits (EOF-safe)
            while self.cursor < len(source):
                ch = source[self.cursor]
                if not ("0" <= ch <= "7"):
                    break
                self.cursor += 1

            # Kept as string, e.g. "017"
            token.value = source[token.start:self.cursor]

        else:
            self.cursor -= 1
            self.lexExponent()  # 0E1, &c.
            token.value = 0


    def lexNumber(self, ch):
        """Lexes a decimal number (integer or float) starting with 1-9."""
        token = self.token
        source = self.source
        token.type = "number"

        floating = False
        length = len(source)
        while self.cursor < length:
            ch = source[self.cursor]
            self.cursor += 1

            if ch == "." and not floating:
                # First dot: switch to float mode and look at the next char
                floating = True
                if self.cursor >= length:
                    # Trailing "." at end of input, e.g. "1."
                    break
                ch = source[self.cursor]
                self.cursor += 1

            if not ("0" <= ch <= "9"):
                # Put the non-number character back
                self.cursor -= 1
                break

        exponent = self.lexExponent()
        segment = source[token.start:self.cursor]

        # Protect float or exponent numbers from precision loss by keeping
        # them as strings
        if floating or exponent:
            token.value = segment
        else:
            token.value = int(segment)


    def lexDot(self, ch):
        """Lexes a token starting with ".": a float such as ".5" or the dot operator."""
        token = self.token
        source = self.source

        if self.cursor < len(source) and "0" <= source[self.cursor] <= "9":
            # Fraction digits (EOF-safe)
            while self.cursor < len(source):
                ch = source[self.cursor]
                if not ("0" <= ch <= "9"):
                    break
                self.cursor += 1

            self.lexExponent()

            token.type = "number"
            # Kept as string to protect precision
            token.value = source[token.start:self.cursor]

        else:
            token.type = "dot"


    def lexString(self, ch):
        """Lexes a single or double quoted string literal."""
        token = self.token
        source = self.source
        token.type = "string"

        hasEscapes = False
        delim = ch
        while True:
            try:
                ch = source[self.cursor]
            except IndexError:
                # Bugfix: previously ran off the end with a raw IndexError
                raise ParseError("Unterminated string", self.fileId, self.line)
            self.cursor += 1

            if ch == delim:
                break

            if ch == "\\":
                # Skip the escaped character so an escaped delimiter does not
                # terminate the string
                hasEscapes = True
                self.cursor += 1

        if hasEscapes:
            # NOTE(review): eval() decodes the escape sequences by executing
            # the literal as Python — unsafe for untrusted input; a dedicated
            # unescape routine would be safer. Kept for behavior compatibility.
            token.value = eval(source[token.start:self.cursor])
        else:
            token.value = source[token.start+1:self.cursor-1]


    def lexRegExp(self, ch):
        """Lexes a regular expression literal, including its trailing flags."""
        token = self.token
        source = self.source
        token.type = "regexp"

        while True:
            try:
                ch = source[self.cursor]
                self.cursor += 1
            except IndexError:
                raise ParseError("Unterminated regex", self.fileId, self.line)

            if ch == "\\":
                # Skip escaped character
                self.cursor += 1

            elif ch == "[":
                # Character class: "/" inside it does not end the regexp
                while True:
                    if ch == "\\":
                        self.cursor += 1

                    try:
                        ch = source[self.cursor]
                        self.cursor += 1
                    except IndexError:
                        raise ParseError("Unterminated character class", self.fileId, self.line)

                    if ch == "]":
                        break

            if ch == "/":
                break

        # Regexp flags such as "g", "i", "m" (EOF-safe)
        while self.cursor < len(source):
            ch = source[self.cursor]
            if not ("a" <= ch <= "z"):
                break
            self.cursor += 1

        token.value = source[token.start:self.cursor]


    def lexOp(self, ch):
        """
        Lexes an operator/punctuator, greedily matching the longest operator
        and recognizing compound assignments such as "+=" or "<<=".
        """
        token = self.token
        source = self.source

        # Greedily grow the operator while the extension is itself a valid
        # operator (the table guarantees all prefixes are valid tokens)
        op = ch
        while True:
            try:
                nextCh = source[self.cursor]
            except IndexError:
                break

            if (op + nextCh) in operatorNames:
                self.cursor += 1
                op += nextCh
            else:
                break

        try:
            nextCh = source[self.cursor]
        except IndexError:
            nextCh = None

        if nextCh == "=" and op in assignOperators:
            # Compound assignment: token type is "assign", the base operator
            # is carried on token.assignOp
            self.cursor += 1
            token.type = "assign"
            token.assignOp = operatorNames[op]
            op += "="

        else:
            token.type = operatorNames[op]
            token.assignOp = None


    # FIXME: Unicode escape sequences
    # FIXME: Unicode identifiers
    def lexIdent(self, ch):
        """Lexes an identifier or keyword (ASCII letters, digits, "$" and "_")."""
        token = self.token
        source = self.source

        # Consume word characters (EOF-safe)
        while self.cursor < len(source):
            ch = source[self.cursor]
            if not (("a" <= ch <= "z") or ("A" <= ch <= "Z") or ("0" <= ch <= "9") or ch == "$" or ch == "_"):
                break
            self.cursor += 1

        identifier = source[token.start:self.cursor]
        if identifier in Lang.keywords:
            # Keywords use their own name as token type
            token.type = identifier
        else:
            token.type = "identifier"
            token.value = identifier


    def get(self, scanOperand=False):
        """
        It consumes input *only* if there is no lookahead.
        Dispatches to the appropriate lexing function depending on the input.
        """
        # Serve already scanned tokens from the lookahead ring buffer first
        while self.lookahead:
            self.lookahead -= 1
            self.tokenIndex = (self.tokenIndex + 1) & 3
            token = self.tokens[self.tokenIndex]
            if token.type != "newline" or self.scanNewlines:
                return token.type

        self.skip()

        self.tokenIndex = (self.tokenIndex + 1) & 3
        self.tokens[self.tokenIndex] = token = Token()

        token.start = self.cursor
        token.line = self.line

        source = self.source
        if self.cursor >= len(source):
            token.end = token.start
            token.type = "end"
            return token.type

        ch = source[self.cursor]
        self.cursor += 1

        if ("a" <= ch <= "z") or ("A" <= ch <= "Z") or ch == "$" or ch == "_":
            self.lexIdent(ch)

        elif scanOperand and ch == "/":
            # Only in operand position a slash starts a regexp literal
            self.lexRegExp(ch)

        elif ch == ".":
            self.lexDot(ch)

        elif self.scanNewlines and ch == "\n":
            token.type = "newline"
            self.line += 1

        elif ch in operatorNames:
            self.lexOp(ch)

        elif "1" <= ch <= "9":
            self.lexNumber(ch)

        elif ch == "0":
            self.lexZeroNumber(ch)

        elif ch == '"' or ch == "'":
            self.lexString(ch)

        else:
            raise ParseError("Illegal token: %s (Code: %s)" % (ch, ord(ch)), self.fileId, self.line)

        token.end = self.cursor
        return token.type


    def unget(self):
        """ Match depends on unget returning undefined."""
        self.lookahead += 1

        if self.lookahead == 4:
            # The ring buffer only holds four tokens
            raise ParseError("PANIC: too much lookahead!", self.fileId, self.line)

        self.tokenIndex = (self.tokenIndex - 1) & 3


    def save(self):
        """Captures the complete tokenizer state for a later rewind()."""
        return {
            "cursor" : self.cursor,
            "tokenIndex": self.tokenIndex,
            "tokens": copy.copy(self.tokens),
            "lookahead": self.lookahead,
            "scanNewlines": self.scanNewlines,
            "line": self.line
        }


    def rewind(self, point):
        """Restores the tokenizer state previously captured by save()."""
        self.cursor = point["cursor"]
        self.tokenIndex = point["tokenIndex"]
        self.tokens = copy.copy(point["tokens"])
        self.lookahead = point["lookahead"]
        # Bugfix: save() stores this flag under "scanNewlines" and the
        # attribute is self.scanNewlines — the old code read the non-existent
        # key "scanNewline" (KeyError) and assigned a misspelled attribute.
        self.scanNewlines = point["scanNewlines"]
        self.line = point["line"]
|