#
# Jasy - Web Tooling Framework
# Copyright 2010-2012 Zynga Inc.
# Copyright 2013-2014 Sebastian Werner
#

#
# License: MPL 1.1/GPL 2.0/LGPL 2.1
# Authors:
#   - Brendan Eich <brendan@mozilla.org> (Original JavaScript) (2004-2010)
#   - Sebastian Werner <info@sebastian-werner.net> (Python Port) (2010)
#

from __future__ import unicode_literals

import re, copy

import jasy.script.tokenize.Lang as Lang
import jasy.script.api.Comment as Comment
import jasy.core.Console as Console


# Operator and punctuator mapping from token to tree node type name.
# NB: because the lexer doesn't backtrack, all token prefixes must themselves
# be valid tokens (e.g. !== is acceptable because its prefixes are the valid
# tokens != and !).
operatorNames = {
    '<'   : 'lt',
    '>'   : 'gt',
    '<='  : 'le',
    '>='  : 'ge',
    '!='  : 'ne',
    '!'   : 'not',
    '=='  : 'eq',
    '===' : 'strict_eq',
    '!==' : 'strict_ne',

    '>>'  : 'rsh',
    '<<'  : 'lsh',
    '>>>' : 'ursh',

    '+'   : 'plus',
    '*'   : 'mul',
    '-'   : 'minus',
    '/'   : 'div',
    '%'   : 'mod',

    ','   : 'comma',
    ';'   : 'semicolon',
    ':'   : 'colon',
    '='   : 'assign',
    '?'   : 'hook',

    '&&'  : 'and',
    '||'  : 'or',

    '++'  : 'increment',
    '--'  : 'decrement',

    ')'   : 'right_paren',
    '('   : 'left_paren',
    '['   : 'left_bracket',
    ']'   : 'right_bracket',
    '{'   : 'left_curly',
    '}'   : 'right_curly',

    '&'   : 'bitwise_and',
    '^'   : 'bitwise_xor',
    '|'   : 'bitwise_or',
    '~'   : 'bitwise_not'
}


# Operators which combine with "=" into a compound assignment (e.g. "+=")
assignOperators = ["|", "^", "&", "<<", ">>", ">>>", "+", "-", "*", "/", "%"]
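
# Example (illustrative): lexOp() below extends an operator greedily, one
# character at a time, as long as the result is still a key of operatorNames.
# "!==" is therefore reached through its valid prefixes "!" -> "!=" -> "!==":
#
#     tokenizer = Tokenizer("a !== b")
#     tokenizer.get()   # "identifier" (a)
#     tokenizer.get()   # "strict_ne"  (!==)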


#
# Classes
#

class Token:
    __slots__ = ["type", "start", "line", "assignOp", "end", "value"]


class ParseError(Exception):
    def __init__(self, message, fileId, line):
        Exception.__init__(self, "Syntax error: %s\n%s:%s" % (message, fileId, line))


class Tokenizer(object):
    def __init__(self, source, fileId="", line=1):
        # source: JavaScript source code
        # fileId: File name (for debugging purposes)
        # line: Starting line number (for debugging purposes)
        self.cursor = 0
        self.source = str(source)
        self.tokens = {}
        self.tokenIndex = 0
        self.lookahead = 0
        self.scanNewlines = False
        self.fileId = fileId
        self.line = line
        self.comments = []

    input_ = property(lambda self: self.source[self.cursor:])
    token = property(lambda self: self.tokens.get(self.tokenIndex))

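    # Typical driver loop (sketch; assumes "var" is in Lang.keywords):
    #
    #     tokenizer = Tokenizer("var a = 1;", fileId="demo.js")
    #     while not tokenizer.done():
    #         tokenType = tokenizer.get()
    #         # tokenizer.token now holds the matching Token instance
    #
    # This yields the token types "var", "identifier", "assign", "number"
    # and "semicolon".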

    def done(self):
        # We need to pass scanOperand=True here because the first thing
        # might be a regexp.
        return self.peek(True) == "end"


    def match(self, tokenType, scanOperand=False):
        # Consumes the next token and returns True if it matches the given
        # type; otherwise ungets the token and returns None (falsy).
        return self.get(scanOperand) == tokenType or self.unget()


    def mustMatch(self, tokenType):
        if not self.match(tokenType):
            raise ParseError("Missing " + tokenType, self.fileId, self.line)

        return self.token

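    # Example (sketch): a parser consumes mandatory punctuation via
    #
    #     tokenizer.mustMatch("left_paren")    # raises ParseError if missing
    #
    # while match() is used for optional tokens:
    #
    #     if tokenizer.match("semicolon"):
    #         pass    # explicit statement terminator was present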

    def peek(self, scanOperand=False):
        if self.lookahead:
            next = self.tokens.get((self.tokenIndex + self.lookahead) & 3)
            if self.scanNewlines and (getattr(next, "line", None) != getattr(self, "line", None)):
                tokenType = "newline"
            else:
                tokenType = getattr(next, "type", None)
        else:
            tokenType = self.get(scanOperand)
            self.unget()

        return tokenType


    def peekOnSameLine(self, scanOperand=False):
        self.scanNewlines = True
        tokenType = self.peek(scanOperand)
        self.scanNewlines = False
        return tokenType


    def getComments(self):
        # Returns the comments collected since the last call and resets the
        # buffer, or None if no comments were collected.
        if self.comments:
            comments = self.comments
            self.comments = []
            return comments

        return None


    def skip(self):
        """Eats comments and whitespace."""
        input = self.source
        startLine = self.line

        # Whether this is the first call, as happens when starting to parse a
        # file (eats leading comments/whitespace)
        startOfFile = self.cursor == 0

        indent = ""

        while True:
            if len(input) > self.cursor:
                ch = input[self.cursor]
            else:
                return

            self.cursor += 1

            if len(input) > self.cursor:
                next = input[self.cursor]
            else:
                next = None

            if ch == "\n" and not self.scanNewlines:
                self.line += 1
                indent = ""

            elif ch == "/" and next == "*":
                self.cursor += 1
                text = "/*"
                commentStartLine = self.line
                if startLine == self.line and not startOfFile:
                    mode = "inline"
                elif (self.line - 1) > startLine:
                    # Distance before this comment means it is a comment block
                    # for a whole section (multiple lines of code)
                    mode = "section"
                else:
                    # Comment for maybe multiple following lines of code, but not
                    # that important (no visual whitespace divider)
                    mode = "block"

                while True:
                    try:
                        ch = input[self.cursor]
                        self.cursor += 1
                    except IndexError:
                        raise ParseError("Unterminated comment", self.fileId, self.line)

                    if ch == "*":
                        next = input[self.cursor]
                        if next == "/":
                            text += "*/"
                            self.cursor += 1
                            break

                    elif ch == "\n":
                        self.line += 1

                    text += ch

                # Unescape protected comment terminators ("*\/" -> "*/") in the comment text
                text = text.replace("*\\/", "*/")

                try:
                    self.comments.append(Comment.Comment(text, mode, commentStartLine, indent, self.fileId))
                except Comment.CommentException as commentError:
                    Console.error("Ignoring comment in %s: %s", self.fileId, commentError)

            elif ch == "/" and next == "/":
                self.cursor += 1
                text = "//"
                if startLine == self.line and not startOfFile:
                    mode = "inline"
                elif (self.line - 1) > startLine:
                    # Distance before this comment means it is a comment block
                    # for a whole section (multiple lines of code)
                    mode = "section"
                else:
                    # Comment for maybe multiple following lines of code, but not
                    # that important (no visual whitespace divider)
                    mode = "block"

                while True:
                    try:
                        ch = input[self.cursor]
                        self.cursor += 1
                    except IndexError:
                        # End of file etc.
                        break

                    if ch == "\n":
                        self.line += 1
                        break

                    text += ch

                try:
                    self.comments.append(Comment.Comment(text, mode, self.line - 1, "", self.fileId))
                except Comment.CommentException as commentError:
                    Console.error("Ignoring comment in %s: %s", self.fileId, commentError)

            # Whitespace, including the non-breaking space (0xA0)
            elif ch in "\xA0 \t":
                indent += ch

            else:
                self.cursor -= 1
                return

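    # Classification of comment placement (summary of the logic above):
    #   "inline"  - comment starts on the same line as the preceding code
    #   "block"   - comment starts on the line directly after the preceding code
    #   "section" - at least one blank line separates it from the preceding
    #               code, marking it as a divider for a whole section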

    # Lexes the exponential part of a number, if present. Returns True if an
    # exponential part was found.
    def lexExponent(self):
        input = self.source
        next = input[self.cursor]
        if next == "e" or next == "E":
            self.cursor += 1
            ch = input[self.cursor]
            self.cursor += 1
            if ch == "+" or ch == "-":
                ch = input[self.cursor]
                self.cursor += 1

            if ch < "0" or ch > "9":
                raise ParseError("Missing exponent", self.fileId, self.line)

            while True:
                ch = input[self.cursor]
                self.cursor += 1
                if not (ch >= "0" and ch <= "9"):
                    break

            self.cursor -= 1
            return True

        return False

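    # Examples: "1e10", "1E10", "1e+10" and "1e-10" all contain a valid
    # exponent part; a non-digit after "e" or the sign (e.g. "1e+x")
    # raises ParseError("Missing exponent").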

    def lexZeroNumber(self, ch):
        # Lexes a number starting with "0": a decimal fraction ("0.5"), a
        # hexadecimal ("0xFF") or octal ("0755") literal, or plain zero.
        token = self.token
        input = self.source
        token.type = "number"

        ch = input[self.cursor]
        self.cursor += 1
        if ch == ".":
            while True:
                ch = input[self.cursor]
                self.cursor += 1
                if not (ch >= "0" and ch <= "9"):
                    break

            self.cursor -= 1
            self.lexExponent()
            token.value = input[token.start:self.cursor]

        elif ch == "x" or ch == "X":
            while True:
                ch = input[self.cursor]
                self.cursor += 1
                if not ((ch >= "0" and ch <= "9") or (ch >= "a" and ch <= "f") or (ch >= "A" and ch <= "F")):
                    break

            self.cursor -= 1
            token.value = input[token.start:self.cursor]

        elif ch >= "0" and ch <= "7":
            while True:
                ch = input[self.cursor]
                self.cursor += 1
                if not (ch >= "0" and ch <= "7"):
                    break

            self.cursor -= 1
            token.value = input[token.start:self.cursor]

        else:
            self.cursor -= 1
            self.lexExponent()  # 0E1, etc.
            token.value = 0


    def lexNumber(self, ch):
        token = self.token
        input = self.source
        token.type = "number"

        floating = False
        while True:
            ch = input[self.cursor]
            self.cursor += 1

            if ch == "." and not floating:
                floating = True
                ch = input[self.cursor]
                self.cursor += 1

            if not (ch >= "0" and ch <= "9"):
                break

        self.cursor -= 1

        exponent = self.lexExponent()
        segment = input[token.start:self.cursor]

        # Keep floats and exponent numbers as strings to preserve their exact
        # source representation; plain integers become Python ints
        if floating or exponent:
            token.value = segment
        else:
            token.value = int(segment)

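    # Example: "3.14" and "1e3" keep their source text as the token value,
    # while "42" is stored as the Python int 42.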

    def lexDot(self, ch):
        token = self.token
        input = self.source
        next = input[self.cursor]

        if next >= "0" and next <= "9":
            # A leading-dot fraction such as ".5" is a number token
            while True:
                ch = input[self.cursor]
                self.cursor += 1
                if not (ch >= "0" and ch <= "9"):
                    break

            self.cursor -= 1
            self.lexExponent()

            token.type = "number"
            token.value = input[token.start:self.cursor]

        else:
            token.type = "dot"


    def lexString(self, ch):
        token = self.token
        input = self.source
        token.type = "string"

        hasEscapes = False
        delim = ch
        ch = input[self.cursor]
        self.cursor += 1
        while ch != delim:
            if ch == "\\":
                hasEscapes = True
                self.cursor += 1

            ch = input[self.cursor]
            self.cursor += 1

        if hasEscapes:
            # Evaluate the whole quoted literal in Python to resolve the
            # escape sequences (JavaScript and Python escapes largely overlap)
            token.value = eval(input[token.start:self.cursor])
        else:
            token.value = input[token.start + 1:self.cursor - 1]

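    # Example: a literal containing the two source characters backslash + "n"
    # is eval()ed into a value holding a real newline character; an
    # escape-free literal is simply sliced between its delimiters.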

    def lexRegExp(self, ch):
        token = self.token
        input = self.source
        token.type = "regexp"

        while True:
            try:
                ch = input[self.cursor]
                self.cursor += 1
            except IndexError:
                raise ParseError("Unterminated regex", self.fileId, self.line)

            if ch == "\\":
                # Skip the escaped character
                self.cursor += 1

            elif ch == "[":
                # Inside a character class a "/" does not terminate the regexp
                while True:
                    if ch == "\\":
                        self.cursor += 1

                    try:
                        ch = input[self.cursor]
                        self.cursor += 1
                    except IndexError:
                        raise ParseError("Unterminated character class", self.fileId, self.line)

                    if ch == "]":
                        break

            if ch == "/":
                break

        # Consume trailing regexp flags (e.g. "g", "i", "m")
        while True:
            ch = input[self.cursor]
            self.cursor += 1
            if not (ch >= "a" and ch <= "z"):
                break

        self.cursor -= 1
        token.value = input[token.start:self.cursor]


    def lexOp(self, ch):
        token = self.token
        input = self.source

        # Greedily extend the operator while the result is still a valid token
        op = ch
        while True:
            try:
                next = input[self.cursor]
            except IndexError:
                break

            if (op + next) in operatorNames:
                self.cursor += 1
                op += next
            else:
                break

        try:
            next = input[self.cursor]
        except IndexError:
            next = None

        if next == "=" and op in assignOperators:
            # Compound assignment such as "+=" or ">>="
            self.cursor += 1
            token.type = "assign"
            token.assignOp = operatorNames[op]
            op += "="

        else:
            token.type = operatorNames[op]
            token.assignOp = None

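    # Example: in "a += 1" the operator becomes type "assign" with assignOp
    # "plus"; a plain "=" is type "assign" with assignOp None.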

    # FIXME: Unicode escape sequences
    # FIXME: Unicode identifiers
    def lexIdent(self, ch):
        token = self.token
        input = self.source

        try:
            while True:
                ch = input[self.cursor]
                self.cursor += 1

                if not ((ch >= "a" and ch <= "z") or (ch >= "A" and ch <= "Z") or (ch >= "0" and ch <= "9") or ch == "$" or ch == "_"):
                    break

        except IndexError:
            # Hit the end of input; compensate for the rewind below
            self.cursor += 1

        # Put the non-word character back
        self.cursor -= 1

        identifier = input[token.start:self.cursor]
        if identifier in Lang.keywords:
            token.type = identifier
        else:
            token.type = "identifier"
            token.value = identifier


    def get(self, scanOperand=False):
        """
        Consumes input *only* if there is no lookahead.
        Dispatches to the appropriate lexing function depending on the input.
        """
        while self.lookahead:
            self.lookahead -= 1
            self.tokenIndex = (self.tokenIndex + 1) & 3
            token = self.tokens[self.tokenIndex]
            if token.type != "newline" or self.scanNewlines:
                return token.type

        self.skip()

        self.tokenIndex = (self.tokenIndex + 1) & 3
        self.tokens[self.tokenIndex] = token = Token()

        token.start = self.cursor
        token.line = self.line

        input = self.source
        if self.cursor == len(input):
            token.end = token.start
            token.type = "end"
            return token.type

        ch = input[self.cursor]
        self.cursor += 1

        if (ch >= "a" and ch <= "z") or (ch >= "A" and ch <= "Z") or ch == "$" or ch == "_":
            self.lexIdent(ch)

        elif scanOperand and ch == "/":
            # In operand position a slash starts a regexp, not a division
            self.lexRegExp(ch)

        elif ch == ".":
            self.lexDot(ch)

        elif self.scanNewlines and ch == "\n":
            token.type = "newline"
            self.line += 1

        elif ch in operatorNames:
            self.lexOp(ch)

        elif ch >= "1" and ch <= "9":
            self.lexNumber(ch)

        elif ch == "0":
            self.lexZeroNumber(ch)

        elif ch == '"' or ch == "'":
            self.lexString(ch)

        else:
            raise ParseError("Illegal token: %s (Code: %s)" % (ch, ord(ch)), self.fileId, self.line)

        token.end = self.cursor
        return token.type


    def unget(self):
        """match() depends on unget() returning None (Python's implicit return value)."""
        self.lookahead += 1

        if self.lookahead == 4:
            raise ParseError("PANIC: too much lookahead!", self.fileId, self.line)

        self.tokenIndex = (self.tokenIndex - 1) & 3
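

# Note on lookahead: get(), unget() and peek() share a four-slot ring buffer
# (self.tokens, indexed via "& 3"), so at most three tokens can be pushed back
# before the "too much lookahead" panic in unget() triggers. peek() is simply
# get() followed by unget(): the source cursor may advance, but the token
# stream position does not:
#
#     t = Tokenizer("a + b")
#     t.peek()   # "identifier" -- token is buffered, not consumed
#     t.get()    # "identifier" -- re-serves the buffered token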