ThirdParty/Jasy/jasy/js/tokenize/Tokenizer.py

changeset 2779:4d433896b6d6 (child 2847:1843ef6e2656)
#
# Jasy - Web Tooling Framework
# Copyright 2010-2012 Zynga Inc.
#

#
# License: MPL 1.1/GPL 2.0/LGPL 2.1
# Authors:
# - Brendan Eich <brendan@mozilla.org> (Original JavaScript) (2004-2010)
# - Sebastian Werner <info@sebastian-werner.net> (Python Port) (2010)
#

import copy

import jasy.js.tokenize.Lang as Lang
import jasy.js.api.Comment as Comment
import jasy.core.Console as Console

__all__ = [ "Tokenizer" ]


# Operator and punctuator mapping from token to tree node type name.
# NB: because the lexer doesn't backtrack, all token prefixes must themselves
# be valid tokens (e.g. !== is acceptable because its prefixes are the valid
# tokens != and !).
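# Illustration (added, not from the original source): lexOp() below grows an
# operator one character at a time, so "!==" is scanned as "!" -> "!=" ->
# "!==", and every intermediate string must already be a key in this mapping.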
operatorNames = {
    '<'   : 'lt',
    '>'   : 'gt',
    '<='  : 'le',
    '>='  : 'ge',
    '!='  : 'ne',
    '!'   : 'not',
    '=='  : 'eq',
    '===' : 'strict_eq',
    '!==' : 'strict_ne',

    '>>'  : 'rsh',
    '<<'  : 'lsh',
    '>>>' : 'ursh',

    '+'   : 'plus',
    '*'   : 'mul',
    '-'   : 'minus',
    '/'   : 'div',
    '%'   : 'mod',

    ','   : 'comma',
    ';'   : 'semicolon',
    ':'   : 'colon',
    '='   : 'assign',
    '?'   : 'hook',

    '&&'  : 'and',
    '||'  : 'or',

    '++'  : 'increment',
    '--'  : 'decrement',

    ')'   : 'right_paren',
    '('   : 'left_paren',
    '['   : 'left_bracket',
    ']'   : 'right_bracket',
    '{'   : 'left_curly',
    '}'   : 'right_curly',

    '&'   : 'bitwise_and',
    '^'   : 'bitwise_xor',
    '|'   : 'bitwise_or',
    '~'   : 'bitwise_not'
}


# Assignment operators
assignOperators = ["|", "^", "&", "<<", ">>", ">>>", "+", "-", "*", "/", "%"]
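# Note (added for illustration, not from the original source): lexOp() combines
# each of these with a trailing "=" to form the compound assignments |=, ^=,
# &=, <<=, >>=, >>>=, +=, -=, *=, /= and %=; the token type becomes "assign"
# and token.assignOp records the base operator (e.g. "+=" yields "plus").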




#
# Classes
#

class Token:
    __slots__ = ["type", "start", "line", "assignOp", "end", "value"]


class ParseError(Exception):
    def __init__(self, message, fileId, line):
        Exception.__init__(self, "Syntax error: %s\n%s:%s" % (message, fileId, line))


class Tokenizer(object):
    def __init__(self, source, fileId="", line=1):
        # source: JavaScript source
        # fileId: Filename (for debugging purposes)
        # line: Line number (for debugging purposes)
        self.cursor = 0
        self.source = str(source)
        self.tokens = {}
        self.tokenIndex = 0
        self.lookahead = 0
        self.scanNewlines = False
        self.fileId = fileId
        self.line = line
        self.comments = []

    input_ = property(lambda self: self.source[self.cursor:])
    token = property(lambda self: self.tokens.get(self.tokenIndex))


    def done(self):
        # We need to set scanOperand to true here because the first thing
        # might be a regexp.
        return self.peek(True) == "end"


    def match(self, tokenType, scanOperand=False):
        return self.get(scanOperand) == tokenType or self.unget()


    def mustMatch(self, tokenType):
        if not self.match(tokenType):
            raise ParseError("Missing " + tokenType, self.fileId, self.line)

        return self.token


    def peek(self, scanOperand=False):
        if self.lookahead:
            next = self.tokens.get((self.tokenIndex + self.lookahead) & 3)
            if self.scanNewlines and (getattr(next, "line", None) != getattr(self, "line", None)):
                tokenType = "newline"
            else:
                tokenType = getattr(next, "type", None)
        else:
            tokenType = self.get(scanOperand)
            self.unget()

        return tokenType


    def peekOnSameLine(self, scanOperand=False):
        self.scanNewlines = True
        tokenType = self.peek(scanOperand)
        self.scanNewlines = False
        return tokenType


    def getComments(self):
        if self.comments:
            comments = self.comments
            self.comments = []
            return comments

        return None


    def skip(self):
        """Eats comments and whitespace."""
        input = self.source
        startLine = self.line

        # Whether this is the first call, as happens when starting to parse
        # a file (eat leading comments/whitespace)
        startOfFile = self.cursor == 0

        indent = ""

        while (True):
            if len(input) > self.cursor:
                ch = input[self.cursor]
            else:
                return

            self.cursor += 1

            if len(input) > self.cursor:
                next = input[self.cursor]
            else:
                next = None

            if ch == "\n" and not self.scanNewlines:
                self.line += 1
                indent = ""

            elif ch == "/" and next == "*":
                self.cursor += 1
                text = "/*"
                commentStartLine = self.line
                if startLine == self.line and not startOfFile:
                    mode = "inline"
                elif (self.line-1) > startLine:
                    # distance before this comment means it is a comment block
                    # for a whole section (multiple lines of code)
                    mode = "section"
                else:
                    # comment for maybe multiple following lines of code, but
                    # not that important (no visual white space divider)
                    mode = "block"

                while (True):
                    try:
                        ch = input[self.cursor]
                        self.cursor += 1
                    except IndexError:
                        raise ParseError("Unterminated comment", self.fileId, self.line)

                    if ch == "*":
                        next = input[self.cursor]
                        if next == "/":
                            text += "*/"
                            self.cursor += 1
                            break

                    elif ch == "\n":
                        self.line += 1

                    text += ch


                # Remove escaping on slash-star combinations in comment text
                text = text.replace("*\\/", "*/")

                try:
                    self.comments.append(Comment.Comment(text, mode, commentStartLine, indent, self.fileId))
                except Comment.CommentException as commentError:
                    Console.error("Ignoring comment in %s: %s", self.fileId, commentError)


            elif ch == "/" and next == "/":
                self.cursor += 1
                text = "//"
                if startLine == self.line and not startOfFile:
                    mode = "inline"
                elif (self.line-1) > startLine:
                    # distance before this comment means it is a comment block
                    # for a whole section (multiple lines of code)
                    mode = "section"
                else:
                    # comment for maybe multiple following lines of code, but
                    # not that important (no visual white space divider)
                    mode = "block"

                while (True):
                    try:
                        ch = input[self.cursor]
                        self.cursor += 1
                    except IndexError:
                        # end of file etc.
                        break

                    if ch == "\n":
                        self.line += 1
                        break

                    text += ch

                try:
                    self.comments.append(Comment.Comment(text, mode, self.line-1, "", self.fileId))
                except Comment.CommentException as commentError:
                    Console.error("Ignoring comment in %s: %s", self.fileId, commentError)

            # check for whitespace, also for special cases like 0xA0
            elif ch in "\xA0 \t":
                indent += ch

            else:
                self.cursor -= 1
                return
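
    # Illustration of the three comment modes above (added, not upstream):
    #
    #     foo();  // "inline": the comment starts on the same line where
    #             // scanning began, i.e. it trails code
    #
    #     // "section": one or more blank lines precede the comment, so it
    #     // introduces a whole section of code
    #
    #     // "block": sits directly above the following code, no blank gap
    #     bar();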


    # Lexes the exponential part of a number, if present. Returns True if an
    # exponential part was found.
    def lexExponent(self):
        input = self.source
        next = input[self.cursor]
        if next == "e" or next == "E":
            self.cursor += 1
            ch = input[self.cursor]
            self.cursor += 1
            if ch == "+" or ch == "-":
                ch = input[self.cursor]
                self.cursor += 1

            if ch < "0" or ch > "9":
                raise ParseError("Missing exponent", self.fileId, self.line)

            while(True):
                ch = input[self.cursor]
                self.cursor += 1
                if not (ch >= "0" and ch <= "9"):
                    break

            self.cursor -= 1
            return True

        return False


    def lexZeroNumber(self, ch):
        token = self.token
        input = self.source
        token.type = "number"

        ch = input[self.cursor]
        self.cursor += 1
        if ch == ".":
            while(True):
                ch = input[self.cursor]
                self.cursor += 1
                if not (ch >= "0" and ch <= "9"):
                    break

            self.cursor -= 1
            self.lexExponent()
            token.value = input[token.start:self.cursor]

        elif ch == "x" or ch == "X":
            while(True):
                ch = input[self.cursor]
                self.cursor += 1
                if not ((ch >= "0" and ch <= "9") or (ch >= "a" and ch <= "f") or (ch >= "A" and ch <= "F")):
                    break

            self.cursor -= 1
            token.value = input[token.start:self.cursor]

        elif ch >= "0" and ch <= "7":
            while(True):
                ch = input[self.cursor]
                self.cursor += 1
                if not (ch >= "0" and ch <= "7"):
                    break

            self.cursor -= 1
            token.value = input[token.start:self.cursor]

        else:
            self.cursor -= 1
            self.lexExponent()  # 0E1, &c.
            token.value = 0


    def lexNumber(self, ch):
        token = self.token
        input = self.source
        token.type = "number"

        floating = False
        while(True):
            ch = input[self.cursor]
            self.cursor += 1

            if ch == "." and not floating:
                floating = True
                ch = input[self.cursor]
                self.cursor += 1

            if not (ch >= "0" and ch <= "9"):
                break

        self.cursor -= 1

        exponent = self.lexExponent()
        segment = input[token.start:self.cursor]

        # Keep float and exponent numbers as strings to preserve their exact form
        if floating or exponent:
            token.value = segment
        else:
            token.value = int(segment)


    def lexDot(self, ch):
        token = self.token
        input = self.source
        next = input[self.cursor]

        if next >= "0" and next <= "9":
            while (True):
                ch = input[self.cursor]
                self.cursor += 1
                if not (ch >= "0" and ch <= "9"):
                    break

            self.cursor -= 1
            self.lexExponent()

            token.type = "number"
            token.value = input[token.start:self.cursor]

        else:
            token.type = "dot"


    def lexString(self, ch):
        token = self.token
        input = self.source
        token.type = "string"

        hasEscapes = False
        delim = ch
        ch = input[self.cursor]
        self.cursor += 1
        while ch != delim:
            if ch == "\\":
                hasEscapes = True
                self.cursor += 1

            ch = input[self.cursor]
            self.cursor += 1

        if hasEscapes:
            # Let Python decode the escape sequences; this relies on JavaScript
            # string literal syntax being close enough to Python's for the
            # typical escapes (\n, \t, \", \\, ...)
            token.value = eval(input[token.start:self.cursor])
        else:
            token.value = input[token.start+1:self.cursor-1]


    def lexRegExp(self, ch):
        token = self.token
        input = self.source
        token.type = "regexp"

        while (True):
            try:
                ch = input[self.cursor]
                self.cursor += 1
            except IndexError:
                raise ParseError("Unterminated regex", self.fileId, self.line)

            if ch == "\\":
                self.cursor += 1

            elif ch == "[":
                while (True):
                    if ch == "\\":
                        self.cursor += 1

                    try:
                        ch = input[self.cursor]
                        self.cursor += 1
                    except IndexError:
                        raise ParseError("Unterminated character class", self.fileId, self.line)

                    if ch == "]":
                        break

            if ch == "/":
                break

        # Scan trailing regexp flags (g, i, m, ...)
        while(True):
            ch = input[self.cursor]
            self.cursor += 1
            if not (ch >= "a" and ch <= "z"):
                break

        self.cursor -= 1
        token.value = input[token.start:self.cursor]
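
    # Illustration (added, not upstream): for source like "x = /a[/]b/gi;" the
    # scanner ignores the "/" inside the character class "[/]", stops at the
    # closing "/", consumes the trailing flags "gi", and stores the complete
    # literal "/a[/]b/gi" as token.value.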


    def lexOp(self, ch):
        token = self.token
        input = self.source

        op = ch
        while(True):
            try:
                next = input[self.cursor]
            except IndexError:
                break

            if (op + next) in operatorNames:
                self.cursor += 1
                op += next
            else:
                break

        try:
            next = input[self.cursor]
        except IndexError:
            next = None

        if next == "=" and op in assignOperators:
            self.cursor += 1
            token.type = "assign"
            token.assignOp = operatorNames[op]
            op += "="

        else:
            token.type = operatorNames[op]
            token.assignOp = None
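
    # Illustration (added, not upstream): for ">>>=" the greedy loop grows op
    # through ">" -> ">>" -> ">>>", then the trailing "=" turns the result into
    # an "assign" token with assignOp == "ursh".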


    # FIXME: Unicode escape sequences
    # FIXME: Unicode identifiers
    def lexIdent(self, ch):
        token = self.token
        input = self.source

        try:
            while True:
                ch = input[self.cursor]
                self.cursor += 1

                if not ((ch >= "a" and ch <= "z") or (ch >= "A" and ch <= "Z") or (ch >= "0" and ch <= "9") or ch == "$" or ch == "_"):
                    break

        except IndexError:
            self.cursor += 1

        # Put the non-word character back.
        self.cursor -= 1

        identifier = input[token.start:self.cursor]
        if identifier in Lang.keywords:
            token.type = identifier
        else:
            token.type = "identifier"
            token.value = identifier


    def get(self, scanOperand=False):
        """
        Consumes input *only* if there is no lookahead.
        Dispatches to the appropriate lexing function depending on the input.
        """
        while self.lookahead:
            self.lookahead -= 1
            self.tokenIndex = (self.tokenIndex + 1) & 3
            token = self.tokens[self.tokenIndex]
            if token.type != "newline" or self.scanNewlines:
                return token.type

        self.skip()

        self.tokenIndex = (self.tokenIndex + 1) & 3
        self.tokens[self.tokenIndex] = token = Token()

        token.start = self.cursor
        token.line = self.line

        input = self.source
        if self.cursor == len(input):
            token.end = token.start
            token.type = "end"
            return token.type

        ch = input[self.cursor]
        self.cursor += 1

        if (ch >= "a" and ch <= "z") or (ch >= "A" and ch <= "Z") or ch == "$" or ch == "_":
            self.lexIdent(ch)

        elif scanOperand and ch == "/":
            # A slash in operand position starts a regexp literal, not division
            self.lexRegExp(ch)

        elif ch == ".":
            self.lexDot(ch)

        elif self.scanNewlines and ch == "\n":
            token.type = "newline"
            self.line += 1

        elif ch in operatorNames:
            self.lexOp(ch)

        elif ch >= "1" and ch <= "9":
            self.lexNumber(ch)

        elif ch == "0":
            self.lexZeroNumber(ch)

        elif ch == '"' or ch == "'":
            self.lexString(ch)

        else:
            raise ParseError("Illegal token: %s (Code: %s)" % (ch, ord(ch)), self.fileId, self.line)

        token.end = self.cursor
        return token.type
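
    # Note (added, not upstream): self.tokens acts as a four-slot ring buffer,
    # indexed with "& 3". get()/unget() move tokenIndex forward and backward
    # through it, which is why unget() panics once lookahead reaches the ring
    # size of four.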


    def unget(self):
        """match() depends on unget() returning None."""
        self.lookahead += 1

        if self.lookahead == 4:
            raise ParseError("PANIC: too much lookahead!", self.fileId, self.line)

        self.tokenIndex = (self.tokenIndex - 1) & 3


    def save(self):
        return {
            "cursor" : self.cursor,
            "tokenIndex": self.tokenIndex,
            "tokens": copy.copy(self.tokens),
            "lookahead": self.lookahead,
            "scanNewlines": self.scanNewlines,
            "line": self.line
        }


    def rewind(self, point):
        self.cursor = point["cursor"]
        self.tokenIndex = point["tokenIndex"]
        self.tokens = copy.copy(point["tokens"])
        self.lookahead = point["lookahead"]
        self.scanNewlines = point["scanNewlines"]
        self.line = point["line"]
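

# A minimal usage sketch (added for illustration, not part of the upstream
# file; assumes the jasy package is importable so the imports above resolve):
#
#     tok = Tokenizer("var answer = 42;", "demo.js")
#     while not tok.done():
#         tok.get()
#         print(tok.token.type, getattr(tok.token, "value", None))
#
# This prints the token stream var/identifier/assign/number/semicolon. For
# speculative parsing, save() snapshots cursor, ring buffer and line state,
# and rewind() restores it:
#
#     point = tok.save()
#     tok.get()          # scan ahead...
#     tok.rewind(point)  # ...and back out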
