ThirdParty/Jasy/jasy/js/tokenize/Tokenizer.py

#
# Jasy - Web Tooling Framework
# Copyright 2010-2012 Zynga Inc.
#

#
# License: MPL 1.1/GPL 2.0/LGPL 2.1
# Authors:
#   - Brendan Eich <brendan@mozilla.org> (Original JavaScript) (2004-2010)
#   - Sebastian Werner <info@sebastian-werner.net> (Python Port) (2010)
#

from __future__ import unicode_literals

import ast
import copy

import jasy.js.tokenize.Lang as Lang
import jasy.js.api.Comment as Comment
import jasy.core.Console as Console

__all__ = [ "Tokenizer" ]


# Operator and punctuator mapping from token to tree node type name.
# NB: because the lexer doesn't backtrack, all token prefixes must themselves
# be valid tokens (e.g. !== is acceptable because its prefixes are the valid
# tokens != and !).
operatorNames = {
    '<'   : 'lt',
    '>'   : 'gt',
    '<='  : 'le',
    '>='  : 'ge',
    '!='  : 'ne',
    '!'   : 'not',
    '=='  : 'eq',
    '===' : 'strict_eq',
    '!==' : 'strict_ne',

    '>>'  : 'rsh',
    '<<'  : 'lsh',
    '>>>' : 'ursh',

    '+'   : 'plus',
    '*'   : 'mul',
    '-'   : 'minus',
    '/'   : 'div',
    '%'   : 'mod',

    ','   : 'comma',
    ';'   : 'semicolon',
    ':'   : 'colon',
    '='   : 'assign',
    '?'   : 'hook',

    '&&'  : 'and',
    '||'  : 'or',

    '++'  : 'increment',
    '--'  : 'decrement',

    ')'   : 'right_paren',
    '('   : 'left_paren',
    '['   : 'left_bracket',
    ']'   : 'right_bracket',
    '{'   : 'left_curly',
    '}'   : 'right_curly',

    '&'   : 'bitwise_and',
    '^'   : 'bitwise_xor',
    '|'   : 'bitwise_or',
    '~'   : 'bitwise_not'
}


# Assignment operators
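# lexOp() folds a trailing "=" after any of these into a single compound
# assignment token (e.g. "+=", "<<="), recording the base operator in
# token.assignOp.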
assignOperators = ["|", "^", "&", "<<", ">>", ">>>", "+", "-", "*", "/", "%"]




#
# Classes
#

class Token:
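    """Lightweight token record; __slots__ avoids a per-instance dict."""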
    __slots__ = ["type", "start", "line", "assignOp", "end", "value"]


class ParseError(Exception):
    def __init__(self, message, fileId, line):
        Exception.__init__(self, "Syntax error: %s\n%s:%s" % (message, fileId, line))


class Tokenizer(object):
    def __init__(self, source, fileId="", line=1):
        # source: JavaScript source
        # fileId: Filename (for debugging purposes)
        # line: Line number (for debugging purposes)
        self.cursor = 0
        self.source = str(source)
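        # Tokens are kept in a four-slot ring buffer (indexed modulo 4 via
        # "& 3") so that up to three tokens of lookahead can be ungotten.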
        self.tokens = {}
        self.tokenIndex = 0
        self.lookahead = 0
        self.scanNewlines = False
        self.fileId = fileId
        self.line = line
        self.comments = []

    input_ = property(lambda self: self.source[self.cursor:])
    token = property(lambda self: self.tokens.get(self.tokenIndex))


    def done(self):
        # We need to set scanOperand to true here because the first thing
        # might be a regexp.
        return self.peek(True) == "end"


    def match(self, tokenType, scanOperand=False):
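        """
        Consumes the next token and returns True if its type matches
        tokenType; otherwise ungets the token and returns None (falsy).
        """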
        return self.get(scanOperand) == tokenType or self.unget()


    def mustMatch(self, tokenType):
        if not self.match(tokenType):
            raise ParseError("Missing " + tokenType, self.fileId, self.line)

        return self.token


    def peek(self, scanOperand=False):
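        """
        Returns the type of the next token without consuming it, either from
        the lookahead ring buffer or by lexing one token and ungetting it.
        """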
        if self.lookahead:
            next = self.tokens.get((self.tokenIndex + self.lookahead) & 3)
            if self.scanNewlines and (getattr(next, "line", None) != getattr(self, "line", None)):
                tokenType = "newline"
            else:
                tokenType = getattr(next, "type", None)
        else:
            tokenType = self.get(scanOperand)
            self.unget()

        return tokenType


    def peekOnSameLine(self, scanOperand=False):
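        """Like peek(), but reports "newline" if a line break precedes the next token."""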
        self.scanNewlines = True
        tokenType = self.peek(scanOperand)
        self.scanNewlines = False
        return tokenType


    def getComments(self):
        if self.comments:
            comments = self.comments
            self.comments = []
            return comments

        return None


    def skip(self):
        """Eats comments and whitespace."""
        input = self.source
        startLine = self.line

        # Whether this is the first call, as happens at the start of parsing
        # a file (eat leading comments/whitespace)
        startOfFile = self.cursor == 0

        indent = ""

        while True:
            if len(input) > self.cursor:
                ch = input[self.cursor]
            else:
                return

            self.cursor += 1

            if len(input) > self.cursor:
                next = input[self.cursor]
            else:
                next = None

            if ch == "\n" and not self.scanNewlines:
                self.line += 1
                indent = ""

            elif ch == "/" and next == "*":
                self.cursor += 1
                text = "/*"
                commentStartLine = self.line
                if startLine == self.line and not startOfFile:
                    mode = "inline"
                elif (self.line - 1) > startLine:
                    # Distance before this comment means it is a comment block
                    # for a whole section (multiple lines of code)
                    mode = "section"
                else:
                    # Comment for maybe multiple following lines of code, but
                    # not that important (no visual whitespace divider)
                    mode = "block"

                while True:
                    try:
                        ch = input[self.cursor]
                        self.cursor += 1
                    except IndexError:
                        raise ParseError("Unterminated comment", self.fileId, self.line)

                    if ch == "*":
                        # Slice so a "*" at the very end of the file reaches
                        # the unterminated-comment error above instead of
                        # raising a bare IndexError
                        next = input[self.cursor:self.cursor + 1]
                        if next == "/":
                            text += "*/"
                            self.cursor += 1
                            break

                    elif ch == "\n":
                        self.line += 1

                    text += ch

                # Filter escaping on slash-star combinations in comment text
                text = text.replace("*\\/", "*/")

                try:
                    self.comments.append(Comment.Comment(text, mode, commentStartLine, indent, self.fileId))
                except Comment.CommentException as commentError:
                    Console.error("Ignoring comment in %s: %s", self.fileId, commentError)

            elif ch == "/" and next == "/":
                self.cursor += 1
                text = "//"
                if startLine == self.line and not startOfFile:
                    mode = "inline"
                elif (self.line - 1) > startLine:
                    # Distance before this comment means it is a comment block
                    # for a whole section (multiple lines of code)
                    mode = "section"
                else:
                    # Comment for maybe multiple following lines of code, but
                    # not that important (no visual whitespace divider)
                    mode = "block"

                while True:
                    try:
                        ch = input[self.cursor]
                        self.cursor += 1
                    except IndexError:
                        # End of file etc.
                        break

                    if ch == "\n":
                        self.line += 1
                        break

                    text += ch

                try:
                    self.comments.append(Comment.Comment(text, mode, self.line - 1, "", self.fileId))
                except Comment.CommentException as commentError:
                    Console.error("Ignoring comment in %s: %s", self.fileId, commentError)

            # Check for whitespace, also for special cases like 0xA0
            elif ch in "\xA0 \t":
                indent += ch

            else:
                self.cursor -= 1
                return


    # Lexes the exponential part of a number, if present. Returns True if an
    # exponential part was found.
    def lexExponent(self):
        input = self.source

        # One-character slices yield "" at end of file, which compares as a
        # non-digit and terminates the scan cleanly instead of raising
        # IndexError
        next = input[self.cursor:self.cursor + 1]
        if next == "e" or next == "E":
            self.cursor += 1
            ch = input[self.cursor:self.cursor + 1]
            self.cursor += 1
            if ch == "+" or ch == "-":
                ch = input[self.cursor:self.cursor + 1]
                self.cursor += 1

            if ch < "0" or ch > "9":
                raise ParseError("Missing exponent", self.fileId, self.line)

            while True:
                ch = input[self.cursor:self.cursor + 1]
                self.cursor += 1
                if not (ch >= "0" and ch <= "9"):
                    break

            self.cursor -= 1
            return True

        return False


    def lexZeroNumber(self, ch):
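        """
        Scans a number beginning with "0": a fraction ("0.5"), a hex literal
        ("0x2A"), a legacy octal literal ("0755"), or plain zero.
        """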
        token = self.token
        input = self.source
        token.type = "number"

        # As in lexExponent(), one-character slices make end of file
        # terminate the scan instead of raising IndexError
        ch = input[self.cursor:self.cursor + 1]
        self.cursor += 1
        if ch == ".":
            while True:
                ch = input[self.cursor:self.cursor + 1]
                self.cursor += 1
                if not (ch >= "0" and ch <= "9"):
                    break

            self.cursor -= 1
            self.lexExponent()
            token.value = input[token.start:self.cursor]

        elif ch == "x" or ch == "X":
            while True:
                ch = input[self.cursor:self.cursor + 1]
                self.cursor += 1
                if not ((ch >= "0" and ch <= "9") or (ch >= "a" and ch <= "f") or (ch >= "A" and ch <= "F")):
                    break

            self.cursor -= 1
            token.value = input[token.start:self.cursor]

        elif ch >= "0" and ch <= "7":
            while True:
                ch = input[self.cursor:self.cursor + 1]
                self.cursor += 1
                if not (ch >= "0" and ch <= "7"):
                    break

            self.cursor -= 1
            token.value = input[token.start:self.cursor]

        else:
            self.cursor -= 1
            self.lexExponent()  # 0E1, &c.
            token.value = 0


    def lexNumber(self, ch):
        token = self.token
        input = self.source
        token.type = "number"

        floating = False
        while True:
            ch = input[self.cursor:self.cursor + 1]
            self.cursor += 1

            if ch == "." and not floating:
                floating = True
                ch = input[self.cursor:self.cursor + 1]
                self.cursor += 1

            if not (ch >= "0" and ch <= "9"):
                break

        self.cursor -= 1

        exponent = self.lexExponent()
        segment = input[token.start:self.cursor]

        # Protect float or exponent numbers
        if floating or exponent:
            token.value = segment
        else:
            token.value = int(segment)


    def lexDot(self, ch):
        token = self.token
        input = self.source
        # Slice so a trailing "." at end of file yields "" and falls through
        # to the plain "dot" token
        next = input[self.cursor:self.cursor + 1]

        if next >= "0" and next <= "9":
            while True:
                ch = input[self.cursor:self.cursor + 1]
                self.cursor += 1
                if not (ch >= "0" and ch <= "9"):
                    break

            self.cursor -= 1
            self.lexExponent()

            token.type = "number"
            token.value = input[token.start:self.cursor]

        else:
            token.type = "dot"


    def lexString(self, ch):
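        """
        Scans a string literal; ch is the opening quote, which also
        terminates the literal.
        """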
        token = self.token
        input = self.source
        token.type = "string"

        hasEscapes = False
        delim = ch
        ch = input[self.cursor]
        self.cursor += 1
        while ch != delim:
            if ch == "\\":
                hasEscapes = True
                self.cursor += 1

            ch = input[self.cursor]
            self.cursor += 1

        if hasEscapes:
            # The slice is always a complete quoted literal, so
            # ast.literal_eval decodes the escape sequences without
            # evaluating arbitrary code
            token.value = ast.literal_eval(input[token.start:self.cursor])
        else:
            token.value = input[token.start + 1:self.cursor - 1]


    def lexRegExp(self, ch):
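        """
        Scans a regular expression literal: body, bracketed character classes
        (where "/" needs no escaping), and trailing flags. Only called when
        the tokenizer expects an operand.
        """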
        token = self.token
        input = self.source
        token.type = "regexp"

        while True:
            try:
                ch = input[self.cursor]
                self.cursor += 1
            except IndexError:
                raise ParseError("Unterminated regex", self.fileId, self.line)

            if ch == "\\":
                self.cursor += 1

            elif ch == "[":
                while True:
                    if ch == "\\":
                        self.cursor += 1

                    try:
                        ch = input[self.cursor]
                        self.cursor += 1
                    except IndexError:
                        raise ParseError("Unterminated character class", self.fileId, self.line)

                    if ch == "]":
                        break

            if ch == "/":
                break

        # Scan trailing regexp flags; the slice yields "" at end of file,
        # which ends the scan
        while True:
            ch = input[self.cursor:self.cursor + 1]
            self.cursor += 1
            if not (ch >= "a" and ch <= "z"):
                break

        self.cursor -= 1
        token.value = input[token.start:self.cursor]


    def lexOp(self, ch):
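        """
        Scans an operator greedily: keeps extending while the longer string
        is still a known operator, then folds a trailing "=" into a compound
        assignment for the operators listed in assignOperators.
        """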
        token = self.token
        input = self.source

        op = ch
        while True:
            try:
                next = input[self.cursor]
            except IndexError:
                break

            if (op + next) in operatorNames:
                self.cursor += 1
                op += next
            else:
                break

        try:
            next = input[self.cursor]
        except IndexError:
            next = None

        if next == "=" and op in assignOperators:
            self.cursor += 1
            token.type = "assign"
            token.assignOp = operatorNames[op]
            op += "="

        else:
            token.type = operatorNames[op]
            token.assignOp = None


    # FIXME: Unicode escape sequences
    # FIXME: Unicode identifiers
    def lexIdent(self, ch):
        token = self.token
        input = self.source

        try:
            while True:
                ch = input[self.cursor]
                self.cursor += 1

                if not ((ch >= "a" and ch <= "z") or (ch >= "A" and ch <= "Z") or (ch >= "0" and ch <= "9") or ch == "$" or ch == "_"):
                    break

        except IndexError:
            # Hitting end of file ends the identifier; bump the cursor so the
            # decrement below restores the correct position
            self.cursor += 1

        # Put the non-word character back.
        self.cursor -= 1

        identifier = input[token.start:self.cursor]
        if identifier in Lang.keywords:
            token.type = identifier
        else:
            token.type = "identifier"
            token.value = identifier


    def get(self, scanOperand=False):
        """
        Returns the type of the next token. Consumes input only if there is
        no lookahead; otherwise the token is served from the ring buffer.
        Dispatches to the appropriate lexing method based on the first
        character.
        """
        while self.lookahead:
            self.lookahead -= 1
            self.tokenIndex = (self.tokenIndex + 1) & 3
            token = self.tokens[self.tokenIndex]
            if token.type != "newline" or self.scanNewlines:
                return token.type

        self.skip()

        self.tokenIndex = (self.tokenIndex + 1) & 3
        self.tokens[self.tokenIndex] = token = Token()

        token.start = self.cursor
        token.line = self.line

        input = self.source
        if self.cursor == len(input):
            token.end = token.start
            token.type = "end"
            return token.type

        ch = input[self.cursor]
        self.cursor += 1

        if (ch >= "a" and ch <= "z") or (ch >= "A" and ch <= "Z") or ch == "$" or ch == "_":
            self.lexIdent(ch)

        elif scanOperand and ch == "/":
            self.lexRegExp(ch)

        elif ch == ".":
            self.lexDot(ch)

        elif self.scanNewlines and ch == "\n":
            token.type = "newline"
            self.line += 1

        elif ch in operatorNames:
            self.lexOp(ch)

        elif ch >= "1" and ch <= "9":
            self.lexNumber(ch)

        elif ch == "0":
            self.lexZeroNumber(ch)

        elif ch == '"' or ch == "'":
            self.lexString(ch)

        else:
            raise ParseError("Illegal token: %s (Code: %s)" % (ch, ord(ch)), self.fileId, self.line)

        token.end = self.cursor
        return token.type


    def unget(self):
        """Pushes the current token back; match() depends on unget() returning None."""
        self.lookahead += 1

        if self.lookahead == 4:
            raise ParseError("PANIC: too much lookahead!", self.fileId, self.line)

        self.tokenIndex = (self.tokenIndex - 1) & 3


    def save(self):
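        """Snapshots the tokenizer state so a speculative parse can be undone via rewind()."""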
        return {
            "cursor": self.cursor,
            "tokenIndex": self.tokenIndex,
            "tokens": copy.copy(self.tokens),
            "lookahead": self.lookahead,
            "scanNewlines": self.scanNewlines,
            "line": self.line
        }


    def rewind(self, point):
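        """Restores the state captured by save()."""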
        self.cursor = point["cursor"]
        self.tokenIndex = point["tokenIndex"]
        self.tokens = copy.copy(point["tokens"])
        self.lookahead = point["lookahead"]
        self.scanNewlines = point["scanNewlines"]
        self.line = point["line"]
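

# A minimal usage sketch (not part of the original module, and it assumes the
# jasy package is importable): tokenize a short script and print each token
# type and value until the "end" token is reached.
if __name__ == "__main__":
    tokenizer = Tokenizer("var answer = 6 * 7;", fileId="example.js")
    while not tokenizer.done():
        tokenizer.get()
        tok = tokenizer.token
        print(tok.type, getattr(tok, "value", ""))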
