ThirdParty/Jasy/jasy/script/tokenize/Tokenizer.py

#
# Jasy - Web Tooling Framework
# Copyright 2010-2012 Zynga Inc.
# Copyright 2013-2014 Sebastian Werner
#

#
# License: MPL 1.1/GPL 2.0/LGPL 2.1
# Authors:
#   - Brendan Eich <brendan@mozilla.org> (Original JavaScript) (2004-2010)
#   - Sebastian Werner <info@sebastian-werner.net> (Python Port) (2010)
#

from __future__ import unicode_literals

import re, copy

import jasy.script.tokenize.Lang as Lang
import jasy.script.api.Comment as Comment
import jasy.core.Console as Console


# Operator and punctuator mapping from token to tree node type name.
# NB: because the lexer doesn't backtrack, all token prefixes must themselves
# be valid tokens (e.g. !== is acceptable because its prefixes are the valid
# tokens != and !).
operatorNames = {
    '<'   : 'lt',
    '>'   : 'gt',
    '<='  : 'le',
    '>='  : 'ge',
    '!='  : 'ne',
    '!'   : 'not',
    '=='  : 'eq',
    '===' : 'strict_eq',
    '!==' : 'strict_ne',

    '>>'  : 'rsh',
    '<<'  : 'lsh',
    '>>>' : 'ursh',

    '+'   : 'plus',
    '*'   : 'mul',
    '-'   : 'minus',
    '/'   : 'div',
    '%'   : 'mod',

    ','   : 'comma',
    ';'   : 'semicolon',
    ':'   : 'colon',
    '='   : 'assign',
    '?'   : 'hook',

    '&&'  : 'and',
    '||'  : 'or',

    '++'  : 'increment',
    '--'  : 'decrement',

    ')'   : 'right_paren',
    '('   : 'left_paren',
    '['   : 'left_bracket',
    ']'   : 'right_bracket',
    '{'   : 'left_curly',
    '}'   : 'right_curly',

    '&'   : 'bitwise_and',
    '^'   : 'bitwise_xor',
    '|'   : 'bitwise_or',
    '~'   : 'bitwise_not'
}
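
# A quick illustration of the prefix rule above (a sketch, not executed at
# runtime): scanning "!==" one character at a time only works because every
# intermediate string is itself a key of operatorNames:
#
#     op = "!"
#     for ch in "==":            # "!" -> "!=" -> "!=="
#         assert (op + ch) in operatorNames
#         op += ch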


# Assignment operators
assignOperators = ["|", "^", "&", "<<", ">>", ">>>", "+", "-", "*", "/", "%"]




#
# Classes
#

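# A Token is a plain record: "type" is an operator name from operatorNames,
# a keyword, or one of "identifier", "number", "string", "regexp", "newline"
# and "end"; "assignOp" is set by lexOp() for compound assignments like "+=".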
class Token:
    __slots__ = ["type", "start", "line", "assignOp", "end", "value"]


class ParseError(Exception):
    def __init__(self, message, fileId, line):
        Exception.__init__(self, "Syntax error: %s\n%s:%s" % (message, fileId, line))


class Tokenizer(object):
    def __init__(self, source, fileId="", line=1):
        # source: JavaScript source
        # fileId: Filename (for debugging purposes)
        # line: Line number (for debugging purposes)
        self.cursor = 0
        self.source = str(source)
        self.tokens = {}
        self.tokenIndex = 0
        self.lookahead = 0
        self.scanNewlines = False
        self.fileId = fileId
        self.line = line
        self.comments = []

    input_ = property(lambda self: self.source[self.cursor:])
    token = property(lambda self: self.tokens.get(self.tokenIndex))
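
    # The tokenizer keeps a ring buffer of four tokens (self.tokens, indexed
    # modulo 4 via "& 3"); unget() and peek() move within this buffer instead
    # of re-lexing, which is why unget() panics once lookahead reaches four.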


    def done(self):
        # We need to set scanOperand to true here because the first thing
        # might be a regexp.
        return self.peek(True) == "end"


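    # match() consumes the next token and reports whether it has the given
    # type; on a mismatch the token is pushed back via unget(), which returns
    # None and so makes the whole expression falsy.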
    def match(self, tokenType, scanOperand=False):
        return self.get(scanOperand) == tokenType or self.unget()


    def mustMatch(self, tokenType):
        if not self.match(tokenType):
            raise ParseError("Missing " + tokenType, self.fileId, self.line)

        return self.token


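    # peek() reports the type of the upcoming token without consuming it,
    # reading from the lookahead ring buffer when possible and falling back
    # to get()/unget() otherwise.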
    def peek(self, scanOperand=False):
        if self.lookahead:
            next = self.tokens.get((self.tokenIndex + self.lookahead) & 3)
            if self.scanNewlines and (getattr(next, "line", None) != getattr(self, "line", None)):
                tokenType = "newline"
            else:
                tokenType = getattr(next, "type", None)
        else:
            tokenType = self.get(scanOperand)
            self.unget()

        return tokenType


    def peekOnSameLine(self, scanOperand=False):
        self.scanNewlines = True
        tokenType = self.peek(scanOperand)
        self.scanNewlines = False
        return tokenType


    def getComments(self):
        if self.comments:
            comments = self.comments
            self.comments = []
            return comments

        return None


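    # skip() also classifies each comment it collects: "inline" (on the same
    # line as preceding code), "section" (separated from the code above by a
    # blank line) or "block" (directly above the following code).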
    def skip(self):
        """Eats comments and whitespace."""
        input = self.source
        startLine = self.line

        # Whether this is the first call, as happens when starting to parse a
        # file (eats leading comments/whitespace)
        startOfFile = self.cursor == 0

        indent = ""

        while True:
            if len(input) > self.cursor:
                ch = input[self.cursor]
            else:
                return

            self.cursor += 1

            if len(input) > self.cursor:
                next = input[self.cursor]
            else:
                next = None

            if ch == "\n" and not self.scanNewlines:
                self.line += 1
                indent = ""

            elif ch == "/" and next == "*":
                self.cursor += 1
                text = "/*"
                commentStartLine = self.line
                if startLine == self.line and not startOfFile:
                    mode = "inline"
                elif (self.line - 1) > startLine:
                    # Distance before this comment means it is a comment block
                    # for a whole section (multiple lines of code)
                    mode = "section"
                else:
                    # Comment for maybe multiple following lines of code, but
                    # not that important (no visual whitespace divider)
                    mode = "block"

                while True:
                    try:
                        ch = input[self.cursor]
                        self.cursor += 1
                    except IndexError:
                        raise ParseError("Unterminated comment", self.fileId, self.line)

                    if ch == "*":
                        next = input[self.cursor]
                        if next == "/":
                            text += "*/"
                            self.cursor += 1
                            break

                    elif ch == "\n":
                        self.line += 1

                    text += ch

                # Filter escaping on slash-star combinations in comment text
                text = text.replace("*\\/", "*/")

                try:
                    self.comments.append(Comment.Comment(text, mode, commentStartLine, indent, self.fileId))
                except Comment.CommentException as commentError:
                    Console.error("Ignoring comment in %s: %s", self.fileId, commentError)

            elif ch == "/" and next == "/":
                self.cursor += 1
                text = "//"
                if startLine == self.line and not startOfFile:
                    mode = "inline"
                elif (self.line - 1) > startLine:
                    # Distance before this comment means it is a comment block
                    # for a whole section (multiple lines of code)
                    mode = "section"
                else:
                    # Comment for maybe multiple following lines of code, but
                    # not that important (no visual whitespace divider)
                    mode = "block"

                while True:
                    try:
                        ch = input[self.cursor]
                        self.cursor += 1
                    except IndexError:
                        # end of file etc.
                        break

                    if ch == "\n":
                        self.line += 1
                        break

                    text += ch

                try:
                    self.comments.append(Comment.Comment(text, mode, self.line - 1, "", self.fileId))
                except Comment.CommentException as commentError:
                    Console.error("Ignoring comment in %s: %s", self.fileId, commentError)

            # Check for whitespace, also for special cases like 0xA0
            # (non-breaking space)
            elif ch in "\xA0 \t":
                indent += ch

            else:
                self.cursor -= 1
                return


    # Lexes the exponential part of a number, if present. Returns True if an
    # exponential part was found.
    def lexExponent(self):
        input = self.source
        next = input[self.cursor]
        if next == "e" or next == "E":
            self.cursor += 1
            ch = input[self.cursor]
            self.cursor += 1
            if ch == "+" or ch == "-":
                ch = input[self.cursor]
                self.cursor += 1

            if ch < "0" or ch > "9":
                raise ParseError("Missing exponent", self.fileId, self.line)

            while True:
                ch = input[self.cursor]
                self.cursor += 1
                if not (ch >= "0" and ch <= "9"):
                    break

            self.cursor -= 1
            return True

        return False


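    # lexZeroNumber() handles numbers starting with "0": a float ("0.5"), a
    # hex literal ("0xFF"), a legacy octal literal ("0777") or a plain zero
    # with an optional exponent ("0e1").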
    def lexZeroNumber(self, ch):
        token = self.token
        input = self.source
        token.type = "number"

        ch = input[self.cursor]
        self.cursor += 1
        if ch == ".":
            while True:
                ch = input[self.cursor]
                self.cursor += 1
                if not (ch >= "0" and ch <= "9"):
                    break

            self.cursor -= 1
            self.lexExponent()
            token.value = input[token.start:self.cursor]

        elif ch == "x" or ch == "X":
            while True:
                ch = input[self.cursor]
                self.cursor += 1
                if not ((ch >= "0" and ch <= "9") or (ch >= "a" and ch <= "f") or (ch >= "A" and ch <= "F")):
                    break

            self.cursor -= 1
            token.value = input[token.start:self.cursor]

        elif ch >= "0" and ch <= "7":
            while True:
                ch = input[self.cursor]
                self.cursor += 1
                if not (ch >= "0" and ch <= "7"):
                    break

            self.cursor -= 1
            token.value = input[token.start:self.cursor]

        else:
            self.cursor -= 1
            self.lexExponent()  # 0E1, &c.
            token.value = 0


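    # lexNumber() lexes a decimal literal starting with 1-9; plain integers
    # are converted to Python ints, while floats and exponent forms are kept
    # as source strings.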
    def lexNumber(self, ch):
        token = self.token
        input = self.source
        token.type = "number"

        floating = False
        while True:
            ch = input[self.cursor]
            self.cursor += 1

            if ch == "." and not floating:
                floating = True
                ch = input[self.cursor]
                self.cursor += 1

            if not (ch >= "0" and ch <= "9"):
                break

        self.cursor -= 1

        exponent = self.lexExponent()
        segment = input[token.start:self.cursor]

        # Keep floats and exponent numbers as strings to preserve their
        # exact source form
        if floating or exponent:
            token.value = segment
        else:
            token.value = int(segment)


    def lexDot(self, ch):
        token = self.token
        input = self.source
        next = input[self.cursor]

        if next >= "0" and next <= "9":
            while True:
                ch = input[self.cursor]
                self.cursor += 1
                if not (ch >= "0" and ch <= "9"):
                    break

            self.cursor -= 1
            self.lexExponent()

            token.type = "number"
            token.value = input[token.start:self.cursor]

        else:
            token.type = "dot"


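    # lexString() lexes a single- or double-quoted string. When escape
    # sequences are present, the raw source slice is eval()'d to decode
    # them; this relies on JavaScript escapes being (mostly) compatible
    # with Python's and is inherited from the original implementation.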
    def lexString(self, ch):
        token = self.token
        input = self.source
        token.type = "string"

        hasEscapes = False
        delim = ch
        ch = input[self.cursor]
        self.cursor += 1
        while ch != delim:
            if ch == "\\":
                hasEscapes = True
                self.cursor += 1

            ch = input[self.cursor]
            self.cursor += 1

        if hasEscapes:
            token.value = eval(input[token.start:self.cursor])
        else:
            token.value = input[token.start + 1:self.cursor - 1]


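    # lexRegExp() lexes a regular expression literal including trailing
    # flags. It is only entered when the parser expects an operand
    # (scanOperand=True), which is how "/" as division is disambiguated
    # from the start of a regexp.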
    def lexRegExp(self, ch):
        token = self.token
        input = self.source
        token.type = "regexp"

        while True:
            try:
                ch = input[self.cursor]
                self.cursor += 1
            except IndexError:
                raise ParseError("Unterminated regex", self.fileId, self.line)

            if ch == "\\":
                self.cursor += 1

            elif ch == "[":
                while True:
                    if ch == "\\":
                        self.cursor += 1

                    try:
                        ch = input[self.cursor]
                        self.cursor += 1
                    except IndexError:
                        raise ParseError("Unterminated character class", self.fileId, self.line)

                    if ch == "]":
                        break

            if ch == "/":
                break

        # Consume the regexp flags (a-z) after the closing slash
        while True:
            ch = input[self.cursor]
            self.cursor += 1
            if not (ch >= "a" and ch <= "z"):
                break

        self.cursor -= 1
        token.value = input[token.start:self.cursor]


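    # lexOp() performs a greedy ("maximal munch") operator scan: it keeps
    # extending the operator while the longer string is still a key of
    # operatorNames, then checks for a trailing "=" to form a compound
    # assignment such as "+=".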
    def lexOp(self, ch):
        token = self.token
        input = self.source

        op = ch
        while True:
            try:
                next = input[self.cursor]
            except IndexError:
                break

            if (op + next) in operatorNames:
                self.cursor += 1
                op += next
            else:
                break

        try:
            next = input[self.cursor]
        except IndexError:
            next = None

        if next == "=" and op in assignOperators:
            self.cursor += 1
            token.type = "assign"
            token.assignOp = operatorNames[op]
            op += "="

        else:
            token.type = operatorNames[op]
            token.assignOp = None


    # FIXME: Unicode escape sequences
    # FIXME: Unicode identifiers
    def lexIdent(self, ch):
        token = self.token
        input = self.source

        try:
            while True:
                ch = input[self.cursor]
                self.cursor += 1

                if not ((ch >= "a" and ch <= "z") or (ch >= "A" and ch <= "Z") or (ch >= "0" and ch <= "9") or ch == "$" or ch == "_"):
                    break

        except IndexError:
            self.cursor += 1

        # Put the non-word character back.
        self.cursor -= 1

        identifier = input[token.start:self.cursor]
        if identifier in Lang.keywords:
            token.type = identifier
        else:
            token.type = "identifier"
            token.value = identifier


    def get(self, scanOperand=False):
        """
        Consumes input *only* if there is no lookahead.
        Dispatches to the appropriate lexing function depending on the input.
        """
        while self.lookahead:
            self.lookahead -= 1
            self.tokenIndex = (self.tokenIndex + 1) & 3
            token = self.tokens[self.tokenIndex]
            if token.type != "newline" or self.scanNewlines:
                return token.type

        self.skip()

        self.tokenIndex = (self.tokenIndex + 1) & 3
        self.tokens[self.tokenIndex] = token = Token()

        token.start = self.cursor
        token.line = self.line

        input = self.source
        if self.cursor == len(input):
            token.end = token.start
            token.type = "end"
            return token.type

        ch = input[self.cursor]
        self.cursor += 1

        if (ch >= "a" and ch <= "z") or (ch >= "A" and ch <= "Z") or ch == "$" or ch == "_":
            self.lexIdent(ch)

        elif scanOperand and ch == "/":
            self.lexRegExp(ch)

        elif ch == ".":
            self.lexDot(ch)

        elif self.scanNewlines and ch == "\n":
            token.type = "newline"
            self.line += 1

        elif ch in operatorNames:
            self.lexOp(ch)

        elif ch >= "1" and ch <= "9":
            self.lexNumber(ch)

        elif ch == "0":
            self.lexZeroNumber(ch)

        elif ch == '"' or ch == "'":
            self.lexString(ch)

        else:
            raise ParseError("Illegal token: %s (Code: %s)" % (ch, ord(ch)), self.fileId, self.line)

        token.end = self.cursor
        return token.type


    def unget(self):
        """match() depends on unget() returning None (its implicit return value)."""
        self.lookahead += 1

        if self.lookahead == 4:
            raise ParseError("PANIC: too much lookahead!", self.fileId, self.line)

        self.tokenIndex = (self.tokenIndex - 1) & 3

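
if __name__ == "__main__":
    # Minimal usage sketch (assumes the jasy package is importable; the file
    # id "demo.js" is hypothetical): tokenize a short snippet and print each
    # token type until "end" is reached.
    tokenizer = Tokenizer("var answer = 42;", fileId="demo.js")
    while not tokenizer.done():
        tokenizer.get()
        print(tokenizer.token.type)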