            if self.encoding == 'guess':
                text, _ = guess_decode(text)
            elif self.encoding == 'chardet':
                try:
                    import chardet
                except ImportError as e:
                    raise ImportError('To enable chardet encoding guessing, '
                                      'please install the chardet library '
                                      'from http://chardet.feedparser.org/') from e
                # check for BOM first
                decoded = None
                for bom, encoding in _encoding_map:
                    if text.startswith(bom):
                        decoded = text[len(bom):].decode(encoding, 'replace')
                        break
                # no BOM found, so use chardet
                if decoded is None:
                    enc = chardet.detect(text[:1024])
                    decoded = text.decode(enc.get('encoding') or 'utf-8',
                                          'replace')
                text = decoded
            else:
                text = text.decode(self.encoding)
                if text.startswith('\ufeff'):
                    text = text[len('\ufeff'):]
        else:
            if text.startswith('\ufeff'):
                text = text[len('\ufeff'):]

        # text now *is* a unicode string
        text = text.replace('\r\n', '\n')
        text = text.replace('\r', '\n')
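        # self.stripall, when true, strips all leading and trailing
        # whitespace from the input before it is tokenized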
        if self.stripall:
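
# --- from RegexLexerMeta._process_state: compiling one token definition ---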
            try:
                rex = cls._process_regex(tdef[0], rflags, state)
            except Exception as err:
                raise ValueError("uncompilable regex %r in state %r of %r: %s" %
                                 (tdef[0], state, cls, err)) from err

            token = cls._process_token(tdef[1])

            if len(tdef) == 2:
                new_state = None
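
# --- from RegexLexer.get_tokens_unprocessed: the main matching loop ---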
                if m:
                    if action is not None:
                        if type(action) is _TokenType:
                            yield pos, action, m.group()
                        else:
                            yield from action(self, m)
                    pos = m.end()
                    if new_state is not None:
                        # state transition
                        if isinstance(new_state, tuple):
                            for state in new_state:
                # ...
                try:
                    if text[pos] == '\n':
                        # at EOL, reset state to "root"
                        statestack = ['root']
                        statetokens = tokendefs['root']
                        yield pos, Text, '\n'
                        pos += 1
                        continue
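                    # not a newline: emit an Error token for the single
                    # unmatched character and advance by one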
                    yield pos, Error, text[pos]
                    pos += 1
                except IndexError:
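
# --- from ExtendedRegexLexer.get_tokens_unprocessed: the context-aware variant ---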
                    if action is not None:
                        if type(action) is _TokenType:
                            yield ctx.pos, action, m.group()
                            ctx.pos = m.end()
                        else:
                            yield from action(self, m, ctx)
                            if not new_state:
                                # altered the state stack?
                                statetokens = tokendefs[ctx.stack[-1]]
                    # CAUTION: callback must set ctx.pos!
                    if new_state is not None:
                # ...
                        break
                    if text[ctx.pos] == '\n':
                        # at EOL, reset state to "root"
                        ctx.stack = ['root']
                        statetokens = tokendefs['root']
                        yield ctx.pos, Text, '\n'
                        ctx.pos += 1
                        continue
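                    # as in RegexLexer above: an unmatched character is
                    # emitted as a single Error token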
                    yield ctx.pos, Error, text[ctx.pos]
                    ctx.pos += 1
                except IndexError:
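
# --- from ProfilingRegexLexer: per-regex timing statistics ---
# a rough usage sketch (an assumption, not shown in this excerpt): mix the
# profiler in ahead of a concrete lexer class, e.g.
#   class ProfilingPythonLexer(ProfilingRegexLexer, PythonLexer):
#       pass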
    _prof_sort_index = 4  # defaults to time per call

    def get_tokens_unprocessed(self, text, stack=('root',)):
        # this needs to be a stack, since using(this) will produce nested calls
        self.__class__._prof_data.append({})
        yield from RegexLexer.get_tokens_unprocessed(self, text, stack)
        rawdata = self.__class__._prof_data.pop()
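        # rawdata maps (state, regex) pairs to (match count, total time);
        # build rows of (state, truncated regex, count, total ms, ms per call)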
        data = sorted(((s, repr(r).strip('u\'').replace('\\\\', '\\')[:65],
                        n, 1000 * t, 1000 * t / n)
                       for ((s, r), (n, t)) in rawdata.items()),
                      key=lambda x: x[self._prof_sort_index],