diff -r 6762afd9f963 -r 90c57b50600f ThirdParty/CharDet/chardet/codingstatemachine.py --- a/ThirdParty/CharDet/chardet/codingstatemachine.py Tue Apr 25 18:36:38 2017 +0200 +++ b/ThirdParty/CharDet/chardet/codingstatemachine.py Tue Apr 25 18:40:46 2017 +0200 @@ -25,37 +25,64 @@ # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -from .constants import eStart -from .compat import wrap_ord +import logging + +from .enums import MachineState -class CodingStateMachine: +class CodingStateMachine(object): + """ + A state machine to verify a byte sequence for a particular encoding. For + each byte the detector receives, it will feed that byte to every active + state machine available, one byte at a time. The state machine changes its + state based on its previous state and the byte it receives. There are 3 + states in a state machine that are of interest to an auto-detector: + + START state: This is the state to start with, or a legal byte sequence + (i.e. a valid code point) for character has been identified. + + ME state: This indicates that the state machine identified a byte sequence + that is specific to the charset it is designed for and that + there is no other possible encoding which can contain this byte + sequence. This will to lead to an immediate positive answer for + the detector. + + ERROR state: This indicates the state machine identified an illegal byte + sequence for that encoding. This will lead to an immediate + negative answer for this encoding. Detector will exclude this + encoding from consideration from here on. + """ def __init__(self, sm): - self._mModel = sm - self._mCurrentBytePos = 0 - self._mCurrentCharLen = 0 + self._model = sm + self._curr_byte_pos = 0 + self._curr_char_len = 0 + self._curr_state = None + self.logger = logging.getLogger(__name__) self.reset() def reset(self): - self._mCurrentState = eStart + self._curr_state = MachineState.START def next_state(self, c): # for each byte we get its class # if it is first byte, we also get byte length - # PY3K: aBuf is a byte stream, so c is an int, not a byte - byteCls = self._mModel['classTable'][wrap_ord(c)] - if self._mCurrentState == eStart: - self._mCurrentBytePos = 0 - self._mCurrentCharLen = self._mModel['charLenTable'][byteCls] - # from byte's class and stateTable, we get its next state - curr_state = (self._mCurrentState * self._mModel['classFactor'] - + byteCls) - self._mCurrentState = self._mModel['stateTable'][curr_state] - self._mCurrentBytePos += 1 - return self._mCurrentState + byte_class = self._model['class_table'][c] + if self._curr_state == MachineState.START: + self._curr_byte_pos = 0 + self._curr_char_len = self._model['char_len_table'][byte_class] + # from byte's class and state_table, we get its next state + curr_state = (self._curr_state * self._model['class_factor'] + + byte_class) + self._curr_state = self._model['state_table'][curr_state] + self._curr_byte_pos += 1 + return self._curr_state def get_current_charlen(self): - return self._mCurrentCharLen + return self._curr_char_len def get_coding_state_machine(self): - return self._mModel['name'] + return self._model['name'] + + @property + def language(self): + return self._model['language']