eric: comparison ThirdParty/CharDet/chardet/codingstatemachine.py

-:6762afd9f963
+:90c57b50600f
 # License along with this library; if not, write to the Free Software
 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
 # 02110-1301  USA
 ######################### END LICENSE BLOCK #########################
-from .constants import eStart
+import logging
-from .compat import wrap_ord
+from .enums import MachineState
-class CodingStateMachine:
+class CodingStateMachine(object):
+"""
+A state machine to verify a byte sequence for a particular encoding. For
+each byte the detector receives, it will feed that byte to every active
+state machine available, one byte at a time. The state machine changes its
+state based on its previous state and the byte it receives. There are 3
+states in a state machine that are of interest to an auto-detector:
+START state: This is the state to start with, or a legal byte sequence
+(i.e. a valid code point) for a character has been identified.
+ME state:  This indicates that the state machine identified a byte sequence
+that is specific to the charset it is designed for and that
+there is no other possible encoding which can contain this byte
+sequence. This will lead to an immediate positive answer for
+the detector.
+ERROR state: This indicates the state machine identified an illegal byte
+sequence for that encoding. This will lead to an immediate
+negative answer for this encoding. Detector will exclude this
+encoding from consideration from here on.
+"""
 def __init__(self, sm):
-self._mModel = sm
+self._model = sm
-self._mCurrentBytePos = 0
+self._curr_byte_pos = 0
-self._mCurrentCharLen = 0
+self._curr_char_len = 0
+self._curr_state = None
+self.logger = logging.getLogger(__name__)
 self.reset()
 def reset(self):
-self._mCurrentState = eStart
+self._curr_state = MachineState.START
 def next_state(self, c):
 # for each byte we get its class
 # if it is first byte, we also get byte length
-# PY3K: aBuf is a byte stream, so c is an int, not a byte
+byte_class = self._model['class_table'][c]
-byteCls = self._mModel['classTable'][wrap_ord(c)]
+if self._curr_state == MachineState.START:
-if self._mCurrentState == eStart:
+self._curr_byte_pos = 0
-self._mCurrentBytePos = 0
+self._curr_char_len = self._model['char_len_table'][byte_class]
-self._mCurrentCharLen = self._mModel['charLenTable'][byteCls]
+# from byte's class and state_table, we get its next state
-# from byte's class and stateTable, we get its next state
+curr_state = (self._curr_state * self._model['class_factor']
-curr_state = (self._mCurrentState * self._mModel['classFactor']
++ byte_class)
-+ byteCls)
+self._curr_state = self._model['state_table'][curr_state]
-self._mCurrentState = self._mModel['stateTable'][curr_state]
+self._curr_byte_pos += 1
-self._mCurrentBytePos += 1
+return self._curr_state
-return self._mCurrentState
 def get_current_charlen(self):
-return self._mCurrentCharLen
+return self._curr_char_len
 def get_coding_state_machine(self):
-return self._mModel['name']
+return self._model['name']
+@property
+def language(self):
+return self._model['language']

Mercurial Repositories > eric / file comparison

comparison: ThirdParty/CharDet/chardet/codingstatemachine.py

ThirdParty/CharDet/chardet/codingstatemachine.py