ThirdParty/CharDet/chardet/codingstatemachine.py

changeset 5714
90c57b50600f
parent 3537
7662053c3906
diff -r 6762afd9f963 -r 90c57b50600f ThirdParty/CharDet/chardet/codingstatemachine.py
--- a/ThirdParty/CharDet/chardet/codingstatemachine.py	Tue Apr 25 18:36:38 2017 +0200
+++ b/ThirdParty/CharDet/chardet/codingstatemachine.py	Tue Apr 25 18:40:46 2017 +0200
@@ -25,37 +25,64 @@
 # 02110-1301  USA
 ######################### END LICENSE BLOCK #########################
 
-from .constants import eStart
-from .compat import wrap_ord
+import logging
+
+from .enums import MachineState
 
 
-class CodingStateMachine:
+class CodingStateMachine(object):
+    """
+    A state machine to verify a byte sequence for a particular encoding. For
+    each byte the detector receives, it will feed that byte to every active
+    state machine available, one byte at a time. The state machine changes its
+    state based on its previous state and the byte it receives. There are 3
+    states in a state machine that are of interest to an auto-detector:
+
+    START state: This is the state to start with, or a legal byte sequence
+                 (i.e. a valid code point) for character has been identified.
+
+    ME state:  This indicates that the state machine identified a byte sequence
+               that is specific to the charset it is designed for and that
+               there is no other possible encoding which can contain this byte
+               sequence. This will to lead to an immediate positive answer for
+               the detector.
+
+    ERROR state: This indicates the state machine identified an illegal byte
+                 sequence for that encoding. This will lead to an immediate
+                 negative answer for this encoding. Detector will exclude this
+                 encoding from consideration from here on.
+    """
     def __init__(self, sm):
-        self._mModel = sm
-        self._mCurrentBytePos = 0
-        self._mCurrentCharLen = 0
+        self._model = sm
+        self._curr_byte_pos = 0
+        self._curr_char_len = 0
+        self._curr_state = None
+        self.logger = logging.getLogger(__name__)
         self.reset()
 
     def reset(self):
-        self._mCurrentState = eStart
+        self._curr_state = MachineState.START
 
     def next_state(self, c):
         # for each byte we get its class
         # if it is first byte, we also get byte length
-        # PY3K: aBuf is a byte stream, so c is an int, not a byte
-        byteCls = self._mModel['classTable'][wrap_ord(c)]
-        if self._mCurrentState == eStart:
-            self._mCurrentBytePos = 0
-            self._mCurrentCharLen = self._mModel['charLenTable'][byteCls]
-        # from byte's class and stateTable, we get its next state
-        curr_state = (self._mCurrentState * self._mModel['classFactor']
-                      + byteCls)
-        self._mCurrentState = self._mModel['stateTable'][curr_state]
-        self._mCurrentBytePos += 1
-        return self._mCurrentState
+        byte_class = self._model['class_table'][c]
+        if self._curr_state == MachineState.START:
+            self._curr_byte_pos = 0
+            self._curr_char_len = self._model['char_len_table'][byte_class]
+        # from byte's class and state_table, we get its next state
+        curr_state = (self._curr_state * self._model['class_factor']
+                      + byte_class)
+        self._curr_state = self._model['state_table'][curr_state]
+        self._curr_byte_pos += 1
+        return self._curr_state
 
     def get_current_charlen(self):
-        return self._mCurrentCharLen
+        return self._curr_char_len
 
     def get_coding_state_machine(self):
-        return self._mModel['name']
+        return self._model['name']
+
+    @property
+    def language(self):
+        return self._model['language']

eric ide

mercurial