# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301  USA
######################### END LICENSE BLOCK #########################

import logging

from .compat import wrap_ord
from .constants import eStart
from .enums import MachineState


class CodingStateMachine(object):
    """
    A state machine to verify a byte sequence for a particular encoding. For
    each byte the detector receives, it will feed that byte to every active
    state machine available, one byte at a time. The state machine changes its
    state based on its previous state and the byte it receives. There are 3
    states in a state machine that are of interest to an auto-detector:

    START state: This is the state to start with, or a legal byte sequence
                 (i.e. a valid code point) for a character has been
                 identified.

    ME state:  This indicates that the state machine identified a byte sequence
               that is specific to the charset it is designed for and that
               there is no other possible encoding which can contain this byte
               sequence. This will lead to an immediate positive answer for
               the detector.

    ERROR state: This indicates the state machine identified an illegal byte
                 sequence for that encoding. This will lead to an immediate
                 negative answer for this encoding. Detector will exclude this
                 encoding from consideration from here on.
    """

    def __init__(self, sm):
        """Bind the machine to the model ``sm`` and put it in the START state.

        ``sm`` is indexed by the string keys ``'class_table'``,
        ``'char_len_table'``, ``'class_factor'``, ``'state_table'``,
        ``'name'`` and ``'language'`` by the methods of this class.
        """
        self._model = sm
        self._curr_byte_pos = 0
        self._curr_char_len = 0
        # Placeholder only; reset() below assigns the real initial state.
        self._curr_state = None
        self.logger = logging.getLogger(__name__)
        self.reset()

    def reset(self):
        """Return the machine to ``MachineState.START``."""
        self._curr_state = MachineState.START

    def next_state(self, c):
        """Consume one byte and return the state the machine moves to.

        ``c`` is used directly as an index into the model's class table
        (presumably an integer byte value -- confirm against callers).
        """
        # for each byte we get its class
        # if it is first byte, we also get byte length
        byte_class = self._model['class_table'][c]
        if self._curr_state == MachineState.START:
            self._curr_byte_pos = 0
            self._curr_char_len = self._model['char_len_table'][byte_class]
        # from byte's class and state_table, we get its next state; the
        # table is flat, addressed by (state * class_factor + byte class)
        table_index = (self._curr_state * self._model['class_factor']
                       + byte_class)
        self._curr_state = self._model['state_table'][table_index]
        self._curr_byte_pos += 1
        return self._curr_state

    def get_current_charlen(self):
        """Return the character length recorded when the current character began."""
        return self._curr_char_len

    def get_coding_state_machine(self):
        """Return the ``'name'`` entry of the underlying model."""
        return self._model['name']

    @property
    def language(self):
        """The ``'language'`` entry of the underlying model."""
        return self._model['language']