ThirdParty/CharDet/chardet/utf8prober.py

changeset 3537
7662053c3906
parent 12
1d8dd9706f46
child 5714
90c57b50600f
Legend: equal | deleted | inserted | replaced
3536:c06338ca892b 3537:7662053c3906
11 # 11 #
12 # This library is free software; you can redistribute it and/or 12 # This library is free software; you can redistribute it and/or
13 # modify it under the terms of the GNU Lesser General Public 13 # modify it under the terms of the GNU Lesser General Public
14 # License as published by the Free Software Foundation; either 14 # License as published by the Free Software Foundation; either
15 # version 2.1 of the License, or (at your option) any later version. 15 # version 2.1 of the License, or (at your option) any later version.
16 # 16 #
17 # This library is distributed in the hope that it will be useful, 17 # This library is distributed in the hope that it will be useful,
18 # but WITHOUT ANY WARRANTY; without even the implied warranty of 18 # but WITHOUT ANY WARRANTY; without even the implied warranty of
19 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 19 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20 # Lesser General Public License for more details. 20 # Lesser General Public License for more details.
21 # 21 #
22 # You should have received a copy of the GNU Lesser General Public 22 # You should have received a copy of the GNU Lesser General Public
23 # License along with this library; if not, write to the Free Software 23 # License along with this library; if not, write to the Free Software
24 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 24 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
25 # 02110-1301 USA 25 # 02110-1301 USA
26 ######################### END LICENSE BLOCK ######################### 26 ######################### END LICENSE BLOCK #########################
27 27
28 from . import constants 28 from . import constants
29 import sys
30 from .constants import eStart, eError, eItsMe
31 from .charsetprober import CharSetProber 29 from .charsetprober import CharSetProber
32 from .codingstatemachine import CodingStateMachine 30 from .codingstatemachine import CodingStateMachine
33 from .mbcssm import UTF8SMModel 31 from .mbcssm import UTF8SMModel
34 32
35 ONE_CHAR_PROB = 0.5 33 ONE_CHAR_PROB = 0.5
34
36 35
37 class UTF8Prober(CharSetProber): 36 class UTF8Prober(CharSetProber):
38 def __init__(self): 37 def __init__(self):
39 CharSetProber.__init__(self) 38 CharSetProber.__init__(self)
40 self._mCodingSM = CodingStateMachine(UTF8SMModel) 39 self._mCodingSM = CodingStateMachine(UTF8SMModel)
49 return "utf-8" 48 return "utf-8"
50 49
51 def feed(self, aBuf): 50 def feed(self, aBuf):
52 for c in aBuf: 51 for c in aBuf:
53 codingState = self._mCodingSM.next_state(c) 52 codingState = self._mCodingSM.next_state(c)
54 if codingState == eError: 53 if codingState == constants.eError:
55 self._mState = constants.eNotMe 54 self._mState = constants.eNotMe
56 break 55 break
57 elif codingState == eItsMe: 56 elif codingState == constants.eItsMe:
58 self._mState = constants.eFoundIt 57 self._mState = constants.eFoundIt
59 break 58 break
60 elif codingState == eStart: 59 elif codingState == constants.eStart:
61 if self._mCodingSM.get_current_charlen() >= 2: 60 if self._mCodingSM.get_current_charlen() >= 2:
62 self._mNumOfMBChar += 1 61 self._mNumOfMBChar += 1
63 62
64 if self.get_state() == constants.eDetecting: 63 if self.get_state() == constants.eDetecting:
65 if self.get_confidence() > constants.SHORTCUT_THRESHOLD: 64 if self.get_confidence() > constants.SHORTCUT_THRESHOLD:

eric ide

mercurial