|
1 ######################## BEGIN LICENSE BLOCK ######################## |
|
2 # The Original Code is mozilla.org code. |
|
3 # |
|
4 # The Initial Developer of the Original Code is |
|
5 # Netscape Communications Corporation. |
|
6 # Portions created by the Initial Developer are Copyright (C) 1998 |
|
7 # the Initial Developer. All Rights Reserved. |
|
8 # |
|
9 # Contributor(s): |
|
10 # Mark Pilgrim - port to Python |
|
11 # |
|
12 # This library is free software; you can redistribute it and/or |
|
13 # modify it under the terms of the GNU Lesser General Public |
|
14 # License as published by the Free Software Foundation; either |
|
15 # version 2.1 of the License, or (at your option) any later version. |
|
16 # |
|
17 # This library is distributed in the hope that it will be useful, |
|
18 # but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
19 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|
20 # Lesser General Public License for more details. |
|
21 # |
|
22 # You should have received a copy of the GNU Lesser General Public |
|
23 # License along with this library; if not, write to the Free Software |
|
24 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA |
|
25 # 02110-1301 USA |
|
26 ######################### END LICENSE BLOCK ######################### |
|
27 |
|
28 import constants, sys |
|
29 from constants import eStart, eError, eItsMe |
|
30 from charsetprober import CharSetProber |
|
31 from codingstatemachine import CodingStateMachine |
|
32 from mbcssm import UTF8SMModel |
|
33 |
|
# Factor applied to the "unlikely to be UTF-8" estimate once for every
# multi-byte character counted by UTF8Prober (see get_confidence).
ONE_CHAR_PROB = 0.5


class UTF8Prober(CharSetProber):
    """Charset prober that scores input as UTF-8.

    Each item of the fed buffer is pushed through a CodingStateMachine
    built from UTF8SMModel.  Every time the machine returns to eStart
    after a character whose reported length is at least 2, a counter is
    bumped; get_confidence() turns that counter into a score.
    """

    def __init__(self):
        """Create the state machine and put the prober in its initial state."""
        CharSetProber.__init__(self)
        self._mCodingSM = CodingStateMachine(UTF8SMModel)
        self.reset()

    def reset(self):
        """Reset the base prober, the state machine and the character count."""
        CharSetProber.reset(self)
        self._mCodingSM.reset()
        # Number of characters seen so far whose length was >= 2.
        self._mNumOfMBChar = 0

    def get_charset_name(self):
        """Return the name of the charset this prober detects."""
        return "utf-8"

    def feed(self, aBuf):
        """Run aBuf through the state machine and return the prober state.

        aBuf is iterated item by item; each item is handed unchanged to the
        state machine's next_state() (presumably one byte per item -- confirm
        against CodingStateMachine).

        Scanning stops at the first eError (state becomes eNotMe) or the
        first eItsMe (state becomes eFoundIt).  If the prober is still
        detecting after the scan and the confidence exceeds
        constants.SHORTCUT_THRESHOLD, the state is promoted to eFoundIt.
        """
        machine = self._mCodingSM
        for item in aBuf:
            outcome = machine.next_state(item)
            if outcome == eError:
                # Sequence rejected by the state machine: not this charset.
                self._mState = constants.eNotMe
                break
            if outcome == eItsMe:
                # The state machine is certain about the charset.
                self._mState = constants.eFoundIt
                break
            if outcome == eStart and machine.get_current_charlen() >= 2:
                # A complete character of length >= 2 was just consumed.
                self._mNumOfMBChar += 1

        still_detecting = self.get_state() == constants.eDetecting
        if still_detecting and self.get_confidence() > constants.SHORTCUT_THRESHOLD:
            self._mState = constants.eFoundIt

        return self.get_state()

    def get_confidence(self):
        """Return the confidence derived from the multi-byte character count.

        With six or more counted characters the result is 0.99.  With fewer,
        it is 1.0 - 0.99 * ONE_CHAR_PROB ** count, so zero counted characters
        yield 1.0 - 0.99.
        """
        counted = self._mNumOfMBChar
        if counted >= 6:
            return 0.99

        unlike = 0.99
        for _ in range(counted):
            unlike *= ONE_CHAR_PROB
        return 1.0 - unlike