|
1 ######################## BEGIN LICENSE BLOCK ######################## |
|
2 # The Original Code is Mozilla Universal charset detector code. |
|
3 # |
|
4 # The Initial Developer of the Original Code is |
|
5 # Netscape Communications Corporation. |
|
6 # Portions created by the Initial Developer are Copyright (C) 2001 |
|
7 # the Initial Developer. All Rights Reserved. |
|
8 # |
|
9 # Contributor(s): |
|
10 # Mark Pilgrim - port to Python |
|
11 # Shy Shalom - original C code |
|
12 # Proofpoint, Inc. |
|
13 # |
|
14 # This library is free software; you can redistribute it and/or |
|
15 # modify it under the terms of the GNU Lesser General Public |
|
16 # License as published by the Free Software Foundation; either |
|
17 # version 2.1 of the License, or (at your option) any later version. |
|
18 # |
|
19 # This library is distributed in the hope that it will be useful, |
|
20 # but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
21 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|
22 # Lesser General Public License for more details. |
|
23 # |
|
24 # You should have received a copy of the GNU Lesser General Public |
|
25 # License along with this library; if not, write to the Free Software |
|
26 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA |
|
27 # 02110-1301 USA |
|
28 ######################### END LICENSE BLOCK ######################### |
|
29 |
|
30 import constants, sys |
|
31 from constants import eStart, eError, eItsMe |
|
32 from charsetprober import CharSetProber |
|
33 |
|
class MultiByteCharSetProber(CharSetProber):
    """Prober that pairs a coding state machine with a distribution analyzer.

    ``__init__`` leaves both ``_mCodingSM`` and ``_mDistributionAnalyzer`` set
    to None, while ``feed()`` and ``get_confidence()`` use them
    unconditionally, so real objects must be assigned before those methods
    are called (presumably by subclasses -- confirm against the concrete
    probers).
    """

    def __init__(self):
        """Initialise the base prober and the empty per-prober state."""
        CharSetProber.__init__(self)
        self._mDistributionAnalyzer = None
        self._mCodingSM = None
        # Two-slot window: [0] holds the final byte of the previous feed()
        # call, [1] is filled with the first byte of the current buffer.
        self._mLastChar = ['\x00', '\x00']

    def reset(self):
        """Reset the base prober, both helpers (when set) and the window."""
        CharSetProber.reset(self)
        if self._mCodingSM:
            self._mCodingSM.reset()
        if self._mDistributionAnalyzer:
            self._mDistributionAnalyzer.reset()
        self._mLastChar = ['\x00', '\x00']

    def get_charset_name(self):
        """Return None here; presumably overridden by concrete probers."""
        pass

    def feed(self, aBuf):
        """Run ``aBuf`` through the state machine and distribution analyzer.

        Each element of ``aBuf`` is passed to the coding state machine:

        * ``eError`` sets the prober state to ``constants.eNotMe`` and stops;
        * ``eItsMe`` sets the prober state to ``constants.eFoundIt`` and stops;
        * ``eStart`` passes a two-element window ending at the current
          position, together with the state machine's current character
          length, to the distribution analyzer.

        After the loop the last element of ``aBuf`` is remembered for the
        next call.  An empty ``aBuf`` leaves the remembered byte untouched
        instead of raising ``IndexError``.

        Returns the value of ``self.get_state()``.
        """
        aLen = len(aBuf)
        for i in range(aLen):
            codingState = self._mCodingSM.next_state(aBuf[i])
            if codingState == eError:
                if constants._debug:
                    sys.stderr.write(self.get_charset_name() + ' prober hit error at byte ' + str(i) + '\n')
                self._mState = constants.eNotMe
                break
            elif codingState == eItsMe:
                self._mState = constants.eFoundIt
                break
            elif codingState == eStart:
                charLen = self._mCodingSM.get_current_charlen()
                if i == 0:
                    # The window straddles two buffers: slot 0 still holds
                    # the last byte of the previous call.
                    self._mLastChar[1] = aBuf[0]
                    self._mDistributionAnalyzer.feed(self._mLastChar, charLen)
                else:
                    self._mDistributionAnalyzer.feed(aBuf[i-1:i+1], charLen)

        # Guard against an empty buffer: aBuf[aLen - 1] would be aBuf[-1],
        # which raises IndexError on an empty sequence.
        if aLen:
            self._mLastChar[0] = aBuf[aLen - 1]

        if self.get_state() == constants.eDetecting:
            # Shortcut: stop detecting once the analyzer has enough data and
            # the confidence exceeds the shortcut threshold.
            if self._mDistributionAnalyzer.got_enough_data() and \
               (self.get_confidence() > constants.SHORTCUT_THRESHOLD):
                self._mState = constants.eFoundIt

        return self.get_state()

    def get_confidence(self):
        """Return the confidence reported by the distribution analyzer."""
        return self._mDistributionAnalyzer.get_confidence()