|
1 ######################## BEGIN LICENSE BLOCK ######################## |
|
2 # The Original Code is Mozilla Universal charset detector code. |
|
3 # |
|
4 # The Initial Developer of the Original Code is |
|
5 # Netscape Communications Corporation. |
|
6 # Portions created by the Initial Developer are Copyright (C) 2001 |
|
7 # the Initial Developer. All Rights Reserved. |
|
8 # |
|
9 # Contributor(s): |
|
10 # Mark Pilgrim - port to Python |
|
11 # Shy Shalom - original C code |
|
12 # |
|
13 # This library is free software; you can redistribute it and/or |
|
14 # modify it under the terms of the GNU Lesser General Public |
|
15 # License as published by the Free Software Foundation; either |
|
16 # version 2.1 of the License, or (at your option) any later version. |
|
17 # |
|
18 # This library is distributed in the hope that it will be useful, |
|
19 # but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
20 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|
21 # Lesser General Public License for more details. |
|
22 # |
|
23 # You should have received a copy of the GNU Lesser General Public |
|
24 # License along with this library; if not, write to the Free Software |
|
25 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA |
|
26 # 02110-1301 USA |
|
27 ######################### END LICENSE BLOCK ######################### |
|
28 |
|
29 import constants, sys |
|
30 from charsetprober import CharSetProber |
|
31 |
|
# Orders below SAMPLE_SIZE are inside the sampling range; the value is also
# the row stride used to index the model's flat precedence matrix.
SAMPLE_SIZE = 64

# The shortcut thresholds are only consulted once more than this many
# sequences have been counted.
SB_ENOUGH_REL_THRESHOLD = 1024

# Confidence above the first value flags the charset as found; confidence
# below the second value rules it out.
POSITIVE_SHORTCUT_THRESHOLD = 0.95
NEGATIVE_SHORTCUT_THRESHOLD = 0.05

# Only characters whose order is below this value count towards the total.
SYMBOL_CAT_ORDER = 250

# Number of sequence categories tallied; the last one is the positive one.
NUMBER_OF_SEQ_CAT = 4
POSITIVE_CAT = NUMBER_OF_SEQ_CAT - 1
# NEGATIVE_CAT = 0
|
40 |
|
class SingleByteCharSetProber(CharSetProber):
    """Prober that scores a buffer against one single-byte charset model.

    Every input byte is mapped to an "order" through the model's
    ``charToOrderMap``.  Consecutive pairs of orders that both fall inside
    the sampling range are looked up in the model's ``precedenceMatrix`` and
    tallied per sequence category; the confidence is derived from the share
    of pairs that land in the positive category.
    """

    def __init__(self, model, reversed=False, nameProber=None):
        """Create a prober for *model*.

        model      -- mapping read through the keys 'charsetName',
                      'keepEnglishLetter', 'charToOrderMap',
                      'precedenceMatrix' and 'mTypicalPositiveRatio'.
        reversed   -- true if every pair must be swapped in the model lookup.
        nameProber -- optional auxiliary prober whose get_charset_name()
                      result is reported instead of the model's own name.
        """
        # The default used to be spelled ``constants.False``, which is a
        # syntax error on Python 3 where ``False`` is a keyword.
        # NOTE(review): assumes that alias was the plain falsy builtin --
        # confirm against the constants module.
        CharSetProber.__init__(self)
        self._mModel = model
        self._mReversed = reversed
        self._mNameProber = nameProber
        self.reset()

    def reset(self):
        """Reset the base prober and clear every statistic gathered so far."""
        CharSetProber.reset(self)
        # 255 is not below SAMPLE_SIZE, so the first character fed after a
        # reset never completes a pair.
        self._mLastOrder = 255
        self._mSeqCounters = [0] * NUMBER_OF_SEQ_CAT
        self._mTotalSeqs = 0
        self._mTotalChar = 0
        # Characters that fall in our sampling range.
        self._mFreqChar = 0

    def get_charset_name(self):
        """Return the auxiliary prober's charset name, else the model's."""
        if self._mNameProber:
            return self._mNameProber.get_charset_name()
        return self._mModel['charsetName']

    def feed(self, aBuf):
        """Consume *aBuf*, update the statistics and return the prober state.

        aBuf may be a Python 2 ``str`` (iterates as 1-character strings) or a
        ``bytes``/``bytearray`` object that iterates as ints.  Unless the
        model sets 'keepEnglishLetter', the buffer is first passed through
        ``filter_without_english_letters``.

        Once more than SB_ENOUGH_REL_THRESHOLD sequences have been counted
        and the prober is still detecting, a confidence above
        POSITIVE_SHORTCUT_THRESHOLD switches the state to ``eFoundIt`` and a
        confidence below NEGATIVE_SHORTCUT_THRESHOLD switches it to
        ``eNotMe``.
        """
        model = self._mModel
        if not model['keepEnglishLetter']:
            aBuf = self.filter_without_english_letters(aBuf)
        if not aBuf:
            return self.get_state()

        # Hoisted out of the per-byte loop; none of these change while
        # feeding.
        char_to_order = model['charToOrderMap']
        precedence = model['precedenceMatrix']
        swap_pair = self._mReversed

        # NOTE(review): the nesting below was reconstructed from a listing
        # that had lost its indentation -- confirm against the upstream file.
        for c in aBuf:
            # Iterating bytes on Python 3 already yields ints; calling ord()
            # on them would raise TypeError.
            if not isinstance(c, int):
                c = ord(c)
            order = char_to_order[c]
            if order < SYMBOL_CAT_ORDER:
                self._mTotalChar += 1
            if order < SAMPLE_SIZE:
                self._mFreqChar += 1
                if self._mLastOrder < SAMPLE_SIZE:
                    self._mTotalSeqs += 1
                    if swap_pair:
                        # Reverse the order of the letters in the lookup.
                        index = (order * SAMPLE_SIZE) + self._mLastOrder
                    else:
                        index = (self._mLastOrder * SAMPLE_SIZE) + order
                    self._mSeqCounters[precedence[index]] += 1
            self._mLastOrder = order

        if (self.get_state() == constants.eDetecting
                and self._mTotalSeqs > SB_ENOUGH_REL_THRESHOLD):
            cf = self.get_confidence()
            if cf > POSITIVE_SHORTCUT_THRESHOLD:
                if constants._debug:
                    sys.stderr.write('%s confidence = %s, we have a winner\n' % (model['charsetName'], cf))
                self._mState = constants.eFoundIt
            elif cf < NEGATIVE_SHORTCUT_THRESHOLD:
                if constants._debug:
                    sys.stderr.write('%s confidence = %s, below negative shortcut threshhold %s\n' % (model['charsetName'], cf, NEGATIVE_SHORTCUT_THRESHOLD))
                self._mState = constants.eNotMe

        return self.get_state()

    def get_confidence(self):
        """Return the current confidence for this charset.

        The result is 0.01 while no sequence has been counted.  Otherwise it
        is the positive-sequence ratio divided by the model's
        'mTypicalPositiveRatio', scaled by the share of counted characters
        that fell inside the sampling range; results of 1.0 or more are
        reported as 0.99.
        """
        if self._mTotalSeqs <= 0:
            return 0.01

        # The 1.0 factor forces float division on Python 2 as well.
        positive_ratio = (1.0 * self._mSeqCounters[POSITIVE_CAT]) / self._mTotalSeqs
        confidence = positive_ratio / self._mModel['mTypicalPositiveRatio']
        # A sequence is only counted for a character that was also counted
        # in _mTotalChar, so the divisor is non-zero here.
        confidence = confidence * self._mFreqChar / self._mTotalChar
        if confidence >= 1.0:
            confidence = 0.99
        return confidence