ThirdParty/CharDet/chardet/chardistribution.py

changeset 12
1d8dd9706f46
parent 0
de9c2efb9d02
child 3537
7662053c3906
equal deleted inserted replaced
11:b0996e4a289e 12:1d8dd9706f46
23 # License along with this library; if not, write to the Free Software 23 # License along with this library; if not, write to the Free Software
24 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 24 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
25 # 02110-1301 USA 25 # 02110-1301 USA
26 ######################### END LICENSE BLOCK ######################### 26 ######################### END LICENSE BLOCK #########################
27 27
28 import constants 28 from . import constants
29 from euctwfreq import EUCTWCharToFreqOrder, EUCTW_TABLE_SIZE, EUCTW_TYPICAL_DISTRIBUTION_RATIO 29 from .euctwfreq import EUCTWCharToFreqOrder, EUCTW_TABLE_SIZE, EUCTW_TYPICAL_DISTRIBUTION_RATIO
30 from euckrfreq import EUCKRCharToFreqOrder, EUCKR_TABLE_SIZE, EUCKR_TYPICAL_DISTRIBUTION_RATIO 30 from .euckrfreq import EUCKRCharToFreqOrder, EUCKR_TABLE_SIZE, EUCKR_TYPICAL_DISTRIBUTION_RATIO
31 from gb2312freq import GB2312CharToFreqOrder, GB2312_TABLE_SIZE, GB2312_TYPICAL_DISTRIBUTION_RATIO 31 from .gb2312freq import GB2312CharToFreqOrder, GB2312_TABLE_SIZE, GB2312_TYPICAL_DISTRIBUTION_RATIO
32 from big5freq import Big5CharToFreqOrder, BIG5_TABLE_SIZE, BIG5_TYPICAL_DISTRIBUTION_RATIO 32 from .big5freq import Big5CharToFreqOrder, BIG5_TABLE_SIZE, BIG5_TYPICAL_DISTRIBUTION_RATIO
33 from jisfreq import JISCharToFreqOrder, JIS_TABLE_SIZE, JIS_TYPICAL_DISTRIBUTION_RATIO 33 from .jisfreq import JISCharToFreqOrder, JIS_TABLE_SIZE, JIS_TYPICAL_DISTRIBUTION_RATIO
34 34
35 ENOUGH_DATA_THRESHOLD = 1024 35 ENOUGH_DATA_THRESHOLD = 1024
36 SURE_YES = 0.99 36 SURE_YES = 0.99
37 SURE_NO = 0.01 37 SURE_NO = 0.01
38 38
43 self._mTypicalDistributionRatio = None # This is a constant value which varies from language to language, used in calculating confidence. See http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html for further detail. 43 self._mTypicalDistributionRatio = None # This is a constant value which varies from language to language, used in calculating confidence. See http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html for further detail.
44 self.reset() 44 self.reset()
45 45
46 def reset(self): 46 def reset(self):
47 """reset analyser, clear any state""" 47 """reset analyser, clear any state"""
48 self._mDone = constants.False # If this flag is set to constants.True, detection is done and conclusion has been made 48 self._mDone = False # If this flag is set to True, detection is done and conclusion has been made
49 self._mTotalChars = 0 # Total characters encountered 49 self._mTotalChars = 0 # Total characters encountered
50 self._mFreqChars = 0 # The number of characters whose frequency order is less than 512 50 self._mFreqChars = 0 # The number of characters whose frequency order is less than 512
51 51
52 def feed(self, aStr, aCharLen): 52 def feed(self, aBuf, aCharLen):
53 """feed a character with known length""" 53 """feed a character with known length"""
54 if aCharLen == 2: 54 if aCharLen == 2:
55 # we only care about 2-bytes character in our distribution analysis 55 # we only care about 2-bytes character in our distribution analysis
56 order = self.get_order(aStr) 56 order = self.get_order(aBuf)
57 else: 57 else:
58 order = -1 58 order = -1
59 if order >= 0: 59 if order >= 0:
60 self._mTotalChars += 1 60 self._mTotalChars += 1
61 # order is valid 61 # order is valid
80 def got_enough_data(self): 80 def got_enough_data(self):
81 # It is not necessary to receive all data to draw a conclusion. For charset detection, 81 # It is not necessary to receive all data to draw a conclusion. For charset detection,
82 # a certain amount of data is enough 82 # a certain amount of data is enough
83 return self._mTotalChars > ENOUGH_DATA_THRESHOLD 83 return self._mTotalChars > ENOUGH_DATA_THRESHOLD
84 84
85 def get_order(self, aStr): 85 def get_order(self, aBuf):
86 # We do not handle characters based on the original encoding string, but 86 # We do not handle characters based on the original encoding string, but
87 # convert this encoding string to a number, here called order. 87 # convert this encoding string to a number, here called order.
88 # This allows multiple encodings of a language to share one frequency table. 88 # This allows multiple encodings of a language to share one frequency table.
89 return -1 89 return -1
90 90
93 CharDistributionAnalysis.__init__(self) 93 CharDistributionAnalysis.__init__(self)
94 self._mCharToFreqOrder = EUCTWCharToFreqOrder 94 self._mCharToFreqOrder = EUCTWCharToFreqOrder
95 self._mTableSize = EUCTW_TABLE_SIZE 95 self._mTableSize = EUCTW_TABLE_SIZE
96 self._mTypicalDistributionRatio = EUCTW_TYPICAL_DISTRIBUTION_RATIO 96 self._mTypicalDistributionRatio = EUCTW_TYPICAL_DISTRIBUTION_RATIO
97 97
98 def get_order(self, aStr): 98 def get_order(self, aBuf):
99 # for euc-TW encoding, we are interested 99 # for euc-TW encoding, we are interested
100 # first byte range: 0xc4 -- 0xfe 100 # first byte range: 0xc4 -- 0xfe
101 # second byte range: 0xa1 -- 0xfe 101 # second byte range: 0xa1 -- 0xfe
102 # no validation needed here. State machine has done that 102 # no validation needed here. State machine has done that
103 if aStr[0] >= '\xC4': 103 if aBuf[0] >= 0xC4:
104 return 94 * (ord(aStr[0]) - 0xC4) + ord(aStr[1]) - 0xA1 104 return 94 * (aBuf[0] - 0xC4) + aBuf[1] - 0xA1
105 else: 105 else:
106 return -1 106 return -1
107 107
108 class EUCKRDistributionAnalysis(CharDistributionAnalysis): 108 class EUCKRDistributionAnalysis(CharDistributionAnalysis):
109 def __init__(self): 109 def __init__(self):
110 CharDistributionAnalysis.__init__(self) 110 CharDistributionAnalysis.__init__(self)
111 self._mCharToFreqOrder = EUCKRCharToFreqOrder 111 self._mCharToFreqOrder = EUCKRCharToFreqOrder
112 self._mTableSize = EUCKR_TABLE_SIZE 112 self._mTableSize = EUCKR_TABLE_SIZE
113 self._mTypicalDistributionRatio = EUCKR_TYPICAL_DISTRIBUTION_RATIO 113 self._mTypicalDistributionRatio = EUCKR_TYPICAL_DISTRIBUTION_RATIO
114 114
115 def get_order(self, aStr): 115 def get_order(self, aBuf):
116 # for euc-KR encoding, we are interested 116 # for euc-KR encoding, we are interested
117 # first byte range: 0xb0 -- 0xfe 117 # first byte range: 0xb0 -- 0xfe
118 # second byte range: 0xa1 -- 0xfe 118 # second byte range: 0xa1 -- 0xfe
119 # no validation needed here. State machine has done that 119 # no validation needed here. State machine has done that
120 if aStr[0] >= '\xB0': 120 if aBuf[0] >= 0xB0:
121 return 94 * (ord(aStr[0]) - 0xB0) + ord(aStr[1]) - 0xA1 121 return 94 * (aBuf[0] - 0xB0) + aBuf[1] - 0xA1
122 else: 122 else:
123 return -1; 123 return -1;
124 124
125 class GB2312DistributionAnalysis(CharDistributionAnalysis): 125 class GB2312DistributionAnalysis(CharDistributionAnalysis):
126 def __init__(self): 126 def __init__(self):
127 CharDistributionAnalysis.__init__(self) 127 CharDistributionAnalysis.__init__(self)
128 self._mCharToFreqOrder = GB2312CharToFreqOrder 128 self._mCharToFreqOrder = GB2312CharToFreqOrder
129 self._mTableSize = GB2312_TABLE_SIZE 129 self._mTableSize = GB2312_TABLE_SIZE
130 self._mTypicalDistributionRatio = GB2312_TYPICAL_DISTRIBUTION_RATIO 130 self._mTypicalDistributionRatio = GB2312_TYPICAL_DISTRIBUTION_RATIO
131 131
132 def get_order(self, aStr): 132 def get_order(self, aBuf):
133 # for GB2312 encoding, we are interested 133 # for GB2312 encoding, we are interested
134 # first byte range: 0xb0 -- 0xfe 134 # first byte range: 0xb0 -- 0xfe
135 # second byte range: 0xa1 -- 0xfe 135 # second byte range: 0xa1 -- 0xfe
136 # no validation needed here. State machine has done that 136 # no validation needed here. State machine has done that
137 if (aStr[0] >= '\xB0') and (aStr[1] >= '\xA1'): 137 if (aBuf[0] >= 0xB0) and (aBuf[1] >= 0xA1):
138 return 94 * (ord(aStr[0]) - 0xB0) + ord(aStr[1]) - 0xA1 138 return 94 * (aBuf[0] - 0xB0) + aBuf[1] - 0xA1
139 else: 139 else:
140 return -1; 140 return -1;
141 141
142 class Big5DistributionAnalysis(CharDistributionAnalysis): 142 class Big5DistributionAnalysis(CharDistributionAnalysis):
143 def __init__(self): 143 def __init__(self):
144 CharDistributionAnalysis.__init__(self) 144 CharDistributionAnalysis.__init__(self)
145 self._mCharToFreqOrder = Big5CharToFreqOrder 145 self._mCharToFreqOrder = Big5CharToFreqOrder
146 self._mTableSize = BIG5_TABLE_SIZE 146 self._mTableSize = BIG5_TABLE_SIZE
147 self._mTypicalDistributionRatio = BIG5_TYPICAL_DISTRIBUTION_RATIO 147 self._mTypicalDistributionRatio = BIG5_TYPICAL_DISTRIBUTION_RATIO
148 148
149 def get_order(self, aStr): 149 def get_order(self, aBuf):
150 # for big5 encoding, we are interested 150 # for big5 encoding, we are interested
151 # first byte range: 0xa4 -- 0xfe 151 # first byte range: 0xa4 -- 0xfe
152 # second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe 152 # second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe
153 # no validation needed here. State machine has done that 153 # no validation needed here. State machine has done that
154 if aStr[0] >= '\xA4': 154 if aBuf[0] >= 0xA4:
155 if aStr[1] >= '\xA1': 155 if aBuf[1] >= 0xA1:
156 return 157 * (ord(aStr[0]) - 0xA4) + ord(aStr[1]) - 0xA1 + 63 156 return 157 * (aBuf[0] - 0xA4) + aBuf[1] - 0xA1 + 63
157 else: 157 else:
158 return 157 * (ord(aStr[0]) - 0xA4) + ord(aStr[1]) - 0x40 158 return 157 * (aBuf[0] - 0xA4) + aBuf[1] - 0x40
159 else: 159 else:
160 return -1 160 return -1
161 161
162 class SJISDistributionAnalysis(CharDistributionAnalysis): 162 class SJISDistributionAnalysis(CharDistributionAnalysis):
163 def __init__(self): 163 def __init__(self):
164 CharDistributionAnalysis.__init__(self) 164 CharDistributionAnalysis.__init__(self)
165 self._mCharToFreqOrder = JISCharToFreqOrder 165 self._mCharToFreqOrder = JISCharToFreqOrder
166 self._mTableSize = JIS_TABLE_SIZE 166 self._mTableSize = JIS_TABLE_SIZE
167 self._mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO 167 self._mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO
168 168
169 def get_order(self, aStr): 169 def get_order(self, aBuf):
170 # for sjis encoding, we are interested 170 # for sjis encoding, we are interested
171 # first byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe 171 # first byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe
172 # second byte range: 0x40 -- 0x7e, 0x81 -- 0xfe 172 # second byte range: 0x40 -- 0x7e, 0x81 -- 0xfe
173 # no validation needed here. State machine has done that 173 # no validation needed here. State machine has done that
174 if (aStr[0] >= '\x81') and (aStr[0] <= '\x9F'): 174 if (aBuf[0] >= 0x81) and (aBuf[0] <= 0x9F):
175 order = 188 * (ord(aStr[0]) - 0x81) 175 order = 188 * (aBuf[0] - 0x81)
176 elif (aStr[0] >= '\xE0') and (aStr[0] <= '\xEF'): 176 elif (aBuf[0] >= 0xE0) and (aBuf[0] <= 0xEF):
177 order = 188 * (ord(aStr[0]) - 0xE0 + 31) 177 order = 188 * (aBuf[0] - 0xE0 + 31)
178 else: 178 else:
179 return -1; 179 return -1;
180 order = order + ord(aStr[1]) - 0x40 180 order = order + aBuf[1] - 0x40
181 if aStr[1] > '\x7F': 181 if aBuf[1] > 0x7F:
182 order =- 1 182 order =- 1
183 return order 183 return order
184 184
185 class EUCJPDistributionAnalysis(CharDistributionAnalysis): 185 class EUCJPDistributionAnalysis(CharDistributionAnalysis):
186 def __init__(self): 186 def __init__(self):
187 CharDistributionAnalysis.__init__(self) 187 CharDistributionAnalysis.__init__(self)
188 self._mCharToFreqOrder = JISCharToFreqOrder 188 self._mCharToFreqOrder = JISCharToFreqOrder
189 self._mTableSize = JIS_TABLE_SIZE 189 self._mTableSize = JIS_TABLE_SIZE
190 self._mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO 190 self._mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO
191 191
192 def get_order(self, aStr): 192 def get_order(self, aBuf):
193 # for euc-JP encoding, we are interested 193 # for euc-JP encoding, we are interested
194 # first byte range: 0xa0 -- 0xfe 194 # first byte range: 0xa0 -- 0xfe
195 # second byte range: 0xa1 -- 0xfe 195 # second byte range: 0xa1 -- 0xfe
196 # no validation needed here. State machine has done that 196 # no validation needed here. State machine has done that
197 if aStr[0] >= '\xA0': 197 if aBuf[0] >= 0xA0:
198 return 94 * (ord(aStr[0]) - 0xA1) + ord(aStr[1]) - 0xa1 198 return 94 * (aBuf[0] - 0xA1) + aBuf[1] - 0xa1
199 else: 199 else:
200 return -1 200 return -1

eric ide

mercurial