ThirdParty/CharDet/chardet/jpcntx.py

changeset 5714
90c57b50600f
parent 5310
f2b774d78b4a
diff -r 6762afd9f963 -r 90c57b50600f ThirdParty/CharDet/chardet/jpcntx.py
--- a/ThirdParty/CharDet/chardet/jpcntx.py	Tue Apr 25 18:36:38 2017 +0200
+++ b/ThirdParty/CharDet/chardet/jpcntx.py	Tue Apr 25 18:40:46 2017 +0200
@@ -25,13 +25,6 @@
 # 02110-1301  USA
 ######################### END LICENSE BLOCK #########################
 
-from .compat import wrap_ord
-
-NUM_OF_CATEGORY = 6
-DONT_KNOW = -1
-ENOUGH_REL_THRESHOLD = 100
-MAX_REL_THRESHOLD = 1000
-MINIMUM_DATA_THRESHOLD = 4
 
 # This is hiragana 2-char sequence table, the number in each cell represents its frequency category
 jp2CharContext = (
@@ -120,24 +113,35 @@
 (0,4,0,3,0,3,0,3,0,3,5,5,3,3,3,3,4,3,4,3,3,3,4,4,4,3,3,3,3,4,3,5,3,3,1,3,2,4,5,5,5,5,4,3,4,5,5,3,2,2,3,3,3,3,2,3,3,1,2,3,2,4,3,3,3,4,0,4,0,2,0,4,3,2,2,1,2,0,3,0,0,4,1),
 )
 
-class JapaneseContextAnalysis:
+class JapaneseContextAnalysis(object):
+    NUM_OF_CATEGORY = 6
+    DONT_KNOW = -1
+    ENOUGH_REL_THRESHOLD = 100
+    MAX_REL_THRESHOLD = 1000
+    MINIMUM_DATA_THRESHOLD = 4
+
     def __init__(self):
+        self._total_rel = None
+        self._rel_sample = None
+        self._need_to_skip_char_num = None
+        self._last_char_order = None
+        self._done = None
         self.reset()
 
     def reset(self):
-        self._mTotalRel = 0  # total sequence received
-        # category counters, each interger counts sequence in its category
-        self._mRelSample = [0] * NUM_OF_CATEGORY
+        self._total_rel = 0  # total sequence received
+        # category counters, each integer counts sequence in its category
+        self._rel_sample = [0] * self.NUM_OF_CATEGORY
         # if last byte in current buffer is not the last byte of a character,
         # we need to know how many bytes to skip in next buffer
-        self._mNeedToSkipCharNum = 0
-        self._mLastCharOrder = -1  # The order of previous char
+        self._need_to_skip_char_num = 0
+        self._last_char_order = -1  # The order of previous char
         # If this flag is set to True, detection is done and conclusion has
         # been made
-        self._mDone = False
+        self._done = False
 
-    def feed(self, aBuf, aLen):
-        if self._mDone:
+    def feed(self, byte_str, num_bytes):
+        if self._done:
             return
 
         # The buffer we got is byte oriented, and a character may span in more than one
@@ -147,81 +151,83 @@
         # well and analyse the character once it is complete, but since a
         # character will not make much difference, by simply skipping
         # this character will simply our logic and improve performance.
-        i = self._mNeedToSkipCharNum
-        while i < aLen:
-            order, charLen = self.get_order(aBuf[i:i + 2])
-            i += charLen
-            if i > aLen:
-                self._mNeedToSkipCharNum = i - aLen
-                self._mLastCharOrder = -1
+        i = self._need_to_skip_char_num
+        while i < num_bytes:
+            order, char_len = self.get_order(byte_str[i:i + 2])
+            i += char_len
+            if i > num_bytes:
+                self._need_to_skip_char_num = i - num_bytes
+                self._last_char_order = -1
             else:
-                if (order != -1) and (self._mLastCharOrder != -1):
-                    self._mTotalRel += 1
-                    if self._mTotalRel > MAX_REL_THRESHOLD:
-                        self._mDone = True
+                if (order != -1) and (self._last_char_order != -1):
+                    self._total_rel += 1
+                    if self._total_rel > self.MAX_REL_THRESHOLD:
+                        self._done = True
                         break
-                    self._mRelSample[jp2CharContext[self._mLastCharOrder][order]] += 1
-                self._mLastCharOrder = order
+                    self._rel_sample[jp2CharContext[self._last_char_order][order]] += 1
+                self._last_char_order = order
 
     def got_enough_data(self):
-        return self._mTotalRel > ENOUGH_REL_THRESHOLD
+        return self._total_rel > self.ENOUGH_REL_THRESHOLD
 
     def get_confidence(self):
         # This is just one way to calculate confidence. It works well for me.
-        if self._mTotalRel > MINIMUM_DATA_THRESHOLD:
-            return (self._mTotalRel - self._mRelSample[0]) / self._mTotalRel
+        if self._total_rel > self.MINIMUM_DATA_THRESHOLD:
+            return (self._total_rel - self._rel_sample[0]) / self._total_rel
         else:
-            return DONT_KNOW
+            return self.DONT_KNOW
 
-    def get_order(self, aBuf):
+    def get_order(self, byte_str):
         return -1, 1
 
 class SJISContextAnalysis(JapaneseContextAnalysis):
     def __init__(self):
-        self.charset_name = "SHIFT_JIS"
+        super(SJISContextAnalysis, self).__init__()
+        self._charset_name = "SHIFT_JIS"
 
-    def get_charset_name(self):
-        return self.charset_name
+    @property
+    def charset_name(self):
+        return self._charset_name
 
-    def get_order(self, aBuf):
-        if not aBuf:
+    def get_order(self, byte_str):
+        if not byte_str:
             return -1, 1
         # find out current char's byte length
-        first_char = wrap_ord(aBuf[0])
-        if ((0x81 <= first_char <= 0x9F) or (0xE0 <= first_char <= 0xFC)):
-            charLen = 2
+        first_char = byte_str[0]
+        if (0x81 <= first_char <= 0x9F) or (0xE0 <= first_char <= 0xFC):
+            char_len = 2
             if (first_char == 0x87) or (0xFA <= first_char <= 0xFC):
-                self.charset_name = "CP932"
+                self._charset_name = "CP932"
         else:
-            charLen = 1
+            char_len = 1
 
         # return its order if it is hiragana
-        if len(aBuf) > 1:
-            second_char = wrap_ord(aBuf[1])
+        if len(byte_str) > 1:
+            second_char = byte_str[1]
             if (first_char == 202) and (0x9F <= second_char <= 0xF1):
-                return second_char - 0x9F, charLen
+                return second_char - 0x9F, char_len
 
-        return -1, charLen
+        return -1, char_len
 
 class EUCJPContextAnalysis(JapaneseContextAnalysis):
-    def get_order(self, aBuf):
-        if not aBuf:
+    def get_order(self, byte_str):
+        if not byte_str:
             return -1, 1
         # find out current char's byte length
-        first_char = wrap_ord(aBuf[0])
+        first_char = byte_str[0]
         if (first_char == 0x8E) or (0xA1 <= first_char <= 0xFE):
-            charLen = 2
+            char_len = 2
         elif first_char == 0x8F:
-            charLen = 3
+            char_len = 3
         else:
-            charLen = 1
+            char_len = 1
 
         # return its order if it is hiragana
-        if len(aBuf) > 1:
-            second_char = wrap_ord(aBuf[1])
+        if len(byte_str) > 1:
+            second_char = byte_str[1]
             if (first_char == 0xA4) and (0xA1 <= second_char <= 0xF3):
-                return second_char - 0xA1, charLen
+                return second_char - 0xA1, char_len
 
-        return -1, charLen
+        return -1, char_len
 
-# flake8: noqa
+

eric ide

mercurial