ThirdParty/CharDet/chardet/hebrewprober.py

changeset 5714
90c57b50600f
parent 3537
7662053c3906
diff -r 6762afd9f963 -r 90c57b50600f ThirdParty/CharDet/chardet/hebrewprober.py
--- a/ThirdParty/CharDet/chardet/hebrewprober.py	Tue Apr 25 18:36:38 2017 +0200
+++ b/ThirdParty/CharDet/chardet/hebrewprober.py	Tue Apr 25 18:40:46 2017 +0200
@@ -26,8 +26,7 @@
 ######################### END LICENSE BLOCK #########################
 
 from .charsetprober import CharSetProber
-from .constants import eNotMe, eDetecting
-from .compat import wrap_ord
+from .enums import ProbingState
 
 # This prober doesn't actually recognize a language or a charset.
 # It is a helper prober for the use of the Hebrew model probers
@@ -126,56 +125,59 @@
 # model probers scores. The answer is returned in the form of the name of the
 # charset identified, either "windows-1255" or "ISO-8859-8".
 
-# windows-1255 / ISO-8859-8 code points of interest
-FINAL_KAF = 0xea
-NORMAL_KAF = 0xeb
-FINAL_MEM = 0xed
-NORMAL_MEM = 0xee
-FINAL_NUN = 0xef
-NORMAL_NUN = 0xf0
-FINAL_PE = 0xf3
-NORMAL_PE = 0xf4
-FINAL_TSADI = 0xf5
-NORMAL_TSADI = 0xf6
+class HebrewProber(CharSetProber):
+    # windows-1255 / ISO-8859-8 code points of interest
+    FINAL_KAF = 0xea
+    NORMAL_KAF = 0xeb
+    FINAL_MEM = 0xed
+    NORMAL_MEM = 0xee
+    FINAL_NUN = 0xef
+    NORMAL_NUN = 0xf0
+    FINAL_PE = 0xf3
+    NORMAL_PE = 0xf4
+    FINAL_TSADI = 0xf5
+    NORMAL_TSADI = 0xf6
 
-# Minimum Visual vs Logical final letter score difference.
-# If the difference is below this, don't rely solely on the final letter score
-# distance.
-MIN_FINAL_CHAR_DISTANCE = 5
+    # Minimum Visual vs Logical final letter score difference.
+    # If the difference is below this, don't rely solely on the final letter score
+    # distance.
+    MIN_FINAL_CHAR_DISTANCE = 5
 
-# Minimum Visual vs Logical model score difference.
-# If the difference is below this, don't rely at all on the model score
-# distance.
-MIN_MODEL_DISTANCE = 0.01
+    # Minimum Visual vs Logical model score difference.
+    # If the difference is below this, don't rely at all on the model score
+    # distance.
+    MIN_MODEL_DISTANCE = 0.01
 
-VISUAL_HEBREW_NAME = "ISO-8859-8"
-LOGICAL_HEBREW_NAME = "windows-1255"
-
+    VISUAL_HEBREW_NAME = "ISO-8859-8"
+    LOGICAL_HEBREW_NAME = "windows-1255"
 
-class HebrewProber(CharSetProber):
     def __init__(self):
-        CharSetProber.__init__(self)
-        self._mLogicalProber = None
-        self._mVisualProber = None
+        super(HebrewProber, self).__init__()
+        self._final_char_logical_score = None
+        self._final_char_visual_score = None
+        self._prev = None
+        self._before_prev = None
+        self._logical_prober = None
+        self._visual_prober = None
         self.reset()
 
     def reset(self):
-        self._mFinalCharLogicalScore = 0
-        self._mFinalCharVisualScore = 0
+        self._final_char_logical_score = 0
+        self._final_char_visual_score = 0
         # The two last characters seen in the previous buffer,
         # _prev and _before_prev, are initialized to space in order to
         # simulate a word delimiter at the beginning of the data
-        self._mPrev = ' '
-        self._mBeforePrev = ' '
+        self._prev = ' '
+        self._before_prev = ' '
         # These probers are owned by the group prober.
 
     def set_model_probers(self, logicalProber, visualProber):
-        self._mLogicalProber = logicalProber
-        self._mVisualProber = visualProber
+        self._logical_prober = logicalProber
+        self._visual_prober = visualProber
 
     def is_final(self, c):
-        return wrap_ord(c) in [FINAL_KAF, FINAL_MEM, FINAL_NUN, FINAL_PE,
-                               FINAL_TSADI]
+        return c in [self.FINAL_KAF, self.FINAL_MEM, self.FINAL_NUN,
+                     self.FINAL_PE, self.FINAL_TSADI]
 
     def is_non_final(self, c):
         # The normal Tsadi is not a good Non-Final letter due to words like
@@ -188,9 +190,10 @@
         # for example legally end with a Non-Final Pe or Kaf. However, the
         # benefit of these letters as Non-Final letters outweighs the damage
         # since these words are quite rare.
-        return wrap_ord(c) in [NORMAL_KAF, NORMAL_MEM, NORMAL_NUN, NORMAL_PE]
+        return c in [self.NORMAL_KAF, self.NORMAL_MEM,
+                     self.NORMAL_NUN, self.NORMAL_PE]
 
-    def feed(self, aBuf):
+    def feed(self, byte_str):
         # Final letter analysis for logical-visual decision.
         # Look for evidence that the received buffer is either logical Hebrew
         # or visual Hebrew.
@@ -217,67 +220,73 @@
         # We automatically filter out all 7-bit characters (replace them with
         # spaces) so the word boundary detection works properly. [MAP]
 
-        if self.get_state() == eNotMe:
+        if self.state == ProbingState.NOT_ME:
             # Both model probers say it's not them. No reason to continue.
-            return eNotMe
+            return ProbingState.NOT_ME
 
-        aBuf = self.filter_high_bit_only(aBuf)
+        byte_str = self.filter_high_byte_only(byte_str)
 
-        for cur in aBuf:
+        for cur in byte_str:
             if cur == ' ':
                 # We stand on a space - a word just ended
-                if self._mBeforePrev != ' ':
-                    # next-to-last char was not a space so self._mPrev is not a
+                if self._before_prev != ' ':
+                    # next-to-last char was not a space so self._prev is not a
                     # 1 letter word
-                    if self.is_final(self._mPrev):
+                    if self.is_final(self._prev):
                         # case (1) [-2:not space][-1:final letter][cur:space]
-                        self._mFinalCharLogicalScore += 1
-                    elif self.is_non_final(self._mPrev):
+                        self._final_char_logical_score += 1
+                    elif self.is_non_final(self._prev):
                         # case (2) [-2:not space][-1:Non-Final letter][
                         #  cur:space]
-                        self._mFinalCharVisualScore += 1
+                        self._final_char_visual_score += 1
             else:
                 # Not standing on a space
-                if ((self._mBeforePrev == ' ') and
-                        (self.is_final(self._mPrev)) and (cur != ' ')):
+                if ((self._before_prev == ' ') and
+                        (self.is_final(self._prev)) and (cur != ' ')):
                     # case (3) [-2:space][-1:final letter][cur:not space]
-                    self._mFinalCharVisualScore += 1
-            self._mBeforePrev = self._mPrev
-            self._mPrev = cur
+                    self._final_char_visual_score += 1
+            self._before_prev = self._prev
+            self._prev = cur
 
         # Forever detecting, till the end or until both model probers return
-        # eNotMe (handled above)
-        return eDetecting
+        # ProbingState.NOT_ME (handled above)
+        return ProbingState.DETECTING
 
-    def get_charset_name(self):
+    @property
+    def charset_name(self):
         # Make the decision: is it Logical or Visual?
         # If the final letter score distance is dominant enough, rely on it.
-        finalsub = self._mFinalCharLogicalScore - self._mFinalCharVisualScore
-        if finalsub >= MIN_FINAL_CHAR_DISTANCE:
-            return LOGICAL_HEBREW_NAME
-        if finalsub <= -MIN_FINAL_CHAR_DISTANCE:
-            return VISUAL_HEBREW_NAME
+        finalsub = self._final_char_logical_score - self._final_char_visual_score
+        if finalsub >= self.MIN_FINAL_CHAR_DISTANCE:
+            return self.LOGICAL_HEBREW_NAME
+        if finalsub <= -self.MIN_FINAL_CHAR_DISTANCE:
+            return self.VISUAL_HEBREW_NAME
 
         # It's not dominant enough, try to rely on the model scores instead.
-        modelsub = (self._mLogicalProber.get_confidence()
-                    - self._mVisualProber.get_confidence())
-        if modelsub > MIN_MODEL_DISTANCE:
-            return LOGICAL_HEBREW_NAME
-        if modelsub < -MIN_MODEL_DISTANCE:
-            return VISUAL_HEBREW_NAME
+        modelsub = (self._logical_prober.get_confidence()
+                    - self._visual_prober.get_confidence())
+        if modelsub > self.MIN_MODEL_DISTANCE:
+            return self.LOGICAL_HEBREW_NAME
+        if modelsub < -self.MIN_MODEL_DISTANCE:
+            return self.VISUAL_HEBREW_NAME
 
         # Still no good, back to final letter distance, maybe it'll save the
         # day.
         if finalsub < 0.0:
-            return VISUAL_HEBREW_NAME
+            return self.VISUAL_HEBREW_NAME
 
         # (finalsub > 0 - Logical) or (don't know what to do) default to
         # Logical.
-        return LOGICAL_HEBREW_NAME
+        return self.LOGICAL_HEBREW_NAME
 
-    def get_state(self):
+    @property
+    def language(self):
+        return 'Hebrew'
+
+    @property
+    def state(self):
         # Remain active as long as any of the model probers are active.
-        if (self._mLogicalProber.get_state() == eNotMe) and \
-           (self._mVisualProber.get_state() == eNotMe):
-            return eNotMe
-        return eDetecting
+        if (self._logical_prober.state == ProbingState.NOT_ME) and \
+           (self._visual_prober.state == ProbingState.NOT_ME):
+            return ProbingState.NOT_ME
+        return ProbingState.DETECTING

eric ide

mercurial