diff -r 6762afd9f963 -r 90c57b50600f ThirdParty/CharDet/chardet/hebrewprober.py --- a/ThirdParty/CharDet/chardet/hebrewprober.py Tue Apr 25 18:36:38 2017 +0200 +++ b/ThirdParty/CharDet/chardet/hebrewprober.py Tue Apr 25 18:40:46 2017 +0200 @@ -26,8 +26,7 @@ ######################### END LICENSE BLOCK ######################### from .charsetprober import CharSetProber -from .constants import eNotMe, eDetecting -from .compat import wrap_ord +from .enums import ProbingState # This prober doesn't actually recognize a language or a charset. # It is a helper prober for the use of the Hebrew model probers @@ -126,56 +125,59 @@ # model probers scores. The answer is returned in the form of the name of the # charset identified, either "windows-1255" or "ISO-8859-8". -# windows-1255 / ISO-8859-8 code points of interest -FINAL_KAF = 0xea -NORMAL_KAF = 0xeb -FINAL_MEM = 0xed -NORMAL_MEM = 0xee -FINAL_NUN = 0xef -NORMAL_NUN = 0xf0 -FINAL_PE = 0xf3 -NORMAL_PE = 0xf4 -FINAL_TSADI = 0xf5 -NORMAL_TSADI = 0xf6 +class HebrewProber(CharSetProber): + # windows-1255 / ISO-8859-8 code points of interest + FINAL_KAF = 0xea + NORMAL_KAF = 0xeb + FINAL_MEM = 0xed + NORMAL_MEM = 0xee + FINAL_NUN = 0xef + NORMAL_NUN = 0xf0 + FINAL_PE = 0xf3 + NORMAL_PE = 0xf4 + FINAL_TSADI = 0xf5 + NORMAL_TSADI = 0xf6 -# Minimum Visual vs Logical final letter score difference. -# If the difference is below this, don't rely solely on the final letter score -# distance. -MIN_FINAL_CHAR_DISTANCE = 5 + # Minimum Visual vs Logical final letter score difference. + # If the difference is below this, don't rely solely on the final letter score + # distance. + MIN_FINAL_CHAR_DISTANCE = 5 -# Minimum Visual vs Logical model score difference. -# If the difference is below this, don't rely at all on the model score -# distance. -MIN_MODEL_DISTANCE = 0.01 + # Minimum Visual vs Logical model score difference. + # If the difference is below this, don't rely at all on the model score + # distance. + MIN_MODEL_DISTANCE = 0.01 -VISUAL_HEBREW_NAME = "ISO-8859-8" -LOGICAL_HEBREW_NAME = "windows-1255" - + VISUAL_HEBREW_NAME = "ISO-8859-8" + LOGICAL_HEBREW_NAME = "windows-1255" -class HebrewProber(CharSetProber): def __init__(self): - CharSetProber.__init__(self) - self._mLogicalProber = None - self._mVisualProber = None + super(HebrewProber, self).__init__() + self._final_char_logical_score = None + self._final_char_visual_score = None + self._prev = None + self._before_prev = None + self._logical_prober = None + self._visual_prober = None self.reset() def reset(self): - self._mFinalCharLogicalScore = 0 - self._mFinalCharVisualScore = 0 + self._final_char_logical_score = 0 + self._final_char_visual_score = 0 # The two last characters seen in the previous buffer, # mPrev and mBeforePrev are initialized to space in order to simulate # a word delimiter at the beginning of the data - self._mPrev = ' ' - self._mBeforePrev = ' ' + self._prev = ' ' + self._before_prev = ' ' # These probers are owned by the group prober. def set_model_probers(self, logicalProber, visualProber): - self._mLogicalProber = logicalProber - self._mVisualProber = visualProber + self._logical_prober = logicalProber + self._visual_prober = visualProber def is_final(self, c): - return wrap_ord(c) in [FINAL_KAF, FINAL_MEM, FINAL_NUN, FINAL_PE, - FINAL_TSADI] + return c in [self.FINAL_KAF, self.FINAL_MEM, self.FINAL_NUN, + self.FINAL_PE, self.FINAL_TSADI] def is_non_final(self, c): # The normal Tsadi is not a good Non-Final letter due to words like @@ -188,9 +190,10 @@ # for example legally end with a Non-Final Pe or Kaf. However, the # benefit of these letters as Non-Final letters outweighs the damage # since these words are quite rare. - return wrap_ord(c) in [NORMAL_KAF, NORMAL_MEM, NORMAL_NUN, NORMAL_PE] + return c in [self.NORMAL_KAF, self.NORMAL_MEM, + self.NORMAL_NUN, self.NORMAL_PE] - def feed(self, aBuf): + def feed(self, byte_str): # Final letter analysis for logical-visual decision. # Look for evidence that the received buffer is either logical Hebrew # or visual Hebrew. @@ -217,67 +220,73 @@ # We automatically filter out all 7-bit characters (replace them with # spaces) so the word boundary detection works properly. [MAP] - if self.get_state() == eNotMe: + if self.state == ProbingState.NOT_ME: # Both model probers say it's not them. No reason to continue. - return eNotMe + return ProbingState.NOT_ME - aBuf = self.filter_high_bit_only(aBuf) + byte_str = self.filter_high_byte_only(byte_str) - for cur in aBuf: + for cur in byte_str: if cur == ' ': # We stand on a space - a word just ended - if self._mBeforePrev != ' ': - # next-to-last char was not a space so self._mPrev is not a + if self._before_prev != ' ': + # next-to-last char was not a space so self._prev is not a # 1 letter word - if self.is_final(self._mPrev): + if self.is_final(self._prev): # case (1) [-2:not space][-1:final letter][cur:space] - self._mFinalCharLogicalScore += 1 - elif self.is_non_final(self._mPrev): + self._final_char_logical_score += 1 + elif self.is_non_final(self._prev): # case (2) [-2:not space][-1:Non-Final letter][ # cur:space] - self._mFinalCharVisualScore += 1 + self._final_char_visual_score += 1 else: # Not standing on a space - if ((self._mBeforePrev == ' ') and - (self.is_final(self._mPrev)) and (cur != ' ')): + if ((self._before_prev == ' ') and + (self.is_final(self._prev)) and (cur != ' ')): # case (3) [-2:space][-1:final letter][cur:not space] - self._mFinalCharVisualScore += 1 - self._mBeforePrev = self._mPrev - self._mPrev = cur + self._final_char_visual_score += 1 + self._before_prev = self._prev + self._prev = cur # Forever detecting, till the end or until both model probers return - # eNotMe (handled above) - return eDetecting + # ProbingState.NOT_ME (handled above) + return ProbingState.DETECTING - def get_charset_name(self): + @property + def charset_name(self): # Make the decision: is it Logical or Visual? # If the final letter score distance is dominant enough, rely on it. - finalsub = self._mFinalCharLogicalScore - self._mFinalCharVisualScore - if finalsub >= MIN_FINAL_CHAR_DISTANCE: - return LOGICAL_HEBREW_NAME - if finalsub <= -MIN_FINAL_CHAR_DISTANCE: - return VISUAL_HEBREW_NAME + finalsub = self._final_char_logical_score - self._final_char_visual_score + if finalsub >= self.MIN_FINAL_CHAR_DISTANCE: + return self.LOGICAL_HEBREW_NAME + if finalsub <= -self.MIN_FINAL_CHAR_DISTANCE: + return self.VISUAL_HEBREW_NAME # It's not dominant enough, try to rely on the model scores instead. - modelsub = (self._mLogicalProber.get_confidence() - - self._mVisualProber.get_confidence()) - if modelsub > MIN_MODEL_DISTANCE: - return LOGICAL_HEBREW_NAME - if modelsub < -MIN_MODEL_DISTANCE: - return VISUAL_HEBREW_NAME + modelsub = (self._logical_prober.get_confidence() + - self._visual_prober.get_confidence()) + if modelsub > self.MIN_MODEL_DISTANCE: + return self.LOGICAL_HEBREW_NAME + if modelsub < -self.MIN_MODEL_DISTANCE: + return self.VISUAL_HEBREW_NAME # Still no good, back to final letter distance, maybe it'll save the # day. if finalsub < 0.0: - return VISUAL_HEBREW_NAME + return self.VISUAL_HEBREW_NAME # (finalsub > 0 - Logical) or (don't know what to do) default to # Logical. - return LOGICAL_HEBREW_NAME + return self.LOGICAL_HEBREW_NAME - def get_state(self): + @property + def language(self): + return 'Hebrew' + + @property + def state(self): # Remain active as long as any of the model probers are active. - if (self._mLogicalProber.get_state() == eNotMe) and \ - (self._mVisualProber.get_state() == eNotMe): - return eNotMe - return eDetecting + if (self._logical_prober.state == ProbingState.NOT_ME) and \ + (self._visual_prober.state == ProbingState.NOT_ME): + return ProbingState.NOT_ME + return ProbingState.DETECTING