diff -r 6762afd9f963 -r 90c57b50600f ThirdParty/CharDet/chardet/sjisprober.py --- a/ThirdParty/CharDet/chardet/sjisprober.py Tue Apr 25 18:36:38 2017 +0200 +++ b/ThirdParty/CharDet/chardet/sjisprober.py Tue Apr 25 18:40:46 2017 +0200 @@ -25,67 +25,68 @@ # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -import sys from .mbcharsetprober import MultiByteCharSetProber from .codingstatemachine import CodingStateMachine from .chardistribution import SJISDistributionAnalysis from .jpcntx import SJISContextAnalysis -from .mbcssm import SJISSMModel -from . import constants +from .mbcssm import SJIS_SM_MODEL +from .enums import ProbingState, MachineState class SJISProber(MultiByteCharSetProber): def __init__(self): - MultiByteCharSetProber.__init__(self) - self._mCodingSM = CodingStateMachine(SJISSMModel) - self._mDistributionAnalyzer = SJISDistributionAnalysis() - self._mContextAnalyzer = SJISContextAnalysis() + super(SJISProber, self).__init__() + self.coding_sm = CodingStateMachine(SJIS_SM_MODEL) + self.distribution_analyzer = SJISDistributionAnalysis() + self.context_analyzer = SJISContextAnalysis() self.reset() def reset(self): - MultiByteCharSetProber.reset(self) - self._mContextAnalyzer.reset() + super(SJISProber, self).reset() + self.context_analyzer.reset() - def get_charset_name(self): - return self._mContextAnalyzer.get_charset_name() + @property + def charset_name(self): + return self.context_analyzer.charset_name + + @property + def language(self): + return "Japanese" - def feed(self, aBuf): - aLen = len(aBuf) - for i in range(0, aLen): - codingState = self._mCodingSM.next_state(aBuf[i]) - if codingState == constants.eError: - if constants._debug: - sys.stderr.write(self.get_charset_name() - + ' prober hit error at byte ' + str(i) - + '\n') - self._mState = constants.eNotMe + def feed(self, byte_str): + for i in range(len(byte_str)): + coding_state = self.coding_sm.next_state(byte_str[i]) + if coding_state == MachineState.ERROR: + self.logger.debug('%s %s prober hit error at byte %s', + self.charset_name, self.language, i) + self._state = ProbingState.NOT_ME break - elif codingState == constants.eItsMe: - self._mState = constants.eFoundIt + elif coding_state == MachineState.ITS_ME: + self._state = ProbingState.FOUND_IT break - elif codingState == constants.eStart: - charLen = self._mCodingSM.get_current_charlen() + elif coding_state == MachineState.START: + char_len = self.coding_sm.get_current_charlen() if i == 0: - self._mLastChar[1] = aBuf[0] - self._mContextAnalyzer.feed(self._mLastChar[2 - charLen:], - charLen) - self._mDistributionAnalyzer.feed(self._mLastChar, charLen) + self._last_char[1] = byte_str[0] + self.context_analyzer.feed(self._last_char[2 - char_len:], + char_len) + self.distribution_analyzer.feed(self._last_char, char_len) else: - self._mContextAnalyzer.feed(aBuf[i + 1 - charLen:i + 3 - - charLen], charLen) - self._mDistributionAnalyzer.feed(aBuf[i - 1:i + 1], - charLen) + self.context_analyzer.feed(byte_str[i + 1 - char_len:i + 3 + - char_len], char_len) + self.distribution_analyzer.feed(byte_str[i - 1:i + 1], + char_len) - self._mLastChar[0] = aBuf[aLen - 1] + self._last_char[0] = byte_str[-1] - if self.get_state() == constants.eDetecting: - if (self._mContextAnalyzer.got_enough_data() and - (self.get_confidence() > constants.SHORTCUT_THRESHOLD)): - self._mState = constants.eFoundIt + if self.state == ProbingState.DETECTING: + if (self.context_analyzer.got_enough_data() and + (self.get_confidence() > self.SHORTCUT_THRESHOLD)): + self._state = ProbingState.FOUND_IT - return self.get_state() + return self.state def get_confidence(self): - contxtCf = self._mContextAnalyzer.get_confidence() - distribCf = self._mDistributionAnalyzer.get_confidence() - return max(contxtCf, distribCf) + context_conf = self.context_analyzer.get_confidence() + distrib_conf = self.distribution_analyzer.get_confidence() + return max(context_conf, distrib_conf)