eric6/ThirdParty/CharDet/chardet/sbcharsetprober.py

changeset 7974
f425b578ede7
parent 6942
2602857055c5
diff -r e836d196e888 -r f425b578ede7 eric6/ThirdParty/CharDet/chardet/sbcharsetprober.py
--- a/eric6/ThirdParty/CharDet/chardet/sbcharsetprober.py	Wed Jan 13 19:02:58 2021 +0100
+++ b/eric6/ThirdParty/CharDet/chardet/sbcharsetprober.py	Wed Jan 13 19:05:48 2021 +0100
@@ -26,10 +26,22 @@
 # 02110-1301  USA
 ######################### END LICENSE BLOCK #########################
 
+from collections import namedtuple
+
 from .charsetprober import CharSetProber
 from .enums import CharacterCategory, ProbingState, SequenceLikelihood
 
 
+SingleByteCharSetModel = namedtuple('SingleByteCharSetModel',
+                                    ['charset_name',
+                                     'language',
+                                     'char_to_order_map',
+                                     'language_model',
+                                     'typical_positive_ratio',
+                                     'keep_ascii_letters',
+                                     'alphabet'])
+
+
 class SingleByteCharSetProber(CharSetProber):
     SAMPLE_SIZE = 64
     SB_ENOUGH_REL_THRESHOLD = 1024  #  0.25 * SAMPLE_SIZE^2
@@ -65,25 +77,25 @@
         if self._name_prober:
             return self._name_prober.charset_name
         else:
-            return self._model['charset_name']
+            return self._model.charset_name
 
     @property
     def language(self):
         if self._name_prober:
             return self._name_prober.language
         else:
-            return self._model.get('language')
+            return self._model.language
 
     def feed(self, byte_str):
-        if not self._model['keep_english_letter']:
+        # TODO: Make filter_international_words keep self._model.alphabet chars
+        if not self._model.keep_ascii_letters:
             byte_str = self.filter_international_words(byte_str)
         if not byte_str:
             return self.state
-        char_to_order_map = self._model['char_to_order_map']
-        for i, c in enumerate(byte_str):
-            # XXX: Order is in range 1-64, so one would think we want 0-63 here,
-            #      but that leads to 27 more test failures than before.
-            order = char_to_order_map[c]
+        char_to_order_map = self._model.char_to_order_map
+        language_model = self._model.language_model
+        for char in byte_str:
+            order = char_to_order_map.get(char, CharacterCategory.UNDEFINED)
             # XXX: This was SYMBOL_CAT_ORDER before, with a value of 250, but
             #      CharacterCategory.SYMBOL is actually 253, so we use CONTROL
             #      to make it closer to the original intent. The only difference
@@ -91,20 +103,21 @@
             #      _total_char purposes.
             if order < CharacterCategory.CONTROL:
                 self._total_char += 1
+            # TODO: Follow uchardet's lead and discount confidence for frequent
+            #       control characters.
+            #       See https://github.com/BYVoid/uchardet/commit/55b4f23971db61
             if order < self.SAMPLE_SIZE:
                 self._freq_char += 1
                 if self._last_order < self.SAMPLE_SIZE:
                     self._total_seqs += 1
                     if not self._reversed:
-                        i = (self._last_order * self.SAMPLE_SIZE) + order
-                        model = self._model['precedence_matrix'][i]
-                    else:  # reverse the order of the letters in the lookup
-                        i = (order * self.SAMPLE_SIZE) + self._last_order
-                        model = self._model['precedence_matrix'][i]
-                    self._seq_counters[model] += 1
+                        lm_cat = language_model[self._last_order][order]
+                    else:
+                        lm_cat = language_model[order][self._last_order]
+                    self._seq_counters[lm_cat] += 1
             self._last_order = order
 
-        charset_name = self._model['charset_name']
+        charset_name = self._model.charset_name
         if self.state == ProbingState.DETECTING:
             if self._total_seqs > self.SB_ENOUGH_REL_THRESHOLD:
                 confidence = self.get_confidence()
@@ -125,7 +138,7 @@
         r = 0.01
         if self._total_seqs > 0:
             r = ((1.0 * self._seq_counters[SequenceLikelihood.POSITIVE]) /
-                 self._total_seqs / self._model['typical_positive_ratio'])
+                 self._total_seqs / self._model.typical_positive_ratio)
             r = r * self._freq_char / self._total_char
             if r >= 1.0:
                 r = 0.99

eric ide

mercurial