eric: comparison ThirdParty/CharDet/chardet/charsetprober.py

-:6762afd9f963
+:90c57b50600f
 # License along with this library; if not, write to the Free Software
 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
 # 02110-1301  USA
 ######################### END LICENSE BLOCK #########################
-from . import constants
+import logging
 import re
+from .enums import ProbingState
-class CharSetProber:
-def __init__(self):
+class CharSetProber(object):
+SHORTCUT_THRESHOLD = 0.95
+def __init__(self, lang_filter=None):
+self._state = None
+self.lang_filter = lang_filter
+self.logger = logging.getLogger(__name__)
+def reset(self):
+self._state = ProbingState.DETECTING
+@property
+def charset_name(self):
+return None
+def feed(self, buf):
 pass
-def reset(self):
+@property
-self._mState = constants.eDetecting
+def state(self):
+return self._state
-def get_charset_name(self):
-return None
-def feed(self, aBuf):
-pass
-def get_state(self):
-return self._mState
 def get_confidence(self):
 return 0.0
-def filter_high_bit_only(self, aBuf):
+@staticmethod
-aBuf = re.sub(b'([\x00-\x7F])+', b' ', aBuf)
+def filter_high_byte_only(buf):
-return aBuf
+buf = re.sub(b'([\x00-\x7F])+', b' ', buf)
+return buf
-def filter_without_english_letters(self, aBuf):
+@staticmethod
-aBuf = re.sub(b'([A-Za-z])+', b' ', aBuf)
+def filter_international_words(buf):
-return aBuf
+"""
+We define three types of bytes:
+alphabet: English letters [a-zA-Z]
+international: international characters [\x80-\xFF]
+marker: everything else [^a-zA-Z\x80-\xFF]
-def filter_with_english_letters(self, aBuf):
+The input buffer can be thought to contain a series of words delimited
-# TODO
+by markers. This function keeps only the words that contain at
-return aBuf
+least one international character. All contiguous sequences of markers
+are replaced by a single space ascii character.
+This filter applies to all scripts which do not use English characters.
+"""
+filtered = bytearray()
+# This regular expression matches only words that have at least one
+# international character. The word may include one marker character at
+# the end.
+words = re.findall(b'[a-zA-Z]*[\x80-\xFF]+[a-zA-Z]*[^a-zA-Z\x80-\xFF]?',
+buf)
+for word in words:
+filtered.extend(word[:-1])
+# If the last character in the word is a marker, replace it with a
+# space as markers shouldn't affect our analysis (they are used
+# similarly across all languages and may thus have similar
+# frequencies).
+last_char = word[-1:]
+if not last_char.isalpha() and last_char < b'\x80':
+last_char = b' '
+filtered.extend(last_char)
+return filtered
+@staticmethod
+def filter_with_english_letters(buf):
+"""
+Returns a copy of ``buf`` that retains only the sequences of English
+alphabet and high byte characters that are not between <> characters.
+Also retains English alphabet and high byte characters immediately
+before occurrences of >.
+This filter can be applied to all scripts which contain both English
+characters and extended ASCII characters, but is currently only used by
+``Latin1Prober``.
+"""
+filtered = bytearray()
+in_tag = False
+prev = 0
+for curr in range(len(buf)):
+# Slice here to get bytes instead of an int with Python 3
+buf_char = buf[curr:curr + 1]
+# Check if we're coming out of or entering an HTML tag
+if buf_char == b'>':
+in_tag = False
+elif buf_char == b'<':
+in_tag = True
+# If current character is not extended-ASCII and not alphabetic...
+if buf_char < b'\x80' and not buf_char.isalpha():
+# ...and we're not in a tag
+if curr > prev and not in_tag:
+# Keep everything after last non-extended-ASCII,
+# non-alphabetic character
+filtered.extend(buf[prev:curr])
+# Output a space to delimit stretch we kept
+filtered.extend(b' ')
+prev = curr + 1
+# If we're not in a tag...
+if not in_tag:
+# Keep everything after last non-extended-ASCII, non-alphabetic
+# character
+filtered.extend(buf[prev:])
+return filtered

Mercurial Repositories > eric / file comparison

comparison: ThirdParty/CharDet/chardet/charsetprober.py

ThirdParty/CharDet/chardet/charsetprober.py