ThirdParty/CharDet/chardet/charsetprober.py

changeset 5714
90c57b50600f
parent 3537
7662053c3906
equal deleted inserted replaced
5713:6762afd9f963 5714:90c57b50600f
24 # License along with this library; if not, write to the Free Software 24 # License along with this library; if not, write to the Free Software
25 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 25 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26 # 02110-1301 USA 26 # 02110-1301 USA
27 ######################### END LICENSE BLOCK ######################### 27 ######################### END LICENSE BLOCK #########################
28 28
29 from . import constants 29 import logging
30 import re 30 import re
31 31
32 from .enums import ProbingState
32 33
33 class CharSetProber: 34
34 def __init__(self): 35 class CharSetProber(object):
36
37 SHORTCUT_THRESHOLD = 0.95
38
39 def __init__(self, lang_filter=None):
40 self._state = None
41 self.lang_filter = lang_filter
42 self.logger = logging.getLogger(__name__)
43
44 def reset(self):
45 self._state = ProbingState.DETECTING
46
47 @property
48 def charset_name(self):
49 return None
50
51 def feed(self, buf):
35 pass 52 pass
36 53
37 def reset(self): 54 @property
38 self._mState = constants.eDetecting 55 def state(self):
39 56 return self._state
40 def get_charset_name(self):
41 return None
42
43 def feed(self, aBuf):
44 pass
45
46 def get_state(self):
47 return self._mState
48 57
49 def get_confidence(self): 58 def get_confidence(self):
50 return 0.0 59 return 0.0
51 60
52 def filter_high_bit_only(self, aBuf): 61 @staticmethod
53 aBuf = re.sub(b'([\x00-\x7F])+', b' ', aBuf) 62 def filter_high_byte_only(buf):
54 return aBuf 63 buf = re.sub(b'([\x00-\x7F])+', b' ', buf)
64 return buf
55 65
56 def filter_without_english_letters(self, aBuf): 66 @staticmethod
57 aBuf = re.sub(b'([A-Za-z])+', b' ', aBuf) 67 def filter_international_words(buf):
58 return aBuf 68 """
69 We define three types of bytes:
70 alphabet: english alphabets [a-zA-Z]
71 international: international characters [\x80-\xFF]
72 marker: everything else [^a-zA-Z\x80-\xFF]
59 73
60 def filter_with_english_letters(self, aBuf): 74 The input buffer can be thought to contain a series of words delimited
61 # TODO 75 by markers. This function works to filter all words that contain at
62 return aBuf 76 least one international character. All contiguous sequences of markers
77 are replaced by a single space ascii character.
78
79 This filter applies to all scripts which do not use English characters.
80 """
81 filtered = bytearray()
82
83 # This regex expression filters out only words that have at-least one
84 # international character. The word may include one marker character at
85 # the end.
86 words = re.findall(b'[a-zA-Z]*[\x80-\xFF]+[a-zA-Z]*[^a-zA-Z\x80-\xFF]?',
87 buf)
88
89 for word in words:
90 filtered.extend(word[:-1])
91
92 # If the last character in the word is a marker, replace it with a
93 # space as markers shouldn't affect our analysis (they are used
94 # similarly across all languages and may thus have similar
95 # frequencies).
96 last_char = word[-1:]
97 if not last_char.isalpha() and last_char < b'\x80':
98 last_char = b' '
99 filtered.extend(last_char)
100
101 return filtered
102
103 @staticmethod
104 def filter_with_english_letters(buf):
105 """
106 Returns a copy of ``buf`` that retains only the sequences of English
107 alphabet and high byte characters that are not between <> characters.
108 Also retains English alphabet and high byte characters immediately
109 before occurrences of >.
110
111 This filter can be applied to all scripts which contain both English
112 characters and extended ASCII characters, but is currently only used by
113 ``Latin1Prober``.
114 """
115 filtered = bytearray()
116 in_tag = False
117 prev = 0
118
119 for curr in range(len(buf)):
120 # Slice here to get bytes instead of an int with Python 3
121 buf_char = buf[curr:curr + 1]
122 # Check if we're coming out of or entering an HTML tag
123 if buf_char == b'>':
124 in_tag = False
125 elif buf_char == b'<':
126 in_tag = True
127
128 # If current character is not extended-ASCII and not alphabetic...
129 if buf_char < b'\x80' and not buf_char.isalpha():
130 # ...and we're not in a tag
131 if curr > prev and not in_tag:
132 # Keep everything after last non-extended-ASCII,
133 # non-alphabetic character
134 filtered.extend(buf[prev:curr])
135 # Output a space to delimit stretch we kept
136 filtered.extend(b' ')
137 prev = curr + 1
138
139 # If we're not in a tag...
140 if not in_tag:
141 # Keep everything after last non-extended-ASCII, non-alphabetic
142 # character
143 filtered.extend(buf[prev:])
144
145 return filtered

eric ide

mercurial