# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################
28 |
28 |
29 from . import constants |
29 import logging |
30 import re |
30 import re |
31 |
31 |
|
32 from .enums import ProbingState |
32 |
33 |
class CharSetProber(object):
    """
    Prober with stub detection methods and shared byte-filtering helpers.

    As written here, ``feed`` does nothing, ``charset_name`` is ``None`` and
    ``get_confidence`` is ``0.0``; presumably subclasses override these
    (the filter docstring below names ``Latin1Prober`` as one user).

    The property/static-method API is the primary surface. The older
    accessor and filter names are kept as thin wrappers so existing callers
    of either spelling keep working.
    """

    # Not referenced inside this class; presumably a confidence cut-off
    # consulted by callers -- confirm against the users of this constant.
    SHORTCUT_THRESHOLD = 0.95

    def __init__(self, lang_filter=None):
        """
        Create a prober.

        :param lang_filter: stored unchanged on ``self.lang_filter``; this
                            class does not interpret it.
        """
        # The state stays ``None`` until ``reset()`` is called.
        self._state = None
        self.lang_filter = lang_filter
        self.logger = logging.getLogger(__name__)

    def reset(self):
        """Put the prober back into the ``ProbingState.DETECTING`` state."""
        self._state = ProbingState.DETECTING

    @property
    def charset_name(self):
        """Name of the detected charset; always ``None`` in this class."""
        return None

    def feed(self, buf):
        """Consume ``buf``; a no-op in this class."""
        pass

    @property
    def state(self):
        """Current probing state (``None`` before the first ``reset()``)."""
        return self._state

    def get_confidence(self):
        """Return the detection confidence; always ``0.0`` in this class."""
        return 0.0

    @staticmethod
    def filter_high_byte_only(buf):
        """
        Return ``buf`` with every run of ASCII bytes (0x00-0x7F) collapsed
        into a single space, leaving only the high bytes between spaces.
        """
        buf = re.sub(b'([\x00-\x7F])+', b' ', buf)
        return buf

    @staticmethod
    def filter_international_words(buf):
        """
        We define three types of bytes:
        alphabet: english alphabets [a-zA-Z]
        international: international characters (bytes 0x80-0xFF)
        marker: everything else

        The input buffer can be thought to contain a series of words delimited
        by markers. This function works to filter all words that contain at
        least one international character. All contiguous sequences of markers
        are replaced by a single space ascii character.

        This filter applies to all scripts which do not use English characters.

        Returns a new ``bytearray``; ``buf`` is not modified.
        """
        filtered = bytearray()

        # This regex expression filters out only words that have at-least one
        # international character. The word may include one marker character at
        # the end.
        words = re.findall(b'[a-zA-Z]*[\x80-\xFF]+[a-zA-Z]*[^a-zA-Z\x80-\xFF]?',
                           buf)

        for word in words:
            filtered.extend(word[:-1])

            # If the last character in the word is a marker, replace it with a
            # space as markers shouldn't affect our analysis (they are used
            # similarly across all languages and may thus have similar
            # frequencies).
            last_char = word[-1:]
            if not last_char.isalpha() and last_char < b'\x80':
                last_char = b' '
            filtered.extend(last_char)

        return filtered

    @staticmethod
    def filter_with_english_letters(buf):
        """
        Returns a copy of ``buf`` that retains only the sequences of English
        alphabet and high byte characters that are not between <> characters.
        Also retains English alphabet and high byte characters immediately
        before occurrences of >.

        Because ``<`` switches the tag flag on before the pending stretch is
        flushed, letters immediately before a ``<`` are dropped. Nothing
        after an unclosed ``<`` is kept either.

        This filter can be applied to all scripts which contain both English
        characters and extended ASCII characters, but is currently only used by
        ``Latin1Prober``.

        Returns a new ``bytearray`` in which each kept stretch that ended at
        a delimiter is followed by one space.
        """
        filtered = bytearray()
        in_tag = False
        # Index just past the most recent delimiter, i.e. the start of the
        # stretch of letters/high bytes currently being accumulated.
        prev = 0

        for curr in range(len(buf)):
            # Slice here to get bytes instead of an int with Python 3
            buf_char = buf[curr:curr + 1]
            # Check if we're coming out of or entering an HTML tag
            if buf_char == b'>':
                in_tag = False
            elif buf_char == b'<':
                in_tag = True

            # If current character is not extended-ASCII and not alphabetic...
            if buf_char < b'\x80' and not buf_char.isalpha():
                # ...and we're not in a tag
                if curr > prev and not in_tag:
                    # Keep everything after last non-extended-ASCII,
                    # non-alphabetic character
                    filtered.extend(buf[prev:curr])
                    # Output a space to delimit stretch we kept
                    filtered.extend(b' ')
                prev = curr + 1

        # If we're not in a tag...
        if not in_tag:
            # Keep everything after last non-extended-ASCII, non-alphabetic
            # character
            filtered.extend(buf[prev:])

        return filtered

    # ------------------------------------------------------------------
    # Older spellings of the API, kept so callers using them still work.
    # ------------------------------------------------------------------

    def get_charset_name(self):
        """Older accessor; same value as the ``charset_name`` property."""
        return self.charset_name

    def get_state(self):
        """Older accessor; same value as the ``state`` property."""
        return self.state

    def filter_high_bit_only(self, aBuf):
        """Older name for ``filter_high_byte_only``."""
        return self.filter_high_byte_only(aBuf)

    def filter_without_english_letters(self, aBuf):
        """Return ``aBuf`` with each run of ASCII letters replaced by a space."""
        aBuf = re.sub(b'([A-Za-z])+', b' ', aBuf)
        return aBuf