|
1 ######################## BEGIN LICENSE BLOCK ######################## |
|
2 # The Original Code is Mozilla Universal charset detector code. |
|
3 # |
|
4 # The Initial Developer of the Original Code is |
|
5 # Netscape Communications Corporation. |
|
6 # Portions created by the Initial Developer are Copyright (C) 2001 |
|
7 # the Initial Developer. All Rights Reserved. |
|
8 # |
|
9 # Contributor(s): |
|
10 # Mark Pilgrim - port to Python |
|
11 # Shy Shalom - original C code |
|
12 # |
|
13 # This library is free software; you can redistribute it and/or |
|
14 # modify it under the terms of the GNU Lesser General Public |
|
15 # License as published by the Free Software Foundation; either |
|
16 # version 2.1 of the License, or (at your option) any later version. |
|
17 # |
|
18 # This library is distributed in the hope that it will be useful, |
|
19 # but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
20 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|
21 # Lesser General Public License for more details. |
|
22 # |
|
23 # You should have received a copy of the GNU Lesser General Public |
|
24 # License along with this library; if not, write to the Free Software |
|
25 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA |
|
26 # 02110-1301 USA |
|
27 ######################### END LICENSE BLOCK ######################### |
|
28 |
|
29 import constants, re |
|
30 |
|
class CharSetProber:
    """Common interface and default behaviour for charset probers.

    The methods here are neutral defaults: no charset name, zero
    confidence, and a ``feed`` that ignores its input.  Concrete probers
    are presumably expected to override them (not verifiable from this
    file alone).
    """

    # Each filter needs a text and a bytes flavour of its pattern because
    # ``re`` refuses to mix a text pattern with a bytes-like subject on
    # Python 3.  On Python 2 ``bytes`` is ``str``, so the bytes flavour is
    # simply the pattern the original code used.
    _ASCII_RUN_TEXT = re.compile(r'[\x00-\x7F]+')
    _ASCII_RUN_BYTES = re.compile(br'[\x00-\x7F]+')
    _ENGLISH_RUN_TEXT = re.compile(r'[A-Za-z]+')
    _ENGLISH_RUN_BYTES = re.compile(br'[A-Za-z]+')

    def __init__(self):
        # Give the state attribute a defined value so that get_state()
        # does not raise AttributeError when called before reset().
        self._mState = None

    def reset(self):
        """Put the prober back into the ``constants.eDetecting`` state."""
        self._mState = constants.eDetecting

    def get_charset_name(self):
        """Return the detected charset name; this base class returns None."""
        return None

    def feed(self, aBuf):
        """Consume a chunk of input; this base class ignores ``aBuf``."""
        pass

    def get_state(self):
        """Return the current state.

        This is ``None`` until ``reset()`` has been called, after which it
        is whatever value was last stored in ``_mState``.
        """
        return self._mState

    def get_confidence(self):
        """Return the detection confidence; this base class returns 0.0."""
        return 0.0

    def filter_high_bit_only(self, aBuf):
        """Collapse every run of ASCII characters (0x00-0x7F) to one space.

        Accepts text as well as ``bytes``/``bytearray`` buffers; characters
        outside the ASCII range are kept as they are.
        """
        if isinstance(aBuf, (bytes, bytearray)):
            return self._ASCII_RUN_BYTES.sub(b' ', aBuf)
        return self._ASCII_RUN_TEXT.sub(' ', aBuf)

    def filter_without_english_letters(self, aBuf):
        """Collapse every run of ASCII letters (A-Z, a-z) to one space.

        Accepts text as well as ``bytes``/``bytearray`` buffers; every
        other character is kept as it is.
        """
        if isinstance(aBuf, (bytes, bytearray)):
            return self._ENGLISH_RUN_BYTES.sub(b' ', aBuf)
        return self._ENGLISH_RUN_TEXT.sub(' ', aBuf)

    def filter_with_english_letters(self, aBuf):
        """Return ``aBuf`` unchanged.

        TODO: the filtering this method is named for is not implemented.
        """
        return aBuf