ThirdParty/CharDet/chardet/chardetect.py

Fri, 25 Apr 2014 22:07:19 +0200

author
T.Rzepka <Tobias.Rzepka@gmail.com>
date
Fri, 25 Apr 2014 22:07:19 +0200
changeset 3537
7662053c3906
child 5310
f2b774d78b4a
permissions
-rw-r--r--

updated CharDet to 2.2.1, updated changelog

3537
7662053c3906 updated CharDet to 2.2.1, updated changelog
T.Rzepka <Tobias.Rzepka@gmail.com>
parents:
diff changeset
1 #!/usr/bin/env python
7662053c3906 updated CharDet to 2.2.1, updated changelog
T.Rzepka <Tobias.Rzepka@gmail.com>
parents:
diff changeset
2 """
7662053c3906 updated CharDet to 2.2.1, updated changelog
T.Rzepka <Tobias.Rzepka@gmail.com>
parents:
diff changeset
3 Script which takes one or more file paths and reports on their detected
7662053c3906 updated CharDet to 2.2.1, updated changelog
T.Rzepka <Tobias.Rzepka@gmail.com>
parents:
diff changeset
4 encodings
7662053c3906 updated CharDet to 2.2.1, updated changelog
T.Rzepka <Tobias.Rzepka@gmail.com>
parents:
diff changeset
5
7662053c3906 updated CharDet to 2.2.1, updated changelog
T.Rzepka <Tobias.Rzepka@gmail.com>
parents:
diff changeset
6 Example::
7662053c3906 updated CharDet to 2.2.1, updated changelog
T.Rzepka <Tobias.Rzepka@gmail.com>
parents:
diff changeset
7
7662053c3906 updated CharDet to 2.2.1, updated changelog
T.Rzepka <Tobias.Rzepka@gmail.com>
parents:
diff changeset
8 % chardetect somefile someotherfile
7662053c3906 updated CharDet to 2.2.1, updated changelog
T.Rzepka <Tobias.Rzepka@gmail.com>
parents:
diff changeset
9 somefile: windows-1252 with confidence 0.5
7662053c3906 updated CharDet to 2.2.1, updated changelog
T.Rzepka <Tobias.Rzepka@gmail.com>
parents:
diff changeset
10 someotherfile: ascii with confidence 1.0
7662053c3906 updated CharDet to 2.2.1, updated changelog
T.Rzepka <Tobias.Rzepka@gmail.com>
parents:
diff changeset
11
7662053c3906 updated CharDet to 2.2.1, updated changelog
T.Rzepka <Tobias.Rzepka@gmail.com>
parents:
diff changeset
12 If no paths are provided, it takes its input from stdin.
7662053c3906 updated CharDet to 2.2.1, updated changelog
T.Rzepka <Tobias.Rzepka@gmail.com>
parents:
diff changeset
13
7662053c3906 updated CharDet to 2.2.1, updated changelog
T.Rzepka <Tobias.Rzepka@gmail.com>
parents:
diff changeset
14 """
7662053c3906 updated CharDet to 2.2.1, updated changelog
T.Rzepka <Tobias.Rzepka@gmail.com>
parents:
diff changeset
15 from io import open
7662053c3906 updated CharDet to 2.2.1, updated changelog
T.Rzepka <Tobias.Rzepka@gmail.com>
parents:
diff changeset
16 from sys import argv, stdin
7662053c3906 updated CharDet to 2.2.1, updated changelog
T.Rzepka <Tobias.Rzepka@gmail.com>
parents:
diff changeset
17
7662053c3906 updated CharDet to 2.2.1, updated changelog
T.Rzepka <Tobias.Rzepka@gmail.com>
parents:
diff changeset
18 from chardet.universaldetector import UniversalDetector
7662053c3906 updated CharDet to 2.2.1, updated changelog
T.Rzepka <Tobias.Rzepka@gmail.com>
parents:
diff changeset
19
7662053c3906 updated CharDet to 2.2.1, updated changelog
T.Rzepka <Tobias.Rzepka@gmail.com>
parents:
diff changeset
20
7662053c3906 updated CharDet to 2.2.1, updated changelog
T.Rzepka <Tobias.Rzepka@gmail.com>
parents:
diff changeset
def description_of(file, name='stdin'):
    """Return a string describing the probable encoding of a file.

    :param file: iterable yielding chunks of the data to examine, e.g. a
        file object opened in binary mode. Each chunk is handed unchanged
        to ``UniversalDetector.feed``.
    :param name: label put in front of the result; defaults to
        ``'stdin'``.
    :return: ``'<name>: <encoding> with confidence <confidence>'`` when
        the detector reports an encoding, otherwise
        ``'<name>: no result'``.
    """
    u = UniversalDetector()
    for line in file:
        u.feed(line)
        # chardet's documented usage: ``done`` becomes true once the
        # detector is confident enough (for instance after seeing a BOM),
        # so there is no point in reading the rest of the input.
        if u.done:
            break
    # close() finalises the detection; the result is read afterwards.
    u.close()
    result = u.result
    if result['encoding']:
        return '%s: %s with confidence %s' % (name,
                                              result['encoding'],
                                              result['confidence'])
    return '%s: no result' % name
7662053c3906 updated CharDet to 2.2.1, updated changelog
T.Rzepka <Tobias.Rzepka@gmail.com>
parents:
diff changeset
34
7662053c3906 updated CharDet to 2.2.1, updated changelog
T.Rzepka <Tobias.Rzepka@gmail.com>
parents:
diff changeset
35
7662053c3906 updated CharDet to 2.2.1, updated changelog
T.Rzepka <Tobias.Rzepka@gmail.com>
parents:
diff changeset
def main():
    """Print the detected encoding for every path given on the command line.

    Each element of ``argv[1:]`` is opened in binary mode and described
    with ``description_of``. When no path is given, standard input is
    examined instead and reported under the default name ``'stdin'``.
    """
    if len(argv) <= 1:
        # On Python 3 ``sys.stdin`` is a text stream that yields ``str``,
        # while the detector works on raw bytes (the file branch below
        # opens with 'rb' for the same reason). Use the underlying binary
        # buffer when it exists; Python 2's stdin has no ``buffer``
        # attribute and already yields byte strings.
        print(description_of(getattr(stdin, 'buffer', stdin)))
    else:
        for path in argv[1:]:
            with open(path, 'rb') as f:
                print(description_of(f, path))


if __name__ == '__main__':
    main()

eric ide

mercurial