Fri, 25 Apr 2014 22:07:19 +0200
updated CharDet to 2.2.1, updated changelog
3537
7662053c3906
updated CharDet to 2.2.1, updated changelog
T.Rzepka <Tobias.Rzepka@gmail.com>
parents:
diff
changeset
|
1 | #!/usr/bin/env python |
7662053c3906
updated CharDet to 2.2.1, updated changelog
T.Rzepka <Tobias.Rzepka@gmail.com>
parents:
diff
changeset
|
2 | """ |
7662053c3906
updated CharDet to 2.2.1, updated changelog
T.Rzepka <Tobias.Rzepka@gmail.com>
parents:
diff
changeset
|
3 | Script which takes one or more file paths and reports on their detected |
7662053c3906
updated CharDet to 2.2.1, updated changelog
T.Rzepka <Tobias.Rzepka@gmail.com>
parents:
diff
changeset
|
4 | encodings |
7662053c3906
updated CharDet to 2.2.1, updated changelog
T.Rzepka <Tobias.Rzepka@gmail.com>
parents:
diff
changeset
|
5 | |
7662053c3906
updated CharDet to 2.2.1, updated changelog
T.Rzepka <Tobias.Rzepka@gmail.com>
parents:
diff
changeset
|
6 | Example:: |
7662053c3906
updated CharDet to 2.2.1, updated changelog
T.Rzepka <Tobias.Rzepka@gmail.com>
parents:
diff
changeset
|
7 | |
7662053c3906
updated CharDet to 2.2.1, updated changelog
T.Rzepka <Tobias.Rzepka@gmail.com>
parents:
diff
changeset
|
8 | % chardetect somefile someotherfile |
7662053c3906
updated CharDet to 2.2.1, updated changelog
T.Rzepka <Tobias.Rzepka@gmail.com>
parents:
diff
changeset
|
9 | somefile: windows-1252 with confidence 0.5 |
7662053c3906
updated CharDet to 2.2.1, updated changelog
T.Rzepka <Tobias.Rzepka@gmail.com>
parents:
diff
changeset
|
10 | someotherfile: ascii with confidence 1.0 |
7662053c3906
updated CharDet to 2.2.1, updated changelog
T.Rzepka <Tobias.Rzepka@gmail.com>
parents:
diff
changeset
|
11 | |
7662053c3906
updated CharDet to 2.2.1, updated changelog
T.Rzepka <Tobias.Rzepka@gmail.com>
parents:
diff
changeset
|
12 | If no paths are provided, it takes its input from stdin. |
7662053c3906
updated CharDet to 2.2.1, updated changelog
T.Rzepka <Tobias.Rzepka@gmail.com>
parents:
diff
changeset
|
13 | |
7662053c3906
updated CharDet to 2.2.1, updated changelog
T.Rzepka <Tobias.Rzepka@gmail.com>
parents:
diff
changeset
|
14 | """ |
7662053c3906
updated CharDet to 2.2.1, updated changelog
T.Rzepka <Tobias.Rzepka@gmail.com>
parents:
diff
changeset
|
15 | from io import open |
7662053c3906
updated CharDet to 2.2.1, updated changelog
T.Rzepka <Tobias.Rzepka@gmail.com>
parents:
diff
changeset
|
16 | from sys import argv, stdin |
7662053c3906
updated CharDet to 2.2.1, updated changelog
T.Rzepka <Tobias.Rzepka@gmail.com>
parents:
diff
changeset
|
17 | |
7662053c3906
updated CharDet to 2.2.1, updated changelog
T.Rzepka <Tobias.Rzepka@gmail.com>
parents:
diff
changeset
|
18 | from chardet.universaldetector import UniversalDetector |
7662053c3906
updated CharDet to 2.2.1, updated changelog
T.Rzepka <Tobias.Rzepka@gmail.com>
parents:
diff
changeset
|
19 | |
7662053c3906
updated CharDet to 2.2.1, updated changelog
T.Rzepka <Tobias.Rzepka@gmail.com>
parents:
diff
changeset
|
20 | |
7662053c3906
updated CharDet to 2.2.1, updated changelog
T.Rzepka <Tobias.Rzepka@gmail.com>
parents:
diff
changeset
|
def description_of(file, name='stdin'):
    """Return a string describing the probable encoding of a file.

    :param file: iterable yielding the chunks (typically the lines of a
        file opened in binary mode) that are fed to the detector.
    :param name: label placed at the start of the returned description;
        defaults to ``'stdin'``.
    :returns: ``'<name>: <encoding> with confidence <confidence>'`` when
        an encoding was detected, otherwise ``'<name>: no result'``.
    """
    detector = UniversalDetector()
    for line in file:
        detector.feed(line)
        # Once the detector reports it is done (e.g. after seeing a BOM)
        # there is no point in reading the rest of the input.
        if detector.done:
            break
    detector.close()
    result = detector.result
    if result['encoding']:
        return '%s: %s with confidence %s' % (name,
                                              result['encoding'],
                                              result['confidence'])
    return '%s: no result' % name
7662053c3906
updated CharDet to 2.2.1, updated changelog
T.Rzepka <Tobias.Rzepka@gmail.com>
parents:
diff
changeset
|
34 | |
7662053c3906
updated CharDet to 2.2.1, updated changelog
T.Rzepka <Tobias.Rzepka@gmail.com>
parents:
diff
changeset
|
35 | |
7662053c3906
updated CharDet to 2.2.1, updated changelog
T.Rzepka <Tobias.Rzepka@gmail.com>
parents:
diff
changeset
|
def main():
    """Print the detected encoding of each given path, or of stdin.

    Every entry of ``argv[1:]`` is opened in binary mode and described
    on its own output line.  When no paths are given, standard input is
    described instead.
    """
    if len(argv) <= 1:
        # On Python 3 ``stdin`` is a text stream yielding ``str``, but the
        # detector needs raw bytes, so read from the underlying binary
        # buffer when there is one.  Python 2's stdin has no ``buffer``
        # attribute and is used directly.
        print(description_of(getattr(stdin, 'buffer', stdin)))
    else:
        for path in argv[1:]:
            with open(path, 'rb') as f:
                print(description_of(f, path))
7662053c3906
updated CharDet to 2.2.1, updated changelog
T.Rzepka <Tobias.Rzepka@gmail.com>
parents:
diff
changeset
|
43 | |
7662053c3906
updated CharDet to 2.2.1, updated changelog
T.Rzepka <Tobias.Rzepka@gmail.com>
parents:
diff
changeset
|
44 | |
7662053c3906
updated CharDet to 2.2.1, updated changelog
T.Rzepka <Tobias.Rzepka@gmail.com>
parents:
diff
changeset
|
# Invoke the command-line entry point only when this file is executed as
# a script, not when it is imported as a module.
if __name__ == "__main__":
    main()