ThirdParty/CharDet/chardet/chardetect.py

Thu, 10 Nov 2016 18:57:50 +0100

author
Detlev Offenbach <detlev@die-offenbachs.de>
date
Thu, 10 Nov 2016 18:57:50 +0100
changeset 5310
f2b774d78b4a
parent 3537
7662053c3906
permissions
-rw-r--r--

Updated chardet to version 2.3.0.

3537
7662053c3906 updated CharDet to 2.2.1, updated changelog
T.Rzepka <Tobias.Rzepka@gmail.com>
parents:
diff changeset
1 #!/usr/bin/env python
7662053c3906 updated CharDet to 2.2.1, updated changelog
T.Rzepka <Tobias.Rzepka@gmail.com>
parents:
diff changeset
2 """
7662053c3906 updated CharDet to 2.2.1, updated changelog
T.Rzepka <Tobias.Rzepka@gmail.com>
parents:
diff changeset
3 Script which takes one or more file paths and reports on their detected
7662053c3906 updated CharDet to 2.2.1, updated changelog
T.Rzepka <Tobias.Rzepka@gmail.com>
parents:
diff changeset
4 encodings
7662053c3906 updated CharDet to 2.2.1, updated changelog
T.Rzepka <Tobias.Rzepka@gmail.com>
parents:
diff changeset
5
7662053c3906 updated CharDet to 2.2.1, updated changelog
T.Rzepka <Tobias.Rzepka@gmail.com>
parents:
diff changeset
6 Example::
7662053c3906 updated CharDet to 2.2.1, updated changelog
T.Rzepka <Tobias.Rzepka@gmail.com>
parents:
diff changeset
7
7662053c3906 updated CharDet to 2.2.1, updated changelog
T.Rzepka <Tobias.Rzepka@gmail.com>
parents:
diff changeset
8 % chardetect somefile someotherfile
7662053c3906 updated CharDet to 2.2.1, updated changelog
T.Rzepka <Tobias.Rzepka@gmail.com>
parents:
diff changeset
9 somefile: windows-1252 with confidence 0.5
7662053c3906 updated CharDet to 2.2.1, updated changelog
T.Rzepka <Tobias.Rzepka@gmail.com>
parents:
diff changeset
10 someotherfile: ascii with confidence 1.0
7662053c3906 updated CharDet to 2.2.1, updated changelog
T.Rzepka <Tobias.Rzepka@gmail.com>
parents:
diff changeset
11
7662053c3906 updated CharDet to 2.2.1, updated changelog
T.Rzepka <Tobias.Rzepka@gmail.com>
parents:
diff changeset
12 If no paths are provided, it takes its input from stdin.
7662053c3906 updated CharDet to 2.2.1, updated changelog
T.Rzepka <Tobias.Rzepka@gmail.com>
parents:
diff changeset
13
7662053c3906 updated CharDet to 2.2.1, updated changelog
T.Rzepka <Tobias.Rzepka@gmail.com>
parents:
diff changeset
14 """
5310
f2b774d78b4a Updated chardet to version 2.3.0.
Detlev Offenbach <detlev@die-offenbachs.de>
parents: 3537
diff changeset
15
f2b774d78b4a Updated chardet to version 2.3.0.
Detlev Offenbach <detlev@die-offenbachs.de>
parents: 3537
diff changeset
16 from __future__ import absolute_import, print_function, unicode_literals
f2b774d78b4a Updated chardet to version 2.3.0.
Detlev Offenbach <detlev@die-offenbachs.de>
parents: 3537
diff changeset
17
f2b774d78b4a Updated chardet to version 2.3.0.
Detlev Offenbach <detlev@die-offenbachs.de>
parents: 3537
diff changeset
18 import argparse
f2b774d78b4a Updated chardet to version 2.3.0.
Detlev Offenbach <detlev@die-offenbachs.de>
parents: 3537
diff changeset
19 import sys
3537
7662053c3906 updated CharDet to 2.2.1, updated changelog
T.Rzepka <Tobias.Rzepka@gmail.com>
parents:
diff changeset
20 from io import open
7662053c3906 updated CharDet to 2.2.1, updated changelog
T.Rzepka <Tobias.Rzepka@gmail.com>
parents:
diff changeset
21
5310
f2b774d78b4a Updated chardet to version 2.3.0.
Detlev Offenbach <detlev@die-offenbachs.de>
parents: 3537
diff changeset
22 from chardet import __version__
3537
7662053c3906 updated CharDet to 2.2.1, updated changelog
T.Rzepka <Tobias.Rzepka@gmail.com>
parents:
diff changeset
23 from chardet.universaldetector import UniversalDetector
7662053c3906 updated CharDet to 2.2.1, updated changelog
T.Rzepka <Tobias.Rzepka@gmail.com>
parents:
diff changeset
24
7662053c3906 updated CharDet to 2.2.1, updated changelog
T.Rzepka <Tobias.Rzepka@gmail.com>
parents:
diff changeset
25
5310
f2b774d78b4a Updated chardet to version 2.3.0.
Detlev Offenbach <detlev@die-offenbachs.de>
parents: 3537
diff changeset
def description_of(lines, name='stdin'):
    """
    Return a string describing the probable encoding of a file or
    list of strings.

    Each item of ``lines`` is fed to a fresh ``UniversalDetector``; feeding
    stops as soon as the detector reports that it is done.

    :param lines: The lines to get the encoding of.
    :type lines: Iterable of bytes
    :param name: Name of file or collection of lines
    :type name: str
    :returns: ``'<name>: <encoding> with confidence <confidence>'`` when an
        encoding was detected, otherwise ``'<name>: no result'``.
    :rtype: str
    """
    detector = UniversalDetector()
    for line in lines:
        detector.feed(line)
        # No need to keep reading the input once the detector has made up
        # its mind; this matters for large files and for piped stdin.
        if detector.done:
            break
    detector.close()

    result = detector.result
    encoding = result['encoding']
    if not encoding:
        return '{0}: no result'.format(name)
    return '{0}: {1} with confidence {2}'.format(name, encoding,
                                                 result['confidence'])
3537
7662053c3906 updated CharDet to 2.2.1, updated changelog
T.Rzepka <Tobias.Rzepka@gmail.com>
parents:
diff changeset
46
7662053c3906 updated CharDet to 2.2.1, updated changelog
T.Rzepka <Tobias.Rzepka@gmail.com>
parents:
diff changeset
47
5310
f2b774d78b4a Updated chardet to version 2.3.0.
Detlev Offenbach <detlev@die-offenbachs.de>
parents: 3537
diff changeset
def main(argv=None):
    """
    Handles command line arguments and gets things started.

    Prints one description line (see ``description_of``) per input to
    stdout. When no paths are given, standard input is read instead.

    :param argv: List of arguments, as if specified on the command-line.
                 If None, ``sys.argv[1:]`` is used instead.
    :type argv: list of str
    """
    # The detector is fed bytes (explicit files are opened with 'rb'), so
    # the stdin fallback must be binary too. On Python 3 ``sys.stdin`` is a
    # text stream and the raw bytes live in ``sys.stdin.buffer``; on
    # Python 2 there is no ``buffer`` attribute and ``sys.stdin`` is used.
    stdin_bytes = getattr(sys.stdin, 'buffer', sys.stdin)

    # Get command line arguments
    parser = argparse.ArgumentParser(
        # Adjacent literals instead of a backslash continuation inside the
        # string, which would embed the source indentation in the text.
        description="Takes one or more file paths and reports their "
                    "detected encodings",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        conflict_handler='resolve')
    parser.add_argument('input',
                        help='File whose encoding we would like to determine.',
                        type=argparse.FileType('rb'), nargs='*',
                        default=[stdin_bytes])
    parser.add_argument('--version', action='version',
                        version='%(prog)s {0}'.format(__version__))
    args = parser.parse_args(argv)

    for f in args.input:
        try:
            if f.isatty():
                print("You are running chardetect interactively. Press "
                      "CTRL-D twice at the start of a blank line to signal the "
                      "end of your input. If you want help, run chardetect "
                      "--help\n", file=sys.stderr)
            print(description_of(f, f.name))
        finally:
            # Release files opened by argparse.FileType, but never close the
            # process-wide standard input stream.
            if f is not stdin_bytes and f is not sys.stdin:
                f.close()
3537
7662053c3906 updated CharDet to 2.2.1, updated changelog
T.Rzepka <Tobias.Rzepka@gmail.com>
parents:
diff changeset
77
7662053c3906 updated CharDet to 2.2.1, updated changelog
T.Rzepka <Tobias.Rzepka@gmail.com>
parents:
diff changeset
78
7662053c3906 updated CharDet to 2.2.1, updated changelog
T.Rzepka <Tobias.Rzepka@gmail.com>
parents:
diff changeset
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()

eric ide

mercurial