eric6/ThirdParty/CharDet/chardet/cli/chardetect.py

Wed, 13 Jul 2022 15:34:50 +0200

author
Detlev Offenbach <detlev@die-offenbachs.de>
date
Wed, 13 Jul 2022 15:34:50 +0200
branch
with_python2
changeset 9225
bf799f79455c
parent 6942
2602857055c5
child 7974
f425b578ede7
permissions
-rw-r--r--

Revisions <no_multi_processing, Variables Viewer, with_python2> closed.

5714
90c57b50600f Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff changeset
1 #!/usr/bin/env python
90c57b50600f Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff changeset
2 """
90c57b50600f Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff changeset
3 Script which takes one or more file paths and reports on their detected
90c57b50600f Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff changeset
4 encodings
90c57b50600f Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff changeset
5
90c57b50600f Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff changeset
6 Example::
90c57b50600f Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff changeset
7
90c57b50600f Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff changeset
8 % chardetect somefile someotherfile
90c57b50600f Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff changeset
9 somefile: windows-1252 with confidence 0.5
90c57b50600f Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff changeset
10 someotherfile: ascii with confidence 1.0
90c57b50600f Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff changeset
11
90c57b50600f Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff changeset
12 If no paths are provided, it takes its input from stdin.
90c57b50600f Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff changeset
13
90c57b50600f Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff changeset
14 """
90c57b50600f Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff changeset
15
90c57b50600f Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff changeset
16 from __future__ import absolute_import, print_function, unicode_literals
90c57b50600f Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff changeset
17
90c57b50600f Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff changeset
18 import argparse
90c57b50600f Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff changeset
19 import sys
90c57b50600f Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff changeset
20 from io import open
90c57b50600f Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff changeset
21
90c57b50600f Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff changeset
22 from chardet import __version__
90c57b50600f Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff changeset
23 from chardet.universaldetector import UniversalDetector
90c57b50600f Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff changeset
24
90c57b50600f Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff changeset
25
90c57b50600f Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff changeset
def description_of(lines, name='stdin'):
    """
    Return a string describing the probable encoding of a file or
    list of strings.

    Each item of ``lines`` is fed to a fresh ``UniversalDetector``. Feeding
    stops as soon as the detector reports that it is done, so the remainder
    of a large input does not have to be read.

    :param lines: The lines to get the encoding of.
    :type lines: Iterable of bytes
    :param name: Name of file or collection of lines
    :type name: str
    :return: ``'<name>: <encoding> with confidence <confidence>'`` if an
             encoding was detected, ``'<name>: no result'`` otherwise.
    :rtype: str
    """
    detector = UniversalDetector()
    for line in lines:
        detector.feed(line)
        # chardet's documented incremental usage: once ``done`` is set the
        # detector has reached its verdict, so stop consuming input.
        if detector.done:
            break
    # close() finalises the detection and must be called before reading
    # the result.
    detector.close()
    result = detector.result
    if result['encoding']:
        return '{0}: {1} with confidence {2}'.format(
            name, result['encoding'], result['confidence'])
    return '{0}: no result'.format(name)
90c57b50600f Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff changeset
46
90c57b50600f Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff changeset
47
90c57b50600f Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff changeset
def main(argv=None):
    '''
    Handles command line arguments and gets things started.

    Every input file is opened in binary mode, passed to ``description_of``
    and the resulting description is printed to stdout. If no paths are
    given, standard input is read instead.

    :param argv: List of arguments, as if specified on the command-line.
                 If None, ``sys.argv[1:]`` is used instead.
    :type argv: list of str
    '''
    # The detector works on bytes. On Python 3 ``sys.stdin`` is a text
    # stream and its binary counterpart is ``sys.stdin.buffer``; Python 2's
    # ``sys.stdin`` has no ``buffer`` attribute and already yields bytes.
    stdin = getattr(sys.stdin, 'buffer', sys.stdin)

    # Get command line arguments
    parser = argparse.ArgumentParser(
        description="Takes one or more file paths and reports their "
                    "detected encodings",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        conflict_handler='resolve')
    parser.add_argument('input',
                        help='File whose encoding we would like to determine.',
                        type=argparse.FileType('rb'), nargs='*',
                        default=[stdin])
    parser.add_argument('--version', action='version',
                        version='%(prog)s {0}'.format(__version__))
    args = parser.parse_args(argv)

    for f in args.input:
        try:
            if f.isatty():
                print("You are running chardetect interactively. Press " +
                      "CTRL-D twice at the start of a blank line to signal the " +
                      "end of your input. If you want help, run chardetect " +
                      "--help\n", file=sys.stderr)
            print(description_of(f, f.name))
        finally:
            # Release files opened by argparse, but leave the process-wide
            # standard input stream open for the caller.
            if f is not stdin and f is not sys.stdin:
                f.close()
90c57b50600f Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff changeset
77
90c57b50600f Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff changeset
78
90c57b50600f Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff changeset
# Start the command line interface only when this module is executed as a
# script; importing it must not trigger any detection run.
if __name__ == '__main__':
    main()

eric ide

mercurial