eric6/ThirdParty/CharDet/chardet/cli/chardetect.py

Wed, 13 Jan 2021 19:05:48 +0100

author
Detlev Offenbach <detlev@die-offenbachs.de>
date
Wed, 13 Jan 2021 19:05:48 +0100
changeset 7974
f425b578ede7
parent 6942
2602857055c5
permissions
-rw-r--r--

Third Party Packages
- updated chardet to 4.0.0

5714
90c57b50600f Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff changeset
1 """
90c57b50600f Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff changeset
2 Script which takes one or more file paths and reports on their detected
90c57b50600f Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff changeset
3 encodings
90c57b50600f Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff changeset
4
90c57b50600f Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff changeset
5 Example::
90c57b50600f Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff changeset
6
90c57b50600f Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff changeset
7 % chardetect somefile someotherfile
90c57b50600f Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff changeset
8 somefile: windows-1252 with confidence 0.5
90c57b50600f Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff changeset
9 someotherfile: ascii with confidence 1.0
90c57b50600f Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff changeset
10
90c57b50600f Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff changeset
11 If no paths are provided, it takes its input from stdin.
90c57b50600f Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff changeset
12
90c57b50600f Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff changeset
13 """
90c57b50600f Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff changeset
14
90c57b50600f Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff changeset
15 from __future__ import absolute_import, print_function, unicode_literals
90c57b50600f Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff changeset
16
90c57b50600f Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff changeset
17 import argparse
90c57b50600f Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff changeset
18 import sys
90c57b50600f Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff changeset
19
90c57b50600f Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff changeset
20 from chardet import __version__
7974
f425b578ede7 Third Party Packages
Detlev Offenbach <detlev@die-offenbachs.de>
parents: 6942
diff changeset
21 from chardet.compat import PY2
5714
90c57b50600f Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff changeset
22 from chardet.universaldetector import UniversalDetector
90c57b50600f Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff changeset
23
90c57b50600f Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff changeset
24
90c57b50600f Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff changeset
def description_of(lines, name='stdin'):
    """
    Build a one-line, human-readable summary of the encoding detected for
    a file or a list of byte strings.

    Each item of ``lines`` is converted to a ``bytearray`` and fed to a
    fresh ``UniversalDetector``; feeding stops as soon as the detector
    reports that it is done.

    :param lines: The lines to get the encoding of.
    :type lines: Iterable of bytes
    :param name: Name of file or collection of lines
    :type name: str
    :returns: ``'<name>: <encoding> with confidence <confidence>'`` when an
              encoding was detected, otherwise ``'<name>: no result'``.
    """
    detector = UniversalDetector()
    for chunk in lines:
        detector.feed(bytearray(chunk))
        # Stop reading once the detector has made up its mind; this is
        # particularly useful when a BOM was read.
        if detector.done:
            break
    detector.close()
    verdict = detector.result

    if PY2:
        # Turn the name into text, dropping any bytes that cannot be
        # decoded with the file system encoding.
        name = name.decode(sys.getfilesystemencoding(), 'ignore')

    if not verdict['encoding']:
        return '{}: no result'.format(name)
    return '{}: {} with confidence {}'.format(
        name, verdict['encoding'], verdict['confidence'])
5714
90c57b50600f Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff changeset
51
90c57b50600f Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff changeset
52
90c57b50600f Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff changeset
def main(argv=None):
    """
    Handles command line arguments and gets things started.

    Every input is passed to ``description_of`` and the resulting summary
    is printed to standard output. When an input is a terminal, a short
    hint on how to end interactive input is written to standard error
    first. File handles opened while parsing the arguments are closed once
    they have been processed; the standard input stream is left open.

    :param argv: List of arguments, as if specified on the command-line.
                 If None, ``sys.argv[1:]`` is used instead.
    :type argv: list of str
    """
    # Get command line arguments
    parser = argparse.ArgumentParser(
        description="Takes one or more file paths and reports their "
                    "detected encodings")
    # Detection works on bytes, so on Python 3 the binary layer of stdin
    # is used as the default input.
    default_input = sys.stdin if PY2 else sys.stdin.buffer
    parser.add_argument('input',
                        help='File whose encoding we would like to '
                             'determine. (default: stdin)',
                        type=argparse.FileType('rb'), nargs='*',
                        default=[default_input])
    parser.add_argument('--version', action='version',
                        version='%(prog)s {}'.format(__version__))
    args = parser.parse_args(argv)

    # The standard input stream (text or binary layer) belongs to the
    # process and must not be closed here.
    stdin_streams = (sys.stdin, getattr(sys.stdin, 'buffer', None))

    for f in args.input:
        try:
            if f.isatty():
                print("You are running chardetect interactively. Press "
                      "CTRL-D twice at the start of a blank line to signal "
                      "the end of your input. If you want help, run "
                      "chardetect --help\n", file=sys.stderr)
            print(description_of(f, f.name))
        finally:
            # argparse.FileType opened this handle for us, so release it
            # as soon as it has been processed.
            if not any(f is stream for stream in stdin_streams):
                f.close()
90c57b50600f Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff changeset
81
90c57b50600f Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff changeset
82
90c57b50600f Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff changeset
83 if __name__ == '__main__':
90c57b50600f Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff changeset
84 main()

eric ide

mercurial