Wed, 13 Jan 2021 19:05:48 +0100
Third Party Packages
- updated chardet to 4.0.0
5714
90c57b50600f
Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
1 | """ |
90c57b50600f
Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
2 | Script which takes one or more file paths and reports on their detected |
90c57b50600f
Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
3 | encodings |
90c57b50600f
Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
4 | |
90c57b50600f
Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
5 | Example:: |
90c57b50600f
Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
6 | |
90c57b50600f
Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
7 | % chardetect somefile someotherfile |
90c57b50600f
Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
8 | somefile: windows-1252 with confidence 0.5 |
90c57b50600f
Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
9 | someotherfile: ascii with confidence 1.0 |
90c57b50600f
Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
10 | |
90c57b50600f
Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
11 | If no paths are provided, it takes its input from stdin. |
90c57b50600f
Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
12 | |
90c57b50600f
Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
13 | """ |
90c57b50600f
Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
14 | |
90c57b50600f
Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
15 | from __future__ import absolute_import, print_function, unicode_literals |
90c57b50600f
Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
16 | |
90c57b50600f
Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
17 | import argparse |
90c57b50600f
Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
18 | import sys |
90c57b50600f
Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
19 | |
90c57b50600f
Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
20 | from chardet import __version__ |
7974
f425b578ede7
Third Party Packages
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
6942
diff
changeset
|
21 | from chardet.compat import PY2 |
5714
90c57b50600f
Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
22 | from chardet.universaldetector import UniversalDetector |
90c57b50600f
Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
23 | |
90c57b50600f
Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
24 | |
90c57b50600f
Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
def description_of(lines, name='stdin'):
    """
    Describe the most probable encoding of a file or list of byte strings.

    Every item of ``lines`` is fed to a ``UniversalDetector`` until the
    input is exhausted or the detector reports that it is done.

    :param lines: The lines to get the encoding of.
    :type lines: Iterable of bytes
    :param name: Name of file or collection of lines
    :type name: str
    :return: ``'<name>: <encoding> with confidence <confidence>'`` when an
             encoding was detected, otherwise ``'<name>: no result'``.
    :rtype: str
    """
    detector = UniversalDetector()
    for chunk in lines:
        detector.feed(bytearray(chunk))
        # Stop early once the detector has made up its mind; this saves
        # reading the rest of the input, e.g. right after a BOM was seen.
        if detector.done:
            break
    detector.close()
    detected = detector.result

    if PY2:
        # Under Python 2 the name is decoded with the file system
        # encoding, dropping any bytes that cannot be decoded.
        name = name.decode(sys.getfilesystemencoding(), 'ignore')

    if not detected['encoding']:
        return '{}: no result'.format(name)
    return '{}: {} with confidence {}'.format(name, detected['encoding'],
                                              detected['confidence'])
5714
90c57b50600f
Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
51 | |
90c57b50600f
Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
52 | |
90c57b50600f
Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
def main(argv=None):
    """
    Handles command line arguments and gets things started.

    Each input file is opened in binary mode by argparse, passed to
    ``description_of`` and the resulting description is printed to
    stdout. Files opened for this purpose are closed again once they
    have been processed; the standard input stream is left open.

    :param argv: List of arguments, as if specified on the command-line.
                 If None, ``sys.argv[1:]`` is used instead.
    :type argv: list of str
    """
    # Binary stdin stream used when no paths are given on the command line.
    stdin_stream = sys.stdin if PY2 else sys.stdin.buffer

    # Get command line arguments
    # NOTE: the help texts use implicit string concatenation rather than a
    # backslash continuation inside the literal, so that the indentation of
    # the source code does not end up inside the strings.
    parser = argparse.ArgumentParser(
        description="Takes one or more file paths and reports their detected "
                    "encodings")
    parser.add_argument('input',
                        help='File whose encoding we would like to determine. '
                             '(default: stdin)',
                        type=argparse.FileType('rb'), nargs='*',
                        default=[stdin_stream])
    parser.add_argument('--version', action='version',
                        version='%(prog)s {}'.format(__version__))
    args = parser.parse_args(argv)

    for f in args.input:
        try:
            if f.isatty():
                print("You are running chardetect interactively. Press "
                      "CTRL-D twice at the start of a blank line to signal the "
                      "end of your input. If you want help, run chardetect "
                      "--help\n", file=sys.stderr)
            print(description_of(f, f.name))
        finally:
            # argparse.FileType opens the files but never closes them, so
            # release the handle here. The process-wide stdin stream
            # (also returned by FileType for '-') must stay open.
            if f is not stdin_stream and f is not sys.stdin:
                f.close()
90c57b50600f
Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
81 | |
90c57b50600f
Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
82 | |
90c57b50600f
Updated chardet to 3.0.2.
Detlev Offenbach <detlev@die-offenbachs.de>
parents:
diff
changeset
|
# Script entry point: run the command line interface, which reads its
# arguments from sys.argv when none are passed explicitly.
if __name__ == "__main__":
    main()