|
1 # -*- coding: utf-8 -*- |
|
2 |
|
3 # Copyright (c) 2011 Detlev Offenbach <detlev@die-offenbachs.de> |
|
4 # |
|
5 |
|
6 """ |
|
7 Module implementing tool functions. |
|
8 """ |
|
9 |
|
10 import re |
|
11 from codecs import BOM_UTF8, BOM_UTF16, BOM_UTF32 |
|
12 |
|
# Patterns used to sniff a declared encoding from the head of a text.
# Each entry is a tuple of (number of leading lines to inspect, compiled
# pattern); group 1 of a match is the encoding name.
coding_regexps = [
    # "coding:" / "coding=" declaration, searched in the first two lines.
    (2, re.compile(r'''coding[:=]\s*([-\w_.]+)''')),
    # XML declaration, searched in the first line only.  Whitespace or a
    # standalone="..." pseudo-attribute may follow the encoding before the
    # closing "?>", so arbitrary text is accepted between the closing quote
    # and "?>" instead of requiring "?>" right after the quote.
    (1, re.compile(
        r'''<\?xml.*\bencoding\s*=\s*['"]([-\w_.]+)['"].*\?>''')),
]
|
17 |
|
def get_coding(text):
    """
    Function to determine the coding declared in a text.

    The entries of coding_regexps are tried in order, each one against the
    number of leading lines it specifies; the first match wins.

    @param text text to inspect (string)
    @return lower-cased coding string or None, if no declaration was found
    """
    all_lines = text.splitlines()
    for line_count, pattern in coding_regexps:
        # only the first line_count lines may carry this kind of declaration
        for candidate in all_lines[:line_count]:
            match = pattern.search(candidate)
            if match is not None:
                return match.group(1).lower()
    return None
|
34 |
|
def decode(text):
    """
    Function to decode a text.

    The encoding is determined in this order: a leading byte order mark
    (UTF-8, UTF-32, UTF-16), a coding declaration found by get_coding(),
    a guess of UTF-8 and finally Latin-1 as the last resort.

    @param text text to decode (string)
    @return tuple of decoded text and encoding; the encoding is one of
        'utf-8-bom', 'utf-32', 'utf-16', the declared coding,
        'utf-8-guessed' or 'latin-1-guessed'
    """
    # Function scope import of the byte order specific BOMs; the module
    # level import only provides the native byte order variants.
    from codecs import BOM_UTF16_BE, BOM_UTF16_LE, BOM_UTF32_BE, BOM_UTF32_LE

    # the unicode text type, used as the decoding constructor
    to_unicode = type(u"")

    try:
        if text.startswith(BOM_UTF8):
            # UTF-8 with BOM
            return to_unicode(text[len(BOM_UTF8):], 'utf-8'), 'utf-8-bom'
        elif text.startswith((BOM_UTF32_LE, BOM_UTF32_BE)):
            # UTF-32 with BOM (either byte order).  This must be tested
            # before UTF-16 because the little endian UTF-32 BOM starts
            # with the little endian UTF-16 BOM.  The BOM is left in place
            # so that the codec selects the byte order and strips it.
            return to_unicode(text, 'utf-32'), 'utf-32'
        elif text.startswith((BOM_UTF16_LE, BOM_UTF16_BE)):
            # UTF-16 with BOM (either byte order); the codec evaluates and
            # strips the BOM.
            return to_unicode(text, 'utf-16'), 'utf-16'
        coding = get_coding(text)
        if coding:
            return to_unicode(text, coding), coding
    except (UnicodeError, LookupError):
        # undecodable data or unknown codec name; fall back to guessing
        pass

    # Assume UTF-8
    try:
        return to_unicode(text, 'utf-8'), 'utf-8-guessed'
    except (UnicodeError, LookupError):
        pass

    # Assume Latin-1 (behaviour before 3.7.1)
    return to_unicode(text, "latin-1"), 'latin-1-guessed'
|
66 |
|
def readEncodedFile(filename):
    """
    Function to read a file and decode its contents into proper text.

    @param filename name of the file to read (string)
    @return tuple of decoded text and encoding (string, string)
    """
    # Binary mode hands the raw bytes to decode(); text mode may apply
    # platform specific newline translation, which damages UTF-16 and
    # UTF-32 data.  The with statement closes the file even if reading
    # raises an exception.
    with open(filename, "rb") as f:
        text = f.read()
    return decode(text)
|
78 |
|
def normalizeCode(codestring):
    """
    Function to normalize the given code.

    Unicode text is encoded as UTF-8, all line endings are converted to
    line feeds and a non-empty result is terminated by a line feed.

    @param codestring code to be normalized (string)
    @return normalized code (string); always a byte string
    """
    if isinstance(codestring, type(u"")):
        codestring = codestring.encode('utf-8')
    # Byte literals are ordinary strings on Python 2; using them keeps the
    # replacements valid for byte strings on Python 3 as well.
    codestring = codestring.replace(b"\r\n", b"\n").replace(b"\r", b"\n")

    # endswith() is used instead of indexing because indexing a byte string
    # yields an integer on Python 3.
    if codestring and not codestring.endswith(b"\n"):
        codestring = codestring + b"\n"

    return codestring
|
94 |
|
95 # |
|
96 # eflag: FileType = Python2 |