# -*- coding: utf-8 -*-

# Copyright (c) 2011 Detlev Offenbach <detlev@die-offenbachs.de>
#
# File: UtilitiesPython2/Tools.py
# (revision 83ca4d1ff648, Tue Jan 04 17:37:48 2011 +0100)
#

"""
Module implementing tool functions.
"""

import re
from codecs import BOM_UTF8, BOM_UTF16, BOM_UTF32
from codecs import BOM_UTF16_BE, BOM_UTF16_LE, BOM_UTF32_BE, BOM_UTF32_LE

# Each entry is (number of leading lines to inspect, compiled pattern).
# The first pattern matches a "coding:" / "coding=" declaration, the second
# one matches the encoding attribute of an XML declaration.
coding_regexps = [
    (2, re.compile(r'''coding[:=]\s*([-\w_.]+)''')),
    (1, re.compile(r'''<\?xml.*\bencoding\s*=\s*['"]([-\w_.]+)['"]\?>''')),
]


def get_coding(text):
    """
    Function to get the coding of a text.

    The patterns of coding_regexps are tried in order, each one against
    the number of leading lines configured for it.

    @param text text to inspect (string)
    @return coding string in lower case or None, if no coding declaration
        was found
    """
    lines = text.splitlines()
    for line_count, coding_re in coding_regexps:
        for line in lines[:line_count]:
            match = coding_re.search(line)
            if match:
                return match.group(1).lower()
    return None


def decode(text):
    """
    Function to decode a text.

    The encoding is determined from a byte order mark, then from a coding
    declaration (see get_coding). If neither yields a successful decoding,
    UTF-8 and finally Latin-1 are assumed.

    @param text text to decode (string)
    @return decoded text and encoding; the encoding is one of 'utf-8-bom',
        'utf-16', 'utf-32', the declared coding, 'utf-8-guessed' or
        'latin-1-guessed'
    """
    try:
        if text.startswith(BOM_UTF8):
            # UTF-8 with BOM
            return unicode(text[len(BOM_UTF8):], 'utf-8'), 'utf-8-bom'
        # UTF-32 has to be tested before UTF-16 because BOM_UTF32_LE starts
        # with the bytes of BOM_UTF16_LE. Both byte orders are tested
        # explicitly; BOM_UTF16 / BOM_UTF32 only cover the native one.
        if text.startswith((BOM_UTF32_LE, BOM_UTF32_BE)):
            try:
                # UTF-32 with BOM; the codec consumes the BOM and takes
                # the byte order from it
                return unicode(text, 'utf-32'), 'utf-32'
            except UnicodeError:
                # FF FE 00 00 may as well be a UTF-16 LE BOM followed by
                # a NUL character, so give UTF-16 a chance below.
                pass
        if text.startswith((BOM_UTF16_LE, BOM_UTF16_BE)):
            # UTF-16 with BOM; the codec consumes the BOM and takes
            # the byte order from it
            return unicode(text, 'utf-16'), 'utf-16'
        coding = get_coding(text)
        if coding:
            return unicode(text, coding), coding
    except (UnicodeError, LookupError):
        # undecodable data or unknown codec name; fall back to guessing
        pass

    # Assume UTF-8
    try:
        return unicode(text, 'utf-8'), 'utf-8-guessed'
    except (UnicodeError, LookupError):
        pass

    # Assume Latin-1 (behaviour before 3.7.1)
    return unicode(text, "latin-1"), 'latin-1-guessed'


def readEncodedFile(filename):
    """
    Function to read a file and decode its contents into proper text.

    @param filename name of the file to read (string)
    @return tuple of decoded text and encoding (string, string)
    """
    # NOTE(review): the file is opened in the default (text) mode as before;
    # binary mode may be more appropriate for BOM detection on platforms
    # that translate line endings - confirm against the callers.
    with open(filename) as f:
        # the with statement closes the file even if reading fails
        text = f.read()
    return decode(text)


def normalizeCode(codestring):
    """
    Function to normalize the given code.

    Unicode input is encoded as UTF-8, all line endings are converted to
    a line feed and a non-empty result is terminated by a line feed.

    @param codestring code to be normalized (string)
    @return normalized code (string)
    """
    if isinstance(codestring, unicode):
        codestring = codestring.encode('utf-8')
    codestring = codestring.replace("\r\n", "\n").replace("\r", "\n")

    if codestring and codestring[-1] != '\n':
        codestring = codestring + '\n'

    return codestring

#
# eflag: FileType = Python2