# Tables used to find an encoding declaration near the top of a file.
#
# Each entry is a tuple (n, pattern): the pattern is searched in the first n
# lines only and group 1 of a match is the declared encoding name.
# codingBytes_regexps works on raw bytes (before decoding), coding_regexps on
# already decoded text; both tables must be kept in sync.
codingBytes_regexps = [
    # "coding: xxx" / "coding=xxx" declaration within the first two lines
    (2, re.compile(br'''coding[:=]\s*([-\w_.]+)''')),
    # XML declaration on the first line; anything may follow the encoding
    # value before the closing "?>" (e.g. standalone="yes" or white space)
    (1, re.compile(br'''<\?xml.*\bencoding\s*=\s*['"]([-\w_.]+)['"].*?\?>''')),
]
coding_regexps = [
    # "coding: xxx" / "coding=xxx" declaration within the first two lines
    (2, re.compile(r'''coding[:=]\s*([-\w_.]+)''')),
    # XML declaration on the first line; anything may follow the encoding
    # value before the closing "?>" (e.g. standalone="yes" or white space)
    (1, re.compile(r'''<\?xml.*\bencoding\s*=\s*['"]([-\w_.]+)['"].*?\?>''')),
]
-## -## @param text text to inspect (string) -## @return coding string -## """ -## lines = text.splitlines() -## for coding in coding_regexps: -## coding_re = coding[1] -## head = lines[:coding[0]] -## for l in head: -## m = coding_re.search(l) -## if m: -## return m.group(1).lower() -## return None -## -##def decode(text): -## """ -## Function to decode a text. -## -## @param text text to decode (string) -## @return decoded text and encoding -## """ -## try: -## if text.startswith(BOM_UTF8): -## # UTF-8 with BOM -## return str(text[len(BOM_UTF8):], 'utf-8'), 'utf-8-bom' -## elif text.startswith(BOM_UTF16): -## # UTF-16 with BOM -## return str(text[len(BOM_UTF16):], 'utf-16'), 'utf-16' -## elif text.startswith(BOM_UTF32): -## # UTF-32 with BOM -## return str(text[len(BOM_UTF32):], 'utf-32'), 'utf-32' -## coding = get_coding(text) -## if coding: -## coding = coding.decode() -## return text.decode(coding), coding -## except (UnicodeError, LookupError): -## pass -## -## guess = None -## if Preferences.getEditor("AdvancedEncodingDetection"): -## # Try the universal character encoding detector -## try: -## import ThirdParty.CharDet.chardet -## guess = ThirdParty.CharDet.chardet.detect(text) -## if guess and guess['confidence'] > 0.95 and guess['encoding'] is not None: -## codec = guess['encoding'].lower() -## return str(text, codec), '%s-guessed' % codec -## except (UnicodeError, LookupError): -## pass -## except ImportError: -## pass -## -## # Try default encoding -## try: -## codec = Preferences.getEditor("DefaultEncoding") -## return str(text, codec), '%s-default' % codec -## except (UnicodeError, LookupError): -## pass -## -## # Assume UTF-8 -## try: -## return str(text, 'utf-8'), 'utf-8-guessed' -## except (UnicodeError, LookupError): -## pass -## -## if Preferences.getEditor("AdvancedEncodingDetection"): -## # Use the guessed one even if confifence level is low -## if guess and guess['encoding'] is not None: -## try: -## codec = guess['encoding'].lower() -## 
return str(text, codec), '%s-guessed' % codec -## except (UnicodeError, LookupError): -## pass -## -## # Assume Latin-1 (behaviour before 3.7.1) -## return str(text, "latin-1"), 'latin-1-guessed' -## -##def encode(text, orig_coding): -## """ -## Function to encode a text. -## -## @param text text to encode (string) -## @param orig_coding type of the original coding (string) -## @return encoded text and encoding -## """ -## if orig_coding == 'utf-8-bom': -## return BOM_UTF8 + text.encode("utf-8"), 'utf-8-bom' -## -## # Try declared coding spec -## coding = get_coding(text) -## if coding: -## try: -## return text.encode(coding), coding -## except (UnicodeError, LookupError): -## # Error: Declared encoding is incorrect -## raise CodingError(coding) -## -## if orig_coding and orig_coding.endswith('-selected'): -## coding = orig_coding.replace("-selected", "") -## try: -## return text.encode(coding), coding -## except (UnicodeError, LookupError): -## pass -## if orig_coding and orig_coding.endswith('-default'): -## coding = orig_coding.replace("-default", "") -## try: -## return text.encode(coding), coding -## except (UnicodeError, LookupError): -## pass -## if orig_coding and orig_coding.endswith('-guessed'): -## coding = orig_coding.replace("-guessed", "") -## try: -## return text.encode(coding), coding -## except (UnicodeError, LookupError): -## pass -## -## # Try configured default -## try: -## codec = Preferences.getEditor("DefaultEncoding") -## return text.encode(codec), codec -## except (UnicodeError, LookupError): -## pass -## -## # Try saving as ASCII -## try: -## return text.encode('ascii'), 'ascii' -## except UnicodeError: -## pass -## -## # Save as UTF-8 without BOM -## return text.encode('utf-8'), 'utf-8' -## -##def toUnicode(s): -## """ -## Public method to convert a string to unicode. -## -## Various codes are tried until one converts the string without an error. -## If all codecs fail, the string is returned unaltered. 
class CodingError(Exception):
    """
    Class implementing an exception, which is raised, if a given coding is incorrect.
    """
    def __init__(self, coding):
        """
        Constructor
        
        @param coding name of the coding that is wrong for the text (string)
        """
        # Pass on the same argument the base class received implicitly
        # before, so that the args attribute stays unchanged.
        super().__init__(coding)
        self.errorMessage = QApplication.translate("CodingError",
            "The coding '{0}' is wrong for the given text.").format(coding)
        
    def __repr__(self):
        """
        Private method returning a representation of the exception.
        
        @return string representing the error message
        """
        return str(self.errorMessage)
        
    def __str__(self):
        """
        Private method returning a string representation of the exception.
        
        @return string representing the error message
        """
        return str(self.errorMessage)
    
def get_codingBytes(text):
    """
    Function to get the coding of a bytes text.
    
    Only the leading lines given by the entries of codingBytes_regexps are 
    inspected.
    
    @param text bytes text to inspect (bytes)
    @return coding string in lower case or None, if no declaration was found
    """
    lines = text.splitlines()
    for headCount, coding_re in codingBytes_regexps:
        for line in lines[:headCount]:
            m = coding_re.search(line)
            if m:
                # the patterns capture ASCII characters only
                return str(m.group(1), "ascii").lower()
    return None

def get_coding(text):
    """
    Function to get the coding of a text.
    
    Only the leading lines given by the entries of coding_regexps are 
    inspected.
    
    @param text text to inspect (string)
    @return coding string in lower case or None, if no declaration was found
    """
    lines = text.splitlines()
    for headCount, coding_re in coding_regexps:
        for line in lines[:headCount]:
            m = coding_re.search(line)
            if m:
                return m.group(1).lower()
    return None

def readEncodedFile(filename):
    """
    Function to read a file and decode its contents into proper text.
    
    The encoding is determined by trying, in this order, a byte order mark, 
    a coding declaration inside the file, the character encoding detector 
    (if enabled in the preferences), the configured default encoding and 
    UTF-8. The last resort is UTF-8 ignoring undecodable bytes.
    
    @param filename name of the file to read (string)
    @return tuple of decoded text and encoding (string, string)
    """
    # the context manager closes the file even if reading fails
    with open(filename, "rb") as f:
        text = f.read()
    
    try:
        if text.startswith(BOM_UTF8):
            # UTF-8 with BOM
            return str(text[len(BOM_UTF8):], 'utf-8'), 'utf-8-bom'
        elif text.startswith(BOM_UTF32):
            # UTF-32 with BOM
            # This must be checked before UTF-16 because the little endian
            # UTF-32 BOM (FF FE 00 00) starts with the UTF-16 BOM (FF FE).
            return str(text[len(BOM_UTF32):], 'utf-32'), 'utf-32'
        elif text.startswith(BOM_UTF16):
            # UTF-16 with BOM
            return str(text[len(BOM_UTF16):], 'utf-16'), 'utf-16'
        coding = get_codingBytes(text)
        if coding:
            return str(text, coding), coding
    except (UnicodeError, LookupError):
        # BOM or declaration was misleading; fall back to guessing below
        pass
    
    guess = None
    if Preferences.getEditor("AdvancedEncodingDetection"):
        # Try the universal character encoding detector
        try:
            import ThirdParty.CharDet.chardet
            guess = ThirdParty.CharDet.chardet.detect(text)
            if guess and guess['confidence'] > 0.95 and guess['encoding'] is not None:
                codec = guess['encoding'].lower()
                return str(text, codec), '%s-guessed' % codec
        except (UnicodeError, LookupError):
            pass
        except ImportError:
            pass
    
    # Try default encoding
    try:
        codec = Preferences.getEditor("DefaultEncoding")
        return str(text, codec), '%s-default' % codec
    except (UnicodeError, LookupError):
        pass
    
    # Assume UTF-8
    try:
        return str(text, 'utf-8'), 'utf-8-guessed'
    except (UnicodeError, LookupError):
        pass
    
    if Preferences.getEditor("AdvancedEncodingDetection"):
        # Use the guessed one even if confidence level is low
        if guess and guess['encoding'] is not None:
            try:
                codec = guess['encoding'].lower()
                return str(text, codec), '%s-guessed' % codec
            except (UnicodeError, LookupError):
                pass
    
    # Assume UTF-8 losing information
    return str(text, "utf-8", "ignore"), 'utf-8-ignore'

def writeEncodedFile(filename, text, orig_coding):
    """
    Function to write a file with properly encoded text.
    
    The encoding is chosen by trying, in this order, the UTF-8 BOM marker of 
    the original encoding, a coding declaration inside the text, the original 
    encoding (with its detection suffix removed), the configured default 
    encoding, ASCII and finally UTF-8 without BOM.
    
    @param filename name of the file to write (string)
    @param text text to be written (string)
    @param orig_coding type of the original encoding (string)
    @return encoding used for writing the file (string)
    @exception CodingError raised, if the text cannot be encoded with the 
        coding declared inside the text
    """
    detectionSuffixes = ('-selected', '-default', '-guessed', '-ignore')
    
    encoding = None
    if orig_coding == 'utf-8-bom':
        etext, encoding = BOM_UTF8 + text.encode("utf-8"), 'utf-8-bom'
    else:
        # Try declared coding spec
        coding = get_coding(text)
        if coding:
            try:
                etext, encoding = text.encode(coding), coding
            except (UnicodeError, LookupError):
                # Error: Declared encoding is incorrect
                raise CodingError(coding)
        else:
            if orig_coding and orig_coding.endswith(detectionSuffixes):
                # strip the markers added by readEncodedFile
                coding = orig_coding
                for suffix in detectionSuffixes:
                    coding = coding.replace(suffix, "")
                try:
                    etext, encoding = text.encode(coding), coding
                except (UnicodeError, LookupError):
                    pass
            
            if encoding is None:
                # Try configured default
                try:
                    codec = Preferences.getEditor("DefaultEncoding")
                    etext, encoding = text.encode(codec), codec
                except (UnicodeError, LookupError):
                    pass
                
                if encoding is None:
                    # Try saving as ASCII
                    try:
                        etext, encoding = text.encode('ascii'), 'ascii'
                    except UnicodeError:
                        pass
                    
                    if encoding is None:
                        # Save as UTF-8 without BOM
                        etext, encoding = text.encode('utf-8'), 'utf-8'
    
    # The file is opened only after encoding succeeded, so a failure above
    # never truncates an existing file.
    with open(filename, "wb") as f:
        f.write(etext)
    
    return encoding
@@ codestring = codestring + '\n' try: -## if isinstance(file, type("")): -## file = file.encode('utf-8') -## if file.endswith('.ptl'): try: import quixote.ptl_compile