Utilities/__init__.py

changeset 45
9a18f4dbb493
parent 41
572a009369f0
child 46
b09750fd2a06
--- a/Utilities/__init__.py	Sun Jan 10 13:59:15 2010 +0000
+++ b/Utilities/__init__.py	Sun Jan 10 19:19:52 2010 +0000
@@ -12,7 +12,6 @@
 import re
 import fnmatch
 import glob
-##from types import UnicodeType
 import random
 import base64
 
@@ -36,10 +35,15 @@
 
 configDir = None
 
-##coding_regexps = [
-##    (2, re.compile(br'''coding[:=]\s*([-\w_.]+)''')), 
-##    (1, re.compile(br'''<\?xml.*\bencoding\s*=\s*['"]([-\w_.]+)['"]\?>''')), 
-##]
+codingBytes_regexps = [  # (number of leading lines to search, pattern) for bytes input
+    (2, re.compile(br'''coding[:=]\s*([-\w_.]+)''')), 
+    (1, re.compile(br'''<\?xml.*\bencoding\s*=\s*['"]([-\w_.]+)['"]\?>''')), 
+]
+coding_regexps = [  # same two entries, compiled as str patterns for decoded text
+    (2, re.compile(r'''coding[:=]\s*([-\w_.]+)''')), 
+    (1, re.compile(r'''<\?xml.*\bencoding\s*=\s*['"]([-\w_.]+)['"]\?>''')), 
+]
+
 supportedCodecs = ['utf-8', 
           'iso8859-1', 'iso8859-15', 'iso8859-2', 'iso8859-3', 
           'iso8859-4', 'iso8859-5', 'iso8859-6', 'iso8859-7', 
@@ -57,193 +61,191 @@
           'gb2312', 'gb18030', 
           'ascii']
 
-##class CodingError(Exception):
-##    """
-##    Class implementing an exception, which is raised, if a given coding is incorrect.
-##    """
-##    def __init__(self, coding):
-##        """
-##        Constructor
-##        """
-##        self.errorMessage = QApplication.translate("CodingError", 
-##            "The coding '{0}' is wrong for the given text.").format(coding)
-##        
-##    def __repr__(self):
-##        """
-##        Private method returning a representation of the exception.
-##        
-##        @return string representing the error message
-##        """
-##        return str(self.errorMessage)
-##        
-##    def __str__(self):
-##        """
-##        Private method returning a string representation of the exception.
-##        
-##        @return string representing the error message
-##        """
-##        return str(self.errorMessage)
-##    
-##def get_coding(text):
-##    """
-##    Function to get the coding of a text.
-##    
-##    @param text text to inspect (string)
-##    @return coding string
-##    """
-##    lines = text.splitlines()
-##    for coding in coding_regexps:
-##        coding_re = coding[1]
-##        head = lines[:coding[0]]
-##        for l in head:
-##            m = coding_re.search(l)
-##            if m:
-##                return m.group(1).lower()
-##    return None
-##
-##def decode(text):
-##    """
-##    Function to decode a text.
-##    
-##    @param text text to decode (string)
-##    @return decoded text and encoding
-##    """
-##    try:
-##        if text.startswith(BOM_UTF8):
-##            # UTF-8 with BOM
-##            return str(text[len(BOM_UTF8):], 'utf-8'), 'utf-8-bom'
-##        elif text.startswith(BOM_UTF16):
-##            # UTF-16 with BOM
-##            return str(text[len(BOM_UTF16):], 'utf-16'), 'utf-16'
-##        elif text.startswith(BOM_UTF32):
-##            # UTF-32 with BOM
-##            return str(text[len(BOM_UTF32):], 'utf-32'), 'utf-32'
-##        coding = get_coding(text)
-##        if coding:
-##            coding = coding.decode()
-##            return text.decode(coding), coding
-##    except (UnicodeError, LookupError):
-##        pass
-##    
-##    guess = None
-##    if Preferences.getEditor("AdvancedEncodingDetection"):
-##        # Try the universal character encoding detector
-##        try:
-##            import ThirdParty.CharDet.chardet
-##            guess = ThirdParty.CharDet.chardet.detect(text)
-##            if guess and guess['confidence'] > 0.95 and guess['encoding'] is not None:
-##                codec = guess['encoding'].lower()
-##                return str(text, codec), '%s-guessed' % codec
-##        except (UnicodeError, LookupError):
-##            pass
-##        except ImportError:
-##            pass
-##    
-##    # Try default encoding
-##    try:
-##        codec = Preferences.getEditor("DefaultEncoding")
-##        return str(text, codec), '%s-default' % codec
-##    except (UnicodeError, LookupError):
-##        pass
-##    
-##    # Assume UTF-8
-##    try:
-##        return str(text, 'utf-8'), 'utf-8-guessed'
-##    except (UnicodeError, LookupError):
-##        pass
-##    
-##    if Preferences.getEditor("AdvancedEncodingDetection"):
-##        # Use the guessed one even if confifence level is low
-##        if guess and guess['encoding'] is not None:
-##            try:
-##                codec = guess['encoding'].lower()
-##                return str(text, codec), '%s-guessed' % codec
-##            except (UnicodeError, LookupError):
-##                pass
-##    
-##    # Assume Latin-1 (behaviour before 3.7.1)
-##    return str(text, "latin-1"), 'latin-1-guessed'
-##
-##def encode(text, orig_coding):
-##    """
-##    Function to encode a text.
-##    
-##    @param text text to encode (string)
-##    @param orig_coding type of the original coding (string)
-##    @return encoded text and encoding
-##    """
-##    if orig_coding == 'utf-8-bom':
-##        return BOM_UTF8 + text.encode("utf-8"), 'utf-8-bom'
-##    
-##    # Try declared coding spec
-##    coding = get_coding(text)
-##    if coding:
-##        try:
-##            return text.encode(coding), coding
-##        except (UnicodeError, LookupError):
-##            # Error: Declared encoding is incorrect
-##            raise CodingError(coding)
-##    
-##    if orig_coding and orig_coding.endswith('-selected'):
-##        coding = orig_coding.replace("-selected", "")
-##        try:
-##            return text.encode(coding), coding
-##        except (UnicodeError, LookupError):
-##            pass
-##    if orig_coding and orig_coding.endswith('-default'):
-##        coding = orig_coding.replace("-default", "")
-##        try:
-##            return text.encode(coding), coding
-##        except (UnicodeError, LookupError):
-##            pass
-##    if orig_coding and orig_coding.endswith('-guessed'):
-##        coding = orig_coding.replace("-guessed", "")
-##        try:
-##            return text.encode(coding), coding
-##        except (UnicodeError, LookupError):
-##            pass
-##    
-##    # Try configured default
-##    try:
-##        codec = Preferences.getEditor("DefaultEncoding")
-##        return text.encode(codec), codec
-##    except (UnicodeError, LookupError):
-##        pass
-##    
-##    # Try saving as ASCII
-##    try:
-##        return text.encode('ascii'), 'ascii'
-##    except UnicodeError:
-##        pass
-##    
-##    # Save as UTF-8 without BOM
-##    return text.encode('utf-8'), 'utf-8'
-##    
-##def toUnicode(s):
-##    """
-##    Public method to convert a string to unicode.
-##    
-##    Various codes are tried until one converts the string without an error.
-##    If all codecs fail, the string is returned unaltered.
-##    
-##    @param s string to be converted (string)
-##    @return converted string (unicode)
-##    """
-##    if isinstance(s, str):
-##        return s
-##    
-##    for codec in supportedCodecs:
-##        try:
-##            u = str(s, codec)
-##            return u
-##        except UnicodeError:
-##            pass
-##        except TypeError:
-##            break
-##    
-##    # we didn't succeed
-##    return s
-##    
+class CodingError(Exception):
+    """
+    Class implementing an exception, which is raised if a given coding is incorrect.
+    """
+    def __init__(self, coding):
+        """
+        Constructor building the error message (via QApplication.translate) for coding.
+        """
+        self.errorMessage = QApplication.translate("CodingError", 
+            "The coding '{0}' is wrong for the given text.").format(coding)
+        
+    def __repr__(self):
+        """
+        Special method returning a representation of the exception.
+        
+        @return string representing the error message
+        """
+        return str(self.errorMessage)
+        
+    def __str__(self):
+        """
+        Special method returning a string representation of the exception.
+        
+        @return string representing the error message
+        """
+        return str(self.errorMessage)
+    
+def get_codingBytes(text):
+    """
+    Function to get the coding declared in the leading lines of a bytes text.
+    
+    @param text bytes text to inspect (bytes)
+    @return lower-cased coding name (string) or None, if no declaration matched
+    """
+    lines = text.splitlines()
+    for coding in codingBytes_regexps:
+        coding_re = coding[1]
+        head = lines[:coding[0]]  # only the first coding[0] lines are searched
+        for l in head:
+            m = coding_re.search(l)
+            if m:
+                return str(m.group(1), "ascii").lower()  # matched bytes -> str
+    return None
+
+def get_coding(text):
+    """
+    Function to get the coding declared in the leading lines of a text.
+    
+    @param text text to inspect (string)
+    @return lower-cased coding name (string) or None, if no declaration matched
+    """
+    lines = text.splitlines()
+    for coding in coding_regexps:
+        coding_re = coding[1]
+        head = lines[:coding[0]]  # only the first coding[0] lines are searched
+        for l in head:
+            m = coding_re.search(l)
+            if m:
+                return m.group(1).lower()
+    return None
+
+def readEncodedFile(filename):
+    """
+    Function to read a file and decode its contents into proper text.
+    
+    @param filename name of the file to read (string)
+    @return tuple of decoded text and encoding, which may carry a suffix like -guessed (string, string)
+    """
+    f = open(filename, "rb")  # NOTE(review): handle is not closed if read() raises
+    text = f.read()
+    f.close()
+    try:
+        if text.startswith(BOM_UTF8):
+            # UTF-8 with BOM
+            return str(text[len(BOM_UTF8):], 'utf-8'), 'utf-8-bom'
+        elif text.startswith(BOM_UTF16):
+            # UTF-16 with BOM
+            return str(text[len(BOM_UTF16):], 'utf-16'), 'utf-16'
+        elif text.startswith(BOM_UTF32):
+            # UTF-32 with BOM; NOTE(review): if these are the codecs.BOM_* constants, a little-endian UTF-32 BOM also matches the UTF-16 test above - confirm
+            return str(text[len(BOM_UTF32):], 'utf-32'), 'utf-32'
+        coding = get_codingBytes(text)
+        if coding:
+            return str(text, coding), coding
+    except (UnicodeError, LookupError):
+        pass
+    
+    guess = None
+    if Preferences.getEditor("AdvancedEncodingDetection"):
+        # Try the universal character encoding detector
+        try:
+            import ThirdParty.CharDet.chardet
+            guess = ThirdParty.CharDet.chardet.detect(text)
+            if guess and guess['confidence'] > 0.95 and guess['encoding'] is not None:
+                codec = guess['encoding'].lower()
+                return str(text, codec), '%s-guessed' % codec
+        except (UnicodeError, LookupError):
+            pass
+        except ImportError:
+            pass
+    
+    # Try default encoding
+    try:
+        codec = Preferences.getEditor("DefaultEncoding")
+        return str(text, codec), '%s-default' % codec
+    except (UnicodeError, LookupError):
+        pass
+    
+    # Assume UTF-8
+    try:
+        return str(text, 'utf-8'), 'utf-8-guessed'
+    except (UnicodeError, LookupError):
+        pass
+    
+    if Preferences.getEditor("AdvancedEncodingDetection"):
+        # Use the guessed one even if confidence level is low
+        if guess and guess['encoding'] is not None:
+            try:
+                codec = guess['encoding'].lower()
+                return str(text, codec), '%s-guessed' % codec
+            except (UnicodeError, LookupError):
+                pass
+    
+    # Last resort: decode as UTF-8 ignoring undecodable bytes (losing information)
+    return str(text, "utf-8", "ignore"), 'utf-8-ignore'
+
+def writeEncodedFile(filename, text, orig_coding):
+    """
+    Function to write a file with properly encoded text; raises CodingError, if the coding declared in the text cannot encode it.
+    
+    @param filename name of the file to write (string)
+    @param text text to be written (string)
+    @param orig_coding type of the original encoding (string)
+    @return encoding used for writing the file (string)
+    """
+    encoding = None
+    if orig_coding == 'utf-8-bom':
+        etext, encoding = BOM_UTF8 + text.encode("utf-8"), 'utf-8-bom'
+    else:
+        # Try declared coding spec
+        coding = get_coding(text)
+        if coding:
+            try:
+                etext, encoding = text.encode(coding), coding
+            except (UnicodeError, LookupError):
+                # Error: Declared encoding is incorrect
+                raise CodingError(coding)
+        else:
+            if orig_coding and orig_coding.endswith(
+                ('-selected', '-default', '-guessed', '-ignore')):
+                coding = orig_coding\
+                    .replace("-selected", "")\
+                    .replace("-default", "")\
+                    .replace("-guessed", "")\
+                    .replace("-ignore", "")
+                try:
+                    etext, encoding = text.encode(coding), coding
+                except (UnicodeError, LookupError):
+                    pass
+            
+            if encoding is None:
+                # Try configured default
+                try:
+                    codec = Preferences.getEditor("DefaultEncoding")
+                    etext, encoding = text.encode(codec), codec
+                except (UnicodeError, LookupError):
+                    pass
+                
+                if encoding is None:
+                    # Try saving as ASCII
+                    try:
+                        etext, encoding = text.encode('ascii'), 'ascii'
+                    except UnicodeError:
+                        pass
+                    
+                    if encoding is None:
+                        # Save as UTF-8 without BOM
+                        etext, encoding = text.encode('utf-8'), 'utf-8'
+    
+    f = open(filename, "wb")  # NOTE(review): handle is not closed if write() raises
+    f.write(etext)
+    f.close()
+    
+    return encoding
+    
 _escape = re.compile(eval(r'"[&<>\"\u0080-\uffff]"'))
 
 _escape_map = {
@@ -887,15 +889,10 @@
     import builtins
     if not codestring:
         try:
-            f = open(file)
-##            codestring, encoding = decode(f.read())
-            codestring = f.read()
-            f.close()
-        except IOError:
+            codestring = Utilities.readEncodedFile(file)[0]
+        except (UnicodeDecodeError, IOError):
             return (False, None, None, None, None)
 
-##    if isinstance(codestring, type("")):
-##        codestring = codestring.encode('utf-8')
     codestring = codestring.replace("\r\n","\n")
     codestring = codestring.replace("\r","\n")
 
@@ -903,9 +900,6 @@
         codestring = codestring + '\n'
     
     try:
-##        if isinstance(file, type("")):
-##            file = file.encode('utf-8')
-##        
         if file.endswith('.ptl'):
             try:
                 import quixote.ptl_compile

eric ide

mercurial