src/eric7/EricUtilities/__init__.py

branch
eric7
changeset 10928
46651e194fbe
parent 10926
9ef616cd220d
child 11090
f5f5f5803935
--- a/src/eric7/EricUtilities/__init__.py	Thu Sep 26 09:48:49 2024 +0200
+++ b/src/eric7/EricUtilities/__init__.py	Thu Sep 26 15:49:36 2024 +0200
@@ -7,8 +7,12 @@
 Package containing utility modules and functions.
 """
 
+import codecs
+import contextlib
 import os
+import re
 
+import chardet
 import semver
 
 from PyQt6.QtCore import QByteArray, QCoreApplication
@@ -280,3 +284,199 @@
             return QCoreApplication.translate("EricUtilities", "{0} TiB").format(
                 loc.toString(size, "f", 2)
             )
+
+
+def decodeString(text):
+    """
+    Function to decode a text containing backslash introduced hex sequences.
+
+    @param text text containing encoded chars
+    @type str
+    @return decoded text
+    @rtype str
+    """
+    chunks = []
+    pos = 0
+    while pos < len(text):
+        if text[pos] != "\\":
+            # plain character: keep its UTF-8 representation
+            chunks.append(codecs.encode(text[pos], "utf-8"))
+            pos += 1
+            continue
+        # a backslash starts a four character sequence handed to fromHex()
+        chunks.append(bytes(QByteArray.fromHex(text[pos : pos + 4].encode())))
+        pos += 4
+    return decodeBytes(b"".join(chunks).replace(b"\x00", b""))
+
+
+def decodeBytes(buffer):
+    """
+    Function to decode some byte text into a string.
+
+    The attempts are: UTF-8/UTF-32/UTF-16 announced by a BOM, plain UTF-8, the
+    codec guessed by chardet and finally UTF-8 ignoring undecodable bytes.
+
+    @param buffer byte buffer to decode
+    @type bytes
+    @return decoded text
+    @rtype str
+    """
+    # try UTF with BOM; UTF-32 has to be checked before UTF-16 because the
+    # little endian UTF-32 BOM starts with the little endian UTF-16 BOM
+    for boms, codec in (
+        ((codecs.BOM_UTF8,), "utf-8-sig"),
+        ((codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE), "utf-32"),
+        ((codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE), "utf-16"),
+    ):
+        if buffer.startswith(boms):
+            # these codecs evaluate and strip the BOM themselves; a failed
+            # attempt falls through to the next candidate
+            with contextlib.suppress(UnicodeError, LookupError):
+                return str(buffer, encoding=codec)
+
+    # try UTF-8
+    with contextlib.suppress(UnicodeError):
+        return str(buffer, encoding="utf-8")
+
+    # try codec detection
+    with contextlib.suppress(ImportError, LookupError, UnicodeError):
+        guess = chardet.detect(buffer)
+        if guess and guess["encoding"] is not None:
+            return str(buffer, encoding=guess["encoding"].lower())
+
+    # last resort: UTF-8 without the undecodable bytes
+    return str(buffer, encoding="utf-8", errors="ignore")
+
+
+def readStringFromStream(stream):
+    """
+    Module function to read a string from the given stream.
+
+    A value of None read from the stream yields an empty string.
+
+    @param stream data stream opened for reading
+    @type QDataStream
+    @return string read from the stream
+    @rtype str
+    """
+    raw = stream.readString()
+    return (b"" if raw is None else raw).decode("utf-8")
+
+
+###############################################################################
+## Functions for HTML string handling.
+###############################################################################
+
+
+# all characters to be replaced by html_encode(); the range covers every
+# non-ASCII character including those beyond the Basic Multilingual Plane
+_escape = re.compile("[&<>\"'\u0080-\U0010ffff]")
+
+_escape_map = {
+    "&": "&amp;",
+    "<": "&lt;",
+    ">": "&gt;",
+    '"': "&quot;",
+    "'": "&#x27;",
+}
+
+
+def escape_entities(m, escmap=_escape_map):
+    """
+    Function to encode html entities.
+
+    @param m the match object
+    @type re.Match
+    @param escmap the map of entities to encode
+    @type dict
+    @return the converted text
+    @rtype str
+    """
+    char = m.group()
+    # characters missing in the map get a decimal character reference
+    text = escmap.get(char)
+    return "&#{0:d};".format(ord(char)) if text is None else text
+
+
+def html_encode(text, pattern=_escape):
+    """
+    Function to correctly encode a text for html.
+
+    Every match of the pattern is replaced via escape_entities().
+
+    @param text text to be encoded
+    @type str
+    @param pattern search pattern for text to be encoded
+    @type re.Pattern
+    @return the encoded text
+    @rtype str
+    """
+    return pattern.sub(escape_entities, text) if text else ""
+
+
+# every non-ASCII character, including those beyond the BMP
+_uescape = re.compile("[\u0080-\U0010ffff]")
+
+
+def escape_uentities(m):
+    """
+    Function to encode html entities.
+
+    The matched character is converted to a decimal character reference.
+
+    @param m the match object
+    @type re.Match
+    @return the converted text
+    @rtype str
+    """
+    return "&#{0:d};".format(ord(m.group()))
+
+
+def html_uencode(text, pattern=_uescape):
+    """
+    Function to correctly encode a unicode text for html.
+
+    Every match of the pattern is replaced via escape_uentities().
+
+    @param text text to be encoded
+    @type str
+    @param pattern search pattern for text to be encoded
+    @type re.Pattern
+    @return the encoded text
+    @rtype str
+    """
+    return pattern.sub(escape_uentities, text) if text else ""
+
+
+_uunescape = re.compile(r"&#\d+;")
+
+
+def unescape_uentities(m):
+    """
+    Function to decode html entities.
+
+    @param m the match object
+    @type re.Match
+    @return the converted text (unchanged, if it is no valid code point)
+    @rtype str
+    """
+    entity = m.group()
+    try:
+        return chr(int(entity[2:-1]))
+    except (OverflowError, ValueError):
+        # the number is beyond the Unicode range; keep the entity as is
+        return entity
+
+
+def html_udecode(text, pattern=_uunescape):
+    """
+    Function to correctly decode a html text to a unicode text.
+
+    @param text text to be decoded
+    @type str
+    @param pattern search pattern for text to be decoded
+    @type re.Pattern
+    @return the decoded text
+    @rtype str
+    """
+    return pattern.sub(unescape_uentities, text) if text else ""

eric ide

mercurial