# NOTE: This text is a changeset excerpt (unified diff) of
#   src/eric7/EricUtilities/__init__.py
#   (Thu Sep 26 09:48:49 2024 +0200 -> Thu Sep 26 15:49:36 2024 +0200,
#    hunks "@@ -7,8 +7,12 @@" and "@@ -280,3 +284,199 @@"),
# re-flowed into regular Python source. Parts of the module that the diff
# does not show are marked with "# ...".

# ... (module header not shown; its docstring ends with
#      "Package containing utility modules and functions.") ...

import codecs
import contextlib
import os
import re

import chardet
import semver

from PyQt6.QtCore import QByteArray, QCoreApplication

# ... (module content not shown in the diff; the last visible statement of
#      the preceding function is:
#
#          return QCoreApplication.translate("EricUtilities", "{0} TiB").format(
#              loc.toString(size, "f", 2)
#          )
#      ) ...


def decodeString(text):
    """
    Function to decode a string containing Unicode encoded characters.

    A backslash together with the three characters following it is passed
    to QByteArray.fromHex() and the resulting bytes are collected; every
    other character is collected UTF-8 encoded. NUL bytes are removed from
    the collected data before it is decoded by decodeBytes().

    @param text text containing encoded chars
    @type str
    @return decoded text
    @rtype str
    """
    chunks = []
    index = 0
    length = len(text)
    while index < length:
        if text[index] == "\\":
            # escape sequence: always consume four characters
            escaped = text[index : index + 4].encode()
            chunks.append(bytes(QByteArray.fromHex(escaped)))
            index += 4
        else:
            chunks.append(text[index].encode("utf-8"))
            index += 1
    buf = b"".join(chunks).replace(b"\x00", b"")
    return decodeBytes(buf)


# Byte order marks and the codecs to decode the data following them.
# The UTF-32 marks must be tested before the UTF-16 ones because the UTF-16-LE
# mark (FF FE) is a prefix of the UTF-32-LE mark (FF FE 00 00). The codecs
# name the byte order explicitly because the mark is stripped before decoding.
_BOM_CODECS = (
    (codecs.BOM_UTF8, "utf-8"),
    (codecs.BOM_UTF32_LE, "utf-32-le"),
    (codecs.BOM_UTF32_BE, "utf-32-be"),
    (codecs.BOM_UTF16_LE, "utf-16-le"),
    (codecs.BOM_UTF16_BE, "utf-16-be"),
)


def decodeBytes(buffer):
    """
    Function to decode some byte text into a string.

    The following strategies are tried in order and the first one that
    succeeds determines the result:
    <ol>
    <li>UTF-8, UTF-32 or UTF-16 (both byte orders) indicated by a BOM</li>
    <li>plain UTF-8</li>
    <li>the encoding guessed by chardet</li>
    <li>UTF-8 with undecodable bytes being dropped</li>
    </ol>

    @param buffer byte buffer to decode
    @type bytes
    @return decoded text
    @rtype str
    """
    # try UTF with BOM
    for bom, codec in _BOM_CODECS:
        if buffer.startswith(bom):
            # A failure is not fatal; e.g. FF FE 00 00 may also be a UTF-16-LE
            # BOM followed by a NUL character, so keep trying.
            with contextlib.suppress(UnicodeError, LookupError):
                return str(buffer[len(bom) :], encoding=codec)

    # try UTF-8
    with contextlib.suppress(UnicodeError):
        return str(buffer, encoding="utf-8")

    # try codec detection
    try:
        guess = chardet.detect(buffer)
        if guess and guess["encoding"] is not None:
            codec = guess["encoding"].lower()
            return str(buffer, encoding=codec)
    except (ImportError, LookupError, UnicodeError):
        # unknown or wrong guess; fall back to the lossy decoding below
        pass

    return str(buffer, encoding="utf-8", errors="ignore")


def readStringFromStream(stream):
    """
    Module function to read a string from the given stream.

    The value returned by the stream's readString() method is decoded as
    UTF-8; a value of None is treated like empty data.

    @param stream data stream opened for reading
    @type QDataStream
    @return string read from the stream
    @rtype str
    """
    data = stream.readString()
    if data is None:
        data = b""
    return data.decode("utf-8")


###############################################################################
## Functions for HTML string handling.
###############################################################################


# HTML special characters and all non-ASCII code points (including those
# beyond the Basic Multilingual Plane)
_escape = re.compile("[&<>\"'\u0080-\U0010ffff]")

# named/numeric entities for the HTML special characters
_escape_map = {
    "&": "&amp;",
    "<": "&lt;",
    ">": "&gt;",
    '"': "&quot;",
    "'": "&#x27;",
}


def escape_entities(m, escmap=_escape_map):
    """
    Function to encode html entities.

    Characters contained in the given map are replaced by their mapped
    entity, all others by a decimal numeric character reference.

    @param m the match object
    @type re.Match
    @param escmap the map of entities to encode
    @type dict
    @return the converted text
    @rtype str
    """
    char = m.group()
    text = escmap.get(char)
    if text is None:
        text = "&#{0:d};".format(ord(char))
    return text


def html_encode(text, pattern=_escape):
    """
    Function to correctly encode a text for html.

    @param text text to be encoded
    @type str
    @param pattern compiled search pattern for text to be encoded
    @type re.Pattern
    @return the encoded text (empty string for empty or None input)
    @rtype str
    """
    if not text:
        return ""
    return pattern.sub(escape_entities, text)


# all non-ASCII code points (including those beyond the Basic Multilingual
# Plane)
_uescape = re.compile("[\u0080-\U0010ffff]")


def escape_uentities(m):
    """
    Function to encode a character as a decimal numeric html entity.

    @param m the match object
    @type re.Match
    @return the converted text
    @rtype str
    """
    return "&#{0:d};".format(ord(m.group()))


def html_uencode(text, pattern=_uescape):
    """
    Function to correctly encode a unicode text for html.

    @param text text to be encoded
    @type str
    @param pattern compiled search pattern for text to be encoded
    @type re.Pattern
    @return the encoded text (empty string for empty or None input)
    @rtype str
    """
    if not text:
        return ""
    return pattern.sub(escape_uentities, text)


# decimal numeric character references
_uunescape = re.compile(r"&#\d+;")


def unescape_uentities(m):
    """
    Function to decode a decimal numeric html entity.

    References that do not denote a valid code point (i.e. a value above
    U+10FFFF) are returned unchanged instead of raising an exception.

    @param m the match object
    @type re.Match
    @return the converted text
    @rtype str
    """
    entity = m.group()
    try:
        # strip the leading "&#" and the trailing ";"
        return chr(int(entity[2:-1]))
    except (OverflowError, ValueError):
        return entity


def html_udecode(text, pattern=_uunescape):
    """
    Function to correctly decode a html text to a unicode text.

    @param text text to be decoded
    @type str
    @param pattern compiled search pattern for text to be decoded
    @type re.Pattern
    @return the decoded text (empty string for empty or None input)
    @rtype str
    """
    if not text:
        return ""
    return pattern.sub(unescape_uentities, text)