--- a/src/eric7/Utilities/__init__.py Thu Sep 26 09:48:49 2024 +0200 +++ b/src/eric7/Utilities/__init__.py Thu Sep 26 15:49:36 2024 +0200 @@ -18,8 +18,6 @@ import sys import warnings -from codecs import BOM_UTF8, BOM_UTF16, BOM_UTF32 - import chardet from PyQt6 import sip @@ -35,6 +33,14 @@ from eric7 import Preferences from eric7.__version__ import Version +from eric7.EricUtilities import ( # noqa + decodeBytes, + decodeString, + html_encode, + html_udecode, + html_uencode, + readStringFromStream, +) from eric7.EricWidgets.EricApplication import ericApp from eric7.SystemUtilities import DesktopUtilities, FileSystemUtilities, OSUtilities from eric7.UI.Info import Program @@ -290,15 +296,15 @@ @rtype tuple of (str, str) """ with contextlib.suppress(UnicodeError, LookupError): - if text.startswith(BOM_UTF8): + if text.startswith(codecs.BOM_UTF8): # UTF-8 with BOM - return str(text[len(BOM_UTF8) :], "utf-8"), "utf-8-bom" - elif text.startswith(BOM_UTF16): + return str(text[len(codecs.BOM_UTF8) :], "utf-8"), "utf-8-bom" + elif text.startswith(codecs.BOM_UTF16): # UTF-16 with BOM - return str(text[len(BOM_UTF16) :], "utf-16"), "utf-16" - elif text.startswith(BOM_UTF32): + return str(text[len(codecs.BOM_UTF16) :], "utf-16"), "utf-16" + elif text.startswith(codecs.BOM_UTF32): # UTF-32 with BOM - return str(text[len(BOM_UTF32) :], "utf-32"), "utf-32" + return str(text[len(codecs.BOM_UTF32) :], "utf-32"), "utf-32" coding = get_codingBytes(text) if coding: return str(text, coding), coding @@ -422,7 +428,7 @@ """ encoding = None if origEncoding == "utf-8-bom": - etext, encoding = BOM_UTF8 + text.encode("utf-8"), "utf-8-bom" + etext, encoding = codecs.BOM_UTF8 + text.encode("utf-8"), "utf-8-bom" else: # Try declared coding spec coding = get_coding(text) @@ -470,83 +476,6 @@ return etext, encoding -def decodeString(text): - """ - Function to decode a string containing Unicode encoded characters. - - @param text text containing encoded chars - @type str - @return decoded text - @rtype str - """ - buf = b"" - index = 0 - while index < len(text): - if text[index] == "\\": - qb = QByteArray.fromHex(text[index : index + 4].encode()) - buf += bytes(qb) - index += 4 - else: - buf += codecs.encode(text[index], "utf-8") - index += 1 - buf = buf.replace(b"\x00", b"") - return decodeBytes(buf) - - -def decodeBytes(buffer): - """ - Function to decode some byte text into a string. - - @param buffer byte buffer to decode - @type bytes - @return decoded text - @rtype str - """ - # try UTF with BOM - with contextlib.suppress(UnicodeError, LookupError): - if buffer.startswith(BOM_UTF8): - # UTF-8 with BOM - return str(buffer[len(BOM_UTF8) :], encoding="utf-8") - elif buffer.startswith(BOM_UTF16): - # UTF-16 with BOM - return str(buffer[len(BOM_UTF16) :], encoding="utf-16") - elif buffer.startswith(BOM_UTF32): - # UTF-32 with BOM - return str(buffer[len(BOM_UTF32) :], encoding="utf-32") - - # try UTF-8 - with contextlib.suppress(UnicodeError): - return str(buffer, encoding="utf-8") - - # try codec detection - try: - guess = chardet.detect(buffer) - if guess and guess["encoding"] is not None: - codec = guess["encoding"].lower() - return str(buffer, encoding=codec) - except (LookupError, UnicodeError): - pass - except ImportError: - pass - - return str(buffer, encoding="utf-8", errors="ignore") - - -def readStringFromStream(stream): - """ - Module function to read a string from the given stream. - - @param stream data stream opened for reading - @type QDataStream - @return string read from the stream - @rtype str - """ - data = stream.readString() - if data is None: - data = b"" - return data.decode("utf-8") - - def normalizeCode(codestring): """ Function to normalize the given code. @@ -564,120 +493,6 @@ return codestring -_escape = re.compile("[&<>\"'\u0080-\uffff]") - -_escape_map = { - "&": "&", - "<": "<", - ">": ">", - '"': """, - "'": "'", -} - - -def escape_entities(m, escmap=_escape_map): - """ - Function to encode html entities. - - @param m the match object - @type re.Match - @param escmap the map of entities to encode - @type dict - @return the converted text - @rtype str - """ - char = m.group() - text = escmap.get(char) - if text is None: - text = "&#{0:d};".format(ord(char)) - return text - - -def html_encode(text, pattern=_escape): - """ - Function to correctly encode a text for html. - - @param text text to be encoded - @type str - @param pattern search pattern for text to be encoded - @type str - @return the encoded text - @rtype str - """ - if not text: - return "" - text = pattern.sub(escape_entities, text) - return text - - -_uescape = re.compile("[\u0080-\uffff]") - - -def escape_uentities(m): - """ - Function to encode html entities. - - @param m the match object - @type re.Match - @return the converted text - @rtype str - """ - char = m.group() - text = "&#{0:d};".format(ord(char)) - return text - - -def html_uencode(text, pattern=_uescape): - """ - Function to correctly encode a unicode text for html. - - @param text text to be encoded - @type str - @param pattern search pattern for text to be encoded - @type str - @return the encoded text - @rtype str - """ - if not text: - return "" - text = pattern.sub(escape_uentities, text) - return text - - -_uunescape = re.compile(r"&#\d+;") - - -def unescape_uentities(m): - """ - Function to decode html entities. - - @param m the match object - @type re.Match - @return the converted text - @rtype str - """ - char = m.group() - ordinal = int(char[2:-1]) - return chr(ordinal) - - -def html_udecode(text, pattern=_uunescape): - """ - Function to correctly decode a html text to a unicode text. - - @param text text to be decoded - @type str - @param pattern search pattern for text to be decoded - @type str - @return the decoded text - @rtype str - """ - if not text: - return "" - text = pattern.sub(unescape_uentities, text) - return text - - def convertLineEnds(text, eol): """ Function to convert the end of line characters.