Utilities/__init__.py

changeset 1732
b140a24e147a
parent 1659
d5215347c209
child 1814
2da3b3749cac
--- a/Utilities/__init__.py	Sat Mar 24 19:30:26 2012 +0100
+++ b/Utilities/__init__.py	Sun Mar 25 14:01:25 2012 +0200
@@ -439,6 +439,34 @@
     text = pattern.sub(escape_uentities, text)
     return text
 
+_uunescape = re.compile(r'&#\d+;')  # decimal numeric entities, e.g. &#228;
+
+
+def unescape_uentities(m):
+    """
+    Function to decode a decimal html entity (e.g. "&#228;").
+    
+    @param m the match object
+    @return the converted text; entities beyond U+10FFFF unchanged (string)
+    """
+    code = int(m.group()[2:-1])  # strip the leading "&#" and trailing ";"
+    # chr() raises ValueError above 0x10FFFF; keep such entities verbatim
+    return chr(code) if code <= 0x10FFFF else m.group()
+
+
+def html_udecode(text, pattern=_uunescape):
+    """
+    Function to turn the numeric entities of a html text into characters.
+    
+    @param text html text to be decoded (string)
+    @param pattern search pattern for the entities to be decoded
+        (compiled regular expression)
+    @return the decoded text or an empty string for empty input (string)
+    """
+    if text:
+        return pattern.sub(unescape_uentities, text)
+    return ""
+
 
 def convertLineEnds(text, eol):
     """

eric ide

mercurial