Thu, 10 Nov 2016 18:57:50 +0100
Updated chardet to version 2.3.0.
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ThirdParty/CharDet/README.rst Thu Nov 10 18:57:50 2016 +0100
@@ -0,0 +1,46 @@
+Chardet: The Universal Character Encoding Detector
+--------------------------------------------------
+
+Detects
+ - ASCII, UTF-8, UTF-16 (2 variants), UTF-32 (4 variants)
+ - Big5, GB2312, EUC-TW, HZ-GB-2312, ISO-2022-CN (Traditional and Simplified Chinese)
+ - EUC-JP, SHIFT_JIS, CP932, ISO-2022-JP (Japanese)
+ - EUC-KR, ISO-2022-KR (Korean)
+ - KOI8-R, MacCyrillic, IBM855, IBM866, ISO-8859-5, windows-1251 (Cyrillic)
+ - ISO-8859-2, windows-1250 (Hungarian)
+ - ISO-8859-5, windows-1251 (Bulgarian)
+ - windows-1252 (English)
+ - ISO-8859-7, windows-1253 (Greek)
+ - ISO-8859-8, windows-1255 (Visual and Logical Hebrew)
+ - TIS-620 (Thai)
+
+Requires Python 2.6 or later
+
+Installation
+------------
+
+Install from `PyPI <https://pypi.python.org/pypi/chardet>`_::
+
+    pip install chardet
+
+
+Command-line Tool
+-----------------
+
+chardet comes with a command-line script which reports on the encodings of one
+or more files::
+
+    % chardetect somefile someotherfile
+    somefile: windows-1252 with confidence 0.5
+    someotherfile: ascii with confidence 1.0
+
+About
+-----
+
+This is a continuation of Mark Pilgrim's excellent chardet. Previously, two
+versions needed to be maintained: one that supported python 2.x and one that
+supported python 3.x. We've recently merged with `Ian Cordasco <https://github.com/sigmavirus24>`_'s
+`charade <https://github.com/sigmavirus24/charade>`_ fork, so now we have one
+coherent version that works for Python 2.6+.
+
+:maintainer: Dan Blanchard
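
For orientation, the chardetect example in the README maps onto the library API roughly as follows. This is a minimal sketch, not part of the diff: 'somefile' is a placeholder path and the confidence value will depend on the input.

    # Minimal sketch of the library call behind the chardetect script;
    # 'somefile' is a hypothetical path, not a file shipped with chardet.
    import chardet

    with open('somefile', 'rb') as f:
        raw = f.read()

    result = chardet.detect(raw)
    # result is a dict such as {'encoding': 'windows-1252', 'confidence': 0.5}
    print('%s: %s with confidence %s'
          % ('somefile', result['encoding'], result['confidence']))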
--- a/ThirdParty/CharDet/chardet/__init__.py Thu Nov 10 18:54:02 2016 +0100
+++ b/ThirdParty/CharDet/chardet/__init__.py Thu Nov 10 18:57:50 2016 +0100
@@ -15,7 +15,7 @@
 #   02110-1301  USA
 ######################### END LICENSE BLOCK #########################
 
-__version__ = "2.2.1"
+__version__ = "2.3.0"
 
 from sys import version_info
--- a/ThirdParty/CharDet/chardet/chardetect.py Thu Nov 10 18:54:02 2016 +0100
+++ b/ThirdParty/CharDet/chardet/chardetect.py Thu Nov 10 18:57:50 2016 +0100
@@ -12,34 +12,68 @@
 If no paths are provided, it takes its input from stdin.
 """
+
+from __future__ import absolute_import, print_function, unicode_literals
+
+import argparse
+import sys
 from io import open
-from sys import argv, stdin
+
+from chardet import __version__
 from chardet.universaldetector import UniversalDetector
 
 
-def description_of(file, name='stdin'):
-    """Return a string describing the probable encoding of a file."""
+def description_of(lines, name='stdin'):
+    """
+    Return a string describing the probable encoding of a file or
+    list of strings.
+
+    :param lines: The lines to get the encoding of.
+    :type lines: Iterable of bytes
+    :param name: Name of file or collection of lines
+    :type name: str
+    """
     u = UniversalDetector()
-    for line in file:
+    for line in lines:
         u.feed(line)
     u.close()
     result = u.result
     if result['encoding']:
-        return '%s: %s with confidence %s' % (name,
-                                              result['encoding'],
-                                              result['confidence'])
+        return '{0}: {1} with confidence {2}'.format(name, result['encoding'],
+                                                     result['confidence'])
     else:
-        return '%s: no result' % name
+        return '{0}: no result'.format(name)
 
 
-def main():
-    if len(argv) <= 1:
-        print(description_of(stdin))
-    else:
-        for path in argv[1:]:
-            with open(path, 'rb') as f:
-                print(description_of(f, path))
+def main(argv=None):
+    '''
+    Handles command line arguments and gets things started.
+
+    :param argv: List of arguments, as if specified on the command-line.
+                 If None, ``sys.argv[1:]`` is used instead.
+    :type argv: list of str
+    '''
+    # Get command line arguments
+    parser = argparse.ArgumentParser(
+        description="Takes one or more file paths and reports their detected \
+                     encodings",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+        conflict_handler='resolve')
+    parser.add_argument('input',
+                        help='File whose encoding we would like to determine.',
+                        type=argparse.FileType('rb'), nargs='*',
+                        default=[sys.stdin])
+    parser.add_argument('--version', action='version',
+                        version='%(prog)s {0}'.format(__version__))
+    args = parser.parse_args(argv)
+
+    for f in args.input:
+        if f.isatty():
+            print("You are running chardetect interactively. Press " +
+                  "CTRL-D twice at the start of a blank line to signal the " +
+                  "end of your input. If you want help, run chardetect " +
+                  "--help\n", file=sys.stderr)
+        print(description_of(f, f.name))
 
 
 if __name__ == '__main__':
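
The reworked description_of() no longer insists on an open file object; any iterable of byte strings works, and the name argument only labels the report. A small usage sketch (the sample bytes are arbitrary; the printed line is the expected result for pure ASCII input):

    # description_of() now accepts any iterable of bytes, e.g. a list of lines.
    from chardet.chardetect import description_of

    print(description_of([b'Hello, world!\n'], name='example'))
    # expected output along the lines of: example: ascii with confidence 1.0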
--- a/ThirdParty/CharDet/chardet/jpcntx.py Thu Nov 10 18:54:02 2016 +0100
+++ b/ThirdParty/CharDet/chardet/jpcntx.py Thu Nov 10 18:57:50 2016 +0100
@@ -177,6 +177,12 @@
         return -1, 1
 
 class SJISContextAnalysis(JapaneseContextAnalysis):
+    def __init__(self):
+        self.charset_name = "SHIFT_JIS"
+
+    def get_charset_name(self):
+        return self.charset_name
+
     def get_order(self, aBuf):
         if not aBuf:
             return -1, 1
@@ -184,6 +190,8 @@
         first_char = wrap_ord(aBuf[0])
         if ((0x81 <= first_char <= 0x9F) or (0xE0 <= first_char <= 0xFC)):
             charLen = 2
+            if (first_char == 0x87) or (0xFA <= first_char <= 0xFC):
+                self.charset_name = "CP932"
         else:
             charLen = 1
 
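
The lead bytes singled out here are the ones that exist only in Microsoft's CP932 superset of Shift_JIS: 0x87 selects the NEC special-character rows and 0xFA-0xFC the IBM extension rows, so seeing one of them lets the analyser upgrade its answer from SHIFT_JIS to CP932. A quick illustration, assuming a Python build that ships both codecs (the byte pair is just a convenient example):

    # 0x87 0x40 is CIRCLED DIGIT ONE in CP932 but invalid in plain Shift_JIS.
    print(repr(b'\x87\x40'.decode('cp932')))       # u'\u2460'
    try:
        b'\x87\x40'.decode('shift_jis')
    except UnicodeDecodeError:
        print('not valid Shift_JIS')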
--- a/ThirdParty/CharDet/chardet/latin1prober.py Thu Nov 10 18:54:02 2016 +0100
+++ b/ThirdParty/CharDet/chardet/latin1prober.py Thu Nov 10 18:57:50 2016 +0100
@@ -129,11 +129,11 @@
         if total < 0.01:
             confidence = 0.0
         else:
-            confidence = ((self._mFreqCounter[3] / total)
-                          - (self._mFreqCounter[1] * 20.0 / total))
+            confidence = ((self._mFreqCounter[3] - self._mFreqCounter[1] * 20.0)
+                          / total)
         if confidence < 0.0:
             confidence = 0.0
         # lower the confidence of latin1 so that other more accurate
         # detector can take priority.
-        confidence = confidence * 0.5
+        confidence = confidence * 0.73
         return confidence
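
The regrouped expression is algebraically identical to the old one; the substantive change is the damping factor applied to windows-1252 guesses (0.5 becomes 0.73). A small sketch with invented counter values standing in for self._mFreqCounter (only the arithmetic mirrors the prober; the numbers are made up):

    # Hypothetical frequency counters standing in for self._mFreqCounter.
    freq_counter = [0, 2, 0, 50]
    total = float(sum(freq_counter))

    old_style = ((freq_counter[3] / total)
                 - (freq_counter[1] * 20.0 / total)) * 0.5
    new_style = ((freq_counter[3] - freq_counter[1] * 20.0) / total) * 0.73

    # Both share the same base ratio (about 0.19 here); only the final
    # scaling differs, so Latin-1 results are damped less aggressively.
    print(old_style, new_style)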
--- a/ThirdParty/CharDet/chardet/mbcssm.py Thu Nov 10 18:54:02 2016 +0100
+++ b/ThirdParty/CharDet/chardet/mbcssm.py Thu Nov 10 18:57:50 2016 +0100
@@ -353,7 +353,7 @@
     2,2,2,2,2,2,2,2,  # 68 - 6f
     2,2,2,2,2,2,2,2,  # 70 - 77
     2,2,2,2,2,2,2,1,  # 78 - 7f
-    3,3,3,3,3,3,3,3,  # 80 - 87
+    3,3,3,3,3,2,2,3,  # 80 - 87
     3,3,3,3,3,3,3,3,  # 88 - 8f
     3,3,3,3,3,3,3,3,  # 90 - 97
     3,3,3,3,3,3,3,3,  # 98 - 9f
@@ -369,9 +369,8 @@
     2,2,2,2,2,2,2,2,  # d8 - df
     3,3,3,3,3,3,3,3,  # e0 - e7
     3,3,3,3,3,4,4,4,  # e8 - ef
-    4,4,4,4,4,4,4,4,  # f0 - f7
-    4,4,4,4,4,0,0,0  # f8 - ff
-)
+    3,3,3,3,3,3,3,3,  # f0 - f7
+    3,3,3,3,3,0,0,0)  # f8 - ff
 
 
 SJIS_st = (
@@ -571,5 +570,3 @@
     'stateTable': UTF8_st,
     'charLenTable': UTF8CharLenTable,
     'name': 'UTF-8'}
-
-# flake8: noqa
--- a/ThirdParty/CharDet/chardet/sjisprober.py Thu Nov 10 18:54:02 2016 +0100
+++ b/ThirdParty/CharDet/chardet/sjisprober.py Thu Nov 10 18:57:50 2016 +0100
@@ -47,7 +47,7 @@
         self._mContextAnalyzer.reset()
 
     def get_charset_name(self):
-        return "SHIFT_JIS"
+        return self._mContextAnalyzer.get_charset_name()
 
     def feed(self, aBuf):
         aLen = len(aBuf)
--- a/ThirdParty/CharDet/chardet/universaldetector.py Thu Nov 10 18:54:02 2016 +0100
+++ b/ThirdParty/CharDet/chardet/universaldetector.py Thu Nov 10 18:57:50 2016 +0100
@@ -71,9 +71,9 @@
 
         if not self._mGotData:
             # If the data starts with BOM, we know it is UTF
-            if aBuf[:3] == codecs.BOM:
+            if aBuf[:3] == codecs.BOM_UTF8:
                 # EF BB BF UTF-8 with BOM
-                self.result = {'encoding': "UTF-8", 'confidence': 1.0}
+                self.result = {'encoding': "UTF-8-SIG", 'confidence': 1.0}
             elif aBuf[:4] == codecs.BOM_UTF32_LE:
                 # FF FE 00 00 UTF-32, little-endian BOM
                 self.result = {'encoding': "UTF-32LE", 'confidence': 1.0}
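
The old comparison used the wrong constant: codecs.BOM is the UTF-16 byte order mark in native byte order, not the UTF-8 signature, so that branch could never match BOM-prefixed UTF-8 data. The fix also reports UTF-8-SIG, whose codec strips the signature on decode. A short check (the sample text is arbitrary; the detect() output shown is what this change is expected to produce):

    import codecs
    import chardet

    print(repr(codecs.BOM))        # UTF-16 BOM, b'\xff\xfe' on little-endian builds
    print(repr(codecs.BOM_UTF8))   # b'\xef\xbb\xbf', the UTF-8 signature

    data = codecs.BOM_UTF8 + u'h\xe9llo'.encode('utf-8')
    print(chardet.detect(data))    # expected: {'encoding': 'UTF-8-SIG', 'confidence': 1.0}
    print(repr(data.decode('utf-8-sig')))   # signature stripped by the codec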
--- a/ThirdParty/CharDet/docs/css/chardet.css Thu Nov 10 18:54:02 2016 +0100
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,299 +0,0 @@
[file removed: the 299-line stylesheet for the old bundled HTML documentation; its contents are not reproduced here]
--- a/ThirdParty/CharDet/docs/faq.html Thu Nov 10 18:54:02 2016 +0100
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,107 +0,0 @@
[file removed: the old bundled FAQ page (what character encoding is, what auto-detection is, why it is feasible, the Mozilla origins of the algorithm, and why auto-detection should only be a last resort); the raw HTML is not reproduced here]
--- a/ThirdParty/CharDet/docs/history.html Thu Nov 10 18:54:02 2016 +0100
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,74 +0,0 @@
[file removed: the old bundled revision-history page covering releases 1.0 (2006) through 2.0.1 (2009); the raw HTML is not reproduced here]
--- a/ThirdParty/CharDet/docs/how-it-works.html Thu Nov 10 18:54:02 2016 +0100
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,114 +0,0 @@
[file removed: the old bundled "How it works" page describing the five categories UniversalDetector handles (UTF-n with a BOM, escaped encodings, multi-byte encodings, single-byte encodings, and windows-1252) and the prober classes behind them; the raw HTML is not reproduced here]
--- a/ThirdParty/CharDet/docs/index.html Thu Nov 10 18:54:02 2016 +0100
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,74 +0,0 @@
[file removed: the old bundled documentation index / table of contents; the raw HTML is not reproduced here]
--- a/ThirdParty/CharDet/docs/license.html Thu Nov 10 18:54:02 2016 +0100
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,62 +0,0 @@
[file removed: the old bundled terms-of-use page (LGPL 2.1 or later for the library, BSD-style terms for the documentation); the raw HTML is not reproduced here]
--- a/ThirdParty/CharDet/docs/supported-encodings.html Thu Nov 10 18:54:02 2016 +0100
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,86 +0,0 @@
[file removed: the old bundled list of supported encodings, including the caveat that similar encodings (e.g. Hungarian text in ISO-8859-2 vs. windows-1250) may be confused; the raw HTML is not reproduced here]
--- a/ThirdParty/CharDet/docs/usage.html Thu Nov 10 18:54:02 2016 +0100
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,107 +0,0 @@
-<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd">
-<html lang="en">
-<head>
-<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
-<title>Usage [Universal Encoding Detector]</title>
-<link rel="stylesheet" href="css/chardet.css" type="text/css">
-<link rev="made" href="mailto:mark@diveintomark.org">
-<meta name="generator" content="DocBook XSL Stylesheets V1.65.1">
-<meta name="keywords" content="character, set, encoding, detection, Python, XML, feed">
-<link rel="start" href="index.html" title="Documentation">
-<link rel="up" href="index.html" title="Documentation">
-<link rel="prev" href="supported-encodings.html" title="Supported encodings">
-<link rel="next" href="how-it-works.html" title="How it works">
-</head>
-<body id="chardet-feedparser-org" class="docs">
-<div class="z" id="intro"><div class="sectionInner"><div class="sectionInner2">
-<div class="s" id="pageHeader">
-<h1><a href="/">Universal Encoding Detector</a></h1>
-<p>Character encoding auto-detection in Python. As smart as your browser. Open source.</p>
-</div>
-<div class="s" id="quickSummary"><ul>
-<li class="li1">
-<a href="http://chardet.feedparser.org/download/">Download</a> ·</li>
-<li class="li2">
-<a href="index.html">Documentation</a> ·</li>
-<li class="li3"><a href="faq.html" title="Frequently Asked Questions">FAQ</a></li>
-</ul></div>
-</div></div></div>
-<div id="main"><div id="mainInner">
-<p id="breadcrumb">You are here: <a href="index.html">Documentation</a> → <span class="thispage">Usage</span></p>
-<div class="section" lang="en">
-<div class="titlepage">
-<div><div><h2 class="title">
-<a name="usage" class="skip" href="#usage" title="link to this section"><img src="images/permalink.gif" alt="[link]" title="link to this section" width="8" height="9"></a> Usage</h2></div></div>
-<div></div>
-</div>
-<div class="section" lang="en">
-<div class="titlepage">
-<div><div><h3 class="title">
-<a name="usage.basic" class="skip" href="#usage.basic" title="link to this section"><img src="images/permalink.gif" alt="[link]" title="link to this section" width="8" height="9"></a> Basic usage</h3></div></div>
-<div></div>
-</div>
-<p>The easiest way to use the <span class="application">Universal Encoding Detector</span> library is with the <tt class="function">detect</tt> function.</p>
-<div class="example">
-<a name="example.basic.detect" class="skip" href="#example.basic.detect" title="link to this example"><img src="images/permalink.gif" alt="[link]" title="link to this example" width="8" height="9"></a> <h3 class="title">Example: Using the <tt class="function">detect</tt> function</h3>
-<p>The <tt class="function">detect</tt> function takes one argument, a non-Unicode string. It returns a dictionary containing the auto-detected character encoding and a confidence level from <tt class="constant">0</tt> to <tt class="constant">1</tt>.</p>
-<pre class="screen"><tt class="prompt">>>> </tt><span class="userinput"><font color='navy'><b>import</b></font> urllib</span>
-<tt class="prompt">>>> </tt><span class="userinput">rawdata = urllib.urlopen(<font color='olive'>'http://yahoo.co.jp/'</font>).read()</span>
-<tt class="prompt">>>> </tt><span class="userinput"><font color='navy'><b>import</b></font> chardet</span>
-<tt class="prompt">>>> </tt><span class="userinput">chardet.detect(rawdata)</span>
-<span class="computeroutput">{'encoding': 'EUC-JP', 'confidence': 0.99}</span></pre>
-</div>
-</div>
-<div class="section" lang="en">
-<div class="titlepage">
-<div><div><h3 class="title">
-<a name="usage.advanced" class="skip" href="#usage.advanced" title="link to this section"><img src="images/permalink.gif" alt="[link]" title="link to this section" width="8" height="9"></a> Advanced usage</h3></div></div>
-<div></div>
-</div>
-<p>If you’re dealing with a large amount of text, you can call the <span class="application">Universal Encoding Detector</span> library incrementally, and it will stop as soon as it is confident enough to report its results.</p>
-<p>Create a <tt class="classname">UniversalDetector</tt> object, then call its <tt class="methodname">feed</tt> method repeatedly with each block of text. If the detector reaches a minimum threshold of confidence, it will set <tt class="varname">detector.done</tt> to <tt class="constant">True</tt>.</p>
-<p>Once you’ve exhausted the source text, call <tt class="methodname">detector.close()</tt>, which will do some final calculations in case the detector didn’t hit its minimum confidence threshold earlier. Then <tt class="varname">detector.result</tt> will be a dictionary containing the auto-detected character encoding and confidence level (the same as <a href="usage.html#example.basic.detect" title="Example: Using the detect function">the <tt class="function">chardet.detect</tt> function returns</a>).</p>
-<div class="example">
-<a name="example.multiline" class="skip" href="#example.multiline" title="link to this example"><img src="images/permalink.gif" alt="[link]" title="link to this example" width="8" height="9"></a> <h3 class="title">Example: Detecting encoding incrementally</h3>
-<pre class="programlisting python"><font color='navy'><b>import</b></font> urllib
-<font color='navy'><b>from</b></font> chardet.universaldetector <font color='navy'><b>import</b></font> UniversalDetector
-
-usock = urllib.urlopen(<font color='olive'>'http://yahoo.co.jp/'</font>)
-detector = UniversalDetector()
-<font color='navy'><b>for</b></font> line <font color='navy'><b>in</b></font> usock.readlines():
-    detector.feed(line)
-    <font color='navy'><b>if</b></font> detector.done: <font color='navy'><b>break</b></font>
-detector.close()
-usock.close()
-<font color='navy'><b>print</b></font> detector.result</pre>
-<pre class="screen"><span class="computeroutput">{'encoding': 'EUC-JP', 'confidence': 0.99}</span></pre>
-</div>
-<p>If you want to detect the encoding of multiple texts (such as separate files), you can re-use a single <tt class="classname">UniversalDetector</tt> object. Just call <tt class="methodname">detector.reset()</tt> at the start of each file, call <tt class="methodname">detector.feed</tt> as many times as you like, and then call <tt class="methodname">detector.close()</tt> and check the <tt class="varname">detector.result</tt> dictionary for the file’s results.</p>
-<div class="example">
-<a name="advanced.multifile.multiline" class="skip" href="#advanced.multifile.multiline" title="link to this example"><img src="images/permalink.gif" alt="[link]" title="link to this example" width="8" height="9"></a> <h3 class="title">Example: Detecting encodings of multiple files</h3>
-<pre class="programlisting python"><font color='navy'><b>import</b></font> glob
-<font color='navy'><b>from</b></font> chardet.universaldetector <font color='navy'><b>import</b></font> UniversalDetector
-
-detector = UniversalDetector()
-<font color='navy'><b>for</b></font> filename <font color='navy'><b>in</b></font> glob.glob(<font color='olive'>'*.xml'</font>):
-    <font color='navy'><b>print</b></font> filename.ljust(60),
-    detector.reset()
-    <font color='navy'><b>for</b></font> line <font color='navy'><b>in</b></font> file(filename, <font color='olive'>'rb'</font>):
-        detector.feed(line)
-        <font color='navy'><b>if</b></font> detector.done: <font color='navy'><b>break</b></font>
-    detector.close()
-    <font color='navy'><b>print</b></font> detector.result
-</pre>
-</div>
-</div>
-</div>
-<div class="footernavigation">
-<div style="float: left">← <a class="NavigationArrow" href="supported-encodings.html">Supported encodings</a>
-</div>
-<div style="text-align: right">
-<a class="NavigationArrow" href="how-it-works.html">How it works</a> →</div>
-</div>
-<hr>
-<div id="footer"><p class="copyright">Copyright © 2006, 2007, 2008 Mark Pilgrim · <a href="mailto:mark@diveintomark.org">mark@diveintomark.org</a> · <a href="license.html">Terms of use</a></p></div>
-</div></div>
-</body>
-</html>
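
The examples in the page removed above still use the Python 2 idioms urllib.urlopen, file() and the print statement. As a hedge for readers running the updated chardet under Python 3, a rough equivalent of the multi-file loop could look like the sketch below; it is not part of this changeset, relies only on the UniversalDetector methods documented above (reset, feed, done, close, result), and keeps the purely illustrative '*.xml' glob pattern from the original example:

    # Sketch only: Python 3 rendering of the multi-file detection loop.
    import glob

    from chardet.universaldetector import UniversalDetector

    detector = UniversalDetector()
    for filename in glob.glob('*.xml'):          # illustrative pattern
        detector.reset()                         # start fresh for every file
        with open(filename, 'rb') as handle:     # feed raw bytes, not str
            for line in handle:
                detector.feed(line)
                if detector.done:                # confident enough, stop early
                    break
        detector.close()                         # finalize the guess
        print('{0}: {1}'.format(filename, detector.result))

Opening the file in binary mode matters here: the detector expects byte strings, not already-decoded text.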
--- a/changelog Thu Nov 10 18:54:02 2016 +0100
+++ b/changelog Thu Nov 10 18:57:50 2016 +0100
@@ -1,6 +1,11 @@
 Change Log
 ----------
-Version 6.2.0:
+Version 16.xx:
+- bug fixes
+- Third Party packages
+  -- updated chardet to 2.3.0
+
+Version 16.11:
 - bug fixes
 - General
   -- added a status bar icon to show the online status to the main window
--- a/eric6.e4p Thu Nov 10 18:54:02 2016 +0100
+++ b/eric6.e4p Thu Nov 10 18:57:50 2016 +0100
@@ -1945,14 +1945,14 @@
   <Interfaces/>
   <Others>
     <Other>.hgignore</Other>
-    <Other>APIs/Python/zope-2.10.7.api</Other>
-    <Other>APIs/Python/zope-2.11.2.api</Other>
-    <Other>APIs/Python/zope-3.3.1.api</Other>
     <Other>APIs/Python3/PyQt4.bas</Other>
     <Other>APIs/Python3/PyQt5.bas</Other>
     <Other>APIs/Python3/QScintilla2.bas</Other>
     <Other>APIs/Python3/eric6.api</Other>
     <Other>APIs/Python3/eric6.bas</Other>
+    <Other>APIs/Python/zope-2.10.7.api</Other>
+    <Other>APIs/Python/zope-2.11.2.api</Other>
+    <Other>APIs/Python/zope-3.3.1.api</Other>
     <Other>APIs/QSS/qss.api</Other>
     <Other>APIs/Ruby/Ruby-1.8.7.api</Other>
     <Other>APIs/Ruby/Ruby-1.8.7.bas</Other>
@@ -2035,7 +2035,7 @@
     <Other>Styles</Other>
     <Other>THANKS</Other>
     <Other>ThirdParty/CharDet/LICENSE</Other>
-    <Other>ThirdParty/CharDet/docs</Other>
+    <Other>ThirdParty/CharDet/README.rst</Other>
     <Other>ThirdParty/Jasy/jasy/license.md</Other>
     <Other>ThirdParty/Pygments/pygments/AUTHORS</Other>
     <Other>ThirdParty/Pygments/pygments/CHANGES</Other>