eric6/ThirdParty/CharDet/chardet/metadata/languages.py

changeset 8258
82b608e352ec
parent 8257
28146736bbfc
child 8259
2bbec88047dd
equal deleted inserted replaced
8257:28146736bbfc 8258:82b608e352ec
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 """
4 Metadata about languages used by our model training code for our
5 SingleByteCharSetProbers. Could be used for other things in the future.
6
7 This code is based on the language metadata from the uchardet project.
8 """
9 from __future__ import absolute_import, print_function
10
11 from string import ascii_letters
12
13
14 # TODO: Add Ukrainian (KOI8-U)
15
class Language(object):
    """Metadata about a language useful for training models

    :ivar name: The human name for the language, in English.
    :type name: str
    :ivar iso_code: 2-letter ISO 639-1 if possible, 3-letter ISO code otherwise,
                    or use another catalog as a last resort.
    :type iso_code: str
    :ivar use_ascii: Whether or not ASCII letters should be included in trained
                     models.
    :type use_ascii: bool
    :ivar charsets: The charsets we want to support and create data for.
    :type charsets: list of str
    :ivar alphabet: The characters in the language's alphabet. If `use_ascii` is
                    `True`, you only need to add those not in the ASCII set.
                    Stored de-duplicated and sorted by code point.
    :type alphabet: str
    :ivar wiki_start_pages: The Wikipedia pages to start from if we're crawling
                            Wikipedia for training data.
    :type wiki_start_pages: list of str
    :raises ValueError: If `use_ascii` is false and no (or an empty) `alphabet`
                        is supplied.
    """

    def __init__(self, name=None, iso_code=None, use_ascii=True, charsets=None,
                 alphabet=None, wiki_start_pages=None):
        super(Language, self).__init__()
        self.name = name
        self.iso_code = iso_code
        self.use_ascii = use_ascii
        self.charsets = charsets

        # Gather the letters into a fresh set so that a caller-supplied
        # sequence is never modified in place.
        letters = set(alphabet) if alphabet else set()
        if self.use_ascii:
            letters.update(ascii_letters)
        elif not letters:
            raise ValueError('Must supply alphabet if use_ascii is False')

        # `letters` is guaranteed non-empty here, so the alphabet is always a
        # string: unique characters in ascending code point order.
        self.alphabet = ''.join(sorted(letters))
        self.wiki_start_pages = wiki_start_pages

    def __repr__(self):
        """Return ``ClassName(attr=value, ...)`` for all public attributes."""
        public_attrs = ', '.join('{}={!r}'.format(key, value)
                                 for key, value in self.__dict__.items()
                                 if not key.startswith('_'))
        return '{}({})'.format(self.__class__.__name__, public_attrs)
58
59
# Registry of per-language metadata, keyed by the English language name.
# Each value is a `Language` describing the ISO code, the charsets to build
# data for, the alphabet, and the Wikipedia page(s) to start crawling from.
LANGUAGES = {
    'Arabic': Language(
        name='Arabic',
        iso_code='ar',
        use_ascii=False,
        # We only support encodings that use isolated
        # forms, because the current recommendation is
        # that the rendering system handles presentation
        # forms. This means we purposefully skip IBM864.
        # NOTE(review): 'CP864' below appears to be the same code page as
        # IBM864, which contradicts the comment above -- confirm which is
        # intended.
        charsets=['ISO-8859-6', 'WINDOWS-1256',
                  'CP720', 'CP864'],
        alphabet=u'ءآأؤإئابةتثجحخدذرزسشصضطظعغػؼؽؾؿـفقكلمنهوىيًٌٍَُِّ',
        wiki_start_pages=[u'الصفحة_الرئيسية']),
    'Belarusian': Language(
        name='Belarusian',
        iso_code='be',
        use_ascii=False,
        charsets=['ISO-8859-5', 'WINDOWS-1251',
                  'IBM866', 'MacCyrillic'],
        alphabet=(u'АБВГДЕЁЖЗІЙКЛМНОПРСТУЎФХЦЧШЫЬЭЮЯ'
                  u'абвгдеёжзійклмнопрстуўфхцчшыьэюяʼ'),
        wiki_start_pages=[u'Галоўная_старонка']),
    'Bulgarian': Language(
        name='Bulgarian',
        iso_code='bg',
        use_ascii=False,
        charsets=['ISO-8859-5', 'WINDOWS-1251',
                  'IBM855'],
        alphabet=(u'АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЬЮЯ'
                  u'абвгдежзийклмнопрстуфхцчшщъьюя'),
        wiki_start_pages=[u'Начална_страница']),
    'Czech': Language(
        name='Czech',
        # ISO 639-1 language code for Czech is 'cs' ('cz' is the country
        # code, not a language code).
        iso_code='cs',
        use_ascii=True,
        charsets=['ISO-8859-2', 'WINDOWS-1250'],
        alphabet=u'áčďéěíňóřšťúůýžÁČĎÉĚÍŇÓŘŠŤÚŮÝŽ',
        wiki_start_pages=[u'Hlavní_strana']),
    'Danish': Language(
        name='Danish',
        iso_code='da',
        use_ascii=True,
        charsets=['ISO-8859-1', 'ISO-8859-15',
                  'WINDOWS-1252'],
        alphabet=u'æøåÆØÅ',
        wiki_start_pages=[u'Forside']),
    'German': Language(
        name='German',
        iso_code='de',
        use_ascii=True,
        charsets=['ISO-8859-1', 'WINDOWS-1252'],
        alphabet=u'äöüßÄÖÜ',
        wiki_start_pages=[u'Wikipedia:Hauptseite']),
    'Greek': Language(
        name='Greek',
        iso_code='el',
        use_ascii=False,
        charsets=['ISO-8859-7', 'WINDOWS-1253'],
        alphabet=(u'αβγδεζηθικλμνξοπρσςτυφχψωάέήίόύώ'
                  u'ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΣΤΥΦΧΨΩΆΈΉΊΌΎΏ'),
        wiki_start_pages=[u'Πύλη:Κύρια']),
    'English': Language(
        name='English',
        iso_code='en',
        use_ascii=True,
        charsets=['ISO-8859-1', 'WINDOWS-1252'],
        wiki_start_pages=[u'Main_Page']),
    'Esperanto': Language(
        name='Esperanto',
        iso_code='eo',
        # Q, W, X, and Y not used at all
        use_ascii=False,
        charsets=['ISO-8859-3'],
        alphabet=(u'abcĉdefgĝhĥijĵklmnoprsŝtuŭvz'
                  u'ABCĈDEFGĜHĤIJĴKLMNOPRSŜTUŬVZ'),
        wiki_start_pages=[u'Vikipedio:Ĉefpaĝo']),
    'Spanish': Language(
        name='Spanish',
        iso_code='es',
        use_ascii=True,
        charsets=['ISO-8859-1', 'ISO-8859-15',
                  'WINDOWS-1252'],
        alphabet=u'ñáéíóúüÑÁÉÍÓÚÜ',
        wiki_start_pages=[u'Wikipedia:Portada']),
    'Estonian': Language(
        name='Estonian',
        iso_code='et',
        use_ascii=False,
        charsets=['ISO-8859-4', 'ISO-8859-13',
                  'WINDOWS-1257'],
        # C, F, Š, Q, W, X, Y, Z, Ž are only for
        # loanwords
        alphabet=(u'ABDEGHIJKLMNOPRSTUVÕÄÖÜ'
                  u'abdeghijklmnoprstuvõäöü'),
        wiki_start_pages=[u'Esileht']),
    'Finnish': Language(
        name='Finnish',
        iso_code='fi',
        use_ascii=True,
        charsets=['ISO-8859-1', 'ISO-8859-15',
                  'WINDOWS-1252'],
        alphabet=u'ÅÄÖŠŽåäöšž',
        wiki_start_pages=[u'Wikipedia:Etusivu']),
    'French': Language(
        name='French',
        iso_code='fr',
        use_ascii=True,
        charsets=['ISO-8859-1', 'ISO-8859-15',
                  'WINDOWS-1252'],
        alphabet=u'œàâçèéîïùûêŒÀÂÇÈÉÎÏÙÛÊ',
        wiki_start_pages=[u'Wikipédia:Accueil_principal',
                          u'Bœuf (animal)']),
    'Hebrew': Language(
        name='Hebrew',
        iso_code='he',
        use_ascii=False,
        charsets=['ISO-8859-8', 'WINDOWS-1255'],
        alphabet=u'אבגדהוזחטיךכלםמןנסעףפץצקרשתװױײ',
        wiki_start_pages=[u'עמוד_ראשי']),
    'Croatian': Language(
        name='Croatian',
        iso_code='hr',
        # Q, W, X, Y are only used for foreign words.
        use_ascii=False,
        charsets=['ISO-8859-2', 'WINDOWS-1250'],
        alphabet=(u'abcčćdđefghijklmnoprsštuvzž'
                  u'ABCČĆDĐEFGHIJKLMNOPRSŠTUVZŽ'),
        wiki_start_pages=[u'Glavna_stranica']),
    'Hungarian': Language(
        name='Hungarian',
        iso_code='hu',
        # Q, W, X, Y are only used for foreign words.
        use_ascii=False,
        charsets=['ISO-8859-2', 'WINDOWS-1250'],
        alphabet=(u'abcdefghijklmnoprstuvzáéíóöőúüű'
                  u'ABCDEFGHIJKLMNOPRSTUVZÁÉÍÓÖŐÚÜŰ'),
        wiki_start_pages=[u'Kezdőlap']),
    'Italian': Language(
        name='Italian',
        iso_code='it',
        use_ascii=True,
        charsets=['ISO-8859-1', 'ISO-8859-15',
                  'WINDOWS-1252'],
        alphabet=u'ÀÈÉÌÒÓÙàèéìòóù',
        wiki_start_pages=[u'Pagina_principale']),
    'Lithuanian': Language(
        name='Lithuanian',
        iso_code='lt',
        use_ascii=False,
        charsets=['ISO-8859-13', 'WINDOWS-1257',
                  'ISO-8859-4'],
        # Q, W, and X not used at all
        alphabet=(u'AĄBCČDEĘĖFGHIĮYJKLMNOPRSŠTUŲŪVZŽ'
                  u'aąbcčdeęėfghiįyjklmnoprsštuųūvzž'),
        wiki_start_pages=[u'Pagrindinis_puslapis']),
    'Latvian': Language(
        name='Latvian',
        iso_code='lv',
        use_ascii=False,
        charsets=['ISO-8859-13', 'WINDOWS-1257',
                  'ISO-8859-4'],
        # Q, W, X, Y are only for loanwords
        alphabet=(u'AĀBCČDEĒFGĢHIĪJKĶLĻMNŅOPRSŠTUŪVZŽ'
                  u'aābcčdeēfgģhiījkķlļmnņoprsštuūvzž'),
        wiki_start_pages=[u'Sākumlapa']),
    'Macedonian': Language(
        name='Macedonian',
        iso_code='mk',
        use_ascii=False,
        charsets=['ISO-8859-5', 'WINDOWS-1251',
                  'MacCyrillic', 'IBM855'],
        alphabet=(u'АБВГДЃЕЖЗЅИЈКЛЉМНЊОПРСТЌУФХЦЧЏШ'
                  u'абвгдѓежзѕијклљмнњопрстќуфхцчџш'),
        wiki_start_pages=[u'Главна_страница']),
    'Dutch': Language(
        name='Dutch',
        iso_code='nl',
        use_ascii=True,
        charsets=['ISO-8859-1', 'WINDOWS-1252'],
        wiki_start_pages=[u'Hoofdpagina']),
    'Polish': Language(
        name='Polish',
        iso_code='pl',
        # Q and X are only used for foreign words.
        use_ascii=False,
        charsets=['ISO-8859-2', 'WINDOWS-1250'],
        alphabet=(u'AĄBCĆDEĘFGHIJKLŁMNŃOÓPRSŚTUWYZŹŻ'
                  u'aąbcćdeęfghijklłmnńoóprsśtuwyzźż'),
        wiki_start_pages=[u'Wikipedia:Strona_główna']),
    'Portuguese': Language(
        name='Portuguese',
        iso_code='pt',
        use_ascii=True,
        charsets=['ISO-8859-1', 'ISO-8859-15',
                  'WINDOWS-1252'],
        alphabet=u'ÁÂÃÀÇÉÊÍÓÔÕÚáâãàçéêíóôõú',
        wiki_start_pages=[u'Wikipédia:Página_principal']),
    'Romanian': Language(
        name='Romanian',
        iso_code='ro',
        use_ascii=True,
        charsets=['ISO-8859-2', 'WINDOWS-1250'],
        alphabet=u'ăâîșțĂÂÎȘȚ',
        wiki_start_pages=[u'Pagina_principală']),
    'Russian': Language(
        name='Russian',
        iso_code='ru',
        use_ascii=False,
        charsets=['ISO-8859-5', 'WINDOWS-1251',
                  'KOI8-R', 'MacCyrillic', 'IBM866',
                  'IBM855'],
        alphabet=(u'абвгдеёжзийклмнопрстуфхцчшщъыьэюя'
                  u'АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ'),
        wiki_start_pages=[u'Заглавная_страница']),
    'Slovak': Language(
        name='Slovak',
        iso_code='sk',
        use_ascii=True,
        charsets=['ISO-8859-2', 'WINDOWS-1250'],
        alphabet=u'áäčďéíĺľňóôŕšťúýžÁÄČĎÉÍĹĽŇÓÔŔŠŤÚÝŽ',
        wiki_start_pages=[u'Hlavná_stránka']),
    'Slovene': Language(
        name='Slovene',
        iso_code='sl',
        # Q, W, X, Y are only used for foreign words.
        use_ascii=False,
        charsets=['ISO-8859-2', 'WINDOWS-1250'],
        alphabet=(u'abcčdefghijklmnoprsštuvzž'
                  u'ABCČDEFGHIJKLMNOPRSŠTUVZŽ'),
        wiki_start_pages=[u'Glavna_stran']),
    # Serbian can be written in both Latin and Cyrillic, but there's no
    # simple way to get the Latin alphabet pages from Wikipedia through
    # the API, so for now we just support Cyrillic.
    # NOTE(review): use_ascii is not passed here, so it takes the default
    # (True) and ASCII letters are added to this Cyrillic alphabet, unlike
    # the other Cyrillic entries -- confirm this is intended.
    'Serbian': Language(
        name='Serbian',
        iso_code='sr',
        alphabet=(u'АБВГДЂЕЖЗИЈКЛЉМНЊОПРСТЋУФХЦЧЏШ'
                  u'абвгдђежзијклљмнњопрстћуфхцчџш'),
        charsets=['ISO-8859-5', 'WINDOWS-1251',
                  'MacCyrillic', 'IBM855'],
        wiki_start_pages=[u'Главна_страна']),
    'Thai': Language(
        name='Thai',
        iso_code='th',
        use_ascii=False,
        charsets=['ISO-8859-11', 'TIS-620', 'CP874'],
        alphabet=u'กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรฤลฦวศษสหฬอฮฯะัาำิีึืฺุู฿เแโใไๅๆ็่้๊๋์ํ๎๏๐๑๒๓๔๕๖๗๘๙๚๛',
        wiki_start_pages=[u'หน้าหลัก']),
    'Turkish': Language(
        name='Turkish',
        iso_code='tr',
        # Q, W, and X are not used by Turkish
        use_ascii=False,
        charsets=['ISO-8859-3', 'ISO-8859-9',
                  'WINDOWS-1254'],
        alphabet=(u'abcçdefgğhıijklmnoöprsştuüvyzâîû'
                  u'ABCÇDEFGĞHIİJKLMNOÖPRSŞTUÜVYZÂÎÛ'),
        wiki_start_pages=[u'Ana_Sayfa']),
    'Vietnamese': Language(
        name='Vietnamese',
        iso_code='vi',
        use_ascii=False,
        # Windows-1258 is the only common 8-bit
        # Vietnamese encoding supported by Python.
        # From Wikipedia:
        # For systems that lack support for Unicode,
        # dozens of 8-bit Vietnamese code pages are
        # available.[1] The most common are VISCII
        # (TCVN 5712:1993), VPS, and Windows-1258.[3]
        # Where ASCII is required, such as when
        # ensuring readability in plain text e-mail,
        # Vietnamese letters are often encoded
        # according to Vietnamese Quoted-Readable
        # (VIQR) or VSCII Mnemonic (VSCII-MNEM),[4]
        # though usage of either variable-width
        # scheme has declined dramatically following
        # the adoption of Unicode on the World Wide
        # Web.
        charsets=['WINDOWS-1258'],
        alphabet=(u'aăâbcdđeêghiklmnoôơpqrstuưvxy'
                  u'AĂÂBCDĐEÊGHIKLMNOÔƠPQRSTUƯVXY'),
        wiki_start_pages=[u'Chữ_Quốc_ngữ']),
}

eric ide

mercurial