1 #!/usr/bin/env python |
|
2 # -*- coding: utf-8 -*- |
|
3 """ |
|
4 Metadata about languages used by our model training code for our |
|
5 SingleByteCharSetProbers. Could be used for other things in the future. |
|
6 |
|
7 This code is based on the language metadata from the uchardet project. |
|
8 """ |
|
9 from __future__ import absolute_import, print_function |
|
10 |
|
11 from string import ascii_letters |
|
12 |
|
13 |
|
14 # TODO: Add Ukrainian (KOI8-U) |
|
15 |
|
class Language(object):
    """Metadata about a language, useful for training models.

    :ivar name: The human name for the language, in English.
    :type name: str
    :ivar iso_code: 2-letter ISO 639-1 if possible, 3-letter ISO code otherwise,
                    or use another catalog as a last resort.
    :type iso_code: str
    :ivar use_ascii: Whether or not ASCII letters should be included in trained
                     models.
    :type use_ascii: bool
    :ivar charsets: The charsets we want to support and create data for.
    :type charsets: list of str
    :ivar alphabet: The characters in the language's alphabet, stored as a
                    string of unique characters in sorted order. If
                    `use_ascii` is `True`, you only need to supply those not
                    in the ASCII set; the ASCII letters are added for you.
    :type alphabet: str
    :ivar wiki_start_pages: The Wikipedia pages to start from if we're crawling
                            Wikipedia for training data.
    :type wiki_start_pages: list of str
    """

    def __init__(self, name=None, iso_code=None, use_ascii=True, charsets=None,
                 alphabet=None, wiki_start_pages=None):
        """Store the metadata and normalise the alphabet.

        :raises ValueError: if `use_ascii` is false and no (or an empty)
                            `alphabet` is supplied.
        """
        super(Language, self).__init__()
        self.name = name
        self.iso_code = iso_code
        self.use_ascii = use_ascii
        self.charsets = charsets
        if self.use_ascii:
            # ASCII letters are always part of the alphabet in this mode; a
            # missing or empty alphabet means "ASCII letters only".
            if alphabet:
                alphabet += ascii_letters
            else:
                alphabet = ascii_letters
        elif not alphabet:
            raise ValueError('Must supply alphabet if use_ascii is False')
        # `alphabet` is guaranteed to be non-empty here, so no fallback is
        # needed. De-duplicate and sort so that the stored value does not
        # depend on how the caller ordered or repeated characters.
        self.alphabet = ''.join(sorted(set(alphabet)))
        self.wiki_start_pages = wiki_start_pages

    def __repr__(self):
        """Return ``ClassName(attr=value, ...)`` for all public attributes."""
        public_attrs = ', '.join('{}={!r}'.format(attr, value)
                                 for attr, value in self.__dict__.items()
                                 if not attr.startswith('_'))
        return '{}({})'.format(self.__class__.__name__, public_attrs)
|
58 |
|
59 |
|
# Registry of the languages we have metadata for, keyed by the English
# language name (the same value that is passed as ``name``).
LANGUAGES = {
    'Arabic': Language(
        name='Arabic',
        iso_code='ar',
        use_ascii=False,
        # We only support encodings that use isolated
        # forms, because the current recommendation is
        # that the rendering system handles presentation
        # forms. This means we purposefully skip IBM864.
        # NOTE(review): 'CP864' below appears to be the same code page as
        # IBM864 (Python treats the two names as aliases), which contradicts
        # the comment above -- confirm whether it should be dropped.
        charsets=['ISO-8859-6', 'WINDOWS-1256',
                  'CP720', 'CP864'],
        alphabet=u'ءآأؤإئابةتثجحخدذرزسشصضطظعغػؼؽؾؿـفقكلمنهوىيًٌٍَُِّ',
        wiki_start_pages=[u'الصفحة_الرئيسية'],
    ),
    'Belarusian': Language(
        name='Belarusian',
        iso_code='be',
        use_ascii=False,
        charsets=['ISO-8859-5', 'WINDOWS-1251',
                  'IBM866', 'MacCyrillic'],
        alphabet=(u'АБВГДЕЁЖЗІЙКЛМНОПРСТУЎФХЦЧШЫЬЭЮЯ'
                  u'абвгдеёжзійклмнопрстуўфхцчшыьэюяʼ'),
        wiki_start_pages=[u'Галоўная_старонка'],
    ),
    'Bulgarian': Language(
        name='Bulgarian',
        iso_code='bg',
        use_ascii=False,
        charsets=['ISO-8859-5', 'WINDOWS-1251',
                  'IBM855'],
        alphabet=(u'АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЬЮЯ'
                  u'абвгдежзийклмнопрстуфхцчшщъьюя'),
        wiki_start_pages=[u'Начална_страница'],
    ),
    'Czech': Language(
        name='Czech',
        # ISO 639-1 for Czech is "cs"; "cz" is the country code.
        iso_code='cs',
        use_ascii=True,
        charsets=['ISO-8859-2', 'WINDOWS-1250'],
        alphabet=u'áčďéěíňóřšťúůýžÁČĎÉĚÍŇÓŘŠŤÚŮÝŽ',
        wiki_start_pages=[u'Hlavní_strana'],
    ),
    'Danish': Language(
        name='Danish',
        iso_code='da',
        use_ascii=True,
        charsets=['ISO-8859-1', 'ISO-8859-15',
                  'WINDOWS-1252'],
        alphabet=u'æøåÆØÅ',
        wiki_start_pages=[u'Forside'],
    ),
    'German': Language(
        name='German',
        iso_code='de',
        use_ascii=True,
        charsets=['ISO-8859-1', 'WINDOWS-1252'],
        alphabet=u'äöüßÄÖÜ',
        wiki_start_pages=[u'Wikipedia:Hauptseite'],
    ),
    'Greek': Language(
        name='Greek',
        iso_code='el',
        use_ascii=False,
        charsets=['ISO-8859-7', 'WINDOWS-1253'],
        alphabet=(u'αβγδεζηθικλμνξοπρσςτυφχψωάέήίόύώ'
                  u'ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΣΤΥΦΧΨΩΆΈΉΊΌΎΏ'),
        wiki_start_pages=[u'Πύλη:Κύρια'],
    ),
    'English': Language(
        name='English',
        iso_code='en',
        use_ascii=True,
        charsets=['ISO-8859-1', 'WINDOWS-1252'],
        wiki_start_pages=[u'Main_Page'],
    ),
    'Esperanto': Language(
        name='Esperanto',
        iso_code='eo',
        # Q, W, X, and Y not used at all
        use_ascii=False,
        charsets=['ISO-8859-3'],
        alphabet=(u'abcĉdefgĝhĥijĵklmnoprsŝtuŭvz'
                  u'ABCĈDEFGĜHĤIJĴKLMNOPRSŜTUŬVZ'),
        wiki_start_pages=[u'Vikipedio:Ĉefpaĝo'],
    ),
    'Spanish': Language(
        name='Spanish',
        iso_code='es',
        use_ascii=True,
        charsets=['ISO-8859-1', 'ISO-8859-15',
                  'WINDOWS-1252'],
        alphabet=u'ñáéíóúüÑÁÉÍÓÚÜ',
        wiki_start_pages=[u'Wikipedia:Portada'],
    ),
    'Estonian': Language(
        name='Estonian',
        iso_code='et',
        use_ascii=False,
        charsets=['ISO-8859-4', 'ISO-8859-13',
                  'WINDOWS-1257'],
        # C, F, Š, Q, W, X, Y, Z, Ž are only for
        # loanwords
        alphabet=(u'ABDEGHIJKLMNOPRSTUVÕÄÖÜ'
                  u'abdeghijklmnoprstuvõäöü'),
        wiki_start_pages=[u'Esileht'],
    ),
    'Finnish': Language(
        name='Finnish',
        iso_code='fi',
        use_ascii=True,
        charsets=['ISO-8859-1', 'ISO-8859-15',
                  'WINDOWS-1252'],
        alphabet=u'ÅÄÖŠŽåäöšž',
        wiki_start_pages=[u'Wikipedia:Etusivu'],
    ),
    'French': Language(
        name='French',
        iso_code='fr',
        use_ascii=True,
        charsets=['ISO-8859-1', 'ISO-8859-15',
                  'WINDOWS-1252'],
        alphabet=u'œàâçèéîïùûêŒÀÂÇÈÉÎÏÙÛÊ',
        wiki_start_pages=[u'Wikipédia:Accueil_principal',
                          u'Bœuf (animal)'],
    ),
    'Hebrew': Language(
        name='Hebrew',
        iso_code='he',
        use_ascii=False,
        charsets=['ISO-8859-8', 'WINDOWS-1255'],
        alphabet=u'אבגדהוזחטיךכלםמןנסעףפץצקרשתװױײ',
        wiki_start_pages=[u'עמוד_ראשי'],
    ),
    'Croatian': Language(
        name='Croatian',
        iso_code='hr',
        # Q, W, X, Y are only used for foreign words.
        use_ascii=False,
        charsets=['ISO-8859-2', 'WINDOWS-1250'],
        alphabet=(u'abcčćdđefghijklmnoprsštuvzž'
                  u'ABCČĆDĐEFGHIJKLMNOPRSŠTUVZŽ'),
        wiki_start_pages=[u'Glavna_stranica'],
    ),
    'Hungarian': Language(
        name='Hungarian',
        iso_code='hu',
        # Q, W, X, Y are only used for foreign words.
        use_ascii=False,
        charsets=['ISO-8859-2', 'WINDOWS-1250'],
        alphabet=(u'abcdefghijklmnoprstuvzáéíóöőúüű'
                  u'ABCDEFGHIJKLMNOPRSTUVZÁÉÍÓÖŐÚÜŰ'),
        wiki_start_pages=[u'Kezdőlap'],
    ),
    'Italian': Language(
        name='Italian',
        iso_code='it',
        use_ascii=True,
        charsets=['ISO-8859-1', 'ISO-8859-15',
                  'WINDOWS-1252'],
        alphabet=u'ÀÈÉÌÒÓÙàèéìòóù',
        wiki_start_pages=[u'Pagina_principale'],
    ),
    'Lithuanian': Language(
        name='Lithuanian',
        iso_code='lt',
        use_ascii=False,
        charsets=['ISO-8859-13', 'WINDOWS-1257',
                  'ISO-8859-4'],
        # Q, W, and X not used at all
        alphabet=(u'AĄBCČDEĘĖFGHIĮYJKLMNOPRSŠTUŲŪVZŽ'
                  u'aąbcčdeęėfghiįyjklmnoprsštuųūvzž'),
        wiki_start_pages=[u'Pagrindinis_puslapis'],
    ),
    'Latvian': Language(
        name='Latvian',
        iso_code='lv',
        use_ascii=False,
        charsets=['ISO-8859-13', 'WINDOWS-1257',
                  'ISO-8859-4'],
        # Q, W, X, Y are only for loanwords
        alphabet=(u'AĀBCČDEĒFGĢHIĪJKĶLĻMNŅOPRSŠTUŪVZŽ'
                  u'aābcčdeēfgģhiījkķlļmnņoprsštuūvzž'),
        wiki_start_pages=[u'Sākumlapa'],
    ),
    'Macedonian': Language(
        name='Macedonian',
        iso_code='mk',
        use_ascii=False,
        charsets=['ISO-8859-5', 'WINDOWS-1251',
                  'MacCyrillic', 'IBM855'],
        alphabet=(u'АБВГДЃЕЖЗЅИЈКЛЉМНЊОПРСТЌУФХЦЧЏШ'
                  u'абвгдѓежзѕијклљмнњопрстќуфхцчџш'),
        wiki_start_pages=[u'Главна_страница'],
    ),
    'Dutch': Language(
        name='Dutch',
        iso_code='nl',
        use_ascii=True,
        charsets=['ISO-8859-1', 'WINDOWS-1252'],
        wiki_start_pages=[u'Hoofdpagina'],
    ),
    'Polish': Language(
        name='Polish',
        iso_code='pl',
        # Q and X are only used for foreign words.
        use_ascii=False,
        charsets=['ISO-8859-2', 'WINDOWS-1250'],
        alphabet=(u'AĄBCĆDEĘFGHIJKLŁMNŃOÓPRSŚTUWYZŹŻ'
                  u'aąbcćdeęfghijklłmnńoóprsśtuwyzźż'),
        wiki_start_pages=[u'Wikipedia:Strona_główna'],
    ),
    'Portuguese': Language(
        name='Portuguese',
        iso_code='pt',
        use_ascii=True,
        charsets=['ISO-8859-1', 'ISO-8859-15',
                  'WINDOWS-1252'],
        alphabet=u'ÁÂÃÀÇÉÊÍÓÔÕÚáâãàçéêíóôõú',
        wiki_start_pages=[u'Wikipédia:Página_principal'],
    ),
    'Romanian': Language(
        name='Romanian',
        iso_code='ro',
        use_ascii=True,
        charsets=['ISO-8859-2', 'WINDOWS-1250'],
        alphabet=u'ăâîșțĂÂÎȘȚ',
        wiki_start_pages=[u'Pagina_principală'],
    ),
    'Russian': Language(
        name='Russian',
        iso_code='ru',
        use_ascii=False,
        charsets=['ISO-8859-5', 'WINDOWS-1251',
                  'KOI8-R', 'MacCyrillic', 'IBM866',
                  'IBM855'],
        alphabet=(u'абвгдеёжзийклмнопрстуфхцчшщъыьэюя'
                  u'АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ'),
        wiki_start_pages=[u'Заглавная_страница'],
    ),
    'Slovak': Language(
        name='Slovak',
        iso_code='sk',
        use_ascii=True,
        charsets=['ISO-8859-2', 'WINDOWS-1250'],
        alphabet=u'áäčďéíĺľňóôŕšťúýžÁÄČĎÉÍĹĽŇÓÔŔŠŤÚÝŽ',
        wiki_start_pages=[u'Hlavná_stránka'],
    ),
    'Slovene': Language(
        name='Slovene',
        iso_code='sl',
        # Q, W, X, Y are only used for foreign words.
        use_ascii=False,
        charsets=['ISO-8859-2', 'WINDOWS-1250'],
        alphabet=(u'abcčdefghijklmnoprsštuvzž'
                  u'ABCČDEFGHIJKLMNOPRSŠTUVZŽ'),
        wiki_start_pages=[u'Glavna_stran'],
    ),
    # Serbian can be written in both Latin and Cyrillic, but there's no
    # simple way to get the Latin alphabet pages from Wikipedia through
    # the API, so for now we just support Cyrillic.
    'Serbian': Language(
        name='Serbian',
        iso_code='sr',
        # Cyrillic only (see above), so ASCII letters must not be added to
        # the alphabet; this matches the other Cyrillic-script entries.
        use_ascii=False,
        alphabet=(u'АБВГДЂЕЖЗИЈКЛЉМНЊОПРСТЋУФХЦЧЏШ'
                  u'абвгдђежзијклљмнњопрстћуфхцчџш'),
        charsets=['ISO-8859-5', 'WINDOWS-1251',
                  'MacCyrillic', 'IBM855'],
        wiki_start_pages=[u'Главна_страна'],
    ),
    'Thai': Language(
        name='Thai',
        iso_code='th',
        use_ascii=False,
        charsets=['ISO-8859-11', 'TIS-620', 'CP874'],
        alphabet=u'กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรฤลฦวศษสหฬอฮฯะัาำิีึืฺุู฿เแโใไๅๆ็่้๊๋์ํ๎๏๐๑๒๓๔๕๖๗๘๙๚๛',
        wiki_start_pages=[u'หน้าหลัก'],
    ),
    'Turkish': Language(
        name='Turkish',
        iso_code='tr',
        # Q, W, and X are not used by Turkish
        use_ascii=False,
        charsets=['ISO-8859-3', 'ISO-8859-9',
                  'WINDOWS-1254'],
        alphabet=(u'abcçdefgğhıijklmnoöprsştuüvyzâîû'
                  u'ABCÇDEFGĞHIİJKLMNOÖPRSŞTUÜVYZÂÎÛ'),
        wiki_start_pages=[u'Ana_Sayfa'],
    ),
    'Vietnamese': Language(
        name='Vietnamese',
        iso_code='vi',
        use_ascii=False,
        # Windows-1258 is the only common 8-bit
        # Vietnamese encoding supported by Python.
        # From Wikipedia:
        #   For systems that lack support for Unicode,
        #   dozens of 8-bit Vietnamese code pages are
        #   available.[1] The most common are VISCII
        #   (TCVN 5712:1993), VPS, and Windows-1258.[3]
        #   Where ASCII is required, such as when
        #   ensuring readability in plain text e-mail,
        #   Vietnamese letters are often encoded
        #   according to Vietnamese Quoted-Readable
        #   (VIQR) or VSCII Mnemonic (VSCII-MNEM),[4]
        #   though usage of either variable-width
        #   scheme has declined dramatically following
        #   the adoption of Unicode on the World Wide
        #   Web.
        charsets=['WINDOWS-1258'],
        alphabet=(u'aăâbcdđeêghiklmnoôơpqrstuưvxy'
                  u'AĂÂBCDĐEÊGHIKLMNOÔƠPQRSTUƯVXY'),
        wiki_start_pages=[u'Chữ_Quốc_ngữ'],
    ),
}
|