203 rv = tag_re.search(text[:1000]) is not None |
203 rv = tag_re.search(text[:1000]) is not None |
204 _looks_like_xml_cache[key] = rv |
204 _looks_like_xml_cache[key] = rv |
205 return rv |
205 return rv |
206 |
206 |
207 |
207 |
208 # Python narrow build compatibility |
208 def surrogatepair(c): |
209 |
209 """Given a unicode character code with length greater than 16 bits, |
210 def _surrogatepair(c): |
210 return the two 16 bit surrogate pair. |
211 # Given a unicode character code |
211 """ |
212 # with length greater than 16 bits, |
|
213 # return the two 16 bit surrogate pair. |
|
214 # From example D28 of: |
212 # From example D28 of: |
215 # http://www.unicode.org/book/ch03.pdf |
213 # http://www.unicode.org/book/ch03.pdf |
216 return (0xd7c0 + (c >> 10), (0xdc00 + (c & 0x3ff))) |
214 return (0xd7c0 + (c >> 10), (0xdc00 + (c & 0x3ff))) |
217 |
|
218 |
|
def unirange(a, b):
    """Return a regular expression string matching the non-BMP range a..b.

    ``a`` and ``b`` are integer code points, both inclusive, and both must be
    at least 0x10000.

    Raises ``ValueError`` if ``b < a`` or if either bound lies inside the BMP.
    """
    if b < a:
        raise ValueError("Bad character range")
    if a < 0x10000 or b < 0x10000:
        raise ValueError("unirange is only defined for non-BMP ranges")

    if sys.maxunicode > 0xffff:
        # wide build
        return u'[%s-%s]' % (chr(a), chr(b))

    # narrow build stores surrogates, and the 're' module handles them
    # (incorrectly) as characters.  Since there is still ordering among
    # these characters, expand the range to one that it understands.  Some
    # background in http://bugs.python.org/issue3665 and
    # http://bugs.python.org/issue12749
    #
    # Additionally, the lower constants are using chr rather than
    # literals because jython [which uses the wide path] can't load this
    # file if they are literals.
    ah, al = _surrogatepair(a)
    bh, bl = _surrogatepair(b)
    if ah == bh:
        # Both ends share a high surrogate: one low-surrogate class suffices.
        return u'(?:%s[%s-%s])' % (chr(ah), chr(al), chr(bl))

    # First high surrogate: from a's low surrogate up to the last one.
    buf = [u'%s[%s-%s]' % (chr(ah), chr(al), chr(0xdfff))]
    # High surrogates strictly between the two ends pair with every low
    # surrogate.  a <= b implies ah <= bh, so the gap is bh - ah.
    if bh - ah > 1:
        buf.append(u'[%s-%s][%s-%s]' %
                   (chr(ah + 1), chr(bh - 1), chr(0xdc00), chr(0xdfff)))
    # Last high surrogate: from the first low surrogate up to b's.
    buf.append(u'%s[%s-%s]' % (chr(bh), chr(0xdc00), chr(bl)))

    return u'(?:' + u'|'.join(buf) + u')'
|
255 |
215 |
256 |
216 |
257 def format_lines(var_name, seq, raw=False, indent_level=0): |
217 def format_lines(var_name, seq, raw=False, indent_level=0): |
258 """Formats a sequence of strings for output.""" |
218 """Formats a sequence of strings for output.""" |
259 lines = [] |
219 lines = [] |