Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
hhhrrrttt222111
GitHub Repository: hhhrrrttt222111/Dorkify
Path: blob/master/venv/Lib/site-packages/bs4/dammit.py
811 views
1
# -*- coding: utf-8 -*-
2
"""Beautiful Soup bonus library: Unicode, Dammit
3
4
This library converts a bytestream to Unicode through any means
5
necessary. It is heavily based on code from Mark Pilgrim's Universal
6
Feed Parser. It works best on XML and HTML, but it does not rewrite the
7
XML or HTML to reflect a new encoding; that's the tree builder's job.
8
"""
9
# Use of this source code is governed by the MIT license.
10
__license__ = "MIT"
11
12
import codecs
13
from html.entities import codepoint2name
14
import re
15
import logging
16
import string
17
18
# Import a library to autodetect character encodings.
#
# Whichever detector is available, ``chardet_dammit(s)`` is defined with
# the same contract: it returns None for ``str`` input (already decoded,
# nothing to detect) and otherwise the detector's best guess, which may
# itself be None.
chardet_type = None
try:
    # First try the fast C implementation.
    #  PyPI package: cchardet
    import cchardet
except ImportError:
    try:
        # Fall back to the pure Python implementation.
        #  Debian package: python-chardet
        #  PyPI package: chardet
        import chardet
    except ImportError:
        # No chardet available.
        def chardet_dammit(s):
            """Return None: no character-set detector is installed."""
            return None
    else:
        def chardet_dammit(s):
            """Guess the encoding of bytestring `s` using chardet.

            :param s: Markup, normally a bytestring.
            :return: The 'encoding' entry of chardet's result, or None
                if `s` is already a str.
            """
            if isinstance(s, str):
                return None
            return chardet.detect(s)['encoding']
else:
    def chardet_dammit(s):
        """Guess the encoding of bytestring `s` using cchardet.

        :param s: Markup, normally a bytestring.
        :return: The 'encoding' entry of cchardet's result, or None
            if `s` is already a str.
        """
        if isinstance(s, str):
            return None
        return cchardet.detect(s)['encoding']

# Available from http://cjkpython.i18n.org/.
#
# TODO: This doesn't work anymore and the closest thing, iconv_codecs,
# is GPL-licensed. Check whether this is still necessary.
try:
    # Imported only for its side effects; the name is not used below.
    import iconv_codec
except ImportError:
    pass
53
54
# Build bytestring and Unicode versions of regular expressions for finding
# a declared encoding inside an XML or HTML document.
#
# In both patterns, group 1 captures the declared encoding name.
xml_encoding = '^\\s*<\\?.*encoding=[\'"](.*?)[\'"].*\\?>'
html_meta = '<\\s*meta[^>]+charset\\s*=\\s*["\']?([^>]*?)[ /;\'">]'

# Keyed first by the type of the markup being searched (bytes or str),
# then by document flavour ('html' or 'xml'). The patterns are pure
# ASCII, so the bytes variants are obtained by encoding the str source.
encoding_res = {
    bytes: {
        'html': re.compile(html_meta.encode("ascii"), re.I),
        'xml': re.compile(xml_encoding.encode("ascii"), re.I),
    },
    str: {
        'html': re.compile(html_meta, re.I),
        'xml': re.compile(xml_encoding, re.I),
    },
}
67
68
class EntitySubstitution(object):
    """The ability to substitute XML or HTML entities for certain characters."""

    def _populate_class_variables():
        """Build the HTML entity tables used by this class.

        Runs once, while the class body is being executed.

        :return: A 3-tuple (character -> entity name,
            entity name -> character, compiled regex matching any single
            character that has an entry in the first mapping).
        """
        lookup = {}
        reverse_lookup = {}
        characters_for_re = []

        # &apos is an XHTML entity and an HTML 5, but not an HTML 4
        # entity. We don't want to use it, but we want to recognize it on
        # the way in.
        #
        # TODO: Ideally we would be able to recognize all HTML 5 named
        # entities, but that's a little tricky.
        known_entities = list(codepoint2name.items()) + [(39, 'apos')]
        for codepoint, name in known_entities:
            character = chr(codepoint)
            if codepoint not in (34, 39):
                # There's no point in turning the quotation mark into
                # &quot; or the single quote into &apos;, unless it
                # happens within an attribute value, which is handled
                # elsewhere.
                characters_for_re.append(character)
                lookup[character] = name
            # But we do want to recognize those entities on the way in
            # and convert them to Unicode characters.
            reverse_lookup[name] = character
        re_definition = "[%s]" % "".join(characters_for_re)
        return lookup, reverse_lookup, re.compile(re_definition)

    (CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER,
     CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables()

    # The five characters that have predefined XML entities.
    CHARACTER_TO_XML_ENTITY = {
        "'": "apos",
        '"': "quot",
        "&": "amp",
        "<": "lt",
        ">": "gt",
    }

    # An angle bracket, or an ampersand that does not start something
    # shaped like an entity reference (&#123; &#x1F; &name;).
    BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
                                           "&(?!#\\d+;|#x[0-9a-fA-F]+;|\\w+;)"
                                           ")")

    # Any angle bracket or ampersand at all.
    AMPERSAND_OR_BRACKET = re.compile("([<>&])")

    @classmethod
    def _substitute_html_entity(cls, matchobj):
        """Regex callback: return the named HTML entity for the matched
        character.

        :param matchobj: A match produced by CHARACTER_TO_HTML_ENTITY_RE.
        """
        entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))
        return "&%s;" % entity

    @classmethod
    def _substitute_xml_entity(cls, matchobj):
        """Regex callback: return the XML entity for the matched character.

        :param matchobj: A match whose text is a key of
            CHARACTER_TO_XML_ENTITY.
        """
        entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
        return "&%s;" % entity

    @classmethod
    def quoted_attribute_value(cls, value):
        """Make a value into a quoted XML attribute, possibly escaping it.

        Most strings will be quoted using double quotes.

         Bob's Bar -> "Bob's Bar"

        If a string contains double quotes, it will be quoted using
        single quotes.

         Welcome to "my bar" -> 'Welcome to "my bar"'

        If a string contains both single and double quotes, the
        double quotes will be escaped, and the string will be quoted
        using double quotes.

         Welcome to "Bob's Bar" -> "Welcome to &quot;Bob's bar&quot;"

        :param value: The attribute value, as a string.
        :return: `value` wrapped in quotation marks.
        """
        has_double = '"' in value
        if has_double and "'" in value:
            # The string contains both single and double quotes. Turn
            # the double quotes into entities. We escape the double
            # quotes rather than the single quotes because the entity
            # name is "&quot;" whether this is HTML or XML. If we
            # escaped the single quotes, we'd have to decide between
            # &apos; and &squot;.
            return '"' + value.replace('"', "&quot;") + '"'
        if has_double:
            # There are double quotes but no single quotes, so single
            # quotes can delimit the attribute without any escaping.
            return "'" + value + "'"
        return '"' + value + '"'

    @classmethod
    def substitute_xml(cls, value, make_quoted_attribute=False):
        """Substitute XML entities for special XML characters.

        :param value: A string to be substituted. The less-than sign
          will become &lt;, the greater-than sign will become &gt;,
          and any ampersands will become &amp;. If you want ampersands
          that appear to be part of an entity definition to be left
          alone, use substitute_xml_containing_entities() instead.

        :param make_quoted_attribute: If True, then the string will be
         quoted, as befits an attribute value.
        :return: The substituted (and possibly quoted) string.
        """
        # Escape angle brackets and ampersands.
        value = cls.AMPERSAND_OR_BRACKET.sub(
            cls._substitute_xml_entity, value)

        if make_quoted_attribute:
            value = cls.quoted_attribute_value(value)
        return value

    @classmethod
    def substitute_xml_containing_entities(
            cls, value, make_quoted_attribute=False):
        """Substitute XML entities for special XML characters.

        :param value: A string to be substituted. The less-than sign will
          become &lt;, the greater-than sign will become &gt;, and any
          ampersands that are not part of an entity definition will
          become &amp;.

        :param make_quoted_attribute: If True, then the string will be
         quoted, as befits an attribute value.
        :return: The substituted (and possibly quoted) string.
        """
        # Escape angle brackets, and ampersands that aren't part of
        # entities.
        value = cls.BARE_AMPERSAND_OR_BRACKET.sub(
            cls._substitute_xml_entity, value)

        if make_quoted_attribute:
            value = cls.quoted_attribute_value(value)
        return value

    @classmethod
    def substitute_html(cls, s):
        """Replace certain Unicode characters with named HTML entities.

        This differs from data.encode(encoding, 'xmlcharrefreplace')
        in that the goal is to make the result more readable (to those
        with ASCII displays) rather than to recover from
        errors. There's absolutely nothing wrong with a UTF-8 string
        containing a LATIN SMALL LETTER E WITH ACUTE, but replacing that
        character with "&eacute;" will make it more readable to some
        people.

        :param s: A Unicode string.
        :return: `s` with every character that has an entry in
            CHARACTER_TO_HTML_ENTITY replaced by "&name;".
        """
        return cls.CHARACTER_TO_HTML_ENTITY_RE.sub(
            cls._substitute_html_entity, s)
223
224
225
class EncodingDetector:
    """Suggests a number of possible encodings for a bytestring.

    Order of precedence:

    1. Encodings you specifically tell EncodingDetector to try first
    (the override_encodings argument to the constructor).

    2. An encoding implied by a byte-order mark at the start of the
    bytestring.

    3. An encoding declared within the bytestring itself, either in an
    XML declaration (if the bytestring is to be interpreted as an XML
    document), or in a <meta> tag (if the bytestring is to be
    interpreted as an HTML document.)

    4. An encoding detected through textual analysis by chardet,
    cchardet, or a similar external library.

    5. UTF-8.

    6. Windows-1252.
    """

    def __init__(self, markup, override_encodings=None, is_html=False,
                 exclude_encodings=None):
        """Constructor.

        :param markup: Some markup in an unknown encoding.
        :param override_encodings: These encodings will be tried first.
        :param is_html: If True, this markup is considered to be HTML. Otherwise
            it's assumed to be XML.
        :param exclude_encodings: These encodings will not be tried, even
            if they otherwise would be.
        """
        self.override_encodings = override_encodings or []
        # Exclusions are compared case-insensitively (see _usable).
        self.exclude_encodings = {
            x.lower() for x in (exclude_encodings or [])}
        self.chardet_encoding = None
        self.is_html = is_html
        self.declared_encoding = None

        # First order of business: strip a byte-order mark.
        self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup)

    def _usable(self, encoding, tried):
        """Should we even bother to try this encoding?

        :param encoding: Name of an encoding, or None.
        :param tried: Set of (lowercased) encodings that have already
            been tried. This will be modified as a side effect.
        :return: True if `encoding` is not None, not excluded, and has
            not been tried before.
        """
        if encoding is None:
            return False
        encoding = encoding.lower()
        if encoding in self.exclude_encodings or encoding in tried:
            return False
        tried.add(encoding)
        return True

    @property
    def encodings(self):
        """Yield a number of encodings that might work for this markup.

        Each candidate is yielded at most once per iteration, and
        excluded encodings are never yielded.

        :yield: A sequence of strings.
        """
        tried = set()
        for e in self.override_encodings:
            if self._usable(e, tried):
                yield e

        # Did the document originally start with a byte-order mark
        # that indicated its encoding?
        if self._usable(self.sniffed_encoding, tried):
            yield self.sniffed_encoding

        # Look within the document for an XML or HTML encoding
        # declaration. The result is cached on the instance.
        if self.declared_encoding is None:
            self.declared_encoding = self.find_declared_encoding(
                self.markup, self.is_html)
        if self._usable(self.declared_encoding, tried):
            yield self.declared_encoding

        # Use third-party character set detection to guess at the
        # encoding. The result is cached on the instance.
        if self.chardet_encoding is None:
            self.chardet_encoding = chardet_dammit(self.markup)
        if self._usable(self.chardet_encoding, tried):
            yield self.chardet_encoding

        # As a last-ditch effort, try utf-8 and windows-1252.
        for e in ('utf-8', 'windows-1252'):
            if self._usable(e, tried):
                yield e

    @classmethod
    def strip_byte_order_mark(cls, data):
        """If a byte-order mark is present, strip it and return the encoding it implies.

        :param data: Some markup.
        :return: A 2-tuple (modified data, implied encoding). The
            encoding is None when no byte-order mark was recognized.
        """
        encoding = None
        if isinstance(data, str):
            # Unicode data cannot have a byte-order mark.
            return data, encoding

        # The UTF-32LE mark (FF FE 00 00) begins with the UTF-16LE mark
        # (FF FE), so the 32-bit marks have to be ruled out before the
        # 16-bit ones. (An earlier version tried to do this by comparing
        # data[2:4] against the *str* '\x00\x00', which is never equal
        # to a bytes object, so UTF-32LE was reported as UTF-16LE.)
        if data[:4] == b'\xff\xfe\x00\x00':
            encoding = 'utf-32le'
            data = data[4:]
        elif data[:4] == b'\x00\x00\xfe\xff':
            encoding = 'utf-32be'
            data = data[4:]
        elif data[:3] == b'\xef\xbb\xbf':
            encoding = 'utf-8'
            data = data[3:]
        elif len(data) >= 4 and data[:2] == b'\xfe\xff':
            encoding = 'utf-16be'
            data = data[2:]
        elif len(data) >= 4 and data[:2] == b'\xff\xfe':
            encoding = 'utf-16le'
            data = data[2:]
        return data, encoding

    @classmethod
    def find_declared_encoding(cls, markup, is_html=False, search_entire_document=False):
        """Given a document, tries to find its declared encoding.

        An XML encoding is declared at the beginning of the document.

        An HTML encoding is declared in a <meta> tag, hopefully near the
        beginning of the document.

        :param markup: Some markup.
        :param is_html: If True, this markup is considered to be HTML. Otherwise
            it's assumed to be XML.
        :param search_entire_document: Since an encoding is supposed to
            be declared near the beginning of the document, most of the
            time it's only necessary to search a few kilobytes of data.
            Set this to True to force this method to search the entire
            document.
        :return: The declared encoding, lowercased, or None if no
            declaration was found.
        """
        if search_entire_document:
            xml_endpos = html_endpos = len(markup)
        else:
            xml_endpos = 1024
            html_endpos = max(2048, int(len(markup) * 0.05))

        # Use the regexes that match the type of the markup.
        if isinstance(markup, bytes):
            res = encoding_res[bytes]
        else:
            res = encoding_res[str]

        # An XML declaration is looked for even in HTML; the <meta>
        # search is only a fallback, and only for HTML.
        match = res['xml'].search(markup, endpos=xml_endpos)
        if not match and is_html:
            match = res['html'].search(markup, endpos=html_endpos)
        if match is None:
            return None

        declared_encoding = match.group(1)
        if not declared_encoding:
            return None
        if isinstance(declared_encoding, bytes):
            declared_encoding = declared_encoding.decode('ascii', 'replace')
        return declared_encoding.lower()
388
389
class UnicodeDammit:
    """A class for detecting the encoding of a *ML document and
    converting it to a Unicode string. If the source encoding is
    windows-1252, can replace MS smart quotes with their HTML or XML
    equivalents."""

    # This dictionary maps commonly seen values for "charset" in HTML
    # meta tags to the corresponding Python codec names. It only covers
    # values that aren't in Python's aliases and can't be determined
    # by the heuristics in find_codec.
    CHARSET_ALIASES = {"macintosh": "mac-roman",
                       "x-sjis": "shift-jis"}

    # Codecs for which smart-quote substitution is attempted.
    ENCODINGS_WITH_SMART_QUOTES = [
        "windows-1252",
        "iso-8859-1",
        "iso-8859-2",
    ]

    # Matches one byte in the range that Windows-1252 uses for smart
    # quotes and similar characters. Group 1 is the byte.
    _SMART_QUOTES_RE = re.compile(b"([\x80-\x9f])")

    def __init__(self, markup, override_encodings=None,
                 smart_quotes_to=None, is_html=False, exclude_encodings=None):
        """Constructor.

        On return, `unicode_markup` holds the decoded document (or None
        if every candidate encoding failed) and `original_encoding`
        holds the name of the codec that produced it (or None).

        :param markup: A bytestring representing markup in an unknown encoding.
        :param override_encodings: These encodings will be tried first,
            before any sniffing code is run.

        :param smart_quotes_to: By default, Microsoft smart quotes will,
            like all other characters, be converted to Unicode
            characters. Setting this to 'ascii' will convert them to
            ASCII quotes instead. Setting it to 'xml' will convert them
            to XML entity references, and setting it to 'html' will
            convert them to HTML entity references.
        :param is_html: If True, this markup is considered to be HTML. Otherwise
            it's assumed to be XML.
        :param exclude_encodings: These encodings will not be considered, even
            if the sniffing code thinks they might make sense.
        """
        self.smart_quotes_to = smart_quotes_to
        self.tried_encodings = []
        self.contains_replacement_characters = False
        self.is_html = is_html
        self.log = logging.getLogger(__name__)
        # EncodingDetector treats None the same as an empty list.
        self.detector = EncodingDetector(
            markup, override_encodings, is_html, exclude_encodings)

        # Short-circuit if the data is in Unicode to begin with.
        if isinstance(markup, str) or markup == '':
            self.markup = markup
            self.unicode_markup = str(markup)
            self.original_encoding = None
            return

        # The encoding detector may have stripped a byte-order mark.
        # Use the stripped markup from this point on.
        self.markup = self.detector.markup

        u = None
        for encoding in self.detector.encodings:
            u = self._convert_from(encoding)
            if u is not None:
                break

        if not u:
            # None of the encodings worked. As an absolute last resort,
            # try them again with character replacement.
            for encoding in self.detector.encodings:
                if encoding != "ascii":
                    u = self._convert_from(encoding, "replace")
                if u is not None:
                    self.log.warning(
                        "Some characters could not be decoded, and were "
                        "replaced with REPLACEMENT CHARACTER."
                    )
                    self.contains_replacement_characters = True
                    break

        # If none of that worked, we could at this point force it to
        # ASCII, but that would destroy so much data that I think
        # giving up is better.
        self.unicode_markup = u
        if not u:
            self.original_encoding = None

    def _sub_ms_char(self, match):
        """Changes a MS smart quote character to an XML or HTML
        entity, or an ASCII character.

        :param match: A match of _SMART_QUOTES_RE; group 1 is the byte
            to replace.
        :return: The replacement, as a bytestring.
        """
        orig = match.group(1)
        if self.smart_quotes_to == 'ascii':
            return self.MS_CHARS_TO_ASCII.get(orig).encode()

        sub = self.MS_CHARS.get(orig)
        if isinstance(sub, tuple):
            # (HTML entity name, hexadecimal code point)
            if self.smart_quotes_to == 'xml':
                return b'&#x' + sub[1].encode() + b';'
            return b'&' + sub[0].encode() + b';'
        # A plain string is a literal stand-in for a byte that has no
        # entity.
        return sub.encode()

    def _convert_from(self, proposed, errors="strict"):
        """Attempt to convert the markup to the proposed encoding.

        On success, `self.markup` is replaced by the decoded string and
        `self.original_encoding` is set to the codec name.

        :param proposed: The name of a character encoding.
        :param errors: Error-handling scheme passed to the decoder.
        :return: The decoded markup, or None if `proposed` is empty, the
            (codec, errors) pair was already tried, or decoding failed.
        """
        proposed = self.find_codec(proposed)
        if not proposed or (proposed, errors) in self.tried_encodings:
            return None
        self.tried_encodings.append((proposed, errors))
        markup = self.markup
        # Convert smart quotes to HTML if coming from an encoding
        # that might have them.
        if (self.smart_quotes_to is not None
                and proposed in self.ENCODINGS_WITH_SMART_QUOTES):
            markup = self._SMART_QUOTES_RE.sub(self._sub_ms_char, markup)

        try:
            u = self._to_unicode(markup, proposed, errors)
        except Exception:
            # Deliberately broad: any failure just means this candidate
            # is wrong and the caller should move on to the next one.
            return None
        self.markup = u
        self.original_encoding = proposed
        return self.markup

    def _to_unicode(self, data, encoding, errors="strict"):
        """Given a string and its encoding, decodes the string into Unicode.

        :param data: The bytestring to decode.
        :param encoding: The name of an encoding.
        :param errors: Error-handling scheme passed to the decoder.
        """
        return str(data, encoding, errors)

    @property
    def declared_html_encoding(self):
        """If the markup is an HTML document, returns the encoding declared _within_
        the document.
        """
        if not self.is_html:
            return None
        return self.detector.declared_encoding

    def find_codec(self, charset):
        """Convert the name of a character set to a codec name.

        Tries, in order: the CHARSET_ALIASES translation (or the name
        itself), the name with hyphens removed, and the name with
        hyphens turned into underscores. If Python knows none of them,
        the name is returned anyway.

        :param charset: The name of a character set.
        :return: The name of a codec, lowercased, or None if `charset`
            is empty.
        """
        value = (self._codec(self.CHARSET_ALIASES.get(charset, charset))
                 or (charset and self._codec(charset.replace("-", "")))
                 or (charset and self._codec(charset.replace("-", "_")))
                 or (charset and charset.lower())
                 or charset
                 )
        if value:
            return value.lower()
        return None

    def _codec(self, charset):
        """Return `charset` if Python has a codec by that name.

        :param charset: A candidate codec name.
        :return: `charset` itself if it is falsy or codecs.lookup()
            accepts it; otherwise None.
        """
        if not charset:
            return charset
        try:
            codecs.lookup(charset)
        except (LookupError, ValueError):
            return None
        return charset

    # A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities.
    #
    # A tuple value is (HTML entity name, hexadecimal code point); a
    # plain string value is used verbatim.
    MS_CHARS = {
        b'\x80': ('euro', '20AC'),
        b'\x81': ' ',
        b'\x82': ('sbquo', '201A'),
        b'\x83': ('fnof', '192'),
        b'\x84': ('bdquo', '201E'),
        b'\x85': ('hellip', '2026'),
        b'\x86': ('dagger', '2020'),
        b'\x87': ('Dagger', '2021'),
        b'\x88': ('circ', '2C6'),
        b'\x89': ('permil', '2030'),
        b'\x8a': ('Scaron', '160'),
        b'\x8b': ('lsaquo', '2039'),
        b'\x8c': ('OElig', '152'),
        b'\x8d': '?',
        b'\x8e': ('#x17D', '17D'),
        b'\x8f': '?',
        b'\x90': '?',
        b'\x91': ('lsquo', '2018'),
        b'\x92': ('rsquo', '2019'),
        b'\x93': ('ldquo', '201C'),
        b'\x94': ('rdquo', '201D'),
        b'\x95': ('bull', '2022'),
        b'\x96': ('ndash', '2013'),
        b'\x97': ('mdash', '2014'),
        b'\x98': ('tilde', '2DC'),
        b'\x99': ('trade', '2122'),
        b'\x9a': ('scaron', '161'),
        b'\x9b': ('rsaquo', '203A'),
        b'\x9c': ('oelig', '153'),
        b'\x9d': '?',
        b'\x9e': ('#x17E', '17E'),
        # U+0178 LATIN CAPITAL LETTER Y WITH DIAERESIS. The code point
        # used to be an empty string, which produced "&#x;" in XML mode.
        b'\x9f': ('Yuml', '178'),
    }

    # A parochial partial mapping of ISO-Latin-1 to ASCII. Contains
    # horrors like stripping diacritical marks to turn á into a, but also
    # contains non-horrors like turning “ into ".
    #
    # NOTE(review): the value for b'\xb4' is a tuple, unlike every other
    # entry. Within this class only bytes \x80-\x9f are ever looked up
    # here, so it is left as it was.
    MS_CHARS_TO_ASCII = {
        b'\x80': 'EUR',
        b'\x81': ' ',
        b'\x82': ',',
        b'\x83': 'f',
        b'\x84': ',,',
        b'\x85': '...',
        b'\x86': '+',
        b'\x87': '++',
        b'\x88': '^',
        b'\x89': '%',
        b'\x8a': 'S',
        b'\x8b': '<',
        b'\x8c': 'OE',
        b'\x8d': '?',
        b'\x8e': 'Z',
        b'\x8f': '?',
        b'\x90': '?',
        b'\x91': "'",
        b'\x92': "'",
        b'\x93': '"',
        b'\x94': '"',
        b'\x95': '*',
        b'\x96': '-',
        b'\x97': '--',
        b'\x98': '~',
        b'\x99': '(TM)',
        b'\x9a': 's',
        b'\x9b': '>',
        b'\x9c': 'oe',
        b'\x9d': '?',
        b'\x9e': 'z',
        b'\x9f': 'Y',
        b'\xa0': ' ',
        b'\xa1': '!',
        b'\xa2': 'c',
        b'\xa3': 'GBP',
        b'\xa4': '$',  # This approximation is especially parochial--this is
                       # the generic currency symbol.
        b'\xa5': 'YEN',
        b'\xa6': '|',
        b'\xa7': 'S',
        b'\xa8': '..',
        b'\xa9': '',
        b'\xaa': '(th)',
        b'\xab': '<<',
        b'\xac': '!',
        b'\xad': ' ',
        b'\xae': '(R)',
        b'\xaf': '-',
        b'\xb0': 'o',
        b'\xb1': '+-',
        b'\xb2': '2',
        b'\xb3': '3',
        b'\xb4': ("'", 'acute'),
        b'\xb5': 'u',
        b'\xb6': 'P',
        b'\xb7': '*',
        b'\xb8': ',',
        b'\xb9': '1',
        b'\xba': '(th)',
        b'\xbb': '>>',
        b'\xbc': '1/4',
        b'\xbd': '1/2',
        b'\xbe': '3/4',
        b'\xbf': '?',
        b'\xc0': 'A',
        b'\xc1': 'A',
        b'\xc2': 'A',
        b'\xc3': 'A',
        b'\xc4': 'A',
        b'\xc5': 'A',
        b'\xc6': 'AE',
        b'\xc7': 'C',
        b'\xc8': 'E',
        b'\xc9': 'E',
        b'\xca': 'E',
        b'\xcb': 'E',
        b'\xcc': 'I',
        b'\xcd': 'I',
        b'\xce': 'I',
        b'\xcf': 'I',
        b'\xd0': 'D',
        b'\xd1': 'N',
        b'\xd2': 'O',
        b'\xd3': 'O',
        b'\xd4': 'O',
        b'\xd5': 'O',
        b'\xd6': 'O',
        b'\xd7': '*',
        b'\xd8': 'O',
        b'\xd9': 'U',
        b'\xda': 'U',
        b'\xdb': 'U',
        b'\xdc': 'U',
        b'\xdd': 'Y',
        b'\xde': 'b',
        b'\xdf': 'B',
        b'\xe0': 'a',
        b'\xe1': 'a',
        b'\xe2': 'a',
        b'\xe3': 'a',
        b'\xe4': 'a',
        b'\xe5': 'a',
        b'\xe6': 'ae',
        b'\xe7': 'c',
        b'\xe8': 'e',
        b'\xe9': 'e',
        b'\xea': 'e',
        b'\xeb': 'e',
        b'\xec': 'i',
        b'\xed': 'i',
        b'\xee': 'i',
        b'\xef': 'i',
        b'\xf0': 'o',
        b'\xf1': 'n',
        b'\xf2': 'o',
        b'\xf3': 'o',
        b'\xf4': 'o',
        b'\xf5': 'o',
        b'\xf6': 'o',
        b'\xf7': '/',
        b'\xf8': 'o',
        b'\xf9': 'u',
        b'\xfa': 'u',
        b'\xfb': 'u',
        b'\xfc': 'u',
        b'\xfd': 'y',
        b'\xfe': 'b',
        b'\xff': 'y',
    }

    # A map used when removing rogue Windows-1252/ISO-8859-1
    # characters in otherwise UTF-8 documents.
    #
    # Note that \x81, \x8d, \x8f, \x90, and \x9d are undefined in
    # Windows-1252.
    WINDOWS_1252_TO_UTF8 = {
        0x80: b'\xe2\x82\xac',  # €
        0x82: b'\xe2\x80\x9a',  # ‚
        0x83: b'\xc6\x92',      # ƒ
        0x84: b'\xe2\x80\x9e',  # „
        0x85: b'\xe2\x80\xa6',  # …
        0x86: b'\xe2\x80\xa0',  # †
        0x87: b'\xe2\x80\xa1',  # ‡
        0x88: b'\xcb\x86',      # ˆ
        0x89: b'\xe2\x80\xb0',  # ‰
        0x8a: b'\xc5\xa0',      # Š
        0x8b: b'\xe2\x80\xb9',  # ‹
        0x8c: b'\xc5\x92',      # Œ
        0x8e: b'\xc5\xbd',      # Ž
        0x91: b'\xe2\x80\x98',  # ‘
        0x92: b'\xe2\x80\x99',  # ’
        0x93: b'\xe2\x80\x9c',  # “
        0x94: b'\xe2\x80\x9d',  # ”
        0x95: b'\xe2\x80\xa2',  # •
        0x96: b'\xe2\x80\x93',  # –
        0x97: b'\xe2\x80\x94',  # —
        0x98: b'\xcb\x9c',      # ˜
        0x99: b'\xe2\x84\xa2',  # ™
        0x9a: b'\xc5\xa1',      # š
        0x9b: b'\xe2\x80\xba',  # ›
        0x9c: b'\xc5\x93',      # œ
        0x9e: b'\xc5\xbe',      # ž
        0x9f: b'\xc5\xb8',      # Ÿ
        0xa0: b'\xc2\xa0',      # (no-break space)
        0xa1: b'\xc2\xa1',      # ¡
        0xa2: b'\xc2\xa2',      # ¢
        0xa3: b'\xc2\xa3',      # £
        0xa4: b'\xc2\xa4',      # ¤
        0xa5: b'\xc2\xa5',      # ¥
        0xa6: b'\xc2\xa6',      # ¦
        0xa7: b'\xc2\xa7',      # §
        0xa8: b'\xc2\xa8',      # ¨
        0xa9: b'\xc2\xa9',      # ©
        0xaa: b'\xc2\xaa',      # ª
        0xab: b'\xc2\xab',      # «
        0xac: b'\xc2\xac',      # ¬
        0xad: b'\xc2\xad',      # (soft hyphen)
        0xae: b'\xc2\xae',      # ®
        0xaf: b'\xc2\xaf',      # ¯
        0xb0: b'\xc2\xb0',      # °
        0xb1: b'\xc2\xb1',      # ±
        0xb2: b'\xc2\xb2',      # ²
        0xb3: b'\xc2\xb3',      # ³
        0xb4: b'\xc2\xb4',      # ´
        0xb5: b'\xc2\xb5',      # µ
        0xb6: b'\xc2\xb6',      # ¶
        0xb7: b'\xc2\xb7',      # ·
        0xb8: b'\xc2\xb8',      # ¸
        0xb9: b'\xc2\xb9',      # ¹
        0xba: b'\xc2\xba',      # º
        0xbb: b'\xc2\xbb',      # »
        0xbc: b'\xc2\xbc',      # ¼
        0xbd: b'\xc2\xbd',      # ½
        0xbe: b'\xc2\xbe',      # ¾
        0xbf: b'\xc2\xbf',      # ¿
        0xc0: b'\xc3\x80',      # À
        0xc1: b'\xc3\x81',      # Á
        0xc2: b'\xc3\x82',      # Â
        0xc3: b'\xc3\x83',      # Ã
        0xc4: b'\xc3\x84',      # Ä
        0xc5: b'\xc3\x85',      # Å
        0xc6: b'\xc3\x86',      # Æ
        0xc7: b'\xc3\x87',      # Ç
        0xc8: b'\xc3\x88',      # È
        0xc9: b'\xc3\x89',      # É
        0xca: b'\xc3\x8a',      # Ê
        0xcb: b'\xc3\x8b',      # Ë
        0xcc: b'\xc3\x8c',      # Ì
        0xcd: b'\xc3\x8d',      # Í
        0xce: b'\xc3\x8e',      # Î
        0xcf: b'\xc3\x8f',      # Ï
        0xd0: b'\xc3\x90',      # Ð
        0xd1: b'\xc3\x91',      # Ñ
        0xd2: b'\xc3\x92',      # Ò
        0xd3: b'\xc3\x93',      # Ó
        0xd4: b'\xc3\x94',      # Ô
        0xd5: b'\xc3\x95',      # Õ
        0xd6: b'\xc3\x96',      # Ö
        0xd7: b'\xc3\x97',      # ×
        0xd8: b'\xc3\x98',      # Ø
        0xd9: b'\xc3\x99',      # Ù
        0xda: b'\xc3\x9a',      # Ú
        0xdb: b'\xc3\x9b',      # Û
        0xdc: b'\xc3\x9c',      # Ü
        0xdd: b'\xc3\x9d',      # Ý
        0xde: b'\xc3\x9e',      # Þ
        0xdf: b'\xc3\x9f',      # ß
        0xe0: b'\xc3\xa0',      # à
        0xe1: b'\xc3\xa1',      # á (was b'\xa1', which is not valid UTF-8)
        0xe2: b'\xc3\xa2',      # â
        0xe3: b'\xc3\xa3',      # ã
        0xe4: b'\xc3\xa4',      # ä
        0xe5: b'\xc3\xa5',      # å
        0xe6: b'\xc3\xa6',      # æ
        0xe7: b'\xc3\xa7',      # ç
        0xe8: b'\xc3\xa8',      # è
        0xe9: b'\xc3\xa9',      # é
        0xea: b'\xc3\xaa',      # ê
        0xeb: b'\xc3\xab',      # ë
        0xec: b'\xc3\xac',      # ì
        0xed: b'\xc3\xad',      # í
        0xee: b'\xc3\xae',      # î
        0xef: b'\xc3\xaf',      # ï
        0xf0: b'\xc3\xb0',      # ð
        0xf1: b'\xc3\xb1',      # ñ
        0xf2: b'\xc3\xb2',      # ò
        0xf3: b'\xc3\xb3',      # ó
        0xf4: b'\xc3\xb4',      # ô
        0xf5: b'\xc3\xb5',      # õ
        0xf6: b'\xc3\xb6',      # ö
        0xf7: b'\xc3\xb7',      # ÷
        0xf8: b'\xc3\xb8',      # ø
        0xf9: b'\xc3\xb9',      # ù
        0xfa: b'\xc3\xba',      # ú
        0xfb: b'\xc3\xbb',      # û
        0xfc: b'\xc3\xbc',      # ü
        0xfd: b'\xc3\xbd',      # ý
        0xfe: b'\xc3\xbe',      # þ
    }

    MULTIBYTE_MARKERS_AND_SIZES = [
        (0xc2, 0xdf, 2),  # 2-byte characters start with a byte C2-DF
        (0xe0, 0xef, 3),  # 3-byte characters start with E0-EF
        (0xf0, 0xf4, 4),  # 4-byte characters start with F0-F4
    ]

    FIRST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[0][0]
    LAST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[-1][1]

    @classmethod
    def detwingle(cls, in_bytes, main_encoding="utf8",
                  embedded_encoding="windows-1252"):
        """Fix characters from one encoding embedded in some other encoding.

        Currently the only situation supported is Windows-1252 (or its
        subset ISO-8859-1), embedded in UTF-8.

        Any byte in the range C2-F4 is taken to start a UTF-8 multibyte
        sequence and is skipped together with the bytes that follow it,
        without checking that those bytes are valid continuation bytes.

        :param in_bytes: A bytestring that you suspect contains
            characters from multiple encodings. Note that this _must_
            be a bytestring. If you've already converted the document
            to Unicode, you're too late.
        :param main_encoding: The primary encoding of `in_bytes`.
        :param embedded_encoding: The encoding that was used to embed characters
            in the main document.
        :return: A bytestring in which `embedded_encoding`
            characters have been converted to their `main_encoding`
            equivalents.
        :raise NotImplementedError: If either encoding is unsupported.
        """
        # Treat "_" and "-" alike. ISO-8859-1 is accepted as well, as
        # the docstring and the error message have always promised.
        embedded = embedded_encoding.replace('_', '-').lower()
        if embedded not in ('windows-1252', 'iso-8859-1'):
            raise NotImplementedError(
                "Windows-1252 and ISO-8859-1 are the only currently supported "
                "embedded encodings.")

        if main_encoding.lower() not in ('utf8', 'utf-8'):
            raise NotImplementedError(
                "UTF-8 is the only currently supported main encoding.")

        byte_chunks = []

        chunk_start = 0
        pos = 0
        while pos < len(in_bytes):
            byte = in_bytes[pos]
            if not isinstance(byte, int):
                # Indexing did not yield an int (Python 2.x bytestrings).
                byte = ord(byte)
            if cls.FIRST_MULTIBYTE_MARKER <= byte <= cls.LAST_MULTIBYTE_MARKER:
                # This is the start of a UTF-8 multibyte character. Skip
                # to the end.
                for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES:
                    if start <= byte <= end:
                        pos += size
                        break
            elif byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8:
                # We found a Windows-1252 character!
                # Save the string up to this point as a chunk.
                byte_chunks.append(in_bytes[chunk_start:pos])

                # Now translate the Windows-1252 character into UTF-8
                # and add it as another chunk.
                byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte])
                pos += 1
                chunk_start = pos
            else:
                # Go on to the next character.
                pos += 1
        if chunk_start == 0:
            # The string is unchanged.
            return in_bytes
        # Store the final chunk.
        byte_chunks.append(in_bytes[chunk_start:])
        return b''.join(byte_chunks)
939
940
941