CoCalc -- dammit.py

GitHub Repository: hhhrrrttt222111/Dorkify
Path: blob/master/venv/Lib/site-packages/bs4/dammit.py
⁸¹¹ views
# -*- coding: utf-8 -*-
"""Beautiful Soup bonus library: Unicode, Dammit

This library converts a bytestream to Unicode through any means
necessary. It is heavily based on code from Mark Pilgrim's Universal
Feed Parser. It works best on XML and HTML, but it does not rewrite the
XML or HTML to reflect a new encoding; that's the tree builder's job.
"""
# Use of this source code is governed by the MIT license.
__license__ = "MIT"
11

12
import codecs
13
from html.entities import codepoint2name
14
import re
15
import logging
16
import string
17

18
# Import a library to autodetect character encodings.
#
# Exactly one definition of chardet_dammit() survives this block: the
# cchardet-backed one, the chardet-backed one, or a stub that always
# returns None when neither library can be imported.
chardet_type = None
try:
    # First try the fast C implementation.
    #  PyPI package: cchardet
    import cchardet

    def chardet_dammit(s):
        """Guess the encoding of a bytestring using cchardet.

        :param s: The data to inspect.
        :return: The 'encoding' entry of cchardet's detection result, or
            None if `s` is already a Unicode string.
        """
        if isinstance(s, str):
            return None
        return cchardet.detect(s)['encoding']
except ImportError:
    try:
        # Fall back to the pure Python implementation
        #  Debian package: python-chardet
        #  PyPI package: chardet
        import chardet

        def chardet_dammit(s):
            """Guess the encoding of a bytestring using chardet.

            :param s: The data to inspect.
            :return: The 'encoding' entry of chardet's detection result,
                or None if `s` is already a Unicode string.
            """
            if isinstance(s, str):
                return None
            return chardet.detect(s)['encoding']
    except ImportError:
        # No chardet available.
        def chardet_dammit(s):
            """Stub used when no detection library is installed.

            :param s: Ignored.
            :return: Always None.
            """
            return None
44

45
# Available from http://cjkpython.i18n.org/.
46
#
47
# TODO: This doesn't work anymore and the closest thing, iconv_codecs,
48
# is GPL-licensed. Check whether this is still necessary.
49
try:
50
    import iconv_codec
51
except ImportError:
52
    pass
53

54
# Build bytestring and Unicode versions of regular expressions for finding
# a declared encoding inside an XML or HTML document.
#
# encoding_res is keyed first by the type of the markup being searched
# (bytes or str) and then by the kind of declaration ('html' or 'xml').
# In every pattern, group 1 captures the declared encoding name.
xml_encoding = '^\\s*<\\?.*encoding=[\'"](.*?)[\'"].*\\?>'
html_meta = '<\\s*meta[^>]+charset\\s*=\\s*["\']?([^>]*?)[ /;\'">]'
encoding_res = {
    bytes: {
        'html': re.compile(html_meta.encode("ascii"), re.I),
        'xml': re.compile(xml_encoding.encode("ascii"), re.I),
    },
    str: {
        'html': re.compile(html_meta, re.I),
        'xml': re.compile(xml_encoding, re.I),
    },
}
67

68
class EntitySubstitution(object):
    """The ability to substitute XML or HTML entities for certain characters."""

    def _populate_class_variables():
        """Build the HTML entity lookup tables used by this class.

        This runs once, while the class body is being executed.

        :return: A 3-tuple of (character -> entity name,
            entity name -> character, compiled regular expression matching
            any single character that has an entry in the first mapping).
        """
        lookup = {}
        reverse_lookup = {}
        characters_for_re = []

        # &apos; is an XHTML entity and an HTML 5 entity, but not an HTML 4
        # entity. We don't want to use it, but we want to recognize it on
        # the way in.
        #
        # TODO: Ideally we would be able to recognize all HTML 5 named
        # entities, but that's a little tricky.
        extra = [(39, 'apos')]
        for codepoint, name in list(codepoint2name.items()) + extra:
            character = chr(codepoint)
            if codepoint not in (34, 39):
                # There's no point in turning the quotation mark into
                # &quot; or the single quote into &apos;, unless it
                # happens within an attribute value, which is handled
                # elsewhere.
                characters_for_re.append(character)
                lookup[character] = name
            # But we do want to recognize those entities on the way in and
            # convert them to Unicode characters.
            reverse_lookup[name] = character
        # Escape each character so that none of them can ever be read as
        # character-class syntax (']', '^', '-', '\\').
        re_definition = "[%s]" % "".join(
            re.escape(character) for character in characters_for_re)
        return lookup, reverse_lookup, re.compile(re_definition)
    (CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER,
     CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables()

    # The five characters that have predefined XML entities.
    CHARACTER_TO_XML_ENTITY = {
        "'": "apos",
        '"': "quot",
        "&": "amp",
        "<": "lt",
        ">": "gt",
        }

    # Matches an angle bracket, or an ampersand that does not start
    # something shaped like an entity or character reference.
    BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
                                           "&(?!#\\d+;|#x[0-9a-fA-F]+;|\\w+;)"
                                           ")")

    # Matches any angle bracket or ampersand.
    AMPERSAND_OR_BRACKET = re.compile("([<>&])")

    @classmethod
    def _substitute_html_entity(cls, matchobj):
        """Used with a regular expression to substitute the
        appropriate HTML entity for a special character.

        :param matchobj: A match whose group 0 is the character to replace.
        :return: The entity reference, e.g. "&eacute;".
        """
        entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))
        return "&%s;" % entity

    @classmethod
    def _substitute_xml_entity(cls, matchobj):
        """Used with a regular expression to substitute the
        appropriate XML entity for a special character.

        :param matchobj: A match whose group 0 is the character to replace.
        :return: The entity reference, e.g. "&lt;".
        """
        entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
        return "&%s;" % entity

    @classmethod
    def quoted_attribute_value(cls, value):
        """Make a value into a quoted XML attribute, possibly escaping it.

         Most strings will be quoted using double quotes.

          Bob's Bar -> "Bob's Bar"

         If a string contains double quotes, it will be quoted using
         single quotes.

          Welcome to "my bar" -> 'Welcome to "my bar"'

         If a string contains both single and double quotes, the
         double quotes will be escaped, and the string will be quoted
         using double quotes.

          Welcome to "Bob's Bar" -> "Welcome to &quot;Bob's Bar&quot;"

        :param value: The attribute value, as a string.
        :return: The value wrapped in quotation marks.
        """
        quote_with = '"'
        if '"' in value:
            if "'" in value:
                # The string contains both single and double
                # quotes.  Turn the double quotes into
                # entities. We quote the double quotes rather than
                # the single quotes because the entity name is
                # "&quot;" whether this is HTML or XML.  If we
                # quoted the single quotes, we'd have to decide
                # between &apos; and &squot;.
                value = value.replace('"', "&quot;")
            else:
                # There are double quotes but no single quotes.
                # We can use single quotes to quote the attribute.
                quote_with = "'"
        return quote_with + value + quote_with

    @classmethod
    def substitute_xml(cls, value, make_quoted_attribute=False):
        """Substitute XML entities for special XML characters.

        :param value: A string to be substituted. The less-than sign
          will become &lt;, the greater-than sign will become &gt;,
          and any ampersands will become &amp;. If you want ampersands
          that appear to be part of an entity definition to be left
          alone, use substitute_xml_containing_entities() instead.

        :param make_quoted_attribute: If True, then the string will be
         quoted, as befits an attribute value.

        :return: The substituted (and possibly quoted) string.
        """
        # Escape angle brackets and ampersands.
        value = cls.AMPERSAND_OR_BRACKET.sub(
            cls._substitute_xml_entity, value)

        if make_quoted_attribute:
            value = cls.quoted_attribute_value(value)
        return value

    @classmethod
    def substitute_xml_containing_entities(
        cls, value, make_quoted_attribute=False):
        """Substitute XML entities for special XML characters.

        :param value: A string to be substituted. The less-than sign will
          become &lt;, the greater-than sign will become &gt;, and any
          ampersands that are not part of an entity definition will
          become &amp;.

        :param make_quoted_attribute: If True, then the string will be
         quoted, as befits an attribute value.

        :return: The substituted (and possibly quoted) string.
        """
        # Escape angle brackets, and ampersands that aren't part of
        # entities.
        value = cls.BARE_AMPERSAND_OR_BRACKET.sub(
            cls._substitute_xml_entity, value)

        if make_quoted_attribute:
            value = cls.quoted_attribute_value(value)
        return value

    @classmethod
    def substitute_html(cls, s):
        """Replace certain Unicode characters with named HTML entities.

        This differs from data.encode(encoding, 'xmlcharrefreplace')
        in that the goal is to make the result more readable (to those
        with ASCII displays) rather than to recover from
        errors. There's absolutely nothing wrong with a UTF-8 string
        containing a LATIN SMALL LETTER E WITH ACUTE, but replacing that
        character with "&eacute;" will make it more readable to some
        people.

        :param s: A Unicode string.
        :return: The string with every character that has a named HTML
            entity (other than the two quotation marks) replaced.
        """
        return cls.CHARACTER_TO_HTML_ENTITY_RE.sub(
            cls._substitute_html_entity, s)
223

224

225
class EncodingDetector:
    """Suggests a number of possible encodings for a bytestring.

    Order of precedence:

    1. Encodings you specifically tell EncodingDetector to try first
    (the override_encodings argument to the constructor).

    2. An encoding implied by a byte-order mark at the start of the
    bytestring.

    3. An encoding declared within the bytestring itself, either in an
    XML declaration (if the bytestring is to be interpreted as an XML
    document), or in a <meta> tag (if the bytestring is to be
    interpreted as an HTML document.)

    4. An encoding detected through textual analysis by chardet,
    cchardet, or a similar external library.

    5. UTF-8.

    6. Windows-1252.
    """
    def __init__(self, markup, override_encodings=None, is_html=False,
                 exclude_encodings=None):
        """Constructor.

        :param markup: Some markup in an unknown encoding.
        :param override_encodings: These encodings will be tried first.
        :param is_html: If True, this markup is considered to be HTML. Otherwise
            it's assumed to be XML.
        :param exclude_encodings: These encodings will not be tried, even
            if they otherwise would be.
        """
        self.override_encodings = override_encodings or []
        exclude_encodings = exclude_encodings or []
        # Stored lowercased because _usable() lowercases before comparing.
        self.exclude_encodings = {x.lower() for x in exclude_encodings}
        self.chardet_encoding = None
        self.is_html = is_html
        self.declared_encoding = None

        # First order of business: strip a byte-order mark.
        self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup)

    def _usable(self, encoding, tried):
        """Should we even bother to try this encoding?

        :param encoding: Name of an encoding.
        :param tried: Encodings that have already been tried. This will be modified
            as a side effect.
        :return: True if the encoding is not None, not excluded and not
            already in `tried`; False otherwise.
        """
        if encoding is not None:
            encoding = encoding.lower()
            if encoding in self.exclude_encodings:
                return False
            if encoding not in tried:
                tried.add(encoding)
                return True
        return False

    @property
    def encodings(self):
        """Yield a number of encodings that might work for this markup.

        :yield: A sequence of strings.
        """
        tried = set()
        for e in self.override_encodings:
            if self._usable(e, tried):
                yield e

        # Did the document originally start with a byte-order mark
        # that indicated its encoding?
        if self._usable(self.sniffed_encoding, tried):
            yield self.sniffed_encoding

        # Look within the document for an XML or HTML encoding
        # declaration.
        if self.declared_encoding is None:
            self.declared_encoding = self.find_declared_encoding(
                self.markup, self.is_html)
        if self._usable(self.declared_encoding, tried):
            yield self.declared_encoding

        # Use third-party character set detection to guess at the
        # encoding.
        if self.chardet_encoding is None:
            self.chardet_encoding = chardet_dammit(self.markup)
        if self._usable(self.chardet_encoding, tried):
            yield self.chardet_encoding

        # As a last-ditch effort, try utf-8 and windows-1252.
        for e in ('utf-8', 'windows-1252'):
            if self._usable(e, tried):
                yield e

    @classmethod
    def strip_byte_order_mark(cls, data):
        """If a byte-order mark is present, strip it and return the encoding it implies.

        :param data: Some markup.
        :return: A 2-tuple (modified data, implied encoding)
        """
        encoding = None
        if isinstance(data, str):
            # Unicode data cannot have a byte-order mark.
            return data, encoding
        # The UTF-16 checks come first, so they must reject a BOM that is
        # followed by two NUL bytes: b'\xff\xfe\x00\x00' is the UTF-32LE
        # byte-order mark. The comparison has to be against a bytes
        # literal; bytes never compare equal to str.
        if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
               and (data[2:4] != b'\x00\x00'):
            encoding = 'utf-16be'
            data = data[2:]
        elif (len(data) >= 4) and (data[:2] == b'\xff\xfe') \
                 and (data[2:4] != b'\x00\x00'):
            encoding = 'utf-16le'
            data = data[2:]
        elif data[:3] == b'\xef\xbb\xbf':
            encoding = 'utf-8'
            data = data[3:]
        elif data[:4] == b'\x00\x00\xfe\xff':
            encoding = 'utf-32be'
            data = data[4:]
        elif data[:4] == b'\xff\xfe\x00\x00':
            encoding = 'utf-32le'
            data = data[4:]
        return data, encoding

    @classmethod
    def find_declared_encoding(cls, markup, is_html=False, search_entire_document=False):
        """Given a document, tries to find its declared encoding.

        An XML encoding is declared at the beginning of the document.

        An HTML encoding is declared in a <meta> tag, hopefully near the
        beginning of the document.

        :param markup: Some markup.
        :param is_html: If True, this markup is considered to be HTML. Otherwise
            it's assumed to be XML.
        :param search_entire_document: Since an encoding is supposed to be declared near the beginning
            of the document, most of the time it's only necessary to search a few kilobytes of data.
            Set this to True to force this method to search the entire document.
        :return: The declared encoding, lowercased, or None if no
            declaration was found.
        """
        if search_entire_document:
            xml_endpos = html_endpos = len(markup)
        else:
            xml_endpos = 1024
            html_endpos = max(2048, int(len(markup) * 0.05))

        # Pick the regular expressions that match the markup's type.
        if isinstance(markup, bytes):
            res = encoding_res[bytes]
        else:
            res = encoding_res[str]

        xml_re = res['xml']
        html_re = res['html']
        declared_encoding = None
        # An XML declaration is looked for even in HTML; the <meta> tag
        # is only consulted for HTML when no XML declaration was found.
        declared_encoding_match = xml_re.search(markup, endpos=xml_endpos)
        if not declared_encoding_match and is_html:
            declared_encoding_match = html_re.search(markup, endpos=html_endpos)
        if declared_encoding_match is not None:
            declared_encoding = declared_encoding_match.groups()[0]
        if declared_encoding:
            if isinstance(declared_encoding, bytes):
                declared_encoding = declared_encoding.decode('ascii', 'replace')
            return declared_encoding.lower()
        return None
388

389
class UnicodeDammit:
390
    """A class for detecting the encoding of a *ML document and
391
    converting it to a Unicode string. If the source encoding is
392
    windows-1252, can replace MS smart quotes with their HTML or XML
393
    equivalents."""
394

395
    # This dictionary maps commonly seen values for "charset" in HTML
    # meta tags to the corresponding Python codec names. It only covers
    # values that aren't in Python's aliases and can't be determined
    # by the heuristics in find_codec.
    CHARSET_ALIASES = {"macintosh": "mac-roman",
                       "x-sjis": "shift-jis"}

    # Codec names (as returned by find_codec) for which _convert_from
    # performs smart-quote substitution when smart_quotes_to is set.
    ENCODINGS_WITH_SMART_QUOTES = [
        "windows-1252",
        "iso-8859-1",
        "iso-8859-2",
        ]
407

408
    def __init__(self, markup, override_encodings=None,
                 smart_quotes_to=None, is_html=False, exclude_encodings=None):
        """Constructor.

        On return, `unicode_markup` holds the decoded text (or None if
        every candidate encoding failed) and `original_encoding` holds
        the codec that produced it (or None).

        :param markup: A bytestring representing markup in an unknown encoding.
        :param override_encodings: These encodings will be tried first,
           before any sniffing code is run.

        :param smart_quotes_to: By default, Microsoft smart quotes will, like all other characters, be converted
           to Unicode characters. Setting this to 'ascii' will convert them to ASCII quotes instead.
           Setting it to 'xml' will convert them to XML entity references, and setting it to 'html'
           will convert them to HTML entity references.
        :param is_html: If True, this markup is considered to be HTML. Otherwise
            it's assumed to be XML.
        :param exclude_encodings: These encodings will not be considered, even
            if the sniffing code thinks they might make sense.
        """
        self.smart_quotes_to = smart_quotes_to
        self.tried_encodings = []
        self.contains_replacement_characters = False
        self.is_html = is_html
        self.log = logging.getLogger(__name__)
        self.detector = EncodingDetector(
            markup, override_encodings, is_html, exclude_encodings)

        # Short-circuit if the data is in Unicode to begin with.
        if isinstance(markup, str):
            self.markup = markup
            self.unicode_markup = str(markup)
            self.original_encoding = None
            return

        # Likewise if there is nothing to decode. The comparison must be
        # against a bytes literal (bytes never equal ''), and the result
        # must be '' rather than str(b''), which is the text "b''".
        if markup == b'':
            self.markup = markup
            self.unicode_markup = ''
            self.original_encoding = None
            return

        # The encoding detector may have stripped a byte-order mark.
        # Use the stripped markup from this point on.
        self.markup = self.detector.markup

        u = None
        for encoding in self.detector.encodings:
            u = self._convert_from(encoding)
            if u is not None:
                break

        # Test against None, not truthiness: a document that decodes to
        # the empty string (e.g. one consisting only of a byte-order
        # mark) was decoded successfully.
        if u is None:
            # None of the encodings worked. As an absolute last resort,
            # try them again with character replacement.
            for encoding in self.detector.encodings:
                if encoding == "ascii":
                    continue
                u = self._convert_from(encoding, "replace")
                if u is not None:
                    self.log.warning(
                            "Some characters could not be decoded, and were "
                            "replaced with REPLACEMENT CHARACTER."
                    )
                    self.contains_replacement_characters = True
                    break

        # If none of that worked, we could at this point force it to
        # ASCII, but that would destroy so much data that I think
        # giving up is better.
        self.unicode_markup = u
        if u is None:
            self.original_encoding = None
472

473
    def _sub_ms_char(self, match):
474
        """Changes a MS smart quote character to an XML or HTML
475
        entity, or an ASCII character."""
476
        orig = match.group(1)
477
        if self.smart_quotes_to == 'ascii':
478
            sub = self.MS_CHARS_TO_ASCII.get(orig).encode()
479
        else:
480
            sub = self.MS_CHARS.get(orig)
481
            if type(sub) == tuple:
482
                if self.smart_quotes_to == 'xml':
483
                    sub = '&#x'.encode() + sub[1].encode() + ';'.encode()
484
                else:
485
                    sub = '&'.encode() + sub[0].encode() + ';'.encode()
486
            else:
487
                sub = sub.encode()
488
        return sub
489

490
    def _convert_from(self, proposed, errors="strict"):
491
        """Attempt to convert the markup to the proposed encoding.
492

493
        :param proposed: The name of a character encoding.
494
        """
495
        proposed = self.find_codec(proposed)
496
        if not proposed or (proposed, errors) in self.tried_encodings:
497
            return None
498
        self.tried_encodings.append((proposed, errors))
499
        markup = self.markup
500
        # Convert smart quotes to HTML if coming from an encoding
501
        # that might have them.
502
        if (self.smart_quotes_to is not None
503
            and proposed in self.ENCODINGS_WITH_SMART_QUOTES):
504
            smart_quotes_re = b"([\x80-\x9f])"
505
            smart_quotes_compiled = re.compile(smart_quotes_re)
506
            markup = smart_quotes_compiled.sub(self._sub_ms_char, markup)
507

508
        try:
509
            #print("Trying to convert document to %s (errors=%s)" % (
510
            #    proposed, errors))
511
            u = self._to_unicode(markup, proposed, errors)
512
            self.markup = u
513
            self.original_encoding = proposed
514
        except Exception as e:
515
            #print("That didn't work!")
516
            #print(e)
517
            return None
518
        #print("Correct encoding: %s" % proposed)
519
        return self.markup
520

521
    def _to_unicode(self, data, encoding, errors="strict"):
522
        """Given a string and its encoding, decodes the string into Unicode.
523

524
        :param encoding: The name of an encoding.
525
        """
526
        return str(data, encoding, errors)
527

528
    @property
529
    def declared_html_encoding(self):
530
        """If the markup is an HTML document, returns the encoding declared _within_
531
        the document.
532
        """
533
        if not self.is_html:
534
            return None
535
        return self.detector.declared_encoding
536

537
    def find_codec(self, charset):
538
        """Convert the name of a character set to a codec name.
539

540
        :param charset: The name of a character set.
541
        :return: The name of a codec.
542
        """
543
        value = (self._codec(self.CHARSET_ALIASES.get(charset, charset))
544
               or (charset and self._codec(charset.replace("-", "")))
545
               or (charset and self._codec(charset.replace("-", "_")))
546
               or (charset and charset.lower())
547
               or charset
548
                )
549
        if value:
550
            return value.lower()
551
        return None
552

553
    def _codec(self, charset):
554
        if not charset:
555
            return charset
556
        codec = None
557
        try:
558
            codecs.lookup(charset)
559
            codec = charset
560
        except (LookupError, ValueError):
561
            pass
562
        return codec
563

564

565
    # A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities.
    #
    # Keys are single bytes in the range 0x80-0x9f. A tuple value is
    # (HTML entity name, hexadecimal code point for an XML character
    # reference); a plain string value is used as a literal replacement.
    MS_CHARS = {b'\x80': ('euro', '20AC'),
                b'\x81': ' ',
                b'\x82': ('sbquo', '201A'),
                b'\x83': ('fnof', '192'),
                b'\x84': ('bdquo', '201E'),
                b'\x85': ('hellip', '2026'),
                b'\x86': ('dagger', '2020'),
                b'\x87': ('Dagger', '2021'),
                b'\x88': ('circ', '2C6'),
                b'\x89': ('permil', '2030'),
                b'\x8A': ('Scaron', '160'),
                b'\x8B': ('lsaquo', '2039'),
                b'\x8C': ('OElig', '152'),
                b'\x8D': '?',
                b'\x8E': ('#x17D', '17D'),
                b'\x8F': '?',
                b'\x90': '?',
                b'\x91': ('lsquo', '2018'),
                b'\x92': ('rsquo', '2019'),
                b'\x93': ('ldquo', '201C'),
                b'\x94': ('rdquo', '201D'),
                b'\x95': ('bull', '2022'),
                b'\x96': ('ndash', '2013'),
                b'\x97': ('mdash', '2014'),
                b'\x98': ('tilde', '2DC'),
                b'\x99': ('trade', '2122'),
                b'\x9a': ('scaron', '161'),
                b'\x9b': ('rsaquo', '203A'),
                b'\x9c': ('oelig', '153'),
                b'\x9d': '?',
                b'\x9e': ('#x17E', '17E'),
                # U+0178 LATIN CAPITAL LETTER Y WITH DIAERESIS. An empty
                # code here would produce the invalid reference "&#x;".
                b'\x9f': ('Yuml', '178'),}
598

599
    # A parochial partial mapping of ISO-Latin-1 to ASCII. Contains
    # horrors like stripping diacritical marks to turn á into a, but also
    # contains non-horrors like turning “ into ".
    #
    # Every value is a plain string, because _sub_ms_char calls
    # .encode() on whatever it finds here.
    MS_CHARS_TO_ASCII = {
        b'\x80' : 'EUR',
        b'\x81' : ' ',
        b'\x82' : ',',
        b'\x83' : 'f',
        b'\x84' : ',,',
        b'\x85' : '...',
        b'\x86' : '+',
        b'\x87' : '++',
        b'\x88' : '^',
        b'\x89' : '%',
        b'\x8a' : 'S',
        b'\x8b' : '<',
        b'\x8c' : 'OE',
        b'\x8d' : '?',
        b'\x8e' : 'Z',
        b'\x8f' : '?',
        b'\x90' : '?',
        b'\x91' : "'",
        b'\x92' : "'",
        b'\x93' : '"',
        b'\x94' : '"',
        b'\x95' : '*',
        b'\x96' : '-',
        b'\x97' : '--',
        b'\x98' : '~',
        b'\x99' : '(TM)',
        b'\x9a' : 's',
        b'\x9b' : '>',
        b'\x9c' : 'oe',
        b'\x9d' : '?',
        b'\x9e' : 'z',
        b'\x9f' : 'Y',
        b'\xa0' : ' ',
        b'\xa1' : '!',
        b'\xa2' : 'c',
        b'\xa3' : 'GBP',
        b'\xa4' : '$', #This approximation is especially parochial--this is the
                       #generic currency symbol.
        b'\xa5' : 'YEN',
        b'\xa6' : '|',
        b'\xa7' : 'S',
        b'\xa8' : '..',
        b'\xa9' : '',
        b'\xaa' : '(th)',
        b'\xab' : '<<',
        b'\xac' : '!',
        b'\xad' : ' ',
        b'\xae' : '(R)',
        b'\xaf' : '-',
        b'\xb0' : 'o',
        b'\xb1' : '+-',
        b'\xb2' : '2',
        b'\xb3' : '3',
        b'\xb4' : "'",
        b'\xb5' : 'u',
        b'\xb6' : 'P',
        b'\xb7' : '*',
        b'\xb8' : ',',
        b'\xb9' : '1',
        b'\xba' : '(th)',
        b'\xbb' : '>>',
        b'\xbc' : '1/4',
        b'\xbd' : '1/2',
        b'\xbe' : '3/4',
        b'\xbf' : '?',
        b'\xc0' : 'A',
        b'\xc1' : 'A',
        b'\xc2' : 'A',
        b'\xc3' : 'A',
        b'\xc4' : 'A',
        b'\xc5' : 'A',
        b'\xc6' : 'AE',
        b'\xc7' : 'C',
        b'\xc8' : 'E',
        b'\xc9' : 'E',
        b'\xca' : 'E',
        b'\xcb' : 'E',
        b'\xcc' : 'I',
        b'\xcd' : 'I',
        b'\xce' : 'I',
        b'\xcf' : 'I',
        b'\xd0' : 'D',
        b'\xd1' : 'N',
        b'\xd2' : 'O',
        b'\xd3' : 'O',
        b'\xd4' : 'O',
        b'\xd5' : 'O',
        b'\xd6' : 'O',
        b'\xd7' : '*',
        b'\xd8' : 'O',
        b'\xd9' : 'U',
        b'\xda' : 'U',
        b'\xdb' : 'U',
        b'\xdc' : 'U',
        b'\xdd' : 'Y',
        b'\xde' : 'b',
        b'\xdf' : 'B',
        b'\xe0' : 'a',
        b'\xe1' : 'a',
        b'\xe2' : 'a',
        b'\xe3' : 'a',
        b'\xe4' : 'a',
        b'\xe5' : 'a',
        b'\xe6' : 'ae',
        b'\xe7' : 'c',
        b'\xe8' : 'e',
        b'\xe9' : 'e',
        b'\xea' : 'e',
        b'\xeb' : 'e',
        b'\xec' : 'i',
        b'\xed' : 'i',
        b'\xee' : 'i',
        b'\xef' : 'i',
        b'\xf0' : 'o',
        b'\xf1' : 'n',
        b'\xf2' : 'o',
        b'\xf3' : 'o',
        b'\xf4' : 'o',
        b'\xf5' : 'o',
        b'\xf6' : 'o',
        b'\xf7' : '/',
        b'\xf8' : 'o',
        b'\xf9' : 'u',
        b'\xfa' : 'u',
        b'\xfb' : 'u',
        b'\xfc' : 'u',
        b'\xfd' : 'y',
        b'\xfe' : 'b',
        b'\xff' : 'y',
        }
733

734
    # A map used when removing rogue Windows-1252/ISO-8859-1
735
    # characters in otherwise UTF-8 documents.
736
    #
737
    # Note that \x81, \x8d, \x8f, \x90, and \x9d are undefined in
738
    # Windows-1252.
739
    WINDOWS_1252_TO_UTF8 = {
740
        0x80 : b'\xe2\x82\xac', # €
741
        0x82 : b'\xe2\x80\x9a', # ‚
742
        0x83 : b'\xc6\x92',     # ƒ
743
        0x84 : b'\xe2\x80\x9e', # „
744
        0x85 : b'\xe2\x80\xa6', # …
745
        0x86 : b'\xe2\x80\xa0', # †
746
        0x87 : b'\xe2\x80\xa1', # ‡
747
        0x88 : b'\xcb\x86',     # ˆ
748
        0x89 : b'\xe2\x80\xb0', # ‰
749
        0x8a : b'\xc5\xa0',     # Š
750
        0x8b : b'\xe2\x80\xb9', # ‹
751
        0x8c : b'\xc5\x92',     # Œ
752
        0x8e : b'\xc5\xbd',     # Ž
753
        0x91 : b'\xe2\x80\x98', # ‘
754
        0x92 : b'\xe2\x80\x99', # ’
755
        0x93 : b'\xe2\x80\x9c', # “
756
        0x94 : b'\xe2\x80\x9d', # ”
757
        0x95 : b'\xe2\x80\xa2', # •
758
        0x96 : b'\xe2\x80\x93', # –
759
        0x97 : b'\xe2\x80\x94', # —
760
        0x98 : b'\xcb\x9c',     # ˜
761
        0x99 : b'\xe2\x84\xa2', # ™
762
        0x9a : b'\xc5\xa1',     # š
763
        0x9b : b'\xe2\x80\xba', # ›
764
        0x9c : b'\xc5\x93',     # œ
765
        0x9e : b'\xc5\xbe',     # ž
766
        0x9f : b'\xc5\xb8',     # Ÿ
767
        0xa0 : b'\xc2\xa0',     #  
768
        0xa1 : b'\xc2\xa1',     # ¡
769
        0xa2 : b'\xc2\xa2',     # ¢
770
        0xa3 : b'\xc2\xa3',     # £
771
        0xa4 : b'\xc2\xa4',     # ¤
772
        0xa5 : b'\xc2\xa5',     # ¥
773
        0xa6 : b'\xc2\xa6',     # ¦
774
        0xa7 : b'\xc2\xa7',     # §
775
        0xa8 : b'\xc2\xa8',     # ¨
776
        0xa9 : b'\xc2\xa9',     # ©
777
        0xaa : b'\xc2\xaa',     # ª
778
        0xab : b'\xc2\xab',     # «
779
        0xac : b'\xc2\xac',     # ¬
780
        0xad : b'\xc2\xad',     # 
781
        0xae : b'\xc2\xae',     # ®
782
        0xaf : b'\xc2\xaf',     # ¯
783
        0xb0 : b'\xc2\xb0',     # °
784
        0xb1 : b'\xc2\xb1',     # ±
785
        0xb2 : b'\xc2\xb2',     # ²
786
        0xb3 : b'\xc2\xb3',     # ³
787
        0xb4 : b'\xc2\xb4',     # ´
788
        0xb5 : b'\xc2\xb5',     # µ
789
        0xb6 : b'\xc2\xb6',     # ¶
790
        0xb7 : b'\xc2\xb7',     # ·
791
        0xb8 : b'\xc2\xb8',     # ¸
792
        0xb9 : b'\xc2\xb9',     # ¹
793
        0xba : b'\xc2\xba',     # º
794
        0xbb : b'\xc2\xbb',     # »
795
        0xbc : b'\xc2\xbc',     # ¼
796
        0xbd : b'\xc2\xbd',     # ½
797
        0xbe : b'\xc2\xbe',     # ¾
798
        0xbf : b'\xc2\xbf',     # ¿
799
        0xc0 : b'\xc3\x80',     # À
800
        0xc1 : b'\xc3\x81',     # Á
801
        0xc2 : b'\xc3\x82',     # Â
802
        0xc3 : b'\xc3\x83',     # Ã
803
        0xc4 : b'\xc3\x84',     # Ä
804
        0xc5 : b'\xc3\x85',     # Å
805
        0xc6 : b'\xc3\x86',     # Æ
806
        0xc7 : b'\xc3\x87',     # Ç
807
        0xc8 : b'\xc3\x88',     # È
808
        0xc9 : b'\xc3\x89',     # É
809
        0xca : b'\xc3\x8a',     # Ê
810
        0xcb : b'\xc3\x8b',     # Ë
811
        0xcc : b'\xc3\x8c',     # Ì
812
        0xcd : b'\xc3\x8d',     # Í
813
        0xce : b'\xc3\x8e',     # Î
814
        0xcf : b'\xc3\x8f',     # Ï
815
        0xd0 : b'\xc3\x90',     # Ð
816
        0xd1 : b'\xc3\x91',     # Ñ
817
        0xd2 : b'\xc3\x92',     # Ò
818
        0xd3 : b'\xc3\x93',     # Ó
819
        0xd4 : b'\xc3\x94',     # Ô
820
        0xd5 : b'\xc3\x95',     # Õ
821
        0xd6 : b'\xc3\x96',     # Ö
822
        0xd7 : b'\xc3\x97',     # ×
823
        0xd8 : b'\xc3\x98',     # Ø
824
        0xd9 : b'\xc3\x99',     # Ù
825
        0xda : b'\xc3\x9a',     # Ú
826
        0xdb : b'\xc3\x9b',     # Û
827
        0xdc : b'\xc3\x9c',     # Ü
828
        0xdd : b'\xc3\x9d',     # Ý
829
        0xde : b'\xc3\x9e',     # Þ
830
        0xdf : b'\xc3\x9f',     # ß
831
        0xe0 : b'\xc3\xa0',     # à
832
        0xe1 : b'\xa1',         # á
833
        0xe2 : b'\xc3\xa2',     # â
834
        0xe3 : b'\xc3\xa3',     # ã
835
        0xe4 : b'\xc3\xa4',     # ä
836
        0xe5 : b'\xc3\xa5',     # å
837
        0xe6 : b'\xc3\xa6',     # æ
838
        0xe7 : b'\xc3\xa7',     # ç
839
        0xe8 : b'\xc3\xa8',     # è
840
        0xe9 : b'\xc3\xa9',     # é
841
        0xea : b'\xc3\xaa',     # ê
842
        0xeb : b'\xc3\xab',     # ë
843
        0xec : b'\xc3\xac',     # ì
844
        0xed : b'\xc3\xad',     # í
845
        0xee : b'\xc3\xae',     # î
846
        0xef : b'\xc3\xaf',     # ï
847
        0xf0 : b'\xc3\xb0',     # ð
848
        0xf1 : b'\xc3\xb1',     # ñ
849
        0xf2 : b'\xc3\xb2',     # ò
850
        0xf3 : b'\xc3\xb3',     # ó
851
        0xf4 : b'\xc3\xb4',     # ô
852
        0xf5 : b'\xc3\xb5',     # õ
853
        0xf6 : b'\xc3\xb6',     # ö
854
        0xf7 : b'\xc3\xb7',     # ÷
855
        0xf8 : b'\xc3\xb8',     # ø
856
        0xf9 : b'\xc3\xb9',     # ù
857
        0xfa : b'\xc3\xba',     # ú
858
        0xfb : b'\xc3\xbb',     # û
859
        0xfc : b'\xc3\xbc',     # ü
860
        0xfd : b'\xc3\xbd',     # ý
861
        0xfe : b'\xc3\xbe',     # þ
862
        }
863

864
    # Inclusive ranges of UTF-8 lead bytes, each paired with the total
    # length (in bytes) of the character that such a lead byte begins.
    MULTIBYTE_MARKERS_AND_SIZES = [
        (0xc2, 0xdf, 2),  # C2-DF begin a 2-byte character
        (0xe0, 0xef, 3),  # E0-EF begin a 3-byte character
        (0xf0, 0xf4, 4),  # F0-F4 begin a 4-byte character
    ]

    # Lowest and highest byte values that can begin a multibyte
    # character, taken from the first and last ranges above.
    FIRST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[0][0]
    LAST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[-1][1]
872

873
    @classmethod
874
    def detwingle(cls, in_bytes, main_encoding="utf8",
875
                  embedded_encoding="windows-1252"):
876
        """Fix characters from one encoding embedded in some other encoding.
877

878
        Currently the only situation supported is Windows-1252 (or its
879
        subset ISO-8859-1), embedded in UTF-8.
880

881
        :param in_bytes: A bytestring that you suspect contains
882
            characters from multiple encodings. Note that this _must_
883
            be a bytestring. If you've already converted the document
884
            to Unicode, you're too late.
885
        :param main_encoding: The primary encoding of `in_bytes`.
886
        :param embedded_encoding: The encoding that was used to embed characters
887
            in the main document.
888
        :return: A bytestring in which `embedded_encoding`
889
          characters have been converted to their `main_encoding`
890
          equivalents.
891
        """
892
        if embedded_encoding.replace('_', '-').lower() not in (
893
            'windows-1252', 'windows_1252'):
894
            raise NotImplementedError(
895
                "Windows-1252 and ISO-8859-1 are the only currently supported "
896
                "embedded encodings.")
897

898
        if main_encoding.lower() not in ('utf8', 'utf-8'):
899
            raise NotImplementedError(
900
                "UTF-8 is the only currently supported main encoding.")
901

902
        byte_chunks = []
903

904
        chunk_start = 0
905
        pos = 0
906
        while pos < len(in_bytes):
907
            byte = in_bytes[pos]
908
            if not isinstance(byte, int):
909
                # Python 2.x
910
                byte = ord(byte)
911
            if (byte >= cls.FIRST_MULTIBYTE_MARKER
912
                and byte <= cls.LAST_MULTIBYTE_MARKER):
913
                # This is the start of a UTF-8 multibyte character. Skip
914
                # to the end.
915
                for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES:
916
                    if byte >= start and byte <= end:
917
                        pos += size
918
                        break
919
            elif byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8:
920
                # We found a Windows-1252 character!
921
                # Save the string up to this point as a chunk.
922
                byte_chunks.append(in_bytes[chunk_start:pos])
923

924
                # Now translate the Windows-1252 character into UTF-8
925
                # and add it as another, one-byte chunk.
926
                byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte])
927
                pos += 1
928
                chunk_start = pos
929
            else:
930
                # Go on to the next character.
931
                pos += 1
932
        if chunk_start == 0:
933
            # The string is unchanged.
934
            return in_bytes
935
        else:
936
            # Store the final chunk.
937
            byte_chunks.append(in_bytes[chunk_start:])
938
        return b''.join(byte_chunks)
939

940

941
Product

Resources

Company