# Path: blob/master/venv/Lib/site-packages/bs4/dammit.py
# 811 views
# -*- coding: utf-8 -*-
"""Beautiful Soup bonus library: Unicode, Dammit

This library converts a bytestream to Unicode through any means
necessary. It is heavily based on code from Mark Pilgrim's Universal
Feed Parser. It works best on XML and HTML, but it does not rewrite the
XML or HTML to reflect a new encoding; that's the tree builder's job.
"""
# Use of this source code is governed by the MIT license.
__license__ = "MIT"

import codecs
from html.entities import codepoint2name
import re
import logging
import string

# Import a library to autodetect character encodings.
chardet_type = None
try:
    # First try the fast C implementation.
    #  PyPI package: cchardet
    import cchardet
    def chardet_dammit(s):
        """Guess the encoding of bytestring `s`; return None for str input."""
        if isinstance(s, str):
            return None
        return cchardet.detect(s)['encoding']
except ImportError:
    try:
        # Fall back to the pure Python implementation
        #  Debian package: python-chardet
        #  PyPI package: chardet
        import chardet
        def chardet_dammit(s):
            """Guess the encoding of bytestring `s`; return None for str input."""
            if isinstance(s, str):
                return None
            return chardet.detect(s)['encoding']
        #import chardet.constants
        #chardet.constants._debug = 1
    except ImportError:
        # No chardet available.
        def chardet_dammit(s):
            """No detection library is installed, so never guess."""
            return None

# Available from http://cjkpython.i18n.org/.
#
# TODO: This doesn't work anymore and the closest thing, iconv_codecs,
# is GPL-licensed. Check whether this is still necessary.
try:
    import iconv_codec
except ImportError:
    pass

# Build bytestring and Unicode versions of regular expressions for finding
# a declared encoding inside an XML or HTML document.
xml_encoding = '^\\s*<\\?.*encoding=[\'"](.*?)[\'"].*\\?>'
html_meta = '<\\s*meta[^>]+charset\\s*=\\s*["\']?([^>]*?)[ /;\'">]'
encoding_res = dict()
encoding_res[bytes] = {
    'html' : re.compile(html_meta.encode("ascii"), re.I),
    'xml' : re.compile(xml_encoding.encode("ascii"), re.I),
}
encoding_res[str] = {
    'html' : re.compile(html_meta, re.I),
    'xml' : re.compile(xml_encoding, re.I)
}


class EntitySubstitution(object):
    """The ability to substitute XML or HTML entities for certain characters."""

    def _populate_class_variables():
        """Build the character<->entity lookup tables and the matching regex.

        This runs once, while the class body is being executed.

        :return: A 3-tuple (character -> entity name, entity name ->
            character, compiled regex matching every character that has
            an entry in the first dictionary).
        """
        lookup = {}
        reverse_lookup = {}
        characters_for_re = []

        # &apos is an XHTML entity and an HTML 5, but not an HTML 4
        # entity. We don't want to use it, but we want to recognize it on the way in.
        #
        # TODO: Ideally we would be able to recognize all HTML 5 named
        # entities, but that's a little tricky.
        extra = [(39, 'apos')]
        for codepoint, name in list(codepoint2name.items()) + extra:
            character = chr(codepoint)
            if codepoint not in (34, 39):
                # There's no point in turning the quotation mark into
                # &quot; or the single quote into &apos;, unless it
                # happens within an attribute value, which is handled
                # elsewhere.
                characters_for_re.append(character)
                lookup[character] = name
            # But we do want to recognize those entities on the way in and
            # convert them to Unicode characters.
            reverse_lookup[name] = character
        re_definition = "[%s]" % "".join(characters_for_re)
        return lookup, reverse_lookup, re.compile(re_definition)
    (CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER,
     CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables()

    CHARACTER_TO_XML_ENTITY = {
        "'": "apos",
        '"': "quot",
        "&": "amp",
        "<": "lt",
        ">": "gt",
    }

    # Matches angle brackets, plus any ampersand that does not begin
    # something shaped like a numeric or named entity reference.
    BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
                                           "&(?!#\\d+;|#x[0-9a-fA-F]+;|\\w+;)"
                                           ")")

    AMPERSAND_OR_BRACKET = re.compile("([<>&])")

    @classmethod
    def _substitute_html_entity(cls, matchobj):
        """Used with a regular expression to substitute the
        appropriate HTML entity for a special character."""
        entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))
        return "&%s;" % entity

    @classmethod
    def _substitute_xml_entity(cls, matchobj):
        """Used with a regular expression to substitute the
        appropriate XML entity for a special character."""
        entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
        return "&%s;" % entity

    @classmethod
    def quoted_attribute_value(cls, value):
        """Make a value into a quoted XML attribute, possibly escaping it.

         Most strings will be quoted using double quotes.

          Bob's Bar -> "Bob's Bar"

         If a string contains double quotes, it will be quoted using
         single quotes.

          Welcome to "my bar" -> 'Welcome to "my bar"'

         If a string contains both single and double quotes, the
         double quotes will be escaped, and the string will be quoted
         using double quotes.

          Welcome to "Bob's Bar" -> "Welcome to &quot;Bob's bar&quot;"

        :param value: The attribute value, as a string.
        :return: The value wrapped in the chosen quote character.
        """
        quote_with = '"'
        if '"' in value:
            if "'" in value:
                # The string contains both single and double
                # quotes.  Turn the double quotes into
                # entities. We quote the double quotes rather than
                # the single quotes because the entity name is
                # "&quot;" whether this is HTML or XML.  If we
                # quoted the single quotes, we'd have to decide
                # between &apos; and &squot;.
                replace_with = "&quot;"
                value = value.replace('"', replace_with)
            else:
                # There are double quotes but no single quotes.
                # We can use single quotes to quote the attribute.
                quote_with = "'"
        return quote_with + value + quote_with

    @classmethod
    def substitute_xml(cls, value, make_quoted_attribute=False):
        """Substitute XML entities for special XML characters.

        :param value: A string to be substituted. The less-than sign
          will become &lt;, the greater-than sign will become &gt;,
          and any ampersands will become &amp;. If you want ampersands
          that appear to be part of an entity definition to be left
          alone, use substitute_xml_containing_entities() instead.

        :param make_quoted_attribute: If True, then the string will be
         quoted, as befits an attribute value.
        :return: The substituted (and possibly quoted) string.
        """
        # Escape angle brackets and ampersands.
        value = cls.AMPERSAND_OR_BRACKET.sub(
            cls._substitute_xml_entity, value)

        if make_quoted_attribute:
            value = cls.quoted_attribute_value(value)
        return value

    @classmethod
    def substitute_xml_containing_entities(
        cls, value, make_quoted_attribute=False):
        """Substitute XML entities for special XML characters.

        :param value: A string to be substituted. The less-than sign will
          become &lt;, the greater-than sign will become &gt;, and any
          ampersands that are not part of an entity definition will
          become &amp;.

        :param make_quoted_attribute: If True, then the string will be
         quoted, as befits an attribute value.
        :return: The substituted (and possibly quoted) string.
        """
        # Escape angle brackets, and ampersands that aren't part of
        # entities.
        value = cls.BARE_AMPERSAND_OR_BRACKET.sub(
            cls._substitute_xml_entity, value)

        if make_quoted_attribute:
            value = cls.quoted_attribute_value(value)
        return value

    @classmethod
    def substitute_html(cls, s):
        """Replace certain Unicode characters with named HTML entities.

        This differs from data.encode(encoding, 'xmlcharrefreplace')
        in that the goal is to make the result more readable (to those
        with ASCII displays) rather than to recover from
        errors. There's absolutely nothing wrong with a UTF-8 string
        containing a LATIN SMALL LETTER E WITH ACUTE, but replacing that
        character with "&eacute;" will make it more readable to some
        people.

        :param s: A Unicode string.
        :return: The string with named entities substituted.
        """
        return cls.CHARACTER_TO_HTML_ENTITY_RE.sub(
            cls._substitute_html_entity, s)


class EncodingDetector:
    """Suggests a number of possible encodings for a bytestring.

    Order of precedence:

    1. Encodings you specifically tell EncodingDetector to try first
    (the override_encodings argument to the constructor).

    2. An encoding implied by a byte-order mark at the start of the
    bytestring.

    3. An encoding declared within the bytestring itself, either in an
    XML declaration (if the bytestring is to be interpreted as an XML
    document), or in a <meta> tag (if the bytestring is to be
    interpreted as an HTML document.)

    4. An encoding detected through textual analysis by chardet,
    cchardet, or a similar external library.

    5. UTF-8.

    6. Windows-1252.
    """
    def __init__(self, markup, override_encodings=None, is_html=False,
                 exclude_encodings=None):
        """Constructor.

        :param markup: Some markup in an unknown encoding.
        :param override_encodings: These encodings will be tried first.
        :param is_html: If True, this markup is considered to be HTML. Otherwise
            it's assumed to be XML.
        :param exclude_encodings: These encodings will not be tried, even
            if they otherwise would be.
        """
        self.override_encodings = override_encodings or []
        exclude_encodings = exclude_encodings or []
        self.exclude_encodings = set([x.lower() for x in exclude_encodings])
        self.chardet_encoding = None
        self.is_html = is_html
        self.declared_encoding = None

        # First order of business: strip a byte-order mark.
        self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup)

    def _usable(self, encoding, tried):
        """Should we even bother to try this encoding?

        :param encoding: Name of an encoding.
        :param tried: Encodings that have already been tried. This will be modified
            as a side effect.
        :return: True if the encoding is neither excluded nor already tried.
        """
        if encoding is not None:
            encoding = encoding.lower()
            if encoding in self.exclude_encodings:
                return False
            if encoding not in tried:
                tried.add(encoding)
                return True
        return False

    @property
    def encodings(self):
        """Yield a number of encodings that might work for this markup.

        :yield: A sequence of strings.
        """
        tried = set()
        for e in self.override_encodings:
            if self._usable(e, tried):
                yield e

        # Did the document originally start with a byte-order mark
        # that indicated its encoding?
        if self._usable(self.sniffed_encoding, tried):
            yield self.sniffed_encoding

        # Look within the document for an XML or HTML encoding
        # declaration.
        if self.declared_encoding is None:
            self.declared_encoding = self.find_declared_encoding(
                self.markup, self.is_html)
        if self._usable(self.declared_encoding, tried):
            yield self.declared_encoding

        # Use third-party character set detection to guess at the
        # encoding.
        if self.chardet_encoding is None:
            self.chardet_encoding = chardet_dammit(self.markup)
        if self._usable(self.chardet_encoding, tried):
            yield self.chardet_encoding

        # As a last-ditch effort, try utf-8 and windows-1252.
        for e in ('utf-8', 'windows-1252'):
            if self._usable(e, tried):
                yield e

    @classmethod
    def strip_byte_order_mark(cls, data):
        """If a byte-order mark is present, strip it and return the encoding it implies.

        :param data: Some markup.
        :return: A 2-tuple (modified data, implied encoding)
        """
        encoding = None
        if isinstance(data, str):
            # Unicode data cannot have a byte-order mark.
            return data, encoding
        # The two bytes after a UTF-16 BOM are compared against a
        # *bytes* literal. Comparing against a str is always unequal on
        # Python 3, which made the UTF-32LE BOM (FF FE 00 00) look like
        # a UTF-16LE BOM.
        if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
               and (data[2:4] != b'\x00\x00'):
            encoding = 'utf-16be'
            data = data[2:]
        elif (len(data) >= 4) and (data[:2] == b'\xff\xfe') \
                 and (data[2:4] != b'\x00\x00'):
            encoding = 'utf-16le'
            data = data[2:]
        elif data[:3] == b'\xef\xbb\xbf':
            encoding = 'utf-8'
            data = data[3:]
        elif data[:4] == b'\x00\x00\xfe\xff':
            encoding = 'utf-32be'
            data = data[4:]
        elif data[:4] == b'\xff\xfe\x00\x00':
            encoding = 'utf-32le'
            data = data[4:]
        return data, encoding

    @classmethod
    def find_declared_encoding(cls, markup, is_html=False, search_entire_document=False):
        """Given a document, tries to find its declared encoding.

        An XML encoding is declared at the beginning of the document.

        An HTML encoding is declared in a <meta> tag, hopefully near the
        beginning of the document.

        :param markup: Some markup.
        :param is_html: If True, this markup is considered to be HTML. Otherwise
            it's assumed to be XML.
        :param search_entire_document: Since an encoding is supposed to be declared near the beginning
            of the document, most of the time it's only necessary to search a few kilobytes of data.
            Set this to True to force this method to search the entire document.
        :return: The declared encoding, lowercased, or None if none was found.
        """
        if search_entire_document:
            xml_endpos = html_endpos = len(markup)
        else:
            xml_endpos = 1024
            html_endpos = max(2048, int(len(markup) * 0.05))

        if isinstance(markup, bytes):
            res = encoding_res[bytes]
        else:
            res = encoding_res[str]

        xml_re = res['xml']
        html_re = res['html']
        declared_encoding = None
        declared_encoding_match = xml_re.search(markup, endpos=xml_endpos)
        if not declared_encoding_match and is_html:
            declared_encoding_match = html_re.search(markup, endpos=html_endpos)
        if declared_encoding_match is not None:
            declared_encoding = declared_encoding_match.groups()[0]
        if declared_encoding:
            if isinstance(declared_encoding, bytes):
                declared_encoding = declared_encoding.decode('ascii', 'replace')
            return declared_encoding.lower()
        return None


def _build_windows_1252_to_utf8():
    """Build the Windows-1252 byte -> UTF-8 bytestring table for detwingle.

    The table is derived from the standard 'windows-1252' codec rather
    than typed out by hand, so no entry can carry a transcription
    error. Bytes 0x80 through 0xfe are covered; 0xff is left out, as it
    was in the hand-written table this replaces.

    :return: A dictionary mapping an integer byte value to the UTF-8
        encoding of the character that byte represents in Windows-1252.
    """
    table = {}
    for byte in range(0x80, 0xff):
        try:
            character = bytes([byte]).decode("windows-1252")
        except UnicodeDecodeError:
            # \x81, \x8d, \x8f, \x90, and \x9d are undefined in
            # Windows-1252.
            continue
        table[byte] = character.encode("utf-8")
    return table


class UnicodeDammit:
    """A class for detecting the encoding of a *ML document and
    converting it to a Unicode string. If the source encoding is
    windows-1252, can replace MS smart quotes with their HTML or XML
    equivalents."""

    # This dictionary maps commonly seen values for "charset" in HTML
    # meta tags to the corresponding Python codec names. It only covers
    # values that aren't in Python's aliases and can't be determined
    # by the heuristics in find_codec.
    CHARSET_ALIASES = {"macintosh": "mac-roman",
                       "x-sjis": "shift-jis"}

    ENCODINGS_WITH_SMART_QUOTES = [
        "windows-1252",
        "iso-8859-1",
        "iso-8859-2",
    ]

    def __init__(self, markup, override_encodings=None,
                 smart_quotes_to=None, is_html=False, exclude_encodings=None):
        """Constructor.

        :param markup: A bytestring representing markup in an unknown encoding.
        :param override_encodings: These encodings will be tried first,
           before any sniffing code is run.

        :param smart_quotes_to: By default, Microsoft smart quotes will, like all other characters, be converted
           to Unicode characters. Setting this to 'ascii' will convert them to ASCII quotes instead.
           Setting it to 'xml' will convert them to XML entity references, and setting it to 'html'
           will convert them to HTML entity references.
        :param is_html: If True, this markup is considered to be HTML. Otherwise
            it's assumed to be XML.
        :param exclude_encodings: These encodings will not be considered, even
            if the sniffing code thinks they might make sense.
        """
        self.smart_quotes_to = smart_quotes_to
        self.tried_encodings = []
        self.contains_replacement_characters = False
        self.is_html = is_html
        self.log = logging.getLogger(__name__)
        # EncodingDetector turns None into an empty list for both
        # encoding arguments.
        self.detector = EncodingDetector(
            markup, override_encodings, is_html, exclude_encodings)

        # Short-circuit if the data is in Unicode to begin with.
        if isinstance(markup, str):
            self.markup = markup
            self.unicode_markup = str(markup)
            self.original_encoding = None
            return

        # The encoding detector may have stripped a byte-order mark.
        # Use the stripped markup from this point on.
        self.markup = self.detector.markup

        u = None
        for encoding in self.detector.encodings:
            u = self._convert_from(encoding)
            if u is not None:
                break

        # Test for None rather than falsiness: an empty document decodes
        # successfully to '', and must not be sent through the
        # replacement pass, which would try to decode already-decoded
        # text and end with None.
        if u is None:
            # None of the encodings worked. As an absolute last resort,
            # try them again with character replacement.

            for encoding in self.detector.encodings:
                if encoding != "ascii":
                    u = self._convert_from(encoding, "replace")
                if u is not None:
                    self.log.warning(
                            "Some characters could not be decoded, and were "
                            "replaced with REPLACEMENT CHARACTER."
                    )
                    self.contains_replacement_characters = True
                    break

        # If none of that worked, we could at this point force it to
        # ASCII, but that would destroy so much data that I think
        # giving up is better.
        self.unicode_markup = u
        if not u:
            self.original_encoding = None

    def _sub_ms_char(self, match):
        """Changes a MS smart quote character to an XML or HTML
        entity, or an ASCII character.

        :param match: A regex match whose group 1 is a single byte in
            the range \\x80-\\x9f.
        :return: The replacement, as a bytestring.
        """
        orig = match.group(1)
        if self.smart_quotes_to == 'ascii':
            sub = self.MS_CHARS_TO_ASCII.get(orig).encode()
        else:
            sub = self.MS_CHARS.get(orig)
            if isinstance(sub, tuple):
                # sub is (HTML entity name, hexadecimal code point).
                if self.smart_quotes_to == 'xml':
                    sub = '&#x'.encode() + sub[1].encode() + ';'.encode()
                else:
                    sub = '&'.encode() + sub[0].encode() + ';'.encode()
            else:
                sub = sub.encode()
        return sub

    def _convert_from(self, proposed, errors="strict"):
        """Attempt to convert the markup to the proposed encoding.

        :param proposed: The name of a character encoding.
        :param errors: The error handling scheme passed to the decoder.
        :return: The decoded markup, or None if the encoding is unknown,
            was already tried with this error scheme, or failed.
        """
        proposed = self.find_codec(proposed)
        if not proposed or (proposed, errors) in self.tried_encodings:
            return None
        self.tried_encodings.append((proposed, errors))
        markup = self.markup
        # Convert smart quotes to HTML if coming from an encoding
        # that might have them.
        if (self.smart_quotes_to is not None
            and proposed in self.ENCODINGS_WITH_SMART_QUOTES):
            smart_quotes_re = b"([\x80-\x9f])"
            smart_quotes_compiled = re.compile(smart_quotes_re)
            markup = smart_quotes_compiled.sub(self._sub_ms_char, markup)

        try:
            #print("Trying to convert document to %s (errors=%s)" % (
            #    proposed, errors))
            u = self._to_unicode(markup, proposed, errors)
            self.markup = u
            self.original_encoding = proposed
        except Exception:
            # Deliberately best-effort: any failure just means this
            # encoding is not the right one, and the caller moves on.
            #print("That didn't work!")
            return None
        #print("Correct encoding: %s" % proposed)
        return self.markup

    def _to_unicode(self, data, encoding, errors="strict"):
        """Given a string and its encoding, decodes the string into Unicode.

        :param data: The bytestring to decode.
        :param encoding: The name of an encoding.
        :param errors: The error handling scheme passed to the decoder.
        """
        return str(data, encoding, errors)

    @property
    def declared_html_encoding(self):
        """If the markup is an HTML document, returns the encoding declared _within_
        the document.
        """
        if not self.is_html:
            return None
        return self.detector.declared_encoding

    def find_codec(self, charset):
        """Convert the name of a character set to a codec name.

        :param charset: The name of a character set.
        :return: The name of a codec.
        """
        value = (self._codec(self.CHARSET_ALIASES.get(charset, charset))
               or (charset and self._codec(charset.replace("-", "")))
               or (charset and self._codec(charset.replace("-", "_")))
               or (charset and charset.lower())
               or charset
                )
        if value:
            return value.lower()
        return None

    def _codec(self, charset):
        """Return `charset` if Python has a codec by that name, else None.

        A falsy `charset` is returned unchanged.
        """
        if not charset:
            return charset
        codec = None
        try:
            codecs.lookup(charset)
            codec = charset
        except (LookupError, ValueError):
            pass
        return codec


    # A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities.
    # Tuple values are (HTML entity name, hexadecimal code point).
    MS_CHARS = {b'\x80': ('euro', '20AC'),
                b'\x81': ' ',
                b'\x82': ('sbquo', '201A'),
                b'\x83': ('fnof', '192'),
                b'\x84': ('bdquo', '201E'),
                b'\x85': ('hellip', '2026'),
                b'\x86': ('dagger', '2020'),
                b'\x87': ('Dagger', '2021'),
                b'\x88': ('circ', '2C6'),
                b'\x89': ('permil', '2030'),
                b'\x8A': ('Scaron', '160'),
                b'\x8B': ('lsaquo', '2039'),
                b'\x8C': ('OElig', '152'),
                b'\x8D': '?',
                b'\x8E': ('#x17D', '17D'),
                b'\x8F': '?',
                b'\x90': '?',
                b'\x91': ('lsquo', '2018'),
                b'\x92': ('rsquo', '2019'),
                b'\x93': ('ldquo', '201C'),
                b'\x94': ('rdquo', '201D'),
                b'\x95': ('bull', '2022'),
                b'\x96': ('ndash', '2013'),
                b'\x97': ('mdash', '2014'),
                b'\x98': ('tilde', '2DC'),
                b'\x99': ('trade', '2122'),
                b'\x9a': ('scaron', '161'),
                b'\x9b': ('rsaquo', '203A'),
                b'\x9c': ('oelig', '153'),
                b'\x9d': '?',
                b'\x9e': ('#x17E', '17E'),
                # U+0178 LATIN CAPITAL LETTER Y WITH DIAERESIS. An empty
                # code point here would produce the invalid reference
                # "&#x;" when smart_quotes_to is 'xml'.
                b'\x9f': ('Yuml', '178'),}

    # A parochial partial mapping of ISO-Latin-1 to ASCII. Contains
    # horrors like stripping diacritical marks to turn á into a, but also
    # contains non-horrors like turning “ into ".
    #
    # NOTE(review): the b'\xb4' entry is a tuple while every other value
    # is a plain string. Only bytes \x80-\x9f are ever looked up by
    # _sub_ms_char, so it is not reached from this module.
    MS_CHARS_TO_ASCII = {
        b'\x80' : 'EUR',
        b'\x81' : ' ',
        b'\x82' : ',',
        b'\x83' : 'f',
        b'\x84' : ',,',
        b'\x85' : '...',
        b'\x86' : '+',
        b'\x87' : '++',
        b'\x88' : '^',
        b'\x89' : '%',
        b'\x8a' : 'S',
        b'\x8b' : '<',
        b'\x8c' : 'OE',
        b'\x8d' : '?',
        b'\x8e' : 'Z',
        b'\x8f' : '?',
        b'\x90' : '?',
        b'\x91' : "'",
        b'\x92' : "'",
        b'\x93' : '"',
        b'\x94' : '"',
        b'\x95' : '*',
        b'\x96' : '-',
        b'\x97' : '--',
        b'\x98' : '~',
        b'\x99' : '(TM)',
        b'\x9a' : 's',
        b'\x9b' : '>',
        b'\x9c' : 'oe',
        b'\x9d' : '?',
        b'\x9e' : 'z',
        b'\x9f' : 'Y',
        b'\xa0' : ' ',
        b'\xa1' : '!',
        b'\xa2' : 'c',
        b'\xa3' : 'GBP',
        b'\xa4' : '$', #This approximation is especially parochial--this is the
                       #generic currency symbol.
        b'\xa5' : 'YEN',
        b'\xa6' : '|',
        b'\xa7' : 'S',
        b'\xa8' : '..',
        b'\xa9' : '',
        b'\xaa' : '(th)',
        b'\xab' : '<<',
        b'\xac' : '!',
        b'\xad' : ' ',
        b'\xae' : '(R)',
        b'\xaf' : '-',
        b'\xb0' : 'o',
        b'\xb1' : '+-',
        b'\xb2' : '2',
        b'\xb3' : '3',
        b'\xb4' : ("'", 'acute'),
        b'\xb5' : 'u',
        b'\xb6' : 'P',
        b'\xb7' : '*',
        b'\xb8' : ',',
        b'\xb9' : '1',
        b'\xba' : '(th)',
        b'\xbb' : '>>',
        b'\xbc' : '1/4',
        b'\xbd' : '1/2',
        b'\xbe' : '3/4',
        b'\xbf' : '?',
        b'\xc0' : 'A',
        b'\xc1' : 'A',
        b'\xc2' : 'A',
        b'\xc3' : 'A',
        b'\xc4' : 'A',
        b'\xc5' : 'A',
        b'\xc6' : 'AE',
        b'\xc7' : 'C',
        b'\xc8' : 'E',
        b'\xc9' : 'E',
        b'\xca' : 'E',
        b'\xcb' : 'E',
        b'\xcc' : 'I',
        b'\xcd' : 'I',
        b'\xce' : 'I',
        b'\xcf' : 'I',
        b'\xd0' : 'D',
        b'\xd1' : 'N',
        b'\xd2' : 'O',
        b'\xd3' : 'O',
        b'\xd4' : 'O',
        b'\xd5' : 'O',
        b'\xd6' : 'O',
        b'\xd7' : '*',
        b'\xd8' : 'O',
        b'\xd9' : 'U',
        b'\xda' : 'U',
        b'\xdb' : 'U',
        b'\xdc' : 'U',
        b'\xdd' : 'Y',
        b'\xde' : 'b',
        b'\xdf' : 'B',
        b'\xe0' : 'a',
        b'\xe1' : 'a',
        b'\xe2' : 'a',
        b'\xe3' : 'a',
        b'\xe4' : 'a',
        b'\xe5' : 'a',
        b'\xe6' : 'ae',
        b'\xe7' : 'c',
        b'\xe8' : 'e',
        b'\xe9' : 'e',
        b'\xea' : 'e',
        b'\xeb' : 'e',
        b'\xec' : 'i',
        b'\xed' : 'i',
        b'\xee' : 'i',
        b'\xef' : 'i',
        b'\xf0' : 'o',
        b'\xf1' : 'n',
        b'\xf2' : 'o',
        b'\xf3' : 'o',
        b'\xf4' : 'o',
        b'\xf5' : 'o',
        b'\xf6' : 'o',
        b'\xf7' : '/',
        b'\xf8' : 'o',
        b'\xf9' : 'u',
        b'\xfa' : 'u',
        b'\xfb' : 'u',
        b'\xfc' : 'u',
        b'\xfd' : 'y',
        b'\xfe' : 'b',
        b'\xff' : 'y',
        }

    # A map used when removing rogue Windows-1252/ISO-8859-1
    # characters in otherwise UTF-8 documents. Keys are integer byte
    # values (0x80-0xfe); values are the UTF-8 encoding of the
    # corresponding Windows-1252 character, e.g. 0x93 -> b'\xe2\x80\x9c'.
    #
    # Note that \x81, \x8d, \x8f, \x90, and \x9d are undefined in
    # Windows-1252, so they have no entry.
    WINDOWS_1252_TO_UTF8 = _build_windows_1252_to_utf8()

    MULTIBYTE_MARKERS_AND_SIZES = [
        (0xc2, 0xdf, 2), # 2-byte characters start with a byte C2-DF
        (0xe0, 0xef, 3), # 3-byte characters start with E0-EF
        (0xf0, 0xf4, 4), # 4-byte characters start with F0-F4
        ]

    FIRST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[0][0]
    LAST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[-1][1]

    @classmethod
    def detwingle(cls, in_bytes, main_encoding="utf8",
                  embedded_encoding="windows-1252"):
        """Fix characters from one encoding embedded in some other encoding.

        Currently the only situation supported is Windows-1252 (or its
        subset ISO-8859-1), embedded in UTF-8.

        :param in_bytes: A bytestring that you suspect contains
            characters from multiple encodings. Note that this _must_
            be a bytestring. If you've already converted the document
            to Unicode, you're too late.
        :param main_encoding: The primary encoding of `in_bytes`.
        :param embedded_encoding: The encoding that was used to embed characters
            in the main document.
        :return: A bytestring in which `embedded_encoding`
            characters have been converted to their `main_encoding`
            equivalents.
        :raise NotImplementedError: If either encoding is not one of
            the supported names.
        """
        if embedded_encoding.replace('_', '-').lower() not in (
            'windows-1252', 'windows_1252'):
            raise NotImplementedError(
                "Windows-1252 and ISO-8859-1 are the only currently supported "
                "embedded encodings.")

        if main_encoding.lower() not in ('utf8', 'utf-8'):
            raise NotImplementedError(
                "UTF-8 is the only currently supported main encoding.")

        byte_chunks = []

        chunk_start = 0
        pos = 0
        while pos < len(in_bytes):
            byte = in_bytes[pos]
            if not isinstance(byte, int):
                # Python 2.x
                byte = ord(byte)
            if (byte >= cls.FIRST_MULTIBYTE_MARKER
                and byte <= cls.LAST_MULTIBYTE_MARKER):
                # This is the start of a UTF-8 multibyte character. Skip
                # to the end.
                for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES:
                    if byte >= start and byte <= end:
                        pos += size
                        break
            elif byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8:
                # We found a Windows-1252 character!
                # Save the string up to this point as a chunk.
                byte_chunks.append(in_bytes[chunk_start:pos])

                # Now translate the Windows-1252 character into UTF-8
                # and add it as another, one-byte chunk.
                byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte])
                pos += 1
                chunk_start = pos
            else:
                # Go on to the next character.
                pos += 1
        if chunk_start == 0:
            # The string is unchanged.
            return in_bytes
        else:
            # Store the final chunk.
            byte_chunks.append(in_bytes[chunk_start:])
        return b''.join(byte_chunks)