Path: blob/master/venv/Lib/site-packages/chardet/escprober.py
811 views
######################## BEGIN LICENSE BLOCK ########################
# The Original Code is mozilla.org code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 1998
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
#   Mark Pilgrim - port to Python
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301  USA
######################### END LICENSE BLOCK #########################

from .charsetprober import CharSetProber
from .codingstatemachine import CodingStateMachine
from .enums import LanguageFilter, ProbingState, MachineState
from .escsm import (HZ_SM_MODEL, ISO2022CN_SM_MODEL, ISO2022JP_SM_MODEL,
                    ISO2022KR_SM_MODEL)


class EscCharSetProber(CharSetProber):
    """
    This CharSetProber uses a "code scheme" approach for detecting encodings,
    whereby easily recognizable escape or shift sequences are relied on to
    identify these encodings.

    One ``CodingStateMachine`` is created per candidate encoding (HZ,
    ISO-2022-CN, ISO-2022-JP, ISO-2022-KR), restricted by the language
    filter.  Every input byte is fed to each machine that is still active;
    the first machine to report ``MachineState.ITS_ME`` decides the result.
    """

    def __init__(self, lang_filter=None):
        """Build the state machines selected by *lang_filter* and reset.

        :param lang_filter: Bitmask of ``LanguageFilter`` flags choosing
            which escape-based encodings to probe for.  ``None`` (the
            default) enables no state machines.
        """
        super(EscCharSetProber, self).__init__(lang_filter=lang_filter)

        # The signature defaults to ``None``, and ``None & flag`` raises
        # TypeError.  Treat a missing filter as "no languages enabled" so
        # that ``EscCharSetProber()`` is constructible.
        # NOTE(review): assumes the base class stores the argument as-is in
        # ``self.lang_filter``; if it already normalises None this guard is
        # simply a no-op.
        enabled = self.lang_filter
        if enabled is None:
            enabled = 0

        # Order matters: ``feed`` consults the machines in list order and
        # returns on the first ITS_ME, so keep HZ, ISO-2022-CN, -JP, -KR.
        self.coding_sm = []
        if enabled & LanguageFilter.CHINESE_SIMPLIFIED:
            self.coding_sm.append(CodingStateMachine(HZ_SM_MODEL))
            self.coding_sm.append(CodingStateMachine(ISO2022CN_SM_MODEL))
        if enabled & LanguageFilter.JAPANESE:
            self.coding_sm.append(CodingStateMachine(ISO2022JP_SM_MODEL))
        if enabled & LanguageFilter.KOREAN:
            self.coding_sm.append(CodingStateMachine(ISO2022KR_SM_MODEL))

        # Placeholders so every attribute exists before reset() fills it in.
        self.active_sm_count = None
        self._detected_charset = None
        self._detected_language = None
        self._state = None
        self.reset()

    def reset(self):
        """Re-activate every state machine and forget any detection result."""
        super(EscCharSetProber, self).reset()
        for coding_sm in self.coding_sm:
            # Skip falsy placeholder entries, if any.
            if not coding_sm:
                continue
            coding_sm.active = True
            coding_sm.reset()
        self.active_sm_count = len(self.coding_sm)
        self._detected_charset = None
        self._detected_language = None

    @property
    def charset_name(self):
        """Value recorded by ``feed`` on detection, or ``None`` so far."""
        return self._detected_charset

    @property
    def language(self):
        """Language of the machine that matched, or ``None`` so far."""
        return self._detected_language

    def get_confidence(self):
        """Return 0.99 once a charset has been detected, otherwise 0.00."""
        if self._detected_charset:
            return 0.99
        else:
            return 0.00

    def feed(self, byte_str):
        """Run each element of *byte_str* through the active state machines.

        :param byte_str: Iterable of input units; each element is passed
            unchanged to ``CodingStateMachine.next_state``.
        :returns: ``self.state`` after processing.  Processing stops early
            when a machine reports ITS_ME (state becomes
            ``ProbingState.FOUND_IT``) or when the last active machine
            reports ERROR (state becomes ``ProbingState.NOT_ME``).
        """
        for c in byte_str:
            for coding_sm in self.coding_sm:
                if not coding_sm or not coding_sm.active:
                    continue
                coding_state = coding_sm.next_state(c)
                if coding_state == MachineState.ERROR:
                    # This encoding is ruled out; stop feeding its machine.
                    coding_sm.active = False
                    self.active_sm_count -= 1
                    if self.active_sm_count <= 0:
                        # Every candidate has been eliminated.
                        self._state = ProbingState.NOT_ME
                        return self.state
                elif coding_state == MachineState.ITS_ME:
                    self._state = ProbingState.FOUND_IT
                    self._detected_charset = (
                        coding_sm.get_coding_state_machine())
                    self._detected_language = coding_sm.language
                    return self.state

        return self.state