# Path: blob/master/thirdparty/chardet/mbcharsetprober.py
# 2992 views
######################## BEGIN LICENSE BLOCK ########################
# The Original Code is Mozilla Universal charset detector code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 2001
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
#   Mark Pilgrim - port to Python
#   Shy Shalom - original C code
#   Proofpoint, Inc.
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301  USA
######################### END LICENSE BLOCK #########################

from .charsetprober import CharSetProber
from .enums import ProbingState, MachineState


class MultiByteCharSetProber(CharSetProber):
    """
    MultiByteCharSetProber

    Base class for probers that drive a coding state machine
    (``coding_sm``) and a distribution analyzer
    (``distribution_analyzer``) over the bytes they are fed.  Both
    attributes start out as ``None`` here and must be assigned (presumably
    by subclasses) before ``feed`` or ``get_confidence`` is called.
    """

    def __init__(self, lang_filter=None):
        """Initialise the prober; ``lang_filter`` is passed to the base class."""
        super(MultiByteCharSetProber, self).__init__(lang_filter=lang_filter)
        self.distribution_analyzer = None
        self.coding_sm = None
        # Two-slot buffer: slot 0 keeps the final byte of the previous chunk
        # so a character split across two feed() calls can still be handed
        # to the distribution analyzer as a pair.
        self._last_char = [0, 0]

    def reset(self):
        """Reset the base prober, the state machine and the analyzer (if set)."""
        super(MultiByteCharSetProber, self).reset()
        if self.coding_sm:
            self.coding_sm.reset()
        if self.distribution_analyzer:
            self.distribution_analyzer.reset()
        self._last_char = [0, 0]

    @property
    def charset_name(self):
        """Name of the detected charset; not implemented in this base class."""
        raise NotImplementedError

    @property
    def language(self):
        """Language associated with the charset; not implemented here."""
        raise NotImplementedError

    def feed(self, byte_str):
        """Feed a chunk of bytes to the prober and return its state.

        Each byte is run through ``coding_sm``.  An ERROR state marks the
        prober as NOT_ME, an ITS_ME state marks it as FOUND_IT, and every
        time the machine is back at START the just-completed character is
        passed to ``distribution_analyzer``.

        An empty ``byte_str`` is a no-op: the current state is returned
        unchanged (previously this raised ``IndexError``).
        """
        if not byte_str:
            # Nothing to analyse, and byte_str[-1] below would fail.
            return self.state

        for i, byte in enumerate(byte_str):
            coding_state = self.coding_sm.next_state(byte)
            if coding_state == MachineState.ERROR:
                self.logger.debug('%s %s prober hit error at byte %s',
                                  self.charset_name, self.language, i)
                self._state = ProbingState.NOT_ME
                break
            elif coding_state == MachineState.ITS_ME:
                self._state = ProbingState.FOUND_IT
                break
            elif coding_state == MachineState.START:
                char_len = self.coding_sm.get_current_charlen()
                if i == 0:
                    # The character may have started in the previous chunk:
                    # pair its remembered last byte with this first byte.
                    self._last_char[1] = byte_str[0]
                    self.distribution_analyzer.feed(self._last_char, char_len)
                else:
                    self.distribution_analyzer.feed(byte_str[i - 1:i + 1],
                                                    char_len)

        # Remember the final byte for a character continuing in the next chunk.
        self._last_char[0] = byte_str[-1]

        if self.state == ProbingState.DETECTING:
            # Shortcut: stop early once the analyzer has enough data and the
            # confidence exceeds the threshold.
            if (self.distribution_analyzer.got_enough_data() and
                    (self.get_confidence() > self.SHORTCUT_THRESHOLD)):
                self._state = ProbingState.FOUND_IT

        return self.state

    def get_confidence(self):
        """Return the confidence reported by the distribution analyzer."""
        return self.distribution_analyzer.get_confidence()