Path: blob/master/thirdparty/chardet/eucjpprober.py
2992 views
######################## BEGIN LICENSE BLOCK ########################
# The Original Code is mozilla.org code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 1998
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
#   Mark Pilgrim - port to Python
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301  USA
######################### END LICENSE BLOCK #########################

from .enums import ProbingState, MachineState
from .mbcharsetprober import MultiByteCharSetProber
from .codingstatemachine import CodingStateMachine
from .chardistribution import EUCJPDistributionAnalysis
from .jpcntx import EUCJPContextAnalysis
from .mbcssm import EUCJP_SM_MODEL


class EUCJPProber(MultiByteCharSetProber):
    """Charset prober for EUC-JP encoded Japanese text.

    Each byte is run through a coding state machine built from
    ``EUCJP_SM_MODEL``; byte pairs are additionally handed to a
    distribution analyzer and a context analyzer, and the reported
    confidence is the larger of the two analyzers' confidences.
    """

    def __init__(self):
        """Create the state machine and both analyzers, then reset."""
        super(EUCJPProber, self).__init__()
        self.coding_sm = CodingStateMachine(EUCJP_SM_MODEL)
        self.distribution_analyzer = EUCJPDistributionAnalysis()
        self.context_analyzer = EUCJPContextAnalysis()
        self.reset()

    def reset(self):
        """Reset the base prober and the EUC-JP specific context analyzer."""
        super(EUCJPProber, self).reset()
        self.context_analyzer.reset()

    @property
    def charset_name(self):
        """Name of the charset this prober detects."""
        return "EUC-JP"

    @property
    def language(self):
        """Language associated with this prober."""
        return "Japanese"

    def feed(self, byte_str):
        """Consume a chunk of bytes and return the updated probing state.

        :param byte_str: chunk of input to analyse (``bytes``/``bytearray``).
                         An empty chunk is accepted and leaves the prober
                         unchanged apart from the final shortcut check.
        :returns: ``self.state`` after processing the chunk.
        """
        for i, byte in enumerate(byte_str):
            # PY3K: iterating a byte array yields ints, not 1-char strings
            coding_state = self.coding_sm.next_state(byte)
            if coding_state == MachineState.ERROR:
                self.logger.debug('%s %s prober hit error at byte %s',
                                  self.charset_name, self.language, i)
                self._state = ProbingState.NOT_ME
                break
            elif coding_state == MachineState.ITS_ME:
                self._state = ProbingState.FOUND_IT
                break
            elif coding_state == MachineState.START:
                # NOTE(review): START presumably means a complete character
                # was just consumed -- confirm against CodingStateMachine.
                char_len = self.coding_sm.get_current_charlen()
                if i == 0:
                    # The pair straddles two chunks: slot 0 of _last_char
                    # holds the final byte saved at the end of the previous
                    # feed() call, slot 1 gets the first byte of this one.
                    self._last_char[1] = byte
                    self.context_analyzer.feed(self._last_char, char_len)
                    self.distribution_analyzer.feed(self._last_char, char_len)
                else:
                    pair = byte_str[i - 1:i + 1]
                    self.context_analyzer.feed(pair, char_len)
                    self.distribution_analyzer.feed(pair, char_len)

        # Remember the final byte for the next chunk.  Guarded so that an
        # empty chunk does not raise IndexError on byte_str[-1].
        if byte_str:
            self._last_char[0] = byte_str[-1]

        if self.state == ProbingState.DETECTING:
            # Stop early once the context analyzer has seen enough data and
            # the confidence is already above the shortcut threshold.
            if (self.context_analyzer.got_enough_data() and
                    (self.get_confidence() > self.SHORTCUT_THRESHOLD)):
                self._state = ProbingState.FOUND_IT

        return self.state

    def get_confidence(self):
        """Return the higher of the context and distribution confidences."""
        context_conf = self.context_analyzer.get_confidence()
        distrib_conf = self.distribution_analyzer.get_confidence()
        return max(context_conf, distrib_conf)