# Path: blob/master/thirdparty/chardet/utf8prober.py
# 2992 views
######################## BEGIN LICENSE BLOCK ########################
# The Original Code is mozilla.org code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 1998
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
#   Mark Pilgrim - port to Python
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301  USA
######################### END LICENSE BLOCK #########################

from .charsetprober import CharSetProber
from .enums import ProbingState, MachineState
from .codingstatemachine import CodingStateMachine
from .mbcssm import UTF8_SM_MODEL


class UTF8Prober(CharSetProber):
    """Charset prober that reports ``"utf-8"``.

    Every byte handed to :meth:`feed` is pushed through a
    ``CodingStateMachine`` built from ``UTF8_SM_MODEL``.  The prober counts
    the characters of length two or more that the machine reports, and
    derives its confidence from that count.
    """

    # Factor raised to the number of multi-byte characters seen when
    # computing the confidence (see get_confidence).
    ONE_CHAR_PROB = 0.5

    def __init__(self):
        """Create the state machine and put the prober in its initial state."""
        super(UTF8Prober, self).__init__()
        self.coding_sm = CodingStateMachine(UTF8_SM_MODEL)
        # Declared here so the attribute exists; reset() gives it its
        # real starting value of 0.
        self._num_mb_chars = None
        self.reset()

    def reset(self):
        """Reset the base prober, the state machine and the character count."""
        super(UTF8Prober, self).reset()
        self.coding_sm.reset()
        self._num_mb_chars = 0

    @property
    def charset_name(self):
        """Name of the charset this prober detects."""
        return "utf-8"

    @property
    def language(self):
        """Language associated with this prober; always the empty string."""
        return ""

    def feed(self, byte_str):
        """Run ``byte_str`` through the state machine and update the state.

        Iteration stops at the first byte for which the state machine
        returns ``MachineState.ERROR`` (prober state becomes ``NOT_ME``) or
        ``MachineState.ITS_ME`` (prober state becomes ``FOUND_IT``).

        :param byte_str: iterable of values accepted by
            ``CodingStateMachine.next_state`` (presumably a bytes-like
            object -- confirm against callers).
        :returns: the value of ``self.state`` after processing.
        """
        for byte in byte_str:
            machine_state = self.coding_sm.next_state(byte)
            if machine_state == MachineState.ERROR:
                self._state = ProbingState.NOT_ME
                break
            if machine_state == MachineState.ITS_ME:
                self._state = ProbingState.FOUND_IT
                break
            # Only characters at least two bytes long are counted.
            if (machine_state == MachineState.START
                    and self.coding_sm.get_current_charlen() >= 2):
                self._num_mb_chars += 1

        # SHORTCUT_THRESHOLD is not defined in this file; presumably it is
        # inherited from CharSetProber.
        still_detecting = self.state == ProbingState.DETECTING
        if still_detecting and self.get_confidence() > self.SHORTCUT_THRESHOLD:
            self._state = ProbingState.FOUND_IT

        return self.state

    def get_confidence(self):
        """Return the confidence derived from the multi-byte character count.

        With fewer than six counted characters the result is
        ``1.0 - 0.99 * ONE_CHAR_PROB ** count``; from six characters on it
        is the constant ``0.99``.
        """
        unlike = 0.99
        if self._num_mb_chars < 6:
            unlike *= self.ONE_CHAR_PROB ** self._num_mb_chars
            return 1.0 - unlike
        return unlike