Path: blob/master/thirdparty/chardet/codingstatemachine.py
2992 views
######################## BEGIN LICENSE BLOCK ########################
# The Original Code is mozilla.org code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 1998
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
# Mark Pilgrim - port to Python
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301 USA
######################### END LICENSE BLOCK #########################

import logging

from .enums import MachineState


class CodingStateMachine(object):
    """
    A state machine to verify a byte sequence for a particular encoding. For
    each byte the detector receives, it will feed that byte to every active
    state machine available, one byte at a time. The state machine changes its
    state based on its previous state and the byte it receives. There are 3
    states in a state machine that are of interest to an auto-detector:

    START state: This is the state to start with, or the state reached once
                 a legal byte sequence (i.e. a valid code point) for a
                 character has been identified.

    ME state:  This indicates that the state machine identified a byte
               sequence that is specific to the charset it is designed for
               and that there is no other possible encoding which can
               contain this byte sequence. This will lead to an immediate
               positive answer for the detector.

    ERROR state: This indicates the state machine identified an illegal byte
                 sequence for that encoding. This will lead to an immediate
                 negative answer for this encoding. Detector will exclude
                 this encoding from consideration from here on.
    """

    def __init__(self, sm):
        """Create a state machine driven by the model ``sm``.

        ``sm`` is indexed by the keys ``'class_table'``,
        ``'char_len_table'``, ``'class_factor'``, ``'state_table'``,
        ``'name'`` and ``'language'`` by the methods of this class.
        """
        self._model = sm
        self._curr_byte_pos = 0
        self._curr_char_len = 0
        self._curr_state = None
        self.logger = logging.getLogger(__name__)
        # Puts the machine into MachineState.START.
        self.reset()

    def reset(self):
        """Return the machine to ``MachineState.START``."""
        self._curr_state = MachineState.START

    def next_state(self, c):
        """Feed one byte ``c`` to the machine and return the new state.

        ``c`` is used as an index into the model's ``'class_table'``.
        """
        # for each byte we get its class
        # if it is first byte, we also get byte length
        byte_class = self._model['class_table'][c]
        if self._curr_state == MachineState.START:
            self._curr_byte_pos = 0
            self._curr_char_len = self._model['char_len_table'][byte_class]
        # from byte's class and state_table, we get its next state
        # (state_table is a flat table: row = current state,
        # column = byte class, row width = class_factor)
        curr_state = (self._curr_state * self._model['class_factor']
                      + byte_class)
        self._curr_state = self._model['state_table'][curr_state]
        self._curr_byte_pos += 1
        return self._curr_state

    def get_current_charlen(self):
        """Return the character length recorded at the last START state."""
        return self._curr_char_len

    def get_coding_state_machine(self):
        """Return the ``'name'`` entry of the underlying model."""
        return self._model['name']

    @property
    def language(self):
        """The ``'language'`` entry of the underlying model."""
        return self._model['language']