# Path: blob/master/thirdparty/chardet/sbcharsetprober.py
# 2992 views
######################## BEGIN LICENSE BLOCK ########################
# The Original Code is Mozilla Universal charset detector code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 2001
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
#   Mark Pilgrim - port to Python
#   Shy Shalom - original C code
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301  USA
######################### END LICENSE BLOCK #########################

from .charsetprober import CharSetProber
from .enums import CharacterCategory, ProbingState, SequenceLikelihood


class SingleByteCharSetProber(CharSetProber):
    """Charset prober driven by a single-byte language model.

    Each fed byte is mapped to an "order" through the model's
    ``char_to_order_map``.  Pairs of consecutive bytes whose orders both fall
    below ``SAMPLE_SIZE`` are looked up in the model's ``precedence_matrix``
    and tallied per likelihood category; the tallies drive
    ``get_confidence()`` and the early FOUND_IT / NOT_ME decisions in
    ``feed()``.
    """

    # Orders below this value take part in the pair (sequence) statistics.
    SAMPLE_SIZE = 64
    # Number of counted pairs required before feed() attempts a shortcut.
    SB_ENOUGH_REL_THRESHOLD = 1024  # 0.25 * SAMPLE_SIZE^2
    POSITIVE_SHORTCUT_THRESHOLD = 0.95
    NEGATIVE_SHORTCUT_THRESHOLD = 0.05

    def __init__(self, model, reversed=False, name_prober=None):
        """Create a prober for ``model``.

        :param model: mapping read via the keys ``'charset_name'``,
            ``'keep_english_letter'``, ``'char_to_order_map'``,
            ``'precedence_matrix'``, ``'typical_positive_ratio'`` and
            (optionally, via ``.get``) ``'language'``.
        :param reversed: when true, every pair is looked up in the precedence
            matrix with its two orders swapped.
        :param name_prober: optional auxiliary prober; when truthy,
            ``charset_name`` and ``language`` are delegated to it.
        """
        super(SingleByteCharSetProber, self).__init__()
        self._model = model
        # TRUE if we need to reverse every pair in the model lookup
        self._reversed = reversed
        # Optional auxiliary prober for name decision
        self._name_prober = name_prober
        # Declared here for readability; reset() assigns the real start values.
        self._last_order = None
        self._seq_counters = None
        self._total_seqs = None
        self._total_char = None
        self._freq_char = None
        self.reset()

    def reset(self):
        """Reset the base prober and zero all per-stream statistics."""
        super(SingleByteCharSetProber, self).reset()
        # char order of last character; 255 is >= SAMPLE_SIZE, so the first
        # byte fed after a reset never completes a pair.
        self._last_order = 255
        # one counter per sequence-likelihood category
        self._seq_counters = [0] * SequenceLikelihood.get_num_categories()
        self._total_seqs = 0
        self._total_char = 0
        # characters that fall in our sampling range
        self._freq_char = 0

    @property
    def charset_name(self):
        """Charset name from the auxiliary name prober if set, else the model."""
        if self._name_prober:
            return self._name_prober.charset_name
        return self._model['charset_name']

    @property
    def language(self):
        """Language from the auxiliary name prober if set, else the model.

        The model's ``'language'`` entry is optional; ``None`` is returned
        when it is absent.
        """
        if self._name_prober:
            return self._name_prober.language
        return self._model.get('language')

    def feed(self, byte_str):
        """Update the statistics with ``byte_str`` and return ``self.state``.

        :param byte_str: iterable whose items are valid indices into the
            model's ``char_to_order_map`` (presumably a ``bytearray`` or
            Python 3 ``bytes`` -- confirm against the caller).
        :returns: the prober state after processing; it may have moved to
            ``ProbingState.FOUND_IT`` or ``ProbingState.NOT_ME`` once more
            than ``SB_ENOUGH_REL_THRESHOLD`` pairs have been counted.
        """
        if not self._model['keep_english_letter']:
            # filter_international_words is not defined in this class
            # (presumably inherited from CharSetProber).
            byte_str = self.filter_international_words(byte_str)
        if not byte_str:
            return self.state

        char_to_order_map = self._model['char_to_order_map']
        for byte in byte_str:
            # XXX: Order is in range 1-64, so one would think we want 0-63
            #      here, but that leads to 27 more test failures than before.
            order = char_to_order_map[byte]
            # XXX: This was SYMBOL_CAT_ORDER before, with a value of 250, but
            #      CharacterCategory.SYMBOL is actually 253, so we use CONTROL
            #      to make it closer to the original intent. The only
            #      difference is whether or not we count digits and control
            #      characters for _total_char purposes.
            if order < CharacterCategory.CONTROL:
                self._total_char += 1
            if order < self.SAMPLE_SIZE:
                self._freq_char += 1
                if self._last_order < self.SAMPLE_SIZE:
                    self._total_seqs += 1
                    if self._reversed:
                        # reverse the order of the letters in the lookup
                        pair_index = (order * self.SAMPLE_SIZE
                                      + self._last_order)
                    else:
                        pair_index = (self._last_order * self.SAMPLE_SIZE
                                      + order)
                    likelihood = self._model['precedence_matrix'][pair_index]
                    self._seq_counters[likelihood] += 1
            self._last_order = order

        charset_name = self._model['charset_name']
        if (self.state == ProbingState.DETECTING
                and self._total_seqs > self.SB_ENOUGH_REL_THRESHOLD):
            confidence = self.get_confidence()
            if confidence > self.POSITIVE_SHORTCUT_THRESHOLD:
                self.logger.debug('%s confidence = %s, we have a winner',
                                  charset_name, confidence)
                self._state = ProbingState.FOUND_IT
            elif confidence < self.NEGATIVE_SHORTCUT_THRESHOLD:
                self.logger.debug('%s confidence = %s, below negative '
                                  'shortcut threshhold %s', charset_name,
                                  confidence,
                                  self.NEGATIVE_SHORTCUT_THRESHOLD)
                self._state = ProbingState.NOT_ME

        return self.state

    def get_confidence(self):
        """Return the confidence that the input matches this model.

        Returns 0.01 while no pair has been counted.  Otherwise the share of
        POSITIVE pairs among all counted pairs is divided by the model's
        ``typical_positive_ratio`` and scaled by the fraction of counted
        characters that fell in the sampling range; any result >= 1.0 is
        reported as 0.99.
        """
        confidence = 0.01
        if self._total_seqs > 0:
            # "1.0 *" forces float division (matters under Python 2).
            positive = 1.0 * self._seq_counters[SequenceLikelihood.POSITIVE]
            confidence = (positive / self._total_seqs
                          / self._model['typical_positive_ratio'])
            # _total_char cannot be zero here: every counted pair involves a
            # character with order < SAMPLE_SIZE (64), and feed() counts any
            # order below CharacterCategory.CONTROL in _total_char (CONTROL
            # is 251 per the note in feed(), so the former implies the latter).
            confidence = confidence * self._freq_char / self._total_char
            if confidence >= 1.0:
                confidence = 0.99
        return confidence