# Path: blob/master/thirdparty/chardet/latin1prober.py
# 2992 views
######################## BEGIN LICENSE BLOCK ########################
# The Original Code is Mozilla Universal charset detector code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 2001
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
#   Mark Pilgrim - port to Python
#   Shy Shalom - original C code
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301  USA
######################### END LICENSE BLOCK #########################

from .charsetprober import CharSetProber
from .enums import ProbingState

# Number of likelihood categories used by Latin1ClassModel (values 0..3);
# also the length of the prober's per-category counter list.
FREQ_CAT_NUM = 4

# Character classes. Each byte value is mapped to one of these by
# Latin1_CharToClass.
UDF = 0  # undefined
OTH = 1  # other
ASC = 2  # ascii capital letter
ASS = 3  # ascii small letter
ACV = 4  # accent capital vowel
ACO = 5  # accent capital other
ASV = 6  # accent small vowel
ASO = 7  # accent small other
CLASS_NUM = 8  # total classes

# Byte value (0x00 - 0xFF) -> character class; one row per eight byte values.
Latin1_CharToClass = (
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 00 - 07
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 08 - 0F
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 10 - 17
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 18 - 1F
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 20 - 27
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 28 - 2F
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 30 - 37
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 38 - 3F
    OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC,  # 40 - 47
    ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,  # 48 - 4F
    ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,  # 50 - 57
    ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH,  # 58 - 5F
    OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS,  # 60 - 67
    ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS,  # 68 - 6F
    ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS,  # 70 - 77
    ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH,  # 78 - 7F
    OTH, UDF, OTH, ASO, OTH, OTH, OTH, OTH,  # 80 - 87
    OTH, OTH, ACO, OTH, ACO, UDF, ACO, UDF,  # 88 - 8F
    UDF, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 90 - 97
    OTH, OTH, ASO, OTH, ASO, UDF, ASO, ACO,  # 98 - 9F
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # A0 - A7
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # A8 - AF
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # B0 - B7
    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # B8 - BF
    ACV, ACV, ACV, ACV, ACV, ACV, ACO, ACO,  # C0 - C7
    ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV,  # C8 - CF
    ACO, ACO, ACV, ACV, ACV, ACV, ACV, OTH,  # D0 - D7
    ACV, ACV, ACV, ACV, ACV, ACO, ACO, ACO,  # D8 - DF
    ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASO,  # E0 - E7
    ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV,  # E8 - EF
    ASO, ASO, ASV, ASV, ASV, ASV, ASV, OTH,  # F0 - F7
    ASV, ASV, ASV, ASV, ASV, ASO, ASO, ASO,  # F8 - FF
)

# Likelihood of one character class following another, stored as a flat
# CLASS_NUM x CLASS_NUM table indexed by
# (previous class * CLASS_NUM) + current class.
# 0 : illegal
# 1 : very unlikely
# 2 : normal
# 3 : very likely
Latin1ClassModel = (
    # UDF OTH ASC ASS ACV ACO ASV ASO
    0, 0, 0, 0, 0, 0, 0, 0,  # UDF
    0, 3, 3, 3, 3, 3, 3, 3,  # OTH
    0, 3, 3, 3, 3, 3, 3, 3,  # ASC
    0, 3, 3, 3, 1, 1, 3, 3,  # ASS
    0, 3, 3, 3, 1, 2, 1, 2,  # ACV
    0, 3, 3, 3, 3, 3, 3, 3,  # ACO
    0, 3, 1, 3, 1, 1, 1, 3,  # ASV
    0, 3, 1, 3, 1, 1, 3, 3,  # ASO
)


class Latin1Prober(CharSetProber):
    """Prober that scores input by how likely its character-class pairs are.

    Every byte is classified through ``Latin1_CharToClass``; each
    (previous class, current class) pair is then rated through
    ``Latin1ClassModel`` and tallied per likelihood category.
    """

    def __init__(self):
        super(Latin1Prober, self).__init__()
        # Placeholders only; reset() installs the real starting values.
        self._last_char_class = None
        self._freq_counter = None
        self.reset()

    def reset(self):
        """Return the prober to its initial state."""
        # Start as if the previous character had been of class OTH.
        self._last_char_class = OTH
        # One tally per likelihood category (index == likelihood value).
        self._freq_counter = [0 for _ in range(FREQ_CAT_NUM)]
        CharSetProber.reset(self)

    @property
    def charset_name(self):
        """Name of the charset this prober reports."""
        return "ISO-8859-1"

    @property
    def language(self):
        """Language reported by this prober (always the empty string)."""
        return ""

    def feed(self, byte_str):
        """Consume *byte_str*, update the tallies, and return ``self.state``.

        The input is first passed through the inherited
        ``filter_with_english_letters`` helper. Each resulting element is
        used directly as an index into ``Latin1_CharToClass``, so it is
        presumably an integer byte value -- confirm against the helper.
        """
        filtered = self.filter_with_english_letters(byte_str)
        for byte in filtered:
            current_class = Latin1_CharToClass[byte]
            row_start = self._last_char_class * CLASS_NUM
            likelihood = Latin1ClassModel[row_start + current_class]
            if likelihood == 0:
                # An illegal class pair rules this charset out; stop scanning.
                self._state = ProbingState.NOT_ME
                break
            self._freq_counter[likelihood] += 1
            self._last_char_class = current_class

        return self.state

    def get_confidence(self):
        """Return the confidence computed from the tallies seen so far.

        Returns 0.01 once the prober is in the ``NOT_ME`` state. Otherwise
        the score is ("very likely" count - 20 * "very unlikely" count)
        divided by the total count, floored at 0.0 and scaled by 0.73.
        """
        if self.state == ProbingState.NOT_ME:
            return 0.01

        counts = self._freq_counter
        observed = sum(counts)
        if observed < 0.01:
            # Nothing tallied yet, so there is no evidence either way.
            score = 0.0
        else:
            score = (counts[3] - counts[1] * 20.0) / observed
        if score < 0.0:
            score = 0.0
        # lower the confidence of latin1 so that other more accurate
        # detector can take priority.
        return score * 0.73