Path: blob/master/thirdparty/chardet/sbcsgroupprober.py
2992 views
######################## BEGIN LICENSE BLOCK ########################
# The Original Code is Mozilla Universal charset detector code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 2001
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
#   Mark Pilgrim - port to Python
#   Shy Shalom - original C code
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301  USA
######################### END LICENSE BLOCK #########################

from .charsetgroupprober import CharSetGroupProber
from .sbcharsetprober import SingleByteCharSetProber
from .langcyrillicmodel import (Win1251CyrillicModel, Koi8rModel,
                                Latin5CyrillicModel, MacCyrillicModel,
                                Ibm866Model, Ibm855Model)
from .langgreekmodel import Latin7GreekModel, Win1253GreekModel
from .langbulgarianmodel import Latin5BulgarianModel, Win1251BulgarianModel
# from .langhungarianmodel import Latin2HungarianModel, Win1250HungarianModel
from .langthaimodel import TIS620ThaiModel
from .langhebrewmodel import Win1255HebrewModel
from .hebrewprober import HebrewProber
from .langturkishmodel import Latin5TurkishModel


class SBCSGroupProber(CharSetGroupProber):
    """Group prober holding one single-byte prober per language model.

    On construction it fills ``self.probers`` with a
    ``SingleByteCharSetProber`` for each imported Cyrillic, Greek,
    Bulgarian, Thai and Turkish model, followed by a ``HebrewProber``
    and the two Windows-1255 probers it is wired to.
    """

    def __init__(self):
        """Create every child prober, then call ``self.reset()``."""
        super(SBCSGroupProber, self).__init__()

        # Models that need nothing beyond a plain SingleByteCharSetProber.
        # The order here is the order of ``self.probers``.
        #
        # TODO: Restore Hungarian encodings (iso-8859-2 and windows-1250)
        # after we retrain model (Latin2HungarianModel and
        # Win1250HungarianModel would go after the Bulgarian models).
        language_models = (
            Win1251CyrillicModel,
            Koi8rModel,
            Latin5CyrillicModel,
            MacCyrillicModel,
            Ibm866Model,
            Ibm855Model,
            Latin7GreekModel,
            Win1253GreekModel,
            Latin5BulgarianModel,
            Win1251BulgarianModel,
            TIS620ThaiModel,
            Latin5TurkishModel,
        )
        self.probers = [SingleByteCharSetProber(model)
                        for model in language_models]

        # Hebrew uses one coordinating HebrewProber plus two probers over the
        # same Windows-1255 model. Both model probers receive the coordinator
        # as their third argument; the second argument is False for the
        # "logical" prober and True for the "visual" one.
        hebrew = HebrewProber()
        logical = SingleByteCharSetProber(Win1255HebrewModel, False, hebrew)
        visual = SingleByteCharSetProber(Win1255HebrewModel, True, hebrew)
        hebrew.set_model_probers(logical, visual)
        self.probers.extend((hebrew, logical, visual))

        self.reset()