# Path: blob/master/thirdparty/chardet/charsetprober.py
# 2992 views
######################## BEGIN LICENSE BLOCK ########################
# The Original Code is Mozilla Universal charset detector code.
#
# The Initial Developer of the Original Code is
# Netscape Communications Corporation.
# Portions created by the Initial Developer are Copyright (C) 2001
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
#   Mark Pilgrim - port to Python
#   Shy Shalom - original C code
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301  USA
######################### END LICENSE BLOCK #########################

import logging
import re

from .enums import ProbingState


class CharSetProber(object):
    """
    Prober skeleton: holds the probing state, a language filter and a logger,
    and provides static helpers that strip uninteresting bytes from a buffer.

    In this class ``charset_name`` returns ``None``, ``feed`` does nothing and
    ``get_confidence`` returns ``0.0``; presumably concrete probers override
    them (verify against the subclasses).
    """

    # Not referenced anywhere in this class; presumably the confidence above
    # which users of a prober stop early -- confirm against the callers.
    SHORTCUT_THRESHOLD = 0.95

    def __init__(self, lang_filter=None):
        """
        Store ``lang_filter`` and create a logger named after this module.

        The state stays ``None`` until ``reset()`` is called.
        """
        self._state = None
        self.lang_filter = lang_filter
        self.logger = logging.getLogger(__name__)

    def reset(self):
        """Put the prober (back) into the ``ProbingState.DETECTING`` state."""
        self._state = ProbingState.DETECTING

    @property
    def charset_name(self):
        """Name of the detected charset; always ``None`` in this class."""
        return None

    def feed(self, buf):
        """Consume ``buf``; a no-op in this class."""
        pass

    @property
    def state(self):
        """Current probing state, as last set by ``__init__`` or ``reset``."""
        return self._state

    def get_confidence(self):
        """Confidence in the detected charset; always ``0.0`` in this class."""
        return 0.0

    @staticmethod
    def filter_high_byte_only(buf):
        """
        Return ``buf`` with every run of ASCII bytes (0x00-0x7F) collapsed
        into a single space, so that only high bytes keep their value.
        """
        return re.sub(b'([\x00-\x7F])+', b' ', buf)

    @staticmethod
    def filter_international_words(buf):
        r"""
        We define three types of bytes:
        alphabet: english alphabets [a-zA-Z]
        international: international characters [\x80-\xFF]
        marker: everything else [^a-zA-Z\x80-\xFF]

        The input buffer can be thought to contain a series of words delimited
        by markers. This function works to filter all words that contain at
        least one international character. All contiguous sequences of markers
        are replaced by a single space ascii character.

        This filter applies to all scripts which do not use English characters.

        Returns a ``bytearray`` with the kept words.
        """
        filtered = bytearray()

        # This regex expression filters out only words that have at-least one
        # international character. The word may include one marker character at
        # the end.
        words = re.findall(b'[a-zA-Z]*[\x80-\xFF]+[a-zA-Z]*[^a-zA-Z\x80-\xFF]?',
                           buf)

        for word in words:
            filtered.extend(word[:-1])

            # If the last character in the word is a marker, replace it with a
            # space as markers shouldn't affect our analysis (they are used
            # similarly across all languages and may thus have similar
            # frequencies).
            # Slice (rather than index) so we get bytes on Python 3 as well.
            last_char = word[-1:]
            if not last_char.isalpha() and last_char < b'\x80':
                last_char = b' '
            filtered.extend(last_char)

        return filtered

    @staticmethod
    def filter_with_english_letters(buf):
        """
        Returns a copy of ``buf`` that retains only the sequences of English
        alphabet and high byte characters that are not between <> characters.
        Also retains English alphabet and high byte characters immediately
        before occurrences of >.

        This filter can be applied to all scripts which contain both English
        characters and extended ASCII characters, but is currently only used by
        ``Latin1Prober``.

        Returns a ``bytearray``; each kept stretch that was terminated by a
        non-alphabetic ASCII byte is followed by a single space.
        """
        filtered = bytearray()
        in_tag = False
        # Index just past the last non-alphabetic ASCII byte seen so far, i.e.
        # the start of the stretch currently being accumulated.
        prev = 0

        for curr in range(len(buf)):
            # Slice here to get bytes instead of an int with Python 3
            buf_char = buf[curr:curr + 1]
            # Check if we're coming out of or entering an HTML tag
            if buf_char == b'>':
                in_tag = False
            elif buf_char == b'<':
                in_tag = True

            # If current character is not extended-ASCII and not alphabetic...
            if buf_char < b'\x80' and not buf_char.isalpha():
                # ...and we're not in a tag
                if curr > prev and not in_tag:
                    # Keep everything after last non-extended-ASCII,
                    # non-alphabetic character
                    filtered.extend(buf[prev:curr])
                    # Output a space to delimit stretch we kept
                    filtered.extend(b' ')
                prev = curr + 1

        # If we're not in a tag...
        if not in_tag:
            # Keep everything after last non-extended-ASCII, non-alphabetic
            # character
            filtered.extend(buf[prev:])

        return filtered