CoCalc -- charsetprober.py

GitHub Repository: sqlmapproject/sqlmap
Path: blob/master/thirdparty/chardet/charsetprober.py
2992 views
1
######################## BEGIN LICENSE BLOCK ########################
2
# The Original Code is Mozilla Universal charset detector code.
3
#
4
# The Initial Developer of the Original Code is
5
# Netscape Communications Corporation.
6
# Portions created by the Initial Developer are Copyright (C) 2001
7
# the Initial Developer. All Rights Reserved.
8
#
9
# Contributor(s):
10
#   Mark Pilgrim - port to Python
11
#   Shy Shalom - original C code
12
#
13
# This library is free software; you can redistribute it and/or
14
# modify it under the terms of the GNU Lesser General Public
15
# License as published by the Free Software Foundation; either
16
# version 2.1 of the License, or (at your option) any later version.
17
#
18
# This library is distributed in the hope that it will be useful,
19
# but WITHOUT ANY WARRANTY; without even the implied warranty of
20
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
21
# Lesser General Public License for more details.
22
#
23
# You should have received a copy of the GNU Lesser General Public
24
# License along with this library; if not, write to the Free Software
25
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
26
# 02110-1301  USA
27
######################### END LICENSE BLOCK #########################
28

29
import logging
30
import re
31

32
from .enums import ProbingState
33

34

35
class CharSetProber(object):
    """
    Base charset prober.

    Stores the probing state, an optional language filter and a logger, and
    offers static helpers that strip a byte buffer down to the parts that are
    interesting for charset detection. ``charset_name``, ``feed`` and
    ``get_confidence`` are placeholders in this class (they return ``None``,
    ``None`` and ``0.0`` respectively); presumably concrete probers override
    them.
    """

    # Not referenced anywhere inside this class. Presumably the confidence
    # level above which callers stop probing early -- confirm against users.
    SHORTCUT_THRESHOLD = 0.95

    def __init__(self, lang_filter=None):
        """
        Create a prober.

        ``lang_filter`` is stored unchanged on the instance. The state stays
        ``None`` until ``reset()`` is called.
        """
        self._state = None
        self.lang_filter = lang_filter
        self.logger = logging.getLogger(__name__)

    def reset(self):
        """Put the prober into the ``ProbingState.DETECTING`` state."""
        self._state = ProbingState.DETECTING

    @property
    def charset_name(self):
        """Name of the detected charset; always ``None`` in this class."""
        return None

    def feed(self, buf):
        """Accept a chunk of input; a no-op in this class."""
        pass

    @property
    def state(self):
        """Current probing state (``None`` before the first ``reset()``)."""
        return self._state

    def get_confidence(self):
        """Return the detection confidence; always ``0.0`` in this class."""
        return 0.0

    @staticmethod
    def filter_high_byte_only(buf):
        """
        Return ``buf`` with every run of ASCII bytes (0x00-0x7F) collapsed
        into a single space, leaving only the high bytes (0x80-0xFF) intact.
        """
        buf = re.sub(b'([\x00-\x7F])+', b' ', buf)
        return buf

    @staticmethod
    def filter_international_words(buf):
        """
        We define three types of bytes:
        alphabet: english alphabets [a-zA-Z]
        international: international characters [0x80-0xFF]
        marker: everything else [^a-zA-Z0x80-0xFF]

        The input buffer can be thought to contain a series of words delimited
        by markers. This function works to filter all words that contain at
        least one international character. All contiguous sequences of markers
        are replaced by a single space ascii character.

        This filter applies to all scripts which do not use English characters.

        Returns a new ``bytearray``; ``buf`` itself is not modified.
        """
        filtered = bytearray()

        # This regex expression filters out only words that have at-least one
        # international character. The word may include one marker character at
        # the end.
        words = re.findall(b'[a-zA-Z]*[\x80-\xFF]+[a-zA-Z]*[^a-zA-Z\x80-\xFF]?',
                           buf)

        for word in words:
            # Everything but the final byte is copied verbatim; only the final
            # byte can be a marker.
            filtered.extend(word[:-1])

            # If the last character in the word is a marker, replace it with a
            # space as markers shouldn't affect our analysis (they are used
            # similarly across all languages and may thus have similar
            # frequencies).
            # Slice (not index) so we get bytes rather than an int on Python 3.
            last_char = word[-1:]
            if not last_char.isalpha() and last_char < b'\x80':
                last_char = b' '
            filtered.extend(last_char)

        return filtered

    @staticmethod
    def filter_with_english_letters(buf):
        """
        Returns a copy of ``buf`` that retains only the sequences of English
        alphabet and high byte characters that are not between <> characters.
        Also retains English alphabet and high byte characters immediately
        before occurrences of >.

        Each retained stretch that is ended by a delimiter is followed by a
        single space. Note that a stretch running directly into ``<`` is
        dropped, because the tag flag is already set when the ``<`` delimiter
        itself is evaluated. For example ``b'abc <b>def</b> ghi'`` yields
        ``b'abc b b ghi'``.

        This filter can be applied to all scripts which contain both English
        characters and extended ASCII characters, but is currently only used by
        ``Latin1Prober``.

        Returns a new ``bytearray``; ``buf`` itself is not modified.
        """
        filtered = bytearray()
        in_tag = False
        # Start index of the stretch of letters / high bytes being collected.
        prev = 0

        # Index-based loop on purpose: slicing yields bytes on both Python 2
        # and Python 3, whereas iterating bytes on Python 3 yields ints.
        for curr in range(len(buf)):
            # Slice here to get bytes instead of an int with Python 3
            buf_char = buf[curr:curr + 1]
            # Check if we're coming out of or entering an HTML tag
            if buf_char == b'>':
                in_tag = False
            elif buf_char == b'<':
                in_tag = True

            # If current character is not extended-ASCII and not alphabetic...
            if buf_char < b'\x80' and not buf_char.isalpha():
                # ...and we're not in a tag
                if curr > prev and not in_tag:
                    # Keep everything after last non-extended-ASCII,
                    # non-alphabetic character
                    filtered.extend(buf[prev:curr])
                    # Output a space to delimit stretch we kept
                    filtered.extend(b' ')
                # The next stretch starts right after this delimiter.
                prev = curr + 1

        # If we're not in a tag...
        if not in_tag:
            # Keep everything after last non-extended-ASCII, non-alphabetic
            # character
            filtered.extend(buf[prev:])

        return filtered
146

147
Product

Resources

Company