Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
sqlmapproject
GitHub Repository: sqlmapproject/sqlmap
Path: blob/master/thirdparty/chardet/mbcharsetprober.py
2992 views
1
######################## BEGIN LICENSE BLOCK ########################
2
# The Original Code is Mozilla Universal charset detector code.
3
#
4
# The Initial Developer of the Original Code is
5
# Netscape Communications Corporation.
6
# Portions created by the Initial Developer are Copyright (C) 2001
7
# the Initial Developer. All Rights Reserved.
8
#
9
# Contributor(s):
10
# Mark Pilgrim - port to Python
11
# Shy Shalom - original C code
12
# Proofpoint, Inc.
13
#
14
# This library is free software; you can redistribute it and/or
15
# modify it under the terms of the GNU Lesser General Public
16
# License as published by the Free Software Foundation; either
17
# version 2.1 of the License, or (at your option) any later version.
18
#
19
# This library is distributed in the hope that it will be useful,
20
# but WITHOUT ANY WARRANTY; without even the implied warranty of
21
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
22
# Lesser General Public License for more details.
23
#
24
# You should have received a copy of the GNU Lesser General Public
25
# License along with this library; if not, write to the Free Software
26
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
27
# 02110-1301 USA
28
######################### END LICENSE BLOCK #########################
29
30
from .charsetprober import CharSetProber
31
from .enums import ProbingState, MachineState
32
33
34
class MultiByteCharSetProber(CharSetProber):
    """
    Shared prober logic for multi-byte character sets.

    Each byte is pushed through ``coding_sm`` (a coding state machine);
    whenever the machine returns to ``MachineState.START`` the two-byte
    window ending at that byte is handed to ``distribution_analyzer``.
    Both collaborators are initialised to ``None`` here, and
    ``charset_name`` / ``language`` raise ``NotImplementedError``, so
    this class is presumably meant to be subclassed -- confirm against
    the concrete probers.
    """

    def __init__(self, lang_filter=None):
        """
        Initialise the prober with no state machine or analyzer attached.

        :param lang_filter: forwarded unchanged to ``CharSetProber``.
        """
        super(MultiByteCharSetProber, self).__init__(lang_filter=lang_filter)
        self.distribution_analyzer = None
        self.coding_sm = None
        # Two-slot window: slot 0 keeps the final byte of the previous
        # feed() call, slot 1 receives the first byte of the current one.
        self._last_char = [0, 0]

    def reset(self):
        """
        Reset the base prober, then the state machine and the analyzer
        (each only if it has been set), and clear the carried-over bytes.
        """
        super(MultiByteCharSetProber, self).reset()
        if self.coding_sm:
            self.coding_sm.reset()
        if self.distribution_analyzer:
            self.distribution_analyzer.reset()
        self._last_char = [0, 0]

    @property
    def charset_name(self):
        """Not implemented at this level; always raises."""
        raise NotImplementedError

    @property
    def language(self):
        """Not implemented at this level; always raises."""
        raise NotImplementedError

    def feed(self, byte_str):
        """
        Consume a chunk of bytes and update the probing state.

        :param byte_str: indexable, sliceable sequence of bytes; it may be
            empty, in which case no byte is processed and the byte
            remembered from the previous call is kept.
        :returns: ``self.state`` after processing the chunk.
        """
        for i, byte in enumerate(byte_str):
            coding_state = self.coding_sm.next_state(byte)
            if coding_state == MachineState.ERROR:
                self.logger.debug('%s %s prober hit error at byte %s',
                                  self.charset_name, self.language, i)
                self._state = ProbingState.NOT_ME
                break
            elif coding_state == MachineState.ITS_ME:
                self._state = ProbingState.FOUND_IT
                break
            elif coding_state == MachineState.START:
                char_len = self.coding_sm.get_current_charlen()
                if i == 0:
                    # The window straddles two chunks: pair the byte saved
                    # from the previous call with the first byte of this one.
                    self._last_char[1] = byte
                    self.distribution_analyzer.feed(self._last_char, char_len)
                else:
                    self.distribution_analyzer.feed(byte_str[i - 1:i + 1],
                                                    char_len)

        # Indexing an empty chunk with [-1] raises IndexError, so only
        # refresh the carried-over byte when the chunk has content.
        if len(byte_str):
            self._last_char[0] = byte_str[-1]

        if self.state == ProbingState.DETECTING:
            # Shortcut: stop early once the analyzer reports enough data
            # and the confidence exceeds SHORTCUT_THRESHOLD.
            if (self.distribution_analyzer.got_enough_data() and
                    (self.get_confidence() > self.SHORTCUT_THRESHOLD)):
                self._state = ProbingState.FOUND_IT

        return self.state

    def get_confidence(self):
        """Return the confidence reported by the distribution analyzer."""
        return self.distribution_analyzer.get_confidence()
92
93