Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
maurosoria
GitHub Repository: maurosoria/dirsearch
Path: blob/master/lib/utils/diff.py
896 views
1
# -*- coding: utf-8 -*-
2
# This program is free software; you can redistribute it and/or modify
3
# it under the terms of the GNU General Public License as published by
4
# the Free Software Foundation; either version 2 of the License, or
5
# (at your option) any later version.
6
#
7
# This program is distributed in the hope that it will be useful,
8
# but WITHOUT ANY WARRANTY; without even the implied warranty of
9
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10
# GNU General Public License for more details.
11
#
12
# You should have received a copy of the GNU General Public License
13
# along with this program; if not, write to the Free Software
14
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
15
# MA 02110-1301, USA.
16
#
17
# Author: Mauro Soria
18
19
import difflib
20
import re
21
22
from lib.utils.common import lstrip_once
23
24
25
class DynamicContentParser:
    """Decide whether a response resembles a pair of reference responses.

    The parser is built from two reference contents. If they are identical
    the reference is treated as static and later comparisons are plain
    equality checks. Otherwise the whitespace-separated tokens that
    ``difflib.Differ`` reports as common to both references are kept as
    "static patterns" and later contents are checked against them.
    """

    # With fewer static patterns than this, no miss is tolerated and the
    # similarity-ratio fallback may be used.
    _MIN_RELIABLE_PATTERNS = 20
    # Minimum SequenceMatcher ratio for the fallback to report a match.
    _SIMILARITY_THRESHOLD = 0.75

    def __init__(self, content1, content2):
        """Store the base content and, if the two differ, their static patterns.

        Args:
            content1: First reference content; kept as the base content.
            content2: Second reference content, diffed against the first.
        """
        self._static_patterns = None
        self._differ = difflib.Differ()
        self._is_static = content1 == content2
        self._base_content = content1

        if not self._is_static:
            # Diff token by token (split on whitespace), not line by line.
            self._static_patterns = self.get_static_patterns(
                self._differ.compare(content1.split(), content2.split())
            )

    def compare_to(self, content):
        """
        DynamicContentParser.compare_to() workflow

          1. Check if the wildcard response is static or not, if yes, compare two responses.
          2. If it's not static, get static patterns (split by space) and check if the response
            has all of them.
          3. In some cases, checking static patterns isn't reliable enough, so we check the similarity
            ratio of the two responses.

        Returns True when ``content`` is considered to match the reference.
        """

        if self._is_static:
            return content == self._base_content

        few_patterns = len(self._static_patterns) < self._MIN_RELIABLE_PATTERNS

        i = -1
        splitted_content = content.split()
        # Allow one miss, see https://github.com/maurosoria/dirsearch/issues/1279
        misses = 0
        for pattern in self._static_patterns:
            try:
                # Patterns must appear in order: search only past the last hit.
                i = splitted_content.index(pattern, i + 1)
            except ValueError:
                if misses or few_patterns:
                    return False

                misses += 1

        # Static patterns don't seem to be a reliable enough method here:
        # the content is longer than the base and there are few patterns.
        if few_patterns and len(splitted_content) > len(self._base_content.split()):
            ratio = difflib.SequenceMatcher(None, self._base_content, content).ratio()
            return ratio > self._SIMILARITY_THRESHOLD

        return True

    @staticmethod
    def get_static_patterns(patterns):
        """Return the tokens that a ``difflib.Differ`` delta marks as common.

        difflib.Differ.compare returns something like below:
            ["  str1", "- str2", "+ str3", "  str4"]

        Every delta line starts with a two-character code; ``"  "`` marks a
        line common to both inputs. Only those lines are kept, and the whole
        two-character code is removed so the result can be compared against
        tokens produced by ``str.split()``.
        """
        return [pattern[2:] for pattern in patterns if pattern.startswith("  ")]
77
78
79
def generate_matching_regex(string1: str, string2: str) -> str:
    """Build an anchored regex that matches both ``string1`` and ``string2``.

    The pattern is the escaped common prefix, then ``.*`` standing in for
    the part that differs, then the escaped common suffix. Identical inputs
    give an exact-match pattern with no wildcard.

    Args:
        string1: First sample string.
        string2: Second sample string.

    Returns:
        A pattern of the form ``^<prefix>.*<suffix>$``, or ``^<escaped>$``
        when the two strings are equal.
    """
    if string1 == string2:
        return "^" + re.escape(string1) + "$"

    shortest = min(len(string1), len(string2))

    prefix_len = 0
    while prefix_len < shortest and string1[prefix_len] == string2[prefix_len]:
        prefix_len += 1

    # The suffix may only use characters of the shorter string that the
    # prefix has not already consumed; otherwise the regex would demand more
    # characters than the shorter string has (e.g. "aba" vs "abba").
    max_suffix_len = shortest - prefix_len
    suffix_len = 0
    while (
        suffix_len < max_suffix_len
        and string1[-1 - suffix_len] == string2[-1 - suffix_len]
    ):
        suffix_len += 1

    prefix = string1[:prefix_len]
    suffix = string1[len(string1) - suffix_len:]

    # The strings differ, so a wildcard is always needed, including when one
    # string is a proper prefix of the other.
    return "^" + re.escape(prefix) + ".*" + re.escape(suffix) + "$"
98
99