CoCalc -- dictionary.py

GitHub Repository: maurosoria/dirsearch
Path: blob/master/lib/core/dictionary.py
896 views
1
# -*- coding: utf-8 -*-
2
#  This program is free software; you can redistribute it and/or modify
3
#  it under the terms of the GNU General Public License as published by
4
#  the Free Software Foundation; either version 2 of the License, or
5
#  (at your option) any later version.
6
#
7
#  This program is distributed in the hope that it will be useful,
8
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
9
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10
#  GNU General Public License for more details.
11
#
12
#  You should have received a copy of the GNU General Public License
13
#  along with this program; if not, write to the Free Software
14
#  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
15
#  MA 02110-1301, USA.
16
#
17
#  Author: Mauro Soria
18

19
from __future__ import annotations
20

21
import re
22
from typing import Any, Iterator
23

24
from lib.core.data import options
25
from lib.core.decorators import locked
26
from lib.core.settings import (
27
    SCRIPT_PATH,
28
    EXTENSION_TAG,
29
    EXCLUDE_OVERWRITE_EXTENSIONS,
30
    EXTENSION_RECOGNITION_REGEX,
31
)
32
from lib.core.structures import OrderedSet
33
from lib.parse.url import clean_path
34
from lib.utils.common import lstrip_once
35
from lib.utils.file import FileUtils
36

37

38
# Get ignore paths for status codes.
39
# Reference: https://github.com/maurosoria/dirsearch#Blacklist
40
def get_blacklists() -> dict[int, Dictionary]:
    """
    Load the per-status blacklist wordlists.

    For each of the status codes 400, 403 and 500, the file
    "<status>_blacklist.txt" inside the "db" directory under SCRIPT_PATH is
    turned into a Dictionary built with is_blacklist=True.

    Returns:
        A mapping of status code to its blacklist Dictionary. Status codes
        whose blacklist file cannot be read have no entry.
    """
    loaded = {}

    for code in [400, 403, 500]:
        blacklist_path = FileUtils.build_path(
            FileUtils.build_path(SCRIPT_PATH, "db"),
            f"{code}_blacklist.txt",
        )

        # Only readable blacklist files produce an entry
        if FileUtils.can_read(blacklist_path):
            loaded[code] = Dictionary(
                files=[blacklist_path],
                is_blacklist=True,
            )

    return loaded
59

60

61
class Dictionary:
    """
    Wordlist generated from one or more files, consumed through next().

    Two sequences are kept:
      - self._items: the paths produced by generate() at construction time.
      - self._extra: paths queued later with add_extra(); they are returned
        by next() before the remaining generated paths and are dropped by
        reset().

    len(), iteration (iter()) and the "in" operator only look at the
    generated paths, not at the extra ones.
    """

    def __init__(self, **kwargs: Any) -> None:
        """
        Build the wordlist.

        All keyword arguments are forwarded unchanged to generate()
        (i.e. "files" and "is_blacklist").
        """
        self._index = 0
        self._items = self.generate(**kwargs)
        # Items in self._extra will be cleared when self.reset() is called
        self._extra_index = 0
        self._extra = []

    @property
    def index(self) -> int:
        """Number of generated paths already returned by next()."""
        return self._index

    # NOTE(review): "locked" comes from lib.core.decorators; presumably it
    # serializes concurrent calls -- confirm against its definition.
    @locked
    def __next__(self) -> str:
        """
        Return the next path to process.

        Pending extra paths are served first, then the generated paths.

        Raises:
            StopIteration: when both sequences are exhausted.
        """
        if len(self._extra) > self._extra_index:
            self._extra_index += 1
            return self._extra[self._extra_index - 1]
        elif len(self._items) > self._index:
            self._index += 1
            return self._items[self._index - 1]
        else:
            raise StopIteration

    def __contains__(self, item: str) -> bool:
        """Tell whether item is one of the generated paths (extras are ignored)."""
        return item in self._items

    def __getstate__(self) -> tuple[list[str], int, list[str], int]:
        """Return (items, index, extra, extra_index) as the serializable state."""
        return self._items, self._index, self._extra, self._extra_index

    def __setstate__(self, state: tuple[list[str], int, list[str], int]) -> None:
        """Restore the state produced by __getstate__()."""
        self._items, self._index, self._extra, self._extra_index = state

    def __iter__(self) -> Iterator[str]:
        """Iterate over the generated paths without touching the next() cursor."""
        return iter(self._items)

    def __len__(self) -> int:
        """Number of generated paths (extras are not counted)."""
        return len(self._items)

    def generate(
        self, files: list[str] | None = None, is_blacklist: bool = False
    ) -> list[str]:
        """
        Dictionary.generate() behaviour

        Classic dirsearch wordlist:
          1. If %EXT% keyword is present, append one with each extension REPLACED.
          2. If the special word is not present, append line unmodified.

        Forced extensions wordlist (NEW):
          This type of wordlist processing is a mix between classic processing
          and DirBuster processing.
              1. If %EXT% keyword is present in the line, immediately process as "classic dirsearch" (1).
              2. If the line does not include the special word AND is NOT terminated by a slash,
                append one with each extension APPENDED (line.ext) and ONLY ONE with a slash.
              3. If the line does not include the special word and IS ALREADY terminated by slash,
                append line unmodified.

        Args:
            files: wordlist files to read, in order. None (the default) means
                no files.
            is_blacklist: when True, forced/overwritten extensions as well as
                prefixes and suffixes are not applied.

        Returns:
            The generated paths, de-duplicated in insertion order before the
            optional case transformation (lowercase / uppercase /
            capitalization options) is applied.
        """

        # A None default avoids sharing one mutable list between calls
        if files is None:
            files = []

        wordlist = OrderedSet()
        re_ext_tag = re.compile(EXTENSION_TAG, re.IGNORECASE)

        for dict_file in files:
            for line in FileUtils.get_lines(dict_file):
                # Removing leading "/" to work with prefixes later
                line = lstrip_once(line, "/")

                if not self.is_valid(line):
                    continue

                # Classic dirsearch wordlist processing (with %EXT% keyword)
                if EXTENSION_TAG in line.lower():
                    for extension in options["extensions"]:
                        newline = re_ext_tag.sub(extension, line)
                        wordlist.add(newline)
                else:
                    wordlist.add(line)

                    # "Forcing extensions" and "overwriting extensions" shouldn't apply to
                    # blacklists otherwise it might cause false negatives
                    if is_blacklist:
                        continue

                    # If "forced extensions" is used and the path is neither a directory
                    # (terminated by /) nor already has an extension, append extensions
                    # to the path
                    if (
                        options["force_extensions"]
                        and "." not in line
                        and not line.endswith("/")
                    ):
                        wordlist.add(line + "/")

                        for extension in options["extensions"]:
                            wordlist.add(f"{line}.{extension}")
                    # Overwrite unknown extensions with selected ones (but also keep the origin)
                    elif (
                        options["overwrite_extensions"]
                        and not line.endswith(
                            options["extensions"] + EXCLUDE_OVERWRITE_EXTENSIONS
                        )
                        # Paths that have queries in wordlist are usually used for exploiting
                        # disclosed vulnerabilities of services, skip such paths
                        and "?" not in line
                        and "#" not in line
                        and re.search(EXTENSION_RECOGNITION_REGEX, line)
                    ):
                        # Everything after the first "." is dropped
                        base = line.split(".")[0]

                        for extension in options["extensions"]:
                            wordlist.add(f"{base}.{extension}")

        if not is_blacklist:
            # Appending prefixes and suffixes
            altered_wordlist = OrderedSet()

            for path in wordlist:
                for pref in options["prefixes"]:
                    if not path.startswith(("/", pref)):
                        altered_wordlist.add(pref + path)
                for suff in options["suffixes"]:
                    if (
                        not path.endswith(("/", suff))
                        # Appending suffixes to the URL fragment is useless
                        and "?" not in path
                        and "#" not in path
                    ):
                        altered_wordlist.add(path + suff)

            # The altered list REPLACES the plain one, but only if any
            # prefixed/suffixed path was actually produced
            if altered_wordlist:
                wordlist = altered_wordlist

        if options["lowercase"]:
            return list(map(str.lower, wordlist))
        elif options["uppercase"]:
            return list(map(str.upper, wordlist))
        elif options["capitalization"]:
            return list(map(str.capitalize, wordlist))
        else:
            return list(wordlist)

    def is_valid(self, path: str) -> bool:
        """
        Tell whether a wordlist line should be kept.

        Returns False for empty lines, lines starting with "#" (comments)
        and paths whose cleaned form ends with one of the excluded
        extensions; True otherwise.
        """
        # Skip comments and empty lines
        if not path or path.startswith("#"):
            return False

        # Skip if the path has excluded extensions
        # NOTE(review): clean_path is defined in lib.parse.url; presumably it
        # strips the query string / fragment -- confirm.
        cleaned_path = clean_path(path)
        if cleaned_path.endswith(
            tuple(f".{extension}" for extension in options["exclude_extensions"])
        ):
            return False

        return True

    def add_extra(self, path: str) -> None:
        """Queue an extra path unless it is already a generated or queued path."""
        if path in self._items or path in self._extra:
            return

        self._extra.append(path)

    def reset(self) -> None:
        """Rewind both cursors to the start and drop all extra paths."""
        self._index = self._extra_index = 0
        self._extra.clear()
221

222
Product

Resources

Company