Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
maurosoria
GitHub Repository: maurosoria/dirsearch
Path: blob/master/lib/connection/requester.py
896 views
1
# -*- coding: utf-8 -*-
2
# This program is free software; you can redistribute it and/or modify
3
# it under the terms of the GNU General Public License as published by
4
# the Free Software Foundation; either version 2 of the License, or
5
# (at your option) any later version.
6
#
7
# This program is distributed in the hope that it will be useful,
8
# but WITHOUT ANY WARRANTY; without even the implied warranty of
9
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10
# GNU General Public License for more details.
11
#
12
# You should have received a copy of the GNU General Public License
13
# along with this program; if not, write to the Free Software
14
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
15
# MA 02110-1301, USA.
16
#
17
# Author: Mauro Soria
18
19
from __future__ import annotations
20
21
import asyncio
22
import http.client
23
import random
24
import re
25
import socket
26
from ssl import SSLError
27
import threading
28
import time
29
from typing import Any, Generator
30
from urllib.parse import urlparse
31
32
import httpx
33
import requests
34
from requests.auth import AuthBase, HTTPBasicAuth, HTTPDigestAuth
35
from requests.packages import urllib3
36
from requests_ntlm import HttpNtlmAuth
37
from httpx_ntlm import HttpNtlmAuth as HttpxNtlmAuth
38
from requests_toolbelt.adapters.socket_options import SocketOptionsAdapter
39
40
from lib.connection.dns import cached_getaddrinfo
41
from lib.connection.response import AsyncResponse, Response
42
from lib.core.data import options
43
from lib.core.decorators import cached
44
from lib.core.exceptions import RequestException
45
from lib.core.logger import logger
46
from lib.core.settings import (
47
PROXY_SCHEMES,
48
RATE_UPDATE_DELAY,
49
READ_RESPONSE_ERROR_REGEX,
50
SCRIPT_PATH,
51
)
52
from lib.core.structures import CaseInsensitiveDict
53
from lib.utils.common import safequote
54
from lib.utils.file import FileUtils
55
from lib.utils.mimetype import guess_mimetype
56
57
# Disable InsecureRequestWarning from urllib3
58
urllib3.disable_warnings(urllib3.exceptions.SecurityWarning)
59
# Use custom `socket.getaddrinfo` for `requests` which supports DNS caching
60
socket.getaddrinfo = cached_getaddrinfo
61
62
63
class BaseRequester:
    """Shared configuration and rate-limiting state for the sync/async requesters."""

    def __init__(self) -> None:
        self._url: str = ""
        self._rate = 0
        self.proxy_cred = options["proxy_auth"]
        self.headers = CaseInsensitiveDict(options["headers"])
        self.agents: list[str] = []
        self.session = None

        # A client certificate is only usable when both halves are provided
        if options["cert_file"] and options["key_file"]:
            self._cert = (options["cert_file"], options["key_file"])
        else:
            self._cert = None

        # Bind outgoing sockets to a specific network interface when requested
        interface = options["network_interface"]
        self._socket_options = (
            [(socket.SOL_SOCKET, socket.SO_BINDTODEVICE, interface.encode("utf-8"))]
            if interface
            else []
        )

        if options["random_agents"]:
            self._fetch_agents()

        # Guess the mime type of request data if not specified
        if options["data"] and "content-type" not in self.headers:
            self.set_header("content-type", guess_mimetype(options["data"]))

    def _fetch_agents(self) -> None:
        """Load the user-agent pool used when random agents are enabled."""
        wordlist_path = FileUtils.build_path(SCRIPT_PATH, "db", "user-agents.txt")
        self.agents = FileUtils.get_lines(wordlist_path)

    def set_url(self, url: str) -> None:
        self._url = url

    def set_header(self, key: str, value: str) -> None:
        # Drop accidental leading whitespace (e.g. from "Header: value" parsing)
        self.headers[key] = value.lstrip()

    def is_rate_exceeded(self) -> bool:
        # Only meaningful when a positive max_rate was configured
        return self._rate >= options["max_rate"] > 0

    def decrease_rate(self) -> None:
        self._rate -= 1

    def increase_rate(self) -> None:
        # Each request counts toward the rate for exactly one second
        self._rate += 1
        threading.Timer(1, self.decrease_rate).start()

    @property
    @cached(RATE_UPDATE_DELAY)
    def rate(self) -> int:
        return self._rate
118
119
120
class HTTPBearerAuth(AuthBase):
    """`requests` auth hook that attaches a Bearer token to every request."""

    def __init__(self, token: str) -> None:
        self.token = token

    def __call__(self, request: requests.PreparedRequest) -> requests.PreparedRequest:
        header_value = f"Bearer {self.token}"
        request.headers["Authorization"] = header_value
        return request
127
128
129
class Requester(BaseRequester):
    """Synchronous HTTP requester backed by a shared `requests.Session`."""

    def __init__(self):
        super().__init__()

        self.session = requests.Session()
        self.session.verify = False
        self.session.cert = self._cert

        # Mount a custom adapter so per-socket options (e.g. interface
        # binding) are applied, and retries are handled by our own loop
        for scheme in ("http://", "https://"):
            self.session.mount(
                scheme,
                SocketOptionsAdapter(
                    max_retries=0,
                    pool_maxsize=options["thread_count"],
                    socket_options=self._socket_options,
                ),
            )

        if options["auth"]:
            self.set_auth(options["auth_type"], options["auth"])

    def set_auth(self, type: str, credential: str) -> None:
        """Configure session authentication.

        :param type: "bearer"/"jwt" (token), "basic", "digest"; any other
            value falls back to NTLM.
        :param credential: the raw token, or a "user:password" pair
            (a bare user with no colon gets an empty password).
        """
        if type in ("bearer", "jwt"):
            self.session.auth = HTTPBearerAuth(credential)
        else:
            try:
                user, password = credential.split(":", 1)
            except ValueError:
                user = credential
                password = ""

            if type == "basic":
                self.session.auth = HTTPBasicAuth(user, password)
            elif type == "digest":
                self.session.auth = HTTPDigestAuth(user, password)
            else:
                self.session.auth = HttpNtlmAuth(user, password)

    # :path: is expected not to start with "/"
    def request(self, path: str, proxy: str | None = None) -> Response:
        """Send one request to `self._url + path`, retrying on failure.

        :raises RequestException: when all retries are exhausted, with a
            human-readable message describing the last error.
        """
        # Pause if the request rate exceeded the maximum
        while self.is_rate_exceeded():
            time.sleep(0.1)

        self.increase_rate()

        err_msg = None
        url = self._url + safequote(path)

        # Why using a loop instead of max_retries argument? Check issue #1009
        for _ in range(options["max_retries"] + 1):
            try:
                proxies = {}
                try:
                    proxy_url = proxy or random.choice(options["proxies"])
                    if not proxy_url.startswith(PROXY_SCHEMES):
                        proxy_url = f"http://{proxy_url}"

                    if self.proxy_cred and "@" not in proxy_url:
                        # socks5://localhost:9050 => socks5://[credential]@localhost:9050
                        proxy_url = proxy_url.replace("://", f"://{self.proxy_cred}@", 1)

                    proxies["https"] = proxy_url
                    if not proxy_url.startswith("https://"):
                        proxies["http"] = proxy_url
                except IndexError:
                    # No proxies configured at all
                    pass

                if self.agents:
                    self.set_header("user-agent", random.choice(self.agents))

                # Use prepared request to avoid the URL path from being normalized
                # Reference: https://github.com/psf/requests/issues/5289
                request = requests.Request(
                    options["http_method"],
                    url,
                    headers=self.headers,
                    data=options["data"],
                )
                prep = self.session.prepare_request(request)
                prep.url = url

                origin_response = self.session.send(
                    prep,
                    allow_redirects=options["follow_redirects"],
                    timeout=options["timeout"],
                    proxies=proxies,
                    stream=True,
                )
                response = Response(url, origin_response)

                log_msg = f'"{options["http_method"]} {response.url}" {response.status} - {response.length}B'

                if response.redirect:
                    log_msg += f" - LOCATION: {response.redirect}"

                logger.info(log_msg)

                return response

            except Exception as e:
                logger.exception(e)

                # BUGFIX: the original compared the exception *instance*
                # against exception *classes* with `==` / `in`, which is
                # always False; use isinstance() so these branches can match.
                if isinstance(e, socket.gaierror):
                    err_msg = "Couldn't resolve DNS"
                elif "SSLError" in str(e):
                    err_msg = "Unexpected SSL error"
                elif "TooManyRedirects" in str(e):
                    err_msg = f"Too many redirects: {url}"
                elif "ProxyError" in str(e):
                    if proxy:
                        err_msg = f"Error with the proxy: {proxy}"
                    else:
                        err_msg = "Error with the system proxy"
                    # Prevent from reusing it in the future
                    if proxy in options["proxies"] and len(options["proxies"]) > 1:
                        options["proxies"].remove(proxy)
                elif "InvalidURL" in str(e):
                    err_msg = f"Invalid URL: {url}"
                elif "InvalidProxyURL" in str(e):
                    err_msg = f"Invalid proxy URL: {proxy}"
                elif "ConnectionError" in str(e):
                    err_msg = f"Cannot connect to: {urlparse(url).netloc}"
                elif re.search(READ_RESPONSE_ERROR_REGEX, str(e)):
                    err_msg = f"Failed to read response body: {url}"
                elif "Timeout" in str(e) or isinstance(
                    e, (http.client.IncompleteRead, socket.timeout)
                ):
                    err_msg = f"Request timeout: {url}"
                else:
                    err_msg = f"There was a problem in the request to: {url}"

        raise RequestException(err_msg)
263
264
265
class HTTPXBearerAuth(httpx.Auth):
    """`httpx` auth flow that attaches a Bearer token to every request."""

    def __init__(self, token: str) -> None:
        self.token = token

    def auth_flow(self, request: httpx.Request) -> Generator[httpx.Request, None, None]:
        header_value = f"Bearer {self.token}"
        request.headers["Authorization"] = header_value
        yield request
272
273
274
class ProxyRoatingTransport(httpx.AsyncBaseTransport):
    """Transport that routes each request through a randomly chosen proxy.

    NOTE(review): the class name looks like a typo of "Rotating", but it is
    kept unchanged for backward compatibility with existing callers.
    """

    def __init__(self, proxies: list[str], **kwargs: Any) -> None:
        # One underlying transport per proxy; picked at random per request
        self._transports = []
        for proxy in proxies:
            self._transports.append(httpx.AsyncHTTPTransport(proxy=proxy, **kwargs))

    async def handle_async_request(self, request: httpx.Request) -> httpx.Response:
        # Preserve the raw URL as the request target before delegating
        request.extensions["target"] = str(request.url).encode()
        picked = random.choice(self._transports)
        return await picked.handle_async_request(request)
284
285
286
class AsyncRequester(BaseRequester):
    """Asynchronous HTTP requester backed by an `httpx.AsyncClient`."""

    def __init__(self) -> None:
        super().__init__()

        tpargs = {
            "verify": False,
            "cert": self._cert,
            "limits": httpx.Limits(max_connections=options["thread_count"]),
            "socket_options": self._socket_options,
        }
        # With proxies configured, rotate between them per request;
        # otherwise use a plain transport
        transport = (
            ProxyRoatingTransport(
                [self.parse_proxy(p) for p in options["proxies"]], **tpargs
            )
            if options["proxies"]
            else httpx.AsyncHTTPTransport(**tpargs)
        )

        self.session = httpx.AsyncClient(
            mounts={"all://": transport},
            timeout=httpx.Timeout(options["timeout"]),
        )
        # Created lazily by replay_request() on first use
        self.replay_session = None

        if options["auth"]:
            self.set_auth(options["auth_type"], options["auth"])

    def parse_proxy(self, proxy: str) -> str | None:
        """Normalize a proxy string into a full proxy URL.

        Adds a default "http://" scheme and injects the proxy credential
        when one is configured. Returns None for an empty/falsy input.
        (Return annotation fixed from `str`: the original could return None.)
        """
        if not proxy:
            return None

        if not proxy.startswith(PROXY_SCHEMES):
            proxy = f"http://{proxy}"

        if self.proxy_cred and "@" not in proxy:
            # socks5://localhost:9050 => socks5://[credential]@localhost:9050
            proxy = proxy.replace("://", f"://{self.proxy_cred}@", 1)

        return proxy

    def set_auth(self, type: str, credential: str) -> None:
        """Configure client authentication.

        :param type: "bearer"/"jwt" (token), "basic", "digest"; any other
            value falls back to NTLM.
        :param credential: the raw token, or a "user:password" pair
            (a bare user with no colon gets an empty password).
        """
        if type in ("bearer", "jwt"):
            self.session.auth = HTTPXBearerAuth(credential)
        else:
            try:
                user, password = credential.split(":", 1)
            except ValueError:
                user = credential
                password = ""

            if type == "basic":
                self.session.auth = httpx.BasicAuth(user, password)
            elif type == "digest":
                self.session.auth = httpx.DigestAuth(user, password)
            else:
                self.session.auth = HttpxNtlmAuth(user, password)

    async def replay_request(self, path: str, proxy: str) -> AsyncResponse:
        """Re-send a request through a dedicated session using *proxy*.

        The replay session is created on first call and reused afterwards.
        """
        if self.replay_session is None:
            transport = httpx.AsyncHTTPTransport(
                verify=False,
                cert=self._cert,
                limits=httpx.Limits(max_connections=options["thread_count"]),
                proxy=self.parse_proxy(proxy),
                socket_options=self._socket_options,
            )
            self.replay_session = httpx.AsyncClient(
                mounts={"all://": transport},
                timeout=httpx.Timeout(options["timeout"]),
            )

        return await self.request(path, self.replay_session, replay=True)

    # :path: is expected not to start with "/"
    async def request(
        self, path: str, session: httpx.AsyncClient | None = None, replay: bool = False
    ) -> AsyncResponse:
        """Send one request to `self._url + path`, retrying on failure.

        :param session: client to use; defaults to the shared session.
        :param replay: when True, send the absolute URL as the raw target.
        :raises RequestException: when all retries are exhausted.
        """
        # Pause if the request rate exceeded the maximum
        while self.is_rate_exceeded():
            await asyncio.sleep(0.1)

        self.increase_rate()

        err_msg = None
        url = self._url + safequote(path)
        session = session or self.session

        # Retry loop mirrors the sync requester (see issue #1009)
        for _ in range(options["max_retries"] + 1):
            try:
                if self.agents:
                    self.set_header("user-agent", random.choice(self.agents))

                # Use "target" extension to avoid the URL path from being normalized
                request = session.build_request(
                    options["http_method"],
                    url,
                    headers=self.headers,
                    data=options["data"],
                    extensions={"target": (url if replay else f"/{safequote(path)}").encode()},
                )

                xresponse = await session.send(
                    request,
                    stream=True,
                    follow_redirects=options["follow_redirects"],
                )
                response = await AsyncResponse.create(url, xresponse)
                await xresponse.aclose()

                log_msg = f'"{options["http_method"]} {response.url}" {response.status} - {response.length}B'

                if response.redirect:
                    log_msg += f" - LOCATION: {response.redirect}"

                logger.info(log_msg)

                return response

            except Exception as e:
                logger.exception(e)

                if isinstance(e, httpx.ConnectError):
                    if str(e).startswith("[Errno -2]"):
                        err_msg = "Couldn't resolve DNS"
                    else:
                        err_msg = f"Cannot connect to: {urlparse(url).netloc}"
                elif isinstance(e, SSLError):
                    err_msg = "Unexpected SSL error"
                elif isinstance(e, httpx.TooManyRedirects):
                    err_msg = f"Too many redirects: {url}"
                elif isinstance(e, httpx.ProxyError):
                    err_msg = "Cannot establish the proxy connection"
                elif isinstance(e, httpx.InvalidURL):
                    err_msg = f"Invalid URL: {url}"
                elif isinstance(e, httpx.TimeoutException):
                    err_msg = f"Request timeout: {url}"
                elif isinstance(e, (httpx.ReadError, httpx.DecodingError)):
                    err_msg = f"Failed to read response body: {url}"
                else:
                    err_msg = f"There was a problem in the request to: {url}"

        raise RequestException(err_msg)

    def increase_rate(self) -> None:
        # Event-loop timer instead of a thread timer (async context)
        self._rate += 1
        asyncio.get_running_loop().call_later(1, self.decrease_rate)
430
431