GitHub Repository: sqlmapproject/sqlmap
Path: blob/master/lib/utils/crawler.py
#!/usr/bin/env python

"""
Copyright (c) 2006-2025 sqlmap developers (https://sqlmap.org)
See the file 'LICENSE' for copying permission
"""

from __future__ import division

import os
import re
import tempfile
import time

from lib.core.common import checkSameHost
from lib.core.common import clearConsoleLine
from lib.core.common import dataToStdout
from lib.core.common import extractRegexResult
from lib.core.common import findPageForms
from lib.core.common import getSafeExString
from lib.core.common import openFile
from lib.core.common import readInput
from lib.core.common import safeCSValue
from lib.core.common import urldecode
from lib.core.compat import xrange
from lib.core.convert import htmlUnescape
from lib.core.data import conf
from lib.core.data import kb
from lib.core.data import logger
from lib.core.datatype import OrderedSet
from lib.core.enums import MKSTEMP_PREFIX
from lib.core.exception import SqlmapConnectionException
from lib.core.exception import SqlmapSyntaxException
from lib.core.settings import CRAWL_EXCLUDE_EXTENSIONS
from lib.core.threads import getCurrentThreadData
from lib.core.threads import runThreads
from lib.parse.sitemap import parseSitemap
from lib.request.connect import Connect as Request
from thirdparty import six
from thirdparty.beautifulsoup.beautifulsoup import BeautifulSoup
from thirdparty.six.moves import http_client as _http_client
from thirdparty.six.moves import urllib as _urllib

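# crawl() implements sqlmap's site crawler: starting from the given target URL it
# follows links up to conf.crawlDepth levels (normally set via the --crawl option),
# keeps links that carry GET parameters (and, with conf.forms, pages containing
# forms) and registers them as scan targets in kb.targets.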
def crawl(target, post=None, cookie=None):
    if not target:
        return

    try:
        visited = set()
        threadData = getCurrentThreadData()
        threadData.shared.value = OrderedSet()
        threadData.shared.formsFound = False

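        # Worker body run by runThreads(): each iteration pops one not-yet-visited URL
        # from the shared queue (guarded by kb.locks.limit), fetches it and scrapes the
        # response for further links.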
        def crawlThread():
            threadData = getCurrentThreadData()

            while kb.threadContinue:
                with kb.locks.limit:
                    if threadData.shared.unprocessed:
                        current = threadData.shared.unprocessed.pop()
                        if current in visited:
                            continue
                        elif conf.crawlExclude and re.search(conf.crawlExclude, current):
                            dbgMsg = "skipping '%s'" % current
                            logger.debug(dbgMsg)
                            continue
                        else:
                            visited.add(current)
                    else:
                        break

                content = None
                try:
                    if current:
                        content = Request.getPage(url=current, post=post, cookie=None, crawling=True, raise404=False)[0]
                except SqlmapConnectionException as ex:
                    errMsg = "connection exception detected ('%s'). skipping " % getSafeExString(ex)
                    errMsg += "URL '%s'" % current
                    logger.critical(errMsg)
                except SqlmapSyntaxException:
                    errMsg = "invalid URL detected. skipping '%s'" % current
                    logger.critical(errMsg)
                except _http_client.InvalidURL as ex:
                    errMsg = "invalid URL detected ('%s'). skipping " % getSafeExString(ex)
                    errMsg += "URL '%s'" % current
                    logger.critical(errMsg)

                if not kb.threadContinue:
                    break

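                # Only textual responses are parsed: <a> tags are collected with
                # BeautifulSoup, raw href/src attributes and window.open() calls with
                # regular expressions. Each link is resolved against the current URL and
                # kept only if it matches conf.scope (or stays on the same host) and its
                # extension is not in CRAWL_EXCLUDE_EXTENSIONS. Every kept URL feeds the
                # next crawling depth (threadData.shared.deeper); those with a query
                # string also become candidate targets (threadData.shared.value).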
                if isinstance(content, six.text_type):
                    try:
                        match = re.search(r"(?si)<html[^>]*>(.+)</html>", content)
                        if match:
                            content = "<html>%s</html>" % match.group(1)

                        soup = BeautifulSoup(content)
                        tags = soup('a')

                        tags += re.finditer(r'(?i)\s(href|src)=["\'](?P<href>[^>"\']+)', content)
                        tags += re.finditer(r'(?i)window\.open\(["\'](?P<href>[^)"\']+)["\']', content)

                        for tag in tags:
                            href = tag.get("href") if hasattr(tag, "get") else tag.group("href")

                            if href:
                                if threadData.lastRedirectURL and threadData.lastRedirectURL[0] == threadData.lastRequestUID:
                                    current = threadData.lastRedirectURL[1]
                                url = _urllib.parse.urljoin(current, htmlUnescape(href))

                                # flag to know if we are dealing with the same target host
                                _ = checkSameHost(url, target)

                                if conf.scope:
                                    if not re.search(conf.scope, url, re.I):
                                        continue
                                elif not _:
                                    continue

                                if (extractRegexResult(r"\A[^?]+\.(?P<result>\w+)(\?|\Z)", url) or "").lower() not in CRAWL_EXCLUDE_EXTENSIONS:
                                    with kb.locks.value:
                                        threadData.shared.deeper.add(url)
                                        if re.search(r"(.*?)\?(.+)", url) and not re.search(r"\?(v=)?\d+\Z", url) and not re.search(r"(?i)\.(js|css)(\?|\Z)", url):
                                            threadData.shared.value.add(url)
                    except UnicodeEncodeError:  # for non-HTML files
                        pass
                    except ValueError:  # for non-valid links
                        pass
                    except AssertionError:  # for invalid HTML
                        pass
                    finally:
                        if conf.forms:
                            threadData.shared.formsFound |= len(findPageForms(content, current, False, True)) > 0

                if conf.verbose in (1, 2):
                    threadData.shared.count += 1
                    status = '%d/%d links visited (%d%%)' % (threadData.shared.count, threadData.shared.length, round(100.0 * threadData.shared.count / threadData.shared.length))
                    dataToStdout("\r[%s] [INFO] %s" % (time.strftime("%X"), status), True)

        threadData.shared.deeper = set()
        threadData.shared.unprocessed = set([target])

        _ = re.sub(r"(?<!/)/(?!/).*", "", target)
        if _:
            if target.strip('/') != _.strip('/'):
                threadData.shared.unprocessed.add(_)

        if re.search(r"\?.*\b\w+=", target):
            threadData.shared.value.add(target)

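        # Optionally parse the site's /sitemap.xml to seed the crawler: parametrized
        # entries become candidate targets immediately, and with a crawling depth
        # greater than 1 all entries are queued for further crawling.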
        if kb.checkSitemap is None:
            message = "do you want to check for the existence of "
            message += "site's sitemap(.xml) [y/N] "
            kb.checkSitemap = readInput(message, default='N', boolean=True)

        if kb.checkSitemap:
            found = True
            items = None
            url = _urllib.parse.urljoin(target, "/sitemap.xml")
            try:
                items = parseSitemap(url)
            except SqlmapConnectionException as ex:
                if "page not found" in getSafeExString(ex):
                    found = False
                    logger.warning("'sitemap.xml' not found")
            except:
                pass
            finally:
                if found:
                    if items:
                        for item in items:
                            if re.search(r"(.*?)\?(.+)", item):
                                threadData.shared.value.add(item)
                        if conf.crawlDepth > 1:
                            threadData.shared.unprocessed.update(items)
                    logger.info("%s links found" % ("no" if not items else len(items)))

        if not conf.bulkFile:
            infoMsg = "starting crawler for target URL '%s'" % target
            logger.info(infoMsg)

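        # Breadth-first crawling loop: each depth level processes the current
        # "unprocessed" set with up to conf.threads workers, then promotes the newly
        # discovered links (threadData.shared.deeper) to the next level.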
        for i in xrange(conf.crawlDepth):
            threadData.shared.count = 0
            threadData.shared.length = len(threadData.shared.unprocessed)
            numThreads = min(conf.threads, len(threadData.shared.unprocessed))

            if not conf.bulkFile:
                logger.info("searching for links with depth %d" % (i + 1))

            runThreads(numThreads, crawlThread, threadChoice=(i > 0))
            clearConsoleLine(True)

            if threadData.shared.deeper:
                threadData.shared.unprocessed = set(threadData.shared.deeper)
            else:
                break

    except KeyboardInterrupt:
        warnMsg = "user aborted during crawling. sqlmap "
        warnMsg += "will use partial list"
        logger.warning(warnMsg)

    finally:
        clearConsoleLine(True)

        if not threadData.shared.value:
            if not (conf.forms and threadData.shared.formsFound):
                warnMsg = "no usable links found (with GET parameters)"
                if conf.forms:
                    warnMsg += " or forms"
                logger.warning(warnMsg)
        else:
            for url in threadData.shared.value:
                kb.targets.add((urldecode(url, kb.pageEncoding), None, None, None, None))

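        # Optional normalization of crawling results: only one target is kept per
        # path and parameter-name combination, so URLs differing only in parameter
        # values are deduplicated (e.g. hypothetical '/item.php?id=1' and
        # '/item.php?id=2' would collapse into a single entry).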
        if kb.targets:
            if kb.normalizeCrawlingChoice is None:
                message = "do you want to normalize "
                message += "crawling results [Y/n] "

                kb.normalizeCrawlingChoice = readInput(message, default='Y', boolean=True)

            if kb.normalizeCrawlingChoice:
                seen = set()
                results = OrderedSet()

                for target in kb.targets:
                    value = "%s%s%s" % (target[0], '&' if '?' in target[0] else '?', target[2] or "")
                    match = re.search(r"/[^/?]*\?.+\Z", value)
                    if match:
                        key = re.sub(r"=[^=&]*", "=", match.group(0)).strip("&?")
                        if '=' in key and key not in seen:
                            results.add(target)
                            seen.add(key)

                kb.targets = results

        storeResultsToFile(kb.targets)

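# Optionally writes the crawling results to a temporary file for further processing
# with other tools: with conf.forms a CSV file with a "URL,POST" header and one
# "url,post-data" row per target (hypothetical row: "http://www.example.com/search.php,q=test"),
# otherwise a plain text file with one URL per line.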
def storeResultsToFile(results):
    if not results:
        return

    if kb.storeCrawlingChoice is None:
        message = "do you want to store crawling results to a temporary file "
        message += "for eventual further processing with other tools [y/N] "

        kb.storeCrawlingChoice = readInput(message, default='N', boolean=True)

    if kb.storeCrawlingChoice:
        handle, filename = tempfile.mkstemp(prefix=MKSTEMP_PREFIX.CRAWLER, suffix=".csv" if conf.forms else ".txt")
        os.close(handle)

        infoMsg = "writing crawling results to a temporary file '%s' " % filename
        logger.info(infoMsg)

        with openFile(filename, "w+b") as f:
            if conf.forms:
                f.write("URL,POST\n")

            for url, _, data, _, _ in results:
                if conf.forms:
                    f.write("%s,%s\n" % (safeCSValue(url), safeCSValue(data or "")))
                else:
                    f.write("%s\n" % url)