GitHub Repository: sqlmapproject/sqlmap
Path: blob/master/lib/utils/crawler.py
#!/usr/bin/env python

"""
Copyright (c) 2006-2025 sqlmap developers (https://sqlmap.org)
See the file 'LICENSE' for copying permission
"""

from __future__ import division

import os
import re
import tempfile
import time

from lib.core.common import checkSameHost
from lib.core.common import clearConsoleLine
from lib.core.common import dataToStdout
from lib.core.common import extractRegexResult
from lib.core.common import findPageForms
from lib.core.common import getSafeExString
from lib.core.common import openFile
from lib.core.common import readInput
from lib.core.common import safeCSValue
from lib.core.common import urldecode
from lib.core.compat import xrange
from lib.core.convert import htmlUnescape
from lib.core.data import conf
from lib.core.data import kb
from lib.core.data import logger
from lib.core.datatype import OrderedSet
from lib.core.enums import MKSTEMP_PREFIX
from lib.core.exception import SqlmapConnectionException
from lib.core.exception import SqlmapSyntaxException
from lib.core.settings import CRAWL_EXCLUDE_EXTENSIONS
from lib.core.threads import getCurrentThreadData
from lib.core.threads import runThreads
from lib.parse.sitemap import parseSitemap
from lib.request.connect import Connect as Request
from thirdparty import six
from thirdparty.beautifulsoup.beautifulsoup import BeautifulSoup
from thirdparty.six.moves import http_client as _http_client
from thirdparty.six.moves import urllib as _urllib

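# crawl() implements sqlmap's site crawler: starting from the given target URL it
# follows links up to conf.crawlDepth levels (normally set via the --crawl option),
# keeps links that carry GET parameters (and, with conf.forms, pages containing
# forms) and registers them as scan targets in kb.targets.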
def crawl(target, post=None, cookie=None):
    if not target:
        return

    try:
        visited = set()
        threadData = getCurrentThreadData()
        threadData.shared.value = OrderedSet()
        threadData.shared.formsFound = False

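        # Worker body run by runThreads(): each iteration pops one not-yet-visited URL
        # from the shared queue (guarded by kb.locks.limit), fetches it and scrapes the
        # response for further links.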
        def crawlThread():
            threadData = getCurrentThreadData()

            while kb.threadContinue:
                with kb.locks.limit:
                    if threadData.shared.unprocessed:
                        current = threadData.shared.unprocessed.pop()
                        if current in visited:
                            continue
                        elif conf.crawlExclude and re.search(conf.crawlExclude, current):
                            dbgMsg = "skipping '%s'" % current
                            logger.debug(dbgMsg)
                            continue
                        else:
                            visited.add(current)
                    else:
                        break

                content = None
                try:
                    if current:
                        content = Request.getPage(url=current, post=post, cookie=None, crawling=True, raise404=False)[0]
                except SqlmapConnectionException as ex:
                    errMsg = "connection exception detected ('%s'). skipping " % getSafeExString(ex)
                    errMsg += "URL '%s'" % current
                    logger.critical(errMsg)
                except SqlmapSyntaxException:
                    errMsg = "invalid URL detected. skipping '%s'" % current
                    logger.critical(errMsg)
                except _http_client.InvalidURL as ex:
                    errMsg = "invalid URL detected ('%s'). skipping " % getSafeExString(ex)
                    errMsg += "URL '%s'" % current
                    logger.critical(errMsg)

                if not kb.threadContinue:
                    break

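                # Only textual responses are parsed: <a> tags are collected with
                # BeautifulSoup, raw href/src attributes and window.open() calls with
                # regular expressions. Each link is resolved against the current URL and
                # kept only if it matches conf.scope (or stays on the same host) and its
                # extension is not in CRAWL_EXCLUDE_EXTENSIONS. Every kept URL feeds the
                # next crawling depth (threadData.shared.deeper); those with a query
                # string also become candidate targets (threadData.shared.value).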
                if isinstance(content, six.text_type):
                    try:
                        match = re.search(r"(?si)<html[^>]*>(.+)</html>", content)
                        if match:
                            content = "<html>%s</html>" % match.group(1)

                        soup = BeautifulSoup(content)
                        tags = soup('a')

                        tags += re.finditer(r'(?i)\s(href|src)=["\'](?P<href>[^>"\']+)', content)
                        tags += re.finditer(r'(?i)window\.open\(["\'](?P<href>[^)"\']+)["\']', content)

                        for tag in tags:
                            href = tag.get("href") if hasattr(tag, "get") else tag.group("href")

                            if href:
                                if threadData.lastRedirectURL and threadData.lastRedirectURL[0] == threadData.lastRequestUID:
                                    current = threadData.lastRedirectURL[1]
                                url = _urllib.parse.urljoin(current, htmlUnescape(href))

                                # flag to know if we are dealing with the same target host
                                _ = checkSameHost(url, target)

                                if conf.scope:
                                    if not re.search(conf.scope, url, re.I):
                                        continue
                                elif not _:
                                    continue

                                if (extractRegexResult(r"\A[^?]+\.(?P<result>\w+)(\?|\Z)", url) or "").lower() not in CRAWL_EXCLUDE_EXTENSIONS:
                                    with kb.locks.value:
                                        threadData.shared.deeper.add(url)
                                        if re.search(r"(.*?)\?(.+)", url) and not re.search(r"\?(v=)?\d+\Z", url) and not re.search(r"(?i)\.(js|css)(\?|\Z)", url):
                                            threadData.shared.value.add(url)
                    except UnicodeEncodeError:  # for non-HTML files
                        pass
                    except ValueError:  # for non-valid links
                        pass
                    except AssertionError:  # for invalid HTML
                        pass
                    finally:
                        if conf.forms:
                            threadData.shared.formsFound |= len(findPageForms(content, current, False, True)) > 0

                if conf.verbose in (1, 2):
                    threadData.shared.count += 1
                    status = '%d/%d links visited (%d%%)' % (threadData.shared.count, threadData.shared.length, round(100.0 * threadData.shared.count / threadData.shared.length))
                    dataToStdout("\r[%s] [INFO] %s" % (time.strftime("%X"), status), True)

        threadData.shared.deeper = set()
        threadData.shared.unprocessed = set([target])

        _ = re.sub(r"(?<!/)/(?!/).*", "", target)
        if _:
            if target.strip('/') != _.strip('/'):
                threadData.shared.unprocessed.add(_)

        if re.search(r"\?.*\b\w+=", target):
            threadData.shared.value.add(target)

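        # Optionally parse the site's /sitemap.xml to seed the crawler: parametrized
        # entries become candidate targets immediately, and with a crawling depth
        # greater than 1 all entries are queued for further crawling.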
        if kb.checkSitemap is None:
            message = "do you want to check for the existence of "
            message += "site's sitemap(.xml) [y/N] "
            kb.checkSitemap = readInput(message, default='N', boolean=True)

        if kb.checkSitemap:
            found = True
            items = None
            url = _urllib.parse.urljoin(target, "/sitemap.xml")
            try:
                items = parseSitemap(url)
            except SqlmapConnectionException as ex:
                if "page not found" in getSafeExString(ex):
                    found = False
                    logger.warning("'sitemap.xml' not found")
            except:
                pass
            finally:
                if found:
                    if items:
                        for item in items:
                            if re.search(r"(.*?)\?(.+)", item):
                                threadData.shared.value.add(item)
                        if conf.crawlDepth > 1:
                            threadData.shared.unprocessed.update(items)
                    logger.info("%s links found" % ("no" if not items else len(items)))

        if not conf.bulkFile:
            infoMsg = "starting crawler for target URL '%s'" % target
            logger.info(infoMsg)

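        # Breadth-first crawling loop: each depth level processes the current
        # "unprocessed" set with up to conf.threads workers, then promotes the newly
        # discovered links (threadData.shared.deeper) to the next level.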
        for i in xrange(conf.crawlDepth):
            threadData.shared.count = 0
            threadData.shared.length = len(threadData.shared.unprocessed)
            numThreads = min(conf.threads, len(threadData.shared.unprocessed))

            if not conf.bulkFile:
                logger.info("searching for links with depth %d" % (i + 1))

            runThreads(numThreads, crawlThread, threadChoice=(i > 0))
            clearConsoleLine(True)

            if threadData.shared.deeper:
                threadData.shared.unprocessed = set(threadData.shared.deeper)
            else:
                break

    except KeyboardInterrupt:
        warnMsg = "user aborted during crawling. sqlmap "
        warnMsg += "will use partial list"
        logger.warning(warnMsg)

    finally:
        clearConsoleLine(True)

        if not threadData.shared.value:
            if not (conf.forms and threadData.shared.formsFound):
                warnMsg = "no usable links found (with GET parameters)"
                if conf.forms:
                    warnMsg += " or forms"
                logger.warning(warnMsg)
        else:
            for url in threadData.shared.value:
                kb.targets.add((urldecode(url, kb.pageEncoding), None, None, None, None))

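        # Optional normalization of crawling results: only one target is kept per
        # path and parameter-name combination, so URLs differing only in parameter
        # values are deduplicated (e.g. hypothetical '/item.php?id=1' and
        # '/item.php?id=2' would collapse into a single entry).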
        if kb.targets:
            if kb.normalizeCrawlingChoice is None:
                message = "do you want to normalize "
                message += "crawling results [Y/n] "

                kb.normalizeCrawlingChoice = readInput(message, default='Y', boolean=True)

            if kb.normalizeCrawlingChoice:
                seen = set()
                results = OrderedSet()

                for target in kb.targets:
                    value = "%s%s%s" % (target[0], '&' if '?' in target[0] else '?', target[2] or "")
                    match = re.search(r"/[^/?]*\?.+\Z", value)
                    if match:
                        key = re.sub(r"=[^=&]*", "=", match.group(0)).strip("&?")
                        if '=' in key and key not in seen:
                            results.add(target)
                            seen.add(key)

                kb.targets = results

        storeResultsToFile(kb.targets)

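# Optionally writes the crawling results to a temporary file for further processing
# with other tools: with conf.forms a CSV file with a "URL,POST" header and one
# "url,post-data" row per target (hypothetical row: "http://www.example.com/search.php,q=test"),
# otherwise a plain text file with one URL per line.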
def storeResultsToFile(results):
    if not results:
        return

    if kb.storeCrawlingChoice is None:
        message = "do you want to store crawling results to a temporary file "
        message += "for eventual further processing with other tools [y/N] "

        kb.storeCrawlingChoice = readInput(message, default='N', boolean=True)

    if kb.storeCrawlingChoice:
        handle, filename = tempfile.mkstemp(prefix=MKSTEMP_PREFIX.CRAWLER, suffix=".csv" if conf.forms else ".txt")
        os.close(handle)

        infoMsg = "writing crawling results to a temporary file '%s' " % filename
        logger.info(infoMsg)

        with openFile(filename, "w+b") as f:
            if conf.forms:
                f.write("URL,POST\n")

            for url, _, data, _, _ in results:
                if conf.forms:
                    f.write("%s,%s\n" % (safeCSValue(url), safeCSValue(data or "")))
                else:
                    f.write("%s\n" % url)