CoCalc -- webscreenshot.py

GitHub Repository: 1N3/Sn1per
Path: blob/master/bin/webscreenshot.py
²⁹⁶⁰ views
1
#!/usr/bin/env python
2
# -*- coding: utf-8 -*-
3

4
# This file is part of webscreenshot.
5
#
6
# Copyright (C) 2018, Thomas Debize <tdebize at mail.com>
7
# All rights reserved.
8
#
9
# webscreenshot is free software: you can redistribute it and/or modify
10
# it under the terms of the GNU Lesser General Public License as published by
11
# the Free Software Foundation, either version 3 of the License, or
12
# (at your option) any later version.
13
#
14
# webscreenshot is distributed in the hope that it will be useful,
15
# but WITHOUT ANY WARRANTY; without even the implied warranty of
16
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17
# GNU Lesser General Public License for more details.
18
#
19
# You should have received a copy of the GNU Lesser General Public License
20
# along with webscreenshot.  If not, see <http://www.gnu.org/licenses/>.
21

22
import re
23
import os
24
import sys
25
import subprocess
26
import datetime
27
import time
28
import signal
29
import multiprocessing
30
import itertools
31
import shlex
32
import logging
33
import errno
34

35
# Script version
VERSION = '2.2.1'

# OptionParser imports
from optparse import OptionParser
from optparse import OptionGroup

# Options definition
# Numeric options are declared with type = 'int' so that a bad value is rejected by the
# parser with a usage message instead of failing later on an int() conversion.
parser = OptionParser(usage="usage: %prog [options] URL")

main_grp = OptionGroup(parser, 'Main parameters')
main_grp.add_option('-i', '--input-file', help = '<INPUT_FILE>: text file containing the target list. Ex: list.txt', nargs = 1)
main_grp.add_option('-o', '--output-directory', help = '<OUTPUT_DIRECTORY> (optional): screenshots output directory (default \'./screenshots/\')', nargs = 1)
main_grp.add_option('-r', '--renderer', help = '<RENDERER> (optional): renderer to use among \'phantomjs\' (legacy but best results), \'chrome\', \'chromium\' (version > 57) (default \'phantomjs\')', choices = ['phantomjs', 'chrome', 'chromium'], default = 'phantomjs', nargs = 1)
main_grp.add_option('-w', '--workers', help = '<WORKERS> (optional): number of parallel execution workers (default 2)', type = 'int', default = 2, nargs = 1)
main_grp.add_option('-v', '--verbosity', help = '<VERBOSITY> (optional): verbosity level, repeat it to increase the level { -v INFO, -vv DEBUG } (default verbosity ERROR)', action = 'count', default = 0)

proc_grp = OptionGroup(parser, 'Input processing parameters')
proc_grp.add_option('-p', '--port', help = '<PORT> (optional): use the specified port for each target in the input list. Ex: -p 80', nargs = 1)
proc_grp.add_option('-s', '--ssl', help = '<SSL> (optional): enforce ssl for every connection', action = 'store_true', default = False)
proc_grp.add_option('-m', '--multiprotocol', help = '<MULTIPROTOCOL> (optional): perform screenshots over HTTP and HTTPS for each target', action = 'store_true', default = False)

http_grp = OptionGroup(parser, 'HTTP parameters')
http_grp.add_option('-c', '--cookie', help = '<COOKIE_STRING> (optional): cookie string to add. Ex: -c "JSESSIONID=1234; YOLO=SWAG"', nargs = 1)
http_grp.add_option('-a', '--header', help = '<HEADER> (optional): custom or additional header. Repeat this option for every header. Ex: -a "Host: localhost" -a "Foo: bar"', action = 'append')

http_grp.add_option('-u', '--http-username', help = '<HTTP_USERNAME> (optional): specify a username for HTTP Basic Authentication.')
http_grp.add_option('-b', '--http-password', help = '<HTTP_PASSWORD> (optional): specify a password for HTTP Basic Authentication.')

conn_grp = OptionGroup(parser, 'Connection parameters')
conn_grp.add_option('-P', '--proxy', help = '<PROXY> (optional): specify a proxy. Ex: -P http://proxy.company.com:8080')
conn_grp.add_option('-A', '--proxy-auth', help = '<PROXY_AUTH> (optional): provides authentication information for the proxy. Ex: -A user:password')
conn_grp.add_option('-T', '--proxy-type', help = '<PROXY_TYPE> (optional): specifies the proxy type, "http" (default), "none" (disable completely), or "socks5". Ex: -T socks5')
conn_grp.add_option('-t', '--timeout', help = '<TIMEOUT> (optional): renderer execution timeout in seconds (default 30 sec)', type = 'int', default = 30, nargs = 1)

# Attach the groups through the public API rather than by mutating parser.option_groups
for option_group in (main_grp, proc_grp, http_grp, conn_grp):
    parser.add_option_group(option_group)
71

72
# renderer binaries, hoping to find them in a $PATH directory
## Feel free to change them to your own full-path location
PHANTOMJS_BIN = 'phantomjs'
CHROME_BIN = 'google-chrome'
CHROMIUM_BIN = 'chromium'

# Capture script handed to PhantomJS, looked up next to this file
WEBSCREENSHOT_JS = os.path.abspath(os.path.join(os.path.dirname(os.path.realpath(__file__)), './webscreenshot.js'))

# Default output directory, relative to the current working directory.
# os.getcwdu() only exists on Python 2, fall back to os.getcwd() when it is missing.
SCREENSHOTS_DIRECTORY = os.path.abspath(os.path.join(getattr(os, 'getcwdu', os.getcwd)(), './screenshots/'))
80

81
# Logger definition
# Maps the number of '-v' flags to the name of a logging level
LOGLEVELS = dict(enumerate(('ERROR', 'INFO', 'DEBUG')))

# Single stdout handler, shared by the general logger and the per-URL loggers
_log_format = logging.Formatter('[%(levelname)s][%(name)s] %(message)s')
logger_output = logging.StreamHandler(sys.stdout)
logger_output.setFormatter(_log_format)

logger_gen = logging.getLogger("General")
logger_gen.addHandler(logger_output)
88

89
# Macros
# Values returned by shell_exec()
SHELL_EXECUTION_OK, SHELL_EXECUTION_ERROR = 0, -1
# Renderer exit code that shell_exec() reports as "HTTP Authentication requested"
PHANTOMJS_HTTP_AUTH_ERROR_CODE = 2
93

94
# Handful patterns
# Raw strings keep the regex escapes (\d, \., \s) out of Python's own string-escape processing.
p_ipv4_elementary = r'(?:[\d]{1,3})\.(?:[\d]{1,3})\.(?:[\d]{1,3})\.(?:[\d]{1,3})'
# The last label (TLD) may be up to 63 letters, the maximum length of a DNS label
p_domain = r'[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,63}'
# At least one digit: an empty port ('host:') must not match, it cannot be converted to a number
p_port = r'\d{1,5}'
p_resource = r'(?:/(?P<res>.*))?'

# protocol://host[:port][/resource]
full_uri_domain = re.compile(r'^(?P<protocol>http(?:|s))://(?P<host>%s|%s)(?::(?P<port>%s))?%s$' % (p_domain, p_ipv4_elementary, p_port, p_resource))

# domain:port[/resource] and domain[/resource]
fqdn_and_port = re.compile(r'^(?P<host>%s):(?P<port>%s)%s$' % (p_domain, p_port, p_resource))
fqdn_only = re.compile(r'^(?P<host>%s)%s$' % (p_domain, p_resource))

# ipv4:port[/resource] and ipv4[/resource], anchored at the end like every other pattern
ipv4_and_port = re.compile(r'^(?P<host>%s):(?P<port>%s)%s$' % (p_ipv4_elementary, p_port, p_resource))
ipv4_only = re.compile(r'^(?P<host>%s)%s$' % (p_ipv4_elementary, p_resource))

# host<whitespace>port
entry_from_csv = re.compile(r'^(?P<host>%s|%s)\s+(?P<port>\d+)$' % (p_domain, p_ipv4_elementary))
109

110
# Handful functions
111
def init_worker():
    """
        Pool initializer: make the calling process ignore SIGINT
    """
    ignore_action = signal.SIG_IGN
    signal.signal(signal.SIGINT, ignore_action)
116
    
117
def kill_em_all(signal, frame):
    """
        SIGINT handler: log that CTRL-C was received, then leave with exit status 0

        'signal' and 'frame' are the arguments passed to any signal handler, they are not used here
    """
    logger_gen.info('CTRL-C received, exiting')
    raise SystemExit(0)
123
    
124
def shell_exec(url, command, options):
    """
        Execute a command line, killing it once a timeout has elapsed
        Taken from http://howto.pui.ch/post/37471155682/set-timeout-for-a-shell-command-in-python

        url: target URL, only used to name the per-URL logger
        command: whole command line as a single string, tokenized with shlex.split() (no shell is involved)
        options: parsed options, 'timeout' (in seconds) and 'log_level' are read

        Returns SHELL_EXECUTION_OK when the process exits with code 0, and SHELL_EXECUTION_ERROR
        on a timeout, on any other exit code, or when the process could not be started
    """
    logger_url = logging.getLogger("%s" % url)
    logger_url.setLevel(options.log_level)

    timeout = int(options.timeout)
    start = time.time()

    try:
        # The renderer output is never read: send it to the null device instead of pipes,
        # as a pipe nobody drains fills up and blocks the child until the timeout hits
        with open(os.devnull, 'wb') as devnull:
            p = subprocess.Popen(shlex.split(command), shell=False, stdout=devnull, stderr=devnull)

            # binaries timeout
            while p.poll() is None:
                time.sleep(0.1)
                if time.time() - start > timeout:
                    logger_url.debug("Shell command PID %s reached the timeout, killing it now" % p.pid)
                    logger_url.error("Screenshot somehow failed\n")

                    # kill() picks the right mechanism for the platform; wait() reaps the
                    # child so that it does not linger as a zombie
                    p.kill()
                    p.wait()

                    return SHELL_EXECUTION_ERROR

            retval = p.returncode

        if retval == SHELL_EXECUTION_OK:
            # Phantomjs ok
            logger_url.debug("Shell command PID %s ended normally" % p.pid)
            logger_url.info("Screenshot OK\n")
            return SHELL_EXECUTION_OK

        if retval == PHANTOMJS_HTTP_AUTH_ERROR_CODE:
            # HTTP Authentication request
            logger_url.error("HTTP Authentication requested, try to pass credentials with -u and -b options")
        else:
            # Phantomjs general error
            logger_url.error("Shell command PID %s returned an abnormal error code: '%s'" % (p.pid, retval))
            logger_url.error("Screenshot somehow failed\n")

        return SHELL_EXECUTION_ERROR

    except Exception as e:
        # Not every exception carries an 'errno' attribute (e.g. a ValueError from shlex.split)
        if getattr(e, 'errno', None) == errno.ENOENT:
            logger_url.error('renderer binary could not have been found in your current PATH environment variable, exiting')
        else:
            logger_gen.error('Unknown error: %s, exiting' % e)
        return SHELL_EXECUTION_ERROR
179

180
def filter_bad_filename_chars(filename):
    """
        Filter bad chars for any filename

        Every 'http://' and 'https://' occurrence is dropped, then each character that is not
        a word character, '-', '_', '.' or a space is replaced by the literal '-port', so that
        'http://host:80' becomes 'host-port80'.
        NOTE(review): characters other than ':' (e.g. '/') also become '-port' - presumably
        only the port separator was meant, confirm before changing the produced file names.
    """
    # Before, just avoid escaping the classic '://' pattern
    for scheme in ('http://', 'https://'):
        filename = filename.replace(scheme, '')

    return re.sub(r'[^\w\-_\. ]', '-port', filename)
192

193
def extract_all_matched_named_groups(regex, match):
    """
        Return a dict of every named group that took part in the match.

        regex: compiled pattern, its 'groupindex' gives the names of the groups
        match: match object obtained from 'regex'

        Named groups that did not participate in the match (value None) are left out.

        >>> rx = re.compile('(?P<host>[0-9.]+)(?::(?P<port>[0-9]+))?')
        >>> sorted(extract_all_matched_named_groups(rx, rx.match('8.8.8.8:80')).items())
        [('host', '8.8.8.8'), ('port', '80')]
        >>> extract_all_matched_named_groups(rx, rx.match('8.8.8.8'))
        {'host': '8.8.8.8'}
    """
    result = {}
    for name in regex.groupindex:
        matched_value = match.group(name)
        if matched_value is not None:
            result[name] = matched_value

    return result
209
    
210
def entry_format_validator(line):
    """
        Try every known input format against 'line'

        Returns the named groups matched by the first format that recognizes the line
        (see extract_all_matched_named_groups()), or None when no format does
    """
    known_formats = (
        full_uri_domain,
        fqdn_only,
        fqdn_and_port,
        ipv4_and_port,
        ipv4_only,
        entry_from_csv,
    )

    for pattern in known_formats:
        found = pattern.match(line)
        if found:
            return extract_all_matched_named_groups(pattern, found)

    return None
226

227
def parse_targets(options, arguments):
    """
        Parse list and convert each target to valid URI with port (protocol://foobar:port)

        options: parsed options, 'input_file', 'ssl', 'port' and 'multiprotocol' are read
        arguments: positional command line arguments, used as targets when no input file is given

        Returns the list of formatted URIs. Entries that no input format recognizes are
        reported with a warning and skipped. Exits with status 1 when the input file is not
        UTF-8 encoded.
    """
    if options.input_file is not None:
        with open(options.input_file, 'rb') as fd_input:
            try:
                lines = [l.decode('utf-8').strip() for l in fd_input.readlines()]
            except UnicodeDecodeError:
                logger_gen.error('Your input file is not UTF-8 encoded, please encode it before using this script')
                sys.exit(1)
    else:
        lines = arguments

    target_list = []

    for index, line in enumerate(lines, start=1):
        matches = entry_format_validator(line)

        # skip the line if no format recognized it, or if no 'host' group could be found
        if matches is None or 'host' not in matches:
            logger_gen.warning("Line %s '%s' could not have been recognized as a correct input" % (index, line))
            continue

        host = matches['host']

        # An empty port (e.g. 'host:') is handled as "no port given": int('') would raise
        matched_port = int(matches['port']) if matches.get('port') else None

        # Protocol is 'http' by default, unless ssl is forced
        if options.ssl:
            protocol = 'https'
        elif 'protocol' in matches:
            protocol = str(matches['protocol'])
        else:
            protocol = 'http'

        # Port is ('80' for http) or ('443' for https) by default, unless a specific port is supplied
        if options.port is not None:
            port = options.port
        elif matched_port is not None:
            port = matched_port

            # a target given with port 443 is always reached over https
            if port == 443:
                protocol = 'https'
        else:
            port = 443 if protocol == 'https' else 80

        # No resource URI by default
        res = str(matches['res']) if 'res' in matches else None

        if options.multiprotocol:
            # perform screenshots over HTTP and HTTPS for each target
            # NOTE: only the port found in the entry is used here, the -p option and the
            # resource part are not applied in this mode
            for scheme, default_port in (('http', 80), ('https', 443)):
                scheme_port = matched_port if matched_port is not None else default_port
                final_uri = '%s://%s:%s' % (scheme, host, scheme_port)
                target_list.append(final_uri)
                logger_gen.info("'%s' has been formatted as '%s' with supplied overriding options" % (line, final_uri))
        else:
            final_uri = '%s://%s:%s' % (protocol, host, port)
            if res is not None:
                final_uri += '/%s' % res
            target_list.append(final_uri)

            logger_gen.info("'%s' has been formatted as '%s' with supplied overriding options" % (line, final_uri))

    return target_list
300

301
def craft_cmd(url_and_options):
    """
        Craft the renderer command line for one URL, execute it and report the outcome

        url_and_options: (url, options) tuple, packed as one argument so that the function
                         can be mapped over by a pool

        Returns a (shell_exec() return value, url) tuple
    """
    # shlex.quote() is named pipes.quote() on Python 2
    try:
        from shlex import quote
    except ImportError:
        from pipes import quote

    url, options = url_and_options

    logger_url = logging.getLogger("%s" % url)
    logger_url.addHandler(logger_output)
    logger_url.setLevel(options.log_level)

    output_filename = os.path.join(SCREENSHOTS_DIRECTORY, ('%s.jpg' % filter_bad_filename_chars(url)))

    # PhantomJS renderer
    if options.renderer == 'phantomjs':
        renderer_bin = PHANTOMJS_BIN

        # If you ever want to add some voodoo options to the phantomjs command to be executed, that's here right below
        cmd_arguments = ['--ignore-ssl-errors', 'true',
                         '--ssl-protocol', 'any',
                         '--ssl-ciphers', 'ALL']

        for flag, value in (('--proxy', options.proxy),
                            ('--proxy-auth', options.proxy_auth),
                            ('--proxy-type', options.proxy_type)):
            if value is not None:
                cmd_arguments += [flag, value]

        cmd_arguments += [WEBSCREENSHOT_JS,
                          'url_capture=%s' % url,
                          'output_file=%s' % output_filename]

        if options.cookie is not None:
            cmd_arguments.append('header=Cookie: %s' % options.cookie.rstrip(';'))

        if options.http_username is not None:
            cmd_arguments.append('http_username=%s' % options.http_username)
        if options.http_password is not None:
            cmd_arguments.append('http_password=%s' % options.http_password)

        for header in options.header or ():
            cmd_arguments.append('header=%s' % header.rstrip(';'))

    # Chrome and chromium renderers
    else:
        renderer_bin = CHROME_BIN if options.renderer == 'chrome' else CHROMIUM_BIN

        cmd_arguments = ['--allow-running-insecure-content',
                         '--ignore-certificate-errors',
                         '--ignore-urlfetcher-cert-requests',
                         '--reduce-security-for-testing',
                         '--no-sandbox',
                         '--headless',
                         '--disable-gpu',
                         '--hide-scrollbars',
                         '--incognito',
                         '-screenshot=%s' % output_filename,
                         '--window-size=1200,800',
                         url]

        if options.proxy is not None:
            cmd_arguments.append('--proxy-server=%s' % options.proxy)

    # shell_exec() tokenizes the command line with shlex.split(): quoting every argument makes
    # that round trip exact, whatever quotes or spaces a URL, cookie or header contains.
    # The renderer binary setting is left as is, so it is still split on whitespace as before.
    cmd = " ".join([renderer_bin] + [quote(argument) for argument in cmd_arguments])

    logger_url.debug("Shell command to be executed\n'%s'\n" % cmd)

    execution_retval = shell_exec(url, cmd, options)

    return execution_retval, url
365

366
    
367
def take_screenshot(url_list, options):
    """
        Launch the screenshot workers
        Thanks http://noswap.com/blog/python-multiprocessing-keyboardinterrupt

        url_list: URLs to capture, each one is handed to craft_cmd() in a worker process
        options: parsed options, 'workers' gives the size of the pool

        Prints the number of URLs, of successful screenshots and of errors, followed by the
        URLs that failed. Returns None.
    """
    screenshot_number = len(url_list)
    # print(...) with a single argument behaves the same on Python 2 and Python 3
    print("[+] %s URLs to be screenshot" % screenshot_number)

    pool = multiprocessing.Pool(processes=int(options.workers), initializer=init_worker)

    try:
        jobs = [(url, options) for url in url_list]
        taken_screenshots = list(pool.imap(func=craft_cmd, iterable=jobs))
    except BaseException:
        # interrupted (e.g. SystemExit raised by the SIGINT handler): stop the workers at once
        pool.terminate()
        raise
    else:
        pool.close()
    finally:
        pool.join()

    screenshots_error_url = [url for retval, url in taken_screenshots if retval == SHELL_EXECUTION_ERROR]
    screenshots_error = len(screenshots_error_url)
    screenshots_ok = screenshot_number - screenshots_error

    print("[+] %s actual URLs screenshot" % screenshots_ok)
    print("[+] %s error(s)" % screenshots_error)

    for url in screenshots_error_url:
        print("    %s" % url)

    return None
393
    
394
def main():
    """
        Dat main: parse the command line, prepare the output directory, then take a
        screenshot of every target
    """
    # Rebound below when an output directory is supplied
    global SCREENSHOTS_DIRECTORY
    signal.signal(signal.SIGINT, kill_em_all)

    # print(...) with a single argument behaves the same on Python 2 and Python 3
    print('webscreenshot.py version %s\n' % VERSION)

    options, arguments = parser.parse_args()

    try:
        options.log_level = LOGLEVELS[options.verbosity]
    except KeyError:
        # more '-v' flags than known levels
        parser.error("Please specify a valid log level")
    logger_gen.setLevel(options.log_level)

    # Exactly one source of targets is expected: either an input file or a single URL
    if options.input_file is None and len(arguments) != 1:
        parser.error('Please specify a valid input file or a valid URL')

    if options.input_file is not None and arguments:
        parser.error('Please specify either an input file or an URL')

    if options.output_directory is not None:
        # os.getcwdu() only exists on Python 2, fall back to os.getcwd() when it is missing
        current_directory = getattr(os, 'getcwdu', os.getcwd)()
        SCREENSHOTS_DIRECTORY = os.path.abspath(os.path.join(current_directory, options.output_directory))

    logger_gen.debug("Options: %s\n" % options)
    if not os.path.exists(SCREENSHOTS_DIRECTORY):
        logger_gen.info("'%s' does not exist, will then be created" % SCREENSHOTS_DIRECTORY)
        os.makedirs(SCREENSHOTS_DIRECTORY)

    url_list = parse_targets(options, arguments)

    take_screenshot(url_list, options)

    return None
430

431
# Run main() only when this file is executed as a script, not when it is imported
if __name__ == "__main__":
    main()
433
Product

Resources

Company