GitHub Repository: 1N3/Sn1per
Path: blob/master/bin/waybackrobots.py
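
"""Mine the Wayback Machine for historical robots.txt entries of a domain.

The script looks up archived snapshots of <domain>/robots.txt via the CDX
API, downloads each unique capture in a small thread pool, extracts the
listed paths, and writes the de-duplicated set to <domain>-robots.txt.
"""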
import requests
import re
import sys
from multiprocessing.dummy import Pool

def robots(host):
    """Return [timestamp, original-URL] pairs for archived robots.txt captures."""
    # Ask the Wayback Machine CDX API for successful (HTTP 200) captures of
    # the host's robots.txt, collapsing duplicate content by digest.
    r = requests.get(
        'https://web.archive.org/cdx/search/cdx'
        '?url=%s/robots.txt&output=json&fl=timestamp,original'
        '&filter=statuscode:200&collapse=digest' % host)
    results = r.json()
    if len(results) == 0:  # the archive may hold no captures at all
        return []
    results.pop(0)  # drop the header row: ['timestamp', 'original']
    return results

def getpaths(snapshot):
    """Fetch one archived robots.txt snapshot and extract the paths it lists."""
    url = 'https://web.archive.org/web/{0}/{1}'.format(snapshot[0], snapshot[1])
    robotstext = requests.get(url).text
    if 'Disallow:' in robotstext:  # verify it's actually a robots.txt file, not a 404 page
        # Each match runs from a line's first '/' to the end of that line,
        # i.e. the path portion of Disallow/Allow rules.
        paths = re.findall('/.*', robotstext)
        return paths
    return []

if __name__ == '__main__':
    if len(sys.argv) < 2:
        print('Usage:\n\tpython3 waybackrobots.py <domain-name>')
        sys.exit()

    host = sys.argv[1]

    snapshots = robots(host)
    print('Found %s unique results' % len(snapshots))
    if len(snapshots) == 0:
        sys.exit()
    print('This may take some time...')
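    # multiprocessing.dummy provides the Pool API backed by threads rather
    # than processes, which suits these I/O-bound HTTP fetches; 4 workers
    # keeps the load on web.archive.org modest.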
    pool = Pool(4)
    paths = pool.map(getpaths, snapshots)
    unique_paths = set()  # de-duplicate paths seen across snapshots
    for i in paths:
        unique_paths.update(i)
    filename = '%s-robots.txt' % host
    with open(filename, 'w') as f:
        f.write('\n'.join(unique_paths))
    print('[*] Saved results to %s' % filename)
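
# Example session (hypothetical domain and counts, for illustration only):
#
#   $ python3 waybackrobots.py example.com
#   Found 12 unique results
#   This may take some time...
#   [*] Saved results to example.com-robots.txt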