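# waybackrobots.py -- pull every archived robots.txt snapshot for a host from
# the Wayback Machine CDX API, extract the listed paths, and save the unique
# set to <host>-robots.txt.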
import re
import sys
from multiprocessing.dummy import Pool  # thread-backed Pool, fine for I/O-bound requests

import requests


def robots(host):
    # Ask the Wayback Machine CDX API for every archived robots.txt capture,
    # keeping only HTTP 200 snapshots and collapsing duplicates by content digest.
    r = requests.get(
        'https://web.archive.org/cdx/search/cdx'
        '?url=%s/robots.txt&output=json&fl=timestamp,original'
        '&filter=statuscode:200&collapse=digest' % host)
    results = r.json()
    if len(results) == 0:  # the API might find nothing
        return []
    results.pop(0)  # the first row is the header: ['timestamp', 'original']
    return results


def getpaths(snapshot):
    # snapshot is a [timestamp, original_url] pair from the CDX results.
    url = 'https://web.archive.org/web/{0}/{1}'.format(snapshot[0], snapshot[1])
    robotstext = requests.get(url).text
    if 'Disallow:' in robotstext:  # verify it's actually a robots.txt file, not a 404 page
        paths = re.findall('/.*', robotstext)
        return paths
    return []


if __name__ == '__main__':
    if len(sys.argv) < 2:
        print('Usage:\n\tpython3 waybackrobots.py <domain-name>')
        sys.exit()

    host = sys.argv[1]

    snapshots = robots(host)
    print('Found %s unique results' % len(snapshots))
    if len(snapshots) == 0:
        sys.exit()
    print('This may take some time...')

    # Fetch the snapshots on 4 worker threads and merge the extracted paths.
    pool = Pool(4)
    paths = pool.map(getpaths, snapshots)
    unique_paths = set()
    for path_list in paths:
        unique_paths.update(path_list)

    # Write one path per line to <host>-robots.txt.
    filename = '%s-robots.txt' % host
    with open(filename, 'w') as f:
        f.write('\n'.join(unique_paths))
    print('[*] Saved results to %s' % filename)
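# Example run (hypothetical domain and count, shown only to illustrate the output format):
#   $ python3 waybackrobots.py example.com
#   Found 12 unique results
#   This may take some time...
#   [*] Saved results to example.com-robots.txt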