Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
sagemathinc
GitHub Repository: sagemathinc/cocalc
Path: blob/master/src/scripts/check_doc_urls.py
5606 views
1
#!/usr/bin/env python
2
"""
3
Check that all links to the documentation are valid.
4
"""
5
import os
6
import re
7
import requests as r
8
import time
9
from multiprocessing.pool import ThreadPool
10
11
BASE_URL = "https://doc.cocalc.com"
12
13
# change the working directory to the parent directory, of there this file is
14
curdir = os.path.dirname(os.path.abspath(__file__))
15
parentdir = os.path.dirname(curdir)
16
os.chdir(parentdir)
17
18
19
def extract_urls(fn):
    """
    Yield every documentation URL found in the file *fn*.

    Anchors (``#...``) and query strings (``?...``) are stripped, since
    only the page itself is checked, not fragments within it.
    Duplicates may be yielded; the caller deduplicates via a set.
    """
    # Read explicitly as UTF-8: the locale-default encoding can fail on
    # source files containing unicode (e.g. on Windows).
    with open(fn, encoding="utf-8") as f:
        content = f.read()
    # Match BASE_URL followed by anything that is not whitespace, a quote,
    # a backslash, or a closing parenthesis (common URL delimiters in code
    # and markdown).
    pattern = fr'''({BASE_URL}[^\s'"\\\n)]+)'''
    for url in re.findall(pattern, content):
        # remove anchors
        if '#' in url:
            url = url[:url.index('#')]
        # remove query parameters
        if '?' in url:
            url = url[:url.index('?')]
        yield url
33
34
35
def get_all_urls():
    """
    Use ``git grep`` to list every file containing BASE_URL, extract all
    documentation URLs from those files, and return them as a sorted,
    deduplicated list.
    """
    listing = os.popen(f"git grep -lI {BASE_URL}").read()
    # One set comprehension over every URL in every matching file
    # deduplicates as it collects.
    collected = {url for fn in listing.split() for url in extract_urls(fn)}
    return sorted(collected)
49
50
51
def check_url(url):
    """
    Check *url* with an HTTP HEAD request, to avoid downloading the whole
    file.  Retry a few times for transient failures.

    Returns True if the URL is reachable, False otherwise (after printing
    a ✓/✗ status line either way).
    """
    attempts = 3
    delay = 5  # seconds to wait between retries
    for attempt in range(1, attempts + 1):
        try:
            # allow_redirects=True: HEAD requests do not follow redirects
            # by default in requests, so without it a moved page whose
            # redirect target is broken would still be reported as valid.
            res = r.head(url, timeout=10, allow_redirects=True)
            res.raise_for_status()
        except Exception as ex:
            if attempt < attempts:
                time.sleep(delay)
                continue
            # Final attempt failed: report the URL as broken.
            print(f"✗ {url}: {ex}")
            return False
        else:
            print(f"✓ {url}")
            return True
71
72
73
def main():
    """
    Check all URLs concurrently with HEAD requests and terminate with
    exit status 1 if any of them is broken.
    """
    all_url = get_all_urls()
    print(f"Checking {len(all_url)} URLs...")
    # Use the pool as a context manager so its worker threads are
    # terminated and joined instead of leaking.
    with ThreadPool(16) as pool:
        results = pool.map(check_url, all_url)
    num_failed = sum(1 for ok in results if not ok)
    if num_failed:
        print(f"{num_failed} URLs failed.")
        # raise SystemExit instead of the site-module exit() builtin,
        # which is not available when Python runs without site (-S/-E).
        raise SystemExit(1)
    print("All URLs are valid.")
86
87
88
# Run the link check only when executed as a script, not on import.
if __name__ == '__main__':
    main()
90
91