CoCalc Logo Icon
Store | Features | Docs | Share | Support | News | About | Sign Up | Sign In
sagemathinc

Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place.

GitHub Repository: sagemathinc/cocalc
Path: blob/master/src/scripts/check_doc_urls.py
Views: 687
1
#!/usr/bin/env python
2
"""
3
Check that all links to the documentation are valid.
4
"""
5
import os
6
import re
7
import requests as r
8
from multiprocessing.pool import ThreadPool
9
10
# Common prefix of every documentation link. It is used both to find the
# files that mention the documentation and to match the URLs inside them.
BASE_URL = "https://doc.cocalc.com"
11
12
# Switch the working directory to the parent of the directory in which this
# file is located, so that relative paths are resolved from there.
curdir = os.path.split(os.path.abspath(__file__))[0]
parentdir = os.path.split(curdir)[0]
os.chdir(parentdir)
16
17
18
def extract_urls(fn, base_url=None):
    """
    Yield every documentation URL found in the file `fn`.

    A URL starts with `base_url` and extends up to the first whitespace,
    quote, backslash or closing parenthesis. Anchors (`#...`) and query
    parameters (`?...`) are cut off, so only the page itself is reported.
    A URL is yielded once per occurrence; de-duplication is left to the
    caller.

    :param fn: path of the text file to scan
    :param base_url: URL prefix to look for; defaults to the module-level
        BASE_URL when omitted
    """
    if base_url is None:
        base_url = BASE_URL
    # Escape the prefix: an unescaped "." matches any character, so a
    # look-alike host such as "docXcocalc.com" would be accepted as well.
    pattern = fr'''({re.escape(base_url)}[^\s'"\\\n)]+)'''

    # The URLs themselves are plain ASCII, hence undecodable bytes elsewhere
    # in the file are replaced instead of aborting the scan.
    with open(fn, encoding="utf-8", errors="replace") as f:
        content = f.read()

    for url in re.findall(pattern, content):
        # keep only what precedes the anchor and the query parameters
        yield url.partition('#')[0].partition('?')[0]
32
33
34
def get_all_urls():
    """
    Collect every distinct documentation URL referenced in the repository.

    `git grep` is used to list the files that contain BASE_URL; each of
    those files is then scanned with extract_urls().

    :return: sorted list of unique URLs
    :raises RuntimeError: if `git grep` itself fails (as opposed to simply
        finding no file)
    """
    # Local import: nothing else in this file needs subprocess.
    import subprocess

    # An argument list (no shell) keeps the URL from being interpreted by a
    # shell. -l: print file names only, -I: skip binary files, -F: treat the
    # URL as a fixed string instead of a regex, -z: NUL-terminate the names
    # so that paths containing whitespace stay intact.
    cmd = ["git", "grep", "-l", "-I", "-z", "-F", "-e", BASE_URL]
    proc = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    # Exit status 1 only means "nothing matched"; everything else is an error
    # that must not be mistaken for "there are no URLs to check".
    if proc.returncode not in (0, 1):
        details = proc.stderr.decode("utf-8", errors="replace").strip()
        raise RuntimeError(
            f"git grep failed with exit status {proc.returncode}: {details}")

    files = [os.fsdecode(name) for name in proc.stdout.split(b"\0") if name]

    # combine the URLs of all files into one set to drop duplicates
    all_url = set()
    for fn in files:
        all_url.update(extract_urls(fn))
    return sorted(all_url)
48
49
50
def check_url(url):
    """
    Check the given URL with an HTTP HEAD request, to avoid downloading
    the whole file.

    The outcome is printed as one line per URL.

    :param url: the URL to probe
    :return: True if the request succeeded without an HTTP error status,
        False otherwise
    """
    try:
        # HEAD requests do not follow redirects unless asked to. Without
        # this, a redirect pointing to a missing page would pass, because
        # raise_for_status() only complains about 4xx/5xx responses.
        res = r.head(url, timeout=10, allow_redirects=True)
        res.raise_for_status()
    except Exception as ex:
        # Deliberately broad: this is the per-URL boundary, and any failure
        # (timeout, connection error, bad status) counts as a broken link.
        print(f"✗ {url}: {ex}")
        return False

    print(f"✓ {url}")
    return True
64
65
66
def main():
    """
    Check all URLs and report the result.

    The URLs are probed concurrently by 16 worker threads. We use HEAD
    requests, so that we don't download the whole file.

    :raises SystemExit: with exit status 1 if at least one URL failed
    """
    all_url = get_all_urls()
    print(f"Checking {len(all_url)} URLs...")

    # The context manager shuts the worker threads down once map() is done,
    # instead of leaving the pool open until the interpreter exits.
    with ThreadPool(16) as pool:
        results = pool.map(check_url, all_url)

    num_failed = sum(1 for ok in results if not ok)
    if num_failed:
        print(f"{num_failed} URLs failed.")
        # SystemExit is raised directly, because the exit() helper is only
        # available when the site module has been loaded.
        raise SystemExit(1)

    print("All URLs are valid.")
79
80
81
# Run the check only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
83
84