Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
sagemathinc
GitHub Repository: sagemathinc/cocalc
Path: blob/master/src/scripts/check_doc_urls.py
5606 views
1
#!/usr/bin/env python
2
"""
3
Check that all links to the documentation are valid.
4
"""
5
import os
6
import re
7
import requests as r
8
import time
9
from multiprocessing.pool import ThreadPool
10
11
BASE_URL = "https://doc.cocalc.com"
12
13
# change the working directory to the parent directory, of there this file is
14
curdir = os.path.dirname(os.path.abspath(__file__))
15
parentdir = os.path.dirname(curdir)
16
os.chdir(parentdir)
17
18
19
def extract_urls(fn):
    """
    Yield every documentation URL found in the file *fn*.

    Anchors (``#...``) and query strings (``?...``) are stripped, since
    only the page itself is checked, not fragments within it.
    Duplicates may be yielded; the caller deduplicates via a set.
    """
    # Read explicitly as UTF-8: the locale-default encoding can fail on
    # source files containing unicode (e.g. on Windows).
    with open(fn, encoding="utf-8") as f:
        content = f.read()
    # Match BASE_URL followed by anything that is not whitespace, a quote,
    # a backslash, or a closing parenthesis (common URL delimiters in code
    # and markdown).
    pattern = fr'''({BASE_URL}[^\s'"\\\n)]+)'''
    for url in re.findall(pattern, content):
        # remove anchors
        if '#' in url:
            url = url[:url.index('#')]
        # remove query parameters
        if '?' in url:
            url = url[:url.index('?')]
        yield url
33
34
35
def get_all_urls():
    """
    Use ``git grep`` to list every file containing BASE_URL, extract all
    documentation URLs from those files, and return them as a sorted,
    deduplicated list.
    """
    listing = os.popen(f"git grep -lI {BASE_URL}").read()
    # One set comprehension over every URL in every matching file
    # deduplicates as it collects.
    collected = {url for fn in listing.split() for url in extract_urls(fn)}
    return sorted(collected)
49
50
51
def check_url(url):
    """
    Check *url* with an HTTP HEAD request, to avoid downloading the whole
    file.  Retry a few times for transient failures.

    Returns True if the URL is reachable, False otherwise (after printing
    a ✓/✗ status line either way).
    """
    attempts = 3
    delay = 5  # seconds to wait between retries
    for attempt in range(1, attempts + 1):
        try:
            # allow_redirects=True: HEAD requests do not follow redirects
            # by default in requests, so without it a moved page whose
            # redirect target is broken would still be reported as valid.
            res = r.head(url, timeout=10, allow_redirects=True)
            res.raise_for_status()
        except Exception as ex:
            if attempt < attempts:
                time.sleep(delay)
                continue
            # Final attempt failed: report the URL as broken.
            print(f"✗ {url}: {ex}")
            return False
        else:
            print(f"✓ {url}")
            return True
71
72
73
def main():
    """
    Check all URLs concurrently with HEAD requests and terminate with
    exit status 1 if any of them is broken.
    """
    all_url = get_all_urls()
    print(f"Checking {len(all_url)} URLs...")
    # Use the pool as a context manager so its worker threads are
    # terminated and joined instead of leaking.
    with ThreadPool(16) as pool:
        results = pool.map(check_url, all_url)
    num_failed = sum(1 for ok in results if not ok)
    if num_failed:
        print(f"{num_failed} URLs failed.")
        # raise SystemExit instead of the site-module exit() builtin,
        # which is not available when Python runs without site (-S/-E).
        raise SystemExit(1)
    print("All URLs are valid.")
86
87
88
# Run the link check only when executed as a script, not on import.
if __name__ == '__main__':
    main()
90
91