Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place.
Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place.
Path: blob/master/src/scripts/check_doc_urls.py
Views: 687
#!/usr/bin/env python
"""
Check that all links to the documentation are valid.

The script collects every URL starting with ``BASE_URL`` from the files
tracked by git (below the parent directory of this script), issues an
HTTP HEAD request for each one, and exits with status 1 if any fails.
"""
import os
import re
import subprocess
import sys
from multiprocessing.pool import ThreadPool
from typing import Iterator, List

import requests as r

BASE_URL = "https://doc.cocalc.com"

# BASE_URL followed by at least one character that is not whitespace, a
# quote, a backslash or a closing parenthesis.  BASE_URL is escaped so that
# its dots only match literal dots.
URL_PATTERN = re.compile(fr'''({re.escape(BASE_URL)}[^\s'"\\\n)]+)''')

# change the working directory to the parent directory of where this file is
curdir = os.path.dirname(os.path.abspath(__file__))
parentdir = os.path.dirname(curdir)
os.chdir(parentdir)


def extract_urls(fn: str) -> Iterator[str]:
    """
    Yield every documentation URL found in the file ``fn``.

    Anchors (``#...``) and query parameters (``?...``) are stripped, so the
    same page may be yielded more than once; callers de-duplicate.
    """
    # Undecodable bytes are replaced instead of raising: a stray non-UTF-8
    # byte in one file should not abort the whole check.
    with open(fn, encoding="utf-8", errors="replace") as f:
        content = f.read()

    for url in URL_PATTERN.findall(content):
        # remove anchors
        url = url.partition('#')[0]
        # remove query parameters
        url = url.partition('?')[0]
        yield url


def get_all_urls() -> List[str]:
    """
    Use git grep to find all files that contain the BASE_URL
    and then extract all urls from those files.

    Returns the sorted list of distinct URLs.
    Raises RuntimeError if git grep itself fails.
    """
    # -l: only list file names, -I: skip binary files, -F: treat BASE_URL as
    # a fixed string, -z: NUL-terminate the unquoted file names so that paths
    # containing whitespace survive the split below.
    cmd = ["git", "grep", "-l", "-I", "-z", "-F", "-e", BASE_URL]
    proc = subprocess.run(cmd, stdout=subprocess.PIPE, encoding="utf-8")
    # git grep exits with 1 when nothing matches; anything higher is a real
    # error (e.g. not inside a repository) and must not look like
    # "0 URLs, all valid".
    if proc.returncode > 1:
        raise RuntimeError(
            f"git grep failed with exit status {proc.returncode}")
    files = [fn for fn in proc.stdout.split("\0") if fn]

    # combine all urls into one set
    all_url = set()
    for fn in files:
        all_url.update(extract_urls(fn))
    return sorted(all_url)


def check_url(url: str) -> bool:
    """
    Check the HTTP HEAD request for the given URL, to avoid
    downloading the whole file.

    Prints one line with the outcome and returns True if the request
    succeeded without an HTTP error status, False otherwise.
    """
    # NOTE(review): requests.head() does not follow redirects by default, so
    # a 3xx answer counts as valid without checking its target -- confirm
    # that this is intended.
    try:
        res = r.head(url, timeout=10)
        res.raise_for_status()
    except Exception as ex:
        # Any failure (connection error, timeout, 4xx/5xx) marks the URL as
        # broken; the exception text is the diagnostic.
        print(f"✗ {url}: {ex}")
        return False
    else:
        print(f"✓ {url}")
        return True


def main() -> None:
    """
    Check all URLs. We use HEAD requests, so that we don't download the whole file.

    Exits the process with status 1 if at least one URL failed.
    """
    all_url = get_all_urls()
    print(f"Checking {len(all_url)} URLs...")
    # The requests are I/O-bound, so a thread pool overlaps the waiting; the
    # context manager releases the worker threads once map() has returned.
    with ThreadPool(16) as pool:
        results = pool.map(check_url, all_url)
    if not all(results):
        num_failed = results.count(False)
        print(f"{num_failed} URLs failed.")
        # sys.exit instead of the site-provided exit() helper, which is not
        # guaranteed to exist (e.g. under ``python -S``).
        sys.exit(1)
    else:
        print("All URLs are valid.")


if __name__ == '__main__':
    main()