Path: blob/master/utils/papers_list.py
4918 views
import json
import re
from pathlib import Path
from typing import Dict, List

from labml import logger
from labml.logger import Text

# Root of the package whose sources are scanned; resolved against the
# current working directory at import time.
HOME = Path('./labml_nn').absolute()
print(HOME)

# Matches a parenthesised arXiv abstract link such as
# `(https://arxiv.org/abs/1706.03762)` and captures the paper id.
REGEX = re.compile(r"""
 \(
 https://arxiv\.org/abs/  # arXiv abstract URL prefix
 (?P<id>[0-9\.]+)  # Paper ID
 \)
""", re.VERBOSE)

# Page paths (relative to HOME, forward-slash separated) that are skipped
# entirely by `collect`.
IGNORE = {
    'neox/model.html',
    'transformers/index.html',
    'transformers/configs.html',
    'optimizers/noam.html',
    'transformers/basic/autoregressive_experiment.html',
    'transformers/xl/relative_mha.html',
    'capsule_networks/mnist.html',
    'transformers/rope/value_pe/index.html',
}

# arXiv ids that are dropped wherever they are found.
IGNORE_PAPERS = {
    '2002.04745',  # On Layer Normalization in the Transformer Architecture
    '1606.08415',  # Gaussian Error Linear Units (GELUs)
    '1710.10196',  # Progressive Growing of GANs for Improved Quality, Stability, and Variation
    '1904.11486',  # Making Convolutional Networks Shift-Invariant Again
    '1801.04406',  # Which Training Methods for GANs do actually Converge?
    '1812.04948',  # A Style-Based Generator Architecture for Generative Adversarial Networks
    '1705.10528',  # Constrained Policy Optimization
}


def collect(path: Path) -> List[Dict[str, str]]:
    """Recursively gather arXiv references from the Python files under *path*.

    For a `.py` file, the matching page path is derived (`__init__.py` maps to
    `index.html`, `name.py` maps to `name.html`, both relative to `HOME`), and
    every arXiv id matched by `REGEX` that is not in `IGNORE_PAPERS` is
    reported. Files whose page path is in `IGNORE`, and non-`.py` files,
    contribute nothing. For a directory, the results of all entries are
    concatenated.

    Returns:
        A list of `{'url': <page path>, 'arxiv_id': <id>}` dicts.
    """
    if path.is_file():
        html = path.relative_to(HOME)
        if html.suffix != '.py':
            return []

        if html.stem == '__init__':
            html = html.parent / 'index.html'
        else:
            html = html.parent / f'{html.stem}.html'

        # Use forward slashes regardless of platform: the value is compared
        # against the entries of IGNORE and later embedded in a web URL.
        url = html.as_posix()
        if url in IGNORE:
            return []

        # Explicit encoding so the result does not depend on the locale default.
        with open(str(path), 'r', encoding='utf-8') as f:
            contents = f.read()

        papers = {
            m.group('id')
            for m in REGEX.finditer(contents)
            if m.group('id') not in IGNORE_PAPERS
        }

        # Surface pages that reference more than one paper.
        if len(papers) > 1:
            logger.log([(url, Text.key), ': ', str(papers)])
        return [{'url': url, 'arxiv_id': p} for p in papers]

    # Neither a regular file nor a directory (e.g. a broken symlink):
    # nothing to scan, and `iterdir()` would raise.
    if not path.is_dir():
        return []

    urls = []
    for child in path.iterdir():
        urls += collect(child)

    return urls


def main():
    """Scan `HOME` and write the arXiv-id -> page-URL index as JSON.

    The output file is `docs/papers.json` in the parent directory of `HOME`.
    Each key is an arXiv id and each value is the list of
    `https://nn.labml.ai/...` page URLs that reference it.
    """
    papers = []
    for f in HOME.iterdir():
        papers += collect(f)

    # The secondary key on the URL keeps the output independent of set and
    # filesystem iteration order.
    papers.sort(key=lambda p: (p['arxiv_id'], p['url']))

    by_id = {}
    for p in papers:
        by_id.setdefault(p['arxiv_id'], []).append(f'''https://nn.labml.ai/{p['url']}''')

    logger.log([('Papers', Text.key), ': ', f'{len(by_id) :,}'])

    with open(str(HOME.parent / 'docs' / 'papers.json'), 'w', encoding='utf-8') as f:
        f.write(json.dumps(by_id, indent=1))


if __name__ == '__main__':
    main()