CoCalc -- papers

GitHub Repository: labmlai/annotated_deep_learning_paper_implementations
Path: blob/master/utils/papers_list.py
⁴⁹¹⁸ views
1
import json
2
import re
3
from pathlib import Path
4

5
from labml import logger
6
from labml.logger import Text
7

8
# Root of the package that is scanned for arXiv links. The path is resolved
# against the current working directory, so the script has to be launched from
# the directory that contains `labml_nn`.
HOME = Path('./labml_nn').absolute()
# Echo the resolved root so a wrong working directory is easy to spot.
print(HOME)
10

11
# Matches a parenthesised arXiv abstract URL, i.e. `(https://arxiv.org/abs/<id>)`,
# and captures the paper ID in the named group `id`.
# The closing parenthesis must directly follow the ID, so a URL carrying
# anything after the digits/dots (such as a version suffix) does not match.
REGEX = re.compile(r"""
 \(
 https://arxiv\.org/abs/  # arXiv abstract URL prefix
 (?P<id>[0-9\.]+)  # Paper ID
 \)
""", re.VERBOSE)
17

18
# Page paths (relative to HOME, with the `.py` file name mapped to `.html`)
# that `collect` skips entirely: no paper links are reported for these files.
IGNORE = {
    'neox/model.html',
    'transformers/index.html',
    'transformers/configs.html',
    'optimizers/noam.html',
    'transformers/basic/autoregressive_experiment.html',
    'transformers/xl/relative_mha.html',
    'capsule_networks/mnist.html',
    'transformers/rope/value_pe/index.html',
}
28

29
# arXiv IDs that `collect` drops wherever they are linked, so these papers
# never appear in the generated listing.
IGNORE_PAPERS = {
    '2002.04745',  # On Layer Normalization in the Transformer Architecture
    '1606.08415',  # Gaussian Error Linear Units (GELUs)
    '1710.10196',  # Progressive Growing of GANs for Improved Quality, Stability, and Variation
    '1904.11486',  # Making Convolutional Networks Shift-Invariant Again
    '1801.04406',  # Which Training Methods for GANs do actually Converge?
    '1812.04948',  # A Style-Based Generator Architecture for Generative Adversarial Networks
    '1705.10528',  # Constrained Policy Optimization
}
38

39

40
def collect(path: Path):
    """Recursively gather arXiv references from the Python sources under ``path``.

    A regular file is processed as follows:

    * files without a ``.py`` suffix produce nothing;
    * the file's path relative to ``HOME`` is mapped to a page path
      (``__init__.py`` becomes ``index.html``, ``name.py`` becomes
      ``name.html``); pages listed in ``IGNORE`` produce nothing;
    * the file text is scanned with ``REGEX`` and every distinct paper ID that
      is not in ``IGNORE_PAPERS`` yields one record.

    Any other path is treated as a directory whose children are collected
    recursively.

    :param path: file or directory located under ``HOME``.
    :return: list of ``{'url': <page path>, 'arxiv_id': <paper id>}`` dicts.
    """
    if not path.is_file():
        urls = []
        for child in path.iterdir():
            urls += collect(child)
        return urls

    source = path.relative_to(HOME)
    if source.suffix != '.py':
        return []

    page_name = 'index.html' if source.stem == '__init__' else f'{source.stem}.html'
    # Always use forward slashes: the value is compared against the
    # slash-separated entries of IGNORE and later becomes part of a web URL,
    # so the OS-specific separator from str() would be wrong on Windows.
    url = (source.parent / page_name).as_posix()
    if url in IGNORE:
        return []

    # Read with an explicit encoding so the result does not depend on the
    # platform's locale default.
    contents = path.read_text(encoding='utf-8')

    papers = set()
    for m in REGEX.finditer(contents):
        paper_id = m.group('id')
        if paper_id not in IGNORE_PAPERS:
            papers.add(paper_id)

    # A file that links several papers is worth surfacing for manual review.
    if len(papers) > 1:
        logger.log([(url, Text.key), ': ', str(papers)])

    # Sorted so the record order does not depend on set iteration order.
    return [{'url': url, 'arxiv_id': p} for p in sorted(papers)]
71

72

73
def main():
    """Build ``docs/papers.json``, mapping each arXiv ID to its page URLs.

    Collects the paper references from everything under ``HOME``, groups the
    page URLs (prefixed with ``https://nn.labml.ai/``) by arXiv ID, logs the
    number of distinct papers and writes the mapping as JSON to
    ``docs/papers.json`` next to ``HOME``.
    """
    papers = []
    for entry in HOME.iterdir():
        papers += collect(entry)

    # Directory iteration order is arbitrary, so sort on the URL as well as
    # the ID; otherwise the URL order inside each group could change between
    # runs or machines and produce spurious diffs in the generated file.
    papers.sort(key=lambda p: (p['arxiv_id'], p['url']))

    by_id = {}
    for p in papers:
        by_id.setdefault(p['arxiv_id'], []).append(f'''https://nn.labml.ai/{p['url']}''')

    logger.log([('Papers', Text.key), ': ', f'{len(by_id) :,}'])

    with open(str(HOME.parent / 'docs' / 'papers.json'), 'w') as f:
        f.write(json.dumps(by_id, indent=1))
90

91

92
# Run the generator only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
94

95
Product

Resources

Company