CoCalc -- convert_to_knowledge

GitHub Repository: ethen8181/machine-learning
Path: blob/master/convert_to_knowledge_repo.py
1461 views
1
"""
2
Examples
3
--------
4

5
Convert existing jupyter notebook to an airbnb knowledge repo format
6
- python convert_to_knowledge_repo.py --ml_repo . --knowledge_repo knowledge-repo
7

8
Deploying the webapp
9
- knowledge_repo --repo knowledge-repo deploy
10
"""
11
import os
12
import re
13
import json
14
import subprocess
15
from dateutil import parser as date_parser
16

17

18
def main(ml_repo, knowledge_repo, inplace):
    """
    Convert the notebooks found under ``ml_repo`` and add them to the
    knowledge repo located at ``knowledge_repo``.

    Parameters
    ----------
    ml_repo : str
        Path that is searched for notebooks; resolved to an absolute path.

    knowledge_repo : str
        Path to the knowledge repo; resolved to an absolute path. When no
        directory exists there, the repo is initialized first.

    inplace : bool
        Forwarded unchanged to the conversion step.
    """
    source_root = os.path.abspath(ml_repo)
    target_repo = os.path.abspath(knowledge_repo)

    # only bootstrap the knowledge repo when its directory is missing
    repo_exists = os.path.isdir(target_repo)
    if not repo_exists:
        init_knowledge_repo(target_repo)

    convert_all_posts(source_root, target_repo, inplace)
25

26

27
def init_knowledge_repo(path):
    """
    Initialize a new knowledge repo at ``path`` by running
    ``knowledge_repo --repo <path> init``.

    The command is passed as an argument list without going through a
    shell, so a path containing spaces or shell metacharacters is handed
    to the program as a single, literal argument.

    Parameters
    ----------
    path : str
        Location of the knowledge repo to create.

    Raises
    ------
    FileNotFoundError
        If the ``knowledge_repo`` executable cannot be found on the PATH.
    """
    cmd = ['knowledge_repo', '--repo', path, 'init']
    # the exit status is intentionally not inspected, as before
    subprocess.call(cmd)
30

31

32
def convert_all_posts(path, knowledge_repo_path, inplace):
    """
    Walk ``path`` recursively and convert every Jupyter notebook found.

    Directories are descended into; any other path is converted when it
    ends with the '.ipynb' extension and does not contain '-converted'
    anywhere in it. A notebook whose conversion raises is reported on
    stdout and skipped, so one bad notebook does not stop the walk.

    Parameters
    ----------
    path : str
        Directory or file to process.

    knowledge_repo_path : str
        Path to the knowledge repo, handed to the converter.

    inplace : bool
        Handed to the converter.
    """
    if os.path.isdir(path):
        for entry in os.listdir(path):
            child = os.path.join(path, entry)
            convert_all_posts(child, knowledge_repo_path, inplace)
        return

    # paths already marked as converted are left alone
    if '-converted' in path:
        return

    extension = os.path.splitext(path)[1]
    if extension != ".ipynb":
        return

    try:
        ipynb_converter = IpynbConverter(knowledge_repo_path, inplace)
        converted = ipynb_converter.convert(path)
        ipynb_converter.add(converted)
    except Exception as error:
        print('Skipping: {}'.format(path))
        print(error)
49

50

51
class IpynbConverter:
    """
    Converts Jupyter notebook to airbnb knowledge repo format [1]_.

    Parameters
    ----------
    knowledge_repo_path : str
        Path to store the airbnb knowledge repo-ed notebook.

    inplace : bool
        Whether to perform the conversion inplace or not. If
        false, then it will create a new notebook that has the
        '-converted' appended to the file name.

    Attributes
    ----------
    date_created_ : str
        Input notebook's creation date.

    date_updated_ : str
        Input notebook's latest updated date.

    tags_ : str
        The notebook's filename is used as the tag in this automated
        conversion process. e.g. /Users/ethen/machine-learning/trees/decision_tree.ipynb,
        we would use 'decision_tree' as the tag.

    github_link_ : str
        Notebook's original link on github.

    title_ : str
        Notebook's title, uses the first level 1 markdown header that's not
        'Table of Contents' that could be automatically generated by newer
        version of notebook. e.g. for the header line
        '# Decision Tree (Classification)', Decision Tree (Classification)
        would be our title. Falls back to ``tags_`` when the notebook has
        no usable header.

    References
    ----------
    .. [1] `Airbnb knowledge repo
            <https://github.com/airbnb/knowledge-repo>`_
    """

    AUTHOR = 'Ethen Liu'
    DATE_FORMAT = '%Y-%m-%d'
    REPO_NAME = 'machine-learning'
    BASE_URL = 'https://github.com/ethen8181/'

    def __init__(self, knowledge_repo_path, inplace):
        self.inplace = inplace
        self.knowledge_repo_path = knowledge_repo_path

    def convert(self, path):
        """
        Convert the input path's notebook to a knowledge repo. This
        will add a mandatory raw cell that contains the yaml information
        needed by the knowledge repo and an additional cell that contains
        link to the notebook on github.

        Parameters
        ----------
        path : str
            Path that has the '.ipynb' extension.

        Returns
        -------
        notebook : dict
            Updated Jupyter notebook's raw json represented in dictionary format.
            Ready to be passed to the .add method to add to the knowledge repo.
        """
        self.date_created_ = self._date_created(path)
        self.date_updated_ = self._date_updated(path)
        # tags_ must be set before _title, which uses it as the fallback title
        self.tags_, self.github_link_ = self._tags_and_github_link(path)
        with open(path, encoding='utf-8') as f:
            notebook = json.load(f)

        self.title_ = self._title(notebook)

        # prepend the dictionary header to notebook['cells']
        notebook['cells'] = ([self._construct_header()] +
                             [self._construct_github_link_cell()] +
                             notebook['cells'])
        if not self.inplace:
            head, ext = os.path.splitext(path)
            head += '-converted'
            path = head + ext

        # remembered so that .add knows where to write the notebook
        self._path = path
        return notebook

    def _date_created(self, path):
        """Grab the date of creation through git log."""
        # argument list instead of a shell string, so that paths with
        # spaces or shell metacharacters are passed through verbatim
        cmd = ['git', 'log', '--diff-filter=A', '--follow',
               '--format=%cd', '-1', '--', path]
        return self._git_date_cmd(cmd)

    def _date_updated(self, path):
        """Grab the last date modified through git log."""
        cmd = ['git', 'log', '--format=%cd', '-1', '--', path]
        return self._git_date_cmd(cmd)

    def _git_date_cmd(self, cmd):
        """
        Run a git command to retrieve and format its date output.

        Parameters
        ----------
        cmd : list[str] or str
            Command to run. A list is executed directly; a plain string
            is still accepted and executed through the shell.

        Returns
        -------
        formatted_date : str
            The date printed by the command, formatted with DATE_FORMAT.

        Raises
        ------
        ValueError
            If the command printed nothing, e.g. the file has no git history.
        """
        raw_output = subprocess.check_output(cmd, shell=isinstance(cmd, str))
        date_str = raw_output.decode('utf-8').strip()
        if not date_str:
            raise ValueError('git returned no date for command: {}'.format(cmd))

        date_dt = date_parser.parse(date_str)
        formatted_date = date_dt.strftime(self.DATE_FORMAT)
        return formatted_date

    def _tags_and_github_link(self, path):
        """
        Use file name as tags, e.g. /Users/ethen/machine-learning/trees/decision_tree.ipynb
        we would use 'decision_tree' as the tag

        Returns
        -------
        tags : str
            File name without its extension.

        link : str
            Link to the notebook on the github master branch.

        Raises
        ------
        ValueError
            If REPO_NAME does not appear in ``path``.
        """
        # split on the first occurrence only, a sub-directory or file name
        # that also contains the repo name must not break the unpacking
        parts = path.split(self.REPO_NAME, 1)
        if len(parts) != 2:
            raise ValueError(
                "'{}' is not inside a '{}' directory".format(path, self.REPO_NAME))

        file_path = parts[1]
        _, file_name = os.path.split(file_path)
        tags, _ = os.path.splitext(file_name)

        # /blob/master indicates github master branch; urls always use
        # forward slashes, whatever the local path separator is
        url_path = file_path.replace(os.sep, '/')
        link = self.BASE_URL + self.REPO_NAME + '/blob/master' + url_path
        return tags, link

    def _title(self, notebook):
        """
        A title in the notebook always starts with the '#' indicating a
        markdown level 1 header e.g. '# Decision Tree (Classification)',
        thus we can just parse the text after the '# ' on the first line
        of a markdown cell.

        Returns
        -------
        title : str
            First level 1 header that is not empty and not
            'Table of Contents'. When there is none, the file name based
            ``tags_`` is used instead.

        Raises
        ------
        ValueError
            If no header is found and ``tags_`` has not been set.
        """
        title_pattern = re.compile(r'# (.*)')
        for cell in notebook['cells']:
            if cell['cell_type'] != 'markdown':
                continue

            source = cell['source']
            # the notebook format allows the source to be either a single
            # string or a list of lines
            if isinstance(source, str):
                source = source.splitlines(True)

            # an empty markdown cell has no first line to inspect
            if not source:
                continue

            # the # title pattern should always appear in the first line
            matched = title_pattern.match(source[0])
            if matched is None:
                continue

            title = matched.group(1).strip()
            # newer version of notebooks includes a
            # Table of Contents automatically in the first
            # cell, skip that and find the next level 1 header
            if title and title != 'Table of Contents':
                return title

        fallback = getattr(self, 'tags_', None)
        if fallback is None:
            raise ValueError(
                'No level 1 markdown header found and no file name to fall back to.')

        return fallback

    def _construct_header(self):
        """Create a knowledge repo style header as a dictionary."""

        def flatten_list(items):
            """
            Although not needed for the current version, we could
            have multiple tags and authors, in that case we would
            need to flatten them out.
            """
            flat = []
            for item in items:
                if isinstance(item, list):
                    flat += item
                else:
                    flat.append(item)

            return flat

        header = {'cell_type': 'raw', 'metadata': {}}

        # header text required by the knowledge repo
        # a '- ' in front is required for knowledge repo tag
        # NOTE(review): the title is written unquoted, a title containing
        # ': ' may not parse as yaml -- confirm against the knowledge repo
        header_text = [
            '---',
            'title: {}'.format(self.title_),
            'authors:',
            '- {}'.format(self.AUTHOR),
            'tags:',
            '- ' + self.tags_,
            'created_at: {}'.format(self.date_created_),
            'updated_at: {}'.format(self.date_updated_),
            'tldr: Nothing for tldr section as of now.',
            '---']

        header_text = flatten_list(header_text)
        # every line but the last one is terminated by a line break
        header_text = [text + '\n' for text in header_text[:-1]] + [header_text[-1]]
        header['source'] = header_text
        return header

    def _construct_github_link_cell(self):
        """Add a cell that contains link to original notebook on github"""
        github_link_cell = {
            'cell_type': 'markdown',
            'metadata': {},
            'source': ['Link to original notebook: {}'.format(self.github_link_)]}
        return github_link_cell

    def add(self, notebook):
        """
        Add the converted notebook to the knowledge repo.

        The notebook is written to the path remembered by .convert and
        handed to ``knowledge_repo add``. When not converting inplace,
        the written file is removed afterwards, even if launching the
        command fails.

        Parameters
        ----------
        notebook : dict
            Jupyter notebook's raw json represented in dictionary format.
        """
        with open(self._path, 'w', encoding='utf-8') as f:
            json.dump(notebook, f)

        # create a run knowledge repo command, as an argument list so
        # that paths with spaces are passed through verbatim
        destination = os.path.join(self.knowledge_repo_path, 'project', self.tags_)
        cmd = ['knowledge_repo', '--repo', self.knowledge_repo_path,
               'add', self._path, '-p', destination]

        try:
            # communicate with the shell output to enable
            # continuation of the script execution
            # NOTE(review): the command's output and exit status are
            # discarded, so a failing add goes unnoticed -- confirm intent
            p = subprocess.Popen(
                cmd, stdout=subprocess.PIPE, stdin=subprocess.PIPE,
                stderr=subprocess.STDOUT)
            p.communicate(input=b'generated by automated airbnb knowledge repo setup')
        finally:
            if not self.inplace:
                os.remove(self._path)
267

268

269
if __name__ == '__main__':
    import argparse

    # command line interface: two paths plus an opt-in flag for
    # overwriting the notebooks instead of writing '-converted' copies
    cli = argparse.ArgumentParser(
        description='Convert the machine-learning repository to an Airbnb Knowledge Repo.')
    cli.add_argument(
        '--ml_repo',
        type=str,
        help='Path to the root directory of the machine-learning repo.')
    cli.add_argument(
        '--knowledge_repo',
        type=str,
        help='Path to the knowledge repo.')
    cli.add_argument(
        '--inplace',
        action='store_true',
        help='Modify the existing .ipynb in place.')

    # the option names match main's parameter names one to one
    options = cli.parse_args()
    main(**vars(options))
282

283
Product

Resources

Company