Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
ethen8181
GitHub Repository: ethen8181/machine-learning
Path: blob/master/convert_to_knowledge_repo.py
1461 views
1
"""
2
Examples
--------

Convert the existing Jupyter notebooks to the Airbnb knowledge repo format:
- python convert_to_knowledge_repo.py --ml_repo . --knowledge_repo knowledge-repo

Deploy the web app:
- knowledge_repo --repo knowledge-repo deploy
10
"""
11
import os
12
import re
13
import json
14
import subprocess
15
from dateutil import parser as date_parser
16
17
18
def main(ml_repo, knowledge_repo, inplace):
    """
    Entry point: convert every notebook under ``ml_repo`` and add it
    to the knowledge repo located at ``knowledge_repo``.

    Parameters
    ----------
    ml_repo : str
        Path to the root directory that is walked for notebooks.

    knowledge_repo : str
        Path to the knowledge repo; it is initialized first when the
        directory does not exist yet.

    inplace : bool
        Forwarded to the conversion; whether notebooks are modified in place.
    """
    source_root = os.path.abspath(ml_repo)
    target_root = os.path.abspath(knowledge_repo)

    # only bootstrap the knowledge repo the first time around
    repo_exists = os.path.isdir(target_root)
    if not repo_exists:
        init_knowledge_repo(target_root)

    convert_all_posts(source_root, target_root, inplace)
25
26
27
def init_knowledge_repo(path):
    """
    Initialize a new knowledge repo at the given location by running
    ``knowledge_repo --repo <path> init``.

    Parameters
    ----------
    path : str
        Directory in which the knowledge repo should be created.

    Returns
    -------
    returncode : int
        Exit status of the ``knowledge_repo`` command.

    Raises
    ------
    OSError
        If the ``knowledge_repo`` executable cannot be launched.
    """
    # pass the arguments as a list (no shell) so that paths containing
    # spaces or shell metacharacters reach the command unmodified
    cmd = ['knowledge_repo', '--repo', path, 'init']
    return subprocess.call(cmd)
30
31
32
def convert_all_posts(path, knowledge_repo_path, inplace):
    """
    Recursively walk down all directories to perform the conversion.

    Every file with the '.ipynb' extension is converted and added to the
    knowledge repo, except for the temporary copies whose file name ends
    with '-converted' (those are produced by a non-inplace conversion).
    A notebook that fails to convert is reported and skipped so the
    remaining notebooks are still processed.

    Parameters
    ----------
    path : str
        Directory to walk, or a single file to convert.

    knowledge_repo_path : str
        Path to the knowledge repo that receives the converted notebooks.

    inplace : bool
        Whether the notebooks are modified in place.
    """
    if os.path.isdir(path):
        # sorted so the notebooks are processed in a reproducible order
        for name in sorted(os.listdir(path)):
            convert_all_posts(os.path.join(path, name), knowledge_repo_path, inplace)
        return

    head, ext = os.path.splitext(path)
    if ext != '.ipynb':
        return

    # only look at the file name itself: a '-converted' somewhere in a
    # parent directory name must not exclude the notebooks beneath it
    if os.path.basename(head).endswith('-converted'):
        return

    try:
        converter = IpynbConverter(knowledge_repo_path, inplace)
        notebook = converter.convert(path)
        converter.add(notebook)
    except Exception as e:
        # best effort: report the failure and carry on with the next notebook
        print('Skipping: {}'.format(path))
        print(e)
49
50
51
class IpynbConverter:
    """
    Converts Jupyter notebook to airbnb knowledge repo format [1]_.

    Parameters
    ----------
    knowledge_repo_path : str
        Path to store the airbnb knowledge repo-ed notebook.

    inplace : bool
        Whether to perform the conversion inplace or not. If
        false, then it will create a new notebook that has the
        '-converted' appended to the file name.

    Attributes
    ----------
    date_created_ : str
        Input notebook's creation date.

    date_updated_ : str
        Input notebook's latest updated date.

    tags_ : str
        The notebook's filename is used as the tag in this automated
        conversion process. e.g. /Users/ethen/machine-learning/trees/decision_tree.ipynb,
        we would use 'decision_tree' as the tag.

    github_link_ : str
        Notebook's original link on github.

    title_ : str
        Notebook's title, uses the first level 1 markdown header that's not
        'Table of Contents' that could be automatically generated by newer
        version of notebook. e.g. # Decision Tree (Classification)\\n, then
        Decision Tree (Classification) would be our title. Falls back to
        the file name when the notebook has no such header.

    References
    ----------
    .. [1] `Airbnb knowledge repo
           <https://github.com/airbnb/knowledge-repo>`_
    """

    AUTHOR = 'Ethen Liu'
    DATE_FORMAT = '%Y-%m-%d'
    REPO_NAME = 'machine-learning'
    BASE_URL = 'https://github.com/ethen8181/'

    # a level 1 markdown header at the very start of a line; '.' does not
    # match a line break, so the group captures the rest of that line only
    _TITLE_PATTERN = re.compile(r'# (.*)')

    def __init__(self, knowledge_repo_path, inplace):
        self.inplace = inplace
        self.knowledge_repo_path = knowledge_repo_path

    def convert(self, path):
        """
        Convert the input path's notebook to a knowledge repo. This
        will add a mandatory raw cell that contains the yaml information
        needed by the knowledge repo and an additional cell that contains
        link to the notebook on github.

        Parameters
        ----------
        path : str
            Path that has the '.ipynb' extension.

        Returns
        -------
        notebook : dict
            Updated Jupyter notebook's raw json represented in dictionary format.
            Ready to be passed to the .add method to add to the knowledge repo.
        """
        self.date_created_ = self._date_created(path)
        self.date_updated_ = self._date_updated(path)
        self.tags_, self.github_link_ = self._tags_and_github_link(path)
        with open(path, encoding='utf-8') as f:
            notebook = json.load(f)

        self.title_ = self._title(notebook)

        # prepend the yaml header and the github link cell to notebook['cells']
        leading_cells = [self._construct_header(), self._construct_github_link_cell()]
        notebook['cells'] = leading_cells + notebook['cells']

        if not self.inplace:
            # write to a sibling '<name>-converted.ipynb' instead of the original
            head, ext = os.path.splitext(path)
            path = head + '-converted' + ext

        # remembered so that .add knows which file to write and register
        self._path = path
        return notebook

    def _date_created(self, path):
        """Grab the date of creation through git log."""
        cmd = ['git', 'log', '--diff-filter=A', '--follow',
               '--format=%cd', '-1', '--', path]
        return self._git_date_cmd(cmd)

    def _date_updated(self, path):
        """Grab the last date modified through git log."""
        cmd = ['git', 'log', '--format=%cd', '-1', '--', path]
        return self._git_date_cmd(cmd)

    def _git_date_cmd(self, cmd):
        """
        Run a command to retrieve and format a date string.

        Parameters
        ----------
        cmd : list[str] or str
            Command whose standard output is a single date. A list is
            executed directly; a plain string is still run through the shell.

        Returns
        -------
        formatted_date : str
            The date formatted according to ``DATE_FORMAT``.

        Raises
        ------
        ValueError
            If the command printed nothing, e.g. because the file has
            no git history.
        """
        raw_output = subprocess.check_output(cmd, shell=isinstance(cmd, str))
        date_str = raw_output.decode('utf-8').strip()
        if not date_str:
            # fail loudly instead of handing an empty string to the date parser
            raise ValueError('git returned no date, is the notebook tracked by git?')

        date_dt = date_parser.parse(date_str)
        return date_dt.strftime(self.DATE_FORMAT)

    def _tags_and_github_link(self, path):
        """
        Use file name as tags, e.g. /Users/ethen/machine-learning/trees/decision_tree.ipynb
        we would use 'decision_tree' as the tag. The part of the path after
        the repository name is appended to the github url of the master branch.

        Raises
        ------
        ValueError
            If ``REPO_NAME`` does not appear in the path.
        """
        # github urls always use forward slashes
        normalized = path.replace(os.sep, '/')

        # prefer the repository name as a complete directory name, this keeps
        # working when the name also shows up as part of another path component
        component = '/' + self.REPO_NAME + '/'
        index = normalized.find(component)
        if index >= 0:
            # keep the trailing '/' of the component as the start of file_path
            file_path = normalized[index + len(component) - 1:]
        else:
            _, found, file_path = normalized.partition(self.REPO_NAME)
            if not found:
                raise ValueError(
                    "'{}' is not located inside the '{}' repository".format(
                        path, self.REPO_NAME))

        tags, _ = os.path.splitext(os.path.basename(file_path))

        # /blob/master indicates github master branch
        link = self.BASE_URL + self.REPO_NAME + '/blob/master' + file_path
        return tags, link

    def _title(self, notebook):
        """
        A title in the notebook always starts with the '#' indicating a
        markdown level 1 header e.g. # Decision Tree (Classification)\\n
        thus we can just parse all the text in between the '#' and the line break.

        Returns the first such header that is not 'Table of Contents'; when
        the notebook has none, the file name stored in ``tags_`` is used.

        Raises
        ------
        ValueError
            If no title is found and ``tags_`` has not been set.
        """
        for cell in notebook['cells']:
            if cell['cell_type'] != 'markdown':
                continue

            # the notebook format stores the source either as one string
            # or as a list of lines; the title should be on the first line
            source = cell['source']
            if isinstance(source, str):
                first_line = source
            elif source:
                first_line = source[0]
            else:
                # empty markdown cell
                continue

            matched = self._TITLE_PATTERN.match(first_line)
            if matched is None:
                continue

            title = matched.group(1).strip()
            # newer version of notebooks includes a
            # Table of Contents automatically in the first
            # cell, skip that and find the next level 1 header
            if title != 'Table of Contents':
                return title

        fallback = getattr(self, 'tags_', None)
        if fallback is None:
            raise ValueError('notebook does not contain a level 1 markdown header')

        return fallback

    def _construct_header(self):
        """Create a knowledge repo style header as a dictionary."""

        def flatten_list(l):
            """
            Although not needed for the current version, we could
            have multiple tags and authors, in that case we would
            need to flatten them out.
            """
            flat = []
            for item in l:
                if isinstance(item, list):
                    flat.extend(item)
                else:
                    flat.append(item)

            return flat

        # header text required by the knowledge repo
        # a '- ' in front is required for knowledge repo tag
        header_text = flatten_list([
            '---',
            'title: {}'.format(self.title_),
            'authors:',
            '- {}'.format(self.AUTHOR),
            'tags:',
            '- ' + self.tags_,
            'created_at: {}'.format(self.date_created_),
            'updated_at: {}'.format(self.date_updated_),
            'tldr: Nothing for tldr section as of now.',
            '---'])

        # every line except for the last one ends with a line break
        source = [text + '\n' for text in header_text[:-1]] + [header_text[-1]]
        return {'cell_type': 'raw', 'metadata': {}, 'source': source}

    def _construct_github_link_cell(self):
        """Add a cell that contains link to original notebook on github"""
        github_link_cell = {
            'cell_type': 'markdown',
            'metadata': {},
            'source': ['Link to original notebook: {}'.format(self.github_link_)]}
        return github_link_cell

    def add(self, notebook):
        """
        Add the converted notebook to the knowledge repo.

        The notebook is written to the path determined by .convert and
        registered through the ``knowledge_repo add`` command. For a
        non-inplace conversion the written '-converted' copy is removed
        afterwards, also when the command could not be run.

        Parameters
        ----------
        notebook : dict
            Jupyter notebook's raw json represented in dictionary format.

        Raises
        ------
        OSError
            If the ``knowledge_repo`` executable cannot be launched.
        """
        with open(self._path, 'w', encoding='utf-8') as f:
            json.dump(notebook, f)

        # arguments are passed as a list (no shell) so that paths
        # containing spaces reach the command unmodified
        destination = os.path.join(self.knowledge_repo_path, 'project', self.tags_)
        cmd = ['knowledge_repo', '--repo', self.knowledge_repo_path,
               'add', self._path, '-p', destination]

        try:
            # communicate with the command's input to enable
            # continuation of the script execution
            p = subprocess.Popen(
                cmd, stdout=subprocess.PIPE, stdin=subprocess.PIPE,
                stderr=subprocess.STDOUT)
            output, _ = p.communicate(
                input=b'generated by automated airbnb knowledge repo setup')
            if p.returncode != 0:
                # surface the failure instead of silently dropping the notebook
                print('knowledge_repo add failed for: {}'.format(self._path))
                print(output.decode('utf-8', errors='replace'))
        finally:
            if not self.inplace:
                os.remove(self._path)
267
268
269
if __name__ == '__main__':
    import argparse

    # command line entry point, see the module docstring for example usage
    parser = argparse.ArgumentParser(
        description='Convert the machine-learning repository to an Airbnb Knowledge Repo.')

    # both paths are mandatory: without them main would fail while
    # resolving the paths instead of printing a usage message
    parser.add_argument(
        '--ml_repo', type=str, required=True,
        help='Path to the root directory of the machine-learning repo.')
    parser.add_argument(
        '--knowledge_repo', type=str, required=True,
        help='Path to the knowledge repo.')
    parser.add_argument(
        '--inplace', action='store_true', help='Modify the existing .ipynb in place.')

    # the argument names match main's parameter names
    args = vars(parser.parse_args())
    main(**args)
282
283