# Path: blob/master/convert_to_knowledge_repo.py
# 1461 views
"""
Examples
--------

Convert existing jupyter notebook to an airbnb knowledge repo format
- python convert_to_knowledge_repo.py --ml_repo . --knowledge_repo knowledge-repo

Deploying the webapp
- knowledge_repo --repo knowledge-repo deploy
"""
import os
import re
import json
import subprocess
from dateutil import parser as date_parser


def main(ml_repo, knowledge_repo, inplace):
    """
    Convert every notebook under ``ml_repo`` and add it to ``knowledge_repo``.

    Parameters
    ----------
    ml_repo : str
        Path to the root directory of the machine-learning repo.

    knowledge_repo : str
        Path to the knowledge repo; it is initialized first if the
        directory does not exist yet.

    inplace : bool
        Whether the notebooks are modified in place or converted through
        a temporary '-converted' copy.
    """
    ml_repo_path = os.path.abspath(ml_repo)
    knowledge_repo_path = os.path.abspath(knowledge_repo)
    if not os.path.isdir(knowledge_repo_path):
        init_knowledge_repo(knowledge_repo_path)

    convert_all_posts(ml_repo_path, knowledge_repo_path, inplace)


def init_knowledge_repo(path):
    """Run ``knowledge_repo --repo <path> init`` to create an empty knowledge repo."""
    # an argument list (no shell) keeps paths containing spaces or
    # shell metacharacters intact
    subprocess.call(['knowledge_repo', '--repo', path, 'init'])


def convert_all_posts(path, knowledge_repo_path, inplace):
    """Recursive walk down all directory to perform the conversion"""
    if os.path.isdir(path):
        for name in os.listdir(path):
            convert_all_posts(os.path.join(path, name), knowledge_repo_path, inplace)

    elif '-converted' not in path:
        # '-converted' marks the temporary copies this script creates itself
        _, ext = os.path.splitext(path)
        if ext == '.ipynb':
            try:
                converter = IpynbConverter(knowledge_repo_path, inplace)
                notebook = converter.convert(path)
                converter.add(notebook)
            except Exception as e:
                # deliberate best-effort: one broken notebook must not
                # stop the conversion of the remaining ones
                print('Skipping: {}'.format(path))
                print(e)


class IpynbConverter:
    """
    Converts Jupyter notebook to airbnb knowledge repo format [1]_.

    Parameters
    ----------
    knowledge_repo_path : str
        Path to store the airbnb knowledge repo-ed notebook.

    inplace : bool
        Whether to perform the conversion inplace or not. If
        false, then it will create a new notebook that has the
        '-converted' appended to the file name.

    Attributes
    ----------
    date_created_ : str
        Input notebook's creation date.

    date_updated_ : str
        Input notebook's latest updated date.

    tags_ : str
        The notebook's filename is used as the tag in this automated
        conversion process. e.g. /Users/ethen/machine-learning/trees/decision_tree.ipynb,
        we would use 'decision_tree' as the tag.

    github_link_ : str
        Notebook's original link on github.

    title_ : str
        Notebook's title, uses the first level 1 markdown header that's not
        'Table of Contents' that could be automatically generated by newer
        version of notebook. e.g. for the header line
        ``# Decision Tree (Classification)``, Decision Tree (Classification)
        would be our title.

    References
    ----------
    .. [1] `Airbnb knowledge repo
            <https://github.com/airbnb/knowledge-repo>`_
    """

    AUTHOR = 'Ethen Liu'
    DATE_FORMAT = '%Y-%m-%d'
    REPO_NAME = 'machine-learning'
    BASE_URL = 'https://github.com/ethen8181/'

    # a level 1 markdown header at the very start of a line; the trailing
    # line break is optional so a header that is the last (or only) line
    # of a cell is recognized as well
    _TITLE_PATTERN = re.compile(r'# (.*)')

    def __init__(self, knowledge_repo_path, inplace):
        self.inplace = inplace
        self.knowledge_repo_path = knowledge_repo_path

    def convert(self, path):
        """
        Convert the input path's notebook to a knowledge repo. This
        will add a mandatory raw cell that contains the yaml information
        needed by the knowledge repo and an additional cell that contains
        link to the notebook on github.

        Parameters
        ----------
        path : str
            Path that has the '.ipynb' extension.

        Returns
        -------
        notebook : dict
            Updated Jupyter notebook's raw json represented in dictionary format.
            Ready to be passed to the .add method to add to the knowledge repo.
        """
        self.date_created_ = self._date_created(path)
        self.date_updated_ = self._date_updated(path)
        self.tags_, self.github_link_ = self._tags_and_github_link(path)
        with open(path, encoding='utf-8') as f:
            notebook = json.load(f)

        self.title_ = self._title(notebook)

        # prepend the dictionary header to notebook['cells']
        notebook['cells'] = ([self._construct_header()] +
                             [self._construct_github_link_cell()] +
                             notebook['cells'])
        if not self.inplace:
            head, ext = os.path.splitext(path)
            path = head + '-converted' + ext

        # remembered for .add, which writes the notebook to this location
        self._path = path
        return notebook

    def _date_created(self, path):
        """Grab the date of creation through git log."""
        cmd = ['git', 'log', '--diff-filter=A', '--follow',
               '--format=%cd', '-1', '--', path]
        return self._git_date_cmd(cmd)

    def _date_updated(self, path):
        """Grab the last date modified through git log."""
        cmd = ['git', 'log', '--format=%cd', '-1', '--', path]
        return self._git_date_cmd(cmd)

    def _git_date_cmd(self, cmd):
        """
        Run a git command to retrieve and format its date output.

        Parameters
        ----------
        cmd : list of str or str
            Command to run. A list is executed directly; a plain string
            is still accepted and handed to the shell.

        Returns
        -------
        formatted_date : str
            The date printed by the command, formatted with DATE_FORMAT.

        Raises
        ------
        ValueError
            If the command printed no date, e.g. the file has no git history.
        """
        output = subprocess.check_output(cmd, shell=isinstance(cmd, str))
        date_str = output.decode('utf-8').strip()
        if not date_str:
            raise ValueError(
                'No date returned by {!r}; is the file tracked by git?'.format(cmd))

        date_dt = date_parser.parse(date_str)
        return date_dt.strftime(self.DATE_FORMAT)

    def _tags_and_github_link(self, path):
        """
        Use file name as tags, e.g. /Users/ethen/machine-learning/trees/decision_tree.ipynb
        we would use 'decision_tree' as the tag

        Raises
        ------
        ValueError
            If REPO_NAME does not appear anywhere in ``path``.
        """
        # everything after the first occurrence of the repo name is the
        # notebook's location relative to the repo root
        _, found, file_path = path.partition(self.REPO_NAME)
        if not found:
            raise ValueError(
                "'{}' is not located inside a '{}' directory".format(path, self.REPO_NAME))

        _, file_name = os.path.split(file_path)
        tags, _ = os.path.splitext(file_name)

        # /blob/master indicates github master branch; urls always use
        # forward slashes regardless of the local path separator
        link = (self.BASE_URL + self.REPO_NAME + '/blob/master' +
                file_path.replace(os.sep, '/'))
        return tags, link

    def _title(self, notebook):
        """
        A title in the notebook always starts with the '#' indicating a
        markdown level 1 header e.g. ``# Decision Tree (Classification)``,
        thus we can just parse all the text after the '# ' on that line.

        Parameters
        ----------
        notebook : dict
            Jupyter notebook's raw json represented in dictionary format.

        Returns
        -------
        title : str
            First level 1 header that is not 'Table of Contents';
            'Table of Contents' is only returned when it is the sole
            level 1 header in the notebook.

        Raises
        ------
        ValueError
            If no markdown cell starts with a level 1 header.
        """
        fallback = None
        for cell in notebook['cells']:
            if cell['cell_type'] != 'markdown':
                continue

            # the cell source is either a list of lines or one single
            # string, and it can be empty
            source = cell.get('source') or ['']
            if isinstance(source, str):
                first_line = source.splitlines()[0]
            else:
                first_line = source[0]

            # the title pattern should always appear in the first line
            matched = self._TITLE_PATTERN.match(first_line)
            if matched is None:
                continue

            title = matched.group(1).strip()
            # newer version of notebooks includes a
            # Table of Contents automatically in the first
            # cell, skip that and find the next level 1 header
            if title != 'Table of Contents':
                return title

            fallback = title

        if fallback is None:
            raise ValueError('Notebook has no level 1 markdown header to use as title')

        return fallback

    def _construct_header(self):
        """Create a knowledge repo style header as a dictionary."""

        def flatten_list(items):
            """
            Although not needed for the current version, we could
            have multiple tags and authors, in that case we would
            need to flatten them out.
            """
            flat = []
            for item in items:
                if isinstance(item, list):
                    flat += item
                else:
                    flat.append(item)

            return flat

        header = {'cell_type': 'raw', 'metadata': {}}

        # header text required by the knowledge repo
        # a '- ' in front is required for knowledge repo tag
        header_text = [
            '---',
            'title: {}'.format(self.title_),
            'authors:',
            '- {}'.format(self.AUTHOR),
            'tags:',
            '- ' + self.tags_,
            'created_at: {}'.format(self.date_created_),
            'updated_at: {}'.format(self.date_updated_),
            'tldr: Nothing for tldr section as of now.',
            '---']

        header_text = flatten_list(header_text)
        # every line but the last one ends with a line break
        header_text = [text + '\n' for text in header_text[:-1]] + [header_text[-1]]
        header['source'] = header_text
        return header

    def _construct_github_link_cell(self):
        """Add a cell that contains link to original notebook on github"""
        github_link_cell = {
            'cell_type': 'markdown',
            'metadata': {},
            'source': ['Link to original notebook: {}'.format(self.github_link_)]}
        return github_link_cell

    def add(self, notebook):
        """
        Add the converted notebook to the knowledge repo.

        Must be called after .convert, which determines where the
        notebook is written to.

        Parameters
        ----------
        notebook : dict
            Jupyter notebook's raw json represented in dictionary format.
        """
        with open(self._path, 'w', encoding='utf-8') as f:
            json.dump(notebook, f)

        # create a run knowledge repo command
        destination = os.path.join(self.knowledge_repo_path, 'project', self.tags_)
        cmd = ['knowledge_repo', '--repo', self.knowledge_repo_path,
               'add', self._path, '-p', destination]

        try:
            # communicate with the shell output to enable
            # continuation of the script execution
            p = subprocess.Popen(
                cmd, stdout=subprocess.PIPE, stdin=subprocess.PIPE,
                stderr=subprocess.STDOUT)
            p.communicate(input=b'generated by automated airbnb knowledge repo setup')
        finally:
            # the '-converted' copy is only a temporary artifact, clean it
            # up even when the knowledge_repo command could not be run
            if not self.inplace:
                os.remove(self._path)


if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(
        description='Convert the machine-learning repository to an Airbnb Knowledge Repo.')
    parser.add_argument(
        '--ml_repo', type=str, required=True,
        help='Path to the root directory of the machine-learning repo.')
    parser.add_argument(
        '--knowledge_repo', type=str, required=True,
        help='Path to the knowledge repo.')
    parser.add_argument(
        '--inplace', action='store_true', help='Modify the existing .ipynb in place.')
    args = vars(parser.parse_args())
    main(**args)