Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place. Commercial Alternative to JupyterHub.
Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place. Commercial Alternative to JupyterHub.
Path: blob/main/utils/convert_doc_to_notebooks.py
Views: 2535
import os
import re
import shutil

import nbformat

# Paths are set to work by invoking this script from the notebooks repo, presuming the transformers repo is in the
# same parent folder as the notebooks repo.
PATH_TO_DOCS = '../transformers/docs/source'
PATH_TO_DEST = 'transformers_doc'
DOC_BASE_URL = "https://huggingface.co/transformers/"

# These are the doc files converted, add any new tutorial to this list if you want it handled by the conversion
# script.
TUTORIAL_FILES = [
    "benchmarks.rst",
    "custom_datasets.rst",
    "multilingual.rst",
    "perplexity.rst",
    "preprocessing.rst",
    "quicktour.rst",
    "task_summary.rst",
    "tokenizer_summary.rst",
    "training.rst",
]

###################################
# Parsing the rst file            #
###################################

# Re pattern that catches markdown titles.
_re_title = re.compile(r"^#+\s+(\S+)")
# Re pattern that catches rst blocks of the form `.. block_name::`.
_re_block = re.compile(r"^\.\.\s+(\S+)::")
# Re pattern that catches what's after the :: in rst blocks of the form `.. block_name:: something`.
_re_block_lang = re.compile(r"^\.\.\s+\S+::\s*(\S+)(\s+|$)")
# Re pattern that catches section names like `.. _name:`.
_re_anchor_section = re.compile(r"^\.\.\s+_(\S+):")
# Re pattern that catches indentation at the start of a line.
_re_indent = re.compile(r"^(\s*)\S")


def split_blocks(lines):
    """Read the lines of a doc file and group them by blocks.

    Titles are expected to already be in markdown form (see `process_titles`). Everything before the first title
    is ignored (copyright header).

    Args:
        lines: The lines of the doc file, without their trailing newlines.

    Returns:
        A list of `(text, block_type)` tuples, in document order. `block_type` is `"title"`, `"prose"`,
        `"anchor"` or the name of an rst directive, followed by a space and its first argument when it has one
        (for instance `"code-block python"`). The list is empty when `lines` contains no title.
    """
    blocks = []
    block_type = None
    current_block = []
    i = 0

    def _move_to_next_non_empty_line(i):
        # Returns the index of the first non-empty line at or after `i` (`len(lines)` if there is none).
        while i < len(lines) and len(lines[i]) == 0:
            i += 1
        return i

    def _build_block(blocks, current_block, block_type):
        # Drop trailing empty lines; a block that only contains empty lines is not emitted at all.
        end = len(current_block)
        while end > 0 and len(current_block[end - 1]) == 0:
            end -= 1
        if end > 0:
            blocks.append(('\n'.join(current_block[:end]), block_type))
        return blocks, []

    # Ignore everything before the main title (copyright header). The bound check avoids running past the end of
    # a file that has no title.
    while i < len(lines) and _re_title.search(lines[i]) is None:
        i += 1

    while i < len(lines):
        line = lines[i]
        if _re_title.search(line) is not None:
            blocks, current_block = _build_block(blocks, current_block, "prose")
            blocks.append((line, "title"))
            i += 1
            i = _move_to_next_non_empty_line(i)
        elif _re_block.search(line) is not None:
            blocks, current_block = _build_block(blocks, current_block, "prose")
            block_type = _re_block.search(line).groups()[0]
            lang_match = _re_block_lang.search(line)
            if lang_match:
                block_type += " " + lang_match.groups()[0]
            i += 1
            i = _move_to_next_non_empty_line(i)
            # The directive may be the last non-empty line of the file, in which case it has no body.
            if i < len(lines):
                indent_match = _re_indent.search(lines[i])
                # A whitespace-only line has no measurable indentation: treat it as "no body".
                indent = indent_match.groups()[0] if indent_match is not None else ""
                if len(indent) > 0:
                    # The body is every following line that is empty or at least as indented as the first one.
                    while i < len(lines) and (lines[i].startswith(indent) or len(lines[i]) == 0):
                        current_block.append(lines[i])
                        i += 1
                    blocks, current_block = _build_block(blocks, current_block, block_type)
        elif _re_anchor_section.search(line):
            blocks, current_block = _build_block(blocks, current_block, "prose")
            blocks.append((line, "anchor"))
            i += 1
            i = _move_to_next_non_empty_line(i)
        else:
            current_block.append(line)
            i += 1
    blocks, current_block = _build_block(blocks, current_block, "prose")
    return blocks


###################################
# Text formatting and cleaning    #
###################################

def process_titles(lines):
    """Converts rst titles to markdown titles.

    A line made of a single repeated title character, at least as long as the line before it, is treated as an
    rst underline: it is dropped and the previous line is prefixed with `#`s. The level of a title character is
    the order in which that character was first used as an underline.

    Args:
        lines: The lines of the rst file.

    Returns:
        A new list of lines with the underlines removed and the titles in markdown form.
    """
    title_chars = """= - ` : ' " ~ ^ _ * + # < >""".split(" ")
    title_levels = {}
    new_lines = []
    for line in lines:
        if (
            len(new_lines) > 0
            and len(line) >= len(new_lines[-1])
            and len(set(line)) == 1
            and line[0] in title_chars
            and line != "::"
        ):
            char = line[0]
            # A character seen for the first time opens a new (deeper) title level.
            if char not in title_levels:
                title_levels[char] = len(title_levels) + 1
            level = title_levels[char]
            new_lines[-1] = f"{'#' * level} {new_lines[-1]}"
        else:
            new_lines.append(line)
    return new_lines


# Re pattern to catch things inside ` ` in :obj:`thing`.
_re_obj = re.compile(r":obj:`([^`]+)`")
# Re pattern to catch things inside ` ` in :math:`thing`.
_re_math = re.compile(r":math:`([^`]+)`")
# Re pattern to catch things between single backquotes.
_re_single_backquotes = re.compile(r"(^|[^`])`([^`]+)`([^`]|$)")
# Re pattern to catch things between stars.
_re_stars = re.compile(r"\*([^\*]+)\*")
# Re pattern to catch things between double backquotes.
_re_double_backquotes = re.compile(r"``([^`]+)``")
# Re pattern to catch things inside ` ` in :func/class/meth:`thing`.
_re_func_class = re.compile(r":(?:func|class|meth):`([^`]+)`")


def convert_rst_formatting(text):
    """Convert rst syntax for formatting to markdown in text.

    The substitutions are order-dependent: roles are first rewritten to double backquotes so that the later
    single-backquote (italic) conversion does not catch them.

    Args:
        text: The rst text to convert.

    Returns:
        The text with rst roles and inline markup replaced by their markdown counterparts.
    """
    # Remove :class:, :func: and :meth: markers. Simplify what's inside and put double backquotes
    # (to not be caught by the italic conversion).
    def _rep_func_class(match):
        name = match.groups()[0]
        splits = name.split('.')
        i = 0
        # Skip the leading module path: keep from the first component starting with an uppercase letter (or the
        # last component). `[:1]` keeps this safe on empty components such as in "a..b".
        while i < len(splits) - 1 and not splits[i][:1].isupper():
            i += 1
        return f"``{'.'.join(splits[i:])}``"

    text = _re_func_class.sub(_rep_func_class, text)
    # Remove :obj: markers. What's after is in a single backquotes so we put in double backquotes
    # (to not be caught by the italic conversion).
    text = _re_obj.sub(r"``\1``", text)
    # Remove :math: markers.
    text = _re_math.sub(r"$\1$", text)
    # Convert content in stars to bold
    text = _re_stars.sub(r'**\1**', text)
    # Convert content in single backquotes to italic.
    text = _re_single_backquotes.sub(r'\1*\2*\3', text)
    # Convert content in double backquotes to single backquotes.
    text = _re_double_backquotes.sub(r'`\1`', text)
    # Remove remaining ::
    text = re.sub(r"::\n", "", text)
    return text


# Re pattern to catch description and url in links of the form `description <url>`_.
_re_links = re.compile(r"`([^`]+\S)\s+</*([^/][^>`]*)>`_+")
# Re pattern to catch reference in links of the form :doc:`reference`.
_re_simple_doc = re.compile(r":doc:`([^`<]*)`")
# Re pattern to catch description and reference in links of the form :doc:`description <reference>`.
_re_doc_with_description = re.compile(r":doc:`([^`<]+\S)\s+</*([^/][^>`]*)>`")
# Re pattern to catch reference in links of the form :ref:`reference`.
_re_simple_ref = re.compile(r":ref:`([^`<]*)`")
# Re pattern to catch description and reference in links of the form :ref:`description <reference>`.
_re_ref_with_description = re.compile(r":ref:`([^`<]+\S)\s+<([^>]*)>`")


def convert_rst_links(text):
    """Convert the rst links in text to markdown.

    `:doc:` references and relative urls are resolved against `DOC_BASE_URL`; `:ref:` references become
    in-page `#anchor` links.

    Args:
        text: The rst text to convert.

    Returns:
        The text with rst links replaced by markdown `[description](url)` links.
    """
    # Links of the form :doc:`page`
    text = _re_simple_doc.sub(r'[\1](' + DOC_BASE_URL + r'\1.html)', text)
    # Links of the form :doc:`text <page>`
    text = _re_doc_with_description.sub(r'[\1](' + DOC_BASE_URL + r'\2.html)', text)
    # Refs of the form :ref:`page`
    text = _re_simple_ref.sub(r'[\1](#\1)', text)
    # Refs of the form :ref:`text <page>`
    text = _re_ref_with_description.sub(r'[\1](#\2)', text)

    # Other links
    def _rep_links(match):
        description, url = match.groups()
        if not url.startswith('http'):
            url = DOC_BASE_URL + url
        return f"[{description}]({url})"

    text = _re_links.sub(_rep_links, text)
    return text


###################################
# Notes, math and reference       #
###################################

def remove_indentation(text):
    """Remove the indentation found in the first line in text.

    The same number of leading characters is removed from every line. When the first line contains no
    non-whitespace character, its indentation cannot be measured and the text is returned unchanged.
    """
    lines = text.split("\n")
    indent_match = _re_indent.search(lines[0])
    indent = indent_match.groups()[0] if indent_match is not None else ""
    new_lines = [line[len(indent):] for line in lines]
    return "\n".join(new_lines)


# For now we just do **NOTE_TYPE:** text, maybe there is some clever html solution to have something nicer.
def convert_to_note(text, note_type):
    """Convert text to a note of note_type.

    The text is de-indented and rendered as a markdown block quote whose first line starts with the upper-cased
    `note_type` in bold.
    """
    text = remove_indentation(text)
    lines = text.split("\n")
    new_lines = [f"> **{note_type.upper()}:** {lines[0]}"]
    new_lines += [f"> {line}" for line in lines[1:]]
    return "\n".join(new_lines)


def convert_math(text):
    """Convert text to display mode LaTeX (de-indented and wrapped in `$$`)."""
    text = remove_indentation(text)
    return f"$${text}$$"


def convert_anchor(text):
    """Convert text to an anchor that can be used in the notebook.

    `text` must be an rst anchor line of the form `.. _name:`; the result is an empty html `<a>` tag whose id
    is `name`.
    """
    anchor_name = _re_anchor_section.search(text).groups()[0]
    return f"<a id='{anchor_name}'></a>"


###################################
# Images                          #
###################################

# Re pattern that catches rst directive options of the form `:key: value`.
_re_attr_rst = re.compile(r"^\s*:(\S+):\s*(\S.*)$")


def convert_image(image_name, text, pref=None, origin_folder=None, dest_folder=None):
    """Convert text to proper html code for image_name.

    Optionally copy image from origin_folder to dest_folder.

    Args:
        image_name: The path of the image, as written in the rst directive.
        text: The body of the directive; each `:key: value` line becomes an html attribute.
        pref: If provided, is joined in front of `image_name` to build the `src` attribute.
        origin_folder: If provided in conjunction with `dest_folder`, the image is copied from this folder.
        dest_folder: If provided in conjunction with `origin_folder`, the image is copied to this folder
            (unless the destination file already exists).

    Returns:
        The `<img .../>` html tag for the image.
    """
    # Copy the image if necessary
    if origin_folder is not None and dest_folder is not None:
        origin_file = os.path.join(origin_folder, image_name)
        dest_file = os.path.join(dest_folder, image_name)
        if not os.path.isfile(dest_file):
            dest_dir = os.path.dirname(dest_file)
            # `os.makedirs("")` raises, so only create the directory when the path has one.
            if dest_dir:
                os.makedirs(dest_dir, exist_ok=True)
            shutil.copy(origin_file, dest_file)
    attrs = {'src': image_name if pref is None else os.path.join(pref, image_name)}
    for line in text.split("\n"):
        attr_match = _re_attr_rst.search(line)
        if attr_match is not None:
            key, attr = attr_match.groups()
            attrs[key] = attr
    html = " ".join([f'{key}="{value}"' for key, value in attrs.items()])
    return f"<img {html}/>"


###################################
# Tables                          #
###################################

# Matches lines with a pattern of a table new line in rst.
_re_ignore_line_table = re.compile(r"^(\+[\-\s]+)+\+\s*$")
# Matches lines with a pattern of a table new line in rst, with a first column empty.
_re_ignore_line_table1 = re.compile(r"^\|\s+(\+[\-\s]+)+\+\s*$")
# Matches lines with a pattern of a first table line in rst.
_re_sep_line_table = re.compile(r"^(\+[=\s]+)+\+\s*$")


def convert_table(text):
    """Convert a table in text from rst to markdown.

    Row separator lines (`+---+---+`) are dropped and the header separator line (`+===+===+`) is turned into
    the markdown one (`|---|---|`). Every other line is kept as is.
    """
    lines = text.split("\n")
    new_lines = []
    for line in lines:
        if _re_ignore_line_table.search(line) is not None:
            continue
        if _re_ignore_line_table1.search(line) is not None:
            continue
        if _re_sep_line_table.search(line) is not None:
            line = line.replace('=', '-').replace('+', '|')
        new_lines.append(line)
    return "\n".join(new_lines)


###################################
# Code cleaning                   #
###################################

# Matches the pytorch code tag.
_re_pytorch = re.compile(r"## PYTORCH CODE")
# Matches the tensorflow code tag.
_re_tensorflow = re.compile(r"## TENSORFLOW CODE")


def split_frameworks(code):
    """Split code between the two frameworks (if it has two versions) with PyTorch first.

    Args:
        code: The content of a code block.

    Returns:
        `(code,)` when the block does not contain both framework tags (or cannot be split), otherwise
        `(pytorch_code, tensorflow_code)`. Each part still starts with its framework tag line.
    """
    if _re_pytorch.search(code) is None or _re_tensorflow.search(code) is None:
        return (code,)
    lines = code.split("\n")
    # The framework whose tag is on the first line comes first; the other tag marks the split point.
    is_pytorch_first = _re_pytorch.search(lines[0]) is not None
    re_split = _re_tensorflow if is_pytorch_first else _re_pytorch
    i = 1
    while i < len(lines) and re_split.search(lines[i]) is None:
        i += 1
    if i == len(lines):
        # Both tags sit on the first line: there is nothing to split on.
        return (code,)
    # Drop the empty lines that separate the two versions.
    j = i - 1
    while j >= 0 and len(lines[j]) == 0:
        j -= 1
    first, second = "\n".join(lines[:j + 1]), "\n".join(lines[i:])
    return (first, second) if is_pytorch_first else (second, first)


# Matches any doctest pattern.
_re_doctest = re.compile(r"^(>>>|\.\.\.)")


def parse_code_and_output(code):
    """Parse code to remove indentation, doctest prompts and split between source and theoretical output.

    Args:
        code: The content of a code block.

    Returns:
        A `(source, output)` tuple of strings. Once a doctest prompt has been seen, every non-empty line
        without a prompt is considered output.
    """
    lines = code.split("\n")
    # The indentation is measured on the first line that has content (the block may start with a blank line,
    # for instance after a framework tag was removed).
    indent = ""
    for line in lines:
        indent_match = _re_indent.search(line)
        if indent_match is not None:
            indent = indent_match.groups()[0]
            break
    has_doctest = False
    input_lines = []
    output_lines = []
    for line in lines:
        if len(line) > 0:
            line = line[len(indent):]
        if _re_doctest.search(line):
            has_doctest = True
            # Prompts are ">>> " or "... ": three characters and a space.
            line = line[4:]
            input_lines.append(line)
        elif has_doctest:
            if len(line) > 0:
                output_lines.append(line)
        else:
            input_lines.append(line)
    return "\n".join(input_lines), "\n".join(output_lines)


###################################
# All together!                   #
###################################

def markdown_cell(md):
    """Create a markdown cell with md inside."""
    return nbformat.notebooknode.NotebookNode({'cell_type': 'markdown', 'source': md, 'metadata': {}})


def code_cell(code, output=None):
    """Create a code cell with `code` and optionally, `output`.

    A `None` or empty `output` gives a cell without outputs; otherwise `output` is stored as the plain text of
    a single `execute_result` output.
    """
    if output is None or len(output) == 0:
        outputs = []
    else:
        outputs = [nbformat.notebooknode.NotebookNode({
            'data': {'text/plain': output},
            'execution_count': None,
            'metadata': {},
            'output_type': 'execute_result'
        })]
    return nbformat.notebooknode.NotebookNode(
        {'cell_type': 'code',
         'execution_count': None,
         'source': code,
         'metadata': {},
         'outputs': outputs})


def create_notebook(cells):
    """Create a notebook (nbformat 4.4) with `cells`."""
    return nbformat.notebooknode.NotebookNode(
        {'cells': cells,
         'metadata': {},
         'nbformat': 4,
         'nbformat_minor': 4,
         })


def rm_first_line(text):
    """Remove the first line in `text`."""
    return '\n'.join(text.split('\n')[1:])


# For the first cell of the notebook
INSTALL_CODE = """# Transformers installation
! pip install transformers datasets
# To install from source instead of the last release, comment the command above and uncomment the following one.
# ! pip install git+https://github.com/huggingface/transformers.git
"""


def convert_rst_file_to_notebook(
    rst_file,
    notebook_fname,
    framework=None,
    img_prefix=None,
    origin_folder=None,
    dest_folder=None
):
    r"""
    Convert rst_file to a notebook named notebook_fname.

    Args:
        - rst_file (:obj:`str`):
            The doc file to convert (in rst format).
        - notebook_fname (:obj:`str`):
            The output notebook file name (will be replaced if it exists).
        - framework (:obj:`str`, `optional`):
            If provided, must be :obj:`"pt"` or :obj:`"tf"`. In this case, only the PyTorch (resp. TensorFlow) version
            of the code is kept.
        - img_prefix (:obj:`str`, `optional`):
            If provided, will be inserted at the beginning of each image filename (in the `pytorch` or `tensorflow`
            folder, we need to add ../ to each image file to find them).
        - origin_folder (:obj:`str`, `optional`):
            If provided in conjunction with :obj:`dest_folder`, images encountered will be copied from this folder to
            :obj:`dest_folder`.
        - dest_folder (:obj:`str`, `optional`):
            If provided in conjunction with :obj:`origin_folder`, images encountered will be copied from
            :obj:`origin_folder` to this folder.
    """
    # The docs contain non-ASCII characters: read them as UTF-8 whatever the platform default encoding is.
    with open(rst_file, 'r', encoding="utf-8") as f:
        content = f.read()
    lines = content.split("\n")
    lines = process_titles(lines)
    blocks = split_blocks(lines)
    # The first cell of every notebook installs the library.
    cells = [code_cell(INSTALL_CODE)]
    for block, block_type in blocks:
        if block_type == 'title' or block_type == 'prose':
            block = convert_table(convert_rst_formatting(convert_rst_links(block)))
            cells.append(markdown_cell(block))
        elif block_type == 'anchor':
            block = convert_anchor(block)
            cells.append(markdown_cell(block))
        elif block_type.startswith('code-block'):
            codes = split_frameworks(block)
            # When a framework is requested, keep only its version and drop the framework tag line.
            if framework == 'pt' and len(codes) > 1:
                codes = (rm_first_line(codes[0]),)
            elif framework == 'tf' and len(codes) > 1:
                codes = (rm_first_line(codes[1]),)
            for code in codes:
                source, output = parse_code_and_output(code)
                if block_type.endswith('bash'):
                    # Shell commands are run from the notebook with the `!` magic; comments are left alone.
                    bash_lines = source.split("\n")
                    new_lines = [line if line.startswith("#") else f"! {line}" for line in bash_lines]
                    source = "\n".join(new_lines)
                cells.append(code_cell(source, output=output))
        elif block_type.startswith("image"):
            image_name = block_type[len("image "):]
            block = convert_image(
                image_name,
                block,
                pref=img_prefix,
                origin_folder=origin_folder,
                dest_folder=dest_folder
            )
            cells.append(markdown_cell(block))
        elif block_type == "math":
            block = convert_math(block)
            cells.append(markdown_cell(block))
        else:
            # Any other directive (note, warning, ...) is rendered as a quoted note named after the directive.
            block = convert_rst_formatting(convert_rst_links(block))
            block = convert_to_note(block, block_type)
            cells.append(markdown_cell(block))

    notebook = create_notebook(cells)
    nbformat.write(notebook, notebook_fname, version=4)


def convert_all_tutorials(path_to_docs=None, path_to_dest=None):
    """Convert all tutorials into notebooks.

    Each file of `TUTORIAL_FILES` gives three notebooks: a mixed one in `path_to_dest` (images are copied
    there) and one per framework in the `pytorch` and `tensorflow` subfolders.

    Args:
        path_to_docs: The folder containing the rst files (defaults to `PATH_TO_DOCS`).
        path_to_dest: The folder where the notebooks are written (defaults to `PATH_TO_DEST`).
    """
    path_to_docs = PATH_TO_DOCS if path_to_docs is None else path_to_docs
    path_to_dest = PATH_TO_DEST if path_to_dest is None else path_to_dest
    for folder in ["pytorch", "tensorflow"]:
        os.makedirs(os.path.join(path_to_dest, folder), exist_ok=True)
    for file in TUTORIAL_FILES:
        notebook_name = os.path.splitext(file)[0] + ".ipynb"
        doc_file = os.path.join(path_to_docs, file)
        notebook_file = os.path.join(path_to_dest, notebook_name)
        convert_rst_file_to_notebook(doc_file, notebook_file, origin_folder=path_to_docs, dest_folder=path_to_dest)
        for folder, framework in zip(["pytorch", "tensorflow"], ["pt", "tf"]):
            notebook_file = os.path.join(path_to_dest, folder, notebook_name)
            # Images live one level up from the framework folders, hence the ".." prefix.
            convert_rst_file_to_notebook(doc_file, notebook_file, framework=framework, img_prefix="..")


if __name__ == "__main__":
    convert_all_tutorials()