CoCalc -- convert_doc_to

Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place. Commercial Alternative to JupyterHub.

GitHub Repository: huggingface/notebooks
Path: blob/main/utils/convert_doc_to_notebooks.py
Views: 2535
1
import nbformat
2
import os
3
import re
4
import shutil
5

6
# Default locations assume this script is invoked from the root of the notebooks repo and that the transformers
# repo sits next to it (both repos share the same parent folder).
PATH_TO_DOCS = "../transformers/docs/source"
PATH_TO_DEST = "transformers_doc"
DOC_BASE_URL = "https://huggingface.co/transformers/"

# Doc files handled by the conversion script. To get a new tutorial converted, add its file name to this list.
TUTORIAL_FILES = [
    "benchmarks.rst",
    "custom_datasets.rst",
    "multilingual.rst",
    "perplexity.rst",
    "preprocessing.rst",
    "quicktour.rst",
    "task_summary.rst",
    "tokenizer_summary.rst",
    "training.rst",
]
25

26
###################################
27
# Parsing the rst file            #
28
###################################
29

30
# Re pattern that catches markdown titles.
_re_title = re.compile(r"^#+\s+(\S+)")
# Re pattern that catches rst blocks of the form `.. block_name::`.
_re_block = re.compile(r"^\.\.\s+(\S+)::")
# Re pattern that catches what's after the :: in rst blocks of the form `.. block_name:: something`.
_re_block_lang = re.compile(r"^\.\.\s+\S+::\s*(\S+)(\s+|$)")
# Re pattern that catches section names like `.. _name:`.
_re_anchor_section = re.compile(r"^\.\.\s+_(\S+):")
# Re pattern that catches indentation at the start of a line.
_re_indent = re.compile(r"^(\s*)\S")


def split_blocks(lines):
    """
    Read the lines of a doc file and group them by blocks.

    Args:
        - lines (list of str):
            The lines of the document, with titles already in markdown form (`# Title`).

    Returns:
        A list of `(text, block_type)` tuples in document order. `block_type` is `"title"`, `"prose"`, `"anchor"`
        or, for an rst directive, the directive name optionally followed by a space and the first word found
        after the `::` (for instance `"code-block python"`). The text of a directive block is its indented body,
        indentation included; a directive without an indented body produces no block.

    Everything before the first title is ignored. If there is no title at all, the result is an empty list.
    """
    blocks = []
    prose = []
    n_lines = len(lines)

    def _skip_empty_lines(idx):
        # Index of the first line at or after idx that is not the empty string.
        while idx < n_lines and len(lines[idx]) == 0:
            idx += 1
        return idx

    def _flush(buffer, kind):
        # Emit the buffered lines (minus trailing empty lines) as one block of the given kind, then reset it.
        end = len(buffer)
        while end > 0 and len(buffer[end - 1]) == 0:
            end -= 1
        if end > 0:
            blocks.append(("\n".join(buffer[:end]), kind))
        buffer.clear()

    # Ignore everything before the main title (copyright header). The bound check keeps a file without any
    # title from running past the end of `lines`.
    i = 0
    while i < n_lines and _re_title.search(lines[i]) is None:
        i += 1

    while i < n_lines:
        line = lines[i]
        if _re_title.search(line) is not None:
            _flush(prose, "prose")
            blocks.append((line, "title"))
            i = _skip_empty_lines(i + 1)
        elif _re_block.search(line) is not None:
            _flush(prose, "prose")
            block_type = _re_block.search(line).group(1)
            argument = _re_block_lang.search(line)
            if argument is not None:
                block_type += " " + argument.group(1)
            i += 1
            # Look for the first line of the body. Whitespace-only lines are skipped as well, since they carry
            # no indentation information.
            while i < n_lines and not lines[i].strip():
                i += 1
            body = []
            # The directive may be the last thing in the file, in which case there is no body to read.
            if i < n_lines:
                indent = _re_indent.search(lines[i]).group(1)
                if indent:
                    while i < n_lines and (lines[i].startswith(indent) or len(lines[i]) == 0):
                        body.append(lines[i])
                        i += 1
            _flush(body, block_type)
        elif _re_anchor_section.search(line) is not None:
            _flush(prose, "prose")
            blocks.append((line, "anchor"))
            i = _skip_empty_lines(i + 1)
        else:
            prose.append(line)
            i += 1
    _flush(prose, "prose")
    return blocks
95

96

97
###################################
98
# Text formatting and cleaning    #
99
###################################
100

101
def process_titles(lines):
    """
    Converts rst titles to markdown titles.

    A line made of a single repeated punctuation character, at least as long as the (non-blank) line right
    before it, is treated as a title underline: it is dropped and the line before it gets a `#` prefix. The
    number of `#` is decided by the order in which underline characters are first met (first character seen is
    level 1, the second is level 2, and so on).

    Args:
        - lines (list of str): The lines of the rst document.

    Returns:
        A new list of lines with the titles converted and the underlines removed.
    """
    title_chars = frozenset("""=-`:'"~^_*+#<>""")
    title_levels = {}
    new_lines = []
    for line in lines:
        if (
            new_lines
            # An underline needs some title text above it: a punctuation line after a blank line is left alone
            # instead of turning the blank line into an empty heading.
            and new_lines[-1].strip()
            and len(line) >= len(new_lines[-1])
            and len(set(line)) == 1
            and line[0] in title_chars
            and line != "::"
        ):
            # Register the level the first time a character is used, reuse it afterwards.
            level = title_levels.setdefault(line[0], len(title_levels) + 1)
            new_lines[-1] = f"{'#' * level} {new_lines[-1]}"
        else:
            new_lines.append(line)
    return new_lines
116

117

118
# Re pattern to catch things inside ` ` in :obj:`thing`.
_re_obj = re.compile(r":obj:`([^`]+)`")
# Re pattern to catch things inside ` ` in :math:`thing`.
_re_math = re.compile(r":math:`([^`]+)`")
# Re pattern to catch things between single backquotes.
_re_single_backquotes = re.compile(r"(^|[^`])`([^`]+)`([^`]|$)")
# Re pattern to catch things between stars.
_re_stars = re.compile(r"\*([^\*]+)\*")
# Re pattern to catch things between double backquotes.
_re_double_backquotes = re.compile(r"``([^`]+)``")
# Re pattern to catch things inside ` ` in :func/class/meth:`thing`.
_re_func_class = re.compile(r":(?:func|class|meth):`([^`]+)`")


def convert_rst_formatting(text):
    """
    Convert rst syntax for formatting to markdown in text.

    The substitutions are applied in a fixed order, because each one relies on what the previous ones left
    behind (roles are first rewritten with double backquotes so the italic conversion does not touch them).

    Args:
        - text (str): The rst text to convert.

    Returns:
        The text with roles, emphasis and inline literals rewritten in markdown, and `::` at the end of a line
        removed together with the newline that follows it.
    """
    def _rep_func_class(match):
        # Keep the dotted name from its first capitalized component on, or only the last component when none is
        # capitalized. The `[:1]` slice tolerates empty components, as in :class:`.Foo`.
        parts = match.group(1).split('.')
        start = 0
        while start < len(parts) - 1 and not parts[start][:1].isupper():
            start += 1
        return f"``{'.'.join(parts[start:])}``"

    # Remove :class:, :func: and :meth: markers. Simplify what's inside and put double backquotes
    # (to not be caught by the italic conversion).
    text = _re_func_class.sub(_rep_func_class, text)
    # Remove :obj: markers. What's after is in single backquotes so we put it in double backquotes
    # (to not be caught by the italic conversion).
    text = _re_obj.sub(r"``\1``", text)
    # Remove :math: markers.
    text = _re_math.sub(r"$\1$", text)
    # Convert content in stars to bold.
    text = _re_stars.sub(r'**\1**', text)
    # Convert content in single backquotes to italic.
    text = _re_single_backquotes.sub(r'\1*\2*\3', text)
    # Convert content in double backquotes to single backquotes.
    text = _re_double_backquotes.sub(r'`\1`', text)
    # Remove remaining ::
    text = re.sub(r"::\n", "", text)
    return text
157

158
# Re pattern for external links written as `description <url>`_ (captures the description and the url).
_re_links = re.compile(r"`([^`]+\S)\s+</*([^/][^>`]*)>`_+")
# Re pattern for doc links written as :doc:`reference` (captures the reference).
_re_simple_doc = re.compile(r":doc:`([^`<]*)`")
# Re pattern for doc links written as :doc:`description <reference>` (captures both parts).
_re_doc_with_description = re.compile(r":doc:`([^`<]+\S)\s+</*([^/][^>`]*)>`")
# Re pattern for refs written as :ref:`reference` (captures the reference).
_re_simple_ref = re.compile(r":ref:`([^`<]*)`")
# Re pattern for refs written as :ref:`description <reference>` (captures both parts).
_re_ref_with_description = re.compile(r":ref:`([^`<]+\S)\s+<([^>]*)>`")


def convert_rst_links(text):
    """ Rewrite the rst links found in text as markdown links."""
    def _external_link(match):
        # Targets that do not start with http are resolved against the documentation base url.
        label, target = match.groups()
        if not target.startswith('http'):
            target = DOC_BASE_URL + target
        return f"[{label}]({target})"

    # Applied in this order: :doc:`page`, :doc:`text <page>`, :ref:`page`, :ref:`text <page>`, then other links.
    substitutions = (
        (_re_simple_doc, r'[\1](' + DOC_BASE_URL + r'\1.html)'),
        (_re_doc_with_description, r'[\1](' + DOC_BASE_URL + r'\2.html)'),
        (_re_simple_ref, r'[\1](#\1)'),
        (_re_ref_with_description, r'[\1](#\2)'),
        (_re_links, _external_link),
    )
    for pattern, replacement in substitutions:
        text = pattern.sub(replacement, text)
    return text
188

189

190
###################################
191
# Notes, math and reference       #
192
###################################
193

194
def remove_indentation(text):
    """
    Remove the indentation found in the first line in text.

    The leading whitespace of the first line is measured and that many characters are cut from the start of
    every line. When the first line is empty or only contains whitespace, there is no indentation to measure
    and the text is returned unchanged.

    Args:
        - text (str): The (possibly multi-line) text to dedent.

    Returns:
        The dedented text.
    """
    lines = text.split("\n")
    first_line = lines[0]
    unindented = first_line.lstrip()
    if not unindented:
        return text
    width = len(first_line) - len(unindented)
    return "\n".join(line[width:] for line in lines)
200

201

202
# Notes are rendered as a blockquote starting with **NOTE_TYPE:**, there may be a nicer html-based rendering.
def convert_to_note(text, note_type):
    """ Turn text into a markdown blockquote introduced by the upper-cased note_type."""
    first, *others = remove_indentation(text).split("\n")
    quoted = [f"> **{note_type.upper()}:** {first}"]
    quoted.extend(f"> {line}" for line in others)
    return "\n".join(quoted)
210

211

212
def convert_math(text):
    """ Wrap the dedented text in `$$` so it is rendered as display mode LaTeX."""
    formula = remove_indentation(text)
    return f"$${formula}$$"
216

217

218
def convert_anchor(text):
    """ Build the html anchor tag matching the rst section label (`.. _name:`) found in text."""
    match = _re_anchor_section.search(text)
    anchor_name = match.group(1)
    return f"<a id='{anchor_name}'></a>"
222

223

224
###################################
225
# Images                          #
226
###################################
227

228
# Re pattern that catches rst option lines of the form `:key: value`.
_re_attr_rst = re.compile(r"^\s*:(\S+):\s*(\S.*)$")


def convert_image(image_name, text, pref=None, origin_folder=None, dest_folder=None):
    """ Build the html `<img>` tag for image_name, using the `:key: value` lines of text as attributes.
    When both origin_folder and dest_folder are given, the image is copied from the former to the latter
    unless the destination file already exists."""
    if origin_folder is not None and dest_folder is not None:
        target = os.path.join(dest_folder, image_name)
        if not os.path.isfile(target):
            os.makedirs(os.path.dirname(target), exist_ok=True)
            shutil.copy(os.path.join(origin_folder, image_name), target)

    # The source comes first; an option line can still override any attribute, `src` included.
    if pref is None:
        source = image_name
    else:
        source = os.path.join(pref, image_name)
    attrs = {'src': source}
    for match in map(_re_attr_rst.search, text.split("\n")):
        if match is None:
            continue
        key, value = match.groups()
        attrs[key] = value

    rendered = " ".join(f'{key}="{value}"' for key, value in attrs.items())
    return f"<img {rendered}/>"
248

249

250
###################################
251
# Tables                          #
252
###################################
253

254
# The three patterns below are raw strings: `\+`, `\-`, `\s` and `\|` are regex escapes, not string escapes.
# Matches lines with a pattern of a table new line in rst.
_re_ignore_line_table = re.compile(r"^(\+[\-\s]+)+\+\s*$")
# Matches lines with a pattern of a table new line in rst, with a first column empty.
_re_ignore_line_table1 = re.compile(r"^\|\s+(\+[\-\s]+)+\+\s*$")
# Matches lines with a pattern of a first table line in rst.
_re_sep_line_table = re.compile(r"^(\+[=\s]+)+\+\s*$")


def convert_table(text):
    """
    Convert a table in text from rst to markdown.

    Row border lines (`+----+----+`, possibly after an empty first column) are dropped and the header
    separator (`+====+====+`) becomes a markdown one (`|----|----|`). Every other line is kept as is, so text
    without any table goes through unchanged.

    Args:
        - text (str): The text that may contain rst grid tables.

    Returns:
        The text with the tables rewritten.
    """
    new_lines = []
    for line in text.split("\n"):
        is_border = (
            _re_ignore_line_table.search(line) is not None
            or _re_ignore_line_table1.search(line) is not None
        )
        if is_border:
            continue
        if _re_sep_line_table.search(line) is not None:
            line = line.replace('=', '-').replace('+', '|')
        new_lines.append(line)
    return "\n".join(new_lines)
275

276

277
###################################
278
# Code cleaning                   #
279
###################################
280

281
# Tag that introduces the PyTorch version of a code sample.
_re_pytorch = re.compile(r"## PYTORCH CODE")
# Tag that introduces the TensorFlow version of a code sample.
_re_tensorflow = re.compile(r"## TENSORFLOW CODE")


def split_frameworks(code):
    """ Separate the PyTorch and TensorFlow versions of code when it contains both tags.

    Returns a one-element tuple with code untouched when a tag is missing, otherwise a pair with the PyTorch
    part first."""
    has_both_tags = _re_pytorch.search(code) is not None and _re_tensorflow.search(code) is not None
    if not has_both_tags:
        return (code,)

    lines = code.split("\n")
    pytorch_leads = _re_pytorch.search(lines[0]) is not None
    # The cut happens at the tag of whichever framework does not open the sample.
    second_tag = _re_tensorflow if pytorch_leads else _re_pytorch
    cut = 1
    while second_tag.search(lines[cut]) is None:
        cut += 1
    # Leave out the empty lines sitting right before the second tag.
    last = cut - 1
    while len(lines[last]) == 0:
        last -= 1

    head = "\n".join(lines[:last + 1])
    tail = "\n".join(lines[cut:])
    if pytorch_leads:
        return (head, tail)
    return (tail, head)
301

302
# Matches any doctest pattern.
_re_doctest = re.compile(r"^(>>>|\.\.\.)")


def parse_code_and_output(code):
    """
    Parse code to remove indentation, doctest prompts and split between source and theoretical output.

    The indentation is measured on the first line that is not blank, then that many characters are cut from the
    start of every non-empty line. Lines starting with a doctest prompt (`>>>` or `...`) go to the source with
    the prompt removed. Once a prompt has been seen, the other non-empty lines are considered output; before
    that, every line is considered source.

    Args:
        - code (str): The content of a code block.

    Returns:
        A `(source, output)` tuple of strings, `output` being empty when no output line was found.
    """
    lines = code.split("\n")
    # The first line can be blank (for instance once a framework tag line has been removed), so the indentation
    # is taken from the first line that has some content.
    reference = next((line for line in lines if line.strip()), "")
    indent_width = len(reference) - len(reference.lstrip())

    has_doctest = False
    input_lines = []
    output_lines = []
    for line in lines:
        if len(line) > 0:
            line = line[indent_width:]
        if _re_doctest.search(line):
            has_doctest = True
            # Drop the three-character prompt and the space that follows it.
            input_lines.append(line[4:])
        elif has_doctest:
            if len(line) > 0:
                output_lines.append(line)
        else:
            input_lines.append(line)
    return "\n".join(input_lines), "\n".join(output_lines)
326

327

328
###################################
329
# All together!                   #
330
###################################
331

332
def markdown_cell(md):
    """ Build a markdown notebook cell whose source is md."""
    cell = {'cell_type': 'markdown', 'source': md, 'metadata': {}}
    return nbformat.notebooknode.NotebookNode(cell)
335

336

337
def code_cell(code, output=None):
    """ Build a code notebook cell holding `code`, with `output` as its plain-text result when it is not empty."""
    outputs = []
    if output is not None and len(output) != 0:
        result = {
            'data': {'text/plain': output},
            'execution_count': None,
            'metadata': {},
            'output_type': 'execute_result'
        }
        outputs.append(nbformat.notebooknode.NotebookNode(result))
    cell = {
        'cell_type': 'code',
        'execution_count': None,
        'source': code,
        'metadata': {},
        'outputs': outputs,
    }
    return nbformat.notebooknode.NotebookNode(cell)
354

355

356
def create_notebook(cells):
    """ Build a notebook node (format 4.4, empty metadata) containing `cells`."""
    notebook = {
        'cells': cells,
        'metadata': {},
        'nbformat': 4,
        'nbformat_minor': 4,
    }
    return nbformat.notebooknode.NotebookNode(notebook)
364

365

366
def rm_first_line(text):
    """ Return `text` without its first line (an empty string when `text` has a single line)."""
    _first, _newline, remainder = text.partition('\n')
    return remainder
369

370

371
# For the first cell of the notebook
372
INSTALL_CODE = """# Transformers installation
373
! pip install transformers datasets
374
# To install from source instead of the last release, comment the command above and uncomment the following one.
375
# ! pip install git+https://github.com/huggingface/transformers.git
376
"""
377

378

379
def convert_rst_file_to_notebook(
    rst_file,
    notebook_fname,
    framework=None,
    img_prefix=None,
    origin_folder=None,
    dest_folder=None
):
    r"""
    Convert rst_file to a notebook named notebook_fname.

    The first cell of the notebook is always the installation code cell. Each block of the document then
    becomes one cell (or several for a code block holding two framework versions): titles, prose, anchors,
    images and math become markdown cells, code blocks become code cells (each line of a bash block that is
    not a comment gets a `! ` prefix) and any other directive is rendered as a note.

    Args:
        - rst_file (:obj:`str`):
            The doc file to convert (in rst format). It is read as UTF-8.
        - notebook_fname (:obj:`str`):
            The output notebook file name (will be replaced if it exists).
        - framework (:obj:`str`, `optional`):
            If provided, must be :obj:`"pt"` or :obj:`"tf"`. In this case, only the PyTorch (resp. TensorFlow) version
            of the code is kept.
        - img_prefix (:obj:`str`, `optional`):
            If provided, will be inserted at the beginning of each image filename (in the `pytorch` or `tensorflow`
            folder, we need to add ../ to each image file to find them).
        - origin_folder (:obj:`str`, `optional`):
            If provided in conjunction with :obj:`dest_folder`, images encountered will be copied from this folder to
            :obj:`dest_folder`.
        - dest_folder (:obj:`str`, `optional`):
            If provided in conjunction with :obj:`origin_folder`, images encountered will be copied from
            :obj:`origin_folder` to this folder.
    """
    # Read as UTF-8 explicitly so the result does not depend on the default encoding of the platform.
    with open(rst_file, 'r', encoding='utf-8') as f:
        content = f.read()
    lines = process_titles(content.split("\n"))
    blocks = split_blocks(lines)

    cells = [code_cell(INSTALL_CODE)]
    for block, block_type in blocks:
        if block_type == 'title' or block_type == 'prose':
            block = convert_table(convert_rst_formatting(convert_rst_links(block)))
            cells.append(markdown_cell(block))
        elif block_type == 'anchor':
            block = convert_anchor(block)
            cells.append(markdown_cell(block))
        elif block_type.startswith('code-block'):
            codes = split_frameworks(block)
            # When a framework is requested, keep only its version and drop the tag line that opens it.
            if framework == 'pt' and len(codes) > 1:
                codes = (rm_first_line(codes[0]),)
            elif framework == 'tf' and len(codes) > 1:
                codes = (rm_first_line(codes[1]),)
            for code in codes:
                source, output = parse_code_and_output(code)
                if block_type.endswith('bash'):
                    # Shell commands are run from the notebook with a `!` prefix, comments are left alone.
                    source_lines = source.split("\n")
                    source = "\n".join(
                        line if line.startswith("#") else f"! {line}" for line in source_lines
                    )
                cells.append(code_cell(source, output=output))
        elif block_type.startswith("image"):
            # The block type of an image is `image <file name>`.
            image_name = block_type[len("image "):]
            block = convert_image(
                image_name,
                block,
                pref=img_prefix,
                origin_folder=origin_folder,
                dest_folder=dest_folder
            )
            cells.append(markdown_cell(block))
        elif block_type == "math":
            block = convert_math(block)
            cells.append(markdown_cell(block))
        else:
            # Any other directive is rendered as a note named after the directive.
            block = convert_rst_formatting(convert_rst_links(block))
            block = convert_to_note(block, block_type)
            cells.append(markdown_cell(block))

    notebook = create_notebook(cells)
    nbformat.write(notebook, notebook_fname, version=4)
454

455

456
def convert_all_tutorials(path_to_docs=None, path_to_dest=None):
    """ Convert every file of `TUTORIAL_FILES` into three notebooks: a generic one in `path_to_dest` and one per
    framework in its `pytorch` and `tensorflow` subfolders. Both paths fall back to the module defaults."""
    if path_to_docs is None:
        path_to_docs = PATH_TO_DOCS
    if path_to_dest is None:
        path_to_dest = PATH_TO_DEST

    # Subfolder name paired with the framework code expected by the converter.
    framework_folders = (("pytorch", "pt"), ("tensorflow", "tf"))
    for folder, _ in framework_folders:
        os.makedirs(os.path.join(path_to_dest, folder), exist_ok=True)

    for tutorial in TUTORIAL_FILES:
        stem, _ = os.path.splitext(tutorial)
        notebook_name = stem + ".ipynb"
        doc_file = os.path.join(path_to_docs, tutorial)
        # Generic notebook: both frameworks are kept and the images are copied next to it.
        convert_rst_file_to_notebook(
            doc_file,
            os.path.join(path_to_dest, notebook_name),
            origin_folder=path_to_docs,
            dest_folder=path_to_dest,
        )
        # Framework notebooks live one level below, hence the ".." prefix on image paths.
        for folder, framework in framework_folders:
            convert_rst_file_to_notebook(
                doc_file,
                os.path.join(path_to_dest, folder, notebook_name),
                framework=framework,
                img_prefix="..",
            )


if __name__ == "__main__":
    convert_all_tutorials()
474
Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place. Commercial Alternative to JupyterHub.

Product

Resources

Company

Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more, all in one place. Commercial Alternative to JupyterHub.

Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place. Commercial Alternative to JupyterHub.