CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutSign UpSign In
huggingface

Real-time collaboration for Jupyter Notebooks, Linux Terminals, LaTeX, VS Code, R IDE, and more,
all in one place. Commercial Alternative to JupyterHub.

GitHub Repository: huggingface/notebooks
Path: blob/main/utils/convert_doc_to_notebooks.py
Views: 2535
1
import nbformat
2
import os
3
import re
4
import shutil
5
6
# Paths are set to work by invoking this scrip from the notebooks repo, presuming the transformers repo is in the
7
# same parent folder as the notebooks repo.
8
PATH_TO_DOCS = '../transformers/docs/source'
9
PATH_TO_DEST = 'transformers_doc'
10
DOC_BASE_URL = "https://huggingface.co/transformers/"
11
12
# These are the doc files converted, add any new tutorial to this list if you want it handled by the conversion
# script. Each name is joined to the docs folder by `convert_all_tutorials`, and its extension is replaced by
# `.ipynb` to name the generated notebook.
TUTORIAL_FILES = [
    "benchmarks.rst",
    "custom_datasets.rst",
    "multilingual.rst",
    "perplexity.rst",
    "preprocessing.rst",
    "quicktour.rst",
    "task_summary.rst",
    "tokenizer_summary.rst",
    "training.rst"
]
25
26
###################################
27
# Parsing the rst file #
28
###################################
29
30
# Re pattern that catches markdown titles.
_re_title = re.compile(r"^#+\s+(\S+)")
# Re pattern that catches rst blocks of the form `.. block_name::`.
_re_block = re.compile(r"^\.\.\s+(\S+)::")
# Re pattern that catches what's after the :: in rst blocks of the form `.. block_name:: something`.
_re_block_lang = re.compile(r"^\.\.\s+\S+::\s*(\S+)(\s+|$)")
# Re pattern that catches section names like `.. _name:`.
_re_anchor_section = re.compile(r"^\.\.\s+_(\S+):")
# Re pattern that catches indentation at the start of a line.
_re_indent = re.compile(r"^(\s*)\S")


def split_blocks(lines):
    """ Read the lines of a doc file and group them by blocks.

    Everything before the first markdown title is ignored (copyright header). If there is no title at all, the
    result is an empty list.

    Args:
        lines: The lines of the doc file, with rst titles already converted to markdown titles.

    Returns:
        A list of `(text, block_type)` tuples. `block_type` is `"title"`, `"prose"`, `"anchor"` or, for an rst
        directive, the directive name optionally followed by a space and its first argument (for instance
        `"code-block python"`). The text of a directive block keeps its original indentation.
    """
    blocks = []
    current_block = []
    num_lines = len(lines)

    def _move_to_next_non_empty_line(i):
        while i < num_lines and len(lines[i]) == 0:
            i += 1
        return i

    def _build_block(blocks, current_block, block_type):
        # Trailing empty lines are not part of the block; a block left with no line is not emitted.
        end = len(current_block)
        while end > 0 and len(current_block[end - 1]) == 0:
            end -= 1
        if end > 0:
            blocks.append(('\n'.join(current_block[:end]), block_type))
        return blocks, []

    # Ignore everything before the main title (copyright header)
    i = 0
    while i < num_lines and _re_title.search(lines[i]) is None:
        i += 1

    while i < num_lines:
        line = lines[i]
        block_match = _re_block.search(line)
        if _re_title.search(line) is not None:
            blocks, current_block = _build_block(blocks, current_block, "prose")
            blocks.append((line, "title"))
            i = _move_to_next_non_empty_line(i + 1)
        elif block_match is not None:
            blocks, current_block = _build_block(blocks, current_block, "prose")
            block_type = block_match.group(1)
            lang_match = _re_block_lang.search(line)
            if lang_match is not None:
                block_type += " " + lang_match.group(1)
            i = _move_to_next_non_empty_line(i + 1)
            # The directive may be the last line of the file, or be followed by a whitespace-only line: in both
            # cases there is no indented body to collect.
            indent_match = _re_indent.search(lines[i]) if i < num_lines else None
            indent = indent_match.group(1) if indent_match is not None else ""
            if len(indent) > 0:
                # The body is every following line that is empty or shares the indentation of its first line.
                while i < num_lines and (lines[i].startswith(indent) or len(lines[i]) == 0):
                    current_block.append(lines[i])
                    i += 1
            blocks, current_block = _build_block(blocks, current_block, block_type)
        elif _re_anchor_section.search(line):
            blocks, current_block = _build_block(blocks, current_block, "prose")
            blocks.append((line, "anchor"))
            i = _move_to_next_non_empty_line(i + 1)
        else:
            current_block.append(line)
            i += 1
    blocks, current_block = _build_block(blocks, current_block, "prose")
    return blocks
95
96
97
###################################
98
# Text formatting and cleaning #
99
###################################
100
101
def process_titles(lines):
    """ Converts rst titles to markdown titles.

    A line is an rst title underline when it is made of a single repeated title character, is at least as long
    as the line before it, and that previous line is not blank. The underline is dropped and the previous line
    gets a `#` prefix. Heading levels are assigned in order of first appearance of each underline character.

    Args:
        lines: The lines of an rst document.

    Returns:
        A new list of lines, with the titles converted and their underlines removed.
    """
    title_chars = frozenset("=-`:'\"~^_*+#<>")
    # Maps an underline character to its heading level.
    title_levels = {}
    new_lines = []
    for line in lines:
        previous = new_lines[-1] if new_lines else ""
        is_underline = (
            # An underline-looking line after a blank line is an rst transition, not a title: leave it alone
            # instead of turning the blank line into an empty heading.
            previous.strip() != ""
            and len(line) >= len(previous)
            and len(set(line)) == 1
            and line[0] in title_chars
            and line != "::"
        )
        if is_underline:
            # A new character gets the next free level, a known one keeps the level it was first given.
            level = title_levels.setdefault(line[0], len(title_levels) + 1)
            new_lines[-1] = f"{'#' * level} {previous}"
        else:
            new_lines.append(line)
    return new_lines
116
117
118
# Re pattern to catch things inside ` ` in :obj:`thing`.
_re_obj = re.compile(r":obj:`([^`]+)`")
# Re pattern to catch things inside ` ` in :math:`thing`.
_re_math = re.compile(r":math:`([^`]+)`")
# Re pattern to catch things between single backquotes. Lookarounds (instead of consuming the neighbouring
# characters) let two spans separated by a single character both match.
_re_single_backquotes = re.compile(r"(?<!`)`([^`]+)`(?!`)")
# Re pattern to catch things between single stars. Double stars (rst strong emphasis) are left alone, and the
# content may not start or end with whitespace, so `* ` bullet lists are not mistaken for emphasis.
_re_stars = re.compile(r"(?<!\*)\*(?!\s)([^\*]+)(?<!\s)\*(?!\*)")
# Re pattern to catch things between double backquotes.
_re_double_backquotes = re.compile(r"``([^`]+)``")
# Re pattern to catch things inside ` ` in :func/class/meth:`thing`.
_re_func_class = re.compile(r":(?:func|class|meth):`([^`]+)`")


def convert_rst_formatting(text):
    """ Convert rst syntax for formatting to markdown in text.

    The conversions are applied in this order: `:func:`/`:class:`/`:meth:` and `:obj:` roles become inline code,
    `:math:` becomes `$...$`, `*text*` becomes `**text**`, single-backquoted text becomes `*text*`,
    double-backquoted text becomes single-backquoted, and any `::` followed by a newline is removed.

    Args:
        text: The rst text to convert.

    Returns:
        The converted text.
    """
    # Remove :class:, :func: and :meth: markers. Simplify what's inside and put double backquotes
    # (to not be caught by the italic conversion).
    def _rep_func_class(match):
        splits = match.group(1).split('.')
        # Drop the leading dotted components until one starts with an uppercase letter, always keeping at least
        # the last one. The `[:1]` slice tolerates empty components such as in `.helper`.
        i = 0
        while i < len(splits) - 1 and not splits[i][:1].isupper():
            i += 1
        return f"``{'.'.join(splits[i:])}``"

    text = _re_func_class.sub(_rep_func_class, text)
    # Remove :obj: markers. What's after is in a single backquotes so we put in double backquotes
    # (to not be caught by the italic conversion).
    text = _re_obj.sub(r"``\1``", text)
    # Remove :math: markers.
    text = _re_math.sub(r"$\1$", text)
    # Convert content in stars to bold
    text = _re_stars.sub(r'**\1**', text)
    # Convert content in single backquotes to italic.
    text = _re_single_backquotes.sub(r'*\1*', text)
    # Convert content in double backquotes to single backquotes.
    text = _re_double_backquotes.sub(r'`\1`', text)
    # Remove remaining ::
    text = re.sub(r"::\n", "", text)
    return text
157
158
# Generic rst link `description <url>`_ : group 1 is the description, group 2 the url without leading slashes.
_re_links = re.compile(r"`([^`]+\S)\s+</*([^/][^>`]*)>`_+")
# Doc link without description, :doc:`reference` : group 1 is the reference.
_re_simple_doc = re.compile(r":doc:`([^`<]*)`")
# Doc link with description, :doc:`description <reference>` : groups are description then reference.
_re_doc_with_description = re.compile(r":doc:`([^`<]+\S)\s+</*([^/][^>`]*)>`")
# Ref link without description, :ref:`reference` : group 1 is the reference.
_re_simple_ref = re.compile(r":ref:`([^`<]*)`")
# Ref link with description, :ref:`description <reference>` : groups are description then reference.
_re_ref_with_description = re.compile(r":ref:`([^`<]+\S)\s+<([^>]*)>`")


def convert_rst_links(text):
    """ Rewrite the rst links found in text as markdown links.

    `:doc:` references point to the matching `.html` page under `DOC_BASE_URL`, `:ref:` references point to an
    anchor in the current document, and any other link whose url does not start with `http` is made relative to
    `DOC_BASE_URL`.
    """
    # (pattern, replacement template) pairs for :doc: and :ref: roles, applied in this exact order.
    role_rules = (
        (_re_simple_doc, r'[\1](' + DOC_BASE_URL + r'\1.html)'),
        (_re_doc_with_description, r'[\1](' + DOC_BASE_URL + r'\2.html)'),
        (_re_simple_ref, r'[\1](#\1)'),
        (_re_ref_with_description, r'[\1](#\2)'),
    )
    for pattern, template in role_rules:
        text = pattern.sub(template, text)

    def _to_markdown_link(match):
        label, target = match.groups()
        if not target.startswith('http'):
            target = DOC_BASE_URL + target
        return f"[{label}]({target})"

    # Remaining links of the form `description <url>`_
    return _re_links.sub(_to_markdown_link, text)
188
189
190
###################################
191
# Notes, math and reference #
192
###################################
193
194
def remove_indentation(text):
    """ Remove the indentation found in the first line in text.

    The leading whitespace of the first line is removed from every line that starts with it. A line that is
    less indented than the first one only loses its own leading whitespace, so no content is ever cut off.
    An empty first line means there is no indentation to remove.

    Args:
        text: The text to dedent, with lines separated by `\\n`.

    Returns:
        The dedented text.
    """
    lines = text.split("\n")
    first_line = lines[0]
    indent = first_line[:len(first_line) - len(first_line.lstrip())]
    return "\n".join(
        line[len(indent):] if line.startswith(indent) else line.lstrip()
        for line in lines
    )
200
201
202
# Notes are rendered as a markdown quote starting with **NOTE_TYPE:**, maybe there is some clever html solution
# to have something nicer.
def convert_to_note(text, note_type):
    """ Render text as a markdown quote block labelled with note_type in upper case."""
    first_line, *other_lines = remove_indentation(text).split("\n")
    quoted = [f"> **{note_type.upper()}:** {first_line}"]
    quoted.extend(f"> {line}" for line in other_lines)
    return "\n".join(quoted)
210
211
212
def convert_math(text):
    """ Wrap text, once its indentation is removed, in `$$` to get display mode LaTeX."""
    dedented = remove_indentation(text)
    return f"$${dedented}$$"
216
217
218
def convert_anchor(text):
    """ Turn an rst section anchor (`.. _name:`) into an html anchor usable in the notebook."""
    match = _re_anchor_section.search(text)
    name = match.group(1)
    return f"<a id='{name}'></a>"
222
223
224
###################################
225
# Images #
226
###################################
227
228
# Re pattern that catches rst options of the form `:key: value`.
_re_attr_rst = re.compile(r"^\s*:(\S+):\s*(\S.*)$")


def convert_image(image_name, text, pref=None, origin_folder=None, dest_folder=None):
    """ Convert text to proper html code for image_name.

    Args:
        image_name: The image path, as written after `.. image::`.
        text: The body of the image directive. Each `:key: value` line becomes a `key="value"` html attribute.
        pref: If provided, is joined in front of image_name to build the `src` attribute.
        origin_folder: If provided in conjunction with dest_folder, the image is copied from this folder.
        dest_folder: If provided in conjunction with origin_folder, the image is copied to this folder, unless a
            file of that name is already there.

    Returns:
        The `<img .../>` html tag.
    """
    # Copy the image if necessary
    if origin_folder is not None and dest_folder is not None:
        origin_file = os.path.join(origin_folder, image_name)
        dest_file = os.path.join(dest_folder, image_name)
        if not os.path.isfile(dest_file):
            dest_dir = os.path.dirname(dest_file)
            # An empty dirname means the current directory, which `os.makedirs` refuses to create.
            if dest_dir:
                os.makedirs(dest_dir, exist_ok=True)
            shutil.copy(origin_file, dest_file)
    attrs = {'src': image_name if pref is None else os.path.join(pref, image_name)}
    for line in text.split("\n"):
        match = _re_attr_rst.search(line)
        if match is not None:
            key, attr = match.groups()
            attrs[key] = attr
    html = " ".join([f'{key}="{value}"' for key, value in attrs.items()])
    return f"<img {html}/>"
248
249
250
###################################
251
# Tables #
252
###################################
253
254
# Matches lines with a pattern of a table new line in rst.
_re_ignore_line_table = re.compile(r"^(\+[\-\s]+)+\+\s*$")
# Matches lines with a pattern of a table new line in rst, with a first column empty.
_re_ignore_line_table1 = re.compile(r"^\|\s+(\+[\-\s]+)+\+\s*$")
# Matches lines with a pattern of a first table line in rst.
_re_sep_line_table = re.compile(r"^(\+[=\s]+)+\+\s*$")


def convert_table(text):
    """ Convert a table in text from rst to markdown.

    Row separator lines (`+---+---+`, possibly with an empty first column) are dropped and the header separator
    (`+===+===+`) becomes the markdown one (`|---|---|`). Every other line is kept unchanged.

    Args:
        text: The text to convert, with lines separated by `\\n`.

    Returns:
        The converted text.
    """
    lines = text.split("\n")
    new_lines = []
    for line in lines:
        if _re_ignore_line_table.search(line) is not None:
            continue
        if _re_ignore_line_table1.search(line) is not None:
            continue
        if _re_sep_line_table.search(line) is not None:
            line = line.replace('=', '-').replace('+', '|')
        new_lines.append(line)
    return "\n".join(new_lines)
275
276
277
###################################
278
# Code cleaning #
279
###################################
280
281
# Matches the pytorch code tag.
_re_pytorch = re.compile(r"## PYTORCH CODE")
# Matches the tensorflow code tag.
_re_tensorflow = re.compile(r"## TENSORFLOW CODE")


def split_frameworks(code):
    """ Split code between the two frameworks (if it has two versions) with PyTorch first.

    The section order is read from the position of the two tags, not from the content of the first line. Lines
    that come before the first tag stay with the section that follows them.

    Args:
        code: The content of a code block.

    Returns:
        `(code,)` if the code does not have both tags on distinct lines, otherwise
        `(pytorch_code, tensorflow_code)`. Each section starts at its tag line, and the empty lines between the
        two sections are dropped.
    """
    if _re_pytorch.search(code) is None or _re_tensorflow.search(code) is None:
        return (code,)
    lines = code.split("\n")
    pytorch_start = next(i for i, line in enumerate(lines) if _re_pytorch.search(line) is not None)
    tensorflow_start = next(i for i, line in enumerate(lines) if _re_tensorflow.search(line) is not None)
    if pytorch_start == tensorflow_start:
        # Both tags on the same line: there is nothing to split.
        return (code,)
    second_start = max(pytorch_start, tensorflow_start)
    # The first section ends before the empty lines that precede the second tag.
    first_end = second_start
    while first_end > 0 and len(lines[first_end - 1]) == 0:
        first_end -= 1
    first = "\n".join(lines[:first_end])
    second = "\n".join(lines[second_start:])
    return (first, second) if pytorch_start < tensorflow_start else (second, first)
301
302
# Matches any doctest pattern.
_re_doctest = re.compile(r"^(>>>|\.\.\.)")


def parse_code_and_output(code):
    """ Parse code to remove indentation, doctest prompts and split between source and theoretical output.

    The indentation removed is the one of the first non-blank line, so a code block that starts with an empty
    line is handled too. Lines starting with a doctest prompt are source. Once a prompt has been seen, the
    non-empty lines without a prompt are output. Without any prompt, every line is source.

    Args:
        code: The content of a code block, with lines separated by `\\n`.

    Returns:
        A tuple `(source, output)` of two strings.
    """
    lines = code.split("\n")
    first_non_blank = next((line for line in lines if line.strip()), "")
    indent_width = len(first_non_blank) - len(first_non_blank.lstrip())
    has_doctest = False
    input_lines = []
    output_lines = []
    for line in lines:
        if len(line) > 0:
            line = line[indent_width:]
        if _re_doctest.search(line):
            has_doctest = True
            # Drop the prompt and the space after it.
            input_lines.append(line[4:])
        elif has_doctest:
            if len(line) > 0:
                output_lines.append(line)
        else:
            input_lines.append(line)
    return "\n".join(input_lines), "\n".join(output_lines)
326
327
328
###################################
329
# All together! #
330
###################################
331
332
def markdown_cell(md):
    """ Build a notebook markdown cell whose source is md."""
    fields = dict(cell_type='markdown', source=md, metadata={})
    return nbformat.notebooknode.NotebookNode(fields)
335
336
337
def code_cell(code, output=None):
    """ Build a notebook code cell whose source is `code`, with `output` as its result when it is not empty."""
    make_node = nbformat.notebooknode.NotebookNode
    outputs = []
    if output is not None and len(output) != 0:
        result = make_node({
            'data': {'text/plain': output},
            'execution_count': None,
            'metadata': {},
            'output_type': 'execute_result'
        })
        outputs.append(result)
    cell_fields = {
        'cell_type': 'code',
        'execution_count': None,
        'source': code,
        'metadata': {},
        'outputs': outputs,
    }
    return make_node(cell_fields)
354
355
356
def create_notebook(cells):
    """ Build a notebook node (format 4.4) holding `cells`."""
    notebook_fields = dict(cells=cells, metadata={}, nbformat=4, nbformat_minor=4)
    return nbformat.notebooknode.NotebookNode(notebook_fields)
364
365
366
def rm_first_line(text):
    """ Return `text` without its first line (an empty string if `text` has a single line)."""
    _first, _newline, remainder = text.partition('\n')
    return remainder
369
370
371
# For the first cell of the notebook
372
INSTALL_CODE = """# Transformers installation
373
! pip install transformers datasets
374
# To install from source instead of the last release, comment the command above and uncomment the following one.
375
# ! pip install git+https://github.com/huggingface/transformers.git
376
"""
377
378
379
def convert_rst_file_to_notebook(
    rst_file,
    notebook_fname,
    framework=None,
    img_prefix=None,
    origin_folder=None,
    dest_folder=None
):
    r"""
    Convert rst_file to a notebook named notebook_fname.

    The first cell of the notebook is a code cell containing :obj:`INSTALL_CODE`. Titles, prose and anchors
    become markdown cells, code blocks become code cells (one per framework version kept), images become html
    tags in markdown cells, math blocks become display mode LaTeX and any other rst directive is rendered as a
    note.

    Args:
        - rst_file (:obj:`str`):
            The doc file to convert (in rst format). It is read as UTF-8.
        - notebook_fname (:obj:`str`):
            The output notebook file name (will be replaced if it exists).
        - framework (:obj:`str`, `optional`):
            If provided, must be :obj:`"pt"` or :obj:`"tf"`. In this case, only the PyTorch (resp. TensorFlow) version
            of the code is kept.
        - img_prefix (:obj:`str`, `optional`):
            If provided, will be inserted at the beginning of each image filename (in the `pytorch` or `tensorflow`
            folder, we need to add ../ to each image file to find them).
        - origin_folder (:obj:`str`, `optional`):
            If provided in conjunction with :obj:`dest_folder`, images encountered will be copied from this folder to
            :obj:`dest_folder`.
        - dest_folder (:obj:`str`, `optional`):
            If provided in conjunction with :obj:`origin_folder`, images encountered will be copied from
            :obj:`origin_folder` to this folder.
    """
    # Be explicit about the encoding so that the result does not depend on the locale of the machine.
    with open(rst_file, 'r', encoding="utf-8") as f:
        content = f.read()
    lines = content.split("\n")
    lines = process_titles(lines)
    blocks = split_blocks(lines)
    cells = [code_cell(INSTALL_CODE)]
    for block, block_type in blocks:
        if block_type == 'title' or block_type == 'prose':
            block = convert_table(convert_rst_formatting(convert_rst_links(block)))
            cells.append(markdown_cell(block))
        elif block_type == 'anchor':
            block = convert_anchor(block)
            cells.append(markdown_cell(block))
        elif block_type.startswith('code-block'):
            codes = split_frameworks(block)
            # When a framework is requested, keep only its version of the code, without the framework tag
            # that is on its first line.
            if framework == 'pt' and len(codes) > 1:
                codes = (rm_first_line(codes[0]),)
            elif framework == 'tf' and len(codes) > 1:
                codes = (rm_first_line(codes[1]),)
            for code in codes:
                source, output = parse_code_and_output(code)
                if block_type.endswith('bash'):
                    # Shell commands are run from a notebook cell with a leading `!`, comments are kept as is.
                    bash_lines = source.split("\n")
                    new_lines = [line if line.startswith("#") else f"! {line}" for line in bash_lines]
                    source = "\n".join(new_lines)
                cells.append(code_cell(source, output=output))
        elif block_type.startswith("image"):
            # The block type is `image <image_name>`.
            image_name = block_type[len("image "):]
            block = convert_image(
                image_name,
                block,
                pref=img_prefix,
                origin_folder=origin_folder,
                dest_folder=dest_folder
            )
            cells.append(markdown_cell(block))
        elif block_type == "math":
            block = convert_math(block)
            cells.append(markdown_cell(block))
        else:
            # Any other directive is rendered as a note labelled with the directive name.
            block = convert_rst_formatting(convert_rst_links(block))
            block = convert_to_note(block, block_type)
            cells.append(markdown_cell(block))

    notebook = create_notebook(cells)
    nbformat.write(notebook, notebook_fname, version=4)
454
455
456
def convert_all_tutorials(path_to_docs=None, path_to_dest=None):
    """ Convert every file of `TUTORIAL_FILES` into three notebooks: mixed, PyTorch only and TensorFlow only.

    `path_to_docs` and `path_to_dest` default to `PATH_TO_DOCS` and `PATH_TO_DEST`. The mixed notebook goes in
    `path_to_dest` (images are copied there too), the framework-specific ones in its `pytorch` and `tensorflow`
    subfolders.
    """
    if path_to_docs is None:
        path_to_docs = PATH_TO_DOCS
    if path_to_dest is None:
        path_to_dest = PATH_TO_DEST
    # (subfolder, framework code) pairs
    framework_folders = (("pytorch", "pt"), ("tensorflow", "tf"))
    for subfolder, _ in framework_folders:
        os.makedirs(os.path.join(path_to_dest, subfolder), exist_ok=True)
    for rst_name in TUTORIAL_FILES:
        notebook_name = os.path.splitext(rst_name)[0] + ".ipynb"
        doc_file = os.path.join(path_to_docs, rst_name)
        mixed_notebook = os.path.join(path_to_dest, notebook_name)
        convert_rst_file_to_notebook(doc_file, mixed_notebook, origin_folder=path_to_docs, dest_folder=path_to_dest)
        for subfolder, framework in framework_folders:
            framework_notebook = os.path.join(path_to_dest, subfolder, notebook_name)
            convert_rst_file_to_notebook(doc_file, framework_notebook, framework=framework, img_prefix="..")
470
471
472
# Convert all the tutorials with the default paths when this file is run as a script.
if __name__ == "__main__":
    convert_all_tutorials()
474