CoCalc -- notebook.rb

GitHub Repository: galaxyproject/training-material
Path: blob/main/_plugins/notebook.rb
¹⁶⁷⁷ views
1
require 'digest'
2
require 'json'
3
require 'fileutils'
4
require 'yaml'
5
require 'base64'
6

7
# Monkey patching Hash with a falsy-tolerant variant of +fetch+.
class Hash
  ##
  # Like +fetch+ with a default, but also falls back to the default when the
  # key is present and its value is +nil+ or +false+.
  #
  # Params:
  # +key+:: The key to look up
  # +default+:: The value to return when the key is missing or its value is falsy
  #
  # Returns:
  # The stored value when it is truthy, otherwise +default+.
  def fetch2(key, default)
    fetch(key, default) || default
  end
end
13

14
# Generate Notebooks from Markdown
15
module Gtn
16
  ##
17
  # Notebook generation module, this converts markdown into Jupyter and RMarkdown/Quarto notebooks
18
  module Notebooks
19

20
    # Colors for the various boxes, based on our 2024 CSS.
    # Keys are the box class names, values are CSS color values.
    COLORS = {
      'overview' => '#8A9AD0',
      'agenda' => '#86D486',
      'keypoints' => '#FFA1A1',
      'tip' => '#FFE19E',
      'warning' => '#de8875',
      'comment' => '#ffecc1',
      'hands_on' => '#dfe5f9',
      'question' => '#8A9AD0',
      'solution' => '#B8C3EA',
      'details' => '#ddd',
      'feedback' => '#86D486',
      'code-in' => '#86D486',
      'code-out' => '#fb99d0',
    }.freeze
36

37
    # Extra CSS appended after the +COLORS+ value for specific boxes;
    # currently this only hides the agenda box.
    COLORS_EXTRA = {
      'agenda' => 'display: none',
    }.freeze
41

42
    # Emoji icons for the various boxes, keyed by box class name.
    ICONS = {
      'tip' => '💡',
      'code-in' => '⌨️',
      'code-out' => '🖥',
      'question' => '❓',
      'solution' => '👁',
      'warning' => '⚠️',
      'comment' => '💬',
      'feedback' => '⁉️',
      'details' => '💬',
      'hands_on' => '✏️',
    }.freeze
55

56
    # Font-awesome equivalents of the icons we use for our boxes.
    # Maps the font-awesome class string to the box name used as key in +ICONS+.
    ICONS_FA = {
      'far fa-keyboard' => 'code-in',
      'fas fa-laptop-code' => 'code-out',
      'far fa-comment-dots' => 'comment',
      'fas fa-info-circle' => 'details',
      'far fa-comments' => 'feedback',
      'fas fa-pencil-alt' => 'hands_on',
      'far fa-question-circle' => 'question',
      'far fa-eye' => 'solution',
      'far fa-lightbulb' => 'tip',
      'fas fa-exclamation-triangle' => 'warning',
    }.freeze
69

70
    ##
    # Generate the CSS to be included, by mapping our colors to appropriate classes.
    #
    # Returns:
    # +String+:: One CSS rule per +COLORS+ entry, separated by newlines.
    def self.generate_css
      COLORS.map do |key, val|
        ".#{key} { padding: 0 1em; margin: 1em 0.2em; border: 2px solid #{val} }"
      end.join("\n")
    end
76

77
    ##
    # Convert a markdown file into a Jupyter notebook JSON structure.
    #
    # The document is split on fenced code blocks whose language tag is one of
    # +accepted_languages+. Each such block becomes a code cell, the text in
    # between becomes markdown cells.
    #
    # Params:
    # +content+:: The markdown content to convert
    # +accepted_languages+:: The languages to accept as code blocks. Code blocks that do not match will not be accepted.
    #
    # Returns:
    # +Hash+:: A JSON structure representing the Jupyter notebook.
    #
    # Raises:
    # +RuntimeError+:: If an accepted code fence is opened while another one is still open.
    def self.convert_notebook_markdown(content, accepted_languages)
      # Built once; the language names are interpolated into the patterns as-is.
      languages = accepted_languages.join('|')
      fence_open = /^```(#{languages})\s*$/
      fence_tagged = /```(#{languages})/

      # Every chunk is a triple of [lines, is_code, language]
      chunks = []
      inside_block = false
      cur_lang = nil
      val = []
      lines = content.split("\n")
      lines.each.with_index do |line, i|
        m = line.match(fence_open)
        if m
          if inside_block
            # Show some context; clamp the start so a negative index cannot wrap around.
            puts lines[[i - 2, 0].max..i + 2]
            raise "[GTN/Notebook] L#{i} Error! we're already in a block:"
          end
          # End the previous (markdown) chunk
          chunks.push([val, inside_block, cur_lang])
          val = []

          inside_block = true
          cur_lang = m[1]
        elsif inside_block && line == '```'
          # End of code block
          chunks.push([val, inside_block, cur_lang])
          val = []
          inside_block = false
        else
          val.push(line)
        end
      end
      # final flush
      chunks.push([val, inside_block, cur_lang])

      cells = chunks.map.with_index do |(chunk_lines, is_code, lang), index|
        source = chunk_lines.map { |x| "#{x.rstrip}\n" }
        # Strip the trailing newline of the last line in the cell.
        source[-1] = source[-1].rstrip unless source.empty?

        # Remove any remaining language tagged code blocks, e.g. in
        # tip/solution/etc boxes. These do not render well.
        source = source.map { |x| x.gsub(fence_tagged, '```') }

        cell = {
          'id' => "cell-#{index}",
          'source' => source
        }
        if is_code
          cell.update(
            'cell_type' => 'code',
            'execution_count' => nil,
            'outputs' => [],
            'metadata' => {
              'attributes' => {
                # The language of this chunk (previously the third line of the
                # whole document was stored here by mistake).
                'classes' => [lang],
                'id' => '',
              }
            }
          )
        else
          cell['cell_type'] = 'markdown'
        end
        cell
      end

      {
        'metadata' => {},
        'nbformat' => 4,
        'nbformat_minor' => 5,
        'cells' => cells,
      }
    end
156

157
    ##
    # Group a document by the first character seen, which extracts blockquotes mostly.
    #
    # Consecutive lines starting with the same character are collected into one
    # group. A group of +>+ lines that is closed by a +{: .class}+ line is
    # rewritten into a fenced +:::{.class}+ ... +:::+ block, except for agenda
    # boxes which are dropped entirely.
    #
    # Params:
    # +data+:: The markdown text
    #
    # Returns:
    # +String+:: The rewritten markdown text (empty when +data+ has no lines).
    def self.group_doc_by_first_char(data)
      lines = data.split("\n")
      # Nothing to group. Without this guard the filters below would index into
      # an empty group and raise NoMethodError.
      return '' if lines.empty?

      groups = []
      first_char = nil
      current = []

      # Here we collapse running groups of `>` into single blocks.
      lines.each do |line|
        if first_char.nil?
          # NOTE: an empty line has no first character, so first_char stays nil
          # and leading blank lines are replaced by the next line.
          first_char = line[0]
          current = [line]
        elsif line[0] == first_char || (first_char == '>' && line.start_with?('{:'))
          # Same kind of line, or the class annotation closing a blockquote.
          current.push(line)
        else
          # flush
          groups.push(current)
          first_char = line.empty? ? '' : line[0]
          current = [line]
        end
      end
      # final flush
      groups.push(current)

      # Drop the agenda box.
      groups.reject! do |group|
        group[0][0] == '>' && group[-1].start_with?('{:') && group[-1].match?(/\.agenda/)
      end

      groups.map! do |group|
        next group unless group[0][0] == '>' && group[-1].start_with?('{:')

        # "{: .tip}" => ".tip"
        cls = group[-1][2..-2].strip
        [":::{#{cls}}"] + group[0..-2].map { |c| c.sub(/^>\s*/, '') } + [':::']
      end

      groups.flatten(1).join("\n")
    end
205

206
    ##
    # Construct a byline from the metadata
    #
    # Params:
    # +site+:: The Jekyll site object
    # +metadata+:: The metadata to construct the byline from, including a contributions or contributors key
    #
    # Returns:
    # +String+:: The byline with markdown hyperlinks to the contributors, separated by ", "
    def self.construct_byline(site, metadata)
      folks = Gtn::Contributors.get_authors(metadata)
      folks.map do |c|
        name = Gtn::Contributors.fetch_name(site, c)
        "[#{name}](https://training.galaxyproject.org/hall-of-fame/#{c}/)"
      end.join(', ')
    end
222

223
    ##
    # Given a notebook, add the metadata cell to the top of the notebook with the
    # title, byline, license, questions, objectives and time estimation.
    #
    # Params:
    # +site+:: The Jekyll site object
    # +notebook+:: The notebook to add the metadata cell to (modified in place)
    # +metadata+:: The page.data to read the metadata from.
    #
    # Returns:
    # +Hash+:: The updated notebook with the metadata cell added to the top.
    def self.add_metadata_cell(site, notebook, metadata)
      by_line = construct_byline(site, metadata)
      questions = metadata.fetch2('questions', []).map { |q| "- #{q}\n" }
      objectives = metadata.fetch2('objectives', []).map { |o| "- #{o}\n" }

      meta_header = [
        "<div style=\"border: 2px solid #8A9AD0; margin: 1em 0.2em; padding: 0.5em;\">\n\n",
        "# #{metadata['title']}\n",
        "\n",
        "by #{by_line}\n",
        "\n",
        "#{metadata.fetch('license', 'CC-BY')} licensed content from the [Galaxy Training Network]" \
        "(https://training.galaxyproject.org/)\n",
        "\n",
        # This list holds the questions; it used to be mislabelled "Objectives".
        "**Questions**\n",
        "\n",
        *questions,
        "\n",
        "**Objectives**\n",
        "\n",
        *objectives,
        "\n",
        "**Time Estimation: #{metadata['time_estimation']}**\n",
        "\n",
        "</div>\n"
      ]
      metadata_cell = {
        'id' => 'metadata',
        'cell_type' => 'markdown',
        'source' => meta_header
      }
      notebook['cells'].unshift(metadata_cell)
      notebook
    end
265

266
    ##
    # Fix an R based Jupyter notebook by setting the kernel to R and stripping out the %%R magic commands.
    #
    # Params:
    # +notebook+:: The notebook Hash (modified in place)
    #
    # Returns:
    # +Hash+:: The same notebook
    def self.fixRNotebook(notebook)
      # Set the R kernel
      notebook['metadata'] = {
        'kernelspec' => {
          'display_name' => 'R',
          'language' => 'R',
          'name' => 'r'
        },
        'language_info' => {
          'codemirror_mode' => 'r',
          'file_extension' => '.r',
          'mimetype' => 'text/x-r-source',
          'name' => 'R',
          'pygments_lexer' => 'r',
          'version' => '4.1.0'
        }
      }
      # Strip out %%R since we'll use the R kernel
      notebook['cells'].each do |cell|
        next unless cell.fetch('cell_type') == 'code' && cell['source'][0] == "%%R\n"

        cell['source'] = cell['source'].slice(1..-1)
      end
      notebook
    end
294

295
    ##
    # Similar to +fixRNotebook+ but for bash: sets the bash kernel and strips
    # the leading %%bash magic from code cells.
    #
    # Params:
    # +notebook+:: The notebook Hash (modified in place)
    #
    # Returns:
    # +Hash+:: The same notebook
    def self.fixBashNotebook(notebook)
      # Set the bash kernel
      notebook['metadata'] = {
        'kernelspec' => {
          'display_name' => 'Bash',
          'language' => 'bash',
          'name' => 'bash'
        },
        'language_info' => {
          'codemirror_mode' => 'shell',
          'file_extension' => '.sh',
          'mimetype' => 'text/x-sh',
          'name' => 'bash'
        }
      }
      # Strip out %%bash since we'll use the bash kernel
      notebook['cells'].each do |cell|
        next unless cell.fetch('cell_type') == 'code' && cell['source'][0] == "%%bash\n"

        cell['source'] = cell['source'].slice(1..-1)
      end
      notebook
    end
321

322
    ##
    # Similar to +fixRNotebook+ but for Python, bash cells are accepted but must be prefixed with !
    #
    # A cell counts as bash when the first entry of its
    # metadata.attributes.classes list is 'bash'.
    #
    # Params:
    # +notebook+:: The notebook Hash (modified in place)
    #
    # Returns:
    # +Hash+:: The same notebook
    def self.fixPythonNotebook(notebook)
      notebook['cells'].each do |cell|
        # Cells without that metadata (e.g. markdown cells) are left alone.
        next unless cell.dig('metadata', 'attributes', 'classes', 0) == 'bash'

        # prefix every line of a bash cell with `!`
        cell['source'] = cell['source'].map { |line| "!#{line}" }
      end
      notebook
    end
335

336
    ##
    # Like +fixRNotebook+ but for SQL: every code cell gets a leading %%sql
    # magic, except cells whose source mentions +load_ext+.
    #
    # Params:
    # +notebook+:: The notebook Hash (modified in place)
    #
    # Returns:
    # +Hash+:: The same notebook
    def self.fixSqlNotebook(notebook)
      # Add in a %%sql at the top of each cell
      notebook['cells'].each do |cell|
        next unless cell.fetch('cell_type') == 'code'
        next if cell['source'].join.include?('load_ext')

        cell['source'] = ["%%sql\n"] + cell['source']
      end
      notebook
    end
348

349
    ##
    # Call Jekyll's markdown plugin or failover to Kramdown
    #
    # I have no idea why that failure mode is supported, that's kinda wild.
    #
    # Params:
    # +site+:: The Jekyll site object
    # +text+:: The text to convert to html
    #
    # Returns:
    # +String+:: The HTML representation
    def self.markdownify(site, text)
      site.find_converter_instance(
        Jekyll::Converters::Markdown
      ).convert(text.to_s)
    rescue StandardError
      # Loaded lazily on purpose, it is only needed when Jekyll's converter failed.
      require 'kramdown'
      # Same +to_s+ coercion as the main path, so a nil text behaves alike.
      Kramdown::Document.new(text.to_s).to_html
    end
368

369
    ##
    # Return true if it's a notebook and the language is correct
    #
    # TODO: convert to `notebook?` which is more ruby-esque.
    #
    # Params:
    # +data+:: The page data to check
    # +language+:: The (lower case) language to check for, or nil to accept any language
    #
    # Returns:
    # +Boolean+:: True if it's a notebook (i.e hands on tutorial, has a notebook key, and the language is correct)
    def self.notebook_filter(data, language = nil)
      data['layout'] == 'tutorial_hands_on' &&
        data.key?('notebook') &&
        (language.nil? || data['notebook']['language'].downcase == language)
    end
384

385
    ##
    # Massage a page into RMarkdown preferred formatting.
    #
    # Params:
    # +site+:: The Jekyll site object
    # +page_data+:: The page metadata (page.data)
    # +page_content+:: The page content (page.content)
    # +page_url+:: The page URL
    # +page_last_modified+:: The last modified time of the page
    # +fn+:: The source filename of the page
    #
    # Returns:
    # +String+:: The RMarkdown formatted content (YAML header followed by the body)
    #
    def self.render_rmarkdown(site, page_data, page_content, page_url, page_last_modified, fn)
      by_line = construct_byline(site, page_data)

      # Replace top level `>` blocks with fenced `:::`
      content = group_doc_by_first_char(page_content)

      # Re-run a second time to catch singly-nested Q&A?
      content = group_doc_by_first_char(content)

      # Replace zenodo links, the only replacement we do
      unless page_data['zenodo_link'].nil?
        Jekyll.logger.debug "Replacing zenodo links in #{page_url}, #{page_data['zenodo_link']}"
        content.gsub!(/{{\s*page.zenodo_link\s*}}/, page_data['zenodo_link'])
      end

      ICONS.each do |key, val|
        content.gsub!(/{% icon #{key} %}/, val)
      end
      ICONS_FA.each do |key, val|
        content.gsub!(%r{<i class="#{key}" aria-hidden="true"></i>}, ICONS[val])
      end

      content += %(\n\n# References\n\n<div id="refs"></div>\n)

      # https://raw.githubusercontent.com/rstudio/cheatsheets/master/rmarkdown-2.0.pdf
      # https://bookdown.org/yihui/rmarkdown/

      # NOTE(review): the bibliography name is taken from the 3rd and 5th path
      # segment of +fn+ — presumably topic and tutorial name; confirm against the caller.
      fnparts = fn.split('/')
      license = page_data.fetch('license', 'CC-BY')
      html_output = {
        'toc' => true,
        'toc_depth' => 2,
        'css' => 'gtn.css',
        'toc_float' => {
          'collapsed' => false,
          'smooth_scroll' => false,
        },
        # 'theme' => {'bootswatch' => 'journal'}
      }
      rmddata = {
        'title' => page_data['title'],
        'author' => "#{by_line}, #{license} licensed content from the " \
                    '[Galaxy Training Network](https://training.galaxyproject.org/)',
        'bibliography' => "#{fnparts[2]}-#{fnparts[4]}.bib",
        'output' => {
          'html_notebook' => html_output,
          'word_document' => {
            'toc' => true,
            'toc_depth' => 2,
            'latex_engine' => 'xelatex',
          },
          'pdf_document' => {
            'toc' => true,
            'toc_depth' => 2,
            'latex_engine' => 'xelatex',
          },
        },
        'date' => page_last_modified.to_s,
        'link-citations' => true,
        'anchor_sections' => true,
        'code_download' => true,
      }
      # Deep copy via JSON so the two outputs do not share (and alias) one hash.
      rmddata['output']['html_document'] = JSON.parse(JSON.generate(html_output))

      final_content = [
        "# Introduction\n",
        # Only rewrite fences tagged exactly r/R; a tag such as ```ruby must stay untouched.
        content.gsub(/```[Rr](?!\S)/, '```{r}'),
        "# Key Points\n"
      ] + page_data.fetch2('key_points', []).map { |k| "- #{k}" } + [
        "\n# Congratulations on successfully completing this tutorial!\n",
        'Please [fill out the feedback on the GTN website](https://training.galaxyproject.org/' \
        "training-material#{page_url}#feedback) and check there for further resources!\n"
      ]

      # The line width is raised so the (long) author line is not wrapped.
      "#{rmddata.to_yaml(line_width: rmddata['author'].size + 10)}---\n#{final_content.join("\n")}"
    end
473

474

475
    ##
    # Render a tutorial page into a Jupyter notebook structure.
    #
    # Params:
    # +data+:: The page metadata (page.data)
    # +content+:: The page content (markdown); it is not modified
    # +url+:: The page URL
    # +_last_modified+:: Unused
    # +notebook_language+:: The language of the notebook; selects the accepted code blocks and the language specific fixes
    # +site+:: The Jekyll site object
    # +dir+:: Passed on to +renderMarkdownCells+
    #
    # Returns:
    # +Hash+:: The notebook, including the metadata cell and a closing key points cell
    def self.render_jupyter_notebook(data, content, url, _last_modified, notebook_language, site, dir)
      # Here we use internal methods to convert the tutorial to a Hash
      # representing the notebook
      accepted_languages = [notebook_language]
      accepted_languages << 'bash' if notebook_language == 'python'

      unless data['zenodo_link'].nil?
        Jekyll.logger.debug "Replacing zenodo links in #{url}, #{data['zenodo_link']}"
        # Non-destructive on purpose: the caller's string must not be changed
        # (and may be frozen).
        content = content.gsub(/{{\s*page.zenodo_link\s*}}/, data['zenodo_link'])
      end
      notebook = convert_notebook_markdown(content, accepted_languages)
      # This extracts the metadata yaml header and does manual formatting of
      # the header data to make for a nicer notebook.
      notebook = add_metadata_cell(site, notebook, data)

      # Apply language specific conventions
      notebook = case notebook_language
                 when 'bash' then fixBashNotebook(notebook)
                 when 'sql' then fixSqlNotebook(notebook)
                 when 'r' then fixRNotebook(notebook)
                 when 'python' then fixPythonNotebook(notebook)
                 else notebook
                 end

      # Here we loop over the markdown cells and render them to HTML. This
      # allows us to get rid of classes like {: .tip} that would be left in
      # the output by Jupyter's markdown renderer, and additionally do any
      # custom CSS which only seems to work when inline on a cell, i.e. we
      # can't setup a style block, so we really need to render the markdown
      # to html.
      notebook = renderMarkdownCells(site, notebook, data, url, dir)

      # Here we add a close to the notebook
      key_points = data.fetch2('key_points', []).map { |k| "- #{k}\n" }
      closing_cell = {
        'cell_type' => 'markdown',
        'id' => 'final-ending-cell',
        'metadata' => { 'editable' => false, 'collapsed' => false },
        'source' => [
          "# Key Points\n\n",
          *key_points,
          "\n# Congratulations on successfully completing this tutorial!\n\n",
          'Please [fill out the feedback on the GTN website](https://training.galaxyproject.org/training-material' \
          "#{url}#feedback) and check there for further resources!\n"
        ]
      }
      notebook['cells'] = notebook['cells'] + [closing_cell]
      notebook
    end
525

526
    ##
    # Render every markdown cell of a notebook to HTML with inline styling.
    #
    # Params:
    # +site+:: The Jekyll site object, used for the markdown conversion
    # +notebook+:: The notebook Hash; its markdown cells are rewritten in place
    # +metadata+:: The page metadata; an optional +abbreviations+ mapping is expanded in the text
    # +page_url+:: The page URL, used to build absolute links for images that are not embedded
    # +dir+:: The directory that relative image paths are resolved against
    #
    # Returns:
    # +Hash+:: The same notebook; the +source+ of each markdown cell is now a single HTML String
    def self.renderMarkdownCells(site, notebook, metadata, page_url, dir)
      # Shared across cells: only the very first use of an abbreviation is spelled out.
      seen_abbreviations = {}
      abbreviations = metadata['abbreviations'] || {}
      # Image types that get inlined as data URIs, keyed by lower case extension.
      embeddable = {
        '.png' => 'image/png',
        '.jpg' => 'image/jpeg',
        '.jpeg' => 'image/jpeg',
        '.svg' => 'image/svg+xml',
      }.freeze

      notebook['cells'].each do |cell|
        next unless cell.fetch('cell_type') == 'markdown'

        # The source is initially a list of strings, we'll merge it together
        # to make it easier to work with.
        source = cell['source'].join.strip

        # Here we replace individual `s with codeblocks, they screw up
        # rendering otherwise by going through rouge
        source = source.gsub(/ `([^`]*)`([^`])/, ' <code>\1</code>\2')
                       .gsub(/([^`])`([^`]*)` /, '\1<code>\2</code> ')

        # Strip out includes, snippets
        source.gsub!(/{% include .* %}/, '')
        source.gsub!(/{% snippet .* %}/, '')

        html = markdownify(site, source)

        # Replace all the broken icons that can't render, because we don't
        # have access to the full render pipeline.
        ICONS.each do |key, val|
          # Replace the new box titles with h3s.
          html.gsub!(%r{<div class="box-title #{key}-title".*?</span>(.*?)</div>},
                     "<div style=\"font-weight:900;font-size: 125%\">#{val} \\1</div>")

          # NOTE(review): the two removals below do not depend on the icon. They
          # are kept inside the loop because the title pattern above matches
          # differently once these elements are gone, so the order matters.

          # Remove the fa-icon spans
          html.gsub!(%r{<span role="button" class="fold-unfold fa fa-minus-square"></span>}, '')

          # just removing the buttons from solutions since they'll be changed
          # into summary/details in the parent notebook-jupyter.
          html.gsub!(%r{<button class="gtn-boxify-button solution".*?</button>}, '')
        end

        abbreviations.each do |abbr, defn|
          # gsub! (not gsub), otherwise the expanded text is thrown away.
          html.gsub!(/\{#{Regexp.escape(abbr.to_s)}\}/) do
            if seen_abbreviations.key?(abbr)
              "<abbr title=\"#{defn}\">#{abbr}</abbr>"
            else
              seen_abbreviations[abbr] = true
              "#{defn} (#{abbr})"
            end
          end
        end

        # Here we give a GTN-ish styling that doesn't try to be too faithful,
        # so we aren't spending time keeping up with changes to GTN css,
        # we're making it 'our own' a bit.
        COLORS.each do |key, val|
          border = COLORS_EXTRA.key?(key) ? "#{val};#{COLORS_EXTRA[key]}" : val

          html.gsub!(/<blockquote class="#{key}">/,
                     "<blockquote class=\"#{key}\" style=\"border: 2px solid #{border}; margin: 1em 0.2em\">")
        end

        # Images are referenced through relative URLs which is
        # fab, but in a notebook this doesn't make sense as it will live
        # outside of the GTN. We need real URLs.
        #
        # So either we'll embed the images directly via base64 encoding (cool,
        # love it) or we'll link to the production images and folks can live
        # without their images for a bit until it's merged.
        #
        # The pattern stops before the closing quote of the attribute, so the
        # replacements must not add one either.
        html.gsub!(/<img src="(\.\.[^"]*)/) do |img|
          path = img[10..] # drop the leading `<img src="`
          mime = embeddable[File.extname(path).downcase]

          if mime
            encoded = Base64.strict_encode64(File.binread(File.join(dir, path)))
            %(<img src="data:#{mime};base64,#{encoded})
          else
            # Falling back to non-embedded images, relative to the page's
            # directory on the production site.
            page_dir = page_url.split('/')[0..-2].join('/')
            remote = File.join('https://training.galaxyproject.org/training-material', page_dir, path)
            %(<img src="#{remote})
          end
        end

        # Strip out the highlighting as it is bad on some platforms.
        html.gsub!(/<pre class="highlight">/, '<pre style="color: inherit; background: transparent">')
        html.gsub!(/<div class="highlight">/, '<div>')
        html.gsub!(/<code>/, '<code style="color: inherit">')

        # Leading spaces are sometimes interpreted as <pre>s and end up causing
        # paragraphs to be rendered as code. So we wipe out the leading space
        # in front of tags.
        html.gsub!(/^\s*</, '<')

        # There is some weirdness in the processing of $s in Jupyter. After a
        # certain number of them, it will give up, and just render everything
        # like with a '<pre>'. We replace them with an entity to prevent that.
        html.gsub!(/\$/, '&#36;')

        cell['source'] = html
        # 'editable' is actually CoCalc specific but oh well.
        cell['metadata'] = { 'editable' => false, 'collapsed' => false }
      end
      notebook
    end
640
  end
641
end
642

643
Product

Resources

Company