CoCalc -- ari-prep-script.rb

GitHub Repository: galaxyproject/training-material
Path: blob/main/bin/ari-prep-script.rb
¹⁶⁷⁷ views
1
#!/usr/bin/env ruby
2
# frozen_string_literal: true
3

4
# Given a script (json file)
5
# explode it into a directory of individual lines in .txt format (.txt, -subtitle.txt)
6
# For each spoken line convert into mp3
7
# Then re-assemble a script for ffmpeg
8
require "#{__dir__}/ari-synthesize.rb"
9
require 'fileutils'
10
require 'digest'
11
require 'json'
12

13
# Command line arguments: the working directory (must contain script.json)
# followed by the synthesis engine name that is handed to `synthesize`.
dir = ARGV[0]
abort "Usage: #{$PROGRAM_NAME} <directory containing script.json> <engine>" if dir.nil?

script = JSON.parse(File.read(File.join(dir, 'script.json')))
engine = ARGV[1]

# Pause lengths are taken from the script's "voice" section when present;
# they are used below as clip durations in the generated video script.
END_OF_SENTENCE_DURATION = script['voice'].fetch('endOfSentencePause', 0.2)
END_OF_SLIDE_DURATION = script['voice'].fetch('endOfSlidePause', 1.04)
19

20
# Skeleton of the video description that is later serialised to editly.json5.
# Keys are written as strings throughout; JSON.generate emits the same output
# for symbol and string keys, so this only removes the mixed-style literal.
editly = {
  'width' => 1920,
  'height' => 1080,
  'fps' => 30,
  # EDITLY_FAST=true in the environment switches on the 'fast' flag.
  'fast' => ENV.fetch('EDITLY_FAST', 'false') == 'true',
  'outPath' => File.join(dir, 'tmp.mp4'),
  'defaults' => {
    'transition' => {
      'duration' => 0,
    },
  },
  'keepSourceAudio' => true,
  # Filled in below, once every phrase has been synthesized.
  'clips' => []
}
34

35
# Intro slides.
36
# Fast: editly --json editly.json5  126,23s user 5,62s system 126% cpu 1:44,08 total
37
# Slow: editly --json editly.json5  902,71s user 69,27s system 326% cpu 4:57,54 total
38

39
# Format a time offset as a subtitle timestamp (HH:MM:SS plus milliseconds).
#
# The offset is first rounded to whole milliseconds and then split with
# integer arithmetic, so float noise cannot truncate a value such as 2.3s
# down to "02.299". Offsets wrap around at 24 hours.
#
# @param t [Numeric] offset in seconds
# @param fmt [String] 'vtt' selects a '.' before the milliseconds; any other
#   value selects ',' (the form written to the .srt file)
# @return [String] e.g. "00:01:02.500" or "00:01:02,500"
def timefmt(t, fmt)
  total_ms = (t * 1000).round % (24 * 3600 * 1000)
  hours, remainder = total_ms.divmod(3_600_000)
  minutes, remainder = remainder.divmod(60_000)
  seconds, millis = remainder.divmod(1000)

  separator = fmt == 'vtt' ? '.' : ','
  format('%<h>02d:%<m>02d:%<s>02d%<sep>s%<ms>03d',
         h: hours, m: minutes, s: seconds, sep: separator, ms: millis)
end
55

56
# Split a sentence into evenly sized groups of words and give each group an
# equal share of the sentence's duration.
#
# The sentence is cut into the fewest groups that keep every group at or
# below +max_words+ words, with the words spread as evenly as possible.
#
# @param sentence [String] text to split on whitespace
# @param timing [Numeric] total duration of the sentence
# @param max_words [Integer] upper bound on words per group (must be positive)
# @return [Array<Array>] one [words, start, stop] triple per group, where
#   start/stop are offsets relative to the beginning of the sentence; empty
#   when the sentence contains no words
def split_sentence(sentence, timing, max_words = 20)
  words = sentence.split
  # Without this guard an empty sentence divides 0.0 by 0 and raises on NaN.ceil.
  return [] if words.empty?

  group_count = (words.length.to_f / max_words).ceil
  chunk_size = (words.length.to_f / group_count).ceil
  slices = words.each_slice(chunk_size).to_a
  total = slices.length.to_f

  slices.each_with_index.map do |chunk, idx|
    [chunk, timing * (idx / total), timing * ((idx + 1) / total)]
  end
end
66

67
# For each line in the script, iterate
68
# The spoken line is the one that's hashed. (2nd col)
69
# Build the flat list of clips: for every slide (one entry of script['blocks'])
# emit one spoken clip per phrase, a short still between phrases, and a closing
# still that carries the slide transition.
editly['clips'] = script['blocks'].each_with_index.flat_map do |phrases, idx|
  # We're inside a single slide; every clip of the slide shows the same image.
  slide_image = File.join(dir, format('slides.%03d.png', idx))
  # Returns a fresh layer hash on each call so clips never share one object.
  still_layer = lambda do
    {
      'type' => 'image',
      'path' => slide_image,
      'resizeMode' => 'stretch',
      'zoomDirection' => nil,
    }
  end
  voice = script['voice']

  parts = phrases.flat_map do |subtitle|
    # The spoken text is what gets hashed; the digest names all derived files.
    digest = Digest::MD5.hexdigest subtitle

    # Write out our script bits. File.write closes the file when done, unlike
    # a bare File.open whose handle would stay open (and possibly unflushed).
    File.write(File.join(dir, "#{digest}.txt"), subtitle)
    File.write(File.join(dir, "#{digest}-subtitle.txt"), subtitle)

    # Synthesize and copy the results next to the script bits.
    mp3, json, duration = synthesize(subtitle, engine, voice: voice['id'], lang: voice['lang'], neural: voice['neural'])
    puts "\tSynthesizing: #{mp3} #{subtitle}"
    audio_path = File.join(dir, "#{digest}.mp3")
    FileUtils.cp(mp3, audio_path)
    FileUtils.cp(json, File.join(dir, "#{digest}.json"))

    [
      {
        # The symbol key marks spoken clips for the subtitle pass further down.
        :caption => subtitle,
        'duration' => duration,
        'layers' => [{
          'type' => 'audio',
          'path' => audio_path,
        }, still_layer.call]
      }, {
        'duration' => END_OF_SENTENCE_DURATION,
        'layers' => [still_layer.call]
      }
    ]
  end
  # Strip out the last pause
  parts = parts[0..-2]

  # Close the slide with a still lasting END_OF_SLIDE_DURATION that also
  # carries the transition into the next slide.
  parts.push({
               'transition' => {
                 'name' => 'fadegrayscale',
                 'duration' => END_OF_SLIDE_DURATION,
               },
               'duration' => END_OF_SLIDE_DURATION,
               'layers' => [still_layer.call]
             })
  parts
end
133

134
# Walk the clips in playback order, keeping a running offset, and collect one
# [text, start, stop] cue per sentence fragment of every spoken clip.
subtitle_timings = []
offset = 0
editly['clips'].each do |layer|
  if layer.key?(:caption)
    cues = split_sentence(layer[:caption], layer['duration']).map do |words, time_prev, time_next|
      [words.join(' '), offset + time_prev, offset + time_next]
    end
    subtitle_timings.concat(cues)
    offset += layer['duration']
  elsif layer.key?('transition')
    # End of slide: only half of the configured pause is counted as elapsed
    # time (the "true transition time"). Derived from the constant instead of
    # a literal 1.04 so a custom endOfSlidePause keeps the subtitles in sync;
    # 2.0 avoids integer division if the pause is configured as an integer.
    offset += END_OF_SLIDE_DURATION / 2.0
  else
    offset += layer['duration']
  end
end
149

150
# Remove our :caption key.
151
# editly['clips'].map!{|layer|
152
# layer.delete(:caption)
153
# layer
154
# }
155

156
# Write the video script. File.write opens, writes and closes in one step.
File.write(File.join(dir, 'editly.json5'), JSON.generate(editly))

# Write both subtitle files from the same cue list. The block form of
# File.open guarantees each file is flushed and closed, even on error.
File.open(File.join(dir, 'out.vtt'), 'w') do |vtt|
  File.open(File.join(dir, 'out.srt'), 'w') do |srt|
    vtt.write("WEBVTT\n\n\n")
    subtitle_timings.each_with_index do |(sub, time_prev, time_next), index|
      vtt.write("#{index}\n")
      srt.write("#{index}\n")
      vtt.write("#{timefmt(time_prev, 'vtt')} --> #{timefmt(time_next, 'vtt')}\n")
      srt.write("#{timefmt(time_prev, 'srt')} --> #{timefmt(time_next, 'srt')}\n")
      vtt.write("#{sub}\n\n")
      srt.write("#{sub}\n\n")
    end
  end
end
173

174
Product

Resources

Company