require "#{__dir__}/ari-synthesize.rb"
require 'fileutils'
require 'digest'
require 'json'
# Command-line arguments: first the working directory that holds script.json,
# then the engine name that is passed on to synthesize.
dir, engine = ARGV
script = JSON.parse(File.read(File.join(dir, 'script.json')))

# Pause lengths used between sentences and at the end of a slide. The voice
# section of the script may override the defaults.
voice_settings = script['voice']
END_OF_SENTENCE_DURATION = voice_settings.fetch('endOfSentencePause', 0.2)
END_OF_SLIDE_DURATION = voice_settings.fetch('endOfSlidePause', 1.04)
# Render specification that is later serialised with JSON.generate.
# 'clips' starts out empty and is filled in once the narration is synthesized.
editly = {
  'width' => 1920,
  'height' => 1080,
  'fps' => 30,
  # Only the literal string "true" in EDITLY_FAST switches fast mode on.
  'fast' => ENV['EDITLY_FAST'] == 'true',
  'outPath' => File.join(dir, 'tmp.mp4'),
  'defaults' => { 'transition' => { duration: 0 } },
  keepSourceAudio: true,
  'clips' => [],
}
# Formats a time offset as a subtitle timestamp with millisecond precision.
#
# @param t [Numeric] offset in seconds; values wrap around after 24 hours
# @param fmt [String] 'vtt' selects the dot-separated form (HH:MM:SS.mmm);
#   any other value selects the comma-separated form (HH:MM:SS,mmm)
# @return [String] the zero-padded timestamp
def timefmt(t, fmt)
  # Convert to whole milliseconds up front. Rounding here keeps float residue
  # from being truncated by %d (1.2 s would otherwise print as ...01.199) and
  # lets a rounded-up millisecond carry into seconds, minutes and hours.
  total_ms = (t * 1000).round % (24 * 3600 * 1000)
  hours, rest = total_ms.divmod(3_600_000)
  minutes, rest = rest.divmod(60_000)
  seconds, ms = rest.divmod(1000)
  separator = fmt == 'vtt' ? '.' : ','
  format('%<h>02d:%<m>02d:%<s>02d%<sep>s%<ms>03d',
         h: hours, m: minutes, s: seconds, sep: separator, ms: ms)
end
# Splits a sentence into roughly equal word chunks and gives each chunk an
# equal share of the sentence's duration.
#
# @param sentence [String] text to split on whitespace
# @param timing [Numeric] total duration of the sentence
# @param max_words [Integer] upper bound on words per chunk (must be positive)
# @return [Array<Array>] one [words, start, finish] triple per chunk, where
#   words is an Array of Strings and start/finish are offsets within timing;
#   empty when the sentence contains no words
def split_sentence(sentence, timing, max_words: 20)
  words = sentence.split
  # Without this guard an empty sentence divides zero by zero and the
  # resulting NaN cannot be rounded (FloatDomainError).
  return [] if words.empty?

  # Spread the words evenly over the fewest chunks that respect max_words,
  # rather than filling chunks to the limit and leaving a short tail.
  wanted_chunks = (words.length.to_f / max_words).ceil
  chunk_size = (words.length.to_f / wanted_chunks).ceil
  chunks = words.each_slice(chunk_size).to_a
  total = chunks.length.to_f

  chunks.each_with_index.map do |chunk, idx|
    [chunk, timing * (idx / total), timing * ((idx + 1) / total)]
  end
end
# Builds the image layer that shows the slide belonging to block number idx.
slide_layer = lambda do |idx|
  {
    'type' => 'image',
    'path' => File.join(dir, format('slides.%03d.png', idx)),
    'resizeMode' => 'stretch',
    'zoomDirection' => nil,
  }
end

# One run of clips per block: for every phrase a narrated clip followed by a
# short pause, with the last pause replaced by a longer end-of-slide clip that
# carries the transition.
editly['clips'] = script['blocks'].each_with_index.flat_map do |phrases, idx|
  clips = phrases.flat_map do |subtitle|
    digest = Digest::MD5.hexdigest(subtitle)
    # File.write closes the file itself; the previous File.open calls left
    # their handles open for the rest of the run.
    File.write(File.join(dir, "#{digest}.txt"), subtitle)
    File.write(File.join(dir, "#{digest}-subtitle.txt"), subtitle)

    voice = script['voice']
    mp3, json, duration = synthesize(subtitle, engine, voice: voice['id'], lang: voice['lang'], neural: voice['neural'])
    puts "\tSynthesizing: #{mp3} #{subtitle}"

    # Copy the synthesized files next to the slides under a content-derived name.
    audio_path = File.join(dir, "#{digest}.mp3")
    FileUtils.cp(mp3, audio_path)
    FileUtils.cp(json, File.join(dir, "#{digest}.json"))

    [
      {
        # The symbol key marks narrated clips; the subtitle pass looks for it.
        :caption => subtitle,
        'duration' => duration,
        'layers' => [{ 'type' => 'audio', 'path' => audio_path }, slide_layer.call(idx)],
      },
      {
        'duration' => END_OF_SENTENCE_DURATION,
        'layers' => [slide_layer.call(idx)],
      },
    ]
  end

  # Drop the trailing sentence pause; the end-of-slide clip takes its place.
  clips.pop
  clips << {
    'transition' => {
      'name' => 'fadegrayscale',
      'duration' => END_OF_SLIDE_DURATION,
    },
    'duration' => END_OF_SLIDE_DURATION,
    'layers' => [slide_layer.call(idx)],
  }
end
# Walks the clips in playback order and records, for every narrated clip, the
# start and end offset of each subtitle chunk.
subtitle_timings = []
offset = 0
editly['clips'].each do |clip|
  if clip.key?(:caption)
    split_sentence(clip[:caption], clip['duration']).each do |words, chunk_start, chunk_end|
      subtitle_timings << [words.join(' '), offset + chunk_start, offset + chunk_end]
    end
    offset += clip['duration']
  elsif clip.key?('transition')
    # A transition clip advances the clock by half of its transition time.
    # Read the value from the clip instead of hard-coding 1.04 so a configured
    # end-of-slide pause keeps the subtitles in sync; 2.0 avoids integer
    # division when the configured value is a whole number.
    offset += clip['transition']['duration'] / 2.0
  else
    offset += clip['duration']
  end
end
# Write the editly specification. File.write closes the file when done.
File.write(File.join(dir, 'editly.json5'), JSON.generate(editly))

# Write the subtitles in both formats. The block form of File.open closes each
# file, and so flushes it, even if formatting raises part-way through.
File.open(File.join(dir, 'out.vtt'), 'w') do |vtt|
  File.open(File.join(dir, 'out.srt'), 'w') do |srt|
    vtt.write("WEBVTT\n\n\n")
    # NOTE(review): cues are numbered from 0; SRT sequence numbers
    # conventionally start at 1 - confirm consumers accept this.
    subtitle_timings.each_with_index do |(text, time_prev, time_next), index|
      vtt.write("#{index}\n")
      srt.write("#{index}\n")
      vtt.write("#{timefmt(time_prev, 'vtt')} --> #{timefmt(time_next, 'vtt')}\n")
      srt.write("#{timefmt(time_prev, 'srt')} --> #{timefmt(time_next, 'srt')}\n")
      vtt.write("#{text}\n\n")
      srt.write("#{text}\n\n")
    end
  end
end