Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
galaxyproject
GitHub Repository: galaxyproject/training-material
Path: blob/main/bin/ari-synthesize.rb
1677 views
1
#!/usr/bin/env ruby
2
# frozen_string_literal: true
3
4
require 'cgi'
5
require 'json'
6
require 'yaml'
7
require 'optparse'
8
require 'fileutils'
9
require 'open3'
10
require 'tempfile'
11
require 'digest'
12
require 'tmpdir'
13
14
# Tokens that translate() hands back unchanged.
PUNCTUATION = ['-', '--', '@', '%', '‘', '’', ',', '!', '(', ')', '.', "'", '"', '[', ']', ';', ':'].freeze

# The replacement table is read from ari-map.yml, located beside this script.
ARI_MAP = File.expand_path('ari-map.yml', __dir__)

# Replacement table keyed by lower-cased term, filled once at load time.
WORD_MAP = {}
YAML.load_file(ARI_MAP).each { |term, replacement| WORD_MAP[term.downcase] = replacement }

# Directory holding generated audio and metadata: <script dir>/../.jekyll-cache/speech.
# It is created on load if it does not exist yet.
GTN_CACHE = File.expand_path(File.join('..', '.jekyll-cache', 'speech'), __dir__)
FileUtils.mkdir_p(GTN_CACHE)
23
24
# Replace a single token with its entry from WORD_MAP, if it has one.
#
# Whitespace-only tokens and members of PUNCTUATION are returned untouched.
# A token that is itself a WORD_MAP key is replaced outright. Otherwise the
# first alphanumeric run in the token is looked up by its lower-cased form and
# swapped in, while the characters around it (brackets, trailing commas, ...)
# are kept as they were.
#
# @param word [String] one token of a sentence
# @return [String] the token, with a known term substituted where possible
def translate(word)
  return word if /^\s+$/.match(word)
  return word if PUNCTUATION.include?(word)
  return WORD_MAP[word] if WORD_MAP.key?(word)

  # Groups: 1 = leading non-alphanumerics, 2 = first alphanumeric run,
  # 3 = non-alphanumerics right after it, 4 = whatever remains.
  m = /([^A-Za-z0-9]*)([A-Za-z0-9]+)([^A-Za-z0-9]*)(.*)/.match(word)

  unless m
    # No alphanumeric content at all. Report on stderr: this script prints the
    # resulting mp3 path on stdout, which diagnostics should not pollute.
    warn "Error: #{word}"
    return word
  end

  # Group 2 is mandatory in the pattern, so it is always present here.
  core = m[2]
  "#{m[1]}#{WORD_MAP.fetch(core.downcase, core)}#{m[3]}#{m[4]}"
end
47
48
# Run translate() over a whole sentence, in two passes.
#
# @param uncorrected_line [String] the sentence as written
# @return [String] the sentence with known terms substituted
def correct(uncorrected_line)
  # Pass 1: whitespace-delimited tokens, so terms that contain punctuation
  # themselves (esp usegalaxy.*) can be replaced directly.
  first_pass = uncorrected_line.strip.split.map { |token| translate(token) }.join(' ')

  # Pass 2: split on spaces and punctuation. The capture group keeps every
  # separator as its own token, so a plain join restores the original spacing.
  first_pass.strip.split(/([ ‘’,'".:;!`()])/).reject(&:empty?).map { |token| translate(token) }.join
end
59
60
# Run the selected text-to-speech backend and write its audio to +mp3+.
#
# 'aws' shells out to `aws polly synthesize-speech`. 'mozilla' fetches audio
# from the TTS HTTP API on localhost:5002 with curl and converts it to mp3 with
# ffmpeg. When an external command does not finish successfully, its stderr is
# printed and the whole process exits with status 1.
#
# @param engine [String] 'aws' or 'mozilla'
# @param line [String] text to speak
# @param mp3 [String] path of the audio file to write
# @param voice [String] passed as --voice-id (only used by 'aws')
# @param lang [String] passed as --language-code (only used by 'aws')
# @param neural [Boolean] selects the 'neural' engine, otherwise 'standard' (only used by 'aws')
# @raise [ArgumentError] if +engine+ is neither 'aws' nor 'mozilla'
def call_engine(engine, line, mp3, voice, lang, neural)
  case engine
  when 'aws'
    awseng = neural ? 'neural' : 'standard'

    # Synthesize
    args = ['aws', 'polly', 'synthesize-speech', '--engine', awseng, '--language-code', lang, '--voice-id', voice,
            '--output-format', 'mp3', '--text', line, mp3]
    _, stderr, status = Open3.capture3(*args)
    # success? is nil for a signal-terminated child, so that counts as a failure too.
    unless status.success?
      puts "ERROR: #{stderr}"
      puts "ERROR: #{status}"
      exit 1
    end
  when 'mozilla'
    raw = Tempfile.new('synth-raw')
    begin
      _, stderr, status = Open3.capture3('curl', '--silent', '-G', '--output', raw.path,
                                         "http://localhost:5002/api/tts?text=#{CGI.escape(line)}")
      unless status.success?
        puts "ERROR: #{stderr}"
        exit 1
      end

      _, stderr, status = Open3.capture3('ffmpeg', '-loglevel', 'error', '-i', raw.path, '-y', mp3)
      unless status.success?
        puts "ERROR: #{stderr}"
        exit 1
      end
    ensure
      # Close and delete the intermediate download, also on the exit paths above.
      raw.close!
    end
  else
    raise ArgumentError, "Unknown engine: #{engine.inspect}"
  end
end
93
94
# Ask ffprobe for the duration of an audio file.
#
# @param mp3 [String] path of the file to inspect
# @return [Float] the `format.duration` value from ffprobe's JSON report
# @raise [RuntimeError] if ffprobe does not finish successfully
# @raise [KeyError] if the report has no `format.duration` entry
def find_duration(mp3)
  stdout, status = Open3.capture2('ffprobe', '-loglevel', 'error', '-show_format', '-show_streams',
                                  '-print_format', 'json', '-i', mp3)
  # Fail with a message naming the file rather than a parse error on empty output.
  raise "ffprobe failed for #{mp3} (#{status})" unless status.success?

  JSON.parse(stdout).fetch('format').fetch('duration').to_f
end
100
101
# Turn one sentence into an mp3 plus a small JSON metadata file.
#
# The sentence is first passed through correct(). Without +output+, the files
# are stored in GTN_CACHE under a name built from the engine, the MD5 of the
# corrected sentence and the voice, and an existing pair is reused instead of
# calling the engine again. With +output+, the audio goes to that path and the
# metadata to "<output>.json"; an existing output file is returned as-is.
#
# @param uncorrected_line [String] sentence to speak
# @param engine [String] backend name handed to call_engine
# @param voice [String] voice handed to call_engine (also part of the cache name)
# @param lang [String] language code handed to call_engine
# @param neural [Boolean] neural flag handed to call_engine
# @param output [String, nil] explicit mp3 path, or nil to use the cache
# @return [Array(String, String, Float)] mp3 path, JSON path, duration
def synthesize(uncorrected_line, engine, voice: 'Amy', lang: 'en-GB', neural: true, output: nil)
  line = correct(uncorrected_line)
  digest = Digest::MD5.hexdigest line

  if output.nil?
    mp3 = File.join(GTN_CACHE, "#{engine}-#{digest}-#{voice}.mp3")
    json = File.join(GTN_CACHE, "#{engine}-#{digest}-#{voice}.json")
    # Only trust the cache when the metadata is there as well; an mp3 without
    # its JSON is regenerated instead of crashing on the missing file.
    return mp3, json, JSON.parse(File.read(json))['end'].to_f if File.file?(mp3) && File.file?(json)
  else
    mp3 = output
    json = "#{output}.json"
    return mp3, json, 0.0 if File.file?(output) # Todo
  end

  # Call our engine
  call_engine(engine, line, mp3, voice, lang, neural)
  duration = find_duration(mp3)

  if line.length < 200 && duration > 27
    # Helena managed to find a specific bad string which, when fed to Mozilla's
    # TTS would generate
    #
    # In: Some important terms you should know.
    # Out Some important terms you should know know know know know know know know know know know know know know ...
    #
    # So we put in a check that the duration hasn't done something crazy, and
    # if it is add something to the end which seems to short-circuit that
    # error.
    #
    # I've reported this upstream but the response was not useful, apparently
    # this is an "expected failure mode".
    #
    # https://github.com/synesthesiam/docker-mozillatts/issues/9
    # https://discourse.mozilla.org/t/sentences-which-trigger-an-endless-loop/72261/8
    warn 'Strange: line was too long'
    # Retry with the same voice settings and a trailing full stop.
    call_engine(engine, "#{line}.", mp3, voice, lang, neural)
    duration = find_duration(mp3)
  end

  if line.length < 200 && duration > 27
    # Or maybe they just wrote a super long sentence. Or maybe we need to update the cutoff time.
    warn "ERROR: #{duration} of line is bad: #{line}"
  end

  # Now collect metadata for JSON
  File.write(json, JSON.generate({ time: 0, type: 'sentence', start: 0, end: duration, value: line }))

  [mp3, json, duration]
end
156
157
# Parse the command line of this script.
#
# Prints an error and exits with status 1 unless an engine (--aws or
# --mozilla) and an input file (--file) were given.
#
# @param argv [Array<String>] arguments to parse; recognised options are
#   removed from it. Defaults to the process's ARGV.
# @return [Array(String, String, Hash)] the sentence read from --file (trailing
#   newline removed), the engine name ('aws' wins if both flags are given), and
#   the options hash (:neural, :voice and :lang are always present)
def parseOptions(argv = ARGV)
  options = { neural: true, voice: 'Amy', lang: 'en-GB' }

  OptionParser.new do |opts|
    opts.banner = 'Usage: ari-synthesize.rb [options]'

    opts.on('--aws', 'Use AWS Polly') { |v| options[:aws] = v }
    opts.on('--mozilla', 'Use MozillaTTS') { |v| options[:mozilla] = v }
    opts.on('--non-neural', '[AWS] Non-neural voice') { |_v| options[:neural] = false }
    opts.on('--voice=VOICE', '[AWS] Voice ID') { |n| options[:voice] = n }
    opts.on('--lang=LANG', '[AWS] Language code') { |n| options[:lang] = n }
    opts.on('-fFILE', '--file=FILE', 'File containing line of text to speak') { |n| options[:file] = n }
    opts.on('-oFILE', '--output=FILE', 'Location to save the file in (defaults to auto-generated location)') do |n|
      options[:output] = n
    end
    opts.on('-v', '--[no-]verbose', 'Run verbosely') { |v| options[:verbose] = v }
  end.parse!(argv)

  unless options[:aws] || options[:mozilla]
    puts 'ERROR: You must use aws or mozilla'
    exit 1
  end

  unless options[:file]
    puts 'ERROR: You must provide a file with a single sentence to speak'
    exit 1
  end

  sentence = File.read(options[:file]).chomp
  # One of the two flags is guaranteed to be set by the check above.
  engine = options[:aws] ? 'aws' : 'mozilla'

  [sentence, engine, options]
end
218
219
# Command-line entry point: only runs when this file is executed directly.
# Speaks the sentence described by the options and prints the mp3 path.
if $PROGRAM_NAME == __FILE__
  text, tts_engine, cli_options = parseOptions
  voice, lang, neural, output = cli_options.values_at(:voice, :lang, :neural, :output)
  result = synthesize(text, tts_engine, voice: voice, lang: lang, neural: neural, output: output)
  puts result.first
end
225
226