CoCalc -- ari-synthesize.rb

GitHub Repository: galaxyproject/training-material
Path: blob/main/bin/ari-synthesize.rb
¹⁶⁷⁷ views
1
#!/usr/bin/env ruby
2
# frozen_string_literal: true
3

4
require 'cgi'
5
require 'json'
6
require 'yaml'
7
require 'optparse'
8
require 'fileutils'
9
require 'open3'
10
require 'tempfile'
11
require 'digest'
12
require 'tmpdir'
13

14
# Tokens that translate() hands back unchanged.
PUNCTUATION = ['-', '--', '@', '%', '‘', '’', ',', '!', '(', ')', '.', "'", '"', '[', ']', ';', ':'].freeze

# YAML file of word replacements, stored next to this script.
ARI_MAP = File.expand_path(File.join(__dir__, 'ari-map.yml'))

# Lower-cased term => replacement, filled from ARI_MAP when the file is loaded.
WORD_MAP = {}
YAML.load_file(ARI_MAP).each_pair { |term, replacement| WORD_MAP[term.downcase] = replacement }

# Directory used for generated speech files; created on load if it is missing.
GTN_CACHE = File.expand_path(File.join(File.expand_path(__dir__), '..', '.jekyll-cache', 'speech'))
FileUtils.mkdir_p(GTN_CACHE)
23

24
# Replace a single token with its WORD_MAP substitution, if it has one.
#
# Whitespace-only tokens and PUNCTUATION entries are returned unchanged. A
# token that is itself a WORD_MAP key is replaced outright. Otherwise the first
# run of ASCII letters/digits in the token is lower-cased, looked up in
# WORD_MAP and swapped in place, keeping the characters around it.
#
# word - String token to translate.
#
# Returns the translated String. A token with no ASCII letter or digit is
# reported on stderr and returned as-is.
def translate(word)
  return word if /^\s+$/.match?(word)
  return word if PUNCTUATION.include?(word)
  return WORD_MAP[word] if WORD_MAP.key?(word)

  m = /([^A-Za-z0-9]*)([A-Za-z0-9]+)([^A-Za-z0-9]*)(.*)/.match(word)
  unless m
    # Diagnostics go to stderr: stdout carries the script's result (the mp3 path).
    warn "Error: #{word}"
    return word
  end

  # Group 2 is mandatory in the pattern, so it is always present after a match.
  core = m[2]
  m[1] + WORD_MAP.fetch(core.downcase, core) + m[3] + m[4]
end
47

48
# Apply translate() to a whole line of text, in two passes.
#
# uncorrected_line - String text to rewrite.
#
# Returns the rewritten String. Leading/trailing whitespace is removed and
# whitespace between words is collapsed to single spaces.
def correct(uncorrected_line)
  # Pass 1: whole whitespace-separated words, to catch the things we can
  # replace directly (esp usegalaxy.*) before punctuation splits them apart.
  whole_words = uncorrected_line.strip.split.map { |word| translate(word) }.join(' ')

  # Pass 2: split on punctuation as well. The capture group keeps every
  # delimiter in the token list, so join puts the line back together with
  # only the replacements changed.
  whole_words.strip
             .split(/([ ‘’,'".:;!`()])/)
             .reject(&:empty?)
             .map { |token| translate(token) }
             .join
end
59

60
# Render `line` to speech with the chosen engine and store the audio at `mp3`.
#
# engine - 'aws' runs `aws polly synthesize-speech`; 'mozilla' fetches audio
#          from the TTS server at localhost:5002 with curl and converts it to
#          mp3 with ffmpeg. Any other value does nothing.
# line   - String text to speak.
# mp3    - String path of the mp3 file to produce.
# voice  - String passed as --voice-id (aws only).
# lang   - String passed as --language-code (aws only).
# neural - truthy selects the 'neural' engine, otherwise 'standard' (aws only).
#
# Returns nil. Prints the failing command's stderr and exits the process with
# status 1 when an external command does not finish successfully.
def call_engine(engine, line, mp3, voice, lang, neural)
  case engine
  when 'aws'
    polly_engine = neural ? 'neural' : 'standard'
    command = ['aws', 'polly', 'synthesize-speech',
               '--engine', polly_engine,
               '--language-code', lang,
               '--voice-id', voice,
               '--output-format', 'mp3',
               '--text', line,
               mp3]
    _, stderr, status = Open3.capture3(*command)
    # success? is also falsy when the command was killed by a signal, which an
    # `exited? && exitstatus.positive?` test would let through as a success.
    unless status.success?
      puts "ERROR: #{stderr}"
      puts "ERROR: #{status}"
      exit 1
    end
  when 'mozilla'
    raw = Tempfile.new('synth-raw')
    begin
      _, stderr, status = Open3.capture3('curl', '--silent', '-G', '--output', raw.path,
                                         "http://localhost:5002/api/tts?text=#{CGI.escape(line)}")
      unless status.success?
        puts "ERROR: #{stderr}"
        exit 1
      end

      _, stderr, status = Open3.capture3('ffmpeg', '-loglevel', 'error', '-i', raw.path, '-y', mp3)
      unless status.success?
        puts "ERROR: #{stderr}"
        exit 1
      end
    ensure
      # Remove the intermediate download, including when we leave via exit.
      raw.close!
    end
  end
end
93

94
# Ask ffprobe how long an audio file is.
#
# mp3 - String path of the file to inspect.
#
# Returns the "duration" entry of ffprobe's "format" section as a Float
# (seconds, per ffprobe's documentation).
# Raises RuntimeError when ffprobe does not finish successfully.
def find_duration(mp3)
  stdout, status = Open3.capture2('ffprobe', '-loglevel', 'error', '-show_format', '-show_streams',
                                  '-print_format', 'json', '-i', mp3)
  # Without this check a failed probe only surfaces later as a nil lookup or a
  # JSON parse error that does not say which file was at fault.
  raise "ffprobe could not read #{mp3}" unless status.success?

  JSON.parse(stdout)['format']['duration'].to_f
end
100

101
# Turn a line of text into an mp3 plus a small JSON metadata file.
#
# uncorrected_line - String text to speak; it is passed through correct() first.
# engine           - String engine name handed to call_engine.
# voice:           - String voice id (also part of the cache file name).
# lang:            - String language code handed to call_engine.
# neural:          - Boolean handed to call_engine.
# output:          - String path for the mp3, or nil to use a file in GTN_CACHE
#                    named after the engine, the MD5 of the corrected line and
#                    the voice.
#
# Returns [mp3_path, json_path, duration]. If the mp3 already exists nothing is
# generated: the cached duration is read from the JSON file, or 0.0 is returned
# when an explicit output path was given.
def synthesize(uncorrected_line, engine, voice: 'Amy', lang: 'en-GB', neural: true, output: nil)
  line = correct(uncorrected_line)

  if output.nil?
    stem = File.join(GTN_CACHE, "#{engine}-#{Digest::MD5.hexdigest(line)}-#{voice}")
    mp3 = "#{stem}.mp3"
    json = "#{stem}.json"
    return [mp3, json, JSON.parse(File.read(json))['end'].to_f] if File.file?(mp3)
  else
    mp3 = output
    json = "#{output}.json"
    return [mp3, json, 0.0] if File.file?(output) # TODO: report the real duration
  end

  # Call our engine
  call_engine(engine, line, mp3, voice, lang, neural)
  duration = find_duration(mp3)

  if line.length < 200 && duration > 27
    # Helena managed to find a specific bad string which, when fed to Mozilla's
    # TTS would generate
    #
    # In: Some important terms you should know.
    # Out Some important terms you should know know know know know know know know know know know know know know ...
    #
    # So we check that the duration hasn't done something crazy, and if it
    # has, add something to the end which seems to short-circuit that error.
    #
    # I've reported this upstream but the response was not useful, apparently
    # this is an "expected failure mode".
    #
    # https://github.com/synesthesiam/docker-mozillatts/issues/9
    # https://discourse.mozilla.org/t/sentences-which-trigger-an-endless-loop/72261/8
    warn 'Strange: line was too long'
    # The retry must pass the same voice settings; call_engine has no defaults.
    call_engine(engine, "#{line}.", mp3, voice, lang, neural)
    duration = find_duration(mp3)
  end

  if line.length < 200 && duration > 27
    # Or maybe they just wrote a super long sentence. Or maybe we need to update the cutoff time.
    warn "ERROR: #{duration} of line is bad: #{line}"
  end

  # Now collect metadata for JSON
  File.write(json, JSON.generate({ time: 0, type: 'sentence', start: 0, end: duration, value: line }))

  [mp3, json, duration]
end
156

157
# Parse the command line (ARGV) for the synthesizer script.
#
# An engine flag (--aws or --mozilla) and --file are mandatory; when either is
# missing an error is printed and the process exits with status 1. If both
# engine flags are given, aws wins.
#
# Returns [sentence, engine, options]: the contents of the --file with the
# trailing newline removed, the engine name ('aws' or 'mozilla'), and the Hash
# of parsed options (with defaults for :neural, :voice and :lang).
def parseOptions
  options = { neural: true, voice: 'Amy', lang: 'en-GB' }

  OptionParser.new do |opts|
    opts.banner = 'Usage: ari-synthesize.rb [options]'

    opts.on('--aws', 'Use AWS Polly') { |v| options[:aws] = v }
    opts.on('--mozilla', 'Use MozillaTTS') { |v| options[:mozilla] = v }
    opts.on('--non-neural', '[AWS] Non-neural voice') { |_v| options[:neural] = false }
    opts.on('--voice=VOICE', '[AWS] Voice ID') { |n| options[:voice] = n }
    opts.on('--lang=LANG', '[AWS] Language code') { |n| options[:lang] = n }
    opts.on('-fFILE', '--file=FILE', 'File containing line of text to speak') { |n| options[:file] = n }
    opts.on('-oFILE', '--output=FILE', 'Location to save the file in (defaults to auto-generated location)') do |n|
      options[:output] = n
    end
    opts.on('-v', '--[no-]verbose', 'Run verbosely') { |v| options[:verbose] = v }
  end.parse!

  unless options[:aws] || options[:mozilla]
    puts 'ERROR: You must use aws or mozilla'
    exit 1
  end

  unless options[:file]
    puts 'ERROR: You must provide a file with a single sentence to speak'
    exit 1
  end

  sentence = File.read(options[:file]).chomp
  # One of the two flags is set at this point, so anything that is not aws is mozilla.
  engine = options[:aws] ? 'aws' : 'mozilla'

  [sentence, engine, options]
end
218

219
# Command-line entry point: only runs when this file is executed directly.
# Parses ARGV, synthesizes the sentence and prints the path of the resulting
# mp3 on stdout.
if __FILE__ == $PROGRAM_NAME
  sentence, engine, options = parseOptions
  # Keys missing from options (e.g. :output) fall back to synthesize's defaults.
  mp3, = synthesize(sentence, engine, **options.slice(:voice, :lang, :neural, :output))
  puts mp3
end
225

226
Product

Resources

Company