Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
csc-training
GitHub Repository: csc-training/csc-env-eff
Path: blob/master/_slides/SRTCode/mdslideTOtranscribt.py
696 views
1
# Heavily modified from the script created by Aaron Cheung under MIT License
2
# Original repo: https://github.com/AaronCheung430/txt-TO-srt
3
4
import sys, os # for folders and os info and stuff
5
import argparse # for command line arguments
6
import re # for regex
7
8
from timecode import Timecode
9
10
def read_file(input_file, verbose=False):
    """
    Read the input file and return its lines.

    Parameters:
        input_file: Path of the slides markdown file to read.
        verbose: When true, print the first six lines that were read.

    Returns:
        A list with every line of the file, line endings included.
    """

    # The context manager closes the file even if reading raises
    with open(input_file, "r") as md_file:
        md_lines = md_file.readlines()

    # Optionally print first few lines
    if verbose:
        print("Here's what the input file says: \n", md_lines[:6])

    return md_lines
28
29
def collect_caps_VO(md_lines, verbose=False):
    """
    Go through the lines of the slides.md and collect caption and voiceover content.

    Lines between an opening ":::info (speech)" line and the closing ":::" line
    are copied to both the captions and the voiceover. Lines starting with "# "
    (slide titles) are added to the voiceover as an empty line followed by
    "<title>.".

    Marker lines are recognised regardless of trailing whitespace or a missing
    final newline, and a title on the last line of a file without a trailing
    newline keeps its last character.

    Parameters:
        md_lines: Iterable of the lines of the slides file.
        verbose: When true, print what is being collected.

    Returns:
        Tuple (caps, VO): list of caption lines and list of voiceover lines.
    """

    speech_open = ":::info (speech)"
    speech_close = ":::"

    # Lists of collected caption and voiceover content
    caps = []
    VO = []

    # By default no content is taken
    in_speech = False

    for line in md_lines:
        # Compare markers without the line ending / trailing blanks
        marker = line.rstrip()
        is_content = False

        if not in_speech:
            # Outside a speech block: only look for the opening marker
            if marker == speech_open:
                if verbose:
                    print("Recording captions")
                    print("Recording speech txt")
                in_speech = True
        elif marker == speech_close:
            in_speech = False
        else:
            is_content = True

        if is_content:
            caps.append(line)
            if verbose:
                print("To caps:", line)

        # Collect slide titles to VO
        if line.startswith('# '):
            if verbose:
                print("Title found:", line)
            # Drop the line ending only when there is one
            heading = line[:-1] if line.endswith("\n") else line
            # Remove the '#' and add a dot before the new line
            line = "{}.\n".format(heading[1:])
            VO.append("\n")
            # Remove the leading space left over from "# "
            VO.append(line[1:])

        # A title inside a speech block is additionally copied as ordinary
        # speech content (without the '#'), as before
        if is_content:
            VO.append(line)
            if verbose:
                print("To speech:", line)

    return caps, VO
86
87
def create_SRT(caps, verbose=False):
    """
    Turn caption lines into the lines of an SRT file.

    Every caption becomes one numbered SRT entry. The entry's end time is its
    start time plus the '00:00:05.000' step, and the next entry starts at the
    previous start time plus the '00:00:06.000' step.

    Parameters:
        caps: List of caption lines.
        verbose: Accepted for symmetry with the other functions; not used here.

    Returns:
        List of SRT lines: index, time range, caption text and a blank line
        for every caption.
    """

    # Running start time and the two fixed steps, all in fractional notation
    start = Timecode('30', '00:00:00.000')
    shown_for = Timecode('30', '00:00:05.000')
    spacing = Timecode('30', '00:00:06.000')
    for code in (start, shown_for, spacing):
        code.set_fractional(True)

    srt = []
    for number, text in enumerate(caps, start=1):
        # End of this caption
        end = start + shown_for
        end.set_fractional(True)

        # Index line, time range line, caption text, separating blank line
        srt.extend((
            "{}\n".format(number),
            "{} --> {}\n".format(start, end),
            text,
            "\n",
        ))

        # Move on to the start of the next caption
        start = start + spacing
        start.set_fractional(True)

    return srt
129
130
# Ordered (old, new) word replacements for the macOS text-to-speech text.
# They are applied one after another, so the order matters.
_MAC_REPLACEMENTS = (
    (" {.title}", ""),
    (" 1/2", ""),
    (" 2/2", ""),
    ("Puhti", "Puyh.ti"),
    ("puhti", "puyh.ti"),
    ("Mahti", "Mah.ti"),
    ("mahti", "mah.ti"),
    ("Pouta", "Pawta"),
    ("pouta", "pawta"),
    ("Kaivos", "Kiavos"),
    ("kaivos", "kiavos"),
    ("rtools", "RTools"),
    ("rsync", "RSync"),
    ("rclone", "RClone"),
    ("bashrc", "bashRC"),
    ("NVMe", "NVME"),
    ("SBATCH", "SBatch"),
    ("sbatch", "SBatch"),
    ("squeue", "SQueue"),
    ("scontrol", "SControl"),
    ("scancel", "SCancel"),
    (".sh", "dot SH"),
    ("-u", "dash U"),
    ("$USER", "dollar user"),
    ("JOBID", "jobID"),
    ("job ID", "jobID"),
    ("YOURCSCUSERNAME", "your CSC username"),
    ("MyCSC", "my CSC"),
)

# Replacements for the generic voiceover text applied BEFORE the regex rules.
# NOTE: "Mahti" -> "Mah:ti" and "Kaivos" -> "Kai:voss" were only used for
# speechelo non-PRO voices and are intentionally not applied.
_GENERIC_REPLACEMENTS_BEFORE = (
    (" {.title}", ""),
    (" 1/2", ""),
    (" 2/2", ""),
    ("Puhti", "Pouhti"),
    ("puhti", "Pouhti"),
    ("Pouta", "Pauwta"),
    ("pouta", "Pauwta"),
    ("mahti", "Mahti"),
    ("kaivos", "Kaivos"),
    ("rtools", "r-tools"),
    ("rclone", "r-clone"),
    ("rsync", "r-sync"),
    ("bashrc", "bash RC"),
    ("NVMe", "NVME"),
    ("SBATCH", "S-BATCH"),
    ("sbatch", "s-batch"),
    ("squeue", "S-Queue"),
    ("scontrol", "S-Control"),
    ("scancel", "S-Cancel"),
    ("-u", "dash U"),
)

# Replacements for the generic voiceover text applied AFTER the regex rules
_GENERIC_REPLACEMENTS_AFTER = (
    ("$USER", "dollar user"),
    ("JOBID", "job ID"),
    ("YOURCSCUSERNAME", "your CSC username"),
)

# ".x" -> " dot x": a dot directly followed by a word character
_DOT_BEFORE_WORD = re.compile(r'\.\b')
# " --x" -> " dash dash x". A single-dash rule is deliberately absent because
# it does not work with text like "CSC-accounts and -projects".
_DOUBLE_DASH_BEFORE_WORD = re.compile(r' --\b')


def _replace_all(text, replacements):
    """Apply the (old, new) replacements to text in the given order."""
    for old, new in replacements:
        text = text.replace(old, new)
    return text


def create_VOs(VO, verbose=False):
    """
    Go through voiceover content and generate the voiceover texts.

    Two versions are produced because macOS has its own Text-to-Speech engine
    that needs different spellings: the macOS version also gets a
    "[[slnc 1000]]" silence command before each line's text and an extra
    new line after it.

    Parameters:
        VO: Iterable of voiceover lines.
        verbose: Accepted for symmetry with the other functions; not used here.

    Returns:
        Tuple (speech, speech_mac): lines of the generic voiceover file and
        lines of the macOS voiceover file.
    """
    # Materialise once so the content can be walked through twice
    vo_lines = list(VO)

    # Lines of the macOS voiceover file
    speech_mac = []
    for line in vo_lines:
        speech_mac.append("[[slnc 1000]]")
        speech_mac.append(_replace_all(line, _MAC_REPLACEMENTS))
        speech_mac.append("\n")

    # Lines of the generic voiceover file
    speech = []
    for line in vo_lines:
        line = _replace_all(line, _GENERIC_REPLACEMENTS_BEFORE)
        line = _DOT_BEFORE_WORD.sub(' dot ', line)
        line = _DOUBLE_DASH_BEFORE_WORD.sub(' dash dash ', line)
        line = _replace_all(line, _GENERIC_REPLACEMENTS_AFTER)
        speech.append(line)

    return speech, speech_mac
223
224
225
def _write_lines(path, lines):
    """Write the given lines to a new text file at path."""
    # The context manager closes the file even if writing raises
    with open(path, "w") as out_file:
        out_file.writelines(lines)


def main(argv, verbose=True):
    """
    Generate caption, transcript and voiceover files from a slides.md file.

    The output goes to a folder "<input name>_files" next to the input file:
    "<name>_captions.srt", "<name>_transcribt.txt", "<name>_voiceover.txt" and
    "<name>_voiceover_mac.txt". On macOS the 'say' command is additionally run
    to produce "<name>_voiceover.aiff".

    Parameters:
        argv: Full argument vector, program name first (e.g. sys.argv). When
            empty or None the arguments are taken from sys.argv.
        verbose: When true, print progress information.

    Exits with status 1 when the input file does not exist.
    """
    # Local import: only needed for the macOS 'say' call
    import subprocess

    # Parser for arguments
    parseri = argparse.ArgumentParser(description='Generate voiceover and captions from slide.md file that includes tags.')
    parseri.add_argument('mdslide_path', metavar='path_to_input_md', type=str, help='Path to the slide.md')
    # NOTE: an option for skipping the new output folder was tried but never
    # got to work; the script always creates a new target folder.
    args = parseri.parse_args(argv[1:] if argv else None)

    # Get the path for slides.md
    input_file = args.mdslide_path
    input_filename = os.path.basename(input_file)
    input_path = os.path.dirname(input_file)
    # If path is not given in args, current folder is assumed
    if not input_path or input_path == '.':
        input_path = os.getcwd()

    # Stop here (with a failing exit status) if input file does not exist
    if not os.path.isfile(os.path.join(input_path, input_filename)):
        print("The input file does not exist. Check the input argument!!!")
        sys.exit(1)

    # Original name without extension is the base of all output names
    base_name = os.path.splitext(input_filename)[0]

    # Output folder is created inside the folder of the input file
    output_folder = "{0}_files".format(base_name)
    output_path = os.path.join(input_path, output_folder)
    os.makedirs(output_path, exist_ok=True)

    # Read input file slides.md and output as lines
    md_lines = read_file(input_file, verbose=verbose)
    # Go through lines and collect captions and VO content
    captions, VOs = collect_caps_VO(md_lines, verbose=False)
    # Go through caption content and generate SRT lines
    srt_lines = create_SRT(captions, verbose=verbose)
    # Go through voiceover content and generate VO lines
    VO_lines, VO_lines_mac = create_VOs(VOs, verbose=verbose)

    # Parts of slides.md that goes to captions go to this file
    captions_filename = "{0}_captions.srt".format(base_name)
    # Parts of slides.md that goes to voiceover go to these files
    VOs_filename = "{0}_voiceover.txt".format(base_name)
    VOs_filename_mac = "{0}_voiceover_mac.txt".format(base_name)
    audio_filename = "{0}_voiceover.aiff".format(base_name)  # Used only in MacOS
    # Parts of slides.md that goes to transcribt go to this file
    transcribt_filename = "{0}_transcribt.txt".format(base_name)

    if verbose:
        print('Input file is', input_filename)
        print('Output path is', output_path)
        print('Output files are', captions_filename, "and", VOs_filename, "and", VOs_filename_mac, "and", transcribt_filename)

    # Create new SRT file and write down the SRT lines
    _write_lines(os.path.join(output_path, captions_filename), srt_lines)
    if verbose:
        print("New captions file created.")

    # Create new transcribt file and write down the unmodified VO content
    _write_lines(os.path.join(output_path, transcribt_filename), VOs)
    if verbose:
        print("New transcribt text file created.")

    # Create new voiceover file and write down the VO lines
    _write_lines(os.path.join(output_path, VOs_filename), VO_lines)
    if verbose:
        print("New voiceover text file created.")

    # Create new voiceover file for the macOS reader
    mac_text_path = os.path.join(output_path, VOs_filename_mac)
    _write_lines(mac_text_path, VO_lines_mac)
    if verbose:
        print("New voiceover text file created for Mac reader.")

    # In mac should additionally call 'say' with Daniel and correct parameters
    if sys.platform == "darwin":
        # An argument list (no shell) keeps paths with spaces or shell
        # metacharacters intact
        say_command = [
            "say",
            "-f", mac_text_path,
            "-o", os.path.join(output_path, audio_filename),
            "-r", "44",
            "-v", "Daniel",
        ]
        try:
            status = subprocess.call(say_command)
        except OSError as error:
            print("Could not run 'say': {0}".format(error))
        else:
            if status == 0:
                print("New voiceover audio file created.")
            else:
                print("Creating the voiceover audio file failed ('say' exited with status {0}).".format(status))
    else:
        print("Find the {0} file and run in through a Text-to-Speech engine of your choice.".format(VOs_filename))

    print("Captions and VO extracted from {0}".format(input_filename))
320
321
if __name__ == "__main__":
    # Script entry point: hand over the command line and keep the output quiet
    main(argv=sys.argv, verbose=False)
323