Path: blob/master/_slides/SRTCode/mdslideTOtranscribt.py
696 views
# Heavily modified from the script created by Aaron Cheung under MIT License
# Original repo: https://github.com/AaronCheung430/txt-TO-srt
"""Generate captions, a transcript and voiceover text from a tagged slides.md file.

Lines between a ``:::info (speech)`` line and the closing ``:::`` line are
collected as caption / voiceover content; lines starting with ``# `` are
additionally collected as slide titles for the voiceover.  The results are
written to a ``<input name>_files`` folder next to the input file.
"""

import sys, os # for folders and os info and stuff
import argparse # for command line arguments
import re # for regex
import subprocess # for calling the macOS 'say' command without a shell

from timecode import Timecode

# Exact lines (including the newline) that open and close a speech block.
_SPEECH_BLOCK_START = ":::info (speech)\n"
_SPEECH_BLOCK_END = ":::\n"

# Ordered (old, new) replacements for the macOS 'say' voiceover text.
# The order is significant: replacements are applied one after another.
_MAC_REPLACEMENTS = (
    (" {.title}", ""),
    (" 1/2", ""),
    (" 2/2", ""),
    ("Puhti", "Puyh.ti"),
    ("puhti", "puyh.ti"),
    ("Mahti", "Mah.ti"),
    ("mahti", "mah.ti"),
    ("Pouta", "Pawta"),
    ("pouta", "pawta"),
    ("Kaivos", "Kiavos"),
    ("kaivos", "kiavos"),
    ("rtools", "RTools"),
    ("rsync", "RSync"),
    ("rclone", "RClone"),
    ("bashrc", "bashRC"),
    ("NVMe", "NVME"),
    ("SBATCH", "SBatch"),
    ("sbatch", "SBatch"),
    ("squeue", "SQueue"),
    ("scontrol", "SControl"),
    ("scancel", "SCancel"),
    (".sh", "dot SH"),
    ("-u", "dash U"),
    ("$USER", "dollar user"),
    ("JOBID", "jobID"),
    ("job ID", "jobID"),
    ("YOURCSCUSERNAME", "your CSC username"),
    ("MyCSC", "my CSC"),
)

# Ordered (old, new) replacements for the generic voiceover text, applied
# BEFORE the regex substitutions below.
# Not applied on purpose (kept from the original script as disabled options):
#   "Mahti" -> "Mah:ti" and "Kaivos" -> "Kai:voss" (for speechelo non-PRO voices),
#   ".sh" -> "dot SH" (covered by the generic ".x" -> " dot x" regex).
_GENERIC_REPLACEMENTS_BEFORE_REGEX = (
    (" {.title}", ""),
    (" 1/2", ""),
    (" 2/2", ""),
    ("Puhti", "Pouhti"),
    ("puhti", "Pouhti"),
    ("Pouta", "Pauwta"),
    ("pouta", "Pauwta"),
    ("mahti", "Mahti"),
    ("kaivos", "Kaivos"),
    ("rtools", "r-tools"),
    ("rclone", "r-clone"),
    ("rsync", "r-sync"),
    ("bashrc", "bash RC"),
    ("NVMe", "NVME"),
    ("SBATCH", "S-BATCH"),
    ("sbatch", "s-batch"),
    ("squeue", "S-Queue"),
    ("scontrol", "S-Control"),
    ("scancel", "S-Cancel"),
    ("-u", "dash U"),
)

# Ordered (old, new) replacements for the generic voiceover text, applied
# AFTER the regex substitutions.  "MyCSC" is intentionally left untouched here.
_GENERIC_REPLACEMENTS_AFTER_REGEX = (
    ("$USER", "dollar user"),
    ("JOBID", "job ID"),
    ("YOURCSCUSERNAME", "your CSC username"),
)

# ".x" -> " dot x": a dot immediately followed by a word character.
_DOT_BEFORE_WORD = re.compile(r"\.\b")
# " --x" -> " dash dash x".  A single " -x" is deliberately not replaced
# because it breaks text such as "CSC-accounts and -projects".
_DOUBLE_DASH_BEFORE_WORD = re.compile(r" --\b")


def read_file(input_file, verbose=False):
    """Read the input file and return its lines.

    Args:
        input_file: Path of the slides markdown file.
        verbose: When true, print the first six lines that were read.

    Returns:
        List of lines, each keeping its trailing newline (as from
        ``readlines``).  The file is decoded as UTF-8.
    """
    # 'with' guarantees the handle is closed even if reading fails
    with open(input_file, "r", encoding="utf-8") as md_file:
        md_lines = md_file.readlines()

    # Optionally print first few lines
    if verbose:
        print("Here's what the input file says: \n",md_lines[:6])

    return md_lines


def collect_caps_VO(md_lines, verbose=False):
    """Collect caption and voiceover content from the lines of slides.md.

    Lines strictly between ``:::info (speech)`` and ``:::`` go to both lists.
    Every line starting with ``# `` is treated as a slide title: an empty
    line followed by the title text (without the ``# `` prefix, terminated by
    ``.``) is added to the voiceover content only.

    Args:
        md_lines: Lines of the markdown file, each with its trailing newline.
        verbose: When true, print what is being collected.

    Returns:
        Tuple ``(caps, VO)`` of two lists of lines.
    """
    # Lists of collected caption and voiceover content
    caps = []
    VO = []

    # By default no content is taken
    copy_captions = False
    copy_speech = False

    for line in md_lines:
        # Captions: switch collecting on/off at the block markers, copy in between
        if not copy_captions:
            if line == _SPEECH_BLOCK_START:
                if verbose:
                    print("Recording captions")
                copy_captions = True
        elif line == _SPEECH_BLOCK_END:
            copy_captions = False
        else:
            caps.append(line)
            if verbose:
                print("To caps:",line)

        # Collect slide titles to VO
        if line.startswith('# '):
            if verbose:
                print("Title found:", line)
            # Drop the '#', strip the newline (if any) and end the title with a dot.
            # rstrip keeps the last character of a title on a final line that
            # has no trailing newline.
            line = "{}.\n".format(line[1:].rstrip("\n"))
            VO.append("\n")
            # Skip the space that followed the '#'
            VO.append(line[1:])
            # NOTE: 'line' now holds the rewritten title, so a title located
            # inside a speech block is also copied below in that form.

        # Voiceover: same on/off logic as for the captions
        if not copy_speech:
            if line == _SPEECH_BLOCK_START:
                if verbose:
                    print("Recording speech txt")
                copy_speech = True
        elif line == _SPEECH_BLOCK_END:
            copy_speech = False
        else:
            VO.append(line)
            if verbose:
                print("To speech:",line)

    return caps, VO


def create_SRT(caps, verbose=False):
    """Turn caption lines into the lines of an SRT file.

    Each caption line becomes one numbered SRT entry.  An entry ends one
    5-second timecode step after its start and the next entry starts one
    6-second step after the previous start.

    Args:
        caps: Caption lines (each expected to end with a newline).
        verbose: Currently unused; kept for a uniform signature.

    Returns:
        List of lines of the SRT file.
    """
    # Lines of SRT-file as a list
    srt = []

    # Timecodes: running start time, caption duration and step between captions.
    # The first argument is presumably the frame rate -- confirm against the
    # timecode package documentation.
    tc = Timecode('30', '00:00:00.000')
    tc_step1 = Timecode('30', '00:00:05.000')
    tc_step2 = Timecode('30', '00:00:06.000')
    tc.set_fractional(True)
    tc_step1.set_fractional(True)
    tc_step2.set_fractional(True)

    # NOTE(review): SRT conventionally separates milliseconds with ','; the
    # separator written here is whatever Timecode renders -- confirm players accept it.
    for number, text in enumerate(caps, start=1):
        # Caption ending timecode; the sum is a new object so the flag is set again
        tc2 = tc + tc_step1
        tc2.set_fractional(True)

        # An SRT entry: running number, start --> end, the text and an empty line
        srt.append("{}\n".format(number))
        srt.append("{} --> {}\n".format(tc,tc2))
        srt.append(text)
        srt.append("\n")

        # Advance timecode for next iteration
        tc = tc + tc_step2
        tc.set_fractional(True)

    return srt


def _apply_replacements(line, replacements):
    """Apply the (old, new) string replacements to line in the given order."""
    for old, new in replacements:
        line = line.replace(old, new)
    return line


def create_VOs(VO, verbose=False):
    """Generate voiceover lines from the collected voiceover content.

    Two variants are produced because macOS includes its own Text-to-Speech
    engine and therefore gets its own pronunciation replacements.

    Args:
        VO: Voiceover lines as returned by ``collect_caps_VO``.
        verbose: Currently unused; kept for a uniform signature.

    Returns:
        Tuple ``(speech, speech_mac)``: the generic voiceover lines and the
        macOS lines.  In the macOS list every line is preceded by the
        ``[[slnc 1000]]`` marker (intended as a one second pause) and
        followed by an extra newline.
    """
    # Lines of VO-file as list
    speech = []
    speech_mac = []

    # Go through voiceover lines for macOS
    for line in VO:
        # Pause marker first, then the text with nicer pronunciation, then a new line
        speech_mac.append("[[slnc 1000]]")
        speech_mac.append(_apply_replacements(line, _MAC_REPLACEMENTS))
        speech_mac.append("\n")

    # Go through voiceover lines for the generic Text-to-Speech engine
    for line in VO:
        line = _apply_replacements(line, _GENERIC_REPLACEMENTS_BEFORE_REGEX)
        line = _DOT_BEFORE_WORD.sub(' dot ', line) # Replace .x with dot x
        line = _DOUBLE_DASH_BEFORE_WORD.sub(' dash dash ', line) # Replace --x with dash dash x
        line = _apply_replacements(line, _GENERIC_REPLACEMENTS_AFTER_REGEX)
        speech.append(line)

    return speech, speech_mac


def _write_lines(path, lines):
    """Write lines to a new UTF-8 text file at path, closing it reliably."""
    with open(path, "w", encoding="utf-8") as out_file:
        out_file.writelines(lines)


def main(argv, verbose=True):
    """Command line entry point: create caption, transcript and voiceover files.

    Args:
        argv: Full argument vector including the program name (like
            ``sys.argv``).  When empty or None, ``sys.argv`` is used.
        verbose: When true, print progress information.

    Exits with status 1 when the input file does not exist.
    """
    # Parser for arguments
    parseri = argparse.ArgumentParser(description='Generate voiceover and captions from slide.md file that includes tags.')
    parseri.add_argument('mdslide_path', metavar='path_to_input_md', type=str, help='Path to the slide.md')
    # An optional 'newfolder' switch was never finished: the script always
    # creates a new target folder.
    # Honour the argv given by the caller instead of silently reading sys.argv
    args = parseri.parse_args(argv[1:] if argv else None)

    # Get the path for slides.md
    input_file = args.mdslide_path
    input_filename = os.path.basename(input_file)
    input_path = os.path.dirname(input_file)
    # If path is not given in args, current folder is assumed
    if not input_path or input_path=='.':
        input_path = os.getcwd()

    # Stop here (with a failure status) if input file does not exist
    if not os.path.exists(os.path.join(input_path,input_filename)):
        print("The input file does not exist. Check the input argument!!!")
        sys.exit(1)

    # Original name without extension is the base of all output names
    base_name = os.path.splitext(input_filename)[0]

    # Output folder is created inside the input folder
    output_folder = "{0}_files".format(base_name)
    output_path = os.path.join(input_path, output_folder)
    os.makedirs(output_path, exist_ok=True)

    # Read input file slides.md and output as lines
    md_lines = read_file(input_file, verbose=verbose)
    # Go through lines and collect captions and VO content
    captions, VOs = collect_caps_VO(md_lines, verbose=False)
    # Go through caption content and generate SRT lines
    srt_lines = create_SRT(captions, verbose=verbose)
    # Go through voiceover content and generate VO lines
    VO_lines, VO_lines_mac = create_VOs(VOs, verbose=verbose)

    # Parts of slides.md that goes to captions go to this file
    captions_filename = "{0}_captions.srt".format(base_name)
    # Parts of slides.md that goes to voiceover go to these files
    VOs_filename = "{0}_voiceover.txt".format(base_name)
    VOs_filename_mac = "{0}_voiceover_mac.txt".format(base_name)
    audio_filename = "{0}_voiceover.aiff".format(base_name) # Used only in MacOS
    # Parts of slides.md that goes to transcribt go to this file
    transcribt_filename = "{0}_transcribt.txt".format(base_name)

    if verbose:
        print('Input file is', input_filename)
        print('Output path is', output_path)
        print('Output files are', captions_filename, "and", VOs_filename, "and", VOs_filename_mac, "and", transcribt_filename)

    # Create new SRT file and write down the SRT lines
    _write_lines(os.path.join(output_path, captions_filename), srt_lines)
    if verbose:
        print("New captions file created.")

    # Create new transcribt file and write down the unmodified VO content
    _write_lines(os.path.join(output_path, transcribt_filename), VOs)
    if verbose:
        print("New transcribt text file created.")

    # Create new voiceover file and write down the VO lines
    _write_lines(os.path.join(output_path, VOs_filename), VO_lines)
    if verbose:
        print("New voiceover text file created.")

    # Create new voiceover file for macOS and write down the VO lines
    VOs_path_mac = os.path.join(output_path, VOs_filename_mac)
    _write_lines(VOs_path_mac, VO_lines_mac)
    if verbose:
        print("New voiceover text file created for Mac reader.")

    # In mac should additionally call 'say' with Daniel and correct parameters
    if sys.platform == "darwin":
        # Argument list instead of a shell string: paths with spaces or shell
        # metacharacters are passed to 'say' unchanged.
        say_command = ["say", "-f", VOs_path_mac,
                       "-o", os.path.join(output_path, audio_filename),
                       "-r", "44", "-v", "Daniel"]
        try:
            audio_created = subprocess.run(say_command).returncode == 0
        except OSError:
            audio_created = False
        if audio_created:
            print("New voiceover audio file created.")
        else:
            print("Creating the voiceover audio file with 'say' failed.")
    else:
        print("Find the {0} file and run in through a Text-to-Speech engine of your choice.".format(VOs_filename))

    print("Captions and VO extracted from {0}".format(input_filename))


if __name__ == "__main__":
    main(sys.argv, verbose=False)