From 7fca9ce178062d57e359dd60f09c513db2b9f9da Mon Sep 17 00:00:00 2001
From: Vivek Haldar
Date: Sun, 20 Nov 2022 16:36:30 -0800
Subject: [PATCH]

---
 whisper_edit.py | 219 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 219 insertions(+)
 create mode 100644 whisper_edit.py

diff --git a/whisper_edit.py b/whisper_edit.py
new file mode 100644
index 0000000..f16f0e5
--- /dev/null
+++ b/whisper_edit.py
@@ -0,0 +1,219 @@
+#!/usr/bin/env python3
+# Requires Python 3.10+ (uses structural pattern matching).
+import whisper
+import sys
+import string
+import math
+from stable_whisper import modify_model
+from stable_whisper import stabilize_timestamps
+from stable_whisper import results_to_word_srt
+from moviepy.editor import AudioClip, VideoFileClip, concatenate_videoclips
+
+MODEL = 'base'
+
+# Store the NATO phonetic alphabet in a dictionary.
+phonetic_alphabet = {
+    'A': 'Alpha',
+    'B': 'Bravo',
+    'C': 'Charlie',
+    'D': 'Delta',
+    'E': 'Echo',
+    'F': 'Foxtrot',
+    'G': 'Golf',
+    'H': 'Hotel',
+    'I': 'India',
+    'J': 'Juliett',
+    'K': 'Kilo',
+    'L': 'Lima',
+    'M': 'Mike',
+    'N': 'November',
+    'O': 'Oscar',
+    'P': 'Papa',
+    'Q': 'Quebec',
+    'R': 'Romeo',
+    'S': 'Sierra',
+    'T': 'Tango',
+    'U': 'Uniform',
+    'V': 'Victor',
+    'W': 'Whiskey',
+    'X': 'Xray',
+    'Y': 'Yankee',
+    'Z': 'Zulu',
+}
+
+# Spoken editing commands: each is the prefix word followed by a command word.
+class Commands:
+    COMMAND_PREFIX = 'victor'
+    keep_segment = [COMMAND_PREFIX, 'kilo']
+    drop_segment = [COMMAND_PREFIX, 'delta']
+    chapter_break = [COMMAND_PREFIX, 'charlie']
+    COMMANDS = {
+        'keep_segment': keep_segment,
+        'drop_segment': drop_segment,
+        'chapter_break': chapter_break,
+    }
+
+input_file = 'input.mp4'
+output_file = 'output.mp4'
+
+def speech_to_text():
+    model = whisper.load_model(MODEL)
+    modify_model(model)
+    print('Speech to text...')
+    results = model.transcribe(input_file)
+    print('... done!\n')
+    return results
+
+def word_level_timestamps(parsed_speech_to_text):
+    print('Getting word-level timestamps...')
+    word_segments = stabilize_timestamps(parsed_speech_to_text, top_focus=True)
+    #print(word_segments)
+    print('... done!\n')
+    return word_segments
+
+def string_canonical(s):
+    # Remove punctuation.
+    no_punc = s.translate(str.maketrans('', '', string.punctuation))
+    return no_punc.strip().lower()
+
+# Return timestamp of the (i + 1)th word. Used to get the ending timestamp
+# of a command.
+def timestamp_of_next_word(seg, i):
+    word_timestamps = seg['whole_word_timestamps']
+    num_words = len(word_timestamps)
+    if (i + 1) >= num_words:
+        # No more words in this segment.
+        return seg['end']
+    else:
+        return word_timestamps[i + 1]['timestamp']
+
+def find_command_in_segment(seg):
+    word_timestamps = seg['whole_word_timestamps']
+    num_words = len(word_timestamps)
+    i = 0
+    while i < num_words:
+        word = string_canonical(word_timestamps[i]['word'])
+        # print(f'{word} ')
+        if word == Commands.COMMAND_PREFIX:
+            #print('!')
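+            # Found the spoken command prefix ("victor"): remember where it
+            # starts, then check the next word to see which command this is.
+            # (chapter_break / "victor charlie" is defined above but not yet
+            # handled here.)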
+            time_command_start = word_timestamps[i]['timestamp']
+            i += 1
+            if i >= num_words:
+                # The prefix was the last word in the segment; no command word follows.
+                break
+            second_word_of_command = string_canonical(word_timestamps[i]['word'])
+            match second_word_of_command:
+                case 'kilo':
+                    time_command_end = timestamp_of_next_word(seg, i)
+                    return Commands.keep_segment, time_command_start, time_command_end
+                case 'delta':
+                    time_command_end = timestamp_of_next_word(seg, i)
+                    return Commands.drop_segment, time_command_start, time_command_end
+                case _:
+                    # Not a known command word; it may itself start a command,
+                    # so re-examine it from the top of the loop.
+                    continue
+        i += 1
+
+def find_commands(timestamps):
+    commands = []
+    for segment in timestamps:
+        seg_command = find_command_in_segment(segment)
+        if seg_command:
+            commands.append(seg_command)
+    return commands
+
+# Hack. When dropping a segment I see a split-second of the end of
+# the dropped segment. So add a small fuzz factor for that.
+DROP_SEGMENT_DELTA = 0.2
+
+# Returns a list of (start, end) intervals to keep.
+def intervals_to_keep(commands):
+    keep_intervals = []
+    keep_start, keep_end = 0, 0
+    for command_timestamp in commands:
+        cmd, begin_ts, end_ts = command_timestamp
+        match cmd:
+            case Commands.keep_segment:
+                # Keep until the start of the command.
+                keep_end = begin_ts
+                keep_intervals.append([keep_start, keep_end])
+                # Next interval (possibly) starts at the end of the command.
+                keep_start = end_ts
+            case Commands.drop_segment:
+                # Next interval (possibly) starts at the end of the command.
+                keep_start = end_ts + DROP_SEGMENT_DELTA
+            case _:
+                print(f'Eeek! Unrecognized command: {cmd}')
+    return keep_intervals
+
+# Iterate over audio to find the non-silent parts. Outputs a list of
+# (speaking_start, speaking_end) intervals.
+# Args:
+#   window_size: (in seconds) hunt for silence in windows of this size
+#   volume_threshold: volume below this threshold is considered to be silence
+#   ease_in: (in seconds) add this much padding around speaking intervals
+def find_speaking_intervals(audio_clip, window_size=0.1, volume_threshold=0.05, ease_in=0.1, audio_fps=44100):
+    # First, iterate over the audio to find all silent windows.
+    num_windows = math.floor(audio_clip.end / window_size)
+    window_is_silent = []
+    for i in range(num_windows):
+        s = audio_clip.subclip(i * window_size, (i + 1) * window_size).set_fps(audio_fps)
+        v = s.max_volume()
+        window_is_silent.append(v < volume_threshold)
+
+    # Find speaking intervals.
+    speaking_start = 0
+    speaking_end = 0
+    speaking_intervals = []
+    for i in range(1, len(window_is_silent)):
+        e1 = window_is_silent[i - 1]
+        e2 = window_is_silent[i]
+        # silence -> speaking
+        if e1 and not e2:
+            speaking_start = i * window_size
+        # speaking -> silence, now have a speaking interval
+        if not e1 and e2:
+            speaking_end = i * window_size
+            new_speaking_interval = [max(0, speaking_start - ease_in), speaking_end + ease_in]
+            # With tiny windows, this can sometimes overlap the previous interval, so merge.
+            need_to_merge = len(speaking_intervals) > 0 and speaking_intervals[-1][1] > new_speaking_interval[0]
+            if need_to_merge:
+                merged_interval = [speaking_intervals[-1][0], new_speaking_interval[1]]
+                speaking_intervals[-1] = merged_interval
+            else:
+                speaking_intervals.append(new_speaking_interval)
+    # Note: this assumes the recording ends in silence; an interval that is
+    # still "speaking" when the audio ends is not emitted.
+    return speaking_intervals
+
+def find_speaking(input_clip, input_audio_fps):
+    print("\n\n\n----- Now cutting out dead air... -----")
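+    # The dead-air pass: scan the audio in small fixed windows, flag windows
+    # whose peak volume falls below the threshold as silence, and keep only
+    # the speaking intervals, spliced back together.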
+    speaking_intervals = find_speaking_intervals(input_clip.audio, audio_fps=input_audio_fps)
+    print("Keeping speaking intervals: " + str(speaking_intervals))
+    speaking_clips = [input_clip.subclip(start, end) for [start, end] in speaking_intervals]
+    final_video = concatenate_videoclips(speaking_clips)
+    return final_video
+
+def main():
+    sts = speech_to_text()
+    word_ts = word_level_timestamps(sts)
+    commands = find_commands(word_ts)
+    print(commands)
+    keep_intervals = intervals_to_keep(commands)
+    print(keep_intervals)
+
+    vid = VideoFileClip(input_file)
+
+    # Edit with speech-to-text commands.
+    keep_clips = [vid.subclip(start, end) for [start, end] in keep_intervals]
+    edited_vid = concatenate_videoclips(keep_clips)
+
+    # Cut out dead air.
+    no_dead_air_video = find_speaking(edited_vid, vid.audio.fps)
+
+    print("\n\n\n----- Writing out edited video... -----")
+    no_dead_air_video.write_videofile(output_file,
+                                      #fps=60,
+                                      preset='ultrafast',
+                                      codec='libx264',
+                                      #codec='h264_videotoolbox',
+                                      temp_audiofile='temp-audio.m4a',
+                                      remove_temp=True,
+                                      audio_codec="aac",
+                                      #threads=6,
+                                      ffmpeg_params=['-threads', '8'],
+                                      )
+    vid.close()
+
+
+if __name__ == '__main__':
+    main()
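+
+# Usage sketch (assumes an `input.mp4` recording in the working directory,
+# and the `openai-whisper`, `stable-ts`, and `moviepy` packages installed):
+#
+#   python3 whisper_edit.py
+#
+# While recording, say "victor kilo" to keep everything since the previous
+# command, or "victor delta" to discard it; the spoken commands themselves,
+# plus any dead air, are cut from the final output.mp4.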