"""Clean up a WebVTT caption file.

The pipeline parses the cues, drops consecutive repeats of the same text,
shifts every text onto the timing of the caption before it, and writes the
result back out as WebVTT.
"""

import sys

# Timing line given to the first cue produced by resync(): that cue has no
# preceding caption to borrow a timing from. WebVTT timestamps require
# two-digit seconds, hence "05" rather than "5".
_FIRST_CUE_TIMING = "00:00:00.000 --> 00:00:05.000 align:start position:0%"

# Output path used by the command-line entry point when none is given.
_DEFAULT_OUTPUT_PATH = "output.vtt"


def parse_vtt(file_path):
    """Read a WebVTT file and return its cues as a list of dicts.

    Every line containing ``-->`` is treated as a cue timing line. The line
    immediately after it (or ``''`` when the timing line is the last line of
    the file) is taken as the cue text; any further payload lines of the same
    cue are ignored.

    Args:
        file_path: Path of the UTF-8 encoded ``.vtt`` file to read.

    Returns:
        A list of ``{'time': <timing line>, 'text': <first payload line>}``
        dicts, both values stripped of surrounding whitespace, in file order.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.read().strip().split('\n')

    # No explicit header handling is needed: the "WEBVTT" line contains no
    # "-->", so it can never be selected as a timing line below.
    parsed_captions = []
    for index, line in enumerate(lines):
        if '-->' not in line:
            continue
        following = lines[index + 1] if index + 1 < len(lines) else ''
        parsed_captions.append({
            'time': line.strip(),
            'text': following.strip(),
        })
    return parsed_captions


def remove_duplicates(subtitles):
    """Collapse runs of consecutive captions that share the same text.

    For each run of adjacent captions with identical ``"text"``, only the
    last caption of the run is kept. Captions whose text is the empty string
    are dropped entirely.

    Args:
        subtitles: Sequence of caption dicts with ``"time"`` and ``"text"``
            keys. May be empty.

    Returns:
        A new list holding the surviving caption dicts, in original order.
    """
    no_dupes = []
    last_index = len(subtitles) - 1
    for index, caption in enumerate(subtitles):
        text = caption["text"]
        if text == "":
            continue
        # A caption closes its run when it is the final caption overall or
        # when the caption after it carries different text. Treating the
        # final caption as a run end keeps the last subtitle from being lost.
        if index == last_index or subtitles[index + 1]["text"] != text:
            no_dupes.append(caption)
    return no_dupes


def resync(subtitles):
    """Shift each caption's text onto the timing of the caption before it.

    The first text receives a placeholder timing starting at zero; every
    later text is paired with the timing line of its predecessor. The timing
    of the final input caption is therefore not used.

    Args:
        subtitles: Sequence of caption dicts with ``"time"`` and ``"text"``
            keys. May be empty.

    Returns:
        A new list of ``{"time": ..., "text": ...}`` dicts with the same
        length as ``subtitles`` (an empty list for empty input).
    """
    if not subtitles:
        return []

    timings = [_FIRST_CUE_TIMING]
    timings.extend(caption["time"] for caption in subtitles[:-1])
    return [
        {"time": timing, "text": caption["text"]}
        for timing, caption in zip(timings, subtitles)
    ]


def export_to_vtt(subtitles, output_file_path):
    """Write captions to ``output_file_path`` in WebVTT format.

    The file starts with the ``WEBVTT`` header and a blank line, followed by
    one block per caption: the timing line, the text line, and a blank line.

    Args:
        subtitles: Iterable of caption dicts with ``'time'`` and ``'text'``
            keys.
        output_file_path: Destination path; the file is created or
            overwritten and encoded as UTF-8.
    """
    with open(output_file_path, 'w', encoding='utf-8') as file:
        file.write("WEBVTT\n\n")
        file.writelines(
            f"{subtitle['time']}\n{subtitle['text']}\n\n"
            for subtitle in subtitles
        )


def main(argv=None):
    """Command-line entry point: clean one VTT file.

    Args:
        argv: Argument list without the program name, as
            ``[input_path]`` or ``[input_path, output_path]``. Defaults to
            ``sys.argv[1:]``. When the output path is omitted the result is
            written to ``output.vtt``.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if not args:
        # Exit with a readable message instead of an IndexError traceback.
        sys.exit(f"usage: {sys.argv[0]} INPUT.vtt [OUTPUT.vtt]")

    vtt_file_path = args[0]
    output_file_path = args[1] if len(args) > 1 else _DEFAULT_OUTPUT_PATH

    print(vtt_file_path)
    captions = parse_vtt(vtt_file_path)
    export_to_vtt(resync(remove_duplicates(captions)), output_file_path)


if __name__ == "__main__":
    main()