# Reconstructed from a whitespace-collapsed git patch against parse-subs.py
# (commit a5383873458e7601ca1af074ab8e285722402cc5, Mon, 17 Feb 2025,
# subject: "Removed duplicates but the timing is off.").
# Only the code that patch adds is reproduced here; parse_vtt lives in the
# part of parse-subs.py that the patch does not show.


def _merge_cue_times(first_time, last_time):
    """Return a timing line running from first_time's start to last_time's end.

    Both arguments are expected to look like ``"start --> end [settings]"``,
    which is what export_to_vtt writes out as the cue timing line. Anything
    after the arrow in ``last_time`` (end timestamp plus any cue settings) is
    kept as-is. If either value is not a string containing ``-->``, the first
    caption's value is returned unchanged rather than guessing at its shape.
    """
    if not (isinstance(first_time, str) and isinstance(last_time, str)):
        return first_time
    start, first_arrow, _ = first_time.partition("-->")
    _, last_arrow, tail = last_time.partition("-->")
    if not (first_arrow and last_arrow):
        return first_time
    return f"{start.rstrip()} --> {tail.lstrip()}"


def remove_duplicates(subtitles):
    """Collapse runs of consecutive captions that share the same text.

    Args:
        subtitles: iterable of caption dicts; each must have a ``"text"`` key
            and normally a ``"time"`` key holding the cue timing line.

    Returns:
        A new list with one caption per run of identical consecutive text.
        Captions whose text is the empty string are dropped (and still
        separate the runs on either side of them). A run of several captions
        becomes a copy of its first caption whose ``"time"`` spans from the
        first caption's start to the last caption's end; a run of one is
        returned as the original dict. The input is never mutated.

    Compared with the previous implementation this no longer raises on an
    empty input, no longer loses the final caption, and no longer reports only
    the last duplicate's timing for a merged run.
    """
    # Local import: the top-of-file import block is not visible in the patch.
    from itertools import groupby

    no_dupes = []
    for text, run in groupby(subtitles, key=lambda caption: caption["text"]):
        if text == "":
            continue
        first, *rest = run
        if rest and "time" in first and "time" in rest[-1]:
            merged = dict(first)
            merged["time"] = _merge_cue_times(first["time"], rest[-1]["time"])
            no_dupes.append(merged)
        else:
            no_dupes.append(first)
    return no_dupes


def export_to_vtt(subtitles, output_file_path):
    """Write captions to ``output_file_path`` as a UTF-8 WebVTT file.

    Args:
        subtitles: iterable of dicts with ``"time"`` and ``"text"`` keys.
        output_file_path: destination path; an existing file is overwritten.

    The file starts with the ``WEBVTT`` header followed by a blank line, then
    one ``time`` line, the ``text``, and a blank line per caption.
    """
    with open(output_file_path, 'w', encoding='utf-8') as file:
        file.write("WEBVTT\n\n")
        file.writelines(
            f"{subtitle['time']}\n{subtitle['text']}\n\n"
            for subtitle in subtitles
        )


if __name__ == "__main__":
    import sys

    # NOTE(review): the patch's hunks skip one line of the original script
    # between `import sys` and the argv read below; restore it from
    # parse-subs.py when merging this back.
    vtt_file_path = sys.argv[1]
    print(vtt_file_path)

    # parse_vtt is defined earlier in parse-subs.py (outside the patch view).
    export_to_vtt(
        remove_duplicates(
            parse_vtt(vtt_file_path)
        ), "output.vtt"
    )