# laborde-visio1/parse-subs.py


def parse_vtt(file_path):
    """Read a WebVTT file and return its captions as a list of dicts.

    Every line containing ``-->`` is treated as a timing line.  For each
    one, a dict is produced with:

    * ``'time'`` -- the timing line, stripped of surrounding whitespace;
    * ``'text'`` -- the line immediately after it, stripped, or ``''``
      when the timing line is the last line of the file.

    Only that single following line is captured; any further lines of the
    same caption are not included in ``'text'``.

    Args:
        file_path: Path of the UTF-8 encoded ``.vtt`` file to read.

    Returns:
        A list of ``{'time': str, 'text': str}`` dicts in file order.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        raw_lines = handle.read().strip().split('\n')

    # Drop the leading line when it is exactly the WebVTT header.
    if raw_lines[0] == 'WEBVTT':
        raw_lines = raw_lines[1:]

    # Pair each line with its successor; the trailing '' stands in for the
    # missing successor of the final line.
    successors = raw_lines[1:] + ['']
    return [
        {'time': current.strip(), 'text': following.strip()}
        for current, following in zip(raw_lines, successors)
        if '-->' in current
    ]

def remove_duplicates(subtitles):
    """Collapse runs of consecutive captions that share the same text.

    For each run of adjacent captions with identical ``'text'``, only the
    last caption of the run is kept (so its ``'time'`` is the one that
    survives).  Captions whose text is the empty string are dropped.

    Args:
        subtitles: Iterable of dicts, each with at least a ``'text'`` key.

    Returns:
        A new list with the retained caption dicts in their original
        order.  An empty input yields an empty list.
    """
    no_dupes = []
    prev = None
    for caption in subtitles:
        # Emit the previous caption once we know the text has changed,
        # i.e. `prev` was the last member of its run.
        if (
            prev is not None
            and prev["text"] != ""
            and caption["text"] != prev["text"]
        ):
            no_dupes.append(prev)
        prev = caption

    # The final caption has no successor to trigger the check above, so it
    # must be flushed explicitly or it would be silently lost.
    if prev is not None and prev["text"] != "":
        no_dupes.append(prev)
    return no_dupes

def export_to_vtt(subtitles, output_file_path):
    """Write captions to a UTF-8 WebVTT file.

    The file starts with the ``WEBVTT`` header followed by a blank line.
    Each caption is then written as its ``'time'`` line, its ``'text'``
    line, and a blank separator line.

    Args:
        subtitles: Iterable of dicts with ``'time'`` and ``'text'`` keys.
        output_file_path: Destination path; an existing file is
            overwritten.
    """
    # Lazily render one "timing / text / blank line" chunk per caption.
    cue_chunks = (
        f"{cue['time']}\n{cue['text']}\n\n"
        for cue in subtitles
    )
    with open(output_file_path, 'w', encoding='utf-8') as out:
        out.write("WEBVTT\n\n")
        out.writelines(cue_chunks)

if __name__ == "__main__":
    import sys

    # Usage: parse-subs.py INPUT.vtt [OUTPUT.vtt]
    # Exit with a readable message instead of an IndexError traceback when
    # the input path is missing.
    if len(sys.argv) < 2:
        sys.exit(f"usage: {sys.argv[0]} INPUT.vtt [OUTPUT.vtt]")

    vtt_file_path = sys.argv[1]
    # The output path is optional and falls back to the historical default.
    output_file_path = sys.argv[2] if len(sys.argv) > 2 else "output.vtt"
    print(vtt_file_path)

    # Pipeline: parse the input, drop consecutive duplicates, write result.
    export_to_vtt(
        remove_duplicates(
            parse_vtt(vtt_file_path)
        ), output_file_path
    )