# laborde-visio1/parse-subs.py


def parse_vtt(file_path):
    """Read a WebVTT file and return its cues as a list of dicts.

    Every line containing ``-->`` is treated as a cue timing line. The
    resulting dict stores that line (stripped) under ``'time'`` and the
    line immediately following it (stripped) under ``'text'``. Only that
    single following line is captured; a timing line that is the last
    line of the file gets an empty ``'text'``.

    Args:
        file_path: Path of the UTF-8 encoded ``.vtt`` file to read.

    Returns:
        A list of ``{'time': str, 'text': str}`` dicts in file order.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        raw_lines = handle.read().strip().split('\n')

    # Drop the header line when it is exactly the WebVTT signature.
    if raw_lines[0] == 'WEBVTT':
        raw_lines = raw_lines[1:]

    total = len(raw_lines)
    captions = []
    for index, line in enumerate(raw_lines):
        if '-->' not in line:
            continue  # not a timing line
        following = index + 1
        caption_text = raw_lines[following].strip() if following < total else ''
        captions.append({'time': line.strip(), 'text': caption_text})

    return captions

def remove_duplicates(subtitles):
    """Collapse runs of consecutive captions that share the same text.

    For each run of adjacent captions with identical ``"text"``, only the
    last caption of the run is kept (so the kept entry carries that last
    caption's ``"time"``). Runs whose text is the empty string are dropped
    entirely.

    Args:
        subtitles: Sequence of dicts with at least a ``"text"`` key.

    Returns:
        A new list holding the surviving caption dicts in their original
        order. An empty input yields an empty list.
    """
    if not subtitles:
        return []

    no_dupes = []
    prev = subtitles[0]
    for caption in subtitles[1:]:
        # A change of text closes the current run; keep its last caption.
        if prev["text"] != "" and caption["text"] != prev["text"]:
            no_dupes.append(prev)
        prev = caption

    # The final run is never followed by a differing caption, so it has to
    # be flushed explicitly or the last caption would be lost.
    if prev["text"] != "":
        no_dupes.append(prev)
    return no_dupes

def resync(subtitles):
    """Shift every caption's text one cue earlier.

    The returned list starts with a placeholder cue that shows the first
    caption's text from 0 s to 5 s. After that, the timing of cue ``i`` is
    paired with the text of cue ``i + 1``. As a consequence the timing of
    the last input cue is not used.

    Args:
        subtitles: Sequence of dicts with ``"time"`` and ``"text"`` keys.

    Returns:
        A new list of ``{"time": str, "text": str}`` dicts with the same
        length as the input. An empty input yields an empty list.
    """
    if not subtitles:
        return []

    # Placeholder timing for the first text. WebVTT timestamps require
    # two-digit seconds, hence "05" rather than "5".
    init = "00:00:00.000 --> 00:00:05.000 align:start position:0%"
    new_subs = [{"time": init, "text": subtitles[0]["text"]}]

    # Walk adjacent pairs: keep the current cue's timing, take the next
    # cue's text.
    for curr, next_ in zip(subtitles, subtitles[1:]):
        new_subs.append({"time": curr["time"], "text": next_["text"]})
    return new_subs

def export_to_vtt(subtitles, output_file_path):
    """Write captions to ``output_file_path`` in WebVTT layout.

    The file starts with the ``WEBVTT`` header followed by a blank line.
    Each caption is then written as its ``'time'`` line, its ``'text'``
    line, and a blank separator line.

    Args:
        subtitles: Iterable of dicts with ``'time'`` and ``'text'`` keys.
        output_file_path: Destination path; the file is created or
            overwritten and encoded as UTF-8.
    """
    with open(output_file_path, 'w', encoding='utf-8') as out:
        out.write("WEBVTT\n\n")
        # Stream one formatted cue block per caption.
        out.writelines(
            f"{cue['time']}\n{cue['text']}\n\n" for cue in subtitles
        )

if __name__ == "__main__":
    import sys

    # Fail with a usage message instead of an IndexError traceback when the
    # input path is missing.
    if len(sys.argv) < 2:
        sys.exit(f"usage: {sys.argv[0]} INPUT.vtt")

    vtt_file_path = sys.argv[1]
    print(vtt_file_path)

    # Pipeline: parse the input, collapse repeated captions, shift the text
    # one cue earlier, then write the result to "output.vtt" in the current
    # working directory.
    captions = parse_vtt(vtt_file_path)
    export_to_vtt(resync(remove_duplicates(captions)), "output.vtt")