# laborde-visio1/parse-subs.py


def parse_vtt(file_path):
    """Parse a WebVTT subtitle file into a list of caption dictionaries.

    Every line containing ``-->`` is treated as a cue timing line. The
    non-blank lines that follow it, up to the next blank line, the next
    timing line or the end of the file, form the cue text.

    Args:
        file_path: Path of the ``.vtt`` file to read (decoded as UTF-8; an
            optional leading byte-order mark is dropped).

    Returns:
        A list of dicts in file order, each with two keys:
        ``'time'`` -- the stripped timing line, and
        ``'text'`` -- the stripped text lines of the cue joined with
        ``'\\n'`` (an empty string when the cue has no text).
    """
    # 'utf-8-sig' reads plain UTF-8 unchanged but removes a leading BOM, which
    # would otherwise stick to the first line of the file.
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        lines = file.read().strip().split('\n')

    # The header line may carry extra text after the "WEBVTT" signature.
    if lines[0].startswith('WEBVTT'):
        lines = lines[1:]

    parsed_captions = []
    total = len(lines)

    for index, line in enumerate(lines):
        if '-->' not in line:
            continue

        # Gather every text line of the cue, not just the first one.
        text_lines = []
        cursor = index + 1
        while cursor < total:
            candidate = lines[cursor].strip()
            # A blank line ends the cue; a timing line starts the next one.
            if not candidate or '-->' in candidate:
                break
            text_lines.append(candidate)
            cursor += 1

        parsed_captions.append({
            'time': line.strip(),
            'text': '\n'.join(text_lines),
        })

    return parsed_captions

if __name__ == "__main__":
    # Command-line entry point: print the path given as the first argument,
    # then print every caption parsed from that file, one dict per line.
    import sys

    # Fail with a usage message instead of an IndexError traceback when the
    # path argument is missing.
    if len(sys.argv) < 2:
        sys.exit("Usage: {} <file.vtt>".format(sys.argv[0]))

    vtt_file_path = sys.argv[1]
    print(vtt_file_path)

    for caption in parse_vtt(vtt_file_path):
        print(caption)