laborde-visio1/parse-subs.py

64 lines
1.6 KiB
Python

def parse_vtt(file_path):
    """Read a WebVTT file and return its cues as a list of dicts.

    Every line containing ``-->`` is treated as a cue timing line. The line
    immediately following it is taken as the cue text; any further lines of
    the same cue are not captured.

    Args:
        file_path: Path of the UTF-8 encoded ``.vtt`` file to read.

    Returns:
        A list of ``{'time': ..., 'text': ...}`` dicts in file order, both
        values stripped of surrounding whitespace. ``text`` is an empty
        string when the timing line is the last line of the file.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        raw = handle.read()

    rows = raw.strip().split('\n')
    # Drop the bare WebVTT header line when present.
    if rows[0] == 'WEBVTT':
        rows = rows[1:]

    cues = []
    total = len(rows)
    for idx, row in enumerate(rows):
        if '-->' not in row:
            continue
        # The text is whatever sits on the next line, if there is one.
        following = rows[idx + 1].strip() if idx + 1 < total else ''
        cues.append({'time': row.strip(), 'text': following})
    return cues
def remove_duplicates(subtitles):
    """Collapse runs of consecutive captions that share the same text.

    For each run of adjacent captions with identical ``text``, only the last
    caption of the run is kept (so its ``time`` value is the one retained).
    Captions whose text is an empty string are dropped.

    Args:
        subtitles: Iterable of dicts that each have a ``"text"`` key.

    Returns:
        A new list with the surviving captions in their original order.
        An empty input yields an empty list.
    """
    no_dupes = []
    prev = None  # last caption seen; None until the first one is read
    for caption in subtitles:
        # A change of text closes the previous run: emit its last caption.
        if prev is not None and prev["text"] != "" and caption["text"] != prev["text"]:
            no_dupes.append(prev)
        prev = caption
    # The final run is never followed by a differing caption, so flush it
    # explicitly; otherwise the last caption would be lost.
    if prev is not None and prev["text"] != "":
        no_dupes.append(prev)
    return no_dupes
def export_to_vtt(subtitles, output_file_path):
    """Write captions to a UTF-8 WebVTT file.

    The file starts with the ``WEBVTT`` header followed by a blank line,
    then one block per caption: the timing line, the text line, and a blank
    separator line.

    Args:
        subtitles: Iterable of dicts with ``'time'`` and ``'text'`` keys.
        output_file_path: Destination path; an existing file is overwritten.
    """
    with open(output_file_path, 'w', encoding='utf-8') as out:
        out.write("WEBVTT\n\n")
        # Lazily format each cue so blocks are emitted in input order.
        out.writelines(
            f"{cue['time']}\n{cue['text']}\n\n" for cue in subtitles
        )
if __name__ == "__main__":
    import sys

    # Exit with a usage hint rather than a bare IndexError traceback when
    # the input path argument is missing.
    if len(sys.argv) < 2:
        sys.exit(f"usage: {sys.argv[0]} INPUT.vtt")

    vtt_file_path = sys.argv[1]
    print(vtt_file_path)
    # Parse the input, collapse consecutive repeated captions, and write the
    # result to "output.vtt" in the current working directory.
    export_to_vtt(
        remove_duplicates(
            parse_vtt(vtt_file_path)
        ), "output.vtt"
    )