laborde-visio1/parse-subs.py

85 lines
2.3 KiB
Python

def parse_vtt(file_path):
    """Read a WebVTT file and return its cues as a list of dicts.

    Every line containing ``-->`` is treated as a timing line. For each one
    a dict is produced with:

    * ``'time'`` -- the timing line, stripped of surrounding whitespace;
    * ``'text'`` -- the line immediately following it, stripped, or ``''``
      when the timing line is the last line of the file.

    Only that single following line is captured; further lines of a
    multi-line cue are ignored.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        rows = handle.read().strip().split('\n')

    # Drop a bare "WEBVTT" header line when it is the first line.
    if rows[0] == 'WEBVTT':
        rows = rows[1:]

    # Pair each line with its successor; the final line is paired with ''.
    successors = rows[1:] + ['']
    return [
        {'time': row.strip(), 'text': following.strip()}
        for row, following in zip(rows, successors)
        if '-->' in row
    ]
def remove_duplicates(subtitles):
    """Collapse runs of consecutive captions that share the same text.

    For each run of adjacent captions with identical ``"text"``, only the
    last caption of the run is kept. Captions whose text is the empty
    string are dropped entirely.

    Args:
        subtitles: iterable of dicts, each with at least a ``"text"`` key.

    Returns:
        A new list of the kept caption dicts, in their original order.
        An empty input yields an empty list.
    """
    no_dupes = []
    prev = None
    for caption in subtitles:
        # A change of text closes the previous run; emit its last caption
        # unless that run was made of empty-text captions.
        if (
            prev is not None
            and prev["text"] != ""
            and caption["text"] != prev["text"]
        ):
            no_dupes.append(prev)
        prev = caption
    # Flush the final run: nothing follows it, so the loop above never
    # emits it on its own.
    if prev is not None and prev["text"] != "":
        no_dupes.append(prev)
    return no_dupes
def resync(subtitles):
    """Shift every caption text one timing slot earlier.

    The first output cue carries the first caption's text under a fixed
    placeholder timing (0 s to 5 s). Every later output cue pairs the
    timing of caption ``i`` with the text of caption ``i + 1``, so the
    timing of the last caption is not used.

    Args:
        subtitles: sequence of dicts with ``"time"`` and ``"text"`` keys.

    Returns:
        A new list of ``{"time", "text"}`` dicts with the same length as
        ``subtitles``. An empty input yields an empty list.
    """
    if not subtitles:
        return []

    # Placeholder timing for the first text. WebVTT timestamps need
    # two-digit seconds, hence "00:00:05.000" rather than "00:00:5.000".
    init = "00:00:00.000 --> 00:00:05.000 align:start position:0%"
    new_subs = [{"time": init, "text": subtitles[0]["text"]}]

    # Walk adjacent pairs: keep the current timing, take the next text.
    for curr, next_ in zip(subtitles, subtitles[1:]):
        new_subs.append({"time": curr["time"], "text": next_["text"]})
    return new_subs
def export_to_vtt(subtitles, output_file_path):
    """Write *subtitles* to *output_file_path* as a WebVTT document.

    The file starts with a ``WEBVTT`` header followed by a blank line.
    Each subtitle (a dict with ``'time'`` and ``'text'`` keys) is then
    written as its timing line, its text line, and a blank separator line.
    The file is opened for writing as UTF-8; nothing is returned.
    """
    # Built lazily, so cues are formatted only once the file is open.
    cue_blocks = (f"{cue['time']}\n{cue['text']}\n\n" for cue in subtitles)
    with open(output_file_path, 'w', encoding='utf-8') as out:
        out.write("WEBVTT\n\n")
        out.writelines(cue_blocks)
if __name__ == "__main__":
    import sys

    # The input path is required; fail with a usage message instead of an
    # IndexError traceback when it is missing.
    if len(sys.argv) < 2:
        sys.exit(f"usage: {sys.argv[0]} INPUT.vtt")

    vtt_file_path = sys.argv[1]
    print(vtt_file_path)
    # Pipeline: parse cues, drop repeated captions, shift the texts one
    # timing slot earlier, then write the result to "output.vtt".
    export_to_vtt(
        resync(remove_duplicates(parse_vtt(vtt_file_path))),
        "output.vtt",
    )