85 lines
2.3 KiB
Python
85 lines
2.3 KiB
Python
|
|
def parse_vtt(file_path):
    """Read a WebVTT file and return one entry per timing line.

    Args:
        file_path: Path of the UTF-8 encoded caption file to read.

    Returns:
        A list of dicts, in file order, each with two keys:
        ``'time'`` is the stripped line containing ``-->`` and ``'text'``
        is the stripped line immediately following it (``''`` when the
        timing line is the last line). Only that single following line is
        captured; any further lines of the cue are ignored.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        raw = handle.read()

    rows = raw.strip().split('\n')

    # Drop a bare "WEBVTT" header line when it opens the file.
    if rows[0] == 'WEBVTT':
        del rows[0]

    # Pair every row with its successor; the final row is paired with ''.
    successors = rows[1:] + ['']
    return [
        {'time': row.strip(), 'text': following.strip()}
        for row, following in zip(rows, successors)
        if '-->' in row
    ]
|
|
|
|
def remove_duplicates(subtitles):
    """Collapse runs of consecutive captions that share the same text.

    For every run of adjacent captions with identical ``"text"``, only the
    last caption of the run is kept (so its ``"time"`` is the one that
    survives). Captions whose text is empty are dropped.

    Args:
        subtitles: Sequence of dicts with at least a ``"text"`` key.

    Returns:
        A new list containing the surviving caption dicts in their
        original order. An empty input yields an empty list.
    """
    no_dupes = []
    prev = None
    for caption in subtitles:
        # A change of text closes the previous run; keep its last caption.
        if prev is not None and prev["text"] != "" and caption["text"] != prev["text"]:
            no_dupes.append(prev)
        prev = caption

    # The final run has no successor to trigger the append above, so flush
    # it explicitly; otherwise the last caption would be silently lost.
    if prev is not None and prev["text"] != "":
        no_dupes.append(prev)
    return no_dupes
|
|
|
|
def resync(subtitles):
    """Shift caption texts one slot earlier relative to their timings.

    The first caption's text is emitted under a fixed placeholder timing
    (0 to 5 seconds). After that, each caption's ``"time"`` is paired with
    the ``"text"`` of the caption that follows it. The last caption's
    timing is therefore unused, while every text appears exactly once.

    Args:
        subtitles: Sequence of dicts with ``"time"`` and ``"text"`` keys.

    Returns:
        A new list of ``{"time", "text"}`` dicts with the same length as
        the input, or an empty list when the input is empty.
    """
    if not subtitles:
        return []

    # Placeholder timing for the first text. WebVTT timestamps require
    # two-digit seconds, hence "05" rather than "5".
    init = "00:00:00.000 --> 00:00:05.000 align:start position:0%"
    new_subs = [{"time": init, "text": subtitles[0]["text"]}]

    # Walk adjacent pairs: keep the current timing, take the next text.
    for curr, next_ in zip(subtitles, subtitles[1:]):
        new_subs.append({
            "time": curr["time"],
            "text": next_["text"],
        })
    return new_subs
|
|
|
|
def export_to_vtt(subtitles, output_file_path):
    """Write captions to a UTF-8 WebVTT file.

    The file starts with a ``WEBVTT`` header followed by a blank line.
    Each caption is then written as its ``'time'`` line, its ``'text'``
    line, and a blank separator line.

    Args:
        subtitles: Iterable of dicts with ``'time'`` and ``'text'`` keys.
        output_file_path: Destination path; an existing file is overwritten.
    """
    with open(output_file_path, 'w', encoding='utf-8') as out:
        out.write("WEBVTT\n\n")
        # One formatted cue block per caption, streamed straight to disk.
        out.writelines(
            f"{cue['time']}\n{cue['text']}\n\n" for cue in subtitles
        )
|
|
|
|
if __name__ == "__main__":
    import sys

    # The caption file to process is the first command-line argument.
    vtt_file_path = sys.argv[1]
    print(vtt_file_path)

    # Pipeline: parse the file, collapse repeated captions, shift texts
    # onto the preceding timings, then write the result to "output.vtt".
    captions = parse_vtt(vtt_file_path)
    unique_captions = remove_duplicates(captions)
    retimed_captions = resync(unique_captions)
    export_to_vtt(retimed_captions, "output.vtt")
|
|
|
|
|