Compare commits
9 Commits
readme-edi
...
master
Author | SHA1 | Date | |
---|---|---|---|
d377325a34 | |||
fbf4b17312 | |||
d4a3358f3c | |||
9044a420f2 | |||
ef3cf87070 | |||
4ac84e8682 | |||
dde405e6b4 | |||
640d29131a | |||
c985058afb |
14
Makefile
14
Makefile
@ -1,14 +1,20 @@
|
|||||||
|
|
||||||
ORIGINAL_SUBS=autogen-subs
|
ORIGINAL_SUBS=autogen-subs
|
||||||
VIDEO=1ere-introcut.mp4
|
VIDEO=1ere.mkv
|
||||||
|
# VIDEO=1ere-introcut.mp4
|
||||||
|
|
||||||
|
test: getvideo
|
||||||
|
ffmpeg -i $(VIDEO) -i mangen-subs.fr.vtt -vcodec copy -acodec copy -c:s mov_text test.mp4 -y
|
||||||
|
|
||||||
|
getvideo:
|
||||||
|
yt-dlp https://www.youtube.com/watch?v=WRq2197FlMw -o $(VIDEO)
|
||||||
|
|
||||||
cleansubs:
|
cleansubs:
|
||||||
python scripts/parse-subs.py $(ORIGINAL_SUBS).fr.vtt
|
python scripts/parse-subs.py $(ORIGINAL_SUBS).fr.vtt
|
||||||
# ffmpeg -i 1ere.mkv -i output.vtt -vcodec copy -acodec copy -c:s mov_text test.mp4 -y
|
|
||||||
|
|
||||||
getsubs:
|
getsubs:
|
||||||
yt-dlp --write-auto-subs --sub-lang=fr --skip-download https://www.youtube.com/watch?v=WRq2197FlMw -o autogen-subs
|
yt-dlp --write-auto-subs --sub-lang=fr --skip-download https://www.youtube.com/watch?v=WRq2197FlMw -o autogen-subs
|
||||||
|
|
||||||
# omit the music
|
# omit the music
|
||||||
clip:
|
# clip:
|
||||||
ffmpeg -ss 3:18 -i 1ere-combined.mp4 -vcodec copy -acodec copy $(VIDEO)
|
# ffmpeg -ss 3:18 -i 1ere-combined.mp4 -vcodec copy -acodec copy 1ere-introcut.mp4
|
||||||
|
18
README.md
18
README.md
@ -5,9 +5,23 @@ This repository tracks the transcription and translation for the video conferenc
|
|||||||
|
|
||||||
The original video and the auto-generated captions in French can be sourced from [https://www.youtube.com/watch?v=WRq2197FlMw](https://www.youtube.com/watch?v=WRq2197FlMw).
|
The original video and the auto-generated captions in French can be sourced from [https://www.youtube.com/watch?v=WRq2197FlMw](https://www.youtube.com/watch?v=WRq2197FlMw).
|
||||||
|
|
||||||
## Setup
|
## Contributing
|
||||||
|
|
||||||
Run the following to download auto-generated captions from YouTube to a new file named `autogen-subs.fr.vtt`.
|
The automatically generated French transcript needs to be checked over by a human. The French transcript then needs to be translated to English, also with human oversight.
|
||||||
|
|
||||||
|
* Verify and edit the file `mangen-subs.fr.vtt`, which contains the current draft of the human-corrected version of the auto-generated transcript.
|
||||||
|
* Run `make test` to add subtitles to the video and ensure the format is correct.
|
||||||
|
|
||||||
|
### Contributors
|
||||||
|
|
||||||
|
* tuxmain, [https://txmn.tk/](https://txmn.tk/)
|
||||||
|
* 1 anonymous contributor(s)
|
||||||
|
|
||||||
|
### Getting the original auto-generated transcript
|
||||||
|
|
||||||
|
*To see how much a southern accent and speech quirks can throw off an AI.*
|
||||||
|
|
||||||
|
Run the following to download the auto-generated transcript from YouTube to a new file named `autogen-subs.fr.vtt`.
|
||||||
|
|
||||||
yt-dlp --write-auto-subs --sub-lang=fr --skip-download https://www.youtube.com/watch?v=WRq2197FlMw -o autogen-subs
|
yt-dlp --write-auto-subs --sub-lang=fr --skip-download https://www.youtube.com/watch?v=WRq2197FlMw -o autogen-subs
|
||||||
|
|
||||||
|
3566
mangen-subs.fr.vtt
3566
mangen-subs.fr.vtt
File diff suppressed because it is too large
Load Diff
50
scripts/update-subs.py
Normal file
50
scripts/update-subs.py
Normal file
@ -0,0 +1,50 @@
|
|||||||
|
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
import argparse
|
||||||
|
|
||||||
|
def offset_timestamp(timestamp, offset):
    """Shift a WebVTT timestamp by ``offset`` seconds.

    Args:
        timestamp: A timestamp of the form ``hh:mm:ss.ttt``; the hours
            field may be omitted (``mm:ss.ttt``). The fractional field is
            interpreted as a count of milliseconds.
        offset: Number of seconds to add; may be negative or fractional.

    Returns:
        The shifted timestamp formatted as ``hh:mm:ss.ttt`` with every
        field zero-padded. A result that would fall before zero is clamped
        to ``00:00:00.000``.

    Raises:
        ValueError: If ``timestamp`` does not have three or four numeric
            fields.
    """
    fields = re.split(r'[:.]', timestamp.strip())
    if len(fields) == 3:
        # No hours field was given; treat it as zero hours.
        fields.insert(0, '0')
    if len(fields) != 4:
        raise ValueError(f"invalid WebVTT timestamp: {timestamp!r}")
    hours, minutes, seconds, millis = map(int, fields)

    # Work in whole milliseconds so float rounding can never produce a
    # seconds field of "60.000" or lose precision over many cues.
    total_ms = (hours * 3600 + minutes * 60 + seconds) * 1000 + millis
    total_ms = max(0, total_ms + round(offset * 1000))

    whole_seconds, out_millis = divmod(total_ms, 1000)
    whole_minutes, out_seconds = divmod(whole_seconds, 60)
    out_hours, out_minutes = divmod(whole_minutes, 60)
    return f"{out_hours:02d}:{out_minutes:02d}:{out_seconds:02d}.{out_millis:03d}"
|
||||||
|
|
||||||
|
def update_webvtt(file_path, offset, keep_settings=False):
    """Write a copy of a WebVTT file with every cue timing shifted.

    Each line of the form ``START --> END [settings]`` has both timestamps
    passed through ``offset_timestamp``; every other line is copied
    unchanged. The result is written next to the input file, with
    ``updated_`` prepended to the file name.

    Args:
        file_path: Path of the WebVTT file to read (UTF-8).
        offset: Number of seconds to add to each timestamp (negative values
            move cues earlier).
        keep_settings: When true, cue settings that follow the end
            timestamp (for example ``align:start position:0%``) are kept.
            The default drops them, so the timing line contains only the
            two timestamps.

    Returns:
        The path of the file that was written.
    """
    import os.path

    # Match on the timing syntax itself instead of slicing at the word
    # "align": str.find() returns -1 when the word is absent, which made the
    # old slice silently drop the last character of the line.
    cue_timing = re.compile(
        r'^\s*((?:\d+:)?\d+:\d+\.\d+)\s+-->\s+((?:\d+:)?\d+:\d+\.\d+)(.*)$'
    )

    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    updated_lines = []
    for line in lines:
        match = cue_timing.match(line)
        if match is None:
            updated_lines.append(line)
            continue
        start, end, settings = match.groups()
        updated_start = offset_timestamp(start, offset)
        updated_end = offset_timestamp(end, offset)
        tail = settings.rstrip() if keep_settings else ''
        updated_lines.append(f"{updated_start} --> {updated_end}{tail}\n")

    # Prefix the file name only, so inputs given with a directory component
    # (e.g. "subs/a.vtt") are written to "subs/updated_a.vtt".
    directory, name = os.path.split(file_path)
    output_path = os.path.join(directory, 'updated_' + name)
    with open(output_path, 'w', encoding='utf-8') as file:
        file.writelines(updated_lines)
    return output_path
|
||||||
|
|
||||||
|
def parse_time_offset(text):
    """Convert the command-line offset argument into seconds.

    Accepted forms:
        * ``SECONDS`` - a single number, e.g. ``40``.
        * ``MINUTES:SECONDS`` - e.g. ``3:18``. A ``.`` is accepted as the
          separator as well, so ``3.18`` also means 3 minutes 18 seconds.

    Args:
        text: The raw argument string.

    Returns:
        The offset in seconds as a float.

    Raises:
        ValueError: If ``text`` is not in one of the accepted forms.
    """
    fields = re.split(r'[:.]', text.strip())
    try:
        numbers = [float(field) for field in fields]
    except ValueError:
        raise ValueError(f"invalid time offset: {text!r}") from None
    if len(numbers) == 1:
        return numbers[0]
    if len(numbers) == 2:
        minutes, seconds = numbers
        return minutes * 60 + seconds
    raise ValueError(
        f"invalid time offset: {text!r} (expected SECONDS or MINUTES:SECONDS)"
    )


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description='Shift every cue in a WebVTT file earlier by a fixed amount.'
    )
    parser.add_argument('input_file', type=str, help='The input file name')
    parser.add_argument(
        'time_offset',
        type=str,
        help='How far to move the timestamps back: SECONDS or MINUTES:SECONDS',
    )
    args = parser.parse_args()

    try:
        offset = parse_time_offset(args.time_offset)
    except ValueError as err:
        # Report malformed input as a usage error instead of a traceback.
        parser.error(str(err))

    print("bumping timestamps in file %s by %f seconds" % (args.input_file, offset))
    # The offset is negated: the script always moves cues earlier in time.
    update_webvtt(args.input_file, -offset)
|
||||||
|
|
Loading…
x
Reference in New Issue
Block a user