Compare commits

...

9 Commits

Author SHA1 Message Date
d377325a34 Added transcript for 1h to 1h25 by Anonymous. 2025-06-22 10:10:10 -07:00
fbf4b17312 Added script to update timestamps. 2025-03-06 22:16:16 -08:00
d4a3358f3c Fixed missing line break. 2025-03-06 20:11:48 -08:00
9044a420f2 Corrections at the beginning.
Removed music/notes confused as words by auto-subber.
Corrected timestamps.
Added some punctuation.
2025-03-06 19:50:56 -08:00
ef3cf87070 Added 40 to 60 minute transcription by Anonymous. 2025-03-03 20:52:48 -08:00
4ac84e8682 Added 20 to 40 minute transcription by Anonymous 2025-03-02 16:12:40 -08:00
dde405e6b4 Clarified instructions for contributing.
Added new make recipe for testing subtitles.
2025-02-23 09:28:09 -08:00
640d29131a Added 10 min to 20 min transcription by Anonymous. 2025-02-23 09:12:27 -08:00
c985058afb Added Contributing section to README. 2025-02-22 17:52:44 -08:00
4 changed files with 1873 additions and 1775 deletions

View File

@ -1,14 +1,20 @@
# Helper recipes for fetching the source video and its auto-generated French
# subtitles, cleaning the subtitles, and muxing the hand-edited transcript
# into a test video.

# Base name of the auto-generated subtitle download; `cleansubs` reads
# $(ORIGINAL_SUBS).fr.vtt.
ORIGINAL_SUBS=autogen-subs

# Video file the recipes below read/write. Override on the command line to
# work on the intro-cut clip instead, e.g. `make VIDEO=1ere-introcut.mp4 test`.
VIDEO=1ere.mkv
# VIDEO=1ere-introcut.mp4

# Source video. Quoted wherever it is used so the shell never treats the `?`
# as a glob character.
URL=https://www.youtube.com/watch?v=WRq2197FlMw

# None of these targets name a file they produce, so declare them phony; a
# stray file called `test` or `clip` must not make the recipe look up to date.
.PHONY: test getvideo cleansubs getsubs clip

# Mux the hand-edited French subtitles into the video to check their format.
test: getvideo
	ffmpeg -i $(VIDEO) -i mangen-subs.fr.vtt -vcodec copy -acodec copy -c:s mov_text test.mp4 -y

# Download the source video to $(VIDEO).
getvideo:
	yt-dlp '$(URL)' -o $(VIDEO)

# Run the subtitle parser over the auto-generated French subtitles.
cleansubs:
	python scripts/parse-subs.py $(ORIGINAL_SUBS).fr.vtt
# ffmpeg -i 1ere.mkv -i output.vtt -vcodec copy -acodec copy -c:s mov_text test.mp4 -y

# Download only the auto-generated French subtitles (no video).
getsubs:
	yt-dlp --write-auto-subs --sub-lang=fr --skip-download '$(URL)' -o $(ORIGINAL_SUBS)

# omit the music
clip:
	ffmpeg -ss 3:18 -i 1ere-combined.mp4 -vcodec copy -acodec copy $(VIDEO)
# clip:
# ffmpeg -ss 3:18 -i 1ere-combined.mp4 -vcodec copy -acodec copy 1ere-introcut.mp4

View File

@ -5,9 +5,23 @@ This repository tracks the transcription and translation for the video conferenc
The original video and the auto-generated captions in French can be sourced from [https://www.youtube.com/watch?v=WRq2197FlMw](https://www.youtube.com/watch?v=WRq2197FlMw).
## Setup
## Contributing
Run the following to download auto-generated captions from YouTube to a new file named `autogen-subs.fr.vtt`.
The automatically generated French transcript needs to be checked over by a human. The French transcript then needs to be translated to English, also with human oversight.
* Verify and edit the file `mangen-subs.fr.vtt` which contains the current draft of the human-edited auto-generated transcript.
* Run `make test` to add subtitles to the video and ensure the format is correct.
### Contributors
* tuxmain, [https://txmn.tk/](https://txmn.tk/)
* 1 anonymous contributor(s)
### Getting the original auto-generated transcript
*To see how much a southern accent and speech quirks can throw off an AI.*
Run the following to download the auto-generated transcript from YouTube to a new file named `autogen-subs.fr.vtt`.
yt-dlp --write-auto-subs --sub-lang=fr --skip-download https://www.youtube.com/watch?v=WRq2197FlMw -o autogen-subs

File diff suppressed because it is too large Load Diff

50
scripts/update-subs.py Normal file
View File

@ -0,0 +1,50 @@
import re
import sys
import argparse
def offset_timestamp(timestamp, offset):
    """Shift a WebVTT timestamp by ``offset`` seconds.

    Args:
        timestamp: A timestamp such as ``"01:02:03.456"``. The hours field is
            optional (``"02:03.456"``), as WebVTT allows.
        offset: Number of seconds to add; may be negative or fractional.

    Returns:
        The shifted timestamp formatted as ``HH:MM:SS.mmm``. A result that
        would fall before the start of the video is clamped to
        ``00:00:00.000``.

    Raises:
        ValueError: If ``timestamp`` does not split into 3 or 4 integer fields.
    """
    fields = re.split('[:.]', timestamp.strip())
    if len(fields) == 3:
        # mm:ss.ttt form -- hours were omitted.
        fields.insert(0, '0')
    if len(fields) != 4:
        raise ValueError(f"not a WebVTT timestamp: {timestamp!r}")
    hours, minutes, seconds, millis = (int(field) for field in fields)

    # Work in whole milliseconds so float rounding can never print a seconds
    # field of "60.000".
    total_ms = (hours * 3600 + minutes * 60 + seconds) * 1000 + millis
    total_ms = max(0, total_ms + round(offset * 1000))

    new_hours, remainder = divmod(total_ms, 3600 * 1000)
    new_minutes, remainder = divmod(remainder, 60 * 1000)
    new_seconds, new_millis = divmod(remainder, 1000)
    # Every field is zero-padded: WebVTT requires two-digit minutes/seconds
    # and three-digit milliseconds.
    return f"{new_hours:02}:{new_minutes:02}:{new_seconds:02}.{new_millis:03}"
def update_webvtt(file_path, offset):
    """Shift every cue timing in a WebVTT file by ``offset`` seconds.

    The result is written next to the input as ``updated_<name>``; the input
    file itself is not modified.

    Args:
        file_path: Path of the WebVTT file to read.
        offset: Seconds to add to each cue's start and end time (negative
            moves cues earlier).

    Raises:
        ValueError: If a line containing ``-->`` has no end timestamp, or a
            timestamp cannot be parsed by ``offset_timestamp``.
    """
    import os  # local import: only needed to build the output path

    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    updated_lines = []
    for line_number, line in enumerate(lines, start=1):
        if '-->' not in line:
            updated_lines.append(line)
            continue
        start, _, rest = line.partition('-->')
        rest_fields = rest.split()
        if not rest_fields:
            raise ValueError(
                f"{file_path}:{line_number}: cue timing has no end timestamp")
        # Only the first token after the arrow is the end time. Anything after
        # it (cue settings such as "align:start position:0%") is dropped, as
        # this script has always done -- but without relying on the settings
        # starting with "align" or on the line ending in a newline.
        updated_start = offset_timestamp(start.strip(), offset)
        updated_end = offset_timestamp(rest_fields[0], offset)
        updated_lines.append(f"{updated_start} --> {updated_end}\n")

    # Prefix the file *name*, not the whole path, so an input like
    # "subs/foo.vtt" is written to "subs/updated_foo.vtt".
    directory, name = os.path.split(file_path)
    output_path = os.path.join(directory, 'updated_' + name)
    with open(output_path, 'w', encoding='utf-8') as file:
        file.writelines(updated_lines)
def parse_time_offset(text):
    """Convert a command-line offset into seconds.

    Accepted forms:
        ``MM:SS`` or ``MM:SS.fff`` -- minutes and (fractional) seconds.
        ``MM.SS``                  -- legacy form, minutes and seconds.
        ``SS``                     -- bare seconds.

    Raises:
        ValueError: If a field is not a number.
    """
    if ':' in text:
        minutes, seconds = text.split(':', 1)
    elif '.' in text:
        # Legacy spelling kept for backward compatibility: "1.30" means
        # 1 minute 30 seconds, not 1.3 seconds.
        minutes, seconds = text.split('.', 1)
    else:
        minutes, seconds = '0', text
    return float(minutes) * 60 + float(seconds)


if __name__ == "__main__":
    # Usage: update-subs.py INPUT_FILE TIME_OFFSET
    # Moves every cue in INPUT_FILE earlier by TIME_OFFSET and writes the
    # result to "updated_<INPUT_FILE>".
    parser = argparse.ArgumentParser()
    parser.add_argument('input_file', type=str, help='The input file name')
    parser.add_argument(
        'time_offset', type=str,
        help='Amount to move the timestamps earlier by: MM:SS or seconds')
    args = parser.parse_args()
    vtt = args.input_file
    try:
        offset = parse_time_offset(args.time_offset)
    except ValueError:
        parser.error("invalid time_offset: %r" % args.time_offset)
    print("bumping timestamps in file %s by %f seconds" % (vtt, offset))
    # The offset is subtracted: an argument of 0:40 moves cues 40 seconds back.
    update_webvtt(vtt, -offset)