Fix SRT subtitle timing: use word-level timestamps for chunk boundaries
Bug: BRAJDE reel showed subtitles 2-3 seconds out of sync with audio.
Soniox returned correct word timestamps:
- 'Ajmo,' at 41.82s
- 'Janezi!' at 42.18s
- 'Pejd' greva, ajde,' at 43.44-44.40s
But generate_srt_from_segments() ignored the word timestamps and split long
segments into equal-length chunks (each at most 2.5s) derived only from the
segment duration:
    chunk_dur = duration / n_parts   ← assumes even pacing
    for i in range(n_parts):
        cs = rel_start + i * chunk_dur
This produces wrong timing because singers don't sing at an even pace. In the
real audio, 'Ajmo, Janezi!' took 0.9s and 'Pejd' greva, ajde, na traktorju od
Majde' took 6s — so the evenly spaced chunks didn't align with the vocals.
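Fix: when word-level timestamps are available (Soniox/Scribe), group
words into chunks whose start/end match the actual first/last word
timestamps. A new chunk starts whenever adding the next word would stretch
the current chunk past MAX_CHUNK_DURATION (2.5s), so chunks break only at
word boundaries; a single word longer than the limit still forms its own
chunk. Segments without usable word-level timestamps (fewer than two words
in the clip) fall back to the previous even split.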
Before:
00:00.000 → 01.900 AJMO, JANEZI! PEJD' GREVA, AJDE, NA TRAKTORJU OD
00:01.900 → 03.800 MAJDE, NOBEN NAJU NE NAJDE, KO PELJEM TE
After:
00:00.020 → 02.120 AJMO, JANEZI! PEJD' GREVA,
00:02.360 → 04.820 AJDE, NA TRAKTORJU OD MAJDE, NOBEN
Subtitles now follow the actual word timings instead of an assumed even pace.
This commit is contained in:
parent
865e21fe1a
commit
dc1cb1ad27
73
app/main.py
73
app/main.py
@ -293,10 +293,10 @@ def generate_srt_from_segments(segments, clip_start, clip_end, output_path):
|
|||||||
f"({seg_dur:.1f}s, {word_count} besed): {text[:50]!r}", flush=True)
|
f"({seg_dur:.1f}s, {word_count} besed): {text[:50]!r}", flush=True)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Če segment delno štrli iz clip range-a IN imamo word-level timestampe,
|
# Pripravi words_in_clip vedno (če imamo word-level timestampe)
|
||||||
# uporabi samo tiste besede ki dejansko padejo v clip range
|
# Uporabili ga bomo tako za segment trim kot za chunk boundaries
|
||||||
# (sicer subtitle vsebuje besedilo iz prejšnjega/naslednjega refrena/verza)
|
words_in_clip = None
|
||||||
if words and (s_start < clip_start or s_end > clip_end):
|
if words:
|
||||||
words_in_clip = []
|
words_in_clip = []
|
||||||
for w in words:
|
for w in words:
|
||||||
w_start = float(w.get("start", 0))
|
w_start = float(w.get("start", 0))
|
||||||
@ -304,7 +304,6 @@ def generate_srt_from_segments(segments, clip_start, clip_end, output_path):
|
|||||||
w_text = w.get("text", "").strip()
|
w_text = w.get("text", "").strip()
|
||||||
if not w_text:
|
if not w_text:
|
||||||
continue
|
continue
|
||||||
# Beseda padeva v clip če se prekriva (ne mora biti popolnoma znotraj)
|
|
||||||
if w_end > clip_start and w_start < clip_end:
|
if w_end > clip_start and w_start < clip_end:
|
||||||
words_in_clip.append({
|
words_in_clip.append({
|
||||||
"start": max(w_start, clip_start),
|
"start": max(w_start, clip_start),
|
||||||
@ -312,6 +311,9 @@ def generate_srt_from_segments(segments, clip_start, clip_end, output_path):
|
|||||||
"text": w_text,
|
"text": w_text,
|
||||||
})
|
})
|
||||||
|
|
||||||
|
# Če segment delno štrli iz clip range-a IN imamo word-level timestampe,
|
||||||
|
# uporabi samo tiste besede ki dejansko padejo v clip range
|
||||||
|
if words_in_clip and (s_start < clip_start or s_end > clip_end):
|
||||||
if not words_in_clip:
|
if not words_in_clip:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
@ -341,23 +343,50 @@ def generate_srt_from_segments(segments, clip_start, clip_end, output_path):
|
|||||||
lines.append(f"{idx}\n{fmt_ts(rel_start)} --> {fmt_ts(rel_end)}\n{text_upper}\n")
|
lines.append(f"{idx}\n{fmt_ts(rel_start)} --> {fmt_ts(rel_end)}\n{text_upper}\n")
|
||||||
idx += 1
|
idx += 1
|
||||||
else:
|
else:
|
||||||
# Razdeli na N enakih kosov; če ima Whisper word-timing, jih lahko razdelimo bolje,
|
# ── WORD-LEVEL CHUNKING ──
|
||||||
# ampak za zdaj enako razdelimo
|
# Če imamo word-level timestampe (Soniox/Scribe), uporabi DEJANSKE čase besed
|
||||||
n_parts = int(duration / MAX_CHUNK_DURATION) + 1
|
# za chunk boundaries (NE enake time chunks, ker pevec ne poje enakomerno).
|
||||||
words = text_upper.split()
|
|
||||||
words_per_part = max(1, len(words) // n_parts)
|
if words_in_clip and len(words_in_clip) >= 2:
|
||||||
chunk_dur = duration / n_parts
|
# Group besede v chunke z max trajanjem MAX_CHUNK_DURATION
|
||||||
for i in range(n_parts):
|
chunks = []
|
||||||
cs = rel_start + i * chunk_dur
|
current_chunk = [words_in_clip[0]]
|
||||||
ce = rel_start + (i + 1) * chunk_dur
|
for w in words_in_clip[1:]:
|
||||||
# Vzemi pripadajoče besede
|
chunk_start_time = current_chunk[0]["start"]
|
||||||
wstart = i * words_per_part
|
chunk_dur_so_far = w["end"] - chunk_start_time
|
||||||
wend = (i + 1) * words_per_part if i < n_parts - 1 else len(words)
|
if chunk_dur_so_far > MAX_CHUNK_DURATION:
|
||||||
chunk_text = " ".join(words[wstart:wend]) if wstart < len(words) else text_upper
|
chunks.append(current_chunk)
|
||||||
if not chunk_text.strip():
|
current_chunk = [w]
|
||||||
chunk_text = text_upper
|
else:
|
||||||
lines.append(f"{idx}\n{fmt_ts(cs)} --> {fmt_ts(ce)}\n{chunk_text.strip()}\n")
|
current_chunk.append(w)
|
||||||
idx += 1
|
if current_chunk:
|
||||||
|
chunks.append(current_chunk)
|
||||||
|
|
||||||
|
# Generiraj SRT iz chunks (z dejanskimi word timestampi)
|
||||||
|
for chunk in chunks:
|
||||||
|
cs = chunk[0]["start"] - clip_start
|
||||||
|
ce = chunk[-1]["end"] - clip_start
|
||||||
|
chunk_text = " ".join(w["text"] for w in chunk).upper().strip()
|
||||||
|
if not chunk_text:
|
||||||
|
continue
|
||||||
|
lines.append(f"{idx}\n{fmt_ts(cs)} --> {fmt_ts(ce)}\n{chunk_text}\n")
|
||||||
|
idx += 1
|
||||||
|
else:
|
||||||
|
# Fallback: brez word-level timestampov, razdeli enako
|
||||||
|
n_parts = int(duration / MAX_CHUNK_DURATION) + 1
|
||||||
|
words_split = text_upper.split()
|
||||||
|
words_per_part = max(1, len(words_split) // n_parts)
|
||||||
|
chunk_dur = duration / n_parts
|
||||||
|
for i in range(n_parts):
|
||||||
|
cs = rel_start + i * chunk_dur
|
||||||
|
ce = rel_start + (i + 1) * chunk_dur
|
||||||
|
wstart = i * words_per_part
|
||||||
|
wend = (i + 1) * words_per_part if i < n_parts - 1 else len(words_split)
|
||||||
|
chunk_text = " ".join(words_split[wstart:wend]) if wstart < len(words_split) else text_upper
|
||||||
|
if not chunk_text.strip():
|
||||||
|
chunk_text = text_upper
|
||||||
|
lines.append(f"{idx}\n{fmt_ts(cs)} --> {fmt_ts(ce)}\n{chunk_text.strip()}\n")
|
||||||
|
idx += 1
|
||||||
|
|
||||||
with open(output_path, "w", encoding="utf-8") as f:
|
with open(output_path, "w", encoding="utf-8") as f:
|
||||||
f.write("\n".join(lines))
|
f.write("\n".join(lines))
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user