From dc1cb1ad27e388cbd6e01e2b7a0e462b842890cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sebastjan=20Arti=C4=8D?= Date: Thu, 30 Apr 2026 04:02:09 +0000 Subject: [PATCH] Fix SRT subtitle timing: use word-level timestamps for chunk boundaries MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bug: BRAJDE reel showed subtitles 2-3 seconds out of sync with audio. Soniox returned correct word timestamps: - 'Ajmo,' at 41.82s - 'Janezi!' at 42.18s - 'Pejd' greva, ajde,' at 43.44-44.40s But generate_srt_from_segments() ignored word timestamps and split long segments into equal-duration chunks of at most 2.5s (MAX_CHUNK_DURATION) based on segment duration: chunk_dur = duration / n_parts ← assumes even pacing for i in range(n_parts): cs = rel_start + i * chunk_dur This produces wrong timing because singers don't sing evenly. Real audio had 'Ajmo, Janezi!' in 0.9s and 'Pejd' greva, ajde, na traktorju od Majde' in 6s — the evenly spaced chunks didn't align with vocals. Fix: when word-level timestamps are available (Soniox/Scribe), group words into chunks where each chunk's start/end match the actual first/last word timestamps. Each multi-word chunk spans at most MAX_CHUNK_DURATION (2.5s) and always breaks on a word boundary; a single word longer than that becomes its own chunk. Before: 00:00.000 → 01.900 AJMO, JANEZI! PEJD' GREVA, AJDE, NA TRAKTORJU OD 00:01.900 → 03.800 MAJDE, NOBEN NAJU NE NAJDE, KO PELJEM TE After: 00:00.020 → 02.120 AJMO, JANEZI! PEJD' GREVA, 00:02.360 → 04.820 AJDE, NA TRAKTORJU OD MAJDE, NOBEN Subtitles now perfectly align with vocals.
--- app/main.py | 75 +++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 52 insertions(+), 23 deletions(-) diff --git a/app/main.py b/app/main.py index 89d88db..6acb057 100644 --- a/app/main.py +++ b/app/main.py @@ -293,10 +293,10 @@ def generate_srt_from_segments(segments, clip_start, clip_end, output_path): f"({seg_dur:.1f}s, {word_count} besed): {text[:50]!r}", flush=True) continue - # Če segment delno štrli iz clip range-a IN imamo word-level timestampe, - # uporabi samo tiste besede ki dejansko padejo v clip range - # (sicer subtitle vsebuje besedilo iz prejšnjega/naslednjega refrena/verza) - if words and (s_start < clip_start or s_end > clip_end): + # Pripravi words_in_clip vedno (če imamo word-level timestampe) + # Uporabili ga bomo tako za segment trim kot za chunk boundaries + words_in_clip = None + if words: words_in_clip = [] for w in words: w_start = float(w.get("start", 0)) @@ -304,14 +304,16 @@ def generate_srt_from_segments(segments, clip_start, clip_end, output_path): w_text = w.get("text", "").strip() if not w_text: continue - # Beseda padeva v clip če se prekriva (ne mora biti popolnoma znotraj) if w_end > clip_start and w_start < clip_end: words_in_clip.append({ "start": max(w_start, clip_start), "end": min(w_end, clip_end), "text": w_text, }) - + + # Če segment delno štrli iz clip range-a IN imamo word-level timestampe, + # uporabi samo tiste besede ki dejansko padejo v clip range + if words_in_clip and (s_start < clip_start or s_end > clip_end): if not words_in_clip: continue @@ -341,23 +343,50 @@ def generate_srt_from_segments(segments, clip_start, clip_end, output_path): lines.append(f"{idx}\n{fmt_ts(rel_start)} --> {fmt_ts(rel_end)}\n{text_upper}\n") idx += 1 else: - # Razdeli na N enakih kosov; če ima Whisper word-timing, jih lahko razdelimo bolje, - # ampak za zdaj enako razdelimo - n_parts = int(duration / MAX_CHUNK_DURATION) + 1 - words = text_upper.split() - words_per_part = max(1, len(words) // n_parts) - chunk_dur = 
duration / n_parts - for i in range(n_parts): - cs = rel_start + i * chunk_dur - ce = rel_start + (i + 1) * chunk_dur - # Vzemi pripadajoče besede - wstart = i * words_per_part - wend = (i + 1) * words_per_part if i < n_parts - 1 else len(words) - chunk_text = " ".join(words[wstart:wend]) if wstart < len(words) else text_upper - if not chunk_text.strip(): - chunk_text = text_upper - lines.append(f"{idx}\n{fmt_ts(cs)} --> {fmt_ts(ce)}\n{chunk_text.strip()}\n") - idx += 1 + # ── WORD-LEVEL CHUNKING ── + # Če imamo word-level timestampe (Soniox/Scribe), uporabi DEJANSKE čase besed + # za chunk boundaries (NE enake time chunks, ker pevec ne poje enakomerno). + + if words_in_clip and len(words_in_clip) >= 2: + # Group besede v chunke z max trajanjem MAX_CHUNK_DURATION + chunks = [] + current_chunk = [words_in_clip[0]] + for w in words_in_clip[1:]: + chunk_start_time = current_chunk[0]["start"] + chunk_dur_so_far = w["end"] - chunk_start_time + if chunk_dur_so_far > MAX_CHUNK_DURATION: + chunks.append(current_chunk) + current_chunk = [w] + else: + current_chunk.append(w) + if current_chunk: + chunks.append(current_chunk) + + # Generiraj SRT iz chunks (z dejanskimi word timestampi) + for chunk in chunks: + cs = chunk[0]["start"] - clip_start + ce = chunk[-1]["end"] - clip_start + chunk_text = " ".join(w["text"] for w in chunk).upper().strip() + if not chunk_text: + continue + lines.append(f"{idx}\n{fmt_ts(cs)} --> {fmt_ts(ce)}\n{chunk_text}\n") + idx += 1 + else: + # Fallback: brez word-level timestampov, razdeli enako + n_parts = int(duration / MAX_CHUNK_DURATION) + 1 + words_split = text_upper.split() + words_per_part = max(1, len(words_split) // n_parts) + chunk_dur = duration / n_parts + for i in range(n_parts): + cs = rel_start + i * chunk_dur + ce = rel_start + (i + 1) * chunk_dur + wstart = i * words_per_part + wend = (i + 1) * words_per_part if i < n_parts - 1 else len(words_split) + chunk_text = " ".join(words_split[wstart:wend]) if wstart < len(words_split) else 
text_upper + if not chunk_text.strip(): + chunk_text = text_upper + lines.append(f"{idx}\n{fmt_ts(cs)} --> {fmt_ts(ce)}\n{chunk_text.strip()}\n") + idx += 1 with open(output_path, "w", encoding="utf-8") as f: f.write("\n".join(lines))