From dc1cb1ad27e388cbd6e01e2b7a0e462b842890cc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sebastjan=20Arti=C4=8D?= <sebastjan@folx.tv>
Date: Thu, 30 Apr 2026 04:02:09 +0000
Subject: [PATCH] Fix SRT subtitle timing: use word-level timestamps for chunk
 boundaries
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Bug: BRAJDE reel showed subtitles 2-3 seconds out of sync with audio.

Soniox returned correct word timestamps:
- 'Ajmo,' at 41.82s
- 'Janezi!' at 42.18s
- 'Pejd' greva, ajde,' at 43.44-44.40s

But generate_srt_from_segments() ignored word timestamps and split long
segments into equal-length chunks of at most 2.5s, based only on duration:

  chunk_dur = duration / n_parts   ← assumes even pacing
  for i in range(n_parts):
      cs = rel_start + i * chunk_dur

This produces wrong timing because singers don't sing at an even pace. The
real audio had 'Ajmo, Janezi!' in 0.9s and 'Pejd' greva, ajde, na traktorju od
Majde' in 6s, so the evenly spaced chunks did not line up with the vocals.

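Fix: when word-level timestamps are available (Soniox/Scribe), group
words into chunks whose start and end match the timestamps of the chunk's
first and last word. A chunk is closed before adding the next word would
push it past MAX_CHUNK_DURATION (2.5s), so splits always fall on word
boundaries. Segments without word timestamps, or with fewer than two words
inside the clip, still use the even split as a fallback.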

Before:
  00:00.000 → 01.900  AJMO, JANEZI! PEJD' GREVA, AJDE, NA TRAKTORJU OD
  00:01.900 → 03.800  MAJDE, NOBEN NAJU NE NAJDE, KO PELJEM TE

After:
  00:00.020 → 02.120  AJMO, JANEZI! PEJD' GREVA,
  00:02.360 → 04.820  AJDE, NA TRAKTORJU OD MAJDE, NOBEN

Subtitles now align with the vocals, because chunk boundaries come from the
actual word timestamps.
---
 app/main.py | 75 +++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 52 insertions(+), 23 deletions(-)

diff --git a/app/main.py b/app/main.py
index 89d88db..6acb057 100644
--- a/app/main.py
+++ b/app/main.py
@@ -293,10 +293,10 @@ def generate_srt_from_segments(segments, clip_start, clip_end, output_path):
                   f"({seg_dur:.1f}s, {word_count} besed): {text[:50]!r}", flush=True)
             continue
         
-        # Če segment delno štrli iz clip range-a IN imamo word-level timestampe,
-        # uporabi samo tiste besede ki dejansko padejo v clip range
-        # (sicer subtitle vsebuje besedilo iz prejšnjega/naslednjega refrena/verza)
-        if words and (s_start < clip_start or s_end > clip_end):
+        # Pripravi words_in_clip vedno (če imamo word-level timestampe)
+        # Uporabili ga bomo tako za segment trim kot za chunk boundaries
+        words_in_clip = None
+        if words:
             words_in_clip = []
             for w in words:
                 w_start = float(w.get("start", 0))
@@ -304,14 +304,16 @@ def generate_srt_from_segments(segments, clip_start, clip_end, output_path):
                 w_text = w.get("text", "").strip()
                 if not w_text:
                     continue
-                # Beseda padeva v clip če se prekriva (ne mora biti popolnoma znotraj)
                 if w_end > clip_start and w_start < clip_end:
                     words_in_clip.append({
                         "start": max(w_start, clip_start),
                         "end": min(w_end, clip_end),
                         "text": w_text,
                     })
-            
+        
+        # Če segment delno štrli iz clip range-a IN imamo word-level timestampe,
+        # uporabi samo tiste besede ki dejansko padejo v clip range
+        if words_in_clip and (s_start < clip_start or s_end > clip_end):
             if not words_in_clip:
                 continue
             
@@ -341,23 +343,50 @@ def generate_srt_from_segments(segments, clip_start, clip_end, output_path):
             lines.append(f"{idx}\n{fmt_ts(rel_start)} --> {fmt_ts(rel_end)}\n{text_upper}\n")
             idx += 1
         else:
-            # Razdeli na N enakih kosov; če ima Whisper word-timing, jih lahko razdelimo bolje,
-            # ampak za zdaj enako razdelimo
-            n_parts = int(duration / MAX_CHUNK_DURATION) + 1
-            words = text_upper.split()
-            words_per_part = max(1, len(words) // n_parts)
-            chunk_dur = duration / n_parts
-            for i in range(n_parts):
-                cs = rel_start + i * chunk_dur
-                ce = rel_start + (i + 1) * chunk_dur
-                # Vzemi pripadajoče besede
-                wstart = i * words_per_part
-                wend = (i + 1) * words_per_part if i < n_parts - 1 else len(words)
-                chunk_text = " ".join(words[wstart:wend]) if wstart < len(words) else text_upper
-                if not chunk_text.strip():
-                    chunk_text = text_upper
-                lines.append(f"{idx}\n{fmt_ts(cs)} --> {fmt_ts(ce)}\n{chunk_text.strip()}\n")
-                idx += 1
+            # ── WORD-LEVEL CHUNKING ──
+            # Če imamo word-level timestampe (Soniox/Scribe), uporabi DEJANSKE čase besed
+            # za chunk boundaries (NE enake time chunks, ker pevec ne poje enakomerno).
+            
+            if words_in_clip and len(words_in_clip) >= 2:
+                # Group besede v chunke z max trajanjem MAX_CHUNK_DURATION
+                chunks = []
+                current_chunk = [words_in_clip[0]]
+                for w in words_in_clip[1:]:
+                    chunk_start_time = current_chunk[0]["start"]
+                    chunk_dur_so_far = w["end"] - chunk_start_time
+                    if chunk_dur_so_far > MAX_CHUNK_DURATION:
+                        chunks.append(current_chunk)
+                        current_chunk = [w]
+                    else:
+                        current_chunk.append(w)
+                if current_chunk:
+                    chunks.append(current_chunk)
+                
+                # Generiraj SRT iz chunks (z dejanskimi word timestampi)
+                for chunk in chunks:
+                    cs = chunk[0]["start"] - clip_start
+                    ce = chunk[-1]["end"] - clip_start
+                    chunk_text = " ".join(w["text"] for w in chunk).upper().strip()
+                    if not chunk_text:
+                        continue
+                    lines.append(f"{idx}\n{fmt_ts(cs)} --> {fmt_ts(ce)}\n{chunk_text}\n")
+                    idx += 1
+            else:
+                # Fallback: brez word-level timestampov, razdeli enako
+                n_parts = int(duration / MAX_CHUNK_DURATION) + 1
+                words_split = text_upper.split()
+                words_per_part = max(1, len(words_split) // n_parts)
+                chunk_dur = duration / n_parts
+                for i in range(n_parts):
+                    cs = rel_start + i * chunk_dur
+                    ce = rel_start + (i + 1) * chunk_dur
+                    wstart = i * words_per_part
+                    wend = (i + 1) * words_per_part if i < n_parts - 1 else len(words_split)
+                    chunk_text = " ".join(words_split[wstart:wend]) if wstart < len(words_split) else text_upper
+                    if not chunk_text.strip():
+                        chunk_text = text_upper
+                    lines.append(f"{idx}\n{fmt_ts(cs)} --> {fmt_ts(ce)}\n{chunk_text.strip()}\n")
+                    idx += 1
 
     with open(output_path, "w", encoding="utf-8") as f:
         f.write("\n".join(lines))