From d73453fe50e5b371d8bf160bedbc6bfeb3d8f5d6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sebastjan=20Arti=C4=8D?=
Date: Wed, 29 Apr 2026 16:48:39 +0000
Subject: [PATCH] Fix SRT subtitles: word-level clipping for partial segments
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Bug found in Žena ME TEPE re-test:
- Clip start: 76.73s (correct, captures full 'Žena' word)
- But SRT subtitle #1 showed: 'SAJ ŠE DOMA MI VEČ NOČJO VERJET.'
- That text is from the PREVIOUS verse, not the chorus!

Why: previous segment (73.9-78.2s) contained 'saj še doma mi več nočjo verjet. Žena me'. Clip start fell at 76.73s (mid-segment). Old SRT logic: max(s_start, clip_start) just clipped TIMING but kept ALL the text from that segment, including text from before the clip.

Fix: when a segment partially falls outside clip range AND has word-level timestamps (Scribe provides these), reconstruct the segment using only the words that actually fall within [clip_start, clip_end]. Audio (clipped at clip_start) only contains those words anyway, so the subtitle should match.

Result for Žena chorus:
- Old: 'SAJ ŠE DOMA MI VEČ NOČJO VERJET.'
(wrong, that text is silent in clip) - New: 'ŽENA ME' (only words actually heard at 76.73-78.16s) --- app/main.py | 36 +++++++++++++++++++++++++++++++++--- 1 file changed, 33 insertions(+), 3 deletions(-) diff --git a/app/main.py b/app/main.py index 89dda52..1c87360 100644 --- a/app/main.py +++ b/app/main.py @@ -224,13 +224,43 @@ def generate_srt_from_segments(segments, clip_start, clip_end, output_path): s_start = float(seg["start"]) s_end = float(seg["end"]) text = str(seg["text"]).strip() + words = seg.get("words", []) or [] # Filter v range if s_end <= clip_start or s_start >= clip_end: continue - # Klipni - s_start = max(s_start, clip_start) - s_end = min(s_end, clip_end) + + # Če segment delno štrli iz clip range-a IN imamo word-level timestampe, + # uporabi samo tiste besede ki dejansko padejo v clip range + # (sicer subtitle vsebuje besedilo iz prejšnjega/naslednjega refrena/verza) + if words and (s_start < clip_start or s_end > clip_end): + words_in_clip = [] + for w in words: + w_start = float(w.get("start", 0)) + w_end = float(w.get("end", 0)) + w_text = w.get("text", "").strip() + if not w_text: + continue + # Beseda padeva v clip če se prekriva (ne mora biti popolnoma znotraj) + if w_end > clip_start and w_start < clip_end: + words_in_clip.append({ + "start": max(w_start, clip_start), + "end": min(w_end, clip_end), + "text": w_text, + }) + + if not words_in_clip: + continue + + # Reconstruiraj segment z dejanskim word-level timing-om + text = " ".join(w["text"] for w in words_in_clip) + s_start = words_in_clip[0]["start"] + s_end = words_in_clip[-1]["end"] + else: + # Klipni segment začetek/konec na clip range + s_start = max(s_start, clip_start) + s_end = min(s_end, clip_end) + if s_end - s_start < 0.2: continue