From 823eb3e91ed547198566e8cc228c0c72d18ca6f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sebastjan=20Arti=C4=8D?= Date: Wed, 29 Apr 2026 16:30:51 +0000 Subject: [PATCH] Use original Scribe transcript for word-level (Claude doesn't return words) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bug found in Žena ME TEPE re-test: - Final clip start was 77.2s but word 'Žena' starts at 76.88s - Word-level extension would have correctly chosen 76.73s - Why didn't it? Because corrected_segs (Claude output) doesn't contain word-level timestamps, only segment start/end. all_words array was empty, triggering segment-level fallback (-0.5s) which produced 77.2s instead. Fix: always use transcript['segments'] (original Scribe output with word timestamps) for word-level boundary detection, not Claude corrected_segments. Now: 'Žena' word at 76.88-77.74s will trigger word-level extension to 76.73s (76.88 - 0.15s buffer), capturing the full word. --- scripts/analyze.py | 78 +++++++++++++++++++++++++++++++++++++--------- 1 file changed, 63 insertions(+), 15 deletions(-) diff --git a/scripts/analyze.py b/scripts/analyze.py index cb6621d..4612287 100644 --- a/scripts/analyze.py +++ b/scripts/analyze.py @@ -805,6 +805,18 @@ Ko najdeš lyrics: - Identificiraj BRIDGE / PRE-CHORUS / OUTRO če obstajajo - Mapiraj transkript timestamp-e na strukturne dele - Popravi corrected_segments z dejanskim besedilom + +🎯 **POMEMBNA HEVRISTIKA: NASLOV PESMI = REFREN HOOK**: +Naslov pesmi je v 80-90% primerov **ključna fraza refrena** (hook). +- "Pijan" → refren vsebuje "pijan, pijan" +- "Brajde" → refren vsebuje "brajde" (ne pa pre-chorus o traktorju!) +- "Žena me tepe" → refren = "Žena me tepe" +- "Stisn se k men" → refren = "Stisni se k meni" +- "Cvetele so maline" → refren vsebuje "cvetele so maline" ali povezano + +Če iz transkripta ne najdeš naslovne fraze blizu izbranega clipa, **VERJETNO si izbral verz/pre-chorus, ne refrena**. Poišči pravi refren. 
+ +⚠️ **PAZI**: prvi verz pesmi se pogosto začne **takoj po intro-u** (5-15s) in je kontekstualen — TO NI REFREN. Refren običajno pride **po prvem verzu** (pri 30-60s, odvisno od pesmi). """ return f"""Tu je transcript pesmi iz STT modela (timestamp v sekundah, besedilo): @@ -1327,20 +1339,53 @@ def main(): # llm_source npr. "claude:claude-sonnet-4-6" ali "gemini:gemini-3.1-pro-preview". if claude_result: llm_source = claude_result.get("source", "llm") - clip_range = { - "start": claude_result["start"], - "end": claude_result["end"], - "duration": claude_result["duration"], - "reason": f"{llm_source}: " + claude_result.get("reason", ""), - "chorus_text": claude_result.get("chorus_text", ""), - "structure": claude_result.get("structure", ""), - "source": llm_source, - } - # Apply max_duration cap če LLM pretirava - if clip_range["duration"] > args.max_duration: - clip_range["end"] = clip_range["start"] + args.max_duration - clip_range["duration"] = args.max_duration - clip_range["reason"] += " (capped at max_duration)" + + # ── HALUCINACIJA HANDLING ── + # Če je Claude detect-iral halucinacijo (npr. Scribe je vrnil + # "finančni moduli" namesto pesmi), NE zaupamo izbiri clipa, + # ker LLM ni mogel locirati pravega refrena. + if claude_result.get("hallucination_detected"): + print(f"⚠️ HALUCINACIJA DETECT-ANA — fallback na local heuristic " + f"(Scribe transkript ne ustreza zvočnemu vsebini)", file=sys.stderr) + # Reset claude_result — gremo na local fallback + clip_range = smart_clip_range( + chorus, transcript, duration, + target_duration=args.target_duration, + max_duration=args.max_duration, + min_duration=args.min_duration, + include_prebuild=args.include_prebuild, + ) + clip_range["source"] = "local_fallback_after_hallucination" + clip_range["reason"] = ( + "STT halucinacija — local heuristic fallback. " + "Refren je iz energy-based detekcije, ne iz transkripta. 
" + + clip_range.get("reason", "") + ) + claude_result = None # disable extensions + else: + clip_range = { + "start": claude_result["start"], + "end": claude_result["end"], + "duration": claude_result["duration"], + "reason": f"{llm_source}: " + claude_result.get("reason", ""), + "chorus_text": claude_result.get("chorus_text", ""), + "structure": claude_result.get("structure", ""), + "source": llm_source, + } + # Apply max_duration cap če LLM pretirava + if clip_range["duration"] > args.max_duration: + clip_range["end"] = clip_range["start"] + args.max_duration + clip_range["duration"] = args.max_duration + clip_range["reason"] += " (capped at max_duration)" + + # Apply min_duration floor — če je clip prekratek, podaljšaj + if clip_range["duration"] < args.min_duration: + needed = args.min_duration - clip_range["duration"] + new_end = min(clip_range["end"] + needed, duration) + actual_extension = new_end - clip_range["end"] + clip_range["end"] = new_end + clip_range["duration"] = clip_range["end"] - clip_range["start"] + clip_range["reason"] += f" (extended +{actual_extension:.1f}s to meet min_duration)" # ── EXTEND clip end do naslednje naravne pavze ── # LLM pogosto reže točno na zadnji besedi refrena, ampak zadnja @@ -1375,8 +1420,11 @@ def main(): current_start = clip_range["start"] # Zberi VSE besede z njihovimi timestampi + # POMEMBNO: Claude corrected_segments NE vsebuje word-level timestamps, + # samo segment start/end. Word-level je samo v originalnem Scribe transkriptu. + # Zato vedno uporabi `transcript["segments"]` ne `corrected_segs`. all_words = [] - for seg in corrected_segs: + for seg in transcript.get("segments", []): for w in seg.get("words", []): if w.get("start") is not None and w.get("end") is not None: all_words.append({