diff --git a/scripts/analyze.py b/scripts/analyze.py index cb6621d..4612287 100644 --- a/scripts/analyze.py +++ b/scripts/analyze.py @@ -805,6 +805,18 @@ Ko najdeš lyrics: - Identificiraj BRIDGE / PRE-CHORUS / OUTRO če obstajajo - Mapiraj transkript timestamp-e na strukturne dele - Popravi corrected_segments z dejanskim besedilom + +🎯 **POMEMBNA HEVRISTIKA: NASLOV PESMI = REFREN HOOK**: +Naslov pesmi je v 80-90% primerov **ključna fraza refrena** (hook). +- "Pijan" → refren vsebuje "pijan, pijan" +- "Brajde" → refren vsebuje "brajde" (ne pa pre-chorus o traktorju!) +- "Žena me tepe" → refren = "Žena me tepe" +- "Stisn se k men" → refren = "Stisni se k meni" +- "Cvetele so maline" → refren vsebuje "cvetele so maline" ali povezano + +Če iz transkripta ne najdeš naslovne fraze blizu izbranega clipa, **VERJETNO si izbral verz/pre-chorus, ne refrena**. Poišči pravi refren. + +⚠️ **PAZI**: prvi verz pesmi se pogosto začne **takoj po intro-u** (5-15s) in je kontekstualen — TO NI REFREN. Refren običajno pride **po prvem verzu** (pri 30-60s, odvisno od pesmi). """ return f"""Tu je transcript pesmi iz STT modela (timestamp v sekundah, besedilo): @@ -1327,20 +1339,53 @@ def main(): # llm_source npr. "claude:claude-sonnet-4-6" ali "gemini:gemini-3.1-pro-preview". if claude_result: llm_source = claude_result.get("source", "llm") - clip_range = { - "start": claude_result["start"], - "end": claude_result["end"], - "duration": claude_result["duration"], - "reason": f"{llm_source}: " + claude_result.get("reason", ""), - "chorus_text": claude_result.get("chorus_text", ""), - "structure": claude_result.get("structure", ""), - "source": llm_source, - } - # Apply max_duration cap če LLM pretirava - if clip_range["duration"] > args.max_duration: - clip_range["end"] = clip_range["start"] + args.max_duration - clip_range["duration"] = args.max_duration - clip_range["reason"] += " (capped at max_duration)" + + # ── HALUCINACIJA HANDLING ── + # Če je Claude detect-iral halucinacijo (npr. Scribe je vrnil + # "finančni moduli" namesto pesmi), NE zaupamo izbiri clipa, + # ker LLM ni mogel locirati pravega refrena. + if claude_result.get("hallucination_detected"): + print(f"⚠️ HALUCINACIJA DETECT-ANA — fallback na local heuristic " + f"(Scribe transkript ne ustreza zvočnemu vsebini)", file=sys.stderr) + # Reset claude_result — gremo na local fallback + clip_range = smart_clip_range( + chorus, transcript, duration, + target_duration=args.target_duration, + max_duration=args.max_duration, + min_duration=args.min_duration, + include_prebuild=args.include_prebuild, + ) + clip_range["source"] = "local_fallback_after_hallucination" + clip_range["reason"] = ( + "STT halucinacija — local heuristic fallback. " + "Refren je iz energy-based detekcije, ne iz transkripta. " + + clip_range.get("reason", "") + ) + claude_result = None # disable extensions + else: + clip_range = { + "start": claude_result["start"], + "end": claude_result["end"], + "duration": claude_result["duration"], + "reason": f"{llm_source}: " + claude_result.get("reason", ""), + "chorus_text": claude_result.get("chorus_text", ""), + "structure": claude_result.get("structure", ""), + "source": llm_source, + } + # Apply max_duration cap če LLM pretirava + if clip_range["duration"] > args.max_duration: + clip_range["end"] = clip_range["start"] + args.max_duration + clip_range["duration"] = args.max_duration + clip_range["reason"] += " (capped at max_duration)" + + # Apply min_duration floor — če je clip prekratek, podaljšaj + if clip_range["duration"] < args.min_duration: + needed = args.min_duration - clip_range["duration"] + new_end = min(clip_range["end"] + needed, duration) + actual_extension = new_end - clip_range["end"] + clip_range["end"] = new_end + clip_range["duration"] = clip_range["end"] - clip_range["start"] + clip_range["reason"] += f" (extended +{actual_extension:.1f}s to meet min_duration)" # ── EXTEND clip end do naslednje naravne pavze ── # LLM pogosto reže točno na zadnji besedi refrena, ampak zadnja @@ -1375,8 +1420,11 @@ def main(): current_start = clip_range["start"] # Zberi VSE besede z njihovimi timestampi + # POMEMBNO: Claude corrected_segments NE vsebuje word-level timestamps, + # samo segment start/end. Word-level je samo v originalnem Scribe transkriptu. + # Zato vedno uporabi `transcript["segments"]` ne `corrected_segs`. all_words = [] - for seg in corrected_segs: + for seg in transcript.get("segments", []): for w in seg.get("words", []): if w.get("start") is not None and w.get("end") is not None: all_words.append({