From 60765ad84c359491608cc3b6cb7dbe5877ac1514 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sebastjan=20Arti=C4=8D?= <sebastjan@folx.tv>
Date: Wed, 29 Apr 2026 10:48:55 +0000
Subject: [PATCH] Anti-hallucination: filename hint to LLM + beam search +
 silence threshold

When Whisper hallucinates (generates fake lyrics that do not match the audio),
the LLM can now use the original filename as a hint to recognize the song
and override the false transcript with the actual lyrics.

Pipeline:
1. Pass the filename without its extension (e.g. 'Ben Zucker - Bonnie und Clyde')
   to analyze.py via --filename-hint
2. Whisper transcribes (may hallucinate)
3. Claude/Gemini reads filename + transcript:
   - Recognizes song from filename hint
   - Compares Whisper output to known lyrics
   - Replaces hallucinated text with real lyrics (preserves timestamps)
   - If it can't fix a segment, removes it (better silent than wrong)

Also adds Whisper anti-hallucination params:
- beam_size=5 (beam search instead of greedy decoding, for more careful output)
- hallucination_silence_threshold=2.0 (skip text generated during long silences)
---
 app/main.py        |  5 +++
 scripts/analyze.py | 89 +++++++++++++++++++++++++++++++---------------
 2 files changed, 65 insertions(+), 29 deletions(-)

diff --git a/app/main.py b/app/main.py
index 30a1300..b47da6b 100644
--- a/app/main.py
+++ b/app/main.py
@@ -242,6 +242,11 @@ def process_job(job_id):
                 cmd += ["--llm-provider", job["llm_provider"]]
             if job.get("llm_model"):
                 cmd += ["--llm-model", job["llm_model"]]
+            # Filename hint = original filename (Claude lahko prepozna pesem)
+            if job.get("filename"):
+                # Brez extension
+                fn_hint = Path(job["filename"]).stem
+                cmd += ["--filename-hint", fn_hint]
             # lang: če None ali 'auto', pusti analyze.py auto-detect
             if job.get("lang") and job["lang"] not in ("auto", ""):
                 cmd += ["--lang", job["lang"]]
diff --git a/scripts/analyze.py b/scripts/analyze.py
index 24e3899..ffd4e4b 100644
--- a/scripts/analyze.py
+++ b/scripts/analyze.py
@@ -110,6 +110,10 @@ def transcribe_full(audio_path, lang=None, model_size="small"):
             compression_ratio_threshold=2.4,
             log_prob_threshold=-1.0,
             no_speech_threshold=0.6,
+            # Beam search namesto greedy = bolj zanesljiv decode (manj halucinacij)
+            beam_size=5,
+            # Halucinacija detection: če je tišina dolga, ne pretvarjaj v tekst
+            hallucination_silence_threshold=2.0,
         )
         detected_lang = info.language
         detected_prob = float(info.language_probability)
@@ -437,7 +441,7 @@ def detect_audio_fade(clip_range, transcript, video_duration=None):
     }
 
 
-def _build_analysis_prompt(transcript, video_duration, target_duration=30):
+def _build_analysis_prompt(transcript, video_duration, target_duration=30, filename_hint=None):
     """Pripravi enotni prompt za Claude/Gemini analizo."""
     lines = []
     for seg in transcript["segments"]:
@@ -447,46 +451,67 @@ def _build_analysis_prompt(transcript, video_duration, target_duration=30):
         lines.append(f"[{start:6.1f}-{end:6.1f}] {text}")
     transcript_text = "\n".join(lines)
 
+    hint_block = ""
+    if filename_hint:
+        hint_block = f"""
+
+🎵 IME DATOTEKE: "{filename_hint}"
+Iz imena datoteke morda lahko prepoznaš naslov pesmi ali izvajalca. Če je tako:
+- Uporabi svoje znanje o **dejanskem besedilu** te pesmi
+- Če Whisper transkript ne ustreza znanemu besedilu pesmi (halucinacija), POPRAVI besedilo na **dejansko besedilo pesmi**
+- Ohrani timestamp-e iz Whisper-ja (časovne meje so pravilne, samo besede so napačne)
+"""
+
     return f"""Tu je transcript pesmi iz Whisper modela (timestamp v sekundah, besedilo):
 
 {transcript_text}
 
-Cela pesem traja {video_duration:.1f}s. Cilj: izrezati ~{target_duration}s odsek za TikTok/Instagram Reel.
+Cela pesem traja {video_duration:.1f}s. Cilj: izrezati ~{target_duration}s odsek za TikTok/Instagram Reel.{hint_block}
 
-POMEMBNO: Whisper je avtomatski STT in pogosto naredi napake, posebej pri:
-- slovanskih jezikih (slovenščina, hrvaščina, bosanščina, srbščina)
-- narečnih izrazih
-- ko glasba prevladuje nad vokalom
+⚠️ POMEMBNO: Whisper si IZMIŠLJA besede ko ne razume jasno (HALLUCINACIJA). Posebej:
+- Ko glasba prevladuje nad vokalom
+- Pri narečjih in slovanskih jezikih
+- Generira "tipičen" tekst (npr. tekst druge pesmi istega izvajalca)
+- Lahko vstavi besede ki se POdoBNO slišijo, ampak imajo ČISTO drug pomen
+
+KAKO PREPOZNATI HALUCINACIJO:
+- Tekst nima smisla v kontekstu pesmi
+- Različni segmenti imajo nepovezane teme (kot da bi bilo več pesmi)
+- Refren je v vsakem ponovitvi različen (refren se MORA ponavljati identično)
+- Tekst je premalo **glede na trajanje** (več tišine = manj besed, ne več)
 
 PROSIM:
 1. Preberi celoten tekst in razumi strukturo (intro / verz / pre-chorus / refren / bridge / outro)
-2. POPRAVI očitne napake v transkripciji:
-   - Če pesem ima refren ki se ponavlja, vse pojavitve refrena POPRAVI da imajo ENAKO besedilo (uporabi najjasnejšo varianto)
-   - Popravi napačne besede ki nimajo smisla v kontekstu
-   - Popravi pomešane jezike (če pesem je slovenska, vse vrstice naj bodo v slovenščini)
+2. POPRAVI očitne halucinacije:
+   - Če prepoznaš pesem (po izvajalcu, naslovu, znaku besedila) → **uporabi PRAVO besedilo**
+   - Če halucinacijo ne moreš popraviti, **odstrani segment** (raje brez podnapisa kot napačen)
+   - Refren MORA imeti vse pojavitve ENAKE
+   - Popravi pomešane jezike (vse vrstice v enem jeziku)
    - Ohrani timestamp-e nespremenjene
-3. Prepoznaj REFREN: del besedila, ki se ponavlja v pesmi
+3. Prepoznaj REFREN: del besedila ki se PONAVLJA
 4. Izberi najboljši odsek za reel:
-   - Vključi cel refren (cel verz besedila brez prekinitve)
-   - Če imaš prostor, dodaj pre-chorus build-up tik pred refrenom
-   - Lahko traja 20-45 sekund (ne strogo 30s)
-   - Začni in končaj na smiselni meji (konec stavka, ne sredi besede)
-5. Če pesem nima jasnega refrena (instrumental, monolog, govor), izberi najbolj dramatičen ali zaključen del
+   - Vključi cel refren (brez prekinitve)
+   - Lahko dodaj pre-chorus build-up
+   - 20-45 sekund
+   - Začni in končaj na smiselni meji
+5. Če pesem nima jasnega refrena, izberi najbolj dramatičen ali zaključen del
+6. Če Whisper transkript je v večini halucinacija (manj kot 30% smiselnih besed), v "reason" napiši "WHISPER_HALLUCINATION_DETECTED" in vrni najmanj segmentov (samo tisti ki so smiselni)
 
 Odgovori SAMO v JSON formatu (brez markdown, brez razlage):
 {{
   "start": <sekunde>,
   "end": <sekunde>,
-  "reason": "<kratka razlaga zakaj ta odsek>",
-  "chorus_text": "<besedilo refrena ali ključni del>",
+  "reason": "<kratka razlaga>",
+  "chorus_text": "<besedilo refrena>",
   "structure": "<1 stavek o strukturi pesmi>",
   "language": "<jezik: sl/de/hr/bs/sr/en/it/es/fr>",
+  "hallucination_detected": <true/false>,
   "corrected_segments": [
-    {{"start": <s>, "end": <s>, "text": "<popravljeno besedilo>"}}
+    {{"start": <s>, "end": <s>, "text": "<popravljeno besedilo ALI prazno če halucinacija>"}}
   ]
 }}
 
-V "corrected_segments" vključi VSE segmente iz inputa s popravljenim besedilom (ohrani timestamp-e)."""
+V "corrected_segments" vključi VSE segmente iz inputa s popravljenim besedilom. Halucinacije nadomesti s pravim besedilom (če veš) ALI pusti prazno besedilo."""
 
 
 def _parse_llm_response(text, video_duration):
@@ -522,10 +547,11 @@ def _parse_llm_response(text, video_duration):
     }
 
 
-def analyze_with_claude(transcript, video_duration, target_duration=30, model="claude-sonnet-4-6"):
+def analyze_with_claude(transcript, video_duration, target_duration=30, model="claude-sonnet-4-6", filename_hint=None):
     """Pošlje transkript Claude API-ju (Anthropic).
     
     model: claude-sonnet-4-6 (default), claude-haiku-4-5-20251001, claude-opus-4-7
+    filename_hint: ime datoteke (Claude lahko prepozna pesem in popravi halucinacije)
     """
     api_key = os.environ.get("ANTHROPIC_API_KEY")
     if not api_key:
@@ -535,7 +561,7 @@ def analyze_with_claude(transcript, video_duration, target_duration=30, model="c
     if not transcript.get("segments"):
         return None
 
-    prompt = _build_analysis_prompt(transcript, video_duration, target_duration)
+    prompt = _build_analysis_prompt(transcript, video_duration, target_duration, filename_hint=filename_hint)
 
     try:
         import urllib.request
@@ -600,7 +626,7 @@ def analyze_with_claude(transcript, video_duration, target_duration=30, model="c
         return None
 
 
-def analyze_with_gemini(transcript, video_duration, target_duration=30, model="gemini-3.1-pro-preview"):
+def analyze_with_gemini(transcript, video_duration, target_duration=30, model="gemini-3.1-pro-preview", filename_hint=None):
     """Pošlje transkript Gemini API-ju (Google).
     
     Gemini 3.1 Pro ima najboljši multilingual rezultat (MMMLU 92.6%) — odličen za SLO/HR/BS.
@@ -613,7 +639,7 @@ def analyze_with_gemini(transcript, video_duration, target_duration=30, model="g
     if not transcript.get("segments"):
         return None
 
-    prompt = _build_analysis_prompt(transcript, video_duration, target_duration)
+    prompt = _build_analysis_prompt(transcript, video_duration, target_duration, filename_hint=filename_hint)
 
     try:
         import urllib.request
@@ -705,23 +731,23 @@ def analyze_with_gemini(transcript, video_duration, target_duration=30, model="g
         return None
 
 
-def analyze_with_llm(transcript, video_duration, target_duration=30, provider="claude", llm_model=None):
+def analyze_with_llm(transcript, video_duration, target_duration=30, provider="claude", llm_model=None, filename_hint=None):
     """Glavna funkcija — uporabi izbrano LLM (claude/gemini/auto)."""
     if provider == "gemini":
         model = llm_model or "gemini-3.1-pro-preview"
-        return analyze_with_gemini(transcript, video_duration, target_duration, model)
+        return analyze_with_gemini(transcript, video_duration, target_duration, model, filename_hint=filename_hint)
     elif provider == "claude":
         model = llm_model or "claude-sonnet-4-6"
-        return analyze_with_claude(transcript, video_duration, target_duration, model)
+        return analyze_with_claude(transcript, video_duration, target_duration, model, filename_hint=filename_hint)
     elif provider == "auto":
         # Najprej probaj Claude, fallback na Gemini
         result = analyze_with_claude(transcript, video_duration, target_duration,
-                                      llm_model or "claude-sonnet-4-6")
+                                      llm_model or "claude-sonnet-4-6", filename_hint=filename_hint)
         if result:
             return result
         print("   🔄 Claude ni uspel, probam Gemini...", file=sys.stderr)
         return analyze_with_gemini(transcript, video_duration, target_duration,
-                                    llm_model or "gemini-3.1-pro-preview")
+                                    llm_model or "gemini-3.1-pro-preview", filename_hint=filename_hint)
     else:
         print(f"   ⚠️ Neznan LLM provider: {provider}", file=sys.stderr)
         return None
@@ -760,6 +786,8 @@ def main():
                     help="Kateri LLM uporabiti za analizo (default: claude)")
     ap.add_argument("--llm-model", default=None,
                     help="Specifičen model (npr. claude-sonnet-4-6, gemini-3.1-pro-preview)")
+    ap.add_argument("--filename-hint", default=None,
+                    help="Originalno ime datoteke (Claude lahko prepozna pesem)")
     ap.add_argument("--json", action="store_true", help="Output JSON")
     ap.add_argument("--output", help="Path za JSON output")
     args = ap.parse_args()
@@ -795,9 +823,12 @@ def main():
         if not instrumental and not args.no_claude:
             provider = args.llm_provider
             print(f"🤖 Pošiljam transkript {provider}-u za analizo...", file=sys.stderr)
+            # Filename hint = original filename brez extension (Claude lahko prepozna pesem)
+            fname_hint = args.filename_hint or video.stem
             claude_result = analyze_with_llm(
                 transcript, duration, target_duration=args.target_duration,
                 provider=provider, llm_model=args.llm_model,
+                filename_hint=fname_hint,
             )
 
         # 5b. Find chorus lokalno (kot fallback ali za score-jev preview)