From af3c933c784df134a4085b27174d3235fa13aef8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sebastjan=20Arti=C4=8D?= <sebastjan@folx.tv>
Date: Wed, 29 Apr 2026 07:59:20 +0000
Subject: [PATCH] Robust language detection + anti-hallucination
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- 3-sample voting for auto-detect (start/middle/end of song) prevents lang switching mid-song
- Lock detected language for full transcription
- Anti-hallucination: condition_on_previous_text=False, temperature=0.0
- compression_ratio_threshold=2.4 (rejects repetitive hallucinations)
- log_prob_threshold=-1.0 (rejects low-confidence segments)
- no_speech_threshold=0.6 (more aggressive silence detection)
- Default Whisper model in the web UI changed: small → medium (better for all langs incl. Slavic); the Python model_size defaults stay "small"
---
 scripts/analyze.py   | 51 ++++++++++++++++++++++++++-
 scripts/subtitle.py  | 83 +++++++++++++++++++++++++++++++++++++++++++-
 templates/index.html | 12 ++++---
 3 files changed, 139 insertions(+), 7 deletions(-)

diff --git a/scripts/analyze.py b/scripts/analyze.py
index db4beb1..9402d2b 100644
--- a/scripts/analyze.py
+++ b/scripts/analyze.py
@@ -47,13 +47,56 @@ def extract_audio(video_path):
 
 
 def transcribe_full(audio_path, lang=None, model_size="small"):
-    """Whisper transcript celega avdia. lang=None → auto-detect.
+    """Whisper transcript celega avdia. lang=None → robust auto-detect.
     
     Vrne empty transcript če Whisper ne najde govora (popolnoma instrumental)."""
     from faster_whisper import WhisperModel
 
     print(f"🧠 Whisper {model_size}, lang={lang or 'auto'}", file=sys.stderr)
     m = WhisperModel(model_size, device="cpu", compute_type="int8")
+
+    # Auto-detect z 3-sample voting da se zaklenemo na en jezik
+    if not lang:
+        print("   🔍 Robust lang detection (3 samples)...", file=sys.stderr)
+        try:
+            duration_proc = subprocess.run(
+                ["ffprobe", "-v", "error", "-show_entries", "format=duration",
+                 "-of", "default=nw=1:nokey=1", audio_path],
+                capture_output=True, text=True
+            )
+            audio_duration = float(duration_proc.stdout.strip())
+        except Exception:
+            audio_duration = 180.0
+
+        lang_votes = {}
+        for ss in [max(15, audio_duration * 0.15), audio_duration * 0.45, audio_duration * 0.75]:
+            if ss + 5 > audio_duration:
+                continue
+            sample = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
+            sample.close()
+            try:
+                subprocess.run(
+                    ["ffmpeg", "-y", "-ss", str(ss), "-i", audio_path,
+                     "-t", "30", "-vn", "-ac", "1", "-ar", "16000",
+                     "-c:a", "pcm_s16le", sample.name],
+                    check=True, capture_output=True
+                )
+                _, sample_info = m.transcribe(sample.name, language=None, vad_filter=False)
+                sl, sp = sample_info.language, float(sample_info.language_probability)
+                lang_votes[sl] = lang_votes.get(sl, 0) + sp
+                print(f"      sample @ {ss:.0f}s: {sl} (p={sp:.2f})", file=sys.stderr)
+            except Exception as e:
+                print(f"      sample @ {ss:.0f}s: failed", file=sys.stderr)
+            finally:
+                try:
+                    os.unlink(sample.name)
+                except Exception:
+                    pass
+
+        if lang_votes:
+            lang = max(lang_votes.items(), key=lambda x: x[1])[0]
+            print(f"   ✅ Lang lock: {lang}", file=sys.stderr)
+
     try:
         segs, info = m.transcribe(
             audio_path,
@@ -61,6 +104,12 @@ def transcribe_full(audio_path, lang=None, model_size="small"):
             word_timestamps=True,
             # VAD filter kdaj izpusti vokal med glasbo — pri pesmi bolje brez
             vad_filter=False,
+            # Anti-halucinacije
+            condition_on_previous_text=False,
+            temperature=0.0,
+            compression_ratio_threshold=2.4,
+            log_prob_threshold=-1.0,
+            no_speech_threshold=0.6,
         )
         detected_lang = info.language
         detected_prob = float(info.language_probability)
diff --git a/scripts/subtitle.py b/scripts/subtitle.py
index 75a38ab..4539712 100644
--- a/scripts/subtitle.py
+++ b/scripts/subtitle.py
@@ -17,18 +17,99 @@ import os
 from pathlib import Path
 
 
+def detect_language_robust(video, model):
+    """Pick one language for the whole file by voting over three 30 s samples.
+
+    Each sample (cut at max(15 s, 15 %), 45 % and 75 % of the duration) is run
+    through ``model.transcribe`` with auto-detection, and the reported
+    probabilities are summed per language, so Whisper cannot switch language
+    mid-song. Returns the code with the highest cumulative probability, or
+    None when no sample could be analysed (failed samples are skipped).
+    """
+    import subprocess
+    import tempfile  # local import: a module-level one is not visible here
+
+    try:
+        duration_proc = subprocess.run(
+            ["ffprobe", "-v", "error", "-show_entries", "format=duration",
+             "-of", "default=nw=1:nokey=1", str(video)],
+            capture_output=True, text=True
+        )
+        duration = float(duration_proc.stdout.strip())
+    except (OSError, ValueError):
+        duration = 180.0  # ffprobe missing or output unparsable: assume 3 min
+
+    # After the intro (likely verse 1), around the middle, and near the end.
+    sample_starts = [max(15, duration * 0.15), duration * 0.45, duration * 0.75]
+
+    lang_votes = {}  # language code -> cumulative probability
+    for ss in sample_starts:
+        if ss + 5 > duration:
+            continue  # fewer than 5 s of audio left at this offset
+        sample = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
+        sample.close()
+        try:
+            # Extraction runs inside the try so a failed ffmpeg call neither
+            # aborts the caller nor leaks the temporary file.
+            subprocess.run(
+                ["ffmpeg", "-y", "-ss", str(ss), "-i", str(video),
+                 "-t", "30", "-vn", "-ac", "1", "-ar", "16000",
+                 "-c:a", "pcm_s16le", sample.name],
+                check=True, capture_output=True
+            )
+            _, sample_info = model.transcribe(sample.name, language=None, vad_filter=False)
+            lang = sample_info.language
+            prob = float(sample_info.language_probability)
+            lang_votes[lang] = lang_votes.get(lang, 0) + prob
+            print(f"   sample @ {ss:.0f}s: {lang} (p={prob:.2f})")
+        except Exception as e:
+            print(f"   sample @ {ss:.0f}s: failed ({e})")
+        finally:
+            try:
+                os.unlink(sample.name)
+            except OSError:
+                pass
+
+    if not lang_votes:
+        return None
+    best_lang = max(lang_votes.items(), key=lambda x: x[1])
+    print(f"   🎯 Locked language: {best_lang[0]} (cumulative p={best_lang[1]:.2f})")
+    return best_lang[0]
+
+
 def transcribe(video, lang=None, model_size="small"):
     """Vrne pot do .srt datoteke."""
     from faster_whisper import WhisperModel
 
     print(f"🧠 Whisper model: {model_size}, lang={lang or 'auto'}")
     model = WhisperModel(model_size, device="cpu", compute_type="int8")
+
+    # Auto-detect z robust 3-sample voting (preprečuje preklop jezika sredi pesmi)
+    if not lang:
+        print("   🔍 Robust auto-detect (3 sampli)...")
+        lang = detect_language_robust(video, model)
+        if lang:
+            print(f"   ✅ Lang lock: {lang}")
+        else:
+            print("   ⚠️ Detection failed, fallback na auto per-segment")
+
     segments, info = model.transcribe(
         str(video),
-        language=lang,
+        language=lang,  # fixed za cel video
         word_timestamps=True,
         # VAD filter kdaj izpusti vokal med glasbo — pri pesmi bolje brez
         vad_filter=False,
+        # Anti-halucinacije:
+        # - condition_on_previous_text: ne predaja napak naprej
+        # - temperature=0: deterministično (brez "kreativnega" ugibanja)
+        # - compression_ratio_threshold: zazna ponavljajoče halucinacije
+        # - log_prob_threshold: zavrne segmente z nizko verjetnostjo
+        # - no_speech_threshold: agresivneje preskoči tihe dele
+        condition_on_previous_text=False,
+        temperature=0.0,
+        compression_ratio_threshold=2.4,
+        log_prob_threshold=-1.0,
+        no_speech_threshold=0.6,
     )
     print(f"   Detekcija: {info.language} (p={info.language_probability:.2f})")
 
diff --git a/templates/index.html b/templates/index.html
index b8513e3..de12abe 100644
--- a/templates/index.html
+++ b/templates/index.html
@@ -231,11 +231,11 @@
         <div>
           <label>Whisper model</label>
           <select id="model">
-            <option value="tiny">tiny (najhitrejši)</option>
+            <option value="tiny">tiny (najhitrejši, slabša natančnost)</option>
             <option value="base">base</option>
-            <option value="small" selected>small (DE/EN, hitro)</option>
-            <option value="medium">medium (priporočeno za SLO/HR/BS)</option>
-            <option value="large-v3">large-v3 (najboljše, počasno)</option>
+            <option value="small">small (DE/EN, hitro)</option>
+            <option value="medium" selected>medium (privzeto, vsi jeziki)</option>
+            <option value="large-v3">large-v3 (najbolje, počasno)</option>
           </select>
         </div>
       </div>
@@ -353,10 +353,12 @@
     });
 
     // ─── Auto-upgrade Whisper model za slovanske jezike ──
+    // Privzeto je medium; če uporabnik specifično izbere SLO/HR/BS in je na manjšem modelu, upgrade
     $("#lang").addEventListener("change", e => {
       const slavicLangs = ["sl", "hr", "bs", "sr"];
       const currentModel = $("#model").value;
-      if (slavicLangs.includes(e.target.value) && (currentModel === "tiny" || currentModel === "base" || currentModel === "small")) {
+      const smallerModels = ["tiny", "base", "small"];
+      if (slavicLangs.includes(e.target.value) && smallerModels.includes(currentModel)) {
         $("#model").value = "medium";
       }
     });