Robust language detection + anti-hallucination

- 3-sample voting for auto-detect (start/middle/end of song) prevents language switching mid-song
- Lock the detected language for the full transcription
- Anti-hallucination: condition_on_previous_text=False, temperature=0.0
- compression_ratio_threshold=2.4 (rejects repetitive hallucinations)
- log_prob_threshold=-1.0 (rejects low-confidence segments)
- no_speech_threshold=0.6 (more aggressive silence detection)
- Default Whisper model changed: small → medium (better for all languages, incl. Slavic)
2026-04-29 07:59:20 +00:00 · 2026-04-29 07:59:20 +00:00 · af3c933c78
commit af3c933c78
parent c870d80726
3 changed files with 139 additions and 7 deletions
--- a/scripts/analyze.py
+++ b/scripts/analyze.py
@ -47,13 +47,56 @@ def extract_audio(video_path):


 def transcribe_full(audio_path, lang=None, model_size="small"):
-    """Whisper transcript celega avdia. lang=None → auto-detect.
+    """Whisper transcript celega avdia. lang=None → robust auto-detect.
    
    Vrne empty transcript če Whisper ne najde govora (popolnoma instrumental)."""
    from faster_whisper import WhisperModel

    print(f"🧠 Whisper {model_size}, lang={lang or 'auto'}", file=sys.stderr)
    m = WhisperModel(model_size, device="cpu", compute_type="int8")
+
+    # Auto-detect z 3-sample voting da se zaklenemo na en jezik
+    if not lang:
+        print("   🔍 Robust lang detection (3 samples)...", file=sys.stderr)
+        try:
+            duration_proc = subprocess.run(
+                ["ffprobe", "-v", "error", "-show_entries", "format=duration",
+                 "-of", "default=nw=1:nokey=1", audio_path],
+                capture_output=True, text=True
+            )
+            audio_duration = float(duration_proc.stdout.strip())
+        except Exception:
+            audio_duration = 180.0
+
+        lang_votes = {}
+        for ss in [max(15, audio_duration * 0.15), audio_duration * 0.45, audio_duration * 0.75]:
+            if ss + 5 > audio_duration:
+                continue
+            sample = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
+            sample.close()
+            try:
+                subprocess.run(
+                    ["ffmpeg", "-y", "-ss", str(ss), "-i", audio_path,
+                     "-t", "30", "-vn", "-ac", "1", "-ar", "16000",
+                     "-c:a", "pcm_s16le", sample.name],
+                    check=True, capture_output=True
+                )
+                _, sample_info = m.transcribe(sample.name, language=None, vad_filter=False)
+                sl, sp = sample_info.language, float(sample_info.language_probability)
+                lang_votes[sl] = lang_votes.get(sl, 0) + sp
+                print(f"      sample @ {ss:.0f}s: {sl} (p={sp:.2f})", file=sys.stderr)
+            except Exception as e:
+                print(f"      sample @ {ss:.0f}s: failed", file=sys.stderr)
+            finally:
+                try:
+                    os.unlink(sample.name)
+                except Exception:
+                    pass
+
+        if lang_votes:
+            lang = max(lang_votes.items(), key=lambda x: x[1])[0]
+            print(f"   ✅ Lang lock: {lang}", file=sys.stderr)
+
    try:
        segs, info = m.transcribe(
            audio_path,
@ -61,6 +104,12 @@ def transcribe_full(audio_path, lang=None, model_size="small"):
            word_timestamps=True,
            # VAD filter kdaj izpusti vokal med glasbo — pri pesmi bolje brez
            vad_filter=False,
+            # Anti-halucinacije
+            condition_on_previous_text=False,
+            temperature=0.0,
+            compression_ratio_threshold=2.4,
+            log_prob_threshold=-1.0,
+            no_speech_threshold=0.6,
        )
        detected_lang = info.language
        detected_prob = float(info.language_probability)
--- a/scripts/subtitle.py
+++ b/scripts/subtitle.py
@ -17,18 +17,99 @@ import os
 from pathlib import Path


+def detect_language_robust(video, model):
+    """Pick one language for the whole video by voting over three samples.
+
+    Steps:
+    1. Cut up to three 30 s mono 16 kHz WAV samples at roughly 15 %, 45 %
+       and 75 % of the duration (after the intro, mid-song, towards the end).
+    2. Run Whisper auto-detection on every sample.
+    3. Sum the reported probabilities per language and return the language
+       with the highest cumulative probability.
+
+    Locking the language up front prevents Whisper from switching languages
+    in the middle of a song.
+
+    Args:
+        video: Path of the input media file; passed to ffprobe and ffmpeg.
+        model: Object whose ``transcribe(path, language=None,
+            vad_filter=False)`` returns ``(segments, info)``, where ``info``
+            exposes ``language`` and ``language_probability``.
+
+    Returns:
+        The winning language code, or ``None`` when no sample could be
+        analysed (the caller then falls back to per-segment auto-detect).
+    """
+    import subprocess
+    import tempfile
+
+    # Probe the duration; fall back to an assumed 3-minute song when ffprobe
+    # is missing (OSError) or prints something that is not a number.
+    try:
+        duration_proc = subprocess.run(
+            ["ffprobe", "-v", "error", "-show_entries", "format=duration",
+             "-of", "default=nw=1:nokey=1", str(video)],
+            capture_output=True, text=True
+        )
+        duration = float(duration_proc.stdout.strip())
+    except (OSError, ValueError):
+        duration = 180.0
+
+    # Three 30 s samples: after the intro, around the middle, near the end.
+    sample_starts = [
+        max(15, duration * 0.15),  # after the intro, likely verse 1
+        duration * 0.45,            # roughly the middle, chorus
+        duration * 0.75,            # last chorus
+    ]
+
+    lang_votes = {}  # lang -> cumulative probability
+    for ss in sample_starts:
+        # Skip start offsets that leave less than 5 s of audio to analyse.
+        if ss + 5 > duration:
+            continue
+        sample = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
+        sample.close()
+        try:
+            # Extraction lives inside the try block so that an ffmpeg failure
+            # only costs this one vote and the temp file is still removed.
+            subprocess.run(
+                ["ffmpeg", "-y", "-ss", str(ss), "-i", str(video),
+                 "-t", "30", "-vn", "-ac", "1", "-ar", "16000",
+                 "-c:a", "pcm_s16le", sample.name],
+                check=True, capture_output=True
+            )
+            _, sample_info = model.transcribe(sample.name, language=None, vad_filter=False)
+            lang = sample_info.language
+            prob = float(sample_info.language_probability)
+            lang_votes[lang] = lang_votes.get(lang, 0) + prob
+            print(f"   sample @ {ss:.0f}s: {lang} (p={prob:.2f})")
+        except Exception as e:
+            # Best effort: one bad sample must not abort the whole detection.
+            print(f"   sample @ {ss:.0f}s: failed ({e})")
+        finally:
+            try:
+                os.unlink(sample.name)
+            except OSError:
+                pass
+
+    if not lang_votes:
+        return None
+    best_lang = max(lang_votes.items(), key=lambda x: x[1])
+    print(f"   🎯 Locked language: {best_lang[0]} (cumulative p={best_lang[1]:.2f})")
+    return best_lang[0]
+
+
 def transcribe(video, lang=None, model_size="small"):
    """Vrne pot do .srt datoteke."""
    from faster_whisper import WhisperModel

    print(f"🧠 Whisper model: {model_size}, lang={lang or 'auto'}")
    model = WhisperModel(model_size, device="cpu", compute_type="int8")
+
+    # Auto-detect z robust 3-sample voting (preprečuje preklop jezika sredi pesmi)
+    if not lang:
+        print("   🔍 Robust auto-detect (3 sampli)...")
+        lang = detect_language_robust(video, model)
+        if lang:
+            print(f"   ✅ Lang lock: {lang}")
+        else:
+            print("   ⚠️ Detection failed, fallback na auto per-segment")
+
    segments, info = model.transcribe(
        str(video),
-        language=lang,
+        language=lang,  # fixed za cel video
        word_timestamps=True,
        # VAD filter kdaj izpusti vokal med glasbo — pri pesmi bolje brez
        vad_filter=False,
+        # Anti-halucinacije:
+        # - condition_on_previous_text: ne predaja napak naprej
+        # - temperature=0: deterministično (brez "kreativnega" ugibanja)
+        # - compression_ratio_threshold: zazna ponavljajoče halucinacije
+        # - log_prob_threshold: zavrne segmente z nizko verjetnostjo
+        # - no_speech_threshold: agresivneje preskoči tihe dele
+        condition_on_previous_text=False,
+        temperature=0.0,
+        compression_ratio_threshold=2.4,
+        log_prob_threshold=-1.0,
+        no_speech_threshold=0.6,
    )
    print(f"   Detekcija: {info.language} (p={info.language_probability:.2f})")

--- a/templates/index.html
+++ b/templates/index.html
@ -231,11 +231,11 @@
        <div>
          <label>Whisper model</label>
          <select id="model">
-            <option value="tiny">tiny (najhitrejši)</option>
+            <option value="tiny">tiny (najhitrejši, slabša natančnost)</option>
            <option value="base">base</option>
-            <option value="small" selected>small (DE/EN, hitro)</option>
-            <option value="medium">medium (priporočeno za SLO/HR/BS)</option>
-            <option value="large-v3">large-v3 (najboljše, počasno)</option>
+            <option value="small">small (DE/EN, hitro)</option>
+            <option value="medium" selected>medium (privzeto, vsi jeziki)</option>
+            <option value="large-v3">large-v3 (najbolje, počasno)</option>
          </select>
        </div>
      </div>
@ -353,10 +353,12 @@
    });

    // ─── Auto-upgrade Whisper model za slovanske jezike ──
+    // Privzeto je medium; če uporabnik specifično izbere SLO/HR/BS in je na manjšem modelu, upgrade
    $("#lang").addEventListener("change", e => {
      const slavicLangs = ["sl", "hr", "bs", "sr"];
      const currentModel = $("#model").value;
-      if (slavicLangs.includes(e.target.value) && (currentModel === "tiny" || currentModel === "base" || currentModel === "small")) {
+      const smallerModels = ["tiny", "base", "small"];
+      if (slavicLangs.includes(e.target.value) && smallerModels.includes(currentModel)) {
        $("#model").value = "medium";
      }
    });