Robust language detection + anti-hallucination

- 3-sample voting for auto-detect (start/middle/end of song) prevents language switching mid-song
- Lock the detected language for the full transcription
- Anti-hallucination: condition_on_previous_text=False, temperature=0.0
- compression_ratio_threshold=2.4 (rejects repetitive hallucinations)
- log_prob_threshold=-1.0 (rejects low-confidence segments)
- no_speech_threshold=0.6 (more aggressive silence detection)
- Default Whisper model changed: small → medium (better for all languages incl. Slavic)
2026-04-29 07:59:20 +00:00 · 2026-04-29 07:59:20 +00:00 · af3c933c78
commit af3c933c78
parent c870d80726
3 changed files with 139 additions and 7 deletions
--- a/scripts/analyze.py
+++ b/scripts/analyze.py
@ -47,13 +47,56 @@ def extract_audio(video_path):
 def transcribe_full(audio_path, lang=None, model_size="small"):
-    """Whisper transcript celega avdia. lang=None → auto-detect.
+    """Whisper transcript celega avdia. lang=None → robust auto-detect.
    Vrne empty transcript če Whisper ne najde govora (popolnoma instrumental)."""
    from faster_whisper import WhisperModel
    print(f"🧠 Whisper {model_size}, lang={lang or 'auto'}", file=sys.stderr)
    m = WhisperModel(model_size, device="cpu", compute_type="int8")
    # Auto-detect z 3-sample voting da se zaklenemo na en jezik
    if not lang:
        print("   🔍 Robust lang detection (3 samples)...", file=sys.stderr)
        try:
            duration_proc = subprocess.run(
                ["ffprobe", "-v", "error", "-show_entries", "format=duration",
                 "-of", "default=nw=1:nokey=1", audio_path],
                capture_output=True, text=True
            )
            audio_duration = float(duration_proc.stdout.strip())
        except Exception:
            audio_duration = 180.0
        lang_votes = {}
        for ss in [max(15, audio_duration * 0.15), audio_duration * 0.45, audio_duration * 0.75]:
            if ss + 5 > audio_duration:
                continue
            sample = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
            sample.close()
            try:
                subprocess.run(
                    ["ffmpeg", "-y", "-ss", str(ss), "-i", audio_path,
                     "-t", "30", "-vn", "-ac", "1", "-ar", "16000",
                     "-c:a", "pcm_s16le", sample.name],
                    check=True, capture_output=True
                )
                _, sample_info = m.transcribe(sample.name, language=None, vad_filter=False)
                sl, sp = sample_info.language, float(sample_info.language_probability)
                lang_votes[sl] = lang_votes.get(sl, 0) + sp
                print(f"      sample @ {ss:.0f}s: {sl} (p={sp:.2f})", file=sys.stderr)
            except Exception as e:
                print(f"      sample @ {ss:.0f}s: failed", file=sys.stderr)
            finally:
                try:
                    os.unlink(sample.name)
                except Exception:
                    pass
        if lang_votes:
            lang = max(lang_votes.items(), key=lambda x: x[1])[0]
            print(f"   ✅ Lang lock: {lang}", file=sys.stderr)
    try:
        segs, info = m.transcribe(
            audio_path,
@ -61,6 +104,12 @@ def transcribe_full(audio_path, lang=None, model_size="small"):
            word_timestamps=True,
            # VAD filter kdaj izpusti vokal med glasbo — pri pesmi bolje brez
            vad_filter=False,
            # Anti-halucinacije
            condition_on_previous_text=False,
            temperature=0.0,
            compression_ratio_threshold=2.4,
            log_prob_threshold=-1.0,
            no_speech_threshold=0.6,
        )
        detected_lang = info.language
        detected_prob = float(info.language_probability)
--- a/scripts/subtitle.py
+++ b/scripts/subtitle.py
@ -17,18 +17,99 @@ import os
 from pathlib import Path
 def detect_language_robust(video, model):
    """Pick a single language for the whole track by voting over three samples.

    Steps:
    1. Cut three samples of up to 30 s from different parts of the track
       (after the intro, around the middle, towards the end).
    2. Run ``model.transcribe`` with ``language=None`` on each sample.
    3. Sum the reported language probabilities per language and return the
       language with the highest cumulative probability.

    Locking one language up front is meant to stop the transcription from
    switching language in the middle of a song.

    Detection is best-effort: a sample that cannot be extracted or
    transcribed is reported and skipped instead of aborting the run, and the
    temporary WAV file is removed in every case.

    Args:
        video: Path to the input media file (converted with ``str``).
        model: Object exposing ``transcribe(path, language=None,
            vad_filter=False)`` that returns ``(segments, info)``, where
            ``info`` has ``language`` and ``language_probability``.

    Returns:
        The winning language code, or ``None`` when no sample produced a
        vote (every sample failed, or the track is too short to sample).
    """
    import subprocess
    import tempfile

    # Probe the duration; fall back to 180 s when ffprobe is missing or its
    # output is not a number, so a probing problem never aborts detection.
    try:
        duration_proc = subprocess.run(
            ["ffprobe", "-v", "error", "-show_entries", "format=duration",
             "-of", "default=nw=1:nokey=1", str(video)],
            capture_output=True, text=True
        )
        duration = float(duration_proc.stdout.strip())
    except (OSError, ValueError):
        duration = 180.0

    # Three sample offsets: after the intro, around the middle, near the end.
    sample_starts = [
        max(15, duration * 0.15),
        duration * 0.45,
        duration * 0.75,
    ]

    lang_votes = {}  # language code -> cumulative probability
    for ss in sample_starts:
        # Skip offsets that would leave less than 5 s of audio to analyse.
        if ss + 5 > duration:
            continue

        sample = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
        sample.close()
        try:
            # Extraction lives inside the try block so that an ffmpeg failure
            # only costs this one sample and the temp file is still removed.
            subprocess.run(
                ["ffmpeg", "-y", "-ss", str(ss), "-i", str(video),
                 "-t", "30", "-vn", "-ac", "1", "-ar", "16000",
                 "-c:a", "pcm_s16le", sample.name],
                check=True, capture_output=True
            )
            _, sample_info = model.transcribe(sample.name, language=None, vad_filter=False)
            lang = sample_info.language
            prob = float(sample_info.language_probability)
            lang_votes[lang] = lang_votes.get(lang, 0) + prob
            print(f"   sample @ {ss:.0f}s: {lang} (p={prob:.2f})")
        except Exception as e:
            # Deliberately broad: any failure just drops this sample's vote.
            print(f"   sample @ {ss:.0f}s: failed ({e})")
        finally:
            try:
                os.unlink(sample.name)
            except OSError:
                pass

    if not lang_votes:
        return None

    best_lang = max(lang_votes.items(), key=lambda x: x[1])
    print(f"   🎯 Locked language: {best_lang[0]} (cumulative p={best_lang[1]:.2f})")
    return best_lang[0]
 def transcribe(video, lang=None, model_size="small"):
    """Vrne pot do .srt datoteke."""
    from faster_whisper import WhisperModel
    print(f"🧠 Whisper model: {model_size}, lang={lang or 'auto'}")
    model = WhisperModel(model_size, device="cpu", compute_type="int8")
    # Auto-detect z robust 3-sample voting (preprečuje preklop jezika sredi pesmi)
    if not lang:
        print("   🔍 Robust auto-detect (3 sampli)...")
        lang = detect_language_robust(video, model)
        if lang:
            print(f"   ✅ Lang lock: {lang}")
        else:
            print("   ⚠️ Detection failed, fallback na auto per-segment")
    segments, info = model.transcribe(
        str(video),
-        language=lang,
+        language=lang,  # fixed za cel video
        word_timestamps=True,
        # VAD filter kdaj izpusti vokal med glasbo — pri pesmi bolje brez
        vad_filter=False,
        # Anti-halucinacije:
        # - condition_on_previous_text: ne predaja napak naprej
        # - temperature=0: deterministično (brez "kreativnega" ugibanja)
        # - compression_ratio_threshold: zazna ponavljajoče halucinacije
        # - log_prob_threshold: zavrne segmente z nizko verjetnostjo
        # - no_speech_threshold: agresivneje preskoči tihe dele
        condition_on_previous_text=False,
        temperature=0.0,
        compression_ratio_threshold=2.4,
        log_prob_threshold=-1.0,
        no_speech_threshold=0.6,
    )
    print(f"   Detekcija: {info.language} (p={info.language_probability:.2f})")
--- a/templates/index.html
+++ b/templates/index.html
@ -231,11 +231,11 @@
        <div>
          <label>Whisper model</label>
          <select id="model">
-            <option value="tiny">tiny (najhitrejši)</option>
+            <option value="tiny">tiny (najhitrejši, slabša natančnost)</option>
            <option value="base">base</option>
-            <option value="small" selected>small (DE/EN, hitro)</option>
+            <option value="small">small (DE/EN, hitro)</option>
-            <option value="medium">medium (priporočeno za SLO/HR/BS)</option>
+            <option value="medium" selected>medium (privzeto, vsi jeziki)</option>
-            <option value="large-v3">large-v3 (najboljše, počasno)</option>
+            <option value="large-v3">large-v3 (najbolje, počasno)</option>
          </select>
        </div>
      </div>
@ -353,10 +353,12 @@
    });
    // ─── Auto-upgrade Whisper model za slovanske jezike ──
    // Privzeto je medium; če uporabnik specifično izbere SLO/HR/BS in je na manjšem modelu, upgrade
    $("#lang").addEventListener("change", e => {
      const slavicLangs = ["sl", "hr", "bs", "sr"];
      const currentModel = $("#model").value;
-      if (slavicLangs.includes(e.target.value) && (currentModel === "tiny" || currentModel === "base" || currentModel === "small")) {
+      const smallerModels = ["tiny", "base", "small"];
      if (slavicLangs.includes(e.target.value) && smallerModels.includes(currentModel)) {
        $("#model").value = "medium";
      }
    });