From af3c933c784df134a4085b27174d3235fa13aef8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sebastjan=20Arti=C4=8D?= Date: Wed, 29 Apr 2026 07:59:20 +0000 Subject: [PATCH] Robust language detection + anti-hallucination MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 3-sample voting for auto-detect (start/middle/end of song) prevents lang switching mid-song - Lock detected language for full transcription - Anti-hallucination: condition_on_previous_text=False, temperature=0.0 - compression_ratio_threshold=2.4 (rejects repetitive hallucinations) - log_prob_threshold=-1.0 (rejects low-confidence segments) - no_speech_threshold=0.6 (more aggressive silence detection) - Default Whisper model changed: small → medium (better for all langs incl. Slavic) --- scripts/analyze.py | 51 ++++++++++++++++++++++++++- scripts/subtitle.py | 83 +++++++++++++++++++++++++++++++++++++++++++- templates/index.html | 12 ++++--- 3 files changed, 139 insertions(+), 7 deletions(-) diff --git a/scripts/analyze.py b/scripts/analyze.py index db4beb1..9402d2b 100644 --- a/scripts/analyze.py +++ b/scripts/analyze.py @@ -47,13 +47,56 @@ def extract_audio(video_path): def transcribe_full(audio_path, lang=None, model_size="small"): - """Whisper transcript celega avdia. lang=None → auto-detect. + """Whisper transcript celega avdia. lang=None → robust auto-detect. 
Vrne empty transcript če Whisper ne najde govora (popolnoma instrumental).""" from faster_whisper import WhisperModel print(f"🧠 Whisper {model_size}, lang={lang or 'auto'}", file=sys.stderr) m = WhisperModel(model_size, device="cpu", compute_type="int8") + + # Auto-detect z 3-sample voting da se zaklenemo na en jezik + if not lang: + print(" 🔍 Robust lang detection (3 samples)...", file=sys.stderr) + try: + duration_proc = subprocess.run( + ["ffprobe", "-v", "error", "-show_entries", "format=duration", + "-of", "default=nw=1:nokey=1", audio_path], + capture_output=True, text=True + ) + audio_duration = float(duration_proc.stdout.strip()) + except Exception: + audio_duration = 180.0 + + lang_votes = {} + for ss in [max(15, audio_duration * 0.15), audio_duration * 0.45, audio_duration * 0.75]: + if ss + 5 > audio_duration: + continue + sample = tempfile.NamedTemporaryFile(suffix=".wav", delete=False) + sample.close() + try: + subprocess.run( + ["ffmpeg", "-y", "-ss", str(ss), "-i", audio_path, + "-t", "30", "-vn", "-ac", "1", "-ar", "16000", + "-c:a", "pcm_s16le", sample.name], + check=True, capture_output=True + ) + _, sample_info = m.transcribe(sample.name, language=None, vad_filter=False) + sl, sp = sample_info.language, float(sample_info.language_probability) + lang_votes[sl] = lang_votes.get(sl, 0) + sp + print(f" sample @ {ss:.0f}s: {sl} (p={sp:.2f})", file=sys.stderr) + except Exception as e: + print(f" sample @ {ss:.0f}s: failed", file=sys.stderr) + finally: + try: + os.unlink(sample.name) + except Exception: + pass + + if lang_votes: + lang = max(lang_votes.items(), key=lambda x: x[1])[0] + print(f" ✅ Lang lock: {lang}", file=sys.stderr) + try: segs, info = m.transcribe( audio_path, @@ -61,6 +104,12 @@ def transcribe_full(audio_path, lang=None, model_size="small"): word_timestamps=True, # VAD filter kdaj izpusti vokal med glasbo — pri pesmi bolje brez vad_filter=False, + # Anti-halucinacije + condition_on_previous_text=False, + temperature=0.0, + 
compression_ratio_threshold=2.4, + log_prob_threshold=-1.0, + no_speech_threshold=0.6, ) detected_lang = info.language detected_prob = float(info.language_probability) diff --git a/scripts/subtitle.py b/scripts/subtitle.py index 75a38ab..4539712 100644 --- a/scripts/subtitle.py +++ b/scripts/subtitle.py @@ -17,18 +17,99 @@ import os from pathlib import Path +def detect_language_robust(video, model): + """Detect the language for auto mode by voting over audio samples: + 1. Take up to 3 samples of 30 s starting at max(15 s, 15%), 45% and 75% of the duration + 2. Transcribe each sample with Whisper auto-detect + 3. Return the language with the highest cumulative probability (None if no sample succeeded) + + This prevents Whisper from switching language in the middle of a song. + """ + import subprocess + try: # ffprobe may be missing or print no duration; fall back to 180 s + duration_proc = subprocess.run( + ["ffprobe", "-v", "error", "-show_entries", "format=duration", + "-of", "default=nw=1:nokey=1", str(video)], + capture_output=True, text=True + ) + duration = float(duration_proc.stdout.strip()) + except Exception: + duration = 180.0 + + # 3 samples of 30 s — start (after the intro), middle, towards the end + sample_starts = [ + max(15, duration * 0.15), # after the intro, where verse 1 probably is + duration * 0.45, # roughly the middle, chorus + duration * 0.75, # last chorus + ] + + lang_votes = {} # lang → cumulative_prob + for ss in sample_starts: + if ss + 5 > duration: + continue + # Extract 30s sample + sample = tempfile.NamedTemporaryFile(suffix=".wav", delete=False) + sample.close() + try: # ffmpeg runs inside the try so a failed extraction is reported and the temp file is still removed + subprocess.run( + ["ffmpeg", "-y", "-ss", str(ss), "-i", str(video), + "-t", "30", "-vn", "-ac", "1", "-ar", "16000", + "-c:a", "pcm_s16le", sample.name], + check=True, capture_output=True + ) + _, sample_info = model.transcribe(sample.name, language=None, vad_filter=False) + lang = sample_info.language + prob = float(sample_info.language_probability) + lang_votes[lang] = lang_votes.get(lang, 0) + prob + print(f" sample @ {ss:.0f}s: {lang} (p={prob:.2f})") + except Exception as e: + print(f" sample @ {ss:.0f}s: failed ({e})") + finally: + try: 
+ os.unlink(sample.name) + except Exception: + pass + + if not lang_votes: + return None + best_lang = max(lang_votes.items(), key=lambda x: x[1]) + print(f" 🎯 Locked language: {best_lang[0]} (cumulative p={best_lang[1]:.2f})") + return best_lang[0] + + def transcribe(video, lang=None, model_size="small"): """Vrne pot do .srt datoteke.""" from faster_whisper import WhisperModel print(f"🧠 Whisper model: {model_size}, lang={lang or 'auto'}") model = WhisperModel(model_size, device="cpu", compute_type="int8") + + # Auto-detect z robust 3-sample voting (preprečuje preklop jezika sredi pesmi) + if not lang: + print(" 🔍 Robust auto-detect (3 sampli)...") + lang = detect_language_robust(video, model) + if lang: + print(f" ✅ Lang lock: {lang}") + else: + print(" ⚠️ Detection failed, fallback na auto per-segment") + segments, info = model.transcribe( str(video), - language=lang, + language=lang, # fixed za cel video word_timestamps=True, # VAD filter kdaj izpusti vokal med glasbo — pri pesmi bolje brez vad_filter=False, + # Anti-halucinacije: + # - condition_on_previous_text: ne predaja napak naprej + # - temperature=0: deterministično (brez "kreativnega" ugibanja) + # - compression_ratio_threshold: zazna ponavljajoče halucinacije + # - log_prob_threshold: zavrne segmente z nizko verjetnostjo + # - no_speech_threshold: agresivneje preskoči tihe dele + condition_on_previous_text=False, + temperature=0.0, + compression_ratio_threshold=2.4, + log_prob_threshold=-1.0, + no_speech_threshold=0.6, ) print(f" Detekcija: {info.language} (p={info.language_probability:.2f})") diff --git a/templates/index.html b/templates/index.html index b8513e3..de12abe 100644 --- a/templates/index.html +++ b/templates/index.html @@ -231,11 +231,11 @@ 
@@ -353,10 +353,12 @@ }); // ─── Auto-upgrade Whisper model za slovanske jezike ── + // Default is medium; if the user explicitly picks a Slavic language (sl/hr/bs/sr) while a smaller model is selected, upgrade to medium $("#lang").addEventListener("change", e => { const slavicLangs = ["sl", "hr", "bs", "sr"]; const currentModel = $("#model").value; - if (slavicLangs.includes(e.target.value) && (currentModel === "tiny" || currentModel === "base" || currentModel === "small")) { + const smallerModels = ["tiny", "base", "small"]; + if (slavicLangs.includes(e.target.value) && smallerModels.includes(currentModel)) { $("#model").value = "medium"; } });