Robust language detection + anti-hallucination
- 3-sample voting for auto-detect (start/middle/end of song) prevents language switching mid-song
- Lock the detected language for the full transcription
- Anti-hallucination: condition_on_previous_text=False, temperature=0.0
- compression_ratio_threshold=2.4 (rejects repetitive hallucinations)
- log_prob_threshold=-1.0 (rejects low-confidence segments)
- no_speech_threshold=0.6 (more aggressive silence detection)
- Default Whisper model changed: small → medium (better for all languages, incl. Slavic)
This commit is contained in:
parent
c870d80726
commit
af3c933c78
@ -47,13 +47,56 @@ def extract_audio(video_path):
|
||||
|
||||
|
||||
def transcribe_full(audio_path, lang=None, model_size="small"):
|
||||
"""Whisper transcript celega avdia. lang=None → auto-detect.
|
||||
"""Whisper transcript celega avdia. lang=None → robust auto-detect.
|
||||
|
||||
Vrne empty transcript če Whisper ne najde govora (popolnoma instrumental)."""
|
||||
from faster_whisper import WhisperModel
|
||||
|
||||
print(f"🧠 Whisper {model_size}, lang={lang or 'auto'}", file=sys.stderr)
|
||||
m = WhisperModel(model_size, device="cpu", compute_type="int8")
|
||||
|
||||
# Auto-detect z 3-sample voting da se zaklenemo na en jezik
|
||||
if not lang:
|
||||
print(" 🔍 Robust lang detection (3 samples)...", file=sys.stderr)
|
||||
try:
|
||||
duration_proc = subprocess.run(
|
||||
["ffprobe", "-v", "error", "-show_entries", "format=duration",
|
||||
"-of", "default=nw=1:nokey=1", audio_path],
|
||||
capture_output=True, text=True
|
||||
)
|
||||
audio_duration = float(duration_proc.stdout.strip())
|
||||
except Exception:
|
||||
audio_duration = 180.0
|
||||
|
||||
lang_votes = {}
|
||||
for ss in [max(15, audio_duration * 0.15), audio_duration * 0.45, audio_duration * 0.75]:
|
||||
if ss + 5 > audio_duration:
|
||||
continue
|
||||
sample = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
|
||||
sample.close()
|
||||
try:
|
||||
subprocess.run(
|
||||
["ffmpeg", "-y", "-ss", str(ss), "-i", audio_path,
|
||||
"-t", "30", "-vn", "-ac", "1", "-ar", "16000",
|
||||
"-c:a", "pcm_s16le", sample.name],
|
||||
check=True, capture_output=True
|
||||
)
|
||||
_, sample_info = m.transcribe(sample.name, language=None, vad_filter=False)
|
||||
sl, sp = sample_info.language, float(sample_info.language_probability)
|
||||
lang_votes[sl] = lang_votes.get(sl, 0) + sp
|
||||
print(f" sample @ {ss:.0f}s: {sl} (p={sp:.2f})", file=sys.stderr)
|
||||
except Exception as e:
|
||||
print(f" sample @ {ss:.0f}s: failed", file=sys.stderr)
|
||||
finally:
|
||||
try:
|
||||
os.unlink(sample.name)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
if lang_votes:
|
||||
lang = max(lang_votes.items(), key=lambda x: x[1])[0]
|
||||
print(f" ✅ Lang lock: {lang}", file=sys.stderr)
|
||||
|
||||
try:
|
||||
segs, info = m.transcribe(
|
||||
audio_path,
|
||||
@ -61,6 +104,12 @@ def transcribe_full(audio_path, lang=None, model_size="small"):
|
||||
word_timestamps=True,
|
||||
# VAD filter kdaj izpusti vokal med glasbo — pri pesmi bolje brez
|
||||
vad_filter=False,
|
||||
# Anti-halucinacije
|
||||
condition_on_previous_text=False,
|
||||
temperature=0.0,
|
||||
compression_ratio_threshold=2.4,
|
||||
log_prob_threshold=-1.0,
|
||||
no_speech_threshold=0.6,
|
||||
)
|
||||
detected_lang = info.language
|
||||
detected_prob = float(info.language_probability)
|
||||
|
||||
@ -17,18 +17,99 @@ import os
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def detect_language_robust(video, model):
    """Detect the dominant language of a song by voting over three samples.

    Steps:
      1. Cut three 30-second mono 16 kHz WAV samples at 15 %, 45 % and 75 %
         of the duration (the first one never earlier than 15 s).
      2. Run ``model.transcribe`` with ``language=None`` on each sample.
      3. Sum the reported language probabilities per language and return the
         language with the highest cumulative probability.

    Locking one language up front keeps the full transcription from
    switching language in the middle of a song.

    Args:
        video: Path (or path-like) of the media file; passed to
            ffprobe/ffmpeg via ``str(video)``.
        model: Object exposing ``transcribe(path, language=..., vad_filter=...)``
            that returns ``(segments, info)`` where ``info`` has ``language``
            and ``language_probability`` attributes.

    Returns:
        The winning language code, or ``None`` when no sample could be
        analysed (every sample skipped or failed).
    """
    import os
    import subprocess
    import tempfile

    # Probe the total duration. Any failure (ffprobe missing, unparsable
    # output) falls back to an assumed 180 s instead of aborting detection.
    try:
        duration_proc = subprocess.run(
            ["ffprobe", "-v", "error", "-show_entries", "format=duration",
             "-of", "default=nw=1:nokey=1", str(video)],
            capture_output=True, text=True
        )
        duration = float(duration_proc.stdout.strip())
    except (OSError, ValueError):
        duration = 180.0

    # Three 30 s samples: after the intro, around the middle, towards the end.
    sample_starts = [
        max(15, duration * 0.15),
        duration * 0.45,
        duration * 0.75,
    ]

    lang_votes = {}  # language code -> cumulative probability
    for ss in sample_starts:
        # Skip samples that would hold less than 5 s of audio.
        if ss + 5 > duration:
            continue

        # delete=False + close(): ffmpeg must be able to (re)open the path.
        sample = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
        sample.close()
        try:
            # Extraction lives inside the try block so that a failing ffmpeg
            # only costs this one vote and the temp file is still removed.
            subprocess.run(
                ["ffmpeg", "-y", "-ss", str(ss), "-i", str(video),
                 "-t", "30", "-vn", "-ac", "1", "-ar", "16000",
                 "-c:a", "pcm_s16le", sample.name],
                check=True, capture_output=True
            )
            _, sample_info = model.transcribe(sample.name, language=None, vad_filter=False)
            lang = sample_info.language
            prob = float(sample_info.language_probability)
            lang_votes[lang] = lang_votes.get(lang, 0) + prob
            print(f"   sample @ {ss:.0f}s: {lang} (p={prob:.2f})")
        except Exception as e:
            # Best effort: one bad sample must not abort the whole vote.
            print(f"   sample @ {ss:.0f}s: failed ({e})")
        finally:
            try:
                os.unlink(sample.name)
            except OSError:
                pass

    if not lang_votes:
        return None

    best_lang = max(lang_votes.items(), key=lambda x: x[1])
    print(f"   🎯 Locked language: {best_lang[0]} (cumulative p={best_lang[1]:.2f})")
    return best_lang[0]
|
||||
|
||||
|
||||
def transcribe(video, lang=None, model_size="small"):
|
||||
"""Vrne pot do .srt datoteke."""
|
||||
from faster_whisper import WhisperModel
|
||||
|
||||
print(f"🧠 Whisper model: {model_size}, lang={lang or 'auto'}")
|
||||
model = WhisperModel(model_size, device="cpu", compute_type="int8")
|
||||
|
||||
# Auto-detect z robust 3-sample voting (preprečuje preklop jezika sredi pesmi)
|
||||
if not lang:
|
||||
print(" 🔍 Robust auto-detect (3 sampli)...")
|
||||
lang = detect_language_robust(video, model)
|
||||
if lang:
|
||||
print(f" ✅ Lang lock: {lang}")
|
||||
else:
|
||||
print(" ⚠️ Detection failed, fallback na auto per-segment")
|
||||
|
||||
segments, info = model.transcribe(
|
||||
str(video),
|
||||
language=lang,
|
||||
language=lang, # fixed za cel video
|
||||
word_timestamps=True,
|
||||
# VAD filter kdaj izpusti vokal med glasbo — pri pesmi bolje brez
|
||||
vad_filter=False,
|
||||
# Anti-halucinacije:
|
||||
# - condition_on_previous_text: ne predaja napak naprej
|
||||
# - temperature=0: deterministično (brez "kreativnega" ugibanja)
|
||||
# - compression_ratio_threshold: zazna ponavljajoče halucinacije
|
||||
# - log_prob_threshold: zavrne segmente z nizko verjetnostjo
|
||||
# - no_speech_threshold: agresivneje preskoči tihe dele
|
||||
condition_on_previous_text=False,
|
||||
temperature=0.0,
|
||||
compression_ratio_threshold=2.4,
|
||||
log_prob_threshold=-1.0,
|
||||
no_speech_threshold=0.6,
|
||||
)
|
||||
print(f" Detekcija: {info.language} (p={info.language_probability:.2f})")
|
||||
|
||||
|
||||
@ -231,11 +231,11 @@
|
||||
<div>
|
||||
<label>Whisper model</label>
|
||||
<select id="model">
|
||||
<option value="tiny">tiny (najhitrejši)</option>
|
||||
<option value="tiny">tiny (najhitrejši, slabša natančnost)</option>
|
||||
<option value="base">base</option>
|
||||
<option value="small" selected>small (DE/EN, hitro)</option>
|
||||
<option value="medium">medium (priporočeno za SLO/HR/BS)</option>
|
||||
<option value="large-v3">large-v3 (najboljše, počasno)</option>
|
||||
<option value="small">small (DE/EN, hitro)</option>
|
||||
<option value="medium" selected>medium (privzeto, vsi jeziki)</option>
|
||||
<option value="large-v3">large-v3 (najbolje, počasno)</option>
|
||||
</select>
|
||||
</div>
|
||||
</div>
|
||||
@ -353,10 +353,12 @@
|
||||
});
|
||||
|
||||
// ─── Auto-upgrade Whisper model za slovanske jezike ──
|
||||
// Privzeto je medium; če uporabnik specifično izbere SLO/HR/BS in je na manjšem modelu, upgrade
|
||||
$("#lang").addEventListener("change", e => {
|
||||
const slavicLangs = ["sl", "hr", "bs", "sr"];
|
||||
const currentModel = $("#model").value;
|
||||
if (slavicLangs.includes(e.target.value) && (currentModel === "tiny" || currentModel === "base" || currentModel === "small")) {
|
||||
const smallerModels = ["tiny", "base", "small"];
|
||||
if (slavicLangs.includes(e.target.value) && smallerModels.includes(currentModel)) {
|
||||
$("#model").value = "medium";
|
||||
}
|
||||
});
|
||||
|
||||
Loading…
Reference in New Issue
Block a user