Robust language detection + anti-hallucination
- 3-sample voting for auto-detect (start/middle/end of song) prevents language switching mid-song
- Lock the detected language for the full transcription
- Anti-hallucination: condition_on_previous_text=False, temperature=0.0
- compression_ratio_threshold=2.4 (rejects repetitive hallucinations)
- log_prob_threshold=-1.0 (rejects low-confidence segments)
- no_speech_threshold=0.6 (more aggressive silence detection)
- Default Whisper model in the UI changed: small → medium (better for all languages, incl. Slavic)
This commit is contained in:
parent
c870d80726
commit
af3c933c78
@ -47,13 +47,56 @@ def extract_audio(video_path):
|
|||||||
|
|
||||||
|
|
||||||
def transcribe_full(audio_path, lang=None, model_size="small"):
|
def transcribe_full(audio_path, lang=None, model_size="small"):
|
||||||
"""Whisper transcript celega avdia. lang=None → auto-detect.
|
"""Whisper transcript celega avdia. lang=None → robust auto-detect.
|
||||||
|
|
||||||
Vrne empty transcript če Whisper ne najde govora (popolnoma instrumental)."""
|
Vrne empty transcript če Whisper ne najde govora (popolnoma instrumental)."""
|
||||||
from faster_whisper import WhisperModel
|
from faster_whisper import WhisperModel
|
||||||
|
|
||||||
print(f"🧠 Whisper {model_size}, lang={lang or 'auto'}", file=sys.stderr)
|
print(f"🧠 Whisper {model_size}, lang={lang or 'auto'}", file=sys.stderr)
|
||||||
m = WhisperModel(model_size, device="cpu", compute_type="int8")
|
m = WhisperModel(model_size, device="cpu", compute_type="int8")
|
||||||
|
|
||||||
|
# Auto-detect z 3-sample voting da se zaklenemo na en jezik
|
||||||
|
if not lang:
|
||||||
|
print(" 🔍 Robust lang detection (3 samples)...", file=sys.stderr)
|
||||||
|
try:
|
||||||
|
duration_proc = subprocess.run(
|
||||||
|
["ffprobe", "-v", "error", "-show_entries", "format=duration",
|
||||||
|
"-of", "default=nw=1:nokey=1", audio_path],
|
||||||
|
capture_output=True, text=True
|
||||||
|
)
|
||||||
|
audio_duration = float(duration_proc.stdout.strip())
|
||||||
|
except Exception:
|
||||||
|
audio_duration = 180.0
|
||||||
|
|
||||||
|
lang_votes = {}
|
||||||
|
for ss in [max(15, audio_duration * 0.15), audio_duration * 0.45, audio_duration * 0.75]:
|
||||||
|
if ss + 5 > audio_duration:
|
||||||
|
continue
|
||||||
|
sample = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
|
||||||
|
sample.close()
|
||||||
|
try:
|
||||||
|
subprocess.run(
|
||||||
|
["ffmpeg", "-y", "-ss", str(ss), "-i", audio_path,
|
||||||
|
"-t", "30", "-vn", "-ac", "1", "-ar", "16000",
|
||||||
|
"-c:a", "pcm_s16le", sample.name],
|
||||||
|
check=True, capture_output=True
|
||||||
|
)
|
||||||
|
_, sample_info = m.transcribe(sample.name, language=None, vad_filter=False)
|
||||||
|
sl, sp = sample_info.language, float(sample_info.language_probability)
|
||||||
|
lang_votes[sl] = lang_votes.get(sl, 0) + sp
|
||||||
|
print(f" sample @ {ss:.0f}s: {sl} (p={sp:.2f})", file=sys.stderr)
|
||||||
|
except Exception as e:
|
||||||
|
print(f" sample @ {ss:.0f}s: failed", file=sys.stderr)
|
||||||
|
finally:
|
||||||
|
try:
|
||||||
|
os.unlink(sample.name)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
if lang_votes:
|
||||||
|
lang = max(lang_votes.items(), key=lambda x: x[1])[0]
|
||||||
|
print(f" ✅ Lang lock: {lang}", file=sys.stderr)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
segs, info = m.transcribe(
|
segs, info = m.transcribe(
|
||||||
audio_path,
|
audio_path,
|
||||||
@ -61,6 +104,12 @@ def transcribe_full(audio_path, lang=None, model_size="small"):
|
|||||||
word_timestamps=True,
|
word_timestamps=True,
|
||||||
# VAD filter kdaj izpusti vokal med glasbo — pri pesmi bolje brez
|
# VAD filter kdaj izpusti vokal med glasbo — pri pesmi bolje brez
|
||||||
vad_filter=False,
|
vad_filter=False,
|
||||||
|
# Anti-halucinacije
|
||||||
|
condition_on_previous_text=False,
|
||||||
|
temperature=0.0,
|
||||||
|
compression_ratio_threshold=2.4,
|
||||||
|
log_prob_threshold=-1.0,
|
||||||
|
no_speech_threshold=0.6,
|
||||||
)
|
)
|
||||||
detected_lang = info.language
|
detected_lang = info.language
|
||||||
detected_prob = float(info.language_probability)
|
detected_prob = float(info.language_probability)
|
||||||
|
|||||||
@ -17,18 +17,99 @@ import os
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
|
def detect_language_robust(video, model):
    """Choose one language for the whole file by voting over three samples.

    Steps:
      1. Ask ffprobe for the media duration (falls back to 180 s if that
         fails for any reason).
      2. Cut up to three 30 s mono 16 kHz WAV samples with ffmpeg, starting
         at roughly 15 %, 45 % and 75 % of the duration.
      3. Run ``model.transcribe`` with ``language=None`` on each sample and
         add the reported language probability to that language's score.

    Locking a single language up front keeps Whisper from switching
    languages in the middle of a song.

    Args:
        video: Path (str or path-like) of the media file to sample.
        model: Object whose ``transcribe(path, language=None,
            vad_filter=False)`` returns ``(segments, info)`` where ``info``
            exposes ``language`` and ``language_probability``
            (presumably a faster_whisper ``WhisperModel`` -- confirm
            against the caller).

    Returns:
        The language code with the highest cumulative probability, or
        ``None`` when no sample could be analysed.
    """
    import subprocess
    import tempfile

    # A missing ffprobe binary or unparsable output must not abort
    # detection; use a typical song length instead.
    try:
        duration_proc = subprocess.run(
            ["ffprobe", "-v", "error", "-show_entries", "format=duration",
             "-of", "default=nw=1:nokey=1", str(video)],
            capture_output=True, text=True
        )
        duration = float(duration_proc.stdout.strip())
    except (OSError, subprocess.SubprocessError, ValueError):
        duration = 180.0

    # Three 30 s samples: after the intro, around the middle, near the end.
    sample_starts = [
        max(15, duration * 0.15),  # after the intro, likely verse 1
        duration * 0.45,           # roughly the middle, chorus
        duration * 0.75,           # last chorus
    ]

    lang_votes = {}  # language code -> cumulative probability
    for ss in sample_starts:
        # Skip start offsets that leave less than 5 s of audio.
        if ss + 5 > duration:
            continue

        sample = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
        sample.close()
        try:
            # Extraction lives inside the try block so that one failing
            # sample neither aborts the vote nor leaks the temp file.
            subprocess.run(
                ["ffmpeg", "-y", "-ss", str(ss), "-i", str(video),
                 "-t", "30", "-vn", "-ac", "1", "-ar", "16000",
                 "-c:a", "pcm_s16le", sample.name],
                check=True, capture_output=True
            )
            _, sample_info = model.transcribe(sample.name, language=None, vad_filter=False)
            lang = sample_info.language
            prob = float(sample_info.language_probability)
            lang_votes[lang] = lang_votes.get(lang, 0) + prob
            print(f" sample @ {ss:.0f}s: {lang} (p={prob:.2f})")
        except Exception as e:
            # Best effort: a bad sample only loses its own vote.
            print(f" sample @ {ss:.0f}s: failed ({e})")
        finally:
            try:
                os.unlink(sample.name)
            except OSError:
                pass  # temp file already gone or not removable

    if not lang_votes:
        return None
    best_lang = max(lang_votes.items(), key=lambda x: x[1])
    print(f" 🎯 Locked language: {best_lang[0]} (cumulative p={best_lang[1]:.2f})")
    return best_lang[0]
|
||||||
|
|
||||||
|
|
||||||
def transcribe(video, lang=None, model_size="small"):
|
def transcribe(video, lang=None, model_size="small"):
|
||||||
"""Vrne pot do .srt datoteke."""
|
"""Vrne pot do .srt datoteke."""
|
||||||
from faster_whisper import WhisperModel
|
from faster_whisper import WhisperModel
|
||||||
|
|
||||||
print(f"🧠 Whisper model: {model_size}, lang={lang or 'auto'}")
|
print(f"🧠 Whisper model: {model_size}, lang={lang or 'auto'}")
|
||||||
model = WhisperModel(model_size, device="cpu", compute_type="int8")
|
model = WhisperModel(model_size, device="cpu", compute_type="int8")
|
||||||
|
|
||||||
|
# Auto-detect z robust 3-sample voting (preprečuje preklop jezika sredi pesmi)
|
||||||
|
if not lang:
|
||||||
|
print(" 🔍 Robust auto-detect (3 sampli)...")
|
||||||
|
lang = detect_language_robust(video, model)
|
||||||
|
if lang:
|
||||||
|
print(f" ✅ Lang lock: {lang}")
|
||||||
|
else:
|
||||||
|
print(" ⚠️ Detection failed, fallback na auto per-segment")
|
||||||
|
|
||||||
segments, info = model.transcribe(
|
segments, info = model.transcribe(
|
||||||
str(video),
|
str(video),
|
||||||
language=lang,
|
language=lang, # fixed za cel video
|
||||||
word_timestamps=True,
|
word_timestamps=True,
|
||||||
# VAD filter kdaj izpusti vokal med glasbo — pri pesmi bolje brez
|
# VAD filter kdaj izpusti vokal med glasbo — pri pesmi bolje brez
|
||||||
vad_filter=False,
|
vad_filter=False,
|
||||||
|
# Anti-halucinacije:
|
||||||
|
# - condition_on_previous_text: ne predaja napak naprej
|
||||||
|
# - temperature=0: deterministično (brez "kreativnega" ugibanja)
|
||||||
|
# - compression_ratio_threshold: zazna ponavljajoče halucinacije
|
||||||
|
# - log_prob_threshold: zavrne segmente z nizko verjetnostjo
|
||||||
|
# - no_speech_threshold: agresivneje preskoči tihe dele
|
||||||
|
condition_on_previous_text=False,
|
||||||
|
temperature=0.0,
|
||||||
|
compression_ratio_threshold=2.4,
|
||||||
|
log_prob_threshold=-1.0,
|
||||||
|
no_speech_threshold=0.6,
|
||||||
)
|
)
|
||||||
print(f" Detekcija: {info.language} (p={info.language_probability:.2f})")
|
print(f" Detekcija: {info.language} (p={info.language_probability:.2f})")
|
||||||
|
|
||||||
|
|||||||
@ -231,11 +231,11 @@
|
|||||||
<div>
|
<div>
|
||||||
<label>Whisper model</label>
|
<label>Whisper model</label>
|
||||||
<select id="model">
|
<select id="model">
|
||||||
<option value="tiny">tiny (najhitrejši)</option>
|
<option value="tiny">tiny (najhitrejši, slabša natančnost)</option>
|
||||||
<option value="base">base</option>
|
<option value="base">base</option>
|
||||||
<option value="small" selected>small (DE/EN, hitro)</option>
|
<option value="small">small (DE/EN, hitro)</option>
|
||||||
<option value="medium">medium (priporočeno za SLO/HR/BS)</option>
|
<option value="medium" selected>medium (privzeto, vsi jeziki)</option>
|
||||||
<option value="large-v3">large-v3 (najboljše, počasno)</option>
|
<option value="large-v3">large-v3 (najbolje, počasno)</option>
|
||||||
</select>
|
</select>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
@ -353,10 +353,12 @@
|
|||||||
});
|
});
|
||||||
|
|
||||||
// ─── Auto-upgrade Whisper model za slovanske jezike ──
|
// ─── Auto-upgrade Whisper model za slovanske jezike ──
|
||||||
|
// Privzeto je medium; če uporabnik specifično izbere SLO/HR/BS in je na manjšem modelu, upgrade
|
||||||
$("#lang").addEventListener("change", e => {
|
$("#lang").addEventListener("change", e => {
|
||||||
const slavicLangs = ["sl", "hr", "bs", "sr"];
|
const slavicLangs = ["sl", "hr", "bs", "sr"];
|
||||||
const currentModel = $("#model").value;
|
const currentModel = $("#model").value;
|
||||||
if (slavicLangs.includes(e.target.value) && (currentModel === "tiny" || currentModel === "base" || currentModel === "small")) {
|
const smallerModels = ["tiny", "base", "small"];
|
||||||
|
if (slavicLangs.includes(e.target.value) && smallerModels.includes(currentModel)) {
|
||||||
$("#model").value = "medium";
|
$("#model").value = "medium";
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user