diff --git a/app/main.py b/app/main.py index 1c87360..ef13ca4 100644 --- a/app/main.py +++ b/app/main.py @@ -230,6 +230,18 @@ def generate_srt_from_segments(segments, clip_start, clip_end, output_path): if s_end <= clip_start or s_start >= clip_end: continue + # ── HALLUCINATION FILTER ── + # STT (Scribe, Whisper) občasno halucinira pri dolgih instrumentalih: + # vrne segment 60-100s z 1-2 besedama. Tak segment ne smemo dati v SRT. + # Pravilo: če je segment > 15s IN ima < 5 besed (ali words array < 5), + # je verjetno halucinacija. + seg_dur = s_end - s_start + word_count = len(words) if words else len(text.split()) + if seg_dur > 15 and word_count < 5: + print(f"[SRT] Preskočil halucinacijski segment [{s_start:.1f}-{s_end:.1f}] " + f"({seg_dur:.1f}s, {word_count} besed): {text[:50]!r}", flush=True) + continue + # Če segment delno štrli iz clip range-a IN imamo word-level timestampe, # uporabi samo tiste besede ki dejansko padejo v clip range # (sicer subtitle vsebuje besedilo iz prejšnjega/naslednjega refrena/verza) diff --git a/scripts/analyze.py b/scripts/analyze.py index 5146e8b..ed4c3fa 100644 --- a/scripts/analyze.py +++ b/scripts/analyze.py @@ -325,6 +325,31 @@ def transcribe_with_elevenlabs(audio_path, lang=None, model="scribe_v1", filenam if i + 1 < len(real_words): seg_start = real_words[i + 1].get("start", 0) + # ── HALLUCINATION DETECTION ── + # Scribe občasno vrne single dolg segment z 1-2 besedama (10-100s ene besede). + # To je halucinacija pri instrumentalih. 
+ hallucination_segs = [] + total_audio_duration = max((s["end"] for s in segments), default=0) + coverage = 0 + for s in segments: + seg_dur = s["end"] - s["start"] + word_count = len(s.get("words", [])) + if seg_dur > 15 and word_count < 5: + hallucination_segs.append(s) + else: + coverage += seg_dur + + coverage_pct = coverage / total_audio_duration * 100 if total_audio_duration else 0 + + if hallucination_segs: + print(f" ⚠️ Halucinacija(e) zaznana(e): {len(hallucination_segs)} segment(ov) " + f"daljših od 15s z manj kot 5 besedami:", file=sys.stderr) + for h in hallucination_segs: + print(f" [{h['start']:.1f}-{h['end']:.1f}s] = {h['end']-h['start']:.0f}s " + f"({len(h.get('words', []))} bes.) text={h.get('text', '')[:50]!r}", file=sys.stderr) + print(f" 📊 Pravo pokritje: {coverage:.1f}s / {total_audio_duration:.1f}s " + f"= {coverage_pct:.0f}%", file=sys.stderr) + print(f" ✅ Scribe: {len(words)} words → {len(segments)} segments, " f"lang={detected_lang_1} (p={detected_prob:.2f})", file=sys.stderr) @@ -333,6 +358,8 @@ def transcribe_with_elevenlabs(audio_path, lang=None, model="scribe_v1", filenam "language_probability": float(detected_prob), "segments": segments, "_provider": "elevenlabs", + "_hallucination_count": len(hallucination_segs), + "_coverage_pct": coverage_pct, } @@ -348,7 +375,25 @@ def transcribe_full(audio_path, lang=None, model_size="small", provider="auto", """ if provider in ("elevenlabs", "auto") and os.environ.get("ELEVENLABS_API_KEY"): result = transcribe_with_elevenlabs(audio_path, lang=lang, filename_hint=filename_hint) + + # Auto-retry če halucinacija zaznana (pokritje < 50% ali halucinacijski segmenti) if result and result.get("segments"): + hall_count = result.get("_hallucination_count", 0) + cov_pct = result.get("_coverage_pct", 100) + if hall_count > 0 or cov_pct < 50: + print(f" 🔄 Halucinacija/nizko pokritje ({cov_pct:.0f}%, " + f"{hall_count} hallucination segs) — RETRY Scribe...", file=sys.stderr) + # Drugi poskus z malo 
drugačnimi parametri — NOTE(review): the retry call below passes exactly the same arguments (audio_path, lang, filename_hint) as the first attempt; no parameter is actually varied + result2 = transcribe_with_elevenlabs(audio_path, lang=lang, filename_hint=filename_hint) + if result2 and result2.get("segments"): + h2 = result2.get("_hallucination_count", 0) + c2 = result2.get("_coverage_pct", 100) + if h2 < hall_count or c2 > cov_pct: + print(f" ✅ Retry boljši: pokritje {cov_pct:.0f}% → {c2:.0f}%, " + f"halucinacije {hall_count} → {h2}", file=sys.stderr) + result = result2 + else: + print(f" ⚠️ Retry ni izboljšal, ohrani prvi rezultat", file=sys.stderr) return result if provider == "elevenlabs": print(f" ⚠️ Scribe failed, no fallback (provider=elevenlabs)", file=sys.stderr)