From df6011c3cfc72e9085ac266827aaf1e09efd7a12 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sebastjan=20Arti=C4=8D?=
Date: Wed, 29 Apr 2026 18:08:35 +0000
Subject: [PATCH] Detect Scribe hallucinations + filter from SRT + auto-retry
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Bug found in Žena ME TEPE third re-test:
- Scribe transcribed only verse 1 (0-33s) properly
- Then returned a single 98s segment [34.7-133.2] with just 1 word 'sam'
- This is a known Scribe hallucination on instrumental sections
- Result: SRT showed 'SAM SAM SAM SAM...' 14 times across the chorus
- Looked completely wrong because the chorus audio was correct but
  subtitles showed 'SAM' repeatedly

Three-part fix:

1. SRT GENERATOR: skip segments > 15s with < 5 words.
   These are hallucinations and have no real transcription value.

2. SCRIBE TRANSCRIBE: detect hallucinations in returned segments.
   - Mark segments > 15s with < 5 words as hallucinations
   - Compute true coverage % (excluding hallucinations)
   - Add _hallucination_count and _coverage_pct to result

3. TRANSCRIBE_FULL: auto-retry Scribe if quality is poor.
   - If hallucinations detected OR coverage < 50%, retry once
   - Keep retry result only if it has fewer hallucinations or higher
     coverage than the first attempt
   - Otherwise fall back to first attempt (still better than nothing)

This makes the pipeline robust against Scribe's occasional bad
transcripts on songs with long instrumental breaks. Most second
attempts succeed where the first failed (random Scribe variance).
--- app/main.py | 12 ++++++++++++ scripts/analyze.py | 45 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+) diff --git a/app/main.py b/app/main.py index 1c87360..ef13ca4 100644 --- a/app/main.py +++ b/app/main.py @@ -230,6 +230,18 @@ def generate_srt_from_segments(segments, clip_start, clip_end, output_path): if s_end <= clip_start or s_start >= clip_end: continue + # ── HALLUCINATION FILTER ── + # STT (Scribe, Whisper) občasno halucinira pri dolgih instrumentalih: + # vrne segment 60-100s z 1-2 besedama. Tak segment ne smemo dati v SRT. + # Pravilo: če je segment > 15s IN ima < 5 besed (ali words array < 5), + # je verjetno halucinacija. + seg_dur = s_end - s_start + word_count = len(words) if words else len(text.split()) + if seg_dur > 15 and word_count < 5: + print(f"[SRT] Preskočil halucinacijski segment [{s_start:.1f}-{s_end:.1f}] " + f"({seg_dur:.1f}s, {word_count} besed): {text[:50]!r}", flush=True) + continue + # Če segment delno štrli iz clip range-a IN imamo word-level timestampe, # uporabi samo tiste besede ki dejansko padejo v clip range # (sicer subtitle vsebuje besedilo iz prejšnjega/naslednjega refrena/verza) diff --git a/scripts/analyze.py b/scripts/analyze.py index 5146e8b..ed4c3fa 100644 --- a/scripts/analyze.py +++ b/scripts/analyze.py @@ -325,6 +325,31 @@ def transcribe_with_elevenlabs(audio_path, lang=None, model="scribe_v1", filenam if i + 1 < len(real_words): seg_start = real_words[i + 1].get("start", 0) + # ── HALLUCINATION DETECTION ── + # Scribe občasno vrne single dolg segment z 1-2 besedama (10-100s ene besede). + # To je halucinacija pri instrumentalih. 
+ hallucination_segs = [] + total_audio_duration = max((s["end"] for s in segments), default=0) + coverage = 0 + for s in segments: + seg_dur = s["end"] - s["start"] + word_count = len(s.get("words", [])) + if seg_dur > 15 and word_count < 5: + hallucination_segs.append(s) + else: + coverage += seg_dur + + coverage_pct = coverage / total_audio_duration * 100 if total_audio_duration else 0 + + if hallucination_segs: + print(f" ⚠️ Halucinacija(e) zaznana(e): {len(hallucination_segs)} segment(ov) " + f"daljših od 15s z manj kot 5 besedami:", file=sys.stderr) + for h in hallucination_segs: + print(f" [{h['start']:.1f}-{h['end']:.1f}s] = {h['end']-h['start']:.0f}s " + f"({len(h.get('words', []))} bes.) text={h.get('text', '')[:50]!r}", file=sys.stderr) + print(f" 📊 Pravo pokritje: {coverage:.1f}s / {total_audio_duration:.1f}s " + f"= {coverage_pct:.0f}%", file=sys.stderr) + print(f" ✅ Scribe: {len(words)} words → {len(segments)} segments, " f"lang={detected_lang_1} (p={detected_prob:.2f})", file=sys.stderr) @@ -333,6 +358,8 @@ def transcribe_with_elevenlabs(audio_path, lang=None, model="scribe_v1", filenam "language_probability": float(detected_prob), "segments": segments, "_provider": "elevenlabs", + "_hallucination_count": len(hallucination_segs), + "_coverage_pct": coverage_pct, } @@ -348,7 +375,25 @@ def transcribe_full(audio_path, lang=None, model_size="small", provider="auto", """ if provider in ("elevenlabs", "auto") and os.environ.get("ELEVENLABS_API_KEY"): result = transcribe_with_elevenlabs(audio_path, lang=lang, filename_hint=filename_hint) + + # Auto-retry če halucinacija zaznana (pokritje < 50% ali halucinacijski segmenti) if result and result.get("segments"): + hall_count = result.get("_hallucination_count", 0) + cov_pct = result.get("_coverage_pct", 100) + if hall_count > 0 or cov_pct < 50: + print(f" 🔄 Halucinacija/nizko pokritje ({cov_pct:.0f}%, " + f"{hall_count} hallucination segs) — RETRY Scribe...", file=sys.stderr) + # Drugi poskus z malo 
drugačnimi parametri + result2 = transcribe_with_elevenlabs(audio_path, lang=lang, filename_hint=filename_hint) + if result2 and result2.get("segments"): + h2 = result2.get("_hallucination_count", 0) + c2 = result2.get("_coverage_pct", 100) + if h2 < hall_count or c2 > cov_pct: + print(f" ✅ Retry boljši: pokritje {cov_pct:.0f}% → {c2:.0f}%, " + f"halucinacije {hall_count} → {h2}", file=sys.stderr) + result = result2 + else: + print(f" ⚠️ Retry ni izboljšal, ohrani prvi rezultat", file=sys.stderr) return result if provider == "elevenlabs": print(f" ⚠️ Scribe failed, no fallback (provider=elevenlabs)", file=sys.stderr)