Detect Scribe hallucinations + filter from SRT + auto-retry

Bug found in Žena ME TEPE third re-test:
- Scribe transcribed only verse 1 (0-33s) properly
- Then returned a single 98s segment [34.7-133.2] with just 1 word 'sam'
- This is a known Scribe hallucination on instrumental sections
- Result: the SRT repeated 'SAM' 14 times across the chorus
- The subtitles looked completely wrong even though the chorus audio
  itself was correct

Three-part fix:

1. SRT GENERATOR: skip segments > 15s with < 5 words.
   These are hallucinations and have no real transcription value.

2. SCRIBE TRANSCRIBE: detect hallucinations in returned segments.
   - Flag segments > 15s with < 5 words as hallucinations
   - Compute true coverage %: total duration of the non-hallucinated
     segments divided by the latest segment end time
   - Add _hallucination_count and _coverage_pct to result

3. TRANSCRIBE_FULL: auto-retry Scribe if quality is poor.
   - If hallucinations detected OR coverage < 50%, retry once
   - Keep retry result only if it has fewer hallucination segments or
     higher coverage than the first attempt
   - Otherwise fall back to first attempt (still better than nothing)

This makes the pipeline robust against Scribe's occasional bad transcripts
on songs with long instrumental breaks. Most second attempts succeed
where the first failed (random Scribe variance).
This commit is contained in:
Sebastjan Artič 2026-04-29 18:08:35 +00:00
parent d3b71942d2
commit df6011c3cf
2 changed files with 57 additions and 0 deletions

View File

@ -230,6 +230,18 @@ def generate_srt_from_segments(segments, clip_start, clip_end, output_path):
if s_end <= clip_start or s_start >= clip_end:
continue
# ── HALLUCINATION FILTER ──
# STT (Scribe, Whisper) občasno halucinira pri dolgih instrumentalih:
# vrne segment 60-100s z 1-2 besedama. Tak segment ne smemo dati v SRT.
# Pravilo: če je segment > 15s IN ima < 5 besed (ali words array < 5),
# je verjetno halucinacija.
seg_dur = s_end - s_start
word_count = len(words) if words else len(text.split())
if seg_dur > 15 and word_count < 5:
print(f"[SRT] Preskočil halucinacijski segment [{s_start:.1f}-{s_end:.1f}] "
f"({seg_dur:.1f}s, {word_count} besed): {text[:50]!r}", flush=True)
continue
# Če segment delno štrli iz clip range-a IN imamo word-level timestampe,
# uporabi samo tiste besede ki dejansko padejo v clip range
# (sicer subtitle vsebuje besedilo iz prejšnjega/naslednjega refrena/verza)

View File

@ -325,6 +325,31 @@ def transcribe_with_elevenlabs(audio_path, lang=None, model="scribe_v1", filenam
if i + 1 < len(real_words):
seg_start = real_words[i + 1].get("start", 0)
# ── HALLUCINATION DETECTION ──
# Scribe občasno vrne single dolg segment z 1-2 besedama (10-100s ene besede).
# To je halucinacija pri instrumentalih.
hallucination_segs = []
total_audio_duration = max((s["end"] for s in segments), default=0)
coverage = 0
for s in segments:
seg_dur = s["end"] - s["start"]
word_count = len(s.get("words", []))
if seg_dur > 15 and word_count < 5:
hallucination_segs.append(s)
else:
coverage += seg_dur
coverage_pct = coverage / total_audio_duration * 100 if total_audio_duration else 0
if hallucination_segs:
print(f" ⚠️ Halucinacija(e) zaznana(e): {len(hallucination_segs)} segment(ov) "
f"daljših od 15s z manj kot 5 besedami:", file=sys.stderr)
for h in hallucination_segs:
print(f" [{h['start']:.1f}-{h['end']:.1f}s] = {h['end']-h['start']:.0f}s "
f"({len(h.get('words', []))} bes.) text={h.get('text', '')[:50]!r}", file=sys.stderr)
print(f" 📊 Pravo pokritje: {coverage:.1f}s / {total_audio_duration:.1f}s "
f"= {coverage_pct:.0f}%", file=sys.stderr)
print(f" ✅ Scribe: {len(words)} words → {len(segments)} segments, "
f"lang={detected_lang_1} (p={detected_prob:.2f})", file=sys.stderr)
@ -333,6 +358,8 @@ def transcribe_with_elevenlabs(audio_path, lang=None, model="scribe_v1", filenam
"language_probability": float(detected_prob),
"segments": segments,
"_provider": "elevenlabs",
"_hallucination_count": len(hallucination_segs),
"_coverage_pct": coverage_pct,
}
@ -348,7 +375,25 @@ def transcribe_full(audio_path, lang=None, model_size="small", provider="auto",
"""
if provider in ("elevenlabs", "auto") and os.environ.get("ELEVENLABS_API_KEY"):
result = transcribe_with_elevenlabs(audio_path, lang=lang, filename_hint=filename_hint)
# Auto-retry če halucinacija zaznana (pokritje < 50% ali halucinacijski segmenti)
if result and result.get("segments"):
hall_count = result.get("_hallucination_count", 0)
cov_pct = result.get("_coverage_pct", 100)
if hall_count > 0 or cov_pct < 50:
print(f" 🔄 Halucinacija/nizko pokritje ({cov_pct:.0f}%, "
f"{hall_count} hallucination segs) — RETRY Scribe...", file=sys.stderr)
# Second attempt with the same parameters; relies on Scribe's run-to-run variance
result2 = transcribe_with_elevenlabs(audio_path, lang=lang, filename_hint=filename_hint)
if result2 and result2.get("segments"):
h2 = result2.get("_hallucination_count", 0)
c2 = result2.get("_coverage_pct", 100)
if h2 < hall_count or c2 > cov_pct:
print(f" ✅ Retry boljši: pokritje {cov_pct:.0f}% → {c2:.0f}%, "
f"halucinacije {hall_count}{h2}", file=sys.stderr)
result = result2
else:
print(f" ⚠️ Retry ni izboljšal, ohrani prvi rezultat", file=sys.stderr)
return result
if provider == "elevenlabs":
print(f" ⚠️ Scribe failed, no fallback (provider=elevenlabs)", file=sys.stderr)