Detect Scribe hallucinations + filter from SRT + auto-retry
Bug found in the third re-test of Žena ME TEPE:
- Scribe transcribed only verse 1 (0-33s) properly.
- It then returned a single 98s segment [34.7-133.2] containing just one word, 'sam'.
- This is a known Scribe hallucination on instrumental sections.
- Result: the SRT showed 'SAM SAM SAM SAM...' 14 times across the chorus.
- It looked completely wrong because the chorus audio was correct but the subtitles showed 'SAM' repeatedly.

Three-part fix:
1. SRT GENERATOR: skip segments > 15s with < 5 words. These are hallucinations and have no real transcription value.
2. SCRIBE TRANSCRIBE: detect hallucinations in the returned segments.
   - Mark segments > 15s with < 5 words as hallucinations.
   - Compute the true coverage % (excluding hallucinations).
   - Add _hallucination_count and _coverage_pct to the result.
3. TRANSCRIBE_FULL: auto-retry Scribe if quality is poor.
   - If hallucinations are detected OR coverage < 50%, retry once.
   - Keep the retry result only if it has better stats.
   - Otherwise fall back to the first attempt (still better than nothing).

This makes the pipeline robust against Scribe's occasional bad transcripts on songs with long instrumental breaks. Most second attempts succeed where the first failed (random Scribe variance).
This commit is contained in:
parent
d3b71942d2
commit
df6011c3cf
12
app/main.py
12
app/main.py
@ -230,6 +230,18 @@ def generate_srt_from_segments(segments, clip_start, clip_end, output_path):
|
||||
if s_end <= clip_start or s_start >= clip_end:
|
||||
continue
|
||||
|
||||
# ── HALLUCINATION FILTER ──
|
||||
# STT (Scribe, Whisper) občasno halucinira pri dolgih instrumentalih:
|
||||
# vrne segment 60-100s z 1-2 besedama. Tak segment ne smemo dati v SRT.
|
||||
# Pravilo: če je segment > 15s IN ima < 5 besed (ali words array < 5),
|
||||
# je verjetno halucinacija.
|
||||
seg_dur = s_end - s_start
|
||||
word_count = len(words) if words else len(text.split())
|
||||
if seg_dur > 15 and word_count < 5:
|
||||
print(f"[SRT] Preskočil halucinacijski segment [{s_start:.1f}-{s_end:.1f}] "
|
||||
f"({seg_dur:.1f}s, {word_count} besed): {text[:50]!r}", flush=True)
|
||||
continue
|
||||
|
||||
# Če segment delno štrli iz clip range-a IN imamo word-level timestampe,
|
||||
# uporabi samo tiste besede ki dejansko padejo v clip range
|
||||
# (sicer subtitle vsebuje besedilo iz prejšnjega/naslednjega refrena/verza)
|
||||
|
||||
@ -325,6 +325,31 @@ def transcribe_with_elevenlabs(audio_path, lang=None, model="scribe_v1", filenam
|
||||
if i + 1 < len(real_words):
|
||||
seg_start = real_words[i + 1].get("start", 0)
|
||||
|
||||
# ── HALLUCINATION DETECTION ──
|
||||
# Scribe občasno vrne single dolg segment z 1-2 besedama (10-100s ene besede).
|
||||
# To je halucinacija pri instrumentalih.
|
||||
hallucination_segs = []
|
||||
total_audio_duration = max((s["end"] for s in segments), default=0)
|
||||
coverage = 0
|
||||
for s in segments:
|
||||
seg_dur = s["end"] - s["start"]
|
||||
word_count = len(s.get("words", []))
|
||||
if seg_dur > 15 and word_count < 5:
|
||||
hallucination_segs.append(s)
|
||||
else:
|
||||
coverage += seg_dur
|
||||
|
||||
coverage_pct = coverage / total_audio_duration * 100 if total_audio_duration else 0
|
||||
|
||||
if hallucination_segs:
|
||||
print(f" ⚠️ Halucinacija(e) zaznana(e): {len(hallucination_segs)} segment(ov) "
|
||||
f"daljših od 15s z manj kot 5 besedami:", file=sys.stderr)
|
||||
for h in hallucination_segs:
|
||||
print(f" [{h['start']:.1f}-{h['end']:.1f}s] = {h['end']-h['start']:.0f}s "
|
||||
f"({len(h.get('words', []))} bes.) text={h.get('text', '')[:50]!r}", file=sys.stderr)
|
||||
print(f" 📊 Pravo pokritje: {coverage:.1f}s / {total_audio_duration:.1f}s "
|
||||
f"= {coverage_pct:.0f}%", file=sys.stderr)
|
||||
|
||||
print(f" ✅ Scribe: {len(words)} words → {len(segments)} segments, "
|
||||
f"lang={detected_lang_1} (p={detected_prob:.2f})", file=sys.stderr)
|
||||
|
||||
@ -333,6 +358,8 @@ def transcribe_with_elevenlabs(audio_path, lang=None, model="scribe_v1", filenam
|
||||
"language_probability": float(detected_prob),
|
||||
"segments": segments,
|
||||
"_provider": "elevenlabs",
|
||||
"_hallucination_count": len(hallucination_segs),
|
||||
"_coverage_pct": coverage_pct,
|
||||
}
|
||||
|
||||
|
||||
@ -348,7 +375,25 @@ def transcribe_full(audio_path, lang=None, model_size="small", provider="auto",
|
||||
"""
|
||||
if provider in ("elevenlabs", "auto") and os.environ.get("ELEVENLABS_API_KEY"):
|
||||
result = transcribe_with_elevenlabs(audio_path, lang=lang, filename_hint=filename_hint)
|
||||
|
||||
# Auto-retry če halucinacija zaznana (pokritje < 50% ali halucinacijski segmenti)
|
||||
if result and result.get("segments"):
|
||||
hall_count = result.get("_hallucination_count", 0)
|
||||
cov_pct = result.get("_coverage_pct", 100)
|
||||
if hall_count > 0 or cov_pct < 50:
|
||||
print(f" 🔄 Halucinacija/nizko pokritje ({cov_pct:.0f}%, "
|
||||
f"{hall_count} hallucination segs) — RETRY Scribe...", file=sys.stderr)
|
||||
# Drugi poskus z malo drugačnimi parametri
|
||||
result2 = transcribe_with_elevenlabs(audio_path, lang=lang, filename_hint=filename_hint)
|
||||
if result2 and result2.get("segments"):
|
||||
h2 = result2.get("_hallucination_count", 0)
|
||||
c2 = result2.get("_coverage_pct", 100)
|
||||
if h2 < hall_count or c2 > cov_pct:
|
||||
print(f" ✅ Retry boljši: pokritje {cov_pct:.0f}% → {c2:.0f}%, "
|
||||
f"halucinacije {hall_count} → {h2}", file=sys.stderr)
|
||||
result = result2
|
||||
else:
|
||||
print(f" ⚠️ Retry ni izboljšal, ohrani prvi rezultat", file=sys.stderr)
|
||||
return result
|
||||
if provider == "elevenlabs":
|
||||
print(f" ⚠️ Scribe failed, no fallback (provider=elevenlabs)", file=sys.stderr)
|
||||
|
||||
Loading…
Reference in New Issue
Block a user