From 81bae81401d84005fee0ce4f5c9602917d698ab6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sebastjan=20Arti=C4=8D?= Date: Wed, 29 Apr 2026 13:04:19 +0000 Subject: [PATCH] Fix Scribe stopping mid-song: enable tag_audio_events=true + filter events out MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ROOT CAUSE FOUND: tag_audio_events=false caused Scribe to stop transcribing when instrumental music dominates (e.g. a polka harmonica taking over from the vocals). Real-world test on Avseniki - Ena bolha za pomoč (186s polka): - tag_audio_events=false: 20% coverage (37s only) — fails - tag_audio_events=true: 100% coverage (186s full) — works When tag_audio_events=true, Scribe inserts placeholder markers like '(glasba)' / '(plesalna glasba)' for instrumental sections instead of giving up. We then filter these out so they don't appear in subtitles. Filtering logic: - Skip entries whose type is not 'word' (e.g. audio_event) - Skip fully parenthesized text such as '(music)' or '(applause)' (legacy fallback) This is the core fix — transcription completeness no longer depends on the filename. Even files with uninformative names like '12345.mp4' now get full coverage. --- scripts/analyze.py | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/scripts/analyze.py b/scripts/analyze.py index f5fa742..88ddc2c 100644 --- a/scripts/analyze.py +++ b/scripts/analyze.py @@ -206,7 +206,11 @@ def transcribe_with_elevenlabs(audio_path, lang=None, model="scribe_v1", filenam add_text("model_id", model) add_text("timestamps_granularity", "word") - add_text("tag_audio_events", "false") + # tag_audio_events=true je kritično: brez tega Scribe predčasno preneha s transkripcijo + # ko zazna instrumentalni del (npr. polka harmonika prevzame). Z true vstavi oznake + # kot "(glasba)" in nadaljuje transkripcijo do konca audia. + # Te oznake potem post-processing odstrani iz besedila. 
+ add_text("tag_audio_events", "true") if lang: scribe_lang = LANG_1_TO_3.get(lang, lang) add_text("language_code", scribe_lang) @@ -252,8 +256,22 @@ def transcribe_with_elevenlabs(audio_path, lang=None, model="scribe_v1", filenam segments = [] if words: - # Filter out whitespace tokens - real_words = [w for w in words if w.get("text", "").strip()] + # Filter out: + # 1. whitespace tokens + # 2. audio event tags type='audio_event' or text in (parenthesis) like "(glasba)", "(music)" + real_words = [] + for w in words: + t = w.get("text", "").strip() + wtype = w.get("type", "word") + # Skip non-word events + if wtype != "word": + continue + if not t: + continue + # Skip parenthesized audio events (legacy fallback) + if t.startswith("(") and t.endswith(")"): + continue + real_words.append(w) if real_words: current_seg_words = []