diff --git a/scripts/analyze.py b/scripts/analyze.py index f5fa742..88ddc2c 100644 --- a/scripts/analyze.py +++ b/scripts/analyze.py @@ -206,7 +206,11 @@ def transcribe_with_elevenlabs(audio_path, lang=None, model="scribe_v1", filenam add_text("model_id", model) add_text("timestamps_granularity", "word") - add_text("tag_audio_events", "false") + # tag_audio_events=true is critical: without it Scribe stops transcribing prematurely + # when it detects an instrumental section (e.g. a polka accordion takes over). With true it inserts tags + # such as "(glasba)" and continues transcribing to the end of the audio. + # Post-processing then strips these tags from the text. + add_text("tag_audio_events", "true") if lang: scribe_lang = LANG_1_TO_3.get(lang, lang) add_text("language_code", scribe_lang) @@ -252,8 +256,22 @@ def transcribe_with_elevenlabs(audio_path, lang=None, model="scribe_v1", filenam segments = [] if words: - # Filter out whitespace tokens - real_words = [w for w in words if w.get("text", "").strip()] + # Filter out: + # 1. whitespace tokens + # 2. audio event tags type='audio_event' or text in (parenthesis) like "(glasba)", "(music)" + real_words = [] + for w in words: + t = w.get("text", "").strip() + wtype = w.get("type", "word") + # Skip non-word events + if wtype != "word": + continue + if not t: + continue + # Skip parenthesized audio events (legacy fallback) + if t.startswith("(") and t.endswith(")"): + continue + real_words.append(w) if real_words: current_seg_words = []