Fix Scribe stopping mid-song: enable tag_audio_events=true + filter events out

ROOT CAUSE FOUND: tag_audio_events=false caused Scribe to stop transcribing when instrumental music dominates (e.g. a polka harmonica taking over from the vocals).

Real-world test on Avseniki - Ena bolha za pomoč (186s polka):
- tag_audio_events=false: 20% coverage (37s only) — fails
- tag_audio_events=true: 100% coverage (186s full) — works

When tag_audio_events=true, Scribe inserts placeholder markers like '(glasba)' / '(plesalna glasba)' for instrumental sections instead of giving up. We then filter these markers out so they don't appear in the subtitles.

Filtering logic:
- Skip entries where word.type != 'word' (audio_event types)
- Skip parenthesized text such as '(music)', '(applause)' (legacy fallback)

This is the core fix — transcription completeness no longer relies on the filename. Even untitled files like '12345.mp4' now get full coverage.
2026-04-29 13:04:19 +00:00 · 2026-04-29 13:04:19 +00:00 · 81bae81401
commit 81bae81401
parent 7d00730051
1 changed file with 21 additions and 3 deletions
--- a/scripts/analyze.py
+++ b/scripts/analyze.py
@ -206,7 +206,11 @@ def transcribe_with_elevenlabs(audio_path, lang=None, model="scribe_v1", filenam

    add_text("model_id", model)
    add_text("timestamps_granularity", "word")
-    add_text("tag_audio_events", "false")
+    # tag_audio_events=true je kritično: brez tega Scribe predčasno preneha s transkripcijo
+    # ko zazna instrumentalni del (npr. polka harmonika prevzame). Z true vstavi oznake
+    # kot "(glasba)" in nadaljuje transkripcijo do konca audia.
+    # Te oznake potem post-processing odstrani iz besedila.
+    add_text("tag_audio_events", "true")
    if lang:
        scribe_lang = LANG_1_TO_3.get(lang, lang)
        add_text("language_code", scribe_lang)
@ -252,8 +256,22 @@ def transcribe_with_elevenlabs(audio_path, lang=None, model="scribe_v1", filenam
    segments = []

    if words:
-        # Filter out whitespace tokens
-        real_words = [w for w in words if w.get("text", "").strip()]
+        # Filter out:
+        # 1. whitespace tokens
+        # 2. audio event tags type='audio_event' or text in (parenthesis) like "(glasba)", "(music)"
+        real_words = []
+        for w in words:
+            t = w.get("text", "").strip()
+            wtype = w.get("type", "word")
+            # Skip non-word events
+            if wtype != "word":
+                continue
+            if not t:
+                continue
+            # Skip parenthesized audio events (legacy fallback)
+            if t.startswith("(") and t.endswith(")"):
+                continue
+            real_words.append(w)
        
        if real_words:
            current_seg_words = []