From 81bae81401d84005fee0ce4f5c9602917d698ab6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sebastjan=20Arti=C4=8D?= <sebastjan@folx.tv>
Date: Wed, 29 Apr 2026 13:04:19 +0000
Subject: [PATCH] Fix Scribe stopping mid-song: enable tag_audio_events=true +
 filter events out
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Root cause: with tag_audio_events=false, Scribe stops transcribing once
instrumental music dominates (e.g. a polka harmonica taking over from vocals).

Real-world test on Avseniki - Ena bolha za pomoč (186s polka):
- tag_audio_events=false: 20% coverage (37s only) — fails
- tag_audio_events=true:  100% coverage (186s full) — works

When tag_audio_events=true, Scribe inserts placeholder markers like
'(glasba)' / '(plesalna glasba)' for instrumental sections instead of
giving up. We then filter these out so they don't appear in subtitles.

Filtering logic:
- Skip entries whose type is not 'word' (e.g. audio_event entries)
- Legacy fallback: skip fully parenthesized text like '(music)', '(applause)'

This is the core fix: transcription completeness no longer depends on the
filename. Even files with non-descriptive names like '12345.mp4' now get full
coverage.
---
 scripts/analyze.py | 24 +++++++++++++++++++++---
 1 file changed, 21 insertions(+), 3 deletions(-)

diff --git a/scripts/analyze.py b/scripts/analyze.py
index f5fa742..88ddc2c 100644
--- a/scripts/analyze.py
+++ b/scripts/analyze.py
@@ -206,7 +206,11 @@ def transcribe_with_elevenlabs(audio_path, lang=None, model="scribe_v1", filenam
 
     add_text("model_id", model)
     add_text("timestamps_granularity", "word")
-    add_text("tag_audio_events", "false")
+    # tag_audio_events=true je kritično: brez tega Scribe predčasno preneha s transkripcijo
+    # ko zazna instrumentalni del (npr. polka harmonika prevzame). Z true vstavi oznake
+    # kot "(glasba)" in nadaljuje transkripcijo do konca audia.
+    # Te oznake potem post-processing odstrani iz besedila.
+    add_text("tag_audio_events", "true")
     if lang:
         scribe_lang = LANG_1_TO_3.get(lang, lang)
         add_text("language_code", scribe_lang)
@@ -252,8 +256,22 @@ def transcribe_with_elevenlabs(audio_path, lang=None, model="scribe_v1", filenam
     segments = []
 
     if words:
-        # Filter out whitespace tokens
-        real_words = [w for w in words if w.get("text", "").strip()]
+        # Filter out:
+        # 1. whitespace tokens
+        # 2. non-word tokens (e.g. type='audio_event') and text wrapped in parentheses like "(glasba)", "(music)"
+        real_words = []
+        for w in words:
+            t = w.get("text", "").strip()
+            wtype = w.get("type", "word")
+            # Skip non-word events
+            if wtype != "word":
+                continue
+            if not t:
+                continue
+            # Skip parenthesized audio events (legacy fallback)
+            if t.startswith("(") and t.endswith(")"):
+                continue
+            real_words.append(w)
         
         if real_words:
             current_seg_words = []