From e06c3efb8ebbaa6070f283eedfa40f28bb5ffa06 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sebastjan=20Arti=C4=8D?= <sebastjan@folx.tv>
Date: Wed, 29 Apr 2026 15:23:37 +0000
Subject: [PATCH] Add audio amplitude defense (Layer 3) for first-word cut
 prevention
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The Žena problem persists: even after word-level extension, in some cases
where Scribe doesn't transcribe the very first word, the clip still cuts
off the vocal start.

Layer 3 defense: after word-level start extension, probe the FIRST 150ms
of audio at clip start with ffmpeg volumedetect. If mean_volume > -35 dB
(threshold for vocal/music vs silence), extend clip start back 0.5s as a
safety buffer.

This catches cases where:
- Scribe missed the word entirely (no word-level timestamp to extend to)
- LLM picked a start that's already inside vocal energy
- Word-level extension didn't trigger because no nearby word matched

The check is fast (<100ms) and conservative (only triggers if audio is
clearly NOT silent). In a true musical break (silence before chorus),
mean_volume is typically < -40 dB, i.e. below the -35 dB threshold, so
the extension is skipped.

Three layers of defense now:
1. Claude prompt: 'start ~0.3s before first chorus word'
2. Word-level boundary detection (Scribe word timestamps)
3. Audio amplitude check (catches cases that layers 1-2 missed)
---
 scripts/analyze.py | 45 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)

diff --git a/scripts/analyze.py b/scripts/analyze.py
index 1d24c13..cb6621d 100644
--- a/scripts/analyze.py
+++ b/scripts/analyze.py
@@ -1428,6 +1428,51 @@ def main():
                 clip_range["duration"] = round(clip_range["end"] - current_start, 2)
                 clip_range["reason"] += f" (start extended back)"
             
+            # ── SLOJ 3: AUDIO AMPLITUDE CHECK na samem začetku clipa ──
+            # Tudi po word-level extension lahko clip začne sredi vokala (npr. če
+            # Scribe ni zaznal besede). Kot zadnja obramba: preveri RMS audio
+            # amplitudo v prvih 150ms clipa. Če je > silence threshold = vokal je
+            # že tam, dodaj 0.5s buffer nazaj.
+            try:
+                import subprocess as _sp
+                # ffmpeg lahko prebere kratek segment in vrne RMS volume
+                probe_start = clip_range["start"]
+                probe_dur = 0.15  # prvih 150ms
+                if probe_start >= 0.5:  # samo če imamo prostor za buffer
+                    cmd_probe = [
+                        "ffmpeg", "-hide_banner", "-loglevel", "error",
+                        "-ss", str(probe_start), "-t", str(probe_dur),
+                        "-i", str(args.video),
+                        "-af", "volumedetect",
+                        "-f", "null", "-"
+                    ]
+                    pr = _sp.run(cmd_probe, capture_output=True, text=True, timeout=10)
+                    output = pr.stderr or ""
+                    # Iščemo "mean_volume: -XX.X dB"
+                    import re as _re_amp
+                    m = _re_amp.search(r'mean_volume:\s*(-?\d+\.?\d*)\s*dB', output)
+                    if m:
+                        mean_db = float(m.group(1))
+                        # Silence threshold: pod -40 dB = tihota
+                        # Vokal/glasba je običajno -30 do -10 dB
+                        if mean_db > -35:
+                            # Audio je že "glasen" na začetku clipa = vokal/glasba
+                            # Dodaj 0.5s buffer nazaj (varno, ne prepogosto)
+                            old_start = clip_range["start"]
+                            new_start = max(0, old_start - 0.5)
+                            if new_start < old_start:
+                                print(f"   🎵 Audio amplitude check: prvih {probe_dur}s "
+                                      f"ima mean_volume {mean_db:.1f} dB (> -35 dB = vokal/glasba). "
+                                      f"Razširim clip {old_start:.2f}s → {new_start:.2f}s.", file=sys.stderr)
+                                clip_range["start"] = round(new_start, 2)
+                                clip_range["duration"] = round(clip_range["end"] - new_start, 2)
+                                clip_range["reason"] += " (amplitude defense -0.5s)"
+                        else:
+                            print(f"   🎵 Audio amplitude check: prvih {probe_dur}s "
+                                  f"ima mean_volume {mean_db:.1f} dB (≤ -35 dB = tiho). OK.", file=sys.stderr)
+            except Exception as _e:
+                print(f"   ⚠️ Audio amplitude check skipped: {_e}", file=sys.stderr)
+            
             # Najdi vse segmente ki se začnejo PO trenutnem clip end
             # STROŽJA pravila: ne podaljšuj v naslednji refren / verz / instrumental.
             # Razširjamo SAMO če zadnji segment se prekriva s clip (klesti iz njega) ALI