From 4488717f6fa605864d0b7b87c4c8d860a07845a5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sebastjan=20Arti=C4=8D?= <sebastjan@folx.tv>
Date: Wed, 29 Apr 2026 11:17:16 +0000
Subject: [PATCH] Filler detection: trim clip before la-la-la / instrumental
 medbridge
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Problem: When a song has a chorus → la-la-la medbridge → chorus structure,
Claude included the whole 40s+ block, and the 18 seconds of la-la-la
made the reel feel artificially extended.

Fix:
1. Prompt enhancement: explicitly tell Claude NEVER to include
   la-la-la / ooh ooh / yeah yeah / instrumental fillers
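2. Post-LLM detection: scan corrected_segments (falling back to the raw
   transcript segments) for repetitive content: a segment of 4+ words,
   lying fully inside the clip, in which <30% of the words are unique
   (i.e. >70% repeated). The clip is trimmed to end at the start of the
   first such segment that still leaves at least min_duration of clip.
   Segments with fewer than 4 words (including textless instrumental
   passages) are not caught here and rely on the prompt rule above.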
3. Max duration guidance in the prompt reduced from 45s to 35s
   (the --max-duration cap in code is unchanged)

Result: the clip ends at the first chorus instead of extending through fillers.
---
 scripts/analyze.py | 37 +++++++++++++++++++++++++++++++++----
 1 file changed, 33 insertions(+), 4 deletions(-)

diff --git a/scripts/analyze.py b/scripts/analyze.py
index 2e77bd1..36e2200 100644
--- a/scripts/analyze.py
+++ b/scripts/analyze.py
@@ -629,10 +629,12 @@ PROSIM:
    - Ohrani timestamp-e nespremenjene
 3. Prepoznaj REFREN: del besedila ki se PONAVLJA
 4. Izberi najboljši odsek za reel:
-   - Vključi cel refren (brez prekinitve)
-   - Lahko dodaj pre-chorus build-up
-   - 20-45 sekund
-   - Začni in končaj na smiselni meji
+   - **PREDNOSTNO**: en cel refren + morda kratek pre-chorus (skupaj 20-35s)
+   - **NIKOLI ne vključi**: "la la la", "ooh ooh", "yeah yeah", instrumentalni medbridge (interludij)
+   - **NIKOLI ne podaljšaj** clip range zato da bi vključil 2 refrena povezana z la-la-la ali instrumentalom
+   - Če sta dva refrena ločena z medbridge-om/instrumentalom, izberi **SAMO PRVEGA**
+   - Začni in končaj na smiselni meji (konec stavka)
+   - Maksimalno 35 sekund (smartphone reel attention span)
 5. Če pesem nima jasnega refrena, izberi najbolj dramatičen ali zaključen del
 6. Če Whisper transkript je v večini halucinacija (manj kot 30% smiselnih besed), v "reason" napiši "WHISPER_HALLUCINATION_DETECTED" in vrni najmanj segmentov (samo tisti ki so smiselni)
 
@@ -1026,6 +1028,33 @@ def main():
                 clip_range["end"] = clip_range["start"] + args.max_duration
                 clip_range["duration"] = args.max_duration
                 clip_range["reason"] += " (capped at max_duration)"
+
+            # ── DETEKCIJA "filler" segmentov (la-la-la, ooh, instrumental fillers) ──
+            # Če clip vsebuje segment kjer je >70% besedila ponovljen token,
+            # skrajšaj clip tik pred tem segmentom (preprečimo nesmiselno podaljšanje)
+            corrected_segs = claude_result.get("corrected_segments") or transcript["segments"]
+            for seg in corrected_segs:
+                seg_start = float(seg.get("start", 0))
+                seg_end = float(seg.get("end", 0))
+                seg_text = str(seg.get("text", "")).lower().strip()
+                # Samo segmenti znotraj clip range
+                if seg_start < clip_range["start"] or seg_end > clip_range["end"]:
+                    continue
+                # Filler detection: ponavljajoče besede
+                words = seg_text.split()
+                if len(words) >= 4:
+                    unique_ratio = len(set(words)) / len(words)
+                    # Če je <30% unique besed = repetitive filler
+                    if unique_ratio < 0.3:
+                        # Skrajšaj clip do začetka tega segmenta
+                        if seg_start - clip_range["start"] >= args.min_duration:
+                            print(f"   ✂️  Filler detected at {seg_start:.1f}s "
+                                  f"('{seg_text[:40]}', unique={unique_ratio:.0%}), "
+                                  f"trimming clip", file=sys.stderr)
+                            clip_range["end"] = round(seg_start, 2)
+                            clip_range["duration"] = round(seg_start - clip_range["start"], 2)
+                            clip_range["reason"] += f" (trimmed at filler @ {seg_start:.1f}s)"
+                            break
         else:
             clip_range = smart_clip_range(
                 chorus, transcript, duration,