From 4488717f6fa605864d0b7b87c4c8d860a07845a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sebastjan=20Arti=C4=8D?= Date: Wed, 29 Apr 2026 11:17:16 +0000 Subject: [PATCH] Filler detection: trim clip before la-la-la / instrumental medbridge MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Problem: When a song has chorus → la-la-la medbridge → chorus structure, Claude was including the whole 40s+ block, with 18 seconds of la-la-la making the reel feel artificially extended. Fix: 1. Prompt enhancement: explicitly tell Claude NEVER to include la-la-la / ooh ooh / yeah yeah / instrumental fillers 2. Post-LLM detection: scan corrected_segments for repetitive content (>70% repeated words) and trim clip before that segment 3. Max duration guidance reduced from 45s → 35s in prompt This means: clip will end at the first chorus, not extend through fillers. --- scripts/analyze.py | 37 +++++++++++++++++++++++++++++++++---- 1 file changed, 33 insertions(+), 4 deletions(-) diff --git a/scripts/analyze.py b/scripts/analyze.py index 2e77bd1..36e2200 100644 --- a/scripts/analyze.py +++ b/scripts/analyze.py @@ -629,10 +629,12 @@ PROSIM: - Ohrani timestamp-e nespremenjene 3. Prepoznaj REFREN: del besedila ki se PONAVLJA 4. Izberi najboljši odsek za reel: - - Vključi cel refren (brez prekinitve) - - Lahko dodaj pre-chorus build-up - - 20-45 sekund - - Začni in končaj na smiselni meji + - **PREDNOSTNO**: en cel refren + morda kratek pre-chorus (skupaj 20-35s) + - **NIKOLI ne vključi**: "la la la", "ooh ooh", "yeah yeah", instrumentalni medbridge (interludij) + - **NIKOLI ne podaljšaj** clip range zato da bi vključil 2 refrena povezana z la-la-la ali instrumentalom + - Če sta dva refrena ločena z medbridge-om/instrumentalom, izberi **SAMO PRVEGA** + - Začni in končaj na smiselni meji (konec stavka) + - Maksimalno 35 sekund (smartphone reel attention span) 5. Če pesem nima jasnega refrena, izberi najbolj dramatičen ali zaključen del 6. Če Whisper transkript je v večini halucinacija (manj kot 30% smiselnih besed), v "reason" napiši "WHISPER_HALLUCINATION_DETECTED" in vrni najmanj segmentov (samo tisti ki so smiselni) @@ -1026,6 +1028,33 @@ def main(): clip_range["end"] = clip_range["start"] + args.max_duration clip_range["duration"] = args.max_duration clip_range["reason"] += " (capped at max_duration)" + + # ── DETEKCIJA "filler" segmentov (la-la-la, ooh, instrumental fillers) ── + # Če clip vsebuje segment kjer je >70% besedila ponovljen token, + # skrajšaj clip tik pred tem segmentom (preprečimo nesmiselno podaljšanje) + corrected_segs = claude_result.get("corrected_segments") or transcript["segments"] + for seg in corrected_segs: + seg_start = float(seg.get("start", 0)) + seg_end = float(seg.get("end", 0)) + seg_text = str(seg.get("text", "")).lower().strip() + # Samo segmenti znotraj clip range + if seg_start < clip_range["start"] or seg_end > clip_range["end"]: + continue + # Filler detection: ponavljajoče besede + words = seg_text.split() + if len(words) >= 4: + unique_ratio = len(set(words)) / len(words) + # Če je <30% unique besed = repetitive filler + if unique_ratio < 0.3: + # Skrajšaj clip do začetka tega segmenta + if seg_start - clip_range["start"] >= args.min_duration: + print(f" ✂️ Filler detected at {seg_start:.1f}s " + f"('{seg_text[:40]}', unique={unique_ratio:.0%}), " + f"trimming clip", file=sys.stderr) + clip_range["end"] = round(seg_start, 2) + clip_range["duration"] = round(seg_start - clip_range["start"], 2) + clip_range["reason"] += f" (trimmed at filler @ {seg_start:.1f}s)" + break else: clip_range = smart_clip_range( chorus, transcript, duration,