From a04811bdc9ed251823d3fc82225d755d22be84e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sebastjan=20Arti=C4=8D?= Date: Wed, 29 Apr 2026 06:55:41 +0000 Subject: [PATCH] Add Claude LLM analysis: sends full transcript to Claude API for true song structure understanding (refrain detection across all repetitions, not just local heuristic) --- scripts/analyze.py | 231 ++++++++++++++++++++++++++++++++++++++------- 1 file changed, 199 insertions(+), 32 deletions(-) diff --git a/scripts/analyze.py b/scripts/analyze.py index bf2ac63..d31026e 100644 --- a/scripts/analyze.py +++ b/scripts/analyze.py @@ -254,14 +254,17 @@ def find_chorus(transcript, energies, video_duration): def smart_clip_range(chorus, transcript, video_duration, - target_duration=30, max_duration=45, min_duration=20): + target_duration=30, max_duration=45, min_duration=20, + include_prebuild=False): """Inteligentno določi clip range. Logika: 1. Začni z refrenom kot core - 2. Če je krajši od min_duration, razširi na obeh straneh - 3. Če imamo prostor, dodaj pre-chorus pred refrenom - 4. Cap na max_duration + 2. Če je krajši od min_duration → razširi z drugim refrenom (ne kitico!) + 3. Cap na max_duration + + include_prebuild=False (default): NE doda kitice/verza pred refrenom. + include_prebuild=True: doda kratek pre-chorus (max 8s, gap < 3s). """ if not chorus or not chorus.get("best"): # Fallback: vzemi sredino videa @@ -279,38 +282,56 @@ def smart_clip_range(chorus, transcript, video_duration, actual_start = best["start"] actual_end = best["end"] - # 1. Če je core refren prekratek, razširi + # Najdi VSE sekcije ki so podobne refrenu (verjetne ponovitve) + chorus_words = set(re.findall(r"\b\w+\b", best["text_preview"].lower())) + chorus_sections = [] + for sec in sections: + sec_words = set(re.findall(r"\b\w+\b", sec["text"].lower())) + if chorus_words and len(sec_words & chorus_words) >= len(chorus_words) * 0.4: + chorus_sections.append(sec) + + # 1. 
Če je core refren prekratek, razširi z naslednjim REFRENOM (ne kitico!) if actual_end - actual_start < min_duration: - # Najdi naslednjo sekcijo (verjetno se refren ponovi) - for sec in sections: - if sec["start"] > actual_end and sec["start"] - actual_end < 5: - # Sekcija blizu, dodaj jo + for sec in chorus_sections: + if sec["start"] > actual_end and sec["start"] - actual_end < 8: if sec["end"] - actual_start <= max_duration: actual_end = sec["end"] if actual_end - actual_start >= min_duration: break - # 2. Dodaj pre-chorus pred refrenom (build-up) - pre_section = None - for sec in sections: - if sec["end"] <= actual_start and actual_start - sec["end"] < 8: - pre_section = sec # zadnja pred refrenom - if pre_section: - candidate_start = pre_section["start"] - if actual_end - candidate_start <= max_duration: - actual_start = candidate_start + # 2. Pre-chorus build-up (samo če uporabnik to izrecno hoče) + if include_prebuild: + pre_section = None + for sec in sections: + # Pre-section mora biti BLIZU (gap < 3s) in NE preveč dolga (< 8s) + sec_duration = sec["end"] - sec["start"] + if (sec["end"] <= actual_start + and actual_start - sec["end"] < 3 + and sec_duration < 8): + pre_section = sec + if pre_section: + candidate_start = pre_section["start"] + if actual_end - candidate_start <= max_duration: + actual_start = candidate_start - # 3. Če je res prekratek, razširi simetrično + # 3. 
def _format_transcript_lines(segments):
    """Render transcript segments as ``[start-end] text`` lines for the prompt.

    Segments without numeric ``start``/``end`` values are skipped instead of
    raising, so one malformed entry cannot abort the whole analysis.
    Returns an empty string when no segment is usable.
    """
    rendered = []
    for seg in segments:
        try:
            seg_start = float(seg["start"])
            seg_end = float(seg["end"])
        except (KeyError, TypeError, ValueError):
            continue
        seg_text = str(seg.get("text") or "").strip()
        rendered.append(f"[{seg_start:6.1f}-{seg_end:6.1f}] {seg_text}")
    return "\n".join(rendered)


def _build_reel_prompt(transcript_text, video_duration, target_duration,
                       min_duration, max_duration):
    """Build the (Slovenian) instruction prompt sent to the model."""
    return f"""Tu je transcript pesmi (timestamp v sekundah, besedilo):

{transcript_text}

Cela pesem traja {video_duration:.1f}s. Cilj: izrezati ~{target_duration}s odsek za TikTok/Instagram Reel.

PROSIM:
1. Preberi celoten tekst in razumi strukturo (intro / verz / pre-chorus / refren / bridge / outro)
2. Prepoznaj REFREN: del besedila, ki se ponavlja v pesmi (običajno 2-3x z istim ali zelo podobnim besedilom)
3. Izberi najboljši odsek za reel:
   - Vključi cel refren (cel verz besedila brez prekinitve)
   - Če imaš prostor, dodaj pre-chorus build-up tik pred refrenom
   - Lahko traja {min_duration:g}-{max_duration:g} sekund (ne strogo {target_duration:g}s)
   - Začni in končaj na smiselni meji (konec stavka, ne sredi besede)
4. Če pesem nima jasnega refrena (instrumental, monolog, govor), izberi najbolj dramatičen ali zaključen del

Odgovori SAMO v JSON formatu (brez markdown, brez razlage):
{{
  "start": <začetek odseka v sekundah, število>,
  "end": <konec odseka v sekundah, število>,
  "reason": "<kratek razlog za izbiro>",
  "chorus_text": "<besedilo refrena>",
  "structure": "<1 stavek o strukturi pesmi>"
}}"""


def _first_text_block(content):
    """Return the stripped text of the first non-empty text block, or ''."""
    if not isinstance(content, list):
        return ""
    for block in content:
        if not isinstance(block, dict):
            continue
        # Blocks without an explicit type are treated as text.
        if block.get("type", "text") != "text":
            continue
        text = str(block.get("text") or "").strip()
        if text:
            return text
    return ""


def _extract_json_object(reply_text):
    """Parse the JSON object embedded in *reply_text*.

    Taking the span from the first ``{`` to the last ``}`` tolerates both
    markdown code fences and a short preamble/epilogue around the JSON.
    Raises ValueError when no object is present or it does not parse.
    """
    first = reply_text.find("{")
    last = reply_text.rfind("}")
    if first == -1 or last < first:
        raise ValueError("no JSON object in Claude reply")
    return json.loads(reply_text[first:last + 1])


def _as_text(value):
    """Coerce an optional JSON value to str (``None`` becomes '')."""
    return "" if value is None else str(value)


def analyze_with_claude(transcript, video_duration, target_duration=30, *,
                        min_duration=20, max_duration=45,
                        model="claude-haiku-4-5-20251001", timeout=60,
                        urlopen=None):
    """Ask the Claude API to pick the best reel excerpt from a song transcript.

    The whole timestamped transcript is sent in one prompt so the model can
    spot repeated text (the chorus) and choose a clip around it.

    Args:
        transcript: dict with a ``"segments"`` list; each segment is a dict
            with ``"start"``, ``"end"`` (seconds) and ``"text"``.
        video_duration: total length of the media in seconds.
        target_duration: desired clip length in seconds (prompt hint only).
        min_duration, max_duration: acceptable clip length range quoted in
            the prompt; the returned range is NOT forced into it here.
        model: model identifier placed in the request body.
        timeout: network timeout in seconds.
        urlopen: optional callable ``(request, timeout=...)`` returning a
            context manager with ``read()``; defaults to
            ``urllib.request.urlopen``. Lets callers supply a custom opener.

    Returns:
        dict with ``start``, ``end``, ``duration``, ``reason``,
        ``chorus_text``, ``structure`` and ``source == "claude_llm"``, or
        ``None`` when the API key is missing, the transcript has no usable
        segments, the request fails, or the reply is unusable.
    """
    # Imported before the try block so the except clause below can always
    # resolve urllib.error.HTTPError.
    import urllib.error
    import urllib.request

    api_key = os.environ.get("ANTHROPIC_API_KEY")
    if not api_key:
        print(" ⚠️ ANTHROPIC_API_KEY ni nastavljen — preskakujem Claude analizo", file=sys.stderr)
        return None

    segments = (transcript or {}).get("segments")
    if not segments:
        return None
    transcript_text = _format_transcript_lines(segments)
    if not transcript_text:
        return None

    prompt = _build_reel_prompt(
        transcript_text, video_duration, target_duration,
        min_duration, max_duration,
    )
    send = urlopen if urlopen is not None else urllib.request.urlopen

    # Timestamps in the reply are usually copied from segment boundaries,
    # which may overshoot the probed media duration by a fraction of a
    # second; such ends are clamped instead of rejecting the whole answer.
    end_tolerance = 0.5

    try:
        payload = json.dumps({
            "model": model,
            "max_tokens": 1024,
            "messages": [{"role": "user", "content": prompt}],
        }).encode("utf-8")
        request = urllib.request.Request(
            "https://api.anthropic.com/v1/messages",
            data=payload,
            headers={
                "Content-Type": "application/json",
                "x-api-key": api_key,
                "anthropic-version": "2023-06-01",
            },
            method="POST",
        )
        with send(request, timeout=timeout) as resp:
            data = json.loads(resp.read().decode("utf-8"))

        reply_text = _first_text_block(data.get("content"))
        if not reply_text:
            print(" ⚠️ Claude vrnil prazen odgovor", file=sys.stderr)
            return None

        result = _extract_json_object(reply_text)
        if not isinstance(result, dict):
            raise ValueError("Claude reply is not a JSON object")

        start = float(result["start"])
        end = float(result["end"])
        if video_duration < end <= video_duration + end_tolerance:
            end = float(video_duration)
        # Written as one positive chain so NaN (which json.loads accepts)
        # fails the check: every comparison involving NaN is False.
        if not (0 <= start < end <= video_duration):
            print(f" ⚠️ Claude returned invalid range: {start}-{end}", file=sys.stderr)
            return None

        reason = _as_text(result.get("reason"))
        chorus_text = _as_text(result.get("chorus_text"))
        structure = _as_text(result.get("structure"))

        print(f" 🤖 Claude izbral: {start:.1f}-{end:.1f}s", file=sys.stderr)
        print(f" Razlog: {reason[:80]}", file=sys.stderr)
        print(f" Struktura: {structure[:80]}", file=sys.stderr)

        return {
            "start": round(start, 2),
            "end": round(end, 2),
            "duration": round(end - start, 2),
            "reason": reason,
            "chorus_text": chorus_text,
            "structure": structure,
            "source": "claude_llm",
        }
    except urllib.error.HTTPError as e:
        error_body = e.read().decode("utf-8", errors="replace")[:500]
        print(f" ❌ Claude API HTTP {e.code}: {error_body}", file=sys.stderr)
        return None
    except Exception as e:
        # Deliberate best-effort boundary: the caller falls back to the
        # local heuristic whenever this returns None.
        print(f" ❌ Claude analysis failed: {e}", file=sys.stderr)
        return None
Clip range — Claude ima prednost, sicer smart_clip_range fallback + if claude_result: + clip_range = { + "start": claude_result["start"], + "end": claude_result["end"], + "duration": claude_result["duration"], + "reason": "claude_llm: " + claude_result.get("reason", ""), + "chorus_text": claude_result.get("chorus_text", ""), + "structure": claude_result.get("structure", ""), + "source": "claude", + } + # Apply max_duration cap če Claude pretirava + if clip_range["duration"] > args.max_duration: + clip_range["end"] = clip_range["start"] + args.max_duration + clip_range["duration"] = args.max_duration + clip_range["reason"] += " (capped at max_duration)" + else: + clip_range = smart_clip_range( + chorus, transcript, duration, + target_duration=args.target_duration, + max_duration=args.max_duration, + min_duration=args.min_duration, + include_prebuild=args.include_prebuild, + ) + clip_range["source"] = "local_heuristic" print(f"✂ Clip range: {clip_range['start']:.1f}s - {clip_range['end']:.1f}s " - f"(duration: {clip_range['duration']}s)", file=sys.stderr) + f"(duration: {clip_range['duration']}s, source: {clip_range.get('source')})", + file=sys.stderr) # 7. Fade params fade = detect_audio_fade(clip_range, transcript)