Add Claude LLM analysis: send the full transcript to the Claude API so it can understand the song's structure (refrain detection across all repetitions, not just a local heuristic)

2026-04-29 06:55:41 +00:00 · 2026-04-29 06:55:41 +00:00 · a04811bdc9
commit a04811bdc9
parent e072eec362
1 changed files with 199 additions and 32 deletions
--- a/scripts/analyze.py
+++ b/scripts/analyze.py
@ -254,14 +254,17 @@ def find_chorus(transcript, energies, video_duration):


 def smart_clip_range(chorus, transcript, video_duration,
-                      target_duration=30, max_duration=45, min_duration=20):
+                      target_duration=30, max_duration=45, min_duration=20,
+                      include_prebuild=False):
    """Inteligentno določi clip range.

    Logika:
    1. Začni z refrenom kot core
-    2. Če je krajši od min_duration, razširi na obeh straneh
-    3. Če imamo prostor, dodaj pre-chorus pred refrenom
-    4. Cap na max_duration
+    2. Če je krajši od min_duration → razširi z drugim refrenom (ne kitico!)
+    3. Cap na max_duration
+
+    include_prebuild=False (default): NE doda kitice/verza pred refrenom.
+    include_prebuild=True: doda kratek pre-chorus (max 8s, gap < 3s).
    """
    if not chorus or not chorus.get("best"):
        # Fallback: vzemi sredino videa
@ -279,38 +282,56 @@ def smart_clip_range(chorus, transcript, video_duration,
    actual_start = best["start"]
    actual_end = best["end"]

-    # 1. Če je core refren prekratek, razširi
+    # Najdi VSE sekcije ki so podobne refrenu (verjetne ponovitve)
+    chorus_words = set(re.findall(r"\b\w+\b", best["text_preview"].lower()))
+    chorus_sections = []
+    for sec in sections:
+        sec_words = set(re.findall(r"\b\w+\b", sec["text"].lower()))
+        if chorus_words and len(sec_words & chorus_words) >= len(chorus_words) * 0.4:
+            chorus_sections.append(sec)
+
+    # 1. Če je core refren prekratek, razširi z naslednjim REFRENOM (ne kitico!)
    if actual_end - actual_start < min_duration:
-        # Najdi naslednjo sekcijo (verjetno se refren ponovi)
-        for sec in sections:
-            if sec["start"] > actual_end and sec["start"] - actual_end < 5:
-                # Sekcija blizu, dodaj jo
+        for sec in chorus_sections:
+            if sec["start"] > actual_end and sec["start"] - actual_end < 8:
                if sec["end"] - actual_start <= max_duration:
                    actual_end = sec["end"]
                    if actual_end - actual_start >= min_duration:
                        break

-    # 2. Dodaj pre-chorus pred refrenom (build-up)
-    pre_section = None
-    for sec in sections:
-        if sec["end"] <= actual_start and actual_start - sec["end"] < 8:
-            pre_section = sec  # zadnja pred refrenom
-    if pre_section:
-        candidate_start = pre_section["start"]
-        if actual_end - candidate_start <= max_duration:
-            actual_start = candidate_start
+    # 2. Pre-chorus build-up (samo če uporabnik to izrecno hoče)
+    if include_prebuild:
+        pre_section = None
+        for sec in sections:
+            # Pre-section mora biti BLIZU (gap < 3s) in NE preveč dolga (< 8s)
+            sec_duration = sec["end"] - sec["start"]
+            if (sec["end"] <= actual_start
+                and actual_start - sec["end"] < 3
+                and sec_duration < 8):
+                pre_section = sec
+        if pre_section:
+            candidate_start = pre_section["start"]
+            if actual_end - candidate_start <= max_duration:
+                actual_start = candidate_start

-    # 3. Če je res prekratek, razširi simetrično
+    # 3. Če je še prekratek, razširi simetrično znotraj refrenov (ne kitic)
    if actual_end - actual_start < min_duration:
        deficit = min_duration - (actual_end - actual_start)
-        actual_start = max(0, actual_start - deficit / 2)
-        actual_end = min(video_duration, actual_end + deficit / 2)
+        # Razširi konec če lahko
+        for sec in chorus_sections:
+            if sec["start"] > actual_end and sec["start"] - actual_end < 5:
+                actual_end = min(sec["end"], actual_end + deficit)
+                break
+        # Če še ni dovolj, manjše simetrično
+        if actual_end - actual_start < min_duration:
+            extra = (min_duration - (actual_end - actual_start)) / 2
+            actual_start = max(0, actual_start - extra)
+            actual_end = min(video_duration, actual_end + extra)

    # 4. Trim na max
    if actual_end - actual_start > max_duration:
        actual_end = actual_start + max_duration

-    # Snap to video bounds
    actual_start = max(0, actual_start)
    actual_end = min(video_duration, actual_end)

@ -318,7 +339,7 @@ def smart_clip_range(chorus, transcript, video_duration,
        "start": round(actual_start, 2),
        "end": round(actual_end, 2),
        "duration": round(actual_end - actual_start, 2),
-        "reason": "smart_chorus_with_prebuild",
+        "reason": "smart_chorus_with_prebuild" if include_prebuild else "smart_chorus_only",
        "chorus_start": round(best["start"], 2),
        "chorus_end": round(best["end"], 2),
    }
@ -351,6 +372,121 @@ def detect_audio_fade(clip_range, transcript):
    return {"fade_in": fade_in, "fade_out": fade_out}


+def analyze_with_claude(transcript, video_duration, target_duration=30):
+    """Ask the Claude API to pick the best reel excerpt from a song transcript.
+
+    Sends every segment of transcript["segments"] (start, end, text) in one
+    prompt and parses the JSON reply. Returns a dict with 'start', 'end',
+    'duration', 'reason', 'chorus_text', 'structure' and 'source', or None
+    when ANTHROPIC_API_KEY is unset, the transcript has no segments, the
+    request or JSON parsing fails, or the returned range is empty or falls
+    outside 0..video_duration. target_duration is only a hint in the prompt.
+    """
+    api_key = os.environ.get("ANTHROPIC_API_KEY")
+    if not api_key:
+        print("   ⚠️ ANTHROPIC_API_KEY ni nastavljen — preskakujem Claude analizo", file=sys.stderr)
+        return None
+
+    if not transcript.get("segments"):
+        return None
+
+    # Render each segment as "[start-end] text", one per line, for the prompt
+    lines = []
+    for seg in transcript["segments"]:
+        start = seg["start"]
+        end = seg["end"]
+        text = seg["text"].strip()
+        lines.append(f"[{start:6.1f}-{end:6.1f}] {text}")
+    transcript_text = "\n".join(lines)
+
+    prompt = f"""Tu je transcript pesmi (timestamp v sekundah, besedilo):
+
+{transcript_text}
+
+Cela pesem traja {video_duration:.1f}s. Cilj: izrezati ~{target_duration}s odsek za TikTok/Instagram Reel.
+
+PROSIM:
+1. Preberi celoten tekst in razumi strukturo (intro / verz / pre-chorus / refren / bridge / outro)
+2. Prepoznaj REFREN: del besedila, ki se ponavlja v pesmi (običajno 2-3x z istim ali zelo podobnim besedilom)
+3. Izberi najboljši odsek za reel:
+   - Vključi cel refren (cel verz besedila brez prekinitve)
+   - Če imaš prostor, dodaj pre-chorus build-up tik pred refrenom
+   - Lahko traja 20-45 sekund (ne strogo 30s)
+   - Začni in končaj na smiselni meji (konec stavka, ne sredi besede)
+4. Če pesem nima jasnega refrena (instrumental, monolog, govor), izberi najbolj dramatičen ali zaključen del
+
+Odgovori SAMO v JSON formatu (brez markdown, brez razlage):
+{{
+  "start": <sekunde>,
+  "end": <sekunde>,
+  "reason": "<kratka razlaga zakaj ta odsek>",
+  "chorus_text": "<besedilo refrena ali ključni del>",
+  "structure": "<1 stavek o strukturi pesmi>"
+}}"""
+
+    try:
+        import urllib.request
+        import urllib.error
+        body = json.dumps({
+            "model": "claude-haiku-4-5-20251001",
+            "max_tokens": 1024,
+            "messages": [{"role": "user", "content": prompt}],
+        }).encode("utf-8")
+
+        req = urllib.request.Request(
+            "https://api.anthropic.com/v1/messages",
+            data=body,
+            headers={
+                "Content-Type": "application/json",
+                "x-api-key": api_key,
+                "anthropic-version": "2023-06-01",
+            },
+            method="POST",
+        )
+        with urllib.request.urlopen(req, timeout=60) as resp:
+            data = json.loads(resp.read().decode("utf-8"))
+
+        content = data.get("content", [])
+        if not content:
+            print("   ⚠️ Claude vrnil prazen odgovor", file=sys.stderr)
+            return None
+        text = content[0].get("text", "").strip()  # only the first content block is read
+
+        # The reply is sometimes wrapped in a markdown code fence; strip it
+        if text.startswith("```"):
+            text = re.sub(r"^```(?:json)?\s*", "", text)
+            text = re.sub(r"\s*```$", "", text)
+        result = json.loads(text)
+
+        # Sanity check: reject an empty range or one outside the video
+        start = float(result["start"])
+        end = float(result["end"])
+        if start >= end or start < 0 or end > video_duration:
+            print(f"   ⚠️ Claude returned invalid range: {start}-{end}", file=sys.stderr)
+            return None
+
+        print(f"   🤖 Claude izbral: {start:.1f}-{end:.1f}s", file=sys.stderr)
+        print(f"      Razlog: {result.get('reason', '')[:80]}", file=sys.stderr)
+        print(f"      Struktura: {result.get('structure', '')[:80]}", file=sys.stderr)
+
+        return {
+            "start": round(start, 2),
+            "end": round(end, 2),
+            "duration": round(end - start, 2),
+            "reason": result.get("reason", ""),
+            "chorus_text": result.get("chorus_text", ""),
+            "structure": result.get("structure", ""),
+            "source": "claude_llm",
+        }
+    except urllib.error.HTTPError as e:
+        body = e.read().decode("utf-8", errors="replace")[:500]  # first 500 chars of the error body
+        print(f"   ❌ Claude API HTTP {e.code}: {body}", file=sys.stderr)
+        return None
+    except Exception as e:  # any other failure (network, bad JSON, missing keys) yields None
+        print(f"   ❌ Claude analysis failed: {e}", file=sys.stderr)
+        return None
+
+
 def is_instrumental(transcript, video_duration, threshold=0.1):
    """Detekcija ali je pesem instrumentalna.

@ -374,6 +510,10 @@ def main():
    ap.add_argument("--target-duration", type=float, default=30.0)
    ap.add_argument("--max-duration", type=float, default=45.0)
    ap.add_argument("--min-duration", type=float, default=20.0)
+    ap.add_argument("--include-prebuild", action="store_true",
+                    help="Vključi pre-chorus build-up (privzeto: ne)")
+    ap.add_argument("--no-claude", action="store_true",
+                    help="Preskoči Claude LLM analizo (uporabi samo lokalno heuristiko)")
    ap.add_argument("--json", action="store_true", help="Output JSON")
    ap.add_argument("--output", help="Path za JSON output")
    args = ap.parse_args()
@ -404,7 +544,15 @@ def main():
        instrumental = is_instrumental(transcript, duration)
        print(f"🎵 Instrumentalna: {instrumental}", file=sys.stderr)

-        # 5. Find chorus (samo če ni instrumental)
+        # 5a. PRIMARNO: Claude LLM analiza (razume cel tekst pesmi)
+        claude_result = None
+        if not instrumental and not args.no_claude:
+            print(f"🤖 Pošiljam transkript Claude-u za analizo strukture...", file=sys.stderr)
+            claude_result = analyze_with_claude(
+                transcript, duration, target_duration=args.target_duration
+            )
+
+        # 5b. Find chorus lokalno (kot fallback ali za score-jev preview)
        if not instrumental:
            chorus = find_chorus(transcript, energies, duration)
        else:
@ -434,15 +582,34 @@ def main():
                ),
            }

-        # 6. Smart clip range
-        clip_range = smart_clip_range(
-            chorus, transcript, duration,
-            target_duration=args.target_duration,
-            max_duration=args.max_duration,
-            min_duration=args.min_duration,
-        )
+        # 6. Clip range — Claude ima prednost, sicer smart_clip_range fallback
+        if claude_result:
+            clip_range = {
+                "start": claude_result["start"],
+                "end": claude_result["end"],
+                "duration": claude_result["duration"],
+                "reason": "claude_llm: " + claude_result.get("reason", ""),
+                "chorus_text": claude_result.get("chorus_text", ""),
+                "structure": claude_result.get("structure", ""),
+                "source": "claude",
+            }
+            # Apply max_duration cap če Claude pretirava
+            if clip_range["duration"] > args.max_duration:
+                clip_range["end"] = clip_range["start"] + args.max_duration
+                clip_range["duration"] = args.max_duration
+                clip_range["reason"] += " (capped at max_duration)"
+        else:
+            clip_range = smart_clip_range(
+                chorus, transcript, duration,
+                target_duration=args.target_duration,
+                max_duration=args.max_duration,
+                min_duration=args.min_duration,
+                include_prebuild=args.include_prebuild,
+            )
+            clip_range["source"] = "local_heuristic"
        print(f"✂  Clip range: {clip_range['start']:.1f}s - {clip_range['end']:.1f}s "
-              f"(duration: {clip_range['duration']}s)", file=sys.stderr)
+              f"(duration: {clip_range['duration']}s, source: {clip_range.get('source')})",
+              file=sys.stderr)

        # 7. Fade params
        fade = detect_audio_fade(clip_range, transcript)