Add Claude LLM analysis: sends full transcript to Claude API for true song structure understanding (refrain detection across all repetitions, not just local heuristic)
This commit is contained in:
parent
e072eec362
commit
a04811bdc9
@ -254,14 +254,17 @@ def find_chorus(transcript, energies, video_duration):
|
||||
|
||||
|
||||
def smart_clip_range(chorus, transcript, video_duration,
|
||||
target_duration=30, max_duration=45, min_duration=20):
|
||||
target_duration=30, max_duration=45, min_duration=20,
|
||||
include_prebuild=False):
|
||||
"""Inteligentno določi clip range.
|
||||
|
||||
Logika:
|
||||
1. Začni z refrenom kot core
|
||||
2. Če je krajši od min_duration, razširi na obeh straneh
|
||||
3. Če imamo prostor, dodaj pre-chorus pred refrenom
|
||||
4. Cap na max_duration
|
||||
2. Če je krajši od min_duration → razširi z drugim refrenom (ne kitico!)
|
||||
3. Cap na max_duration
|
||||
|
||||
include_prebuild=False (default): NE doda kitice/verza pred refrenom.
|
||||
include_prebuild=True: doda kratek pre-chorus (max 8s, gap < 3s).
|
||||
"""
|
||||
if not chorus or not chorus.get("best"):
|
||||
# Fallback: vzemi sredino videa
|
||||
@ -279,38 +282,56 @@ def smart_clip_range(chorus, transcript, video_duration,
|
||||
actual_start = best["start"]
|
||||
actual_end = best["end"]
|
||||
|
||||
# 1. Če je core refren prekratek, razširi
|
||||
# Najdi VSE sekcije ki so podobne refrenu (verjetne ponovitve)
|
||||
chorus_words = set(re.findall(r"\b\w+\b", best["text_preview"].lower()))
|
||||
chorus_sections = []
|
||||
for sec in sections:
|
||||
sec_words = set(re.findall(r"\b\w+\b", sec["text"].lower()))
|
||||
if chorus_words and len(sec_words & chorus_words) >= len(chorus_words) * 0.4:
|
||||
chorus_sections.append(sec)
|
||||
|
||||
# 1. Če je core refren prekratek, razširi z naslednjim REFRENOM (ne kitico!)
|
||||
if actual_end - actual_start < min_duration:
|
||||
# Najdi naslednjo sekcijo (verjetno se refren ponovi)
|
||||
for sec in sections:
|
||||
if sec["start"] > actual_end and sec["start"] - actual_end < 5:
|
||||
# Sekcija blizu, dodaj jo
|
||||
for sec in chorus_sections:
|
||||
if sec["start"] > actual_end and sec["start"] - actual_end < 8:
|
||||
if sec["end"] - actual_start <= max_duration:
|
||||
actual_end = sec["end"]
|
||||
if actual_end - actual_start >= min_duration:
|
||||
break
|
||||
|
||||
# 2. Dodaj pre-chorus pred refrenom (build-up)
|
||||
pre_section = None
|
||||
for sec in sections:
|
||||
if sec["end"] <= actual_start and actual_start - sec["end"] < 8:
|
||||
pre_section = sec # zadnja pred refrenom
|
||||
if pre_section:
|
||||
candidate_start = pre_section["start"]
|
||||
if actual_end - candidate_start <= max_duration:
|
||||
actual_start = candidate_start
|
||||
# 2. Pre-chorus build-up (samo če uporabnik to izrecno hoče)
|
||||
if include_prebuild:
|
||||
pre_section = None
|
||||
for sec in sections:
|
||||
# Pre-section mora biti BLIZU (gap < 3s) in NE preveč dolga (< 8s)
|
||||
sec_duration = sec["end"] - sec["start"]
|
||||
if (sec["end"] <= actual_start
|
||||
and actual_start - sec["end"] < 3
|
||||
and sec_duration < 8):
|
||||
pre_section = sec
|
||||
if pre_section:
|
||||
candidate_start = pre_section["start"]
|
||||
if actual_end - candidate_start <= max_duration:
|
||||
actual_start = candidate_start
|
||||
|
||||
# 3. Če je res prekratek, razširi simetrično
|
||||
# 3. Če je še prekratek, razširi simetrično znotraj refrenov (ne kitic)
|
||||
if actual_end - actual_start < min_duration:
|
||||
deficit = min_duration - (actual_end - actual_start)
|
||||
actual_start = max(0, actual_start - deficit / 2)
|
||||
actual_end = min(video_duration, actual_end + deficit / 2)
|
||||
# Razširi konec če lahko
|
||||
for sec in chorus_sections:
|
||||
if sec["start"] > actual_end and sec["start"] - actual_end < 5:
|
||||
actual_end = min(sec["end"], actual_end + deficit)
|
||||
break
|
||||
# Če še ni dovolj, manjše simetrično
|
||||
if actual_end - actual_start < min_duration:
|
||||
extra = (min_duration - (actual_end - actual_start)) / 2
|
||||
actual_start = max(0, actual_start - extra)
|
||||
actual_end = min(video_duration, actual_end + extra)
|
||||
|
||||
# 4. Trim na max
|
||||
if actual_end - actual_start > max_duration:
|
||||
actual_end = actual_start + max_duration
|
||||
|
||||
# Snap to video bounds
|
||||
actual_start = max(0, actual_start)
|
||||
actual_end = min(video_duration, actual_end)
|
||||
|
||||
@ -318,7 +339,7 @@ def smart_clip_range(chorus, transcript, video_duration,
|
||||
"start": round(actual_start, 2),
|
||||
"end": round(actual_end, 2),
|
||||
"duration": round(actual_end - actual_start, 2),
|
||||
"reason": "smart_chorus_with_prebuild",
|
||||
"reason": "smart_chorus_with_prebuild" if include_prebuild else "smart_chorus_only",
|
||||
"chorus_start": round(best["start"], 2),
|
||||
"chorus_end": round(best["end"], 2),
|
||||
}
|
||||
@ -351,6 +372,121 @@ def detect_audio_fade(clip_range, transcript):
|
||||
return {"fade_in": fade_in, "fade_out": fade_out}
|
||||
|
||||
|
||||
def _claude_reply_text(content):
    """Join the text of every text block in a Messages API ``content`` list."""
    return "".join(
        block.get("text") or ""
        for block in content
        if isinstance(block, dict) and block.get("type", "text") == "text"
    ).strip()


def _parse_claude_json(text):
    """Decode the JSON object contained in Claude's reply text.

    Accepts bare JSON, JSON wrapped in a markdown code fence, and JSON
    surrounded by stray prose. Raises ValueError when no JSON object can be
    recovered.
    """
    text = text.strip()
    # Claude sometimes wraps the JSON in a markdown fence despite the prompt.
    if text.startswith("```"):
        text = re.sub(r"^```(?:json)?\s*", "", text)
        text = re.sub(r"\s*```$", "", text)
    try:
        parsed = json.loads(text)
    except ValueError:
        # Fall back to the outermost {...} span when prose surrounds the JSON.
        first, last = text.find("{"), text.rfind("}")
        if first == -1 or last <= first:
            raise
        parsed = json.loads(text[first:last + 1])
    if not isinstance(parsed, dict):
        raise ValueError("Claude reply is not a JSON object")
    return parsed


def analyze_with_claude(transcript, video_duration, target_duration=30):
    """Ask the Claude API to pick the best reel segment from a song transcript.

    The whole transcript is sent as one timestamped line per segment, so the
    model can spot text that repeats across the song (the chorus) and reason
    about the overall structure (intro, verse, chorus, bridge, outro).

    Args:
        transcript: dict with a "segments" list; each segment is read for its
            "start", "end" and "text" keys.
        video_duration: total length of the song in seconds.
        target_duration: approximate desired clip length in seconds; it is
            only mentioned in the prompt, not enforced here.

    Returns:
        A dict with "start", "end", "duration", "reason", "chorus_text",
        "structure" and "source" ("claude_llm"), or None when the API key is
        missing, the transcript has no segments, the request fails, or the
        reply cannot be parsed into a valid range. This function is
        best-effort: every failure is reported on stderr and mapped to None so
        the caller can fall back to another strategy.
    """
    # Imported before the try block so the except clause below can always
    # resolve urllib.error.HTTPError.
    import urllib.error
    import urllib.request

    api_key = os.environ.get("ANTHROPIC_API_KEY")
    if not api_key:
        print("   ⚠️  ANTHROPIC_API_KEY ni nastavljen — preskakujem Claude analizo", file=sys.stderr)
        return None

    if not transcript.get("segments"):
        return None

    # Text format for Claude: one line per segment, prefixed with timestamps.
    lines = []
    for seg in transcript["segments"]:
        start = seg["start"]
        end = seg["end"]
        text = seg["text"].strip()
        lines.append(f"[{start:6.1f}-{end:6.1f}] {text}")
    transcript_text = "\n".join(lines)

    prompt = f"""Tu je transcript pesmi (timestamp v sekundah, besedilo):

{transcript_text}

Cela pesem traja {video_duration:.1f}s. Cilj: izrezati ~{target_duration}s odsek za TikTok/Instagram Reel.

PROSIM:
1. Preberi celoten tekst in razumi strukturo (intro / verz / pre-chorus / refren / bridge / outro)
2. Prepoznaj REFREN: del besedila, ki se ponavlja v pesmi (običajno 2-3x z istim ali zelo podobnim besedilom)
3. Izberi najboljši odsek za reel:
- Vključi cel refren (cel verz besedila brez prekinitve)
- Če imaš prostor, dodaj pre-chorus build-up tik pred refrenom
- Lahko traja 20-45 sekund (ne strogo 30s)
- Začni in končaj na smiselni meji (konec stavka, ne sredi besede)
4. Če pesem nima jasnega refrena (instrumental, monolog, govor), izberi najbolj dramatičen ali zaključen del

Odgovori SAMO v JSON formatu (brez markdown, brez razlage):
{{
"start": <sekunde>,
"end": <sekunde>,
"reason": "<kratka razlaga zakaj ta odsek>",
"chorus_text": "<besedilo refrena ali ključni del>",
"structure": "<1 stavek o strukturi pesmi>"
}}"""

    try:
        payload = json.dumps({
            "model": "claude-haiku-4-5-20251001",
            "max_tokens": 1024,
            "messages": [{"role": "user", "content": prompt}],
        }).encode("utf-8")

        req = urllib.request.Request(
            "https://api.anthropic.com/v1/messages",
            data=payload,
            headers={
                "Content-Type": "application/json",
                "x-api-key": api_key,
                "anthropic-version": "2023-06-01",
            },
            method="POST",
        )
        with urllib.request.urlopen(req, timeout=60) as resp:
            data = json.loads(resp.read().decode("utf-8"))

        content = data.get("content", [])
        reply = _claude_reply_text(content) if content else ""
        if not reply:
            print("   ⚠️  Claude vrnil prazen odgovor", file=sys.stderr)
            return None

        result = _parse_claude_json(reply)

        start = float(result["start"])
        end = float(result["end"])
        # The prompt shows all times rounded to 0.1s, so an answer that ends
        # on the last segment may overshoot the exact duration by a rounding
        # error; clamp that instead of rejecting a correct answer.
        if video_duration < end <= video_duration + 0.1:
            end = float(video_duration)
        # A positive chained comparison is False for NaN on either bound,
        # unlike the negated form, which would let NaN through.
        if not (0 <= start < end <= video_duration):
            print(f"   ⚠️  Claude returned invalid range: {start}-{end}", file=sys.stderr)
            return None

        # Coerce the free-text fields: a null or non-string value must not
        # discard an otherwise valid range.
        reason = str(result.get("reason") or "")
        chorus_text = str(result.get("chorus_text") or "")
        structure = str(result.get("structure") or "")

        print(f"   🤖 Claude izbral: {start:.1f}-{end:.1f}s", file=sys.stderr)
        print(f"      Razlog: {reason[:80]}", file=sys.stderr)
        print(f"      Struktura: {structure[:80]}", file=sys.stderr)

        return {
            "start": round(start, 2),
            "end": round(end, 2),
            "duration": round(end - start, 2),
            "reason": reason,
            "chorus_text": chorus_text,
            "structure": structure,
            "source": "claude_llm",
        }
    except urllib.error.HTTPError as e:
        # Show a bounded slice of the error payload; it usually names the cause.
        detail = e.read().decode("utf-8", errors="replace")[:500]
        print(f"   ❌ Claude API HTTP {e.code}: {detail}", file=sys.stderr)
        return None
    except Exception as e:
        # Top-level boundary: network errors, timeouts and malformed replies
        # all degrade to "no Claude result" rather than aborting the run.
        print(f"   ❌ Claude analysis failed: {e}", file=sys.stderr)
        return None
|
||||
|
||||
|
||||
def is_instrumental(transcript, video_duration, threshold=0.1):
|
||||
"""Detekcija ali je pesem instrumentalna.
|
||||
|
||||
@ -374,6 +510,10 @@ def main():
|
||||
ap.add_argument("--target-duration", type=float, default=30.0)
|
||||
ap.add_argument("--max-duration", type=float, default=45.0)
|
||||
ap.add_argument("--min-duration", type=float, default=20.0)
|
||||
ap.add_argument("--include-prebuild", action="store_true",
|
||||
help="Vključi pre-chorus build-up (privzeto: ne)")
|
||||
ap.add_argument("--no-claude", action="store_true",
|
||||
help="Preskoči Claude LLM analizo (uporabi samo lokalno heuristiko)")
|
||||
ap.add_argument("--json", action="store_true", help="Output JSON")
|
||||
ap.add_argument("--output", help="Path za JSON output")
|
||||
args = ap.parse_args()
|
||||
@ -404,7 +544,15 @@ def main():
|
||||
instrumental = is_instrumental(transcript, duration)
|
||||
print(f"🎵 Instrumentalna: {instrumental}", file=sys.stderr)
|
||||
|
||||
# 5. Find chorus (samo če ni instrumental)
|
||||
# 5a. PRIMARNO: Claude LLM analiza (razume cel tekst pesmi)
|
||||
claude_result = None
|
||||
if not instrumental and not args.no_claude:
|
||||
print(f"🤖 Pošiljam transkript Claude-u za analizo strukture...", file=sys.stderr)
|
||||
claude_result = analyze_with_claude(
|
||||
transcript, duration, target_duration=args.target_duration
|
||||
)
|
||||
|
||||
# 5b. Find chorus lokalno (kot fallback ali za score-jev preview)
|
||||
if not instrumental:
|
||||
chorus = find_chorus(transcript, energies, duration)
|
||||
else:
|
||||
@ -434,15 +582,34 @@ def main():
|
||||
),
|
||||
}
|
||||
|
||||
# 6. Smart clip range
|
||||
clip_range = smart_clip_range(
|
||||
chorus, transcript, duration,
|
||||
target_duration=args.target_duration,
|
||||
max_duration=args.max_duration,
|
||||
min_duration=args.min_duration,
|
||||
)
|
||||
# 6. Clip range — Claude ima prednost, sicer smart_clip_range fallback
|
||||
if claude_result:
|
||||
clip_range = {
|
||||
"start": claude_result["start"],
|
||||
"end": claude_result["end"],
|
||||
"duration": claude_result["duration"],
|
||||
"reason": "claude_llm: " + claude_result.get("reason", ""),
|
||||
"chorus_text": claude_result.get("chorus_text", ""),
|
||||
"structure": claude_result.get("structure", ""),
|
||||
"source": "claude",
|
||||
}
|
||||
# Apply max_duration cap če Claude pretirava
|
||||
if clip_range["duration"] > args.max_duration:
|
||||
clip_range["end"] = clip_range["start"] + args.max_duration
|
||||
clip_range["duration"] = args.max_duration
|
||||
clip_range["reason"] += " (capped at max_duration)"
|
||||
else:
|
||||
clip_range = smart_clip_range(
|
||||
chorus, transcript, duration,
|
||||
target_duration=args.target_duration,
|
||||
max_duration=args.max_duration,
|
||||
min_duration=args.min_duration,
|
||||
include_prebuild=args.include_prebuild,
|
||||
)
|
||||
clip_range["source"] = "local_heuristic"
|
||||
print(f"✂ Clip range: {clip_range['start']:.1f}s - {clip_range['end']:.1f}s "
|
||||
f"(duration: {clip_range['duration']}s)", file=sys.stderr)
|
||||
f"(duration: {clip_range['duration']}s, source: {clip_range.get('source')})",
|
||||
file=sys.stderr)
|
||||
|
||||
# 7. Fade params
|
||||
fade = detect_audio_fade(clip_range, transcript)
|
||||
|
||||
Loading…
Reference in New Issue
Block a user