Add Claude LLM analysis: sends full transcript to Claude API for true song structure understanding (refrain detection across all repetitions, not just local heuristic)
This commit is contained in:
parent
e072eec362
commit
a04811bdc9
@ -254,14 +254,17 @@ def find_chorus(transcript, energies, video_duration):
|
|||||||
|
|
||||||
|
|
||||||
def smart_clip_range(chorus, transcript, video_duration,
|
def smart_clip_range(chorus, transcript, video_duration,
|
||||||
target_duration=30, max_duration=45, min_duration=20):
|
target_duration=30, max_duration=45, min_duration=20,
|
||||||
|
include_prebuild=False):
|
||||||
"""Inteligentno določi clip range.
|
"""Inteligentno določi clip range.
|
||||||
|
|
||||||
Logika:
|
Logika:
|
||||||
1. Začni z refrenom kot core
|
1. Začni z refrenom kot core
|
||||||
2. Če je krajši od min_duration, razširi na obeh straneh
|
2. Če je krajši od min_duration → razširi z drugim refrenom (ne kitico!)
|
||||||
3. Če imamo prostor, dodaj pre-chorus pred refrenom
|
3. Cap na max_duration
|
||||||
4. Cap na max_duration
|
|
||||||
|
include_prebuild=False (default): NE doda kitice/verza pred refrenom.
|
||||||
|
include_prebuild=True: doda kratek pre-chorus (max 8s, gap < 3s).
|
||||||
"""
|
"""
|
||||||
if not chorus or not chorus.get("best"):
|
if not chorus or not chorus.get("best"):
|
||||||
# Fallback: vzemi sredino videa
|
# Fallback: vzemi sredino videa
|
||||||
@ -279,38 +282,56 @@ def smart_clip_range(chorus, transcript, video_duration,
|
|||||||
actual_start = best["start"]
|
actual_start = best["start"]
|
||||||
actual_end = best["end"]
|
actual_end = best["end"]
|
||||||
|
|
||||||
# 1. Če je core refren prekratek, razširi
|
# Najdi VSE sekcije ki so podobne refrenu (verjetne ponovitve)
|
||||||
|
chorus_words = set(re.findall(r"\b\w+\b", best["text_preview"].lower()))
|
||||||
|
chorus_sections = []
|
||||||
|
for sec in sections:
|
||||||
|
sec_words = set(re.findall(r"\b\w+\b", sec["text"].lower()))
|
||||||
|
if chorus_words and len(sec_words & chorus_words) >= len(chorus_words) * 0.4:
|
||||||
|
chorus_sections.append(sec)
|
||||||
|
|
||||||
|
# 1. Če je core refren prekratek, razširi z naslednjim REFRENOM (ne kitico!)
|
||||||
if actual_end - actual_start < min_duration:
|
if actual_end - actual_start < min_duration:
|
||||||
# Najdi naslednjo sekcijo (verjetno se refren ponovi)
|
for sec in chorus_sections:
|
||||||
for sec in sections:
|
if sec["start"] > actual_end and sec["start"] - actual_end < 8:
|
||||||
if sec["start"] > actual_end and sec["start"] - actual_end < 5:
|
|
||||||
# Sekcija blizu, dodaj jo
|
|
||||||
if sec["end"] - actual_start <= max_duration:
|
if sec["end"] - actual_start <= max_duration:
|
||||||
actual_end = sec["end"]
|
actual_end = sec["end"]
|
||||||
if actual_end - actual_start >= min_duration:
|
if actual_end - actual_start >= min_duration:
|
||||||
break
|
break
|
||||||
|
|
||||||
# 2. Dodaj pre-chorus pred refrenom (build-up)
|
# 2. Pre-chorus build-up (samo če uporabnik to izrecno hoče)
|
||||||
pre_section = None
|
if include_prebuild:
|
||||||
for sec in sections:
|
pre_section = None
|
||||||
if sec["end"] <= actual_start and actual_start - sec["end"] < 8:
|
for sec in sections:
|
||||||
pre_section = sec # zadnja pred refrenom
|
# Pre-section mora biti BLIZU (gap < 3s) in NE preveč dolga (< 8s)
|
||||||
if pre_section:
|
sec_duration = sec["end"] - sec["start"]
|
||||||
candidate_start = pre_section["start"]
|
if (sec["end"] <= actual_start
|
||||||
if actual_end - candidate_start <= max_duration:
|
and actual_start - sec["end"] < 3
|
||||||
actual_start = candidate_start
|
and sec_duration < 8):
|
||||||
|
pre_section = sec
|
||||||
|
if pre_section:
|
||||||
|
candidate_start = pre_section["start"]
|
||||||
|
if actual_end - candidate_start <= max_duration:
|
||||||
|
actual_start = candidate_start
|
||||||
|
|
||||||
# 3. Če je res prekratek, razširi simetrično
|
# 3. Če je še prekratek, razširi simetrično znotraj refrenov (ne kitic)
|
||||||
if actual_end - actual_start < min_duration:
|
if actual_end - actual_start < min_duration:
|
||||||
deficit = min_duration - (actual_end - actual_start)
|
deficit = min_duration - (actual_end - actual_start)
|
||||||
actual_start = max(0, actual_start - deficit / 2)
|
# Razširi konec če lahko
|
||||||
actual_end = min(video_duration, actual_end + deficit / 2)
|
for sec in chorus_sections:
|
||||||
|
if sec["start"] > actual_end and sec["start"] - actual_end < 5:
|
||||||
|
actual_end = min(sec["end"], actual_end + deficit)
|
||||||
|
break
|
||||||
|
# Če še ni dovolj, manjše simetrično
|
||||||
|
if actual_end - actual_start < min_duration:
|
||||||
|
extra = (min_duration - (actual_end - actual_start)) / 2
|
||||||
|
actual_start = max(0, actual_start - extra)
|
||||||
|
actual_end = min(video_duration, actual_end + extra)
|
||||||
|
|
||||||
# 4. Trim na max
|
# 4. Trim na max
|
||||||
if actual_end - actual_start > max_duration:
|
if actual_end - actual_start > max_duration:
|
||||||
actual_end = actual_start + max_duration
|
actual_end = actual_start + max_duration
|
||||||
|
|
||||||
# Snap to video bounds
|
|
||||||
actual_start = max(0, actual_start)
|
actual_start = max(0, actual_start)
|
||||||
actual_end = min(video_duration, actual_end)
|
actual_end = min(video_duration, actual_end)
|
||||||
|
|
||||||
@ -318,7 +339,7 @@ def smart_clip_range(chorus, transcript, video_duration,
|
|||||||
"start": round(actual_start, 2),
|
"start": round(actual_start, 2),
|
||||||
"end": round(actual_end, 2),
|
"end": round(actual_end, 2),
|
||||||
"duration": round(actual_end - actual_start, 2),
|
"duration": round(actual_end - actual_start, 2),
|
||||||
"reason": "smart_chorus_with_prebuild",
|
"reason": "smart_chorus_with_prebuild" if include_prebuild else "smart_chorus_only",
|
||||||
"chorus_start": round(best["start"], 2),
|
"chorus_start": round(best["start"], 2),
|
||||||
"chorus_end": round(best["end"], 2),
|
"chorus_end": round(best["end"], 2),
|
||||||
}
|
}
|
||||||
@ -351,6 +372,121 @@ def detect_audio_fade(clip_range, transcript):
|
|||||||
return {"fade_in": fade_in, "fade_out": fade_out}
|
return {"fade_in": fade_in, "fade_out": fade_out}
|
||||||
|
|
||||||
|
|
||||||
|
def analyze_with_claude(transcript, video_duration, target_duration=30):
    """Ask the Claude API to pick the best reel segment from a full transcript.

    The whole transcript (one line per segment, with timestamps) is sent in a
    single prompt that asks the model to identify the song structure, find the
    repeated chorus and return a 20-45 s range as JSON.

    Args:
        transcript: dict with a "segments" list; each segment must provide
            "start", "end" (seconds) and "text".
        video_duration: total duration in seconds; used to validate the
            returned range.
        target_duration: approximate clip length (seconds) quoted in the prompt.

    Returns:
        dict with "start", "end", "duration", "reason", "chorus_text",
        "structure" and "source" ("claude_llm"), or None when the API key is
        missing, the transcript has no segments, the request fails, or the
        reply cannot be parsed into a valid range.
    """
    # Function-scope imports keep the block self-contained; they sit outside
    # the try so the except clauses below can always resolve urllib.error.
    import math
    import urllib.error
    import urllib.request

    api_key = os.environ.get("ANTHROPIC_API_KEY")
    if not api_key:
        print(" ⚠️ ANTHROPIC_API_KEY ni nastavljen — preskakujem Claude analizo", file=sys.stderr)
        return None

    segments = transcript.get("segments") if transcript else None
    if not segments:
        return None

    # Text format for the model: one line per segment, prefixed by timestamps.
    transcript_text = "\n".join(
        f"[{seg['start']:6.1f}-{seg['end']:6.1f}] {seg['text'].strip()}"
        for seg in segments
    )

    prompt = f"""Tu je transcript pesmi (timestamp v sekundah, besedilo):

{transcript_text}

Cela pesem traja {video_duration:.1f}s. Cilj: izrezati ~{target_duration}s odsek za TikTok/Instagram Reel.

PROSIM:
1. Preberi celoten tekst in razumi strukturo (intro / verz / pre-chorus / refren / bridge / outro)
2. Prepoznaj REFREN: del besedila, ki se ponavlja v pesmi (običajno 2-3x z istim ali zelo podobnim besedilom)
3. Izberi najboljši odsek za reel:
- Vključi cel refren (cel verz besedila brez prekinitve)
- Če imaš prostor, dodaj pre-chorus build-up tik pred refrenom
- Lahko traja 20-45 sekund (ne strogo 30s)
- Začni in končaj na smiselni meji (konec stavka, ne sredi besede)
4. Če pesem nima jasnega refrena (instrumental, monolog, govor), izberi najbolj dramatičen ali zaključen del

Odgovori SAMO v JSON formatu (brez markdown, brez razlage):
{{
"start": <sekunde>,
"end": <sekunde>,
"reason": "<kratka razlaga zakaj ta odsek>",
"chorus_text": "<besedilo refrena ali ključni del>",
"structure": "<1 stavek o strukturi pesmi>"
}}"""

    # Timestamps are sent rounded to one decimal, so the model may echo an end
    # time that exceeds the real duration by rounding error alone.
    rounding_tolerance = 0.1

    try:
        payload = json.dumps({
            "model": "claude-haiku-4-5-20251001",
            "max_tokens": 1024,
            "messages": [{"role": "user", "content": prompt}],
        }).encode("utf-8")

        req = urllib.request.Request(
            "https://api.anthropic.com/v1/messages",
            data=payload,
            headers={
                "Content-Type": "application/json",
                "x-api-key": api_key,
                "anthropic-version": "2023-06-01",
            },
            method="POST",
        )
        with urllib.request.urlopen(req, timeout=60) as resp:
            data = json.loads(resp.read().decode("utf-8"))

        content = data.get("content", [])
        if not content:
            print(" ⚠️ Claude vrnil prazen odgovor", file=sys.stderr)
            return None

        # Concatenate every text block instead of trusting only the first one.
        text = "".join(
            block.get("text") or ""
            for block in content
            if isinstance(block, dict) and block.get("type", "text") == "text"
        ).strip()

        # The model sometimes wraps the JSON in a markdown fence...
        if text.startswith("```"):
            text = re.sub(r"^```(?:json)?\s*", "", text)
            text = re.sub(r"\s*```$", "", text)
        # ...or surrounds it with prose; keep only the outermost object.
        if not text.startswith("{"):
            first, last = text.find("{"), text.rfind("}")
            if 0 <= first < last:
                text = text[first:last + 1]

        result = json.loads(text)
        if not isinstance(result, dict):
            raise ValueError("Claude response is not a JSON object")

        # Sanity check
        start = float(result["start"])
        end = float(result["end"])
        if video_duration < end <= video_duration + rounding_tolerance:
            end = float(video_duration)
        finite = math.isfinite(start) and math.isfinite(end)
        if not finite or start >= end or start < 0 or end > video_duration:
            print(f" ⚠️ Claude returned invalid range: {start}-{end}", file=sys.stderr)
            return None

        # Coerce free-text fields: a null or non-string value must not discard
        # an otherwise valid range.
        reason = str(result.get("reason") or "")
        chorus_text = str(result.get("chorus_text") or "")
        structure = str(result.get("structure") or "")

        print(f" 🤖 Claude izbral: {start:.1f}-{end:.1f}s", file=sys.stderr)
        print(f" Razlog: {reason[:80]}", file=sys.stderr)
        print(f" Struktura: {structure[:80]}", file=sys.stderr)

        return {
            "start": round(start, 2),
            "end": round(end, 2),
            "duration": round(end - start, 2),
            "reason": reason,
            "chorus_text": chorus_text,
            "structure": structure,
            "source": "claude_llm",
        }
    except urllib.error.HTTPError as e:
        err_body = e.read().decode("utf-8", errors="replace")[:500]
        print(f" ❌ Claude API HTTP {e.code}: {err_body}", file=sys.stderr)
        return None
    except Exception as e:
        # Deliberate best-effort boundary: any failure falls back to the
        # local heuristic in the caller.
        print(f" ❌ Claude analysis failed: {e}", file=sys.stderr)
        return None
|
||||||
|
|
||||||
|
|
||||||
def is_instrumental(transcript, video_duration, threshold=0.1):
|
def is_instrumental(transcript, video_duration, threshold=0.1):
|
||||||
"""Detekcija ali je pesem instrumentalna.
|
"""Detekcija ali je pesem instrumentalna.
|
||||||
|
|
||||||
@ -374,6 +510,10 @@ def main():
|
|||||||
ap.add_argument("--target-duration", type=float, default=30.0)
|
ap.add_argument("--target-duration", type=float, default=30.0)
|
||||||
ap.add_argument("--max-duration", type=float, default=45.0)
|
ap.add_argument("--max-duration", type=float, default=45.0)
|
||||||
ap.add_argument("--min-duration", type=float, default=20.0)
|
ap.add_argument("--min-duration", type=float, default=20.0)
|
||||||
|
ap.add_argument("--include-prebuild", action="store_true",
|
||||||
|
help="Vključi pre-chorus build-up (privzeto: ne)")
|
||||||
|
ap.add_argument("--no-claude", action="store_true",
|
||||||
|
help="Preskoči Claude LLM analizo (uporabi samo lokalno heuristiko)")
|
||||||
ap.add_argument("--json", action="store_true", help="Output JSON")
|
ap.add_argument("--json", action="store_true", help="Output JSON")
|
||||||
ap.add_argument("--output", help="Path za JSON output")
|
ap.add_argument("--output", help="Path za JSON output")
|
||||||
args = ap.parse_args()
|
args = ap.parse_args()
|
||||||
@ -404,7 +544,15 @@ def main():
|
|||||||
instrumental = is_instrumental(transcript, duration)
|
instrumental = is_instrumental(transcript, duration)
|
||||||
print(f"🎵 Instrumentalna: {instrumental}", file=sys.stderr)
|
print(f"🎵 Instrumentalna: {instrumental}", file=sys.stderr)
|
||||||
|
|
||||||
# 5. Find chorus (samo če ni instrumental)
|
# 5a. PRIMARNO: Claude LLM analiza (razume cel tekst pesmi)
|
||||||
|
claude_result = None
|
||||||
|
if not instrumental and not args.no_claude:
|
||||||
|
print(f"🤖 Pošiljam transkript Claude-u za analizo strukture...", file=sys.stderr)
|
||||||
|
claude_result = analyze_with_claude(
|
||||||
|
transcript, duration, target_duration=args.target_duration
|
||||||
|
)
|
||||||
|
|
||||||
|
# 5b. Find chorus lokalno (kot fallback ali za score-jev preview)
|
||||||
if not instrumental:
|
if not instrumental:
|
||||||
chorus = find_chorus(transcript, energies, duration)
|
chorus = find_chorus(transcript, energies, duration)
|
||||||
else:
|
else:
|
||||||
@ -434,15 +582,34 @@ def main():
|
|||||||
),
|
),
|
||||||
}
|
}
|
||||||
|
|
||||||
# 6. Smart clip range
|
# 6. Clip range — Claude ima prednost, sicer smart_clip_range fallback
|
||||||
clip_range = smart_clip_range(
|
if claude_result:
|
||||||
chorus, transcript, duration,
|
clip_range = {
|
||||||
target_duration=args.target_duration,
|
"start": claude_result["start"],
|
||||||
max_duration=args.max_duration,
|
"end": claude_result["end"],
|
||||||
min_duration=args.min_duration,
|
"duration": claude_result["duration"],
|
||||||
)
|
"reason": "claude_llm: " + claude_result.get("reason", ""),
|
||||||
|
"chorus_text": claude_result.get("chorus_text", ""),
|
||||||
|
"structure": claude_result.get("structure", ""),
|
||||||
|
"source": "claude",
|
||||||
|
}
|
||||||
|
# Apply max_duration cap če Claude pretirava
|
||||||
|
if clip_range["duration"] > args.max_duration:
|
||||||
|
clip_range["end"] = clip_range["start"] + args.max_duration
|
||||||
|
clip_range["duration"] = args.max_duration
|
||||||
|
clip_range["reason"] += " (capped at max_duration)"
|
||||||
|
else:
|
||||||
|
clip_range = smart_clip_range(
|
||||||
|
chorus, transcript, duration,
|
||||||
|
target_duration=args.target_duration,
|
||||||
|
max_duration=args.max_duration,
|
||||||
|
min_duration=args.min_duration,
|
||||||
|
include_prebuild=args.include_prebuild,
|
||||||
|
)
|
||||||
|
clip_range["source"] = "local_heuristic"
|
||||||
print(f"✂ Clip range: {clip_range['start']:.1f}s - {clip_range['end']:.1f}s "
|
print(f"✂ Clip range: {clip_range['start']:.1f}s - {clip_range['end']:.1f}s "
|
||||||
f"(duration: {clip_range['duration']}s)", file=sys.stderr)
|
f"(duration: {clip_range['duration']}s, source: {clip_range.get('source')})",
|
||||||
|
file=sys.stderr)
|
||||||
|
|
||||||
# 7. Fade params
|
# 7. Fade params
|
||||||
fade = detect_audio_fade(clip_range, transcript)
|
fade = detect_audio_fade(clip_range, transcript)
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user