reels-app/scripts/analyze.py
OpenClaw Agent e350352883 Fix: Gemini 3.1 Pro thinking model needs 32k maxOutputTokens (was 4096 → MAX_TOKENS truncation)
Diagnoza:
- Gemini 3.x Pro je thinking model (ima internal reasoning, thoughtsTokenCount)
- Pri velikih transkriptih (60+ segmentov pesmi):
  * thoughts ~ 1500-3000 tokens
  * output JSON s corrected_segments ~ 3000-7000 tokens
  * total ~ 4500-10000 tokens
- Z maxOutputTokens=4096 je bil response prekinjen (finishReason: MAX_TOKENS),
  JSON je bil odrezan na pol in _parse_llm_response je sprožil json.JSONDecodeError
- Rezultat: 'Gemini vrnil prazen string' v logih

Popravki:
1. Gemini maxOutputTokens 4096 → 32768 (dovolj za thinking + dolg JSON)
2. Diagnostika finishReason==MAX_TOKENS in usage tokens v logih
3. Detekcija praznega text-a (ne samo praznega parts array-a)
4. Claude max_tokens 4096 → 8192 (rezerva za dolge pesmi)
5. Claude detekcija stop_reason==max_tokens

Test (60 segmentov, 5631 char prompt):
- 4096 → finishReason=MAX_TOKENS, thoughts=2594, output=1488, JSON odrezan 
- 16384 → finishReason=STOP, thoughts=1445, output=3040, JSON popoln 
- 32768 → varen default 
2026-04-29 09:03:53 +00:00

945 lines
36 KiB
Python

#!/usr/bin/env python3
"""
analyze.py — Predhodna analiza CELEGA videa pred trim-anjem.
Naredi:
1. Whisper transcript celega videa (auto-detect jezika ali user-specified)
2. Energy profile (RMS dB na 1s windows)
3. Structural detection (vocal/instrumental sections, energy peaks)
4. Pametno izbere clip range (lahko >30s, vključi pre-chorus)
5. Detekcija instrumentalnih pesmi (no_subs auto)
Output: JSON s podatki za clip.py
"""
import argparse
import json
import os
import re
import subprocess
import sys
import tempfile
from pathlib import Path
def get_video_duration(path):
    """Return the media duration in seconds as reported by ffprobe.

    Runs ``ffprobe`` on *path* and parses the container-level ``duration``
    value from its standard output. Returns ``0.0`` when that output cannot
    be parsed as a float (for example when ffprobe printed nothing).
    """
    probe_cmd = [
        "ffprobe", "-v", "error",
        "-show_entries", "format=duration",
        "-of", "default=nw=1:nokey=1",
        str(path),
    ]
    probe = subprocess.run(probe_cmd, capture_output=True, text=True)
    raw_duration = probe.stdout.strip()
    try:
        duration = float(raw_duration)
    except ValueError:
        # Empty or non-numeric output: report "unknown" as 0.0.
        duration = 0.0
    return duration
def extract_audio(video_path):
    """Extract the audio track into a temporary 16 kHz mono PCM WAV file.

    The WAV is used both for Whisper transcription and for the energy
    profile. The temporary file is created with ``delete=False``, so the
    caller owns it and must remove it when done.

    Args:
        video_path: Path of the input video (anything ``str()`` accepts).

    Returns:
        Path of the created WAV file as a string.

    Raises:
        subprocess.CalledProcessError: ffmpeg exited with a non-zero status.
        OSError: ffmpeg could not be launched.
    """
    audio = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    audio.close()
    try:
        subprocess.run(
            ["ffmpeg", "-y", "-i", str(video_path), "-vn",
             "-ac", "1", "-ar", "16000", "-c:a", "pcm_s16le", audio.name],
            check=True, capture_output=True,
        )
    except BaseException:
        # The caller never receives the path on failure, so remove the
        # temporary file here instead of leaking it.
        try:
            os.unlink(audio.name)
        except OSError:
            pass
        raise
    return audio.name
def _vote_language(model, audio_path):
    """Pick a language by voting over up to three 30-second audio samples.

    Samples are taken at roughly 15 %, 45 % and 75 % of the audio. Each
    sample votes for its detected language with the detection probability
    as weight. Returns the language code with the highest summed weight, or
    ``None`` when no sample could be analysed.
    """
    print(" 🔍 Robust lang detection (3 samples)...", file=sys.stderr)
    try:
        audio_duration = get_video_duration(audio_path)
    except OSError:
        audio_duration = 0.0
    if audio_duration <= 0:
        # Duration unknown: assume a typical song length for sample offsets.
        audio_duration = 180.0

    votes = {}
    offsets = (
        max(15, audio_duration * 0.15),
        audio_duration * 0.45,
        audio_duration * 0.75,
    )
    for ss in offsets:
        if ss + 5 > audio_duration:
            continue
        sample = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
        sample.close()
        try:
            subprocess.run(
                ["ffmpeg", "-y", "-ss", str(ss), "-i", audio_path,
                 "-t", "30", "-vn", "-ac", "1", "-ar", "16000",
                 "-c:a", "pcm_s16le", sample.name],
                check=True, capture_output=True,
            )
            _, sample_info = model.transcribe(sample.name, language=None, vad_filter=False)
            sl, sp = sample_info.language, float(sample_info.language_probability)
            votes[sl] = votes.get(sl, 0) + sp
            print(f" sample @ {ss:.0f}s: {sl} (p={sp:.2f})", file=sys.stderr)
        except Exception as e:
            # Best effort: one bad sample must not abort detection, but the
            # reason is logged so failures can be diagnosed.
            print(f" sample @ {ss:.0f}s: failed ({e})", file=sys.stderr)
        finally:
            try:
                os.unlink(sample.name)
            except OSError:
                pass
    if not votes:
        return None
    return max(votes.items(), key=lambda kv: kv[1])[0]


def transcribe_full(audio_path, lang=None, model_size="small"):
    """Transcribe the whole audio file with faster-whisper.

    Args:
        audio_path: Path of the audio file to transcribe.
        lang: Language code to force. When falsy, the language is chosen by
            voting over several samples; if no sample yields a vote,
            Whisper's own auto-detection is used.
        model_size: Whisper model name passed to ``WhisperModel``.

    Returns:
        A dict with ``language``, ``language_probability`` and ``segments``.
        Each segment has ``start``, ``end``, ``text`` and ``words`` (a list
        of ``start``/``end``/``text`` dicts). When transcription raises
        ``ValueError`` or ``RuntimeError``, the result has language
        ``"unknown"``, probability ``0.0`` and no segments.
    """
    from faster_whisper import WhisperModel

    print(f"🧠 Whisper {model_size}, lang={lang or 'auto'}", file=sys.stderr)
    m = WhisperModel(model_size, device="cpu", compute_type="int8")

    # Auto-detect with multi-sample voting so we lock onto a single language.
    if not lang:
        voted = _vote_language(m, audio_path)
        if voted:
            lang = voted
            print(f" ✅ Lang lock: {lang}", file=sys.stderr)

    try:
        segs, info = m.transcribe(
            audio_path,
            language=lang,
            word_timestamps=True,
            # The VAD filter sometimes drops vocals sung over music, so it
            # is disabled for songs.
            vad_filter=False,
            # Anti-hallucination settings.
            condition_on_previous_text=False,
            temperature=0.0,
            compression_ratio_threshold=2.4,
            log_prob_threshold=-1.0,
            no_speech_threshold=0.6,
        )
        detected_lang = info.language
        detected_prob = float(info.language_probability)
        print(f" Detekcija: {detected_lang} (p={detected_prob:.2f})", file=sys.stderr)
        # faster-whisper returns the segments as a lazy generator: decoding
        # runs while iterating, so the iteration must stay inside this try
        # block for decode-time errors to be handled as well.
        segments = [
            {
                "start": s.start,
                "end": s.end,
                "text": s.text.strip(),
                "words": [
                    {"start": w.start, "end": w.end, "text": w.word}
                    for w in (s.words or [])
                ],
            }
            for s in segs
        ]
    except (ValueError, RuntimeError) as e:
        # Whisper failure (e.g. on fully instrumental files).
        print(f" ⚠️ Whisper transcribe failed: {e}", file=sys.stderr)
        return {
            "language": "unknown",
            "language_probability": 0.0,
            "segments": [],
        }

    return {
        "language": detected_lang,
        "language_probability": detected_prob,
        "segments": segments,
    }
def compute_energy_profile(audio_path, window_sec=1.0):
    """Measure the RMS level (dB) of the audio per window of *window_sec*.

    Runs ffmpeg with an ``asetnsamples``/``astats``/``ametadata`` filter
    chain and parses the printed metadata from its combined stdout and
    stderr. The window length in samples is computed with a hard-coded rate
    of 16000 samples per second.

    Returns:
        A list of ``(timestamp, rms_db)`` tuples. Levels below -90 dB
        (including ``-inf``) and NaN are stored as ``-90.0``.
    """
    samples_per_window = int(16000 * window_sec)
    filter_chain = (
        f"asetnsamples=n={samples_per_window}:p=0,"
        f"astats=metadata=1:reset={window_sec},"
        f"ametadata=print:key=lavfi.astats.Overall.RMS_level:file=-"
    )
    proc = subprocess.run(
        ["ffmpeg", "-i", audio_path, "-af", filter_chain, "-f", "null", "-"],
        capture_output=True, text=True,
    )

    profile = []
    timestamp = 0.0
    for raw in (proc.stdout + "\n" + proc.stderr).split("\n"):
        entry = raw.strip()
        pts_match = re.search(r"pts_time:(\S+)", entry)
        if pts_match is not None:
            # A frame header line: remember its timestamp for the next level.
            try:
                timestamp = float(pts_match.group(1))
            except ValueError:
                pass
            continue
        if "RMS_level=" not in entry:
            continue
        try:
            level = float(entry.split("RMS_level=")[-1].strip())
        except ValueError:
            continue
        # Replace -inf / very low values and NaN (level != level) with -90.
        if level < -90 or level != level:
            level = -90.0
        profile.append((timestamp, level))
        # Advance in case the next level arrives without its own pts line.
        timestamp += window_sec
    return profile
def detect_vocal_sections(segments, max_gap=3.0):
    """Merge consecutive transcript segments into "vocal sections".

    A new section starts whenever the pause between the end of the current
    section and the start of the next segment exceeds *max_gap* seconds.

    Returns:
        A list of dicts with ``start``, ``end``, ``segments`` (the member
        segment dicts) and ``text`` (member texts joined by single spaces).
        An empty input yields an empty list.
    """
    if not segments:
        return []

    def _open_section(seg):
        return {
            "start": seg["start"],
            "end": seg["end"],
            "segments": [seg],
            "text": seg["text"],
        }

    head, *tail = segments
    merged = [_open_section(head)]
    for seg in tail:
        active = merged[-1]
        silence = seg["start"] - active["end"]
        if silence > max_gap:
            merged.append(_open_section(seg))
            continue
        # Close enough: extend the section that is currently open.
        active["end"] = seg["end"]
        active["segments"].append(seg)
        active["text"] += " " + seg["text"]
    return merged
def avg_energy_in_range(energies, start, end):
    """Return the mean RMS level of samples with ``start <= t <= end``.

    *energies* is an iterable of ``(timestamp, rms_db)`` pairs. When no
    sample falls inside the inclusive range, ``-90.0`` is returned.
    """
    total = 0.0
    count = 0
    for ts, level in energies:
        if start <= ts <= end:
            total += level
            count += 1
    if count == 0:
        return -90.0
    return total / count
def score_section_as_chorus(section, all_sections, energies, avg_rms):
    """Score a vocal section as a chorus candidate (higher is more likely).

    The score is the sum of four parts:
    - repetition inside the section (low unique-word ratio),
    - loudness above the track average *avg_rms*,
    - the number of other sections sharing at least half of this section's
      distinct words (and at least three words),
    - a bonus based on the section's duration.

    Returns ``0`` when the section text contains no words.
    """
    word_pattern = r"\b\w+\b"
    tokens = re.findall(word_pattern, section["text"].lower())
    if not tokens:
        return 0
    vocabulary = set(tokens)

    # Chorus lines repeat themselves, so a low unique ratio scores high.
    unique_ratio = len(vocabulary) / len(tokens)
    repetition_points = max(0, (1.0 - unique_ratio) * 30)

    # Loudness relative to the whole track.
    loudness = avg_energy_in_range(energies, section["start"], section["end"])
    loudness_points = max(0, loudness - avg_rms) * 8

    # How many other sections carry similar text.
    recurrences = 0
    for other in all_sections:
        if other is section:
            continue
        shared = vocabulary & set(re.findall(word_pattern, other["text"].lower()))
        if len(shared) >= len(vocabulary) * 0.5 and len(shared) >= 3:
            recurrences += 1
    recurrence_points = recurrences * 25

    # Duration bonus.
    span = section["end"] - section["start"]
    if span > 25:
        span_points = 5
    elif span >= 3:
        span_points = 10
    else:
        span_points = 2

    return repetition_points + loudness_points + recurrence_points + span_points
def find_chorus(transcript, energies, video_duration):
    """Rank the vocal sections of a transcript as chorus candidates.

    Returns ``None`` when the transcript has no vocal sections. Otherwise
    returns a dict with ``best`` (top-scoring candidate), ``all_candidates``
    (up to ten candidates, best first) and ``avg_rms_total`` (mean level of
    all energy samples, or -30.0 when there are none). *video_duration* is
    accepted but not used by this function.
    """
    sections = detect_vocal_sections(transcript["segments"])
    if not sections:
        return None

    if energies:
        avg_rms = sum(level for _, level in energies) / len(energies)
    else:
        avg_rms = -30.0

    def _describe(sec):
        return {
            "start": sec["start"],
            "end": sec["end"],
            "duration": sec["end"] - sec["start"],
            "text_preview": sec["text"][:80],
            "score": round(score_section_as_chorus(sec, sections, energies, avg_rms), 2),
            "avg_rms": round(avg_energy_in_range(energies, sec["start"], sec["end"]), 2),
        }

    # Highest score first; sorted() is stable, so ties keep transcript
    # order. There is at least one section here, so the ranking is never
    # empty.
    ranked = sorted((_describe(sec) for sec in sections), key=lambda c: -c["score"])
    return {
        "best": ranked[0],
        "all_candidates": ranked[:10],
        "avg_rms_total": round(avg_rms, 2),
    }
def smart_clip_range(chorus, transcript, video_duration,
                     target_duration=30, max_duration=45, min_duration=20,
                     include_prebuild=False):
    """Choose the clip range around the detected chorus.

    Logic:
    1. Start with the best chorus candidate as the core.
    2. If it is shorter than *min_duration*, extend it with a following
       chorus-like section (not a verse).
    3. Optionally prepend a short pre-chorus (``include_prebuild=True``:
       at most 8 s long, gap to the chorus under 3 s).
    4. If still too short, extend into a nearby chorus-like section and
       then symmetrically; finally cap at *max_duration* and clamp to the
       video.

    Args:
        chorus: Result of ``find_chorus`` (dict with ``best``) or ``None``.
        transcript: Transcript dict with a ``segments`` list.
        video_duration: Length of the video in seconds.
        target_duration: Clip length used only for the fallback range.
        max_duration: Upper bound for the clip length.
        min_duration: Lower bound the extension steps try to reach.
        include_prebuild: Whether to prepend a short pre-chorus section.

    Returns:
        A dict with ``start``, ``end``, ``duration`` and ``reason``. The
        chorus-based result also has ``chorus_start`` and ``chorus_end``.
        Without a usable chorus, a *target_duration* window around the
        middle of the video is returned with reason ``fallback_middle``.
    """
    if not chorus or not chorus.get("best"):
        # Fallback: take the middle of the video. The result carries the
        # same start/end/duration keys (rounded) as the chorus-based
        # result, so callers can read "duration" unconditionally.
        mid = video_duration / 2
        start = max(0, mid - target_duration / 2)
        end = min(video_duration, start + target_duration)
        return {
            "start": round(start, 2),
            "end": round(end, 2),
            "duration": round(end - start, 2),
            "reason": "fallback_middle",
        }

    best = chorus["best"]
    sections = detect_vocal_sections(transcript["segments"])
    actual_start = best["start"]
    actual_end = best["end"]

    # Find ALL sections that resemble the chorus (likely repetitions).
    chorus_words = set(re.findall(r"\b\w+\b", best["text_preview"].lower()))
    chorus_sections = []
    for sec in sections:
        sec_words = set(re.findall(r"\b\w+\b", sec["text"].lower()))
        if chorus_words and len(sec_words & chorus_words) >= len(chorus_words) * 0.4:
            chorus_sections.append(sec)

    # 1. Core chorus too short: extend with the next CHORUS (not a verse).
    if actual_end - actual_start < min_duration:
        for sec in chorus_sections:
            if sec["start"] > actual_end and sec["start"] - actual_end < 8:
                if sec["end"] - actual_start <= max_duration:
                    actual_end = sec["end"]
                    if actual_end - actual_start >= min_duration:
                        break

    # 2. Pre-chorus build-up (only when explicitly requested).
    if include_prebuild:
        pre_section = None
        for sec in sections:
            # The pre-section must be CLOSE (gap < 3s) and short (< 8s);
            # the last matching section wins.
            sec_duration = sec["end"] - sec["start"]
            if (sec["end"] <= actual_start
                    and actual_start - sec["end"] < 3
                    and sec_duration < 8):
                pre_section = sec
        if pre_section:
            candidate_start = pre_section["start"]
            if actual_end - candidate_start <= max_duration:
                actual_start = candidate_start

    # 3. Still too short: extend within chorus sections, then symmetrically.
    if actual_end - actual_start < min_duration:
        deficit = min_duration - (actual_end - actual_start)
        # Extend the end if a chorus-like section follows closely.
        for sec in chorus_sections:
            if sec["start"] > actual_end and sec["start"] - actual_end < 5:
                actual_end = min(sec["end"], actual_end + deficit)
                break
        # If that is not enough, pad both sides equally.
        if actual_end - actual_start < min_duration:
            extra = (min_duration - (actual_end - actual_start)) / 2
            actual_start = max(0, actual_start - extra)
            actual_end = min(video_duration, actual_end + extra)

    # 4. Trim to the maximum and clamp to the video bounds.
    if actual_end - actual_start > max_duration:
        actual_end = actual_start + max_duration
    actual_start = max(0, actual_start)
    actual_end = min(video_duration, actual_end)

    return {
        "start": round(actual_start, 2),
        "end": round(actual_end, 2),
        "duration": round(actual_end - actual_start, 2),
        "reason": "smart_chorus_with_prebuild" if include_prebuild else "smart_chorus_only",
        "chorus_start": round(best["start"], 2),
        "chorus_end": round(best["end"], 2),
    }
def detect_audio_fade(clip_range, transcript, video_duration=None):
    """Derive fade durations and a suggested clip end from the transcript.

    - Fade-in is 0.4 s when the clip start lies inside a transcript
      segment, otherwise 0.2 s.
    - Fade-out is 0.5 s when the clip end lies inside a segment, otherwise
      0.3 s.
    - When the clip end lies inside a segment, ``extended_end`` is that
      segment's end plus a 0.5 s buffer (capped at *video_duration* when
      given). If several segments contain the clip end, the last one in
      transcript order is used. Otherwise ``extended_end`` equals the
      clip end.

    *clip_range* itself is not modified.

    Returns:
        A dict with ``fade_in``, ``fade_out``, ``extended_end`` (rounded to
        two decimals) and ``ends_in_vocal``.
    """
    clip_start, clip_end = clip_range["start"], clip_range["end"]
    segments = transcript["segments"]

    vocal_at_start = [s for s in segments if s["start"] <= clip_start <= s["end"]]
    vocal_at_end = [s for s in segments if s["start"] <= clip_end <= s["end"]]

    if vocal_at_end:
        # Let the clip run to the end of the sung line plus a small buffer.
        extended_end = vocal_at_end[-1]["end"] + 0.5
        if video_duration is not None:
            extended_end = min(extended_end, video_duration)
    else:
        extended_end = clip_end

    return {
        "fade_in": 0.4 if vocal_at_start else 0.2,
        "fade_out": 0.5 if vocal_at_end else 0.3,
        "extended_end": round(extended_end, 2),
        "ends_in_vocal": bool(vocal_at_end),
    }
def _build_analysis_prompt(transcript, video_duration, target_duration=30):
    """Build the shared prompt used for both the Claude and Gemini analysis.

    Every transcript segment becomes one ``[start-end] text`` line (times
    formatted with one decimal). The prompt asks the model to correct the
    transcription, identify the chorus and answer with a JSON object.
    """
    transcript_text = "\n".join(
        f"[{seg['start']:6.1f}-{seg['end']:6.1f}] {seg['text'].strip()}"
        for seg in transcript["segments"]
    )
    return f"""Tu je transcript pesmi iz Whisper modela (timestamp v sekundah, besedilo):
{transcript_text}
Cela pesem traja {video_duration:.1f}s. Cilj: izrezati ~{target_duration}s odsek za TikTok/Instagram Reel.
POMEMBNO: Whisper je avtomatski STT in pogosto naredi napake, posebej pri:
- slovanskih jezikih (slovenščina, hrvaščina, bosanščina, srbščina)
- narečnih izrazih
- ko glasba prevladuje nad vokalom
PROSIM:
1. Preberi celoten tekst in razumi strukturo (intro / verz / pre-chorus / refren / bridge / outro)
2. POPRAVI očitne napake v transkripciji:
- Če pesem ima refren ki se ponavlja, vse pojavitve refrena POPRAVI da imajo ENAKO besedilo (uporabi najjasnejšo varianto)
- Popravi napačne besede ki nimajo smisla v kontekstu
- Popravi pomešane jezike (če pesem je slovenska, vse vrstice naj bodo v slovenščini)
- Ohrani timestamp-e nespremenjene
3. Prepoznaj REFREN: del besedila, ki se ponavlja v pesmi
4. Izberi najboljši odsek za reel:
- Vključi cel refren (cel verz besedila brez prekinitve)
- Če imaš prostor, dodaj pre-chorus build-up tik pred refrenom
- Lahko traja 20-45 sekund (ne strogo 30s)
- Začni in končaj na smiselni meji (konec stavka, ne sredi besede)
5. Če pesem nima jasnega refrena (instrumental, monolog, govor), izberi najbolj dramatičen ali zaključen del
Odgovori SAMO v JSON formatu (brez markdown, brez razlage):
{{
"start": <sekunde>,
"end": <sekunde>,
"reason": "<kratka razlaga zakaj ta odsek>",
"chorus_text": "<besedilo refrena ali ključni del>",
"structure": "<1 stavek o strukturi pesmi>",
"language": "<jezik: sl/de/hr/bs/sr/en/it/es/fr>",
"corrected_segments": [
{{"start": <s>, "end": <s>, "text": "<popravljeno besedilo>"}}
]
}}
V "corrected_segments" vključi VSE segmente iz inputa s popravljenim besedilom (ohrani timestamp-e)."""
def _parse_llm_response(text, video_duration):
    """Parse the JSON reply of an LLM into a clip-range dict.

    A Markdown code fence and any prose around the outermost ``{...}``
    block are stripped before decoding.

    Args:
        text: Raw text returned by the model.
        video_duration: Video length in seconds, used to validate the range.

    Returns:
        A dict with ``start``, ``end``, ``duration`` (all rounded to two
        decimals), ``reason``, ``chorus_text``, ``structure``, ``language``
        and ``corrected_segments``. Returns ``None`` (after printing a
        warning to stderr) when the text is not valid JSON, is not a JSON
        object, lacks numeric ``start``/``end``, or the range does not
        satisfy ``0 <= start < end <= video_duration``.
    """
    text = (text or "").strip()
    # Strip a Markdown code fence if present.
    if text.startswith("```"):
        text = re.sub(r"^```(?:json)?\s*", "", text)
        text = re.sub(r"\s*```$", "", text)
    # Sometimes an explanation precedes the JSON; keep the outermost {...}.
    first_brace = text.find("{")
    last_brace = text.rfind("}")
    if first_brace >= 0 and last_brace > first_brace:
        text = text[first_brace:last_brace + 1]

    try:
        result = json.loads(text)
    except ValueError as e:
        # json.JSONDecodeError is a ValueError; typical for truncated output.
        print(f" ⚠️ LLM returned invalid JSON: {e}", file=sys.stderr)
        return None
    if not isinstance(result, dict):
        print(f" ⚠️ LLM returned {type(result).__name__} instead of a JSON object",
              file=sys.stderr)
        return None

    try:
        start = float(result["start"])
        end = float(result["end"])
    except (KeyError, TypeError, ValueError) as e:
        print(f" ⚠️ LLM response has no usable start/end: {e!r}", file=sys.stderr)
        return None

    # The chained comparison is False for NaN bounds, so they are rejected.
    if not (0 <= start < end <= video_duration):
        print(f" ⚠️ LLM returned invalid range: {start}-{end}", file=sys.stderr)
        return None

    return {
        "start": round(start, 2),
        "end": round(end, 2),
        "duration": round(end - start, 2),
        "reason": result.get("reason", ""),
        "chorus_text": result.get("chorus_text", ""),
        "structure": result.get("structure", ""),
        "language": result.get("language"),
        "corrected_segments": result.get("corrected_segments"),
    }
def analyze_with_claude(transcript, video_duration, target_duration=30, model="claude-sonnet-4-6"):
    """Send the transcript to the Claude API (Anthropic) for analysis.

    Args:
        transcript: Transcript dict with a ``segments`` list.
        video_duration: Video length in seconds.
        target_duration: Desired clip length passed into the prompt.
        model: claude-sonnet-4-6 (default), claude-haiku-4-5-20251001,
            claude-opus-4-7.

    Returns:
        The parsed result dict with an added ``source`` key
        (``"claude:<model>"``), or ``None`` when the API key is missing,
        the transcript has no segments, the request fails, the reply was
        cut off by ``max_tokens``, the reply has no text, or it cannot be
        parsed.
    """
    api_key = os.environ.get("ANTHROPIC_API_KEY")
    if not api_key:
        print(" ⚠️ ANTHROPIC_API_KEY ni nastavljen — preskakujem Claude analizo", file=sys.stderr)
        return None
    if not transcript.get("segments"):
        return None

    # Imported before the try block so the ``except urllib.error...``
    # clauses below can always resolve the name.
    import urllib.error
    import urllib.request

    prompt = _build_analysis_prompt(transcript, video_duration, target_duration)
    try:
        body = json.dumps({
            "model": model,
            # 8192 is enough for ~250 corrected_segments plus the other
            # metadata for long songs; a safe default.
            "max_tokens": 8192,
            "messages": [{"role": "user", "content": prompt}],
        }).encode("utf-8")
        req = urllib.request.Request(
            "https://api.anthropic.com/v1/messages",
            data=body,
            headers={
                "Content-Type": "application/json",
                "x-api-key": api_key,
                "anthropic-version": "2023-06-01",
            },
            method="POST",
        )
        with urllib.request.urlopen(req, timeout=120) as resp:
            data = json.loads(resp.read().decode("utf-8"))

        content = data.get("content", [])
        if not content:
            print(" ⚠️ Claude vrnil prazen odgovor", file=sys.stderr)
            return None

        # Diagnostics: a reply cut off by the token limit has incomplete JSON.
        stop_reason = data.get("stop_reason")
        if stop_reason == "max_tokens":
            usage = data.get("usage", {})
            print(
                f" ⚠️ Claude odrezan (max_tokens): "
                f"input={usage.get('input_tokens')} output={usage.get('output_tokens')}",
                file=sys.stderr,
            )
            return None

        # Join every text block instead of trusting that the first content
        # block is the (only) text block.
        text = "".join(
            part.get("text", "")
            for part in content
            if isinstance(part, dict) and part.get("type", "text") == "text"
        ).strip()
        if not text:
            print(f" ⚠️ Claude vrnil prazen text (stop_reason={stop_reason})", file=sys.stderr)
            return None

        result = _parse_llm_response(text, video_duration)
        if not result:
            return None
        print(f" 🤖 Claude ({model}) izbral: {result['start']:.1f}-{result['end']:.1f}s", file=sys.stderr)
        print(f" Razlog: {result.get('reason', '')[:80]}", file=sys.stderr)
        print(f" Struktura: {result.get('structure', '')[:80]}", file=sys.stderr)
        if result.get("corrected_segments"):
            print(f" Popravljeni segmenti: {len(result['corrected_segments'])}", file=sys.stderr)
        result["source"] = f"claude:{model}"
        return result
    except urllib.error.HTTPError as e:
        err_body = e.read().decode("utf-8", errors="replace")[:500]
        print(f" ❌ Claude API HTTP {e.code}: {err_body}", file=sys.stderr)
        return None
    except Exception as e:
        # Best effort: any other failure falls back to the local heuristic.
        print(f" ❌ Claude analysis failed: {e}", file=sys.stderr)
        return None
def analyze_with_gemini(transcript, video_duration, target_duration=30, model="gemini-3.1-pro-preview"):
    """Send the transcript to the Gemini API (Google) for analysis.

    The API key is read from ``GEMINI_API_KEY`` or ``GOOGLE_API_KEY`` and
    sent in the ``x-goog-api-key`` request header rather than in the URL,
    so it does not end up in URLs recorded by proxies or logs.

    Args:
        transcript: Transcript dict with a ``segments`` list.
        video_duration: Video length in seconds.
        target_duration: Desired clip length passed into the prompt.
        model: Gemini model name used in the request URL.

    Returns:
        The parsed result dict with an added ``source`` key
        (``"gemini:<model>"``), or ``None`` when the API key is missing,
        the transcript has no segments, the request fails, the reply was
        cut off (``MAX_TOKENS``), the reply has no text, or it cannot be
        parsed.
    """
    api_key = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
    if not api_key:
        print(" ⚠️ GEMINI_API_KEY ni nastavljen — preskakujem Gemini analizo", file=sys.stderr)
        return None
    if not transcript.get("segments"):
        return None

    # Imported before the try block so the ``except urllib.error...``
    # clauses below can always resolve the name.
    import urllib.error
    import urllib.request

    prompt = _build_analysis_prompt(transcript, video_duration, target_duration)
    try:
        url = f"https://generativelanguage.googleapis.com/v1beta/models/{model}:generateContent"
        # Gemini 3.x Pro is a THINKING model: it also spends tokens on
        # internal reasoning (thoughtsTokenCount). 4096 was too low: with
        # large transcripts thinking can use 1500-3000 tokens and the
        # output (corrected_segments for 60+ segments) another 3000-7000,
        # which cut the JSON in half (finishReason: MAX_TOKENS).
        # 32768 leaves room for thinking plus the full JSON for long songs.
        body = json.dumps({
            "contents": [{
                "role": "user",
                "parts": [{"text": prompt}],
            }],
            "generationConfig": {
                "temperature": 0.1,
                "maxOutputTokens": 32768,
                "responseMimeType": "application/json",
            },
        }).encode("utf-8")
        req = urllib.request.Request(
            url,
            data=body,
            headers={
                "Content-Type": "application/json",
                "x-goog-api-key": api_key,
            },
            method="POST",
        )
        with urllib.request.urlopen(req, timeout=180) as resp:
            data = json.loads(resp.read().decode("utf-8"))

        candidates = data.get("candidates", [])
        if not candidates:
            print(" ⚠️ Gemini vrnil 0 candidates", file=sys.stderr)
            return None
        cand0 = candidates[0]
        finish_reason = cand0.get("finishReason", "?")
        usage = data.get("usageMetadata", {})

        # Diagnostics: with finishReason == MAX_TOKENS the output is cut
        # off and the JSON is invalid.
        if finish_reason == "MAX_TOKENS":
            print(
                f" ⚠️ Gemini odrezan (MAX_TOKENS): "
                f"prompt={usage.get('promptTokenCount')} "
                f"thoughts={usage.get('thoughtsTokenCount')} "
                f"output={usage.get('candidatesTokenCount')}",
                file=sys.stderr,
            )
            return None

        parts = cand0.get("content", {}).get("parts", [])
        if not parts:
            print(
                f" ⚠️ Gemini vrnil prazen content (finishReason={finish_reason}, "
                f"thoughts={usage.get('thoughtsTokenCount')})",
                file=sys.stderr,
            )
            return None

        # Join all answer parts instead of reading only the first one;
        # parts flagged as "thought" are reasoning summaries, not the answer.
        text = "".join(
            part.get("text", "")
            for part in parts
            if isinstance(part, dict) and not part.get("thought")
        ).strip()
        if not text:
            print(
                f" ⚠️ Gemini vrnil prazen text (finishReason={finish_reason}, "
                f"thoughts={usage.get('thoughtsTokenCount')}, "
                f"output={usage.get('candidatesTokenCount')})",
                file=sys.stderr,
            )
            return None

        result = _parse_llm_response(text, video_duration)
        if not result:
            return None
        print(f" 🤖 Gemini ({model}) izbral: {result['start']:.1f}-{result['end']:.1f}s", file=sys.stderr)
        print(f" Razlog: {result.get('reason', '')[:80]}", file=sys.stderr)
        print(f" Struktura: {result.get('structure', '')[:80]}", file=sys.stderr)
        if result.get("corrected_segments"):
            print(f" Popravljeni segmenti: {len(result['corrected_segments'])}", file=sys.stderr)
        result["source"] = f"gemini:{model}"
        return result
    except urllib.error.HTTPError as e:
        err_body = e.read().decode("utf-8", errors="replace")[:500]
        print(f" ❌ Gemini API HTTP {e.code}: {err_body}", file=sys.stderr)
        return None
    except Exception as e:
        # Best effort: any other failure falls back to the local heuristic.
        print(f" ❌ Gemini analysis failed: {e}", file=sys.stderr)
        return None
def analyze_with_llm(transcript, video_duration, target_duration=30, provider="claude", llm_model=None):
    """Run the LLM analysis with the selected provider.

    Args:
        transcript: Transcript dict with a ``segments`` list.
        video_duration: Video length in seconds.
        target_duration: Desired clip length passed into the prompt.
        provider: ``"claude"``, ``"gemini"`` or ``"auto"`` (Claude first,
            Gemini as fallback).
        llm_model: Optional model name overriding the provider default. In
            ``auto`` mode a name starting with ``claude`` or ``gemini`` is
            applied only to the matching provider; the other provider uses
            its default model.

    Returns:
        The provider's result dict, or ``None`` when the analysis failed or
        the provider is unknown.
    """
    claude_default = "claude-sonnet-4-6"
    gemini_default = "gemini-3.1-pro-preview"

    if provider == "gemini":
        return analyze_with_gemini(transcript, video_duration, target_duration,
                                   llm_model or gemini_default)
    if provider == "claude":
        return analyze_with_claude(transcript, video_duration, target_duration,
                                   llm_model or claude_default)
    if provider == "auto":
        # A provider-specific model name must not be sent to the other
        # provider: it would make the fallback request fail.
        lowered = (llm_model or "").lower()
        claude_model = llm_model or claude_default
        gemini_model = llm_model or gemini_default
        if lowered.startswith("gemini"):
            claude_model = claude_default
        elif lowered.startswith("claude"):
            gemini_model = gemini_default

        # Try Claude first, fall back to Gemini.
        result = analyze_with_claude(transcript, video_duration, target_duration, claude_model)
        if result:
            return result
        print(" 🔄 Claude ni uspel, probam Gemini...", file=sys.stderr)
        return analyze_with_gemini(transcript, video_duration, target_duration, gemini_model)

    print(f" ⚠️ Neznan LLM provider: {provider}", file=sys.stderr)
    return None
def is_instrumental(transcript, video_duration, threshold=0.1):
    """Decide whether the track should be treated as instrumental.

    Returns ``True`` when the transcript has no segments, or when the summed
    segment durations make up less than *threshold* of the video duration
    (durations below one second are treated as one second in the ratio).
    """
    segments = transcript.get("segments")
    if not segments:
        return True
    sung_seconds = 0
    for seg in segments:
        sung_seconds += seg["end"] - seg["start"]
    vocal_share = sung_seconds / max(video_duration, 1)
    return bool(vocal_share < threshold)
def _merge_corrected_segments(original_segments, corrected):
    """Combine LLM-corrected segment text with the original word timings.

    Each usable corrected entry (numeric ``start``/``end`` plus ``text``)
    becomes a segment. Its ``words`` list is copied from the original
    segment with the same start (rounded to 0.1 s) or, failing that, the
    closest start within one second; otherwise it is empty. Malformed
    entries are skipped. Returns an empty list when *corrected* is not a
    list or contains no usable entry.
    """
    if not isinstance(corrected, list):
        return []
    orig_by_start = {round(s["start"], 1): s for s in original_segments}
    merged = []
    for item in corrected:
        try:
            c_start = float(item["start"])
            c_end = float(item["end"])
            c_text = str(item["text"]).strip()
        except (KeyError, ValueError, TypeError):
            continue
        # Find the original segment with the same (or a close) start to
        # reuse its word-level timing.
        orig = orig_by_start.get(round(c_start, 1))
        if not orig:
            closest_diff = 999
            for s in original_segments:
                diff = abs(s["start"] - c_start)
                if diff < closest_diff and diff < 1.0:
                    closest_diff = diff
                    orig = s
        merged.append({
            "start": c_start,
            "end": c_end,
            "text": c_text,
            # The LLM does not return words, so word timing cannot be
            # updated; keep the original list when there is one.
            "words": orig.get("words", []) if orig else [],
        })
    return merged


def main():
    """Command-line entry point: analyse one video and emit clip metadata.

    Steps: extract audio, transcribe with Whisper, compute the energy
    profile, detect instrumental tracks, optionally ask an LLM for the clip
    range and corrected lyrics, fall back to the local chorus heuristic,
    compute fade parameters, and write or print the resulting JSON. All
    progress messages go to stderr. Exits with status 1 when the video does
    not exist or its duration cannot be determined.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("video", help="Vhod video file")
    ap.add_argument("--lang", default=None, help="ISO 639-1 ali 'auto' (default: auto)")
    ap.add_argument("--model", default="large-v3", help="Whisper model")
    ap.add_argument("--target-duration", type=float, default=30.0)
    ap.add_argument("--max-duration", type=float, default=45.0)
    ap.add_argument("--min-duration", type=float, default=20.0)
    ap.add_argument("--include-prebuild", action="store_true",
                    help="Vključi pre-chorus build-up (privzeto: ne)")
    ap.add_argument("--no-claude", action="store_true",
                    help="Preskoči LLM analizo (uporabi samo lokalno heuristiko)")
    ap.add_argument("--llm-provider", default="claude",
                    choices=["claude", "gemini", "auto"],
                    help="Kateri LLM uporabiti za analizo (default: claude)")
    ap.add_argument("--llm-model", default=None,
                    help="Specifičen model (npr. claude-sonnet-4-6, gemini-3.1-pro-preview)")
    ap.add_argument("--json", action="store_true", help="Output JSON")
    ap.add_argument("--output", help="Path za JSON output")
    args = ap.parse_args()

    video = Path(args.video)
    if not video.exists():
        print(f"❌ Video ne obstaja: {video}", file=sys.stderr)
        sys.exit(1)
    duration = get_video_duration(video)
    if duration <= 0:
        # get_video_duration reports failure as 0.0; every later range
        # computation would be meaningless with that value.
        print(f"❌ Ne morem določiti trajanja videa: {video}", file=sys.stderr)
        sys.exit(1)
    print(f"📹 Video: {video.name}, {duration:.1f}s", file=sys.stderr)

    # 1. Extract audio.
    audio = extract_audio(video)
    try:
        # 2. Whisper transcript.
        lang = None if args.lang in (None, "auto", "") else args.lang
        transcript = transcribe_full(audio, lang=lang, model_size=args.model)
        print(f" Transkripcija: {len(transcript['segments'])} segmentov", file=sys.stderr)

        # 3. Energy profile.
        print("⚡ Energy profile...", file=sys.stderr)
        energies = compute_energy_profile(audio)
        print(f" Energy samples: {len(energies)}", file=sys.stderr)

        # 4. Instrumental detection.
        instrumental = is_instrumental(transcript, duration)
        print(f"🎵 Instrumentalna: {instrumental}", file=sys.stderr)

        # 5a. PRIMARY: LLM analysis (understands the whole lyrics + fixes).
        claude_result = None
        if not instrumental and not args.no_claude:
            provider = args.llm_provider
            print(f"🤖 Pošiljam transkript {provider}-u za analizo...", file=sys.stderr)
            claude_result = analyze_with_llm(
                transcript, duration, target_duration=args.target_duration,
                provider=provider, llm_model=args.llm_model,
            )

        # 5b. Find the chorus locally (as fallback or for the score preview).
        if not instrumental:
            chorus = find_chorus(transcript, energies, duration)
        else:
            # Instrumental: pick the window with the highest average energy.
            window = args.target_duration
            best_start = 0
            best_avg = -100
            t = 0
            while t + window <= duration:
                avg = avg_energy_in_range(energies, t, t + window)
                if avg > best_avg:
                    best_avg = avg
                    best_start = t
                t += 5  # step 5s
            chorus = {
                "best": {
                    "start": best_start,
                    "end": best_start + window,
                    "duration": window,
                    "text_preview": "(instrumental — energy peak)",
                    "score": 0,
                    "avg_rms": round(best_avg, 2),
                },
                "all_candidates": [],
                "avg_rms_total": round(
                    sum(r for (_, r) in energies) / len(energies) if energies else -30, 2
                ),
            }

        # 6. Clip range: the LLM result wins, otherwise smart_clip_range.
        if claude_result:
            clip_range = {
                "start": claude_result["start"],
                "end": claude_result["end"],
                "duration": claude_result["duration"],
                "reason": "claude_llm: " + claude_result.get("reason", ""),
                "chorus_text": claude_result.get("chorus_text", ""),
                "structure": claude_result.get("structure", ""),
                # "source" keeps its historical value; "llm_source" records
                # which provider/model actually produced the result.
                "source": "claude",
                "llm_source": claude_result.get("source"),
            }
            # Apply the max_duration cap if the LLM overshoots.
            if clip_range["duration"] > args.max_duration:
                clip_range["end"] = clip_range["start"] + args.max_duration
                clip_range["duration"] = args.max_duration
                clip_range["reason"] += " (capped at max_duration)"
        else:
            clip_range = smart_clip_range(
                chorus, transcript, duration,
                target_duration=args.target_duration,
                max_duration=args.max_duration,
                min_duration=args.min_duration,
                include_prebuild=args.include_prebuild,
            )
            clip_range["source"] = "local_heuristic"
        print(f"✂ Clip range: {clip_range['start']:.1f}s - {clip_range['end']:.1f}s "
              f"(duration: {clip_range['duration']}s, source: {clip_range.get('source')})",
              file=sys.stderr)

        # If the LLM returned corrected segments, use them (better subtitles).
        corrected = claude_result.get("corrected_segments") if claude_result else None
        if corrected:
            # Keep word-level timing from the original, update only the text.
            new_segments = _merge_corrected_segments(transcript["segments"], corrected)
            if new_segments:
                transcript["segments"] = new_segments
                transcript["claude_corrected"] = True
                # Also update the language if the LLM disagrees.
                if claude_result.get("language") and claude_result["language"] != transcript["language"]:
                    print(f" ✏️ Claude je popravil jezik: {transcript['language']} → {claude_result['language']}", file=sys.stderr)
                    transcript["language"] = claude_result["language"]
                print(f" ✏️ Whisper segmenti zamenjani s Claude popravljenimi ({len(new_segments)})", file=sys.stderr)
            else:
                # Nothing usable came back: keep the Whisper transcript
                # rather than replacing it with an empty list.
                print(" ⚠️ corrected_segments niso uporabni — ohranjam Whisper segmente", file=sys.stderr)

        # 7. Fade params (may extend the clip end if it ends mid-vocal).
        fade = detect_audio_fade(clip_range, transcript, video_duration=duration)
        print(f"🎚 Fade: in={fade['fade_in']}s, out={fade['fade_out']}s", file=sys.stderr)
        # Apply the extension suggested by the fade detection, still
        # respecting max_duration.
        if fade.get("extended_end") and fade["extended_end"] > clip_range["end"]:
            old_end = clip_range["end"]
            new_end = min(fade["extended_end"], clip_range["start"] + args.max_duration)
            clip_range["end"] = round(new_end, 2)
            clip_range["duration"] = round(new_end - clip_range["start"], 2)
            print(f" ↳ Razširjen za {new_end - old_end:.1f}s (zaključek besedila)",
                  file=sys.stderr)

        result = {
            "video": str(video),
            "video_duration": duration,
            "language": transcript["language"],
            "language_probability": transcript["language_probability"],
            "instrumental": instrumental,
            "transcript": transcript,
            "chorus": chorus,
            "clip_range": clip_range,
            "fade": fade,
            "claude_used": claude_result is not None,
            # True only when corrected segments were actually applied.
            "claude_corrected_text": bool(transcript.get("claude_corrected")),
        }
        if args.output:
            with open(args.output, "w", encoding="utf-8") as f:
                json.dump(result, f, ensure_ascii=False, indent=2)
            print(f"💾 Saved: {args.output}", file=sys.stderr)
        if args.json:
            print(json.dumps(result, ensure_ascii=False))
    finally:
        # Always remove the temporary WAV created by extract_audio.
        try:
            os.unlink(audio)
        except OSError:
            pass
# Run the command-line interface only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()