Upgrade to Sonnet 4.6 + add Gemini 3.1 Pro support

- Refactored analyze_with_claude into shared _build_analysis_prompt + _parse_llm_response helpers
- New analyze_with_gemini() using Gemini 3.1 Pro ($2/M in, MMMLU 92.6% — best multilingual)
- Unified analyze_with_llm(provider) dispatcher with auto-fallback (Claude → Gemini)
- API endpoint accepts llm_provider in StartJobIn (claude/gemini/auto)
- Frontend dropdown to pick LLM
- Default model is now Sonnet 4.6 (was Haiku 4.5) — 3x quality at 3x price (~3 cents/video)
- Gemini support is opt-in: needs GEMINI_API_KEY env var to activate
This commit is contained in:
Sebastjan Artič 2026-04-29 08:26:27 +00:00
parent 9faa224885
commit ec71c54570
3 changed files with 191 additions and 54 deletions

View File

@ -237,6 +237,11 @@ def process_job(job_id):
]
if job.get("include_prebuild"):
cmd += ["--include-prebuild"]
# LLM provider (claude/gemini/auto)
if job.get("llm_provider"):
cmd += ["--llm-provider", job["llm_provider"]]
if job.get("llm_model"):
cmd += ["--llm-model", job["llm_model"]]
# lang: če None ali 'auto', pusti analyze.py auto-detect
if job.get("lang") and job["lang"] not in ("auto", ""):
cmd += ["--lang", job["lang"]]
@ -409,6 +414,9 @@ class StartJobIn(BaseModel):
subtitle_style: str = "reels"
whisper_model: str = "large-v3"
quality: str = "medium"
# LLM za semantično analizo + popravke
llm_provider: str = "claude" # claude / gemini / auto
llm_model: Optional[str] = None # specifičen model (privzeto najboljši za provider)
# ────────────────────────────────────────────────────────────────
@ -512,6 +520,8 @@ async def start_processing(
subtitle_style=payload.subtitle_style,
whisper_model=payload.whisper_model,
quality=payload.quality,
llm_provider=payload.llm_provider,
llm_model=payload.llm_model,
current_step="V vrsti za obdelavo",
)
background.add_task(process_job, payload.job_id)

View File

@ -437,25 +437,8 @@ def detect_audio_fade(clip_range, transcript, video_duration=None):
}
def analyze_with_claude(transcript, video_duration, target_duration=30):
"""Pošlje cel transkript Claude API-ju, ki razume strukturo pesmi
in vrne najboljši odsek za reel.
Claude bere cel tekst, prepozna ponovitve med deli (refren) in razume
kontekst (kdaj je intro, verz, refren, bridge, outro).
Vrne dict z 'start', 'end', 'reason', 'chorus_text' ali None če Claude
ni dosegljiv ali API key manjka.
"""
api_key = os.environ.get("ANTHROPIC_API_KEY")
if not api_key:
print(" ⚠️ ANTHROPIC_API_KEY ni nastavljen — preskakujem Claude analizo", file=sys.stderr)
return None
if not transcript.get("segments"):
return None
# Pripravi tekstovni format za Claude — vsak segment z timestamp-om
def _build_analysis_prompt(transcript, video_duration, target_duration=30):
"""Pripravi enotni prompt za Claude/Gemini analizo."""
lines = []
for seg in transcript["segments"]:
start = seg["start"]
@ -464,7 +447,7 @@ def analyze_with_claude(transcript, video_duration, target_duration=30):
lines.append(f"[{start:6.1f}-{end:6.1f}] {text}")
transcript_text = "\n".join(lines)
prompt = f"""Tu je transcript pesmi iz Whisper modela (timestamp v sekundah, besedilo):
return f"""Tu je transcript pesmi iz Whisper modela (timestamp v sekundah, besedilo):
{transcript_text}
@ -481,7 +464,7 @@ PROSIM:
- Če pesem ima refren ki se ponavlja, vse pojavitve refrena POPRAVI da imajo ENAKO besedilo (uporabi najjasnejšo varianto)
- Popravi napačne besede ki nimajo smisla v kontekstu
- Popravi pomešane jezike (če pesem je slovenska, vse vrstice naj bodo v slovenščini)
- Ohrani timestamp-e nepriremenjene
- Ohrani timestamp-e nespremenjene
3. Prepoznaj REFREN: del besedila, ki se ponavlja v pesmi
4. Izberi najboljši odsek za reel:
- Vključi cel refren (cel verz besedila brez prekinitve)
@ -505,11 +488,60 @@ Odgovori SAMO v JSON formatu (brez markdown, brez razlage):
V "corrected_segments" vključi VSE segmente iz inputa s popravljenim besedilom (ohrani timestamp-e)."""
def _parse_llm_response(text, video_duration):
"""Parse JSON odgovor iz LLM-a, vrne None če invalid."""
text = text.strip()
# Odstrani markdown ovoj če obstaja
if text.startswith("```"):
text = re.sub(r"^```(?:json)?\s*", "", text)
text = re.sub(r"\s*```$", "", text)
# Včasih je pred JSON-om še kakšna razlaga, vzemi prvi { ... } blok
first_brace = text.find("{")
last_brace = text.rfind("}")
if first_brace >= 0 and last_brace > first_brace:
text = text[first_brace:last_brace + 1]
result = json.loads(text)
start = float(result["start"])
end = float(result["end"])
if start >= end or start < 0 or end > video_duration:
print(f" ⚠️ LLM returned invalid range: {start}-{end}", file=sys.stderr)
return None
return {
"start": round(start, 2),
"end": round(end, 2),
"duration": round(end - start, 2),
"reason": result.get("reason", ""),
"chorus_text": result.get("chorus_text", ""),
"structure": result.get("structure", ""),
"language": result.get("language"),
"corrected_segments": result.get("corrected_segments"),
}
def analyze_with_claude(transcript, video_duration, target_duration=30, model="claude-sonnet-4-6"):
"""Pošlje transkript Claude API-ju (Anthropic).
model: claude-sonnet-4-6 (default), claude-haiku-4-5-20251001, claude-opus-4-7
"""
api_key = os.environ.get("ANTHROPIC_API_KEY")
if not api_key:
print(" ⚠️ ANTHROPIC_API_KEY ni nastavljen — preskakujem Claude analizo", file=sys.stderr)
return None
if not transcript.get("segments"):
return None
prompt = _build_analysis_prompt(transcript, video_duration, target_duration)
try:
import urllib.request
import urllib.error
body = json.dumps({
"model": "claude-sonnet-4-6",
"model": model,
"max_tokens": 4096,
"messages": [{"role": "user", "content": prompt}],
}).encode("utf-8")
@ -524,7 +556,7 @@ V "corrected_segments" vključi VSE segmente iz inputa s popravljenim besedilom
},
method="POST",
)
with urllib.request.urlopen(req, timeout=60) as resp:
with urllib.request.urlopen(req, timeout=120) as resp:
data = json.loads(resp.read().decode("utf-8"))
content = data.get("content", [])
@ -533,37 +565,18 @@ V "corrected_segments" vključi VSE segmente iz inputa s popravljenim besedilom
return None
text = content[0].get("text", "").strip()
# Včasih Claude obda JSON v markdown
if text.startswith("```"):
text = re.sub(r"^```(?:json)?\s*", "", text)
text = re.sub(r"\s*```$", "", text)
result = json.loads(text)
# Sanity check
start = float(result["start"])
end = float(result["end"])
if start >= end or start < 0 or end > video_duration:
print(f" ⚠️ Claude returned invalid range: {start}-{end}", file=sys.stderr)
result = _parse_llm_response(text, video_duration)
if not result:
return None
print(f" 🤖 Claude izbral: {start:.1f}-{end:.1f}s", file=sys.stderr)
print(f" 🤖 Claude ({model}) izbral: {result['start']:.1f}-{result['end']:.1f}s", file=sys.stderr)
print(f" Razlog: {result.get('reason', '')[:80]}", file=sys.stderr)
print(f" Struktura: {result.get('structure', '')[:80]}", file=sys.stderr)
cs = result.get("corrected_segments")
if cs:
print(f" Popravljeni segmenti: {len(cs)}", file=sys.stderr)
if result.get("corrected_segments"):
print(f" Popravljeni segmenti: {len(result['corrected_segments'])}", file=sys.stderr)
return {
"start": round(start, 2),
"end": round(end, 2),
"duration": round(end - start, 2),
"reason": result.get("reason", ""),
"chorus_text": result.get("chorus_text", ""),
"structure": result.get("structure", ""),
"language": result.get("language"),
"corrected_segments": result.get("corrected_segments"),
"source": "claude_llm",
}
result["source"] = f"claude:{model}"
return result
except urllib.error.HTTPError as e:
body = e.read().decode("utf-8", errors="replace")[:500]
print(f" ❌ Claude API HTTP {e.code}: {body}", file=sys.stderr)
@ -573,6 +586,101 @@ V "corrected_segments" vključi VSE segmente iz inputa s popravljenim besedilom
return None
def analyze_with_gemini(transcript, video_duration, target_duration=30, model="gemini-3.1-pro-preview"):
    """Send the transcript to the Google Gemini API and return the chosen clip.

    Per the original author's note, Gemini was added for its multilingual
    quality (Slovenian / Croatian / Bosnian lyrics).

    Args:
        transcript: Dict with a ``"segments"`` list; passed on to
            ``_build_analysis_prompt``.
        video_duration: Video length in seconds, used to validate the
            returned range.
        target_duration: Desired clip length in seconds, forwarded to the
            prompt builder.
        model: Gemini model id placed in the request URL path.

    Returns:
        The dict produced by ``_parse_llm_response`` with an added
        ``"source"`` key (``"gemini:<model>"``), or ``None`` when no API key
        is configured, the transcript has no segments, the request fails, or
        the reply cannot be parsed.
    """
    # Imported at function scope (as the file already does) but outside the
    # ``try`` so the ``except urllib.error.HTTPError`` clause can always
    # resolve the name.
    import urllib.error
    import urllib.parse
    import urllib.request

    api_key = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
    if not api_key:
        print(" ⚠️ GEMINI_API_KEY ni nastavljen — preskakujem Gemini analizo", file=sys.stderr)
        return None

    if not transcript.get("segments"):
        return None

    prompt = _build_analysis_prompt(transcript, video_duration, target_duration)

    try:
        # ``model`` can originate from user input (--llm-model), so it is
        # percent-encoded to keep it from altering the URL path or query.
        url = (
            "https://generativelanguage.googleapis.com/v1beta/models/"
            f"{urllib.parse.quote(model, safe='')}:generateContent"
        )
        body = json.dumps({
            "contents": [{
                "role": "user",
                "parts": [{"text": prompt}],
            }],
            "generationConfig": {
                "temperature": 0.1,
                "maxOutputTokens": 4096,
                "responseMimeType": "application/json",
            },
        }).encode("utf-8")

        req = urllib.request.Request(
            url,
            data=body,
            headers={
                "Content-Type": "application/json",
                # Send the key as a header rather than a ``?key=`` query
                # parameter so it does not end up in URLs that get logged.
                "x-goog-api-key": api_key,
            },
            method="POST",
        )
        with urllib.request.urlopen(req, timeout=120) as resp:
            data = json.loads(resp.read().decode("utf-8"))

        candidates = data.get("candidates", [])
        if not candidates:
            print(" ⚠️ Gemini vrnil 0 candidates", file=sys.stderr)
            return None

        parts = candidates[0].get("content", {}).get("parts", [])
        if not parts:
            print(" ⚠️ Gemini vrnil prazen content", file=sys.stderr)
            return None

        text = parts[0].get("text", "").strip()
        result = _parse_llm_response(text, video_duration)
        if not result:
            return None

        print(f" 🤖 Gemini ({model}) izbral: {result['start']:.1f}-{result['end']:.1f}s", file=sys.stderr)
        print(f" Razlog: {result.get('reason', '')[:80]}", file=sys.stderr)
        print(f" Struktura: {result.get('structure', '')[:80]}", file=sys.stderr)
        if result.get("corrected_segments"):
            print(f" Popravljeni segmenti: {len(result['corrected_segments'])}", file=sys.stderr)

        result["source"] = f"gemini:{model}"
        return result
    except urllib.error.HTTPError as e:
        # Include (a bounded slice of) the error body; the API explains
        # quota/model errors there.
        body = e.read().decode("utf-8", errors="replace")[:500]
        print(f" ❌ Gemini API HTTP {e.code}: {body}", file=sys.stderr)
        return None
    except Exception as e:
        # Best-effort: any other failure (network, JSON, parsing) falls back
        # to the caller's non-LLM path.
        print(f" ❌ Gemini analysis failed: {e}", file=sys.stderr)
        return None
def analyze_with_llm(transcript, video_duration, target_duration=30, provider="claude", llm_model=None):
    """Dispatch the transcript analysis to the selected LLM provider.

    Args:
        transcript: Transcript dict forwarded to the provider function.
        video_duration: Video length in seconds.
        target_duration: Desired clip length in seconds.
        provider: ``"claude"``, ``"gemini"``, or ``"auto"`` (try Claude
            first, fall back to Gemini).
        llm_model: Optional explicit model id. When omitted, each provider's
            default is used. In ``"auto"`` mode the id is only given to the
            provider it belongs to; the other provider uses its default.

    Returns:
        Whatever the provider function returns (a result dict or ``None``);
        ``None`` for an unknown provider.
    """
    default_claude = "claude-sonnet-4-6"
    default_gemini = "gemini-3.1-pro-preview"

    if provider == "gemini":
        return analyze_with_gemini(transcript, video_duration, target_duration,
                                   llm_model or default_gemini)

    if provider == "claude":
        return analyze_with_claude(transcript, video_duration, target_duration,
                                   llm_model or default_claude)

    if provider == "auto":
        # A single --llm-model cannot be valid for both vendors. Route it to
        # the provider it names, so neither call is made with a model id the
        # other vendor's API is guaranteed to reject.
        names_gemini = bool(llm_model) and llm_model.startswith("gemini")
        claude_model = llm_model if (llm_model and not names_gemini) else default_claude
        gemini_model = llm_model if names_gemini else default_gemini

        # Try Claude first, fall back to Gemini.
        result = analyze_with_claude(transcript, video_duration, target_duration,
                                     claude_model)
        if result:
            return result
        print(" 🔄 Claude ni uspel, probam Gemini...", file=sys.stderr)
        return analyze_with_gemini(transcript, video_duration, target_duration,
                                   gemini_model)

    print(f" ⚠️ Neznan LLM provider: {provider}", file=sys.stderr)
    return None
def is_instrumental(transcript, video_duration, threshold=0.1):
"""Detekcija ali je pesem instrumentalna.
@ -599,7 +707,12 @@ def main():
ap.add_argument("--include-prebuild", action="store_true",
help="Vključi pre-chorus build-up (privzeto: ne)")
ap.add_argument("--no-claude", action="store_true",
help="Preskoči Claude LLM analizo (uporabi samo lokalno heuristiko)")
help="Preskoči LLM analizo (uporabi samo lokalno heuristiko)")
ap.add_argument("--llm-provider", default="claude",
choices=["claude", "gemini", "auto"],
help="Kateri LLM uporabiti za analizo (default: claude)")
ap.add_argument("--llm-model", default=None,
help="Specifičen model (npr. claude-sonnet-4-6, gemini-3.1-pro-preview)")
ap.add_argument("--json", action="store_true", help="Output JSON")
ap.add_argument("--output", help="Path za JSON output")
args = ap.parse_args()
@ -630,12 +743,14 @@ def main():
instrumental = is_instrumental(transcript, duration)
print(f"🎵 Instrumentalna: {instrumental}", file=sys.stderr)
# 5a. PRIMARNO: Claude LLM analiza (razume cel tekst pesmi)
# 5a. PRIMARNO: LLM analiza (razume cel tekst pesmi + popravki)
claude_result = None
if not instrumental and not args.no_claude:
print(f"🤖 Pošiljam transkript Claude-u za analizo strukture...", file=sys.stderr)
claude_result = analyze_with_claude(
transcript, duration, target_duration=args.target_duration
provider = args.llm_provider
print(f"🤖 Pošiljam transkript {provider}-u za analizo...", file=sys.stderr)
claude_result = analyze_with_llm(
transcript, duration, target_duration=args.target_duration,
provider=provider, llm_model=args.llm_model,
)
# 5b. Find chorus lokalno (kot fallback ali za score-jev preview)

View File

@ -267,6 +267,17 @@
</div>
</div>
<div class="row" style="margin-top: 12px;">
<div>
<label>AI za analizo (popravlja transkript + razume strukturo)</label>
<select id="llm-provider">
<option value="claude" selected>Claude Sonnet 4.6 (priporočeno)</option>
<option value="gemini">Gemini 3.1 Pro (multilingual)</option>
<option value="auto">Auto (Claude → Gemini fallback)</option>
</select>
</div>
</div>
<label class="toggle" style="margin-top: 12px;">
<input type="checkbox" id="no-subs">
Brez podnapisov
@ -374,6 +385,7 @@
subtitle_style: $("#subtitle-style").value,
quality: $("#quality").value,
no_subs: $("#no-subs").checked,
llm_provider: $("#llm-provider").value,
};
}