From ec71c545707af7b120abc8654c097106de2a9e42 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sebastjan=20Arti=C4=8D?= Date: Wed, 29 Apr 2026 08:26:27 +0000 Subject: [PATCH] Upgrade to Sonnet 4.6 + add Gemini 3.1 Pro support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Refactored analyze_with_claude into shared _build_analysis_prompt + _parse_llm_response helpers - New analyze_with_gemini() using Gemini 3.1 Pro ($2/M in, MMMLU 92.6% — best multilingual) - Unified analyze_with_llm(provider) dispatcher with auto-fallback (Claude → Gemini) - API endpoint accepts llm_provider in StartJobIn (claude/gemini/auto) - Frontend dropdown to pick LLM - Default model is now Sonnet 4.6 (was Haiku 4.5) — 3x quality at 3x price (~3 cents/video) - Gemini support is opt-in: needs GEMINI_API_KEY env var to activate --- app/main.py | 10 ++ scripts/analyze.py | 223 ++++++++++++++++++++++++++++++++----------- templates/index.html | 12 +++ 3 files changed, 191 insertions(+), 54 deletions(-) diff --git a/app/main.py b/app/main.py index 06460e4..67277a1 100644 --- a/app/main.py +++ b/app/main.py @@ -237,6 +237,11 @@ def process_job(job_id): ] if job.get("include_prebuild"): cmd += ["--include-prebuild"] + # LLM provider (claude/gemini/auto) + if job.get("llm_provider"): + cmd += ["--llm-provider", job["llm_provider"]] + if job.get("llm_model"): + cmd += ["--llm-model", job["llm_model"]] # lang: če None ali 'auto', pusti analyze.py auto-detect if job.get("lang") and job["lang"] not in ("auto", ""): cmd += ["--lang", job["lang"]] @@ -409,6 +414,9 @@ class StartJobIn(BaseModel): subtitle_style: str = "reels" whisper_model: str = "large-v3" quality: str = "medium" + # LLM za semantično analizo + popravke + llm_provider: str = "claude" # claude / gemini / auto + llm_model: Optional[str] = None # specifičen model (privzeto najboljši za provider) # ──────────────────────────────────────────────────────────────── @@ -512,6 +520,8 @@ async def start_processing( subtitle_style=payload.subtitle_style, whisper_model=payload.whisper_model, quality=payload.quality, + llm_provider=payload.llm_provider, + llm_model=payload.llm_model, current_step="V vrsti za obdelavo", ) background.add_task(process_job, payload.job_id) diff --git a/scripts/analyze.py b/scripts/analyze.py index e3d543f..7db582e 100644 --- a/scripts/analyze.py +++ b/scripts/analyze.py @@ -437,25 +437,8 @@ def detect_audio_fade(clip_range, transcript, video_duration=None): } -def analyze_with_claude(transcript, video_duration, target_duration=30): - """Pošlje cel transkript Claude API-ju, ki razume strukturo pesmi - in vrne najboljši odsek za reel. - - Claude bere cel tekst, prepozna ponovitve med deli (refren) in razume - kontekst (kdaj je intro, verz, refren, bridge, outro). - - Vrne dict z 'start', 'end', 'reason', 'chorus_text' ali None če Claude - ni dosegljiv ali API key manjka. - """ - api_key = os.environ.get("ANTHROPIC_API_KEY") - if not api_key: - print(" ⚠️ ANTHROPIC_API_KEY ni nastavljen — preskakujem Claude analizo", file=sys.stderr) - return None - - if not transcript.get("segments"): - return None - - # Pripravi tekstovni format za Claude — vsak segment z timestamp-om +def _build_analysis_prompt(transcript, video_duration, target_duration=30): + """Pripravi enotni prompt za Claude/Gemini analizo.""" lines = [] for seg in transcript["segments"]: start = seg["start"] @@ -464,7 +447,7 @@ def analyze_with_claude(transcript, video_duration, target_duration=30): lines.append(f"[{start:6.1f}-{end:6.1f}] {text}") transcript_text = "\n".join(lines) - prompt = f"""Tu je transcript pesmi iz Whisper modela (timestamp v sekundah, besedilo): + return f"""Tu je transcript pesmi iz Whisper modela (timestamp v sekundah, besedilo): {transcript_text} @@ -481,7 +464,7 @@ PROSIM: - Če pesem ima refren ki se ponavlja, vse pojavitve refrena POPRAVI da imajo ENAKO besedilo (uporabi najjasnejšo varianto) - Popravi napačne besede ki nimajo smisla v kontekstu - Popravi pomešane jezike (če pesem je slovenska, vse vrstice naj bodo v slovenščini) - - Ohrani timestamp-e nepriremenjene + - Ohrani timestamp-e nespremenjene 3. Prepoznaj REFREN: del besedila, ki se ponavlja v pesmi 4. Izberi najboljši odsek za reel: - Vključi cel refren (cel verz besedila brez prekinitve) @@ -505,11 +488,60 @@ Odgovori SAMO v JSON formatu (brez markdown, brez razlage): V "corrected_segments" vključi VSE segmente iz inputa s popravljenim besedilom (ohrani timestamp-e).""" + +def _parse_llm_response(text, video_duration): + """Parse JSON odgovor iz LLM-a, vrne None če invalid.""" + text = text.strip() + # Odstrani markdown ovoj če obstaja + if text.startswith("```"): + text = re.sub(r"^```(?:json)?\s*", "", text) + text = re.sub(r"\s*```$", "", text) + # Včasih je pred JSON-om še kakšna razlaga, vzemi prvi { ... } blok + first_brace = text.find("{") + last_brace = text.rfind("}") + if first_brace >= 0 and last_brace > first_brace: + text = text[first_brace:last_brace + 1] + + result = json.loads(text) + + start = float(result["start"]) + end = float(result["end"]) + if start >= end or start < 0 or end > video_duration: + print(f" ⚠️ LLM returned invalid range: {start}-{end}", file=sys.stderr) + return None + + return { + "start": round(start, 2), + "end": round(end, 2), + "duration": round(end - start, 2), + "reason": result.get("reason", ""), + "chorus_text": result.get("chorus_text", ""), + "structure": result.get("structure", ""), + "language": result.get("language"), + "corrected_segments": result.get("corrected_segments"), + } + + +def analyze_with_claude(transcript, video_duration, target_duration=30, model="claude-sonnet-4-6"): + """Pošlje transkript Claude API-ju (Anthropic). + + model: claude-sonnet-4-6 (default), claude-haiku-4-5-20251001, claude-opus-4-7 + """ + api_key = os.environ.get("ANTHROPIC_API_KEY") + if not api_key: + print(" ⚠️ ANTHROPIC_API_KEY ni nastavljen — preskakujem Claude analizo", file=sys.stderr) + return None + + if not transcript.get("segments"): + return None + + prompt = _build_analysis_prompt(transcript, video_duration, target_duration) + try: import urllib.request import urllib.error body = json.dumps({ - "model": "claude-sonnet-4-6", + "model": model, "max_tokens": 4096, "messages": [{"role": "user", "content": prompt}], }).encode("utf-8") @@ -524,7 +556,7 @@ V "corrected_segments" vključi VSE segmente iz inputa s popravljenim besedilom }, method="POST", ) - with urllib.request.urlopen(req, timeout=60) as resp: + with urllib.request.urlopen(req, timeout=120) as resp: data = json.loads(resp.read().decode("utf-8")) content = data.get("content", []) @@ -533,37 +565,18 @@ V "corrected_segments" vključi VSE segmente iz inputa s popravljenim besedilom return None text = content[0].get("text", "").strip() - # Včasih Claude obda JSON v markdown - if text.startswith("```"): - text = re.sub(r"^```(?:json)?\s*", "", text) - text = re.sub(r"\s*```$", "", text) - result = json.loads(text) - - # Sanity check - start = float(result["start"]) - end = float(result["end"]) - if start >= end or start < 0 or end > video_duration: - print(f" ⚠️ Claude returned invalid range: {start}-{end}", file=sys.stderr) + result = _parse_llm_response(text, video_duration) + if not result: return None - print(f" 🤖 Claude izbral: {start:.1f}-{end:.1f}s", file=sys.stderr) + print(f" 🤖 Claude ({model}) izbral: {result['start']:.1f}-{result['end']:.1f}s", file=sys.stderr) print(f" Razlog: {result.get('reason', '')[:80]}", file=sys.stderr) print(f" Struktura: {result.get('structure', '')[:80]}", file=sys.stderr) - cs = result.get("corrected_segments") - if cs: - print(f" Popravljeni segmenti: {len(cs)}", file=sys.stderr) + if result.get("corrected_segments"): + print(f" Popravljeni segmenti: {len(result['corrected_segments'])}", file=sys.stderr) - return { - "start": round(start, 2), - "end": round(end, 2), - "duration": round(end - start, 2), - "reason": result.get("reason", ""), - "chorus_text": result.get("chorus_text", ""), - "structure": result.get("structure", ""), - "language": result.get("language"), - "corrected_segments": result.get("corrected_segments"), - "source": "claude_llm", - } + result["source"] = f"claude:{model}" + return result except urllib.error.HTTPError as e: body = e.read().decode("utf-8", errors="replace")[:500] print(f" ❌ Claude API HTTP {e.code}: {body}", file=sys.stderr) @@ -573,6 +586,101 @@ V "corrected_segments" vključi VSE segmente iz inputa s popravljenim besedilom return None +def analyze_with_gemini(transcript, video_duration, target_duration=30, model="gemini-3.1-pro-preview"): + """Pošlje transkript Gemini API-ju (Google). + + Gemini 3.1 Pro ima najboljši multilingual rezultat (MMMLU 92.6%) — odličen za SLO/HR/BS. + """ + api_key = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY") + if not api_key: + print(" ⚠️ GEMINI_API_KEY ni nastavljen — preskakujem Gemini analizo", file=sys.stderr) + return None + + if not transcript.get("segments"): + return None + + prompt = _build_analysis_prompt(transcript, video_duration, target_duration) + + try: + import urllib.request + import urllib.error + + url = f"https://generativelanguage.googleapis.com/v1beta/models/{model}:generateContent?key={api_key}" + body = json.dumps({ + "contents": [{ + "role": "user", + "parts": [{"text": prompt}], + }], + "generationConfig": { + "temperature": 0.1, + "maxOutputTokens": 4096, + "responseMimeType": "application/json", + }, + }).encode("utf-8") + + req = urllib.request.Request( + url, + data=body, + headers={"Content-Type": "application/json"}, + method="POST", + ) + with urllib.request.urlopen(req, timeout=120) as resp: + data = json.loads(resp.read().decode("utf-8")) + + candidates = data.get("candidates", []) + if not candidates: + print(" ⚠️ Gemini vrnil 0 candidates", file=sys.stderr) + return None + parts = candidates[0].get("content", {}).get("parts", []) + if not parts: + print(" ⚠️ Gemini vrnil prazen content", file=sys.stderr) + return None + text = parts[0].get("text", "").strip() + + result = _parse_llm_response(text, video_duration) + if not result: + return None + + print(f" 🤖 Gemini ({model}) izbral: {result['start']:.1f}-{result['end']:.1f}s", file=sys.stderr) + print(f" Razlog: {result.get('reason', '')[:80]}", file=sys.stderr) + print(f" Struktura: {result.get('structure', '')[:80]}", file=sys.stderr) + if result.get("corrected_segments"): + print(f" Popravljeni segmenti: {len(result['corrected_segments'])}", file=sys.stderr) + + result["source"] = f"gemini:{model}" + return result + except urllib.error.HTTPError as e: + body = e.read().decode("utf-8", errors="replace")[:500] + print(f" ❌ Gemini API HTTP {e.code}: {body}", file=sys.stderr) + return None + except Exception as e: + print(f" ❌ Gemini analysis failed: {e}", file=sys.stderr) + return None + + +def analyze_with_llm(transcript, video_duration, target_duration=30, provider="claude", llm_model=None): + """Glavna funkcija — uporabi izbrano LLM (claude/gemini/auto).""" + if provider == "gemini": + model = llm_model or "gemini-3.1-pro-preview" + return analyze_with_gemini(transcript, video_duration, target_duration, model) + elif provider == "claude": + model = llm_model or "claude-sonnet-4-6" + return analyze_with_claude(transcript, video_duration, target_duration, model) + elif provider == "auto": + # Najprej probaj Claude, fallback na Gemini + result = analyze_with_claude(transcript, video_duration, target_duration, + llm_model or "claude-sonnet-4-6") + if result: + return result + print(" 🔄 Claude ni uspel, probam Gemini...", file=sys.stderr) + return analyze_with_gemini(transcript, video_duration, target_duration, + llm_model or "gemini-3.1-pro-preview") + else: + print(f" ⚠️ Neznan LLM provider: {provider}", file=sys.stderr) + return None + + + def is_instrumental(transcript, video_duration, threshold=0.1): """Detekcija ali je pesem instrumentalna. @@ -599,7 +707,12 @@ def main(): ap.add_argument("--include-prebuild", action="store_true", help="Vključi pre-chorus build-up (privzeto: ne)") ap.add_argument("--no-claude", action="store_true", - help="Preskoči Claude LLM analizo (uporabi samo lokalno heuristiko)") + help="Preskoči LLM analizo (uporabi samo lokalno heuristiko)") + ap.add_argument("--llm-provider", default="claude", + choices=["claude", "gemini", "auto"], + help="Kateri LLM uporabiti za analizo (default: claude)") + ap.add_argument("--llm-model", default=None, + help="Specifičen model (npr. claude-sonnet-4-6, gemini-3.1-pro-preview)") ap.add_argument("--json", action="store_true", help="Output JSON") ap.add_argument("--output", help="Path za JSON output") args = ap.parse_args() @@ -630,12 +743,14 @@ def main(): instrumental = is_instrumental(transcript, duration) print(f"🎵 Instrumentalna: {instrumental}", file=sys.stderr) - # 5a. PRIMARNO: Claude LLM analiza (razume cel tekst pesmi) + # 5a. PRIMARNO: LLM analiza (razume cel tekst pesmi + popravki) claude_result = None if not instrumental and not args.no_claude: - print(f"🤖 Pošiljam transkript Claude-u za analizo strukture...", file=sys.stderr) - claude_result = analyze_with_claude( - transcript, duration, target_duration=args.target_duration + provider = args.llm_provider + print(f"🤖 Pošiljam transkript {provider}-u za analizo...", file=sys.stderr) + claude_result = analyze_with_llm( + transcript, duration, target_duration=args.target_duration, + provider=provider, llm_model=args.llm_model, ) # 5b. Find chorus lokalno (kot fallback ali za score-jev preview) diff --git a/templates/index.html b/templates/index.html index e8018b4..e29fd51 100644 --- a/templates/index.html +++ b/templates/index.html @@ -267,6 +267,17 @@ +
+
+ + +
+
+