#!/usr/bin/env python3 """ analyze.py — Predhodna analiza CELEGA videa pred trim-anjem. Naredi: 1. Whisper transcript celega videa (auto-detect jezika ali user-specified) 2. Energy profile (RMS dB na 1s windows) 3. Structural detection (vocal/instrumental sections, energy peaks) 4. Pametno izbere clip range (lahko >30s, vključi pre-chorus) 5. Detekcija instrumentalnih pesmi (no_subs auto) Output: JSON s podatki za clip.py """ import argparse import json import os import re import subprocess import sys import tempfile from pathlib import Path def get_video_duration(path): r = subprocess.run( ["ffprobe", "-v", "error", "-show_entries", "format=duration", "-of", "default=nw=1:nokey=1", str(path)], capture_output=True, text=True ) try: return float(r.stdout.strip()) except ValueError: return 0.0 def extract_audio(video_path): """Extract avdio v 16kHz mono WAV za Whisper + energy.""" audio = tempfile.NamedTemporaryFile(suffix=".wav", delete=False) audio.close() subprocess.run( ["ffmpeg", "-y", "-i", str(video_path), "-vn", "-ac", "1", "-ar", "16000", "-c:a", "pcm_s16le", audio.name], check=True, capture_output=True ) return audio.name def transcribe_with_elevenlabs(audio_path, lang=None, model="scribe_v1"): """ElevenLabs Scribe transkripcija (najboljša multilingual accuracy 2026). Lang accepted in ISO 639-1 ('de', 'sl', 'hr') — auto-converted to ISO 639-3. Pricing: ~$0.40/h (~$0.022 per 200s pesem). """ import urllib.request import urllib.error import uuid api_key = os.environ.get("ELEVENLABS_API_KEY") if not api_key: print(" ⚠️ ELEVENLABS_API_KEY ni nastavljen", file=sys.stderr) return None # ISO 639-1 → 639-3 mapping (Scribe uses 639-3) LANG_1_TO_3 = { "en": "eng", "de": "deu", "sl": "slv", "hr": "hrv", "bs": "bos", "sr": "srp", "it": "ita", "es": "spa", "fr": "fra", "pt": "por", "ru": "rus", "pl": "pol", "cs": "ces", "sk": "slk", "hu": "hun", "ro": "ron", "nl": "nld", "sv": "swe", "no": "nor", "da": "dan", "fi": "fin", "tr": "tur", "ar": "ara", "uk": "ukr", "bg": "bul", "el": "ell", "he": "heb", "ja": "jpn", "ko": "kor", "zh": "zho", } # Reverse mapping for parsing response LANG_3_TO_1 = {v: k for k, v in LANG_1_TO_3.items()} # Multipart upload boundary = uuid.uuid4().hex parts = [] def add_text(name, value): parts.append( f"--{boundary}\r\nContent-Disposition: form-data; " f"name=\"{name}\"\r\n\r\n{value}\r\n".encode() ) def add_file(name, filename, content, ctype): parts.append( f"--{boundary}\r\nContent-Disposition: form-data; " f"name=\"{name}\"; filename=\"{filename}\"\r\n" f"Content-Type: {ctype}\r\n\r\n".encode() + content + b"\r\n" ) with open(audio_path, "rb") as f: audio_content = f.read() # Limit: ElevenLabs Scribe supports up to ~25 MB / 4.5h per request if len(audio_content) > 24 * 1024 * 1024: print(f" ⚠️ Audio {len(audio_content)/1024/1024:.1f} MB > 24 MB limit, fallback", file=sys.stderr) return None add_text("model_id", model) add_text("timestamps_granularity", "word") add_text("tag_audio_events", "false") if lang: scribe_lang = LANG_1_TO_3.get(lang, lang) add_text("language_code", scribe_lang) add_file("file", "audio.mp3", audio_content, "audio/mpeg") parts.append(f"--{boundary}--\r\n".encode()) body = b"".join(parts) print(f" 📡 ElevenLabs Scribe ({model}, {len(audio_content)/1024/1024:.1f} MB, " f"lang={lang or 'auto'})...", file=sys.stderr) req = urllib.request.Request( "https://api.elevenlabs.io/v1/speech-to-text", data=body, headers={ "xi-api-key": api_key, "Content-Type": f"multipart/form-data; boundary={boundary}", }, ) try: with urllib.request.urlopen(req, timeout=300) as resp: data = json.loads(resp.read().decode()) except urllib.error.HTTPError as e: body_err = e.read().decode("utf-8", errors="replace")[:500] print(f" ❌ Scribe HTTP {e.code}: {body_err}", file=sys.stderr) return None except Exception as e: print(f" ❌ Scribe exception: {e}", file=sys.stderr) return None # Convert response to our standard format detected_lang_3 = data.get("language_code", "unknown") detected_lang_1 = LANG_3_TO_1.get(detected_lang_3, detected_lang_3[:2]) detected_prob = data.get("language_probability", 1.0) # Scribe returns flat list of words (not segments) # We group words into pseudo-segments using **smart phrase-aware segmentation**: # - Close on long pause (>= 0.4s) — natural breath/phrase boundary # - OR after sentence-ending punctuation (. ! ?) # - OR after 4 seconds (max segment length for readable subtitle) # This gives ~3-7 word segments matching natural sung phrases. words = data.get("words", []) segments = [] if words: # Filter out whitespace tokens real_words = [w for w in words if w.get("text", "").strip()] if real_words: current_seg_words = [] seg_start = real_words[0].get("start", 0) for i, w in enumerate(real_words): current_seg_words.append(w) w_end = w.get("end", w.get("start", 0)) w_text = w.get("text", "") close = False # Decide if we should close the segment if i + 1 < len(real_words): next_start = real_words[i + 1].get("start", w_end) pause = next_start - w_end seg_duration = w_end - seg_start # Trigger close on: # 1. Long pause (>= 0.4s) = phrase boundary # 2. Sentence-ending punctuation # 3. Segment is long enough (>= 4s) if pause >= 0.4: close = True elif seg_duration >= 4.0 and pause >= 0.15: close = True elif w_text.rstrip().endswith(('.', '!', '?')) and pause >= 0.2: close = True elif seg_duration >= 5.5: # hard cap close = True else: close = True # last word if close: seg_text = " ".join(ww.get("text", "") for ww in current_seg_words).strip() if seg_text: segments.append({ "start": seg_start, "end": w_end, "text": seg_text, "words": [ { "start": ww.get("start", 0), "end": ww.get("end", 0), "text": ww.get("text", ""), } for ww in current_seg_words ], }) # Reset current_seg_words = [] if i + 1 < len(real_words): seg_start = real_words[i + 1].get("start", 0) print(f" ✅ Scribe: {len(words)} words → {len(segments)} segments, " f"lang={detected_lang_1} (p={detected_prob:.2f})", file=sys.stderr) return { "language": detected_lang_1, "language_probability": float(detected_prob), "segments": segments, "_provider": "elevenlabs", } def transcribe_full(audio_path, lang=None, model_size="small", provider="auto"): """Whisper/Scribe transcript dispatcher. provider: - "elevenlabs" → ElevenLabs Scribe (najboljša kvaliteta, $0.40/h, ~10s na 200s pesem) - "local" → faster-whisper na CPU (brezplačno, počasi, halucinacije) - "auto" → Scribe če ELEVENLABS_API_KEY obstaja, sicer local """ if provider in ("elevenlabs", "auto") and os.environ.get("ELEVENLABS_API_KEY"): result = transcribe_with_elevenlabs(audio_path, lang=lang) if result and result.get("segments"): return result if provider == "elevenlabs": print(f" ⚠️ Scribe failed, no fallback (provider=elevenlabs)", file=sys.stderr) return {"language": "unknown", "language_probability": 0.0, "segments": []} print(f" 🔄 Scribe failed, fallback na local Whisper...", file=sys.stderr) # Local faster-whisper return _transcribe_full_local(audio_path, lang=lang, model_size=model_size) def _transcribe_full_local(audio_path, lang=None, model_size="small"): """Whisper transcript celega avdia. lang=None → robust auto-detect. Vrne empty transcript če Whisper ne najde govora (popolnoma instrumental).""" from faster_whisper import WhisperModel print(f"🧠 Whisper LOCAL {model_size}, lang={lang or 'auto'}", file=sys.stderr) m = WhisperModel(model_size, device="cpu", compute_type="int8") # Auto-detect z 3-sample voting da se zaklenemo na en jezik if not lang: print(" 🔍 Robust lang detection (3 samples)...", file=sys.stderr) try: duration_proc = subprocess.run( ["ffprobe", "-v", "error", "-show_entries", "format=duration", "-of", "default=nw=1:nokey=1", audio_path], capture_output=True, text=True ) audio_duration = float(duration_proc.stdout.strip()) except Exception: audio_duration = 180.0 lang_votes = {} for ss in [max(15, audio_duration * 0.15), audio_duration * 0.45, audio_duration * 0.75]: if ss + 5 > audio_duration: continue sample = tempfile.NamedTemporaryFile(suffix=".wav", delete=False) sample.close() try: subprocess.run( ["ffmpeg", "-y", "-ss", str(ss), "-i", audio_path, "-t", "30", "-vn", "-ac", "1", "-ar", "16000", "-c:a", "pcm_s16le", sample.name], check=True, capture_output=True ) _, sample_info = m.transcribe(sample.name, language=None, vad_filter=False) sl, sp = sample_info.language, float(sample_info.language_probability) lang_votes[sl] = lang_votes.get(sl, 0) + sp print(f" sample @ {ss:.0f}s: {sl} (p={sp:.2f})", file=sys.stderr) except Exception as e: print(f" sample @ {ss:.0f}s: failed", file=sys.stderr) finally: try: os.unlink(sample.name) except Exception: pass if lang_votes: lang = max(lang_votes.items(), key=lambda x: x[1])[0] print(f" ✅ Lang lock: {lang}", file=sys.stderr) try: segs, info = m.transcribe( audio_path, language=lang, word_timestamps=True, # VAD filter kdaj izpusti vokal med glasbo — pri pesmi bolje brez vad_filter=False, # Anti-halucinacije condition_on_previous_text=False, temperature=0.0, compression_ratio_threshold=2.4, log_prob_threshold=-1.0, no_speech_threshold=0.6, # Beam search namesto greedy = bolj zanesljiv decode (manj halucinacij) beam_size=5, # Halucinacija detection: če je tišina dolga, ne pretvarjaj v tekst hallucination_silence_threshold=2.0, ) detected_lang = info.language detected_prob = float(info.language_probability) except (ValueError, RuntimeError) as e: # Whisper failure (např. pri popolnoma instrumentalnih datotekah z VAD) print(f" ⚠️ Whisper transcribe failed: {e}", file=sys.stderr) return { "language": "unknown", "language_probability": 0.0, "segments": [], } print(f" Detekcija: {detected_lang} (p={detected_prob:.2f})", file=sys.stderr) segments = [] for s in segs: words = [] if s.words: for w in s.words: words.append({ "start": w.start, "end": w.end, "text": w.word, }) segments.append({ "start": s.start, "end": s.end, "text": s.text.strip(), "words": words, }) return { "language": detected_lang, "language_probability": detected_prob, "segments": segments, } def compute_energy_profile(audio_path, window_sec=1.0): """RMS dB na window_sec sekund. Vrne list (timestamp, rms_db).""" cmd = [ "ffmpeg", "-i", audio_path, "-af", f"asetnsamples=n={int(16000 * window_sec)}:p=0," f"astats=metadata=1:reset={window_sec}," f"ametadata=print:key=lavfi.astats.Overall.RMS_level:file=-", "-f", "null", "-", ] result = subprocess.run(cmd, capture_output=True, text=True) output = result.stdout + "\n" + result.stderr energies = [] current_pts = 0.0 for line in output.split("\n"): line = line.strip() m = re.search(r"pts_time:(\S+)", line) if m: try: current_pts = float(m.group(1)) except ValueError: pass continue if "RMS_level=" in line: val = line.split("RMS_level=")[-1].strip() try: rms = float(val) # -inf zamenjamo z -90 if rms < -90 or rms != rms: # NaN check rms = -90.0 energies.append((current_pts, rms)) current_pts += window_sec except ValueError: pass return energies def detect_vocal_sections(segments, max_gap=3.0): """Združi consecutive segmente v "vokalne sekcije".""" if not segments: return [] sections = [] current = { "start": segments[0]["start"], "end": segments[0]["end"], "segments": [segments[0]], "text": segments[0]["text"], } for seg in segments[1:]: if seg["start"] - current["end"] > max_gap: sections.append(current) current = { "start": seg["start"], "end": seg["end"], "segments": [seg], "text": seg["text"], } else: current["end"] = seg["end"] current["segments"].append(seg) current["text"] += " " + seg["text"] sections.append(current) return sections def avg_energy_in_range(energies, start, end): """Povprečna RMS v rangeu.""" vals = [r for (t, r) in energies if start <= t <= end] if not vals: return -90.0 return sum(vals) / len(vals) def score_section_as_chorus(section, all_sections, energies, avg_rms): """Score sekcijo kot kandidat za refren. Faktorji: - Ponavljajoče besede (low unique-word-ratio) = refren - Visoka energija - Sekcija se pojavi večkrat v pesmi (refren se ponovi) - Krajše vrstice (3-8 besed) """ text = section["text"].lower() words = re.findall(r"\b\w+\b", text) if not words: return 0 unique_ratio = len(set(words)) / len(words) # Refren = nizko unique ratio (ponovitve) chorus_signal = max(0, (1.0 - unique_ratio) * 30) # Energija sec_energy = avg_energy_in_range(energies, section["start"], section["end"]) energy_above = max(0, sec_energy - avg_rms) energy_score = energy_above * 8 # Kako pogosto se pojavi podobno besedilo repeat_count = 0 for other in all_sections: if other is section: continue other_text = other["text"].lower() other_words = set(re.findall(r"\b\w+\b", other_text)) common = set(words) & other_words # Če imata >50% besed skupnih, je verjetno isti refren if len(common) >= len(set(words)) * 0.5 and len(common) >= 3: repeat_count += 1 repeat_score = repeat_count * 25 # Dolžina vrstice duration = section["end"] - section["start"] if 3 <= duration <= 25: length_score = 10 elif duration > 25: length_score = 5 else: length_score = 2 return chorus_signal + energy_score + repeat_score + length_score def find_chorus(transcript, energies, video_duration): """Najde najbolj verjeten refren.""" sections = detect_vocal_sections(transcript["segments"]) if not sections: return None avg_rms = sum(r for (_, r) in energies) / len(energies) if energies else -30.0 candidates = [] for sec in sections: score = score_section_as_chorus(sec, sections, energies, avg_rms) candidates.append({ "start": sec["start"], "end": sec["end"], "duration": sec["end"] - sec["start"], "text_preview": sec["text"][:80], "score": round(score, 2), "avg_rms": round(avg_energy_in_range(energies, sec["start"], sec["end"]), 2), }) # Sort by score descending candidates.sort(key=lambda c: -c["score"]) if not candidates: return None return { "best": candidates[0], "all_candidates": candidates[:10], "avg_rms_total": round(avg_rms, 2), } def smart_clip_range(chorus, transcript, video_duration, target_duration=30, max_duration=45, min_duration=20, include_prebuild=False): """Inteligentno določi clip range. Logika: 1. Začni z refrenom kot core 2. Če je krajši od min_duration → razširi z drugim refrenom (ne kitico!) 3. Cap na max_duration include_prebuild=False (default): NE doda kitice/verza pred refrenom. include_prebuild=True: doda kratek pre-chorus (max 8s, gap < 3s). """ if not chorus or not chorus.get("best"): # Fallback: vzemi sredino videa mid = video_duration / 2 start = max(0, mid - target_duration / 2) return { "start": start, "end": min(video_duration, start + target_duration), "reason": "fallback_middle", } best = chorus["best"] sections = detect_vocal_sections(transcript["segments"]) actual_start = best["start"] actual_end = best["end"] # Najdi VSE sekcije ki so podobne refrenu (verjetne ponovitve) chorus_words = set(re.findall(r"\b\w+\b", best["text_preview"].lower())) chorus_sections = [] for sec in sections: sec_words = set(re.findall(r"\b\w+\b", sec["text"].lower())) if chorus_words and len(sec_words & chorus_words) >= len(chorus_words) * 0.4: chorus_sections.append(sec) # 1. Če je core refren prekratek, razširi z naslednjim REFRENOM (ne kitico!) if actual_end - actual_start < min_duration: for sec in chorus_sections: if sec["start"] > actual_end and sec["start"] - actual_end < 8: if sec["end"] - actual_start <= max_duration: actual_end = sec["end"] if actual_end - actual_start >= min_duration: break # 2. Pre-chorus build-up (samo če uporabnik to izrecno hoče) if include_prebuild: pre_section = None for sec in sections: # Pre-section mora biti BLIZU (gap < 3s) in NE preveč dolga (< 8s) sec_duration = sec["end"] - sec["start"] if (sec["end"] <= actual_start and actual_start - sec["end"] < 3 and sec_duration < 8): pre_section = sec if pre_section: candidate_start = pre_section["start"] if actual_end - candidate_start <= max_duration: actual_start = candidate_start # 3. Če je še prekratek, razširi simetrično znotraj refrenov (ne kitic) if actual_end - actual_start < min_duration: deficit = min_duration - (actual_end - actual_start) # Razširi konec če lahko for sec in chorus_sections: if sec["start"] > actual_end and sec["start"] - actual_end < 5: actual_end = min(sec["end"], actual_end + deficit) break # Če še ni dovolj, manjše simetrično if actual_end - actual_start < min_duration: extra = (min_duration - (actual_end - actual_start)) / 2 actual_start = max(0, actual_start - extra) actual_end = min(video_duration, actual_end + extra) # 4. Trim na max if actual_end - actual_start > max_duration: actual_end = actual_start + max_duration actual_start = max(0, actual_start) actual_end = min(video_duration, actual_end) return { "start": round(actual_start, 2), "end": round(actual_end, 2), "duration": round(actual_end - actual_start, 2), "reason": "smart_chorus_with_prebuild" if include_prebuild else "smart_chorus_only", "chorus_start": round(best["start"], 2), "chorus_end": round(best["end"], 2), } def detect_audio_fade(clip_range, transcript, video_duration=None): """Določi fade-in/fade-out trajanje + ev. razširi clip range, da fade ne reže besedila na koncu refrena. Logika: - Če clip začne sredi vokala → 0.5s fade in - Če se konča sredi vokala → razširi clip do konca segmenta (+ buffer), potem 1.0s fade out - Sicer manj fade """ cs, ce = clip_range["start"], clip_range["end"] # Najdi segment, ki konča znotraj clip-a (ali je clip end znotraj segmenta) starts_in_vocal = False ends_in_vocal = False end_segment = None for seg in transcript["segments"]: if seg["start"] <= cs <= seg["end"]: starts_in_vocal = True if seg["start"] <= ce <= seg["end"]: ends_in_vocal = True end_segment = seg # Če clip konča znotraj segmenta, razširi do konca segmenta + 0.5s buffer extended_end = ce if end_segment: extended_end = end_segment["end"] + 0.5 if video_duration is not None: extended_end = min(extended_end, video_duration) fade_in = 0.4 if starts_in_vocal else 0.2 # Krajši fade out (0.5s) ker zdaj clip konča po koncu vokala fade_out = 0.5 if ends_in_vocal else 0.3 return { "fade_in": fade_in, "fade_out": fade_out, "extended_end": round(extended_end, 2), "ends_in_vocal": ends_in_vocal, } def _build_analysis_prompt(transcript, video_duration, target_duration=30, filename_hint=None): """Pripravi enotni prompt za Claude/Gemini analizo.""" lines = [] for seg in transcript["segments"]: start = seg["start"] end = seg["end"] text = seg["text"].strip() lines.append(f"[{start:6.1f}-{end:6.1f}] {text}") transcript_text = "\n".join(lines) hint_block = "" if filename_hint: hint_block = f""" 🎵 IME DATOTEKE: "{filename_hint}" Iz imena datoteke morda lahko prepoznaš naslov pesmi ali izvajalca. Če je tako: - Uporabi svoje znanje o **dejanskem besedilu** te pesmi - Če Whisper transkript ne ustreza znanemu besedilu pesmi, POPRAVI besedilo na **dejansko besedilo pesmi** - Ohrani timestamp-e iz Whisper-ja (časovne meje so pravilne, samo besede so morda napačne) 🔍 ČE NE POZNAŠ PESMI (npr. slovenske narodno-zabavne, manj znane pesmi) → **UPORABI web_search tool** da poiščeš pravo besedilo! Primeri search queryjev: - "[ime izvajalca] [naslov pesmi] besedilo" (slovenske: Modrijani, Veseli Dolenjci, Avseniki, Čuki, Atomik Harmonik) - "[artist] [title] lyrics" (angleške/nemške) - Pogosto so besedila na: besedila.com, lyricstranslate.com, genius.com, tekstovi.net (HR/SR), songtexte.com (DE) Ko najdeš pravo besedilo, uporabi to za popravljanje "corrected_segments" — **transkript bo veliko bolj točen** kot če le ugibaš. """ return f"""Tu je transcript pesmi iz STT modela (timestamp v sekundah, besedilo): {transcript_text} Cela pesem traja {video_duration:.1f}s. Cilj: izrezati ~{target_duration}s odsek za TikTok/Instagram Reel.{hint_block} ⚠️ POMEMBNO: STT lahko naredi napake — narečne besede, slovanski jeziki, ko glasba prevladuje: - Pri narečjih in slovanskih jezikih - Generira "tipičen" tekst (npr. tekst druge pesmi istega izvajalca) - Lahko vstavi besede ki se POdoBNO slišijo, ampak imajo ČISTO drug pomen KAKO PREPOZNATI HALUCINACIJO: - Tekst nima smisla v kontekstu pesmi - Različni segmenti imajo nepovezane teme (kot da bi bilo več pesmi) - Refren je v vsakem ponovitvi različen (refren se MORA ponavljati identično) - Tekst je premalo **glede na trajanje** (več tišine = manj besed, ne več) PROSIM: 1. Preberi celoten tekst in razumi strukturo (intro / verz / pre-chorus / refren / bridge / outro) 2. POPRAVI očitne halucinacije: - Če prepoznaš pesem (po izvajalcu, naslovu, znaku besedila) → **uporabi PRAVO besedilo** - Če halucinacijo ne moreš popraviti, **odstrani segment** (raje brez podnapisa kot napačen) - Refren MORA imeti vse pojavitve ENAKE - Popravi pomešane jezike (vse vrstice v enem jeziku) - Ohrani timestamp-e nespremenjene 3. Prepoznaj REFREN: del besedila ki se PONAVLJA 4. Izberi najboljši odsek za reel: - Vključi cel refren (brez prekinitve) - Lahko dodaj pre-chorus build-up - 20-45 sekund - Začni in končaj na smiselni meji 5. Če pesem nima jasnega refrena, izberi najbolj dramatičen ali zaključen del 6. Če Whisper transkript je v večini halucinacija (manj kot 30% smiselnih besed), v "reason" napiši "WHISPER_HALLUCINATION_DETECTED" in vrni najmanj segmentov (samo tisti ki so smiselni) Odgovori SAMO v JSON formatu (brez markdown, brez razlage): {{ "start": , "end": , "reason": "", "chorus_text": "", "structure": "<1 stavek o strukturi pesmi>", "language": "", "hallucination_detected": , "corrected_segments": [ {{"start": , "end": , "text": ""}} ] }} V "corrected_segments" vključi VSE segmente iz inputa s popravljenim besedilom. Halucinacije nadomesti s pravim besedilom (če veš) ALI pusti prazno besedilo.""" def _parse_llm_response(text, video_duration): """Parse JSON odgovor iz LLM-a, vrne None če invalid.""" text = text.strip() # Odstrani markdown ovoj če obstaja if text.startswith("```"): text = re.sub(r"^```(?:json)?\s*", "", text) text = re.sub(r"\s*```$", "", text) # Včasih je pred JSON-om še kakšna razlaga, vzemi prvi { ... } blok first_brace = text.find("{") last_brace = text.rfind("}") if first_brace >= 0 and last_brace > first_brace: text = text[first_brace:last_brace + 1] result = json.loads(text) start = float(result["start"]) end = float(result["end"]) if start >= end or start < 0 or end > video_duration: print(f" ⚠️ LLM returned invalid range: {start}-{end}", file=sys.stderr) return None return { "start": round(start, 2), "end": round(end, 2), "duration": round(end - start, 2), "reason": result.get("reason", ""), "chorus_text": result.get("chorus_text", ""), "structure": result.get("structure", ""), "language": result.get("language"), "corrected_segments": result.get("corrected_segments"), } def analyze_with_claude(transcript, video_duration, target_duration=30, model="claude-sonnet-4-6", filename_hint=None): """Pošlje transkript Claude API-ju (Anthropic). model: claude-sonnet-4-6 (default), claude-haiku-4-5-20251001, claude-opus-4-7 filename_hint: ime datoteke (Claude lahko prepozna pesem in popravi halucinacije) """ api_key = os.environ.get("ANTHROPIC_API_KEY") if not api_key: print(" ⚠️ ANTHROPIC_API_KEY ni nastavljen — preskakujem Claude analizo", file=sys.stderr) return None if not transcript.get("segments"): return None prompt = _build_analysis_prompt(transcript, video_duration, target_duration, filename_hint=filename_hint) try: import urllib.request import urllib.error # Initial messages messages = [{"role": "user", "content": prompt}] # Sonnet 4.6 podpira web_search tool — Claude lahko poišče prave lyrics # za pesmi v slovenščini/hrvaščini/itd., če jih ne pozna iz training data. tools = [{ "type": "web_search_20250305", "name": "web_search", "max_uses": 3, # Maksimalno 3 search-i = $0.03/job }] # Agentic loop: Claude lahko kliče web_search, dobi rezultate, vrne final answer max_iterations = 5 for iteration in range(max_iterations): body = json.dumps({ "model": model, "max_tokens": 8192, "messages": messages, "tools": tools, }).encode("utf-8") req = urllib.request.Request( "https://api.anthropic.com/v1/messages", data=body, headers={ "Content-Type": "application/json", "x-api-key": api_key, "anthropic-version": "2023-06-01", }, method="POST", ) with urllib.request.urlopen(req, timeout=180) as resp: data = json.loads(resp.read().decode("utf-8")) content = data.get("content", []) if not content: print(" ⚠️ Claude vrnil prazen odgovor", file=sys.stderr) return None stop_reason = data.get("stop_reason") if stop_reason == "max_tokens": usage = data.get("usage", {}) print( f" ⚠️ Claude odrezan (max_tokens): " f"input={usage.get('input_tokens')} output={usage.get('output_tokens')}", file=sys.stderr, ) return None # Če je end_turn → smo končali, parsiraj text if stop_reason in ("end_turn", "stop_sequence"): # Najdem zadnji text block text_blocks = [b for b in content if b.get("type") == "text"] if text_blocks: text = text_blocks[-1].get("text", "").strip() break print(" ⚠️ Claude end_turn brez text bloka", file=sys.stderr) return None # Če je tool_use → Claude kliče web_search; appendamo response in nadaljujemo if stop_reason == "tool_use": # Anthropic web_search tool je server-side — sami obdela searches in vrne web_search_tool_result # Ampak v API odgovoru so OBA: tool_use IN web_search_tool_result kot del content # Torej končni text že obstaja v naslednji iteraciji # Appendamo content do messages in pošljem nazaj (Claude bo nadaljeval) messages.append({"role": "assistant", "content": content}) # Claude server-side že obdela search, samo nadaljujemo s pustim user msg # Ampak server-side tools NE potrebujejo follow-up tool_result # Pravilen flow: če stop_reason=tool_use ampak web_search_tool_result je že v content, # potem Claude sam nadaljuje. Drugače moramo poslati tool_result. # Preverim ali so že rezultati v content has_results = any(b.get("type") == "web_search_tool_result" for b in content) if has_results: # Server-side: Anthropic je sam obdelal search, čakamo nadaljevanje # Pošlji nazaj brez sprememb da Claude nadaljuje print(f" 🔍 Claude je iskal lyrics, čakam nadaljevanje (iter {iteration+1})", file=sys.stderr) continue else: print(f" ⚠️ tool_use brez results", file=sys.stderr) return None # Drugi stop reasons print(f" ⚠️ Nepričakovan stop_reason: {stop_reason}", file=sys.stderr) return None else: print(f" ⚠️ Presežena max_iterations ({max_iterations})", file=sys.stderr) return None result = _parse_llm_response(text, video_duration) if not result: return None print(f" 🤖 Claude ({model}) izbral: {result['start']:.1f}-{result['end']:.1f}s", file=sys.stderr) print(f" Razlog: {result.get('reason', '')[:80]}", file=sys.stderr) print(f" Struktura: {result.get('structure', '')[:80]}", file=sys.stderr) if result.get("corrected_segments"): print(f" Popravljeni segmenti: {len(result['corrected_segments'])}", file=sys.stderr) result["source"] = f"claude:{model}" return result except urllib.error.HTTPError as e: body = e.read().decode("utf-8", errors="replace")[:500] print(f" ❌ Claude API HTTP {e.code}: {body}", file=sys.stderr) return None except Exception as e: print(f" ❌ Claude analysis failed: {e}", file=sys.stderr) return None def analyze_with_gemini(transcript, video_duration, target_duration=30, model="gemini-3.1-pro-preview", filename_hint=None): """Pošlje transkript Gemini API-ju (Google). Gemini 3.1 Pro ima najboljši multilingual rezultat (MMMLU 92.6%) — odličen za SLO/HR/BS. """ api_key = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY") if not api_key: print(" ⚠️ GEMINI_API_KEY ni nastavljen — preskakujem Gemini analizo", file=sys.stderr) return None if not transcript.get("segments"): return None prompt = _build_analysis_prompt(transcript, video_duration, target_duration, filename_hint=filename_hint) try: import urllib.request import urllib.error url = f"https://generativelanguage.googleapis.com/v1beta/models/{model}:generateContent?key={api_key}" # Gemini 3.x Pro je THINKING model — porabi tokene tudi za internal reasoning (thoughtsTokenCount). # 4096 je prenizko: pri velikih transkriptih thinking lahko porabi 1500-3000 tokenov, # output (corrected_segments za 60+ segmentov) pa še dodatnih 3000-7000 → odreže JSON na pol # (finishReason: MAX_TOKENS) in vrne nepopolen, neveljaven JSON. # 32768 daje dovolj prostora za thinking + cel JSON output tudi pri dolgih pesmih. body = json.dumps({ "contents": [{ "role": "user", "parts": [{"text": prompt}], }], "generationConfig": { "temperature": 0.1, "maxOutputTokens": 32768, "responseMimeType": "application/json", }, }).encode("utf-8") req = urllib.request.Request( url, data=body, headers={"Content-Type": "application/json"}, method="POST", ) with urllib.request.urlopen(req, timeout=180) as resp: data = json.loads(resp.read().decode("utf-8")) candidates = data.get("candidates", []) if not candidates: print(" ⚠️ Gemini vrnil 0 candidates", file=sys.stderr) return None cand0 = candidates[0] finish_reason = cand0.get("finishReason", "?") usage = data.get("usageMetadata", {}) # Diagnostika: če je finishReason == MAX_TOKENS, je output odrezan in JSON je invalid if finish_reason == "MAX_TOKENS": print( f" ⚠️ Gemini odrezan (MAX_TOKENS): " f"prompt={usage.get('promptTokenCount')} " f"thoughts={usage.get('thoughtsTokenCount')} " f"output={usage.get('candidatesTokenCount')}", file=sys.stderr, ) return None parts = cand0.get("content", {}).get("parts", []) if not parts: print( f" ⚠️ Gemini vrnil prazen content (finishReason={finish_reason}, " f"thoughts={usage.get('thoughtsTokenCount')})", file=sys.stderr, ) return None text = parts[0].get("text", "").strip() if not text: print( f" ⚠️ Gemini vrnil prazen text (finishReason={finish_reason}, " f"thoughts={usage.get('thoughtsTokenCount')}, " f"output={usage.get('candidatesTokenCount')})", file=sys.stderr, ) return None result = _parse_llm_response(text, video_duration) if not result: return None print(f" 🤖 Gemini ({model}) izbral: {result['start']:.1f}-{result['end']:.1f}s", file=sys.stderr) print(f" Razlog: {result.get('reason', '')[:80]}", file=sys.stderr) print(f" Struktura: {result.get('structure', '')[:80]}", file=sys.stderr) if result.get("corrected_segments"): print(f" Popravljeni segmenti: {len(result['corrected_segments'])}", file=sys.stderr) result["source"] = f"gemini:{model}" return result except urllib.error.HTTPError as e: body = e.read().decode("utf-8", errors="replace")[:500] print(f" ❌ Gemini API HTTP {e.code}: {body}", file=sys.stderr) return None except Exception as e: print(f" ❌ Gemini analysis failed: {e}", file=sys.stderr) return None def analyze_with_llm(transcript, video_duration, target_duration=30, provider="claude", llm_model=None, filename_hint=None): """Glavna funkcija — uporabi izbrano LLM (claude/gemini/auto).""" if provider == "gemini": model = llm_model or "gemini-3.1-pro-preview" return analyze_with_gemini(transcript, video_duration, target_duration, model, filename_hint=filename_hint) elif provider == "claude": model = llm_model or "claude-sonnet-4-6" return analyze_with_claude(transcript, video_duration, target_duration, model, filename_hint=filename_hint) elif provider == "auto": # Najprej probaj Claude, fallback na Gemini result = analyze_with_claude(transcript, video_duration, target_duration, llm_model or "claude-sonnet-4-6", filename_hint=filename_hint) if result: return result print(" 🔄 Claude ni uspel, probam Gemini...", file=sys.stderr) return analyze_with_gemini(transcript, video_duration, target_duration, llm_model or "gemini-3.1-pro-preview", filename_hint=filename_hint) else: print(f" ⚠️ Neznan LLM provider: {provider}", file=sys.stderr) return None def is_instrumental(transcript, video_duration, threshold=0.1): """Detekcija ali je pesem instrumentalna. Če je vsota trajanja vokalnih segmentov < threshold * video_duration, je pesem instrumentalna. """ if not transcript.get("segments"): return True vocal_duration = sum( s["end"] - s["start"] for s in transcript["segments"] ) ratio = vocal_duration / max(video_duration, 1) return bool(ratio < threshold) def main(): ap = argparse.ArgumentParser() ap.add_argument("video", help="Vhod video file") ap.add_argument("--lang", default=None, help="ISO 639-1 ali 'auto' (default: auto)") ap.add_argument("--model", default="large-v3", help="Whisper model") ap.add_argument("--target-duration", type=float, default=30.0) ap.add_argument("--max-duration", type=float, default=45.0) ap.add_argument("--min-duration", type=float, default=20.0) ap.add_argument("--include-prebuild", action="store_true", help="Vključi pre-chorus build-up (privzeto: ne)") ap.add_argument("--no-claude", action="store_true", help="Preskoči LLM analizo (uporabi samo lokalno heuristiko)") ap.add_argument("--llm-provider", default="claude", choices=["claude", "gemini", "auto"], help="Kateri LLM uporabiti za analizo (default: claude)") ap.add_argument("--llm-model", default=None, help="Specifičen model (npr. claude-sonnet-4-6, gemini-3.1-pro-preview)") ap.add_argument("--filename-hint", default=None, help="Originalno ime datoteke (Claude lahko prepozna pesem)") ap.add_argument("--whisper-provider", default="auto", choices=["auto", "elevenlabs", "local"], help="STT provider: elevenlabs=ElevenLabs Scribe (najboljša kvaliteta, $0.40/h), " "local=faster-whisper CPU (brezplačno, halucinacije), auto=Scribe če key, sicer local") ap.add_argument("--json", action="store_true", help="Output JSON") ap.add_argument("--output", help="Path za JSON output") args = ap.parse_args() video = Path(args.video) if not video.exists(): print(f"❌ Video ne obstaja: {video}", file=sys.stderr) sys.exit(1) duration = get_video_duration(video) print(f"📹 Video: {video.name}, {duration:.1f}s", file=sys.stderr) # 1. Extract avdio audio = extract_audio(video) try: # 2. Whisper transcript lang = None if args.lang in (None, "auto", "") else args.lang transcript = transcribe_full( audio, lang=lang, model_size=args.model, provider=args.whisper_provider, ) print(f" Transkripcija: {len(transcript['segments'])} segmentov", file=sys.stderr) # 3. Energy profile print(f"⚡ Energy profile...", file=sys.stderr) energies = compute_energy_profile(audio) print(f" Energy samples: {len(energies)}", file=sys.stderr) # 4. Instrumental detection instrumental = is_instrumental(transcript, duration) print(f"🎵 Instrumentalna: {instrumental}", file=sys.stderr) # 5a. PRIMARNO: LLM analiza (razume cel tekst pesmi + popravki) claude_result = None if not instrumental and not args.no_claude: provider = args.llm_provider print(f"🤖 Pošiljam transkript {provider}-u za analizo...", file=sys.stderr) # Filename hint = original filename brez extension (Claude lahko prepozna pesem) fname_hint = args.filename_hint or video.stem claude_result = analyze_with_llm( transcript, duration, target_duration=args.target_duration, provider=provider, llm_model=args.llm_model, filename_hint=fname_hint, ) # 5b. Find chorus lokalno (kot fallback ali za score-jev preview) if not instrumental: chorus = find_chorus(transcript, energies, duration) else: # Za instrumentalne: najdi sekcijo z najvišjo energijo window = args.target_duration best_start = 0 best_avg = -100 t = 0 while t + window <= duration: avg = avg_energy_in_range(energies, t, t + window) if avg > best_avg: best_avg = avg best_start = t t += 5 # step 5s chorus = { "best": { "start": best_start, "end": best_start + window, "duration": window, "text_preview": "(instrumental — energy peak)", "score": 0, "avg_rms": round(best_avg, 2), }, "all_candidates": [], "avg_rms_total": round( sum(r for (_, r) in energies) / len(energies) if energies else -30, 2 ), } # 6. Clip range — LLM (Claude/Gemini) ima prednost, sicer smart_clip_range fallback. # POMEMBNO: spremenljivka se zgodovinsko imenuje claude_result, dejansko pa vsebuje # rezultat KATEREGA KOLI LLM-ja (Claude ali Gemini) — glej analyze_with_llm(). # llm_source npr. "claude:claude-sonnet-4-6" ali "gemini:gemini-3.1-pro-preview". if claude_result: llm_source = claude_result.get("source", "llm") clip_range = { "start": claude_result["start"], "end": claude_result["end"], "duration": claude_result["duration"], "reason": f"{llm_source}: " + claude_result.get("reason", ""), "chorus_text": claude_result.get("chorus_text", ""), "structure": claude_result.get("structure", ""), "source": llm_source, } # Apply max_duration cap če LLM pretirava if clip_range["duration"] > args.max_duration: clip_range["end"] = clip_range["start"] + args.max_duration clip_range["duration"] = args.max_duration clip_range["reason"] += " (capped at max_duration)" else: clip_range = smart_clip_range( chorus, transcript, duration, target_duration=args.target_duration, max_duration=args.max_duration, min_duration=args.min_duration, include_prebuild=args.include_prebuild, ) clip_range["source"] = "local_heuristic" print(f"✂ Clip range: {clip_range['start']:.1f}s - {clip_range['end']:.1f}s " f"(duration: {clip_range['duration']}s, source: {clip_range.get('source')})", file=sys.stderr) # Če Claude je vrnil popravljene segmente, jih uporabi (boljši za podnapise) if claude_result and claude_result.get("corrected_segments"): corrected = claude_result["corrected_segments"] # Ohrani word-level timing iz originala, posodobi samo text orig_by_start = {round(s["start"], 1): s for s in transcript["segments"]} new_segments = [] for cs in corrected: try: cs_start = float(cs["start"]) cs_end = float(cs["end"]) cs_text = str(cs["text"]).strip() except (KeyError, ValueError, TypeError): continue # Najdi originalni segment z istim start (ali blizu) za word-level timing orig = orig_by_start.get(round(cs_start, 1)) if not orig: # Najdi najbližji closest_diff = 999 for s in transcript["segments"]: diff = abs(s["start"] - cs_start) if diff < closest_diff and diff < 1.0: closest_diff = diff orig = s new_segments.append({ "start": cs_start, "end": cs_end, "text": cs_text, # Word-level timing ne moremo posodabljati ker Claude ne vrača besede, # ampak ohranimo če imamo "words": orig.get("words", []) if orig else [], }) transcript["segments"] = new_segments transcript["claude_corrected"] = True # ohranimo ime ključa zaradi backward compat # Posodobi tudi jezik če LLM je drugačnega mnenja if claude_result.get("language") and claude_result["language"] != transcript["language"]: print(f" ✏️ LLM je popravil jezik: {transcript['language']} → {claude_result['language']}", file=sys.stderr) transcript["language"] = claude_result["language"] llm_label = claude_result.get("source", "LLM") print(f" ✏️ Whisper segmenti zamenjani z {llm_label} popravljenimi ({len(new_segments)})", file=sys.stderr) # 7. Fade params (lahko razširi clip end če konča sredi vokala) fade = detect_audio_fade(clip_range, transcript, video_duration=duration) print(f"🎚 Fade: in={fade['fade_in']}s, out={fade['fade_out']}s", file=sys.stderr) # Če fade detection razširi end (ker clip konča sredi vokala), apply if fade.get("extended_end") and fade["extended_end"] > clip_range["end"]: old_end = clip_range["end"] new_end = min(fade["extended_end"], clip_range["start"] + args.max_duration) clip_range["end"] = round(new_end, 2) clip_range["duration"] = round(new_end - clip_range["start"], 2) print(f" ↳ Razširjen za {new_end - old_end:.1f}s (zaključek besedila)", file=sys.stderr) result = { "video": str(video), "video_duration": duration, "language": transcript["language"], "language_probability": transcript["language_probability"], "instrumental": instrumental, "transcript": transcript, "chorus": chorus, "clip_range": clip_range, "fade": fade, "claude_used": claude_result is not None, "claude_corrected_text": bool(claude_result and claude_result.get("corrected_segments")), } if args.output: with open(args.output, "w", encoding="utf-8") as f: json.dump(result, f, ensure_ascii=False, indent=2) print(f"💾 Saved: {args.output}", file=sys.stderr) if args.json: print(json.dumps(result, ensure_ascii=False)) finally: try: os.unlink(audio) except Exception: pass if __name__ == "__main__": main()