# Transcription section of scripts/analyze.py in its post-patch form:
# the Gemini transcriber plus the provider dispatcher.
import json
import math
import os
import re
import subprocess
import sys
import tempfile
import time
import urllib.error
import urllib.request
from pathlib import Path

_GEMINI_API_ROOT = "https://generativelanguage.googleapis.com"
_GEMINI_MODEL = "gemini-3-pro-preview"
# File suffix -> MIME type sent to Gemini. Unknown suffixes keep the historical
# default of "audio/mp3".
_GEMINI_AUDIO_MIME_TYPES = {
    ".mp3": "audio/mp3",
    ".wav": "audio/wav",
    ".flac": "audio/flac",
    ".ogg": "audio/ogg",
    ".aac": "audio/aac",
    ".aif": "audio/aiff",
    ".aiff": "audio/aiff",
}


def _empty_transcript():
    """Return the 'nothing transcribed' result shared by every failure path."""
    return {"language": "unknown", "language_probability": 0.0, "segments": []}


def _to_seconds(value):
    """Coerce a model-supplied timestamp to a non-negative float, else None."""
    if isinstance(value, bool):
        return None
    try:
        seconds = float(value)
    except (TypeError, ValueError):
        return None
    if not math.isfinite(seconds) or seconds < 0:
        return None
    return seconds


def _normalize_gemini_segments(raw_segments):
    """Keep only well-formed segments/words and coerce their timestamps to float.

    The model output is free-form JSON, so timestamps can arrive as strings or
    null; segments or words that cannot be placed on the timeline are dropped.
    """
    if not isinstance(raw_segments, list):
        return []
    segments = []
    for raw in raw_segments:
        if not isinstance(raw, dict):
            continue
        start = _to_seconds(raw.get("start"))
        end = _to_seconds(raw.get("end"))
        if start is None or end is None or end < start:
            continue
        raw_words = raw.get("words")
        words = []
        for word in raw_words if isinstance(raw_words, list) else []:
            if not isinstance(word, dict):
                continue
            w_start = _to_seconds(word.get("start"))
            w_end = _to_seconds(word.get("end"))
            if w_start is None or w_end is None or w_end < w_start:
                continue
            words.append({**word, "start": w_start, "end": w_end,
                          "text": str(word.get("text") or "")})
        segments.append({**raw, "start": start, "end": end,
                         "text": str(raw.get("text") or "").strip(),
                         "words": words})
    return segments


def _gemini_quality_stats(segments):
    """Return (hallucination_count, coverage_pct) for normalized segments.

    A segment longer than 15 s with fewer than 5 words counts as a suspected
    hallucination; coverage is the duration of the remaining segments relative
    to the end time of the last segment.
    """
    total_dur = max((s["end"] for s in segments), default=0)
    hallucination_count = 0
    covered = 0.0
    for s in segments:
        seg_dur = s["end"] - s["start"]
        if seg_dur > 15 and len(s["words"]) < 5:
            hallucination_count += 1
        else:
            covered += seg_dur
    coverage_pct = (covered / total_dur * 100) if total_dur else 0
    return hallucination_count, coverage_pct


def _gemini_prompt(lang, filename_hint):
    """Build the transcription prompt, appending optional filename/language hints."""
    lang_hint = ""
    if filename_hint:
        lang_hint = f"\nFilename hint: {filename_hint}"
    if lang:
        lang_hint += f"\nLanguage: {lang}"

    return f"""Transcribe this song with precise word-level timestamps.{lang_hint}

Return ONLY valid JSON in this EXACT format (no markdown fences, no explanation):
{{
  "language": "sl",
  "segments": [
    {{
      "start": 0.5,
      "end": 4.2,
      "text": "Besedilo segmenta",
      "words": [
        {{"start": 0.5, "end": 0.9, "text": "Besedilo"}},
        {{"start": 1.0, "end": 1.4, "text": "segmenta"}}
      ]
    }}
  ]
}}

Rules:
- Only transcribe vocal singing, NOT instrumental sections
- Each segment is a complete musical phrase (typically 2-4 seconds)
- Include word-level timestamps for EVERY word
- Use proper orthography (š, č, ž for Slavic; ä, ö, ü for German etc.)
- Skip instrumental breaks (don't fill with silence segments)
- Be very accurate with timestamps - this is for video subtitle generation
- DO NOT hallucinate words during instrumental sections
- DO NOT include trailing commas in JSON

Output ONLY the JSON object."""


def _gemini_upload_audio(api_key, audio_bytes, mime_type):
    """Upload audio through the resumable Files API and return the file URI."""
    # Step 1: open the upload session. The key travels in a header rather than
    # the query string so it cannot end up in logged URLs.
    start_request = urllib.request.Request(
        f"{_GEMINI_API_ROOT}/upload/v1beta/files",
        data=json.dumps({"file": {"display_name": "reels_audio"}}).encode(),
        headers={
            "x-goog-api-key": api_key,
            "X-Goog-Upload-Protocol": "resumable",
            "X-Goog-Upload-Command": "start",
            "X-Goog-Upload-Header-Content-Length": str(len(audio_bytes)),
            "X-Goog-Upload-Header-Content-Type": mime_type,
            "Content-Type": "application/json",
        },
        method="POST",
    )
    with urllib.request.urlopen(start_request, timeout=30) as resp:
        upload_url = resp.headers.get("X-Goog-Upload-URL")
    if not upload_url:
        raise RuntimeError("Gemini Files API did not return X-Goog-Upload-URL")

    # Step 2: send all bytes and finalize in a single request.
    upload_request = urllib.request.Request(
        upload_url,
        data=audio_bytes,
        headers={
            "Content-Length": str(len(audio_bytes)),
            "X-Goog-Upload-Offset": "0",
            "X-Goog-Upload-Command": "upload, finalize",
        },
        method="POST",
    )
    with urllib.request.urlopen(upload_request, timeout=120) as resp:
        file_info = json.loads(resp.read().decode())
    return file_info["file"]["uri"]


def _gemini_generate(api_key, prompt, file_uri, mime_type):
    """Call generateContent for the uploaded audio and return the decoded response."""
    payload = {
        "contents": [{
            "parts": [
                {"text": prompt},
                {"file_data": {"mime_type": mime_type, "file_uri": file_uri}},
            ]
        }],
        "generationConfig": {
            "temperature": 0.0,
            "maxOutputTokens": 32000,
        },
    }
    request = urllib.request.Request(
        f"{_GEMINI_API_ROOT}/v1beta/models/{_GEMINI_MODEL}:generateContent",
        data=json.dumps(payload).encode(),
        headers={"Content-Type": "application/json", "x-goog-api-key": api_key},
        method="POST",
    )
    t0 = time.time()
    with urllib.request.urlopen(request, timeout=300) as resp:
        result = json.loads(resp.read().decode())
    elapsed = time.time() - t0

    usage = result.get("usageMetadata", {})
    print(f" ✓ Gemini 3 Pro response v {elapsed:.0f}s "
          f"(in: {usage.get('promptTokenCount', 0)}, "
          f"out: {usage.get('candidatesTokenCount', 0)}, "
          f"thoughts: {usage.get('thoughtsTokenCount', 0)})", file=sys.stderr)
    return result


def _gemini_candidate_text(result):
    """Join the text parts of the first candidate; '' when there is none."""
    candidates = result.get("candidates") or []
    if not candidates or not isinstance(candidates[0], dict):
        return ""
    parts = (candidates[0].get("content") or {}).get("parts") or []
    return "".join(
        part.get("text") or ""
        for part in parts
        if isinstance(part, dict) and not part.get("thought")
    ).strip()


def _parse_gemini_json(text):
    """Parse the model's JSON answer, tolerating code fences and trailing commas.

    Returns the decoded object, or None (after logging) when it is not JSON.
    """
    if text.startswith("```"):
        # Shape: ```json\n...\n```
        lines = text.split("\n")[1:]
        if lines and lines[-1].rstrip() == "```":
            lines = lines[:-1]
        text = "\n".join(lines)

    try:
        return json.loads(text)
    except json.JSONDecodeError:
        pass

    # Most common model mistake: a comma right before a closing } or ].
    cleaned = re.sub(r",(\s*[}\]])", r"\1", text)
    try:
        parsed = json.loads(cleaned)
    except json.JSONDecodeError as e:
        print(f" ❌ Gemini JSON parse failed: {e}", file=sys.stderr)
        print(f" First 500 chars: {text[:500]}", file=sys.stderr)
        return None
    print(" ✓ Fixed trailing commas in Gemini JSON", file=sys.stderr)
    return parsed


def transcribe_with_gemini(audio_path, lang=None, filename_hint=None):
    """Transcribe a song with Gemini 3 Pro (fallback for cases where Scribe hallucinates).

    Author's notes carried over from the original docstring: Gemini gives
    correct lyrics for Slovenian, Croatian and other smaller languages and does
    not hallucinate during instrumental sections, but it is slow (~100 s per
    2 min of audio), more expensive ($0.20 vs $0.013) and its timestamps are
    sometimes off by 1-2 s.

    Args:
        audio_path: Path of the audio file to upload.
        lang: Optional language code passed to the model as a hint.
        filename_hint: Optional file name passed to the model as a hint.

    Returns:
        A dict with "language", "language_probability", "segments",
        "_provider", "_hallucination_count" and "_coverage_pct", or None on any
        failure (missing API key, unreadable file, HTTP error, unusable output).
    """
    api_key = os.environ.get("GEMINI_API_KEY")
    if not api_key:
        print(" ❌ Gemini fallback: GEMINI_API_KEY missing", file=sys.stderr)
        return None

    print(f"🧠 Gemini 3 Pro transcribing {audio_path}...", file=sys.stderr)
    # Reading happens before any network call; an unreadable file is reported
    # as None like every other failure instead of raising to the caller.
    try:
        with open(audio_path, "rb") as f:
            audio_bytes = f.read()
    except OSError as e:
        print(f" ❌ Gemini fallback: cannot read audio file: {e}", file=sys.stderr)
        return None
    if not audio_bytes:
        print(" ❌ Gemini fallback: audio file is empty", file=sys.stderr)
        return None
    print(f" 📦 Audio size: {len(audio_bytes) / 1024 / 1024:.1f} MB", file=sys.stderr)

    suffix = os.path.splitext(os.fspath(audio_path))[1].lower()
    mime_type = _GEMINI_AUDIO_MIME_TYPES.get(suffix, "audio/mp3")

    try:
        # 1. Upload the audio through the Files API (resumable protocol).
        file_uri = _gemini_upload_audio(api_key, audio_bytes, mime_type)
        print(" ✓ Uploaded to Gemini Files API", file=sys.stderr)
        # Short pause so the uploaded file can finish processing.
        time.sleep(2)

        # 2. Generate the transcript.
        result = _gemini_generate(
            api_key, _gemini_prompt(lang, filename_hint), file_uri, mime_type
        )

        # 3. Extract and parse the JSON answer.
        candidate_text = _gemini_candidate_text(result)
        if not candidate_text:
            print(f" ❌ Gemini returned no text "
                  f"(promptFeedback: {result.get('promptFeedback')})", file=sys.stderr)
            return None

        parsed = _parse_gemini_json(candidate_text)
        if parsed is None:
            return None

        raw_segments = parsed.get("segments") if isinstance(parsed, dict) else None
        segments = _normalize_gemini_segments(raw_segments)
        if not segments:
            print(" ❌ Gemini returned no segments", file=sys.stderr)
            return None

        # A null/empty "language" falls back to the caller's hint.
        detected_lang = parsed.get("language") or lang or "unknown"
        hallucination_count, coverage_pct = _gemini_quality_stats(segments)

        total_words = sum(len(s["words"]) for s in segments)
        print(f" ✅ Gemini 3 Pro: {total_words} words → {len(segments)} segments, "
              f"lang={detected_lang}, coverage={coverage_pct:.0f}%", file=sys.stderr)

        return {
            "language": detected_lang,
            "language_probability": 0.95,
            "segments": segments,
            "_provider": "gemini-3-pro",
            "_hallucination_count": hallucination_count,
            "_coverage_pct": coverage_pct,
        }

    except urllib.error.HTTPError as e:
        err_body = e.read().decode(errors="replace")[:500]
        print(f" ❌ Gemini HTTP {e.code}: {err_body}", file=sys.stderr)
        return None
    except Exception as e:
        # Best-effort fallback provider: report and let the dispatcher decide.
        print(f" ❌ Gemini fallback exception: {e}", file=sys.stderr)
        import traceback
        traceback.print_exc(file=sys.stderr)
        return None


def _scribe_quality(result):
    """Return (hallucination_count, coverage_pct) recorded on a Scribe/Gemini result."""
    return result.get("_hallucination_count", 0), result.get("_coverage_pct", 100)


def _transcribe_scribe_with_retry(audio_path, lang, filename_hint):
    """Run Scribe once, retrying a single time when quality looks poor.

    Returns the better of the two results, or None when Scribe produced no
    segments on the first attempt.
    """
    result = transcribe_with_elevenlabs(audio_path, lang=lang, filename_hint=filename_hint)
    if not (result and result.get("segments")):
        return None

    hall_count, cov_pct = _scribe_quality(result)
    if hall_count > 0 or cov_pct < 50:
        print(f" 🔄 Halucinacija/nizko pokritje ({cov_pct:.0f}%, "
              f"{hall_count} hallucination segs) — RETRY Scribe...", file=sys.stderr)
        result2 = transcribe_with_elevenlabs(audio_path, lang=lang, filename_hint=filename_hint)
        if result2 and result2.get("segments"):
            h2, c2 = _scribe_quality(result2)
            if h2 < hall_count or c2 > cov_pct:
                print(f" ✅ Retry boljši: pokritje {cov_pct:.0f}% → {c2:.0f}%",
                      file=sys.stderr)
                result = result2
    return result


def _transcribe_hybrid(audio_path, lang, filename_hint, has_gemini):
    """Scribe first (with one retry); Gemini when Scribe fails or stays poor.

    Returns the best available result, or None when nothing was transcribed.
    """
    result = transcribe_with_elevenlabs(audio_path, lang=lang, filename_hint=filename_hint)

    if not (result and result.get("segments")):
        # Scribe failed outright: go straight to Gemini when it is available.
        if not has_gemini:
            return None
        print(" 🔄 Scribe failed → Gemini 3 Pro", file=sys.stderr)
        return transcribe_with_gemini(audio_path, lang=lang, filename_hint=filename_hint)

    hall_count, cov_pct = _scribe_quality(result)

    # Quality gate: a clean Scribe result is returned as-is.
    if hall_count == 0 and cov_pct >= 50:
        print(f" ✅ Scribe OK (coverage {cov_pct:.0f}%) — no fallback needed",
              file=sys.stderr)
        return result

    # Hallucination or low coverage: give Scribe one more try before Gemini.
    print(f" ⚠️ Scribe quality issues (coverage {cov_pct:.0f}%, "
          f"{hall_count} halu) — RETRY Scribe...", file=sys.stderr)
    result2 = transcribe_with_elevenlabs(audio_path, lang=lang, filename_hint=filename_hint)
    if result2 and result2.get("segments"):
        h2, c2 = _scribe_quality(result2)
        if h2 == 0 and c2 >= 50:
            print(f" ✅ Scribe retry uspel: coverage {cov_pct:.0f}% → {c2:.0f}%",
                  file=sys.stderr)
            return result2
        # Still poor; keep whichever Scribe run looks better.
        if h2 < hall_count or c2 > cov_pct:
            result, hall_count, cov_pct = result2, h2, c2

    gemini_result = None
    if has_gemini:
        print(f" 🔄 Scribe še vedno slab (coverage {cov_pct:.0f}%, "
              f"{hall_count} halu) — switching na Gemini 3 Pro...", file=sys.stderr)
        gemini_result = transcribe_with_gemini(audio_path, lang=lang, filename_hint=filename_hint)

    if not (gemini_result and gemini_result.get("segments")):
        print(" ⚠️ Gemini fallback ni dosegljiv — vrnem Scribe rezultat",
              file=sys.stderr)
        return result

    g_hall, g_cov = _scribe_quality(gemini_result)
    # Take whichever transcript scores better.
    if g_hall < hall_count or g_cov > cov_pct:
        print(f" ✅ Gemini boljši: coverage {cov_pct:.0f}% → {g_cov:.0f}%, "
              f"hallu {hall_count} → {g_hall}", file=sys.stderr)
        return gemini_result
    print(" ⚠️ Gemini ni boljši, ohrani Scribe", file=sys.stderr)
    return result


def transcribe_full(audio_path, lang=None, model_size="small", provider="auto", filename_hint=None):
    """Dispatch transcription to Scribe, Gemini or local Whisper.

    provider:
      - "elevenlabs" → Scribe only (one automatic retry); local Whisper when
        ELEVENLABS_API_KEY is not set
      - "gemini"     → Gemini 3 Pro only
      - "local"      → faster-whisper on CPU
      - "hybrid"     → Scribe first, Gemini when Scribe fails or hallucinates;
        Gemini only when ELEVENLABS_API_KEY is not set
      - "auto"       → "hybrid" when both API keys are set, "elevenlabs" when
        only ELEVENLABS_API_KEY is set, otherwise "local". If the cloud
        providers produce nothing, auto mode falls back to local Whisper.
      - any other value is handled like "local"

    filename_hint: file name, forwarded to the providers as a language hint
    when lang is None.

    Returns a transcript dict; explicitly requested cloud providers that
    produce nothing yield {"language": "unknown", "language_probability": 0.0,
    "segments": []}.
    """
    has_scribe = bool(os.environ.get("ELEVENLABS_API_KEY"))
    has_gemini = bool(os.environ.get("GEMINI_API_KEY"))

    # Remember that the caller asked for "auto": only then is a silent local
    # fallback acceptable after the cloud providers fail.
    auto_mode = provider == "auto"
    if auto_mode:
        if has_scribe:
            provider = "hybrid" if has_gemini else "elevenlabs"
        else:
            provider = "local"

    if provider == "hybrid" and not has_scribe:
        print(" ⚠️ Hybrid mode but ELEVENLABS_API_KEY missing — switching to gemini",
              file=sys.stderr)
        provider = "gemini"

    if provider == "hybrid":
        print("🎯 HYBRID mode: Scribe primary, Gemini fallback", file=sys.stderr)
        result = _transcribe_hybrid(audio_path, lang, filename_hint, has_gemini)
    elif provider == "gemini":
        if not has_gemini:
            print(" ❌ provider=gemini ampak GEMINI_API_KEY missing", file=sys.stderr)
            return _empty_transcript()
        result = transcribe_with_gemini(audio_path, lang=lang, filename_hint=filename_hint)
    elif provider == "elevenlabs" and has_scribe:
        result = _transcribe_scribe_with_retry(audio_path, lang, filename_hint)
        if not result and not auto_mode:
            print(" ⚠️ Scribe failed, no fallback (provider=elevenlabs)", file=sys.stderr)
    else:
        # "local", unknown providers, and "elevenlabs" without an API key.
        return _transcribe_full_local(audio_path, lang=lang, model_size=model_size)

    if result and result.get("segments"):
        return result

    if auto_mode:
        print(" 🔄 Cloud transcription failed, fallback na local Whisper...", file=sys.stderr)
        return _transcribe_full_local(audio_path, lang=lang, model_size=model_size)
    return _empty_transcript()