def _soniox_tokens_to_words(tokens, default_lang):
    """Merge Soniox sub-word tokens into whole words.

    A token whose text starts with a space opens a new word; a token without
    a leading space (sub-word piece, punctuation) is glued onto the word in
    progress, e.g. "Del" + " neb" + "a" -> "Del", " neba". A token with empty
    text closes the word in progress without opening a new one.

    Args:
        tokens: list of token dicts with optional ``text``, ``start_ms``,
            ``end_ms`` and ``language`` keys.
        default_lang: language stored on a word when its first token has no
            ``language`` key.

    Returns:
        List of dicts with ``text`` (not yet stripped), ``start`` and ``end``
        in seconds, and ``language``. Whitespace-only words are dropped.
    """
    words = []
    current = None
    for tok in tokens:
        # ``or`` also covers keys that are present but null in the JSON.
        text = tok.get('text') or ''
        start_s = (tok.get('start_ms') or 0) / 1000
        end_s = (tok.get('end_ms') or 0) / 1000

        if not text or text.startswith(' '):
            # Word boundary: emit the word in progress unless it is blank.
            if current and current['text'].strip():
                words.append(current)
            current = None
            if not text:
                continue

        if current is None:
            current = {'text': text, 'start': start_s, 'end': end_s,
                       'language': tok.get('language', default_lang)}
        else:
            current['text'] += text
            current['end'] = end_s

    if current and current['text'].strip():
        words.append(current)
    return words


def _soniox_words_to_segments(words, max_gap_s=0.6, min_words=3):
    """Group words into segments, splitting on pauses.

    A new segment starts when the silence between the current segment's end
    and the next word exceeds ``max_gap_s`` AND the current segment already
    holds at least ``min_words`` words (so short fragments are not split off).

    Returns:
        List of dicts with ``start``, ``end``, ``text`` and ``words`` (each
        word a dict with ``start``, ``end`` and stripped ``text``).
    """
    segments = []
    current = None
    for w in words:
        entry = {'start': w['start'], 'end': w['end'], 'text': w['text'].strip()}
        if current is not None:
            gap = w['start'] - current['end']
            if gap > max_gap_s and len(current['words']) >= min_words:
                segments.append(current)
                current = None
        if current is None:
            current = {'start': entry['start'], 'end': entry['end'],
                       'text': entry['text'], 'words': [entry]}
        else:
            current['end'] = entry['end']
            current['text'] = (current['text'] + ' ' + entry['text']).strip()
            current['words'].append(entry)
    if current is not None:
        segments.append(current)
    return segments


def transcribe_with_soniox(audio_path, lang=None, filename_hint=None, *,
                           poll_timeout_s=180, poll_interval_s=2):
    """Transcribe an audio file with the Soniox ``stt-async-v4`` model.

    Workflow: upload the file, create a transcription job, poll until it
    completes, fetch the token-level transcript, then convert it into the
    segment/word structure returned below. The uploaded file and the
    transcription job are deleted afterwards on a best-effort basis.

    Args:
        audio_path: path of the audio file to upload.
        lang: optional language code. When given it is the only language
            hint sent and the fallback for the reported language.
        filename_hint: optional original file name. When ``lang`` is not
            given and the name contains one of a few Slovenian music
            keywords, the hint is narrowed to ``["sl"]``; otherwise a fixed
            multilingual hint list is sent.
        poll_timeout_s: give up when the job has not completed after this
            many seconds of polling.
        poll_interval_s: pause between status requests, in seconds.

    Returns:
        ``None`` on any failure (missing ``SONIOX_API_KEY``, HTTP error, job
        error, timeout, empty transcript, unexpected exception). Otherwise a
        dict with ``language``, ``language_probability`` (fixed 0.95),
        ``segments``, ``_provider``, ``_hallucination_count`` (always 0) and
        ``_coverage_pct``.

    Note:
        ``_coverage_pct`` is measured against the end of the last segment,
        not against the real audio duration, so a transcript that stops
        early can still report high coverage.
    """
    import mimetypes
    import urllib.error
    import urllib.request
    import uuid
    from collections import Counter

    api_key = os.environ.get("SONIOX_API_KEY")
    if not api_key:
        print(" ❌ SONIOX_API_KEY missing", file=sys.stderr)
        return None

    base_url = "https://api.soniox.com"
    print(f"🎤 Soniox stt-async-v4 transcribing {audio_path}...", file=sys.stderr)

    # Remembered so the ``finally`` block can delete whatever was created.
    file_id = None
    trans_id = None

    def api_call(method, path, data=None, headers=None, timeout=120):
        """Send an authenticated request and return the decoded JSON body.

        A dict ``data`` is sent as JSON; bytes are sent unchanged. An empty
        response body yields ``{}``.
        """
        headers = dict(headers or {})
        headers['Authorization'] = f'Bearer {api_key}'
        if isinstance(data, dict):
            data = json.dumps(data).encode()
            headers['Content-Type'] = 'application/json'
        req = urllib.request.Request(f"{base_url}{path}", data=data,
                                     headers=headers, method=method)
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            content = resp.read().decode()
        return json.loads(content) if content else {}

    try:
        # 1. Upload file (multipart/form-data built by hand, stdlib only).
        path_str = os.fspath(audio_path)
        ext = os.path.splitext(path_str)[1].lower()
        mime = mimetypes.guess_type(path_str)[0]
        if not mime or not mime.startswith(("audio/", "video/")):
            mime = "audio/mpeg"  # previous fixed value, kept as the fallback
        # Only a plain alphanumeric extension is echoed into the header.
        upload_name = f"audio{ext}" if ext[1:].isalnum() else "audio.mp3"
        # A random boundary cannot realistically collide with the payload.
        boundary = f"----SonioxUpload{uuid.uuid4().hex}"

        with open(audio_path, 'rb') as f:
            audio_bytes = f.read()
        body = b''.join([
            f"--{boundary}\r\n".encode(),
            f'Content-Disposition: form-data; name="file"; filename="{upload_name}"\r\n'.encode(),
            f'Content-Type: {mime}\r\n\r\n'.encode(),
            audio_bytes,
            f"\r\n--{boundary}--\r\n".encode(),
        ])
        file_data = api_call(
            "POST", "/v1/files", data=body,
            headers={'Content-Type': f'multipart/form-data; boundary={boundary}'},
        )
        file_id = file_data['id']
        size_mb = len(audio_bytes) / 1024 / 1024
        print(f" ✓ Uploaded {size_mb:.1f}MB → file_id={file_id}", file=sys.stderr)

        # 2. Create transcription
        config = {
            "model": "stt-async-v4",
            "file_id": file_id,
            "enable_language_identification": True,
        }
        # Language hints: explicit parameter first, then file-name keywords.
        if lang:
            config["language_hints"] = [lang]
        else:
            fn_lower = (filename_hint or "").lower()
            slovenian_keywords = ("ansambel", "avsenik", "fehtar", "modrijan",
                                  "polka", "valček", "slovensk")
            if any(k in fn_lower for k in slovenian_keywords):
                config["language_hints"] = ["sl"]
            else:
                # Multilingual default.
                config["language_hints"] = ["en", "sl", "de", "hr", "es", "fr", "it"]

        trans_data = api_call("POST", "/v1/transcriptions", data=config)
        trans_id = trans_data['id']
        print(f" ✓ Transcription started: {trans_id}", file=sys.stderr)

        # 3. Poll status (monotonic clock: immune to wall-clock adjustments).
        t0 = time.monotonic()
        while True:
            status_data = api_call("GET", f"/v1/transcriptions/{trans_id}")
            status = status_data.get('status', 'unknown')
            elapsed = time.monotonic() - t0
            if status == "completed":
                print(f" ✓ Completed in {elapsed:.0f}s", file=sys.stderr)
                break
            if status == "error":
                print(f" ❌ Soniox error: {status_data.get('error_message', '?')}", file=sys.stderr)
                return None
            if elapsed > poll_timeout_s:
                print(f" ⚠️ Timeout ({poll_timeout_s:.0f}s)", file=sys.stderr)
                return None
            time.sleep(poll_interval_s)

        # 4. Get transcript
        transcript_data = api_call("GET", f"/v1/transcriptions/{trans_id}/transcript")

        # Convert the Soniox token list into the segments + words format.
        tokens = transcript_data.get('tokens') or []
        if not tokens:
            print(" ❌ Empty transcript", file=sys.stderr)
            return None

        words = _soniox_tokens_to_words(tokens, lang or 'sl')
        segments = _soniox_words_to_segments(words)

        # Reported language: most frequent per-token language, else fallback.
        lang_counts = Counter(tok.get('language') for tok in tokens if tok.get('language'))
        detected_lang = max(lang_counts, key=lang_counts.get) if lang_counts else (lang or 'sl')

        # Coverage stats, same keys as the other providers report.
        total_dur = max((s['end'] for s in segments), default=0)
        coverage = sum(s['end'] - s['start'] for s in segments)
        coverage_pct = (coverage / total_dur * 100) if total_dur else 0

        total_words = sum(len(s.get('words', [])) for s in segments)
        full_text = transcript_data.get('text') or ''
        print(f" ✅ Soniox: {total_words} words → {len(segments)} segments, "
              f"lang={detected_lang}, coverage={coverage_pct:.0f}%", file=sys.stderr)
        print(f" 📝 First 200 chars: {full_text[:200]!r}", file=sys.stderr)

        return {
            "language": detected_lang,
            "language_probability": 0.95,
            "segments": segments,
            "_provider": "soniox",
            "_hallucination_count": 0,  # no hallucination detection for this provider
            "_coverage_pct": coverage_pct,
        }

    except urllib.error.HTTPError as e:
        err_body = e.read().decode(errors='replace')[:500] if hasattr(e, 'read') else ''
        print(f" ❌ Soniox HTTP {e.code}: {err_body}", file=sys.stderr)
        return None
    except Exception as e:
        print(f" ❌ Soniox exception: {e}", file=sys.stderr)
        import traceback
        traceback.print_exc(file=sys.stderr)
        return None
    finally:
        # Best-effort cleanup: delete the job first, then the uploaded file.
        # The response body is not inspected, but the response is closed.
        cleanup_paths = []
        if trans_id:
            cleanup_paths.append(f"/v1/transcriptions/{trans_id}")
        if file_id:
            cleanup_paths.append(f"/v1/files/{file_id}")
        for path in cleanup_paths:
            try:
                req = urllib.request.Request(
                    f"{base_url}{path}",
                    headers={'Authorization': f'Bearer {api_key}'},
                    method='DELETE',
                )
                with urllib.request.urlopen(req, timeout=10):
                    pass
            except Exception as cleanup_err:
                # Must never mask the transcription result; just report it.
                print(f" ⚠️ Soniox cleanup failed for {path}: {cleanup_err}", file=sys.stderr)
+ else: + provider = "local" - # ─── HYBRID: Scribe primary, Gemini fallback ─── + # ─── SONIOX CHAIN: Soniox primary, Scribe/Gemini fallback ─── + if provider == "soniox_chain": + print(f"🎯 Provider chain: Soniox → Scribe → Gemini", file=sys.stderr) + result = transcribe_with_soniox(audio_path, lang=lang, filename_hint=filename_hint) + + if result and result.get("segments"): + cov = result.get("_coverage_pct", 100) + hall = result.get("_hallucination_count", 0) + if cov >= 50 and hall == 0: + return result + print(f" ⚠️ Soniox sumljiv (coverage {cov:.0f}%, hall {hall}) — try fallback", file=sys.stderr) + else: + print(f" ❌ Soniox failed → fallback", file=sys.stderr) + + # Fallback 1: Scribe + if has_scribe: + print(f" 🔄 Fallback to Scribe...", file=sys.stderr) + result2 = transcribe_with_elevenlabs(audio_path, lang=lang, filename_hint=filename_hint) + if result2 and result2.get("segments"): + cov = result2.get("_coverage_pct", 100) + hall = result2.get("_hallucination_count", 0) + if cov >= 50 and hall == 0: + return result2 + # ohrani za primerjavo + result = result2 if not result else result + + # Fallback 2: Gemini (samo če sve doslej slabe) + if has_gemini: + print(f" 🔄 Fallback to Gemini 3 Pro (last resort)...", file=sys.stderr) + result3 = transcribe_with_gemini(audio_path, lang=lang, filename_hint=filename_hint) + if result3 and result3.get("segments"): + return result3 + + # Vrni karkoli imamo + return result or {"language": "unknown", "language_probability": 0.0, "segments": []} + + # ─── SONIOX ONLY ─── + if provider == "soniox": + if not has_soniox: + return {"language": "unknown", "language_probability": 0.0, "segments": []} + result = transcribe_with_soniox(audio_path, lang=lang, filename_hint=filename_hint) + return result or {"language": "unknown", "language_probability": 0.0, "segments": []} + + # ─── HYBRID (legacy): Scribe primary, Gemini fallback ─── if provider == "hybrid": if not has_scribe: - print(f" ⚠️ Hybrid mode but ELEVENLABS_API_KEY 
missing — switching to gemini", file=sys.stderr) provider = "gemini" else: # Try Scribe first @@ -1588,9 +1845,12 @@ def main(): ap.add_argument("--filename-hint", default=None, help="Originalno ime datoteke (Claude lahko prepozna pesem)") ap.add_argument("--whisper-provider", default="auto", - choices=["auto", "elevenlabs", "local"], - help="STT provider: elevenlabs=ElevenLabs Scribe (najboljša kvaliteta, $0.40/h), " - "local=faster-whisper CPU (brezplačno, halucinacije), auto=Scribe če key, sicer local") + choices=["auto", "soniox", "elevenlabs", "local", "hybrid", "gemini"], + help="STT provider: " + "soniox=Soniox stt-async-v4 ($0.10/h, 5-15s, najboljši za NZ, PRIPOROČENO), " + "elevenlabs=Scribe ($0.40/h, halucinacije pri NZ), " + "gemini=Gemini 3 Pro ($3-5/h, počasen), " + "auto=Soniox primary + fallback chain (PRIVZETO)") ap.add_argument("--json", action="store_true", help="Output JSON") ap.add_argument("--output", help="Path za JSON output") args = ap.parse_args()