# ISO 639-1 -> ISO 639-3 codes. Requests are sent with the 3-letter form and
# the reverse table converts the detected language back to the 2-letter form
# used by the rest of the pipeline.
_LANG_1_TO_3 = {
    "en": "eng", "de": "deu", "sl": "slv", "hr": "hrv", "bs": "bos",
    "sr": "srp", "it": "ita", "es": "spa", "fr": "fra", "pt": "por",
    "ru": "rus", "pl": "pol", "cs": "ces", "sk": "slk", "hu": "hun",
    "ro": "ron", "nl": "nld", "sv": "swe", "no": "nor", "da": "dan",
    "fi": "fin", "tr": "tur", "ar": "ara", "uk": "ukr", "bg": "bul",
    "el": "ell", "he": "heb", "ja": "jpn", "ko": "kor", "zh": "zho",
    # Codes for which "first two letters of the 3-letter code" is wrong or
    # collides with another language (lit -> li, lav -> la, est -> es).
    "lt": "lit", "lv": "lav", "et": "est",
    "mk": "mkd", "sq": "sqi", "ca": "cat",
}
_LANG_3_TO_1 = {code3: code1 for code1, code3 in _LANG_1_TO_3.items()}

_SCRIBE_URL = "https://api.elevenlabs.io/v1/speech-to-text"
# Upload cap enforced by this code before contacting the API.
# NOTE(review): the original comment cited "~25 MB / 4.5h per request" as the
# service limit -- confirm against the current ElevenLabs documentation.
_SCRIBE_MAX_BYTES = 24 * 1024 * 1024


def _num(value, default):
    """Return *value* if it is a real number, otherwise *default*."""
    if isinstance(value, (int, float)) and not isinstance(value, bool):
        return value
    return default


def _encode_multipart(boundary, fields, file_field, filename, content, ctype):
    """Build a multipart/form-data body from text fields plus one file part."""
    chunks = [
        (
            f"--{boundary}\r\nContent-Disposition: form-data; "
            f'name="{name}"\r\n\r\n{value}\r\n'
        ).encode()
        for name, value in fields
    ]
    chunks.append(
        (
            f"--{boundary}\r\nContent-Disposition: form-data; "
            f'name="{file_field}"; filename="{filename}"\r\n'
            f"Content-Type: {ctype}\r\n\r\n"
        ).encode() + content + b"\r\n"
    )
    chunks.append(f"--{boundary}--\r\n".encode())
    return b"".join(chunks)


def _group_words_into_segments(words, max_pause=0.6, max_duration=6.0):
    """Group a flat word list into segments.

    A segment is closed after a word when the gap to the next word is at
    least *max_pause* seconds, when the segment already spans at least
    *max_duration* seconds, or when the word is the last one.

    Entries that are not dicts or whose text is empty/whitespace are dropped.
    Missing or null timestamps fall back to 0 for ``start`` and to the word's
    own ``start`` for ``end`` so the arithmetic below cannot fail.
    """
    cleaned = []
    for raw in words or []:
        if not isinstance(raw, dict):
            continue
        text = raw.get("text") or ""
        if not text.strip():
            continue
        start = _num(raw.get("start"), 0)
        cleaned.append({
            "start": start,
            "end": _num(raw.get("end"), start),
            "text": text,
        })

    segments = []
    current = []
    last_index = len(cleaned) - 1
    for idx, word in enumerate(cleaned):
        current.append(word)
        if idx < last_index:
            pause = cleaned[idx + 1]["start"] - word["end"]
            duration = word["end"] - current[0]["start"]
            if pause < max_pause and duration < max_duration:
                continue  # keep accumulating words into this segment
        segments.append({
            "start": current[0]["start"],
            "end": word["end"],
            "text": " ".join(w["text"] for w in current).strip(),
            "words": current,
        })
        current = []
    return segments


def transcribe_with_elevenlabs(audio_path, lang=None, model="scribe_v1"):
    """Transcribe an audio file with the ElevenLabs Scribe speech-to-text API.

    Args:
        audio_path: Path of the audio file to upload.
        lang: Optional language code. Known ISO 639-1 codes ('de', 'sl',
            'hr', ...) are converted to ISO 639-3; anything else is sent
            unchanged. ``None`` lets the service detect the language.
        model: Value sent as the ``model_id`` form field.

    Returns:
        A dict with keys ``language``, ``language_probability``, ``segments``
        and ``_provider`` ("elevenlabs"), or ``None`` on any failure (missing
        ELEVENLABS_API_KEY, unreadable or oversized file, HTTP/network error,
        malformed response). ``None`` is the signal callers use to fall back
        to another provider; the reason is printed to stderr.

    NOTE(review): the original docstring quoted pricing of ~$0.40/h -- verify
    against current ElevenLabs pricing before relying on it.
    """
    import urllib.request
    import urllib.error
    import uuid

    api_key = os.environ.get("ELEVENLABS_API_KEY")
    if not api_key:
        print(" ⚠️ ELEVENLABS_API_KEY ni nastavljen", file=sys.stderr)
        return None

    # Check the size on disk first so an oversized file is never loaded into
    # memory, and treat an unreadable file like every other failure here.
    try:
        size = os.path.getsize(audio_path)
        if size > _SCRIBE_MAX_BYTES:
            print(f" ⚠️ Audio {size/1024/1024:.1f} MB > 24 MB limit, fallback",
                  file=sys.stderr)
            return None
        with open(audio_path, "rb") as f:
            audio_content = f.read()
    except OSError as e:
        print(f" ❌ Scribe: cannot read audio: {e}", file=sys.stderr)
        return None

    fields = [
        ("model_id", model),
        ("timestamps_granularity", "word"),
        ("tag_audio_events", "false"),
    ]
    if lang:
        fields.append(("language_code", _LANG_1_TO_3.get(lang, lang)))

    boundary = uuid.uuid4().hex
    body = _encode_multipart(
        boundary, fields, "file", "audio.mp3", audio_content, "audio/mpeg"
    )

    print(f" 📡 ElevenLabs Scribe ({model}, {len(audio_content)/1024/1024:.1f} MB, "
          f"lang={lang or 'auto'})...", file=sys.stderr)

    req = urllib.request.Request(
        _SCRIBE_URL,
        data=body,
        headers={
            "xi-api-key": api_key,
            "Content-Type": f"multipart/form-data; boundary={boundary}",
        },
    )

    try:
        with urllib.request.urlopen(req, timeout=300) as resp:
            data = json.loads(resp.read().decode())
    except urllib.error.HTTPError as e:
        body_err = e.read().decode("utf-8", errors="replace")[:500]
        print(f" ❌ Scribe HTTP {e.code}: {body_err}", file=sys.stderr)
        return None
    except Exception as e:
        # Deliberately broad: any transport or decoding problem must turn into
        # the None "use another provider" signal rather than abort the job.
        print(f" ❌ Scribe exception: {e}", file=sys.stderr)
        return None

    if not isinstance(data, dict):
        print(" ❌ Scribe: unexpected response format", file=sys.stderr)
        return None

    # Map the detected language back to a 2-letter code. For codes missing
    # from the table the first two letters are a best-effort guess; a missing
    # or null code is reported as "unknown" rather than truncated to "un".
    raw_lang = data.get("language_code")
    if isinstance(raw_lang, str) and raw_lang:
        detected_lang = _LANG_3_TO_1.get(raw_lang, raw_lang[:2])
    else:
        detected_lang = "unknown"
    detected_prob = _num(data.get("language_probability"), 1.0)

    # The response carries a flat word list; downstream code expects segments.
    words = data.get("words") or []
    segments = _group_words_into_segments(words)

    print(f" ✅ Scribe: {len(words)} words → {len(segments)} segments, "
          f"lang={detected_lang} (p={detected_prob:.2f})", file=sys.stderr)

    return {
        "language": detected_lang,
        "language_probability": float(detected_prob),
        "segments": segments,
        "_provider": "elevenlabs",
    }


def transcribe_full(audio_path, lang=None, model_size="small", provider="auto"):
    """Dispatch transcription to ElevenLabs Scribe or local faster-whisper.

    Args:
        audio_path: Path of the audio file to transcribe.
        lang: Optional language code; ``None`` means auto-detect.
        model_size: Model size passed to the local Whisper backend.
        provider:
            - "elevenlabs": Scribe only. If Scribe fails, returns no segments,
              or ELEVENLABS_API_KEY is missing, an empty transcript is
              returned and local Whisper is NOT used.
            - "local": local faster-whisper only.
            - "auto": Scribe when ELEVENLABS_API_KEY is set, falling back to
              local Whisper on failure or when Scribe returns no segments;
              local Whisper otherwise.

    Returns:
        A transcript dict with at least ``language``, ``language_probability``
        and ``segments``.
    """
    has_key = bool(os.environ.get("ELEVENLABS_API_KEY"))
    # An explicit "elevenlabs" request always goes through the Scribe path so
    # that a missing key is handled like any other Scribe failure (strict, no
    # silent switch to local Whisper). "auto" only tries Scribe with a key.
    if provider == "elevenlabs" or (provider == "auto" and has_key):
        result = transcribe_with_elevenlabs(audio_path, lang=lang)
        if result and result.get("segments"):
            return result
        if provider == "elevenlabs":
            print(" ⚠️ Scribe failed, no fallback (provider=elevenlabs)", file=sys.stderr)
            return {"language": "unknown", "language_probability": 0.0, "segments": []}
        print(" 🔄 Scribe failed, fallback na local Whisper...", file=sys.stderr)

    # Local faster-whisper
    return _transcribe_full_local(audio_path, lang=lang, model_size=model_size)
Vrne empty transcript če Whisper ne najde govora (popolnoma instrumental).""" from faster_whisper import WhisperModel - print(f"🧠 Whisper {model_size}, lang={lang or 'auto'}", file=sys.stderr) + print(f"🧠 Whisper LOCAL {model_size}, lang={lang or 'auto'}", file=sys.stderr) m = WhisperModel(model_size, device="cpu", compute_type="int8") # Auto-detect z 3-sample voting da se zaklenemo na en jezik @@ -788,6 +961,10 @@ def main(): help="Specifičen model (npr. claude-sonnet-4-6, gemini-3.1-pro-preview)") ap.add_argument("--filename-hint", default=None, help="Originalno ime datoteke (Claude lahko prepozna pesem)") + ap.add_argument("--whisper-provider", default="auto", + choices=["auto", "elevenlabs", "local"], + help="STT provider: elevenlabs=ElevenLabs Scribe (najboljša kvaliteta, $0.40/h), " + "local=faster-whisper CPU (brezplačno, halucinacije), auto=Scribe če key, sicer local") ap.add_argument("--json", action="store_true", help="Output JSON") ap.add_argument("--output", help="Path za JSON output") args = ap.parse_args() @@ -806,7 +983,10 @@ def main(): try: # 2. Whisper transcript lang = None if args.lang in (None, "auto", "") else args.lang - transcript = transcribe_full(audio, lang=lang, model_size=args.model) + transcript = transcribe_full( + audio, lang=lang, model_size=args.model, + provider=args.whisper_provider, + ) print(f" Transkripcija: {len(transcript['segments'])} segmentov", file=sys.stderr) # 3. Energy profile