diff --git a/app/main.py b/app/main.py
index b47da6b..f5dc2d7 100644
--- a/app/main.py
+++ b/app/main.py
@@ -247,6 +247,9 @@ def process_job(job_id):
             # Brez extension
             fn_hint = Path(job["filename"]).stem
             cmd += ["--filename-hint", fn_hint]
+        # Whisper provider (groq = 200x faster than local)
+        if job.get("whisper_provider"):
+            cmd += ["--whisper-provider", job["whisper_provider"]]
         # lang: če None ali 'auto', pusti analyze.py auto-detect
         if job.get("lang") and job["lang"] not in ("auto", ""):
             cmd += ["--lang", job["lang"]]
@@ -500,6 +503,8 @@ class StartJobIn(BaseModel):
     # LLM za semantično analizo + popravke
     llm_provider: str = "claude"  # claude / gemini / auto
     llm_model: Optional[str] = None  # specifičen model (privzeto najboljši za provider)
+    # Whisper provider (Groq is 200x faster than local CPU faster-whisper)
+    whisper_provider: str = "auto"  # auto / groq / local
 
 
 # ────────────────────────────────────────────────────────────────
@@ -605,6 +610,7 @@ async def start_processing(
         quality=payload.quality,
         llm_provider=payload.llm_provider,
         llm_model=payload.llm_model,
+        whisper_provider=payload.whisper_provider,
         current_step="V vrsti za obdelavo",
         # Počisti pretekle napake (retry-friendly)
         chorus_error=None,
diff --git a/scripts/analyze.py b/scripts/analyze.py
index ffd4e4b..2e77bd1 100644
--- a/scripts/analyze.py
+++ b/scripts/analyze.py
@@ -46,13 +46,181 @@ def extract_audio(video_path):
     return audio.name
 
 
-def transcribe_full(audio_path, lang=None, model_size="small"):
-    """Whisper transcript celega avdia. lang=None → robust auto-detect.
+def transcribe_with_groq(audio_path, lang=None, model="whisper-large-v3-turbo"):
+    """Transcribe audio through the Groq Whisper API.
 
-    Vrne empty transcript če Whisper ne najde govora (popolnoma instrumental)."""
+    216x realtime speed factor — 30s audio = ~0.5s transcribe time.
+    Cost: $0.04/h (turbo) or $0.111/h (large-v3).
+
+    Args:
+        audio_path: Path of the audio file; it is uploaded as ``audio.wav``.
+        lang: Language code sent to the API; omitted when falsy.
+        model: Groq model name.
+
+    Returns:
+        A dict with ``language``, ``language_probability`` and ``segments``
+        (each segment has ``start``, ``end``, ``text`` and ``words``), or
+        ``None`` when GROQ_API_KEY is unset, the file cannot be read or is
+        larger than 24 MB, or the request fails.
+    """
+    import urllib.request
+    import urllib.error
+    import uuid
+
+    api_key = os.environ.get("GROQ_API_KEY")
+    if not api_key:
+        print(" ⚠️ GROQ_API_KEY ni nastavljen", file=sys.stderr)
+        return None
+
+    # Check the size on disk before reading so an oversized file is never
+    # loaded into memory. Groq caps API requests at 25 MB; the check uses
+    # 24 MB (presumably headroom for the multipart envelope).
+    try:
+        file_size = os.path.getsize(audio_path)
+        if file_size > 24 * 1024 * 1024:
+            print(f" ⚠️ Audio file {file_size/1024/1024:.1f} MB > 24 MB limit, fallback na lokalno", file=sys.stderr)
+            return None
+        with open(audio_path, "rb") as f:
+            file_content = f.read()
+    except OSError as e:
+        # Report an unreadable file like any other Groq failure so the
+        # caller can apply its fallback policy instead of crashing.
+        print(f" ❌ Groq exception: {e}", file=sys.stderr)
+        return None
+
+    # Build the multipart/form-data body
+    boundary = uuid.uuid4().hex
+    parts = []
+
+    def add_text(name, value):
+        parts.append(
+            f"--{boundary}\r\nContent-Disposition: form-data; "
+            f"name=\"{name}\"\r\n\r\n{value}\r\n".encode()
+        )
+
+    def add_file(name, filename, content, content_type="application/octet-stream"):
+        parts.append(
+            f"--{boundary}\r\nContent-Disposition: form-data; "
+            f"name=\"{name}\"; filename=\"{filename}\"\r\n"
+            f"Content-Type: {content_type}\r\n\r\n".encode()
+            + content + b"\r\n"
+        )
+
+    add_file("file", "audio.wav", file_content, "audio/wav")
+    add_text("model", model)
+    add_text("response_format", "verbose_json")
+    add_text("temperature", "0.0")
+    add_text("timestamp_granularities[]", "segment")
+    add_text("timestamp_granularities[]", "word")
+    if lang:
+        add_text("language", lang)
+
+    parts.append(f"--{boundary}--\r\n".encode())
+    body = b"".join(parts)
+
+    req = urllib.request.Request(
+        "https://api.groq.com/openai/v1/audio/transcriptions",
+        data=body,
+        headers={
+            "Authorization": f"Bearer {api_key}",
+            "Content-Type": f"multipart/form-data; boundary={boundary}",
+            "User-Agent": "groq-python/0.11.0",  # Cloudflare bypass
+        },
+    )
+
+    print(f" 📡 Groq Whisper ({model}, {len(file_content)/1024/1024:.1f} MB)...", file=sys.stderr)
+    try:
+        with urllib.request.urlopen(req, timeout=180) as resp:
+            data = json.loads(resp.read().decode())
+    except urllib.error.HTTPError as e:
+        body_err = e.read().decode("utf-8", errors="replace")[:500]
+        print(f" ❌ Groq HTTP {e.code}: {body_err}", file=sys.stderr)
+        return None
+    except Exception as e:
+        print(f" ❌ Groq exception: {e}", file=sys.stderr)
+        return None
+
+    # Convert the Groq response into the same format as local Whisper.
+    # Groq returns the language as a name (e.g. "German", "Slovenian"), so
+    # map it to an ISO code. `or` also covers an explicit JSON null, which
+    # the .get() default would let through as None.
+    detected_lang = (data.get("language") or "unknown").lower()
+    LANG_MAP = {
+        "english": "en", "german": "de", "slovenian": "sl", "croatian": "hr",
+        "bosnian": "bs", "serbian": "sr", "italian": "it", "spanish": "es",
+        "french": "fr", "portuguese": "pt", "russian": "ru", "polish": "pl",
+        "czech": "cs", "slovak": "sk", "hungarian": "hu", "romanian": "ro",
+    }
+    detected_lang_iso = LANG_MAP.get(detected_lang, detected_lang[:2])
+
+    raw_segments = data.get("segments") or []
+    words = data.get("words") or []
+    segments = []
+    for idx, s in enumerate(raw_segments):
+        # Word-level timestamps (when available). A word that starts exactly
+        # on a boundary shared by two segments matches both inclusive range
+        # checks; attach it to the following segment only so it is not
+        # duplicated.
+        nxt = raw_segments[idx + 1] if idx + 1 < len(raw_segments) else None
+        words_in_segment = [
+            {"start": w["start"], "end": w["end"], "text": w["word"]}
+            for w in words
+            if s["start"] <= w["start"] <= s["end"]
+            and not (nxt is not None and nxt["start"] <= w["start"] <= nxt["end"])
+        ]
+        segments.append({
+            "start": s["start"],
+            "end": s["end"],
+            "text": s["text"].strip(),
+            "words": words_in_segment,
+        })
+
+    print(f" ✅ Groq: {len(segments)} segmentov, lang={detected_lang_iso}", file=sys.stderr)
+    return {
+        "language": detected_lang_iso,
+        "language_probability": 1.0,  # Groq does not return a confidence
+        "segments": segments,
+    }
+
+
+def transcribe_full(audio_path, lang=None, model_size="small", provider="local"):
+    """Transcribe the whole audio with Whisper.
+
+    provider:
+    - "local" → faster-whisper on CPU (slow but free)
+    - "groq" → Groq Whisper API (216x faster, $0.04/h)
+    - "auto" → try Groq, fall back to local if the API key is missing
+
+    Returns an empty transcript if Whisper finds no speech."""
+
+    # ── Provider routing ──
+    if provider in ("groq", "auto") and os.environ.get("GROQ_API_KEY"):
+        # Mapping: model_size → Groq model name
+        groq_model = "whisper-large-v3-turbo"
+        if model_size == "large-v3":
+            groq_model = "whisper-large-v3"  # better quality, slightly slower
+        result = transcribe_with_groq(audio_path, lang=lang, model=groq_model)
+        if result:
+            return result
+        if provider == "groq":
+            # Strict groq mode — on failure return an empty transcript
+            print(" ⚠️ Groq failed, no fallback (provider=groq)", file=sys.stderr)
+            return {"language": "unknown", "language_probability": 0.0, "segments": []}
+        print(" 🔄 Groq failed, fallback na lokalno faster-whisper...", file=sys.stderr)
+    elif provider == "groq":
+        # Groq was requested explicitly but no key is configured; say so
+        # rather than switching to the local model silently.
+        print(" ⚠️ GROQ_API_KEY ni nastavljen, fallback na lokalno faster-whisper...", file=sys.stderr)
+
+    # ── Local faster-whisper ──
+    return _transcribe_full_local(audio_path, lang=lang, model_size=model_size)
+
+
+def _transcribe_full_local(audio_path, lang=None, model_size="small"):
+    """Local faster-whisper transcription (the original implementation)."""
     from faster_whisper import WhisperModel
 
-    print(f"🧠 Whisper {model_size}, lang={lang or 'auto'}", file=sys.stderr)
+    print(f"🧠 Whisper LOCAL {model_size}, lang={lang or 'auto'}", file=sys.stderr)
     m = WhisperModel(model_size, device="cpu", compute_type="int8")
 
     # Auto-detect z 3-sample voting da se zaklenemo na en jezik
@@ -788,6 +956,9 @@ def main():
                     help="Specifičen model (npr. claude-sonnet-4-6, gemini-3.1-pro-preview)")
     ap.add_argument("--filename-hint", default=None,
                     help="Originalno ime datoteke (Claude lahko prepozna pesem)")
+    ap.add_argument("--whisper-provider", default="auto",
+                    choices=["local", "groq", "auto"],
+                    help="Whisper provider: local=faster-whisper na CPU, groq=Groq API (200x hitreje), auto=Groq če ima API key")
     ap.add_argument("--json", action="store_true", help="Output JSON")
     ap.add_argument("--output", help="Path za JSON output")
     args = ap.parse_args()
@@ -806,7 +977,10 @@ def main():
     try:
         # 2. Whisper transcript
         lang = None if args.lang in (None, "auto", "") else args.lang
-        transcript = transcribe_full(audio, lang=lang, model_size=args.model)
+        transcript = transcribe_full(
+            audio, lang=lang, model_size=args.model,
+            provider=args.whisper_provider,
+        )
         print(f" Transkripcija: {len(transcript['segments'])} segmentov", file=sys.stderr)
 
         # 3. Energy profile