diff --git a/app/main.py b/app/main.py index f5dc2d7..b47da6b 100644 --- a/app/main.py +++ b/app/main.py @@ -247,9 +247,6 @@ def process_job(job_id): # Brez extension fn_hint = Path(job["filename"]).stem cmd += ["--filename-hint", fn_hint] - # Whisper provider (groq = 200x hitreje od lokalnega) - if job.get("whisper_provider"): - cmd += ["--whisper-provider", job["whisper_provider"]] # lang: če None ali 'auto', pusti analyze.py auto-detect if job.get("lang") and job["lang"] not in ("auto", ""): cmd += ["--lang", job["lang"]] @@ -503,8 +500,6 @@ class StartJobIn(BaseModel): # LLM za semantično analizo + popravke llm_provider: str = "claude" # claude / gemini / auto llm_model: Optional[str] = None # specifičen model (privzeto najboljši za provider) - # Whisper provider (Groq je 200x hitrejši od lokalnega CPU faster-whisper) - whisper_provider: str = "auto" # auto / groq / local # ──────────────────────────────────────────────────────────────── @@ -610,7 +605,6 @@ async def start_processing( quality=payload.quality, llm_provider=payload.llm_provider, llm_model=payload.llm_model, - whisper_provider=payload.whisper_provider, current_step="V vrsti za obdelavo", # Počisti pretekle napake (retry-friendly) chorus_error=None, diff --git a/scripts/analyze.py b/scripts/analyze.py index 2e77bd1..ffd4e4b 100644 --- a/scripts/analyze.py +++ b/scripts/analyze.py @@ -46,152 +46,13 @@ def extract_audio(video_path): return audio.name -def transcribe_with_groq(audio_path, lang=None, model="whisper-large-v3-turbo"): - """Whisper transkripcija prek Groq API-ja. +def transcribe_full(audio_path, lang=None, model_size="small"): + """Whisper transcript celega avdia. lang=None → robust auto-detect. - 216x realtime speed factor — 30s audio = ~0.5s transcribe time. - Stroški: $0.04/h (turbo) ali $0.111/h (large-v3). - """ - import urllib.request - import urllib.error - import uuid - - api_key = os.environ.get("GROQ_API_KEY") - if not api_key: - print(" ⚠️ GROQ_API_KEY ni nastavljen", file=sys.stderr) - return None - - # Pripravi multipart/form-data - boundary = uuid.uuid4().hex - parts = [] - - def add_text(name, value): - parts.append( - f"--{boundary}\r\nContent-Disposition: form-data; " - f"name=\"{name}\"\r\n\r\n{value}\r\n".encode() - ) - - def add_file(name, filename, content, content_type="application/octet-stream"): - parts.append( - f"--{boundary}\r\nContent-Disposition: form-data; " - f"name=\"{name}\"; filename=\"{filename}\"\r\n" - f"Content-Type: {content_type}\r\n\r\n".encode() - + content + b"\r\n" - ) - - with open(audio_path, "rb") as f: - file_content = f.read() - - # Groq ima 25 MB limit za API requests (verjetno dovolj za večino pesmi) - if len(file_content) > 24 * 1024 * 1024: - print(f" ⚠️ Audio file {len(file_content)/1024/1024:.1f} MB > 24 MB limit, fallback na lokalno", file=sys.stderr) - return None - - add_file("file", "audio.wav", file_content, "audio/wav") - add_text("model", model) - add_text("response_format", "verbose_json") - add_text("temperature", "0.0") - add_text("timestamp_granularities[]", "segment") - add_text("timestamp_granularities[]", "word") - if lang: - add_text("language", lang) - - parts.append(f"--{boundary}--\r\n".encode()) - body = b"".join(parts) - - req = urllib.request.Request( - "https://api.groq.com/openai/v1/audio/transcriptions", - data=body, - headers={ - "Authorization": f"Bearer {api_key}", - "Content-Type": f"multipart/form-data; boundary={boundary}", - "User-Agent": "groq-python/0.11.0", # Cloudflare bypass - }, - ) - - print(f" 📡 Groq Whisper ({model}, {len(file_content)/1024/1024:.1f} MB)...", file=sys.stderr) - try: - with urllib.request.urlopen(req, timeout=180) as resp: - data = json.loads(resp.read().decode()) - except urllib.error.HTTPError as e: - body_err = e.read().decode("utf-8", errors="replace")[:500] - print(f" ❌ Groq HTTP {e.code}: {body_err}", file=sys.stderr) - return None - except Exception as e: - print(f" ❌ Groq exception: {e}", file=sys.stderr) - return None - - # Pretvori Groq response v isti format kot lokalni Whisper - detected_lang = data.get("language", "unknown") - # Groq vrača jezik z velikim začetkom (npr. "German", "Slovenian"), pretvorimo v ISO - LANG_MAP = { - "english": "en", "german": "de", "slovenian": "sl", "croatian": "hr", - "bosnian": "bs", "serbian": "sr", "italian": "it", "spanish": "es", - "french": "fr", "portuguese": "pt", "russian": "ru", "polish": "pl", - "czech": "cs", "slovak": "sk", "hungarian": "hu", "romanian": "ro", - } - detected_lang_iso = LANG_MAP.get(detected_lang.lower(), detected_lang.lower()[:2]) - - segments = [] - for s in data.get("segments", []): - # Word-level timestamps (če so na voljo) - words_in_segment = [] - for w in data.get("words", []): - if s["start"] <= w["start"] <= s["end"]: - words_in_segment.append({ - "start": w["start"], - "end": w["end"], - "text": w["word"], - }) - segments.append({ - "start": s["start"], - "end": s["end"], - "text": s["text"].strip(), - "words": words_in_segment, - }) - - print(f" ✅ Groq: {len(segments)} segmentov, lang={detected_lang_iso}", file=sys.stderr) - return { - "language": detected_lang_iso, - "language_probability": 1.0, # Groq ne vrača confidence - "segments": segments, - } - - -def transcribe_full(audio_path, lang=None, model_size="small", provider="local"): - """Whisper transcript celega avdia. - - provider: - - "local" → faster-whisper na CPU (počasi ampak brezplačno) - - "groq" → Groq Whisper API (216x hitreje, $0.04/h) - - "auto" → poskusi Groq, fallback na local če manjka API key - - Vrne empty transcript če Whisper ne najde govora.""" - - # ── Provider routing ── - if provider in ("groq", "auto") and os.environ.get("GROQ_API_KEY"): - # Mapping: model_size → Groq model name - groq_model = "whisper-large-v3-turbo" - if model_size == "large-v3": - groq_model = "whisper-large-v3" # boljša kvaliteta, malo počasneje - result = transcribe_with_groq(audio_path, lang=lang, model=groq_model) - if result: - return result - if provider == "groq": - # Strict groq mode — če ne uspe, vrni prazen - print(f" ⚠️ Groq failed, no fallback (provider=groq)", file=sys.stderr) - return {"language": "unknown", "language_probability": 0.0, "segments": []} - print(f" 🔄 Groq failed, fallback na lokalno faster-whisper...", file=sys.stderr) - - # ── Lokalni faster-whisper ── - return _transcribe_full_local(audio_path, lang=lang, model_size=model_size) - - -def _transcribe_full_local(audio_path, lang=None, model_size="small"): - """Lokalna faster-whisper transkripcija (originalna implementacija).""" + Vrne empty transcript če Whisper ne najde govora (popolnoma instrumental).""" from faster_whisper import WhisperModel - print(f"🧠 Whisper LOCAL {model_size}, lang={lang or 'auto'}", file=sys.stderr) + print(f"🧠 Whisper {model_size}, lang={lang or 'auto'}", file=sys.stderr) m = WhisperModel(model_size, device="cpu", compute_type="int8") # Auto-detect z 3-sample voting da se zaklenemo na en jezik @@ -927,9 +788,6 @@ def main(): help="Specifičen model (npr. claude-sonnet-4-6, gemini-3.1-pro-preview)") ap.add_argument("--filename-hint", default=None, help="Originalno ime datoteke (Claude lahko prepozna pesem)") - ap.add_argument("--whisper-provider", default="auto", - choices=["local", "groq", "auto"], - help="Whisper provider: local=faster-whisper na CPU, groq=Groq API (200x hitreje), auto=Groq če ima API key") ap.add_argument("--json", action="store_true", help="Output JSON") ap.add_argument("--output", help="Path za JSON output") args = ap.parse_args() @@ -948,10 +806,7 @@ def main(): try: # 2. Whisper transcript lang = None if args.lang in (None, "auto", "") else args.lang - transcript = transcribe_full( - audio, lang=lang, model_size=args.model, - provider=args.whisper_provider, - ) + transcript = transcribe_full(audio, lang=lang, model_size=args.model) print(f" Transkripcija: {len(transcript['segments'])} segmentov", file=sys.stderr) # 3. Energy profile