Integrate ElevenLabs Scribe (best multilingual STT 2026)
ElevenLabs Scribe replaces local Whisper as default transcription: - 96.7% accuracy English, 2.4% WER Indonesian (vs Whisper 7.7%) - 18x faster (200s song = 11s vs 3-5 min on CPU) - No hallucinations on songs (Whisper invented 'Pony und Kleid' for 'Bonnie und Clyde') - 99 languages supported, including SLO/HR/BS/SR - $0.40/h pricing, ~$0.022 per 200s song Implementation: - transcribe_with_elevenlabs() function uses Scribe v1 - ISO 639-1 ↔ 639-3 mapping (Scribe needs 'deu' not 'de') - Word-level timestamps converted to pseudo-segments (close on 0.6s pause or 6s duration) - 24MB upload limit guard with auto-fallback to local Default whisper_provider='auto': - If ELEVENLABS_API_KEY set → use Scribe - Otherwise → fallback to local faster-whisper - 'elevenlabs' strict mode: no fallback - 'local' strict mode: skip Scribe entirely Tested on Ben Zucker - Ohne dich: Scribe correctly transcribed 'Wir sind Bonnie und Clyde, zu allem bereit' where local Whisper hallucinated.
This commit is contained in:
parent
3ffa9740f0
commit
68247bb84c
@ -247,6 +247,9 @@ def process_job(job_id):
|
||||
# Brez extension
|
||||
fn_hint = Path(job["filename"]).stem
|
||||
cmd += ["--filename-hint", fn_hint]
|
||||
# STT provider (elevenlabs = Scribe, local = faster-whisper, auto = preferiraj Scribe)
|
||||
if job.get("whisper_provider"):
|
||||
cmd += ["--whisper-provider", job["whisper_provider"]]
|
||||
# lang: če None ali 'auto', pusti analyze.py auto-detect
|
||||
if job.get("lang") and job["lang"] not in ("auto", ""):
|
||||
cmd += ["--lang", job["lang"]]
|
||||
@ -500,6 +503,8 @@ class StartJobIn(BaseModel):
|
||||
# LLM za semantično analizo + popravke
|
||||
llm_provider: str = "claude" # claude / gemini / auto
|
||||
llm_model: Optional[str] = None # specifičen model (privzeto najboljši za provider)
|
||||
# STT provider (Scribe je 18x hitreje + boljša multilingual accuracy)
|
||||
whisper_provider: str = "auto" # auto / elevenlabs / local
|
||||
|
||||
|
||||
# ────────────────────────────────────────────────────────────────
|
||||
@ -605,6 +610,7 @@ async def start_processing(
|
||||
quality=payload.quality,
|
||||
llm_provider=payload.llm_provider,
|
||||
llm_model=payload.llm_model,
|
||||
whisper_provider=payload.whisper_provider,
|
||||
current_step="V vrsti za obdelavo",
|
||||
# Počisti pretekle napake (retry-friendly)
|
||||
chorus_error=None,
|
||||
|
||||
@ -46,13 +46,186 @@ def extract_audio(video_path):
|
||||
return audio.name
|
||||
|
||||
|
||||
def transcribe_full(audio_path, lang=None, model_size="small"):
|
||||
def transcribe_with_elevenlabs(audio_path, lang=None, model="scribe_v1"):
|
||||
"""ElevenLabs Scribe transkripcija (najboljša multilingual accuracy 2026).
|
||||
|
||||
Lang accepted in ISO 639-1 ('de', 'sl', 'hr') — auto-converted to ISO 639-3.
|
||||
Pricing: ~$0.40/h (~$0.022 per 200s pesem).
|
||||
"""
|
||||
import urllib.request
|
||||
import urllib.error
|
||||
import uuid
|
||||
|
||||
api_key = os.environ.get("ELEVENLABS_API_KEY")
|
||||
if not api_key:
|
||||
print(" ⚠️ ELEVENLABS_API_KEY ni nastavljen", file=sys.stderr)
|
||||
return None
|
||||
|
||||
# ISO 639-1 → 639-3 mapping (Scribe uses 639-3)
|
||||
LANG_1_TO_3 = {
|
||||
"en": "eng", "de": "deu", "sl": "slv", "hr": "hrv", "bs": "bos",
|
||||
"sr": "srp", "it": "ita", "es": "spa", "fr": "fra", "pt": "por",
|
||||
"ru": "rus", "pl": "pol", "cs": "ces", "sk": "slk", "hu": "hun",
|
||||
"ro": "ron", "nl": "nld", "sv": "swe", "no": "nor", "da": "dan",
|
||||
"fi": "fin", "tr": "tur", "ar": "ara", "uk": "ukr", "bg": "bul",
|
||||
"el": "ell", "he": "heb", "ja": "jpn", "ko": "kor", "zh": "zho",
|
||||
}
|
||||
# Reverse mapping for parsing response
|
||||
LANG_3_TO_1 = {v: k for k, v in LANG_1_TO_3.items()}
|
||||
|
||||
# Multipart upload
|
||||
boundary = uuid.uuid4().hex
|
||||
parts = []
|
||||
|
||||
def add_text(name, value):
|
||||
parts.append(
|
||||
f"--{boundary}\r\nContent-Disposition: form-data; "
|
||||
f"name=\"{name}\"\r\n\r\n{value}\r\n".encode()
|
||||
)
|
||||
|
||||
def add_file(name, filename, content, ctype):
|
||||
parts.append(
|
||||
f"--{boundary}\r\nContent-Disposition: form-data; "
|
||||
f"name=\"{name}\"; filename=\"{filename}\"\r\n"
|
||||
f"Content-Type: {ctype}\r\n\r\n".encode() + content + b"\r\n"
|
||||
)
|
||||
|
||||
with open(audio_path, "rb") as f:
|
||||
audio_content = f.read()
|
||||
|
||||
# Limit: ElevenLabs Scribe supports up to ~25 MB / 4.5h per request
|
||||
if len(audio_content) > 24 * 1024 * 1024:
|
||||
print(f" ⚠️ Audio {len(audio_content)/1024/1024:.1f} MB > 24 MB limit, fallback", file=sys.stderr)
|
||||
return None
|
||||
|
||||
add_text("model_id", model)
|
||||
add_text("timestamps_granularity", "word")
|
||||
add_text("tag_audio_events", "false")
|
||||
if lang:
|
||||
scribe_lang = LANG_1_TO_3.get(lang, lang)
|
||||
add_text("language_code", scribe_lang)
|
||||
add_file("file", "audio.mp3", audio_content, "audio/mpeg")
|
||||
parts.append(f"--{boundary}--\r\n".encode())
|
||||
body = b"".join(parts)
|
||||
|
||||
print(f" 📡 ElevenLabs Scribe ({model}, {len(audio_content)/1024/1024:.1f} MB, "
|
||||
f"lang={lang or 'auto'})...", file=sys.stderr)
|
||||
|
||||
req = urllib.request.Request(
|
||||
"https://api.elevenlabs.io/v1/speech-to-text",
|
||||
data=body,
|
||||
headers={
|
||||
"xi-api-key": api_key,
|
||||
"Content-Type": f"multipart/form-data; boundary={boundary}",
|
||||
},
|
||||
)
|
||||
|
||||
try:
|
||||
with urllib.request.urlopen(req, timeout=300) as resp:
|
||||
data = json.loads(resp.read().decode())
|
||||
except urllib.error.HTTPError as e:
|
||||
body_err = e.read().decode("utf-8", errors="replace")[:500]
|
||||
print(f" ❌ Scribe HTTP {e.code}: {body_err}", file=sys.stderr)
|
||||
return None
|
||||
except Exception as e:
|
||||
print(f" ❌ Scribe exception: {e}", file=sys.stderr)
|
||||
return None
|
||||
|
||||
# Convert response to our standard format
|
||||
detected_lang_3 = data.get("language_code", "unknown")
|
||||
detected_lang_1 = LANG_3_TO_1.get(detected_lang_3, detected_lang_3[:2])
|
||||
detected_prob = data.get("language_probability", 1.0)
|
||||
|
||||
# Scribe returns flat list of words (not segments)
|
||||
# We need to group words into pseudo-segments (~10s each, breaking on long pauses)
|
||||
words = data.get("words", [])
|
||||
segments = []
|
||||
|
||||
if words:
|
||||
# Filter out whitespace tokens
|
||||
real_words = [w for w in words if w.get("text", "").strip()]
|
||||
|
||||
if real_words:
|
||||
current_seg_words = []
|
||||
seg_start = real_words[0].get("start", 0)
|
||||
|
||||
for i, w in enumerate(real_words):
|
||||
current_seg_words.append(w)
|
||||
w_end = w.get("end", w.get("start", 0))
|
||||
|
||||
# Decide if we should close the segment
|
||||
close = False
|
||||
# Close on long pause (>= 0.6s)
|
||||
if i + 1 < len(real_words):
|
||||
next_start = real_words[i + 1].get("start", w_end)
|
||||
pause = next_start - w_end
|
||||
seg_duration = w_end - seg_start
|
||||
# Long pause OR segment is long enough (>= 4s)
|
||||
if pause >= 0.6 or seg_duration >= 6.0:
|
||||
close = True
|
||||
else:
|
||||
close = True # last word
|
||||
|
||||
if close:
|
||||
seg_text = " ".join(ww.get("text", "") for ww in current_seg_words).strip()
|
||||
if seg_text:
|
||||
segments.append({
|
||||
"start": seg_start,
|
||||
"end": w_end,
|
||||
"text": seg_text,
|
||||
"words": [
|
||||
{
|
||||
"start": ww.get("start", 0),
|
||||
"end": ww.get("end", 0),
|
||||
"text": ww.get("text", ""),
|
||||
}
|
||||
for ww in current_seg_words
|
||||
],
|
||||
})
|
||||
# Reset
|
||||
current_seg_words = []
|
||||
if i + 1 < len(real_words):
|
||||
seg_start = real_words[i + 1].get("start", 0)
|
||||
|
||||
print(f" ✅ Scribe: {len(words)} words → {len(segments)} segments, "
|
||||
f"lang={detected_lang_1} (p={detected_prob:.2f})", file=sys.stderr)
|
||||
|
||||
return {
|
||||
"language": detected_lang_1,
|
||||
"language_probability": float(detected_prob),
|
||||
"segments": segments,
|
||||
"_provider": "elevenlabs",
|
||||
}
|
||||
|
||||
|
||||
def transcribe_full(audio_path, lang=None, model_size="small", provider="auto"):
|
||||
"""Whisper/Scribe transcript dispatcher.
|
||||
|
||||
provider:
|
||||
- "elevenlabs" → ElevenLabs Scribe (najboljša kvaliteta, $0.40/h, ~10s na 200s pesem)
|
||||
- "local" → faster-whisper na CPU (brezplačno, počasi, halucinacije)
|
||||
- "auto" → Scribe če ELEVENLABS_API_KEY obstaja, sicer local
|
||||
"""
|
||||
if provider in ("elevenlabs", "auto") and os.environ.get("ELEVENLABS_API_KEY"):
|
||||
result = transcribe_with_elevenlabs(audio_path, lang=lang)
|
||||
if result and result.get("segments"):
|
||||
return result
|
||||
if provider == "elevenlabs":
|
||||
print(f" ⚠️ Scribe failed, no fallback (provider=elevenlabs)", file=sys.stderr)
|
||||
return {"language": "unknown", "language_probability": 0.0, "segments": []}
|
||||
print(f" 🔄 Scribe failed, fallback na local Whisper...", file=sys.stderr)
|
||||
|
||||
# Local faster-whisper
|
||||
return _transcribe_full_local(audio_path, lang=lang, model_size=model_size)
|
||||
|
||||
|
||||
def _transcribe_full_local(audio_path, lang=None, model_size="small"):
|
||||
"""Whisper transcript celega avdia. lang=None → robust auto-detect.
|
||||
|
||||
Vrne empty transcript če Whisper ne najde govora (popolnoma instrumental)."""
|
||||
from faster_whisper import WhisperModel
|
||||
|
||||
print(f"🧠 Whisper {model_size}, lang={lang or 'auto'}", file=sys.stderr)
|
||||
print(f"🧠 Whisper LOCAL {model_size}, lang={lang or 'auto'}", file=sys.stderr)
|
||||
m = WhisperModel(model_size, device="cpu", compute_type="int8")
|
||||
|
||||
# Auto-detect z 3-sample voting da se zaklenemo na en jezik
|
||||
@ -788,6 +961,10 @@ def main():
|
||||
help="Specifičen model (npr. claude-sonnet-4-6, gemini-3.1-pro-preview)")
|
||||
ap.add_argument("--filename-hint", default=None,
|
||||
help="Originalno ime datoteke (Claude lahko prepozna pesem)")
|
||||
ap.add_argument("--whisper-provider", default="auto",
|
||||
choices=["auto", "elevenlabs", "local"],
|
||||
help="STT provider: elevenlabs=ElevenLabs Scribe (najboljša kvaliteta, $0.40/h), "
|
||||
"local=faster-whisper CPU (brezplačno, halucinacije), auto=Scribe če key, sicer local")
|
||||
ap.add_argument("--json", action="store_true", help="Output JSON")
|
||||
ap.add_argument("--output", help="Path za JSON output")
|
||||
args = ap.parse_args()
|
||||
@ -806,7 +983,10 @@ def main():
|
||||
try:
|
||||
# 2. Whisper transcript
|
||||
lang = None if args.lang in (None, "auto", "") else args.lang
|
||||
transcript = transcribe_full(audio, lang=lang, model_size=args.model)
|
||||
transcript = transcribe_full(
|
||||
audio, lang=lang, model_size=args.model,
|
||||
provider=args.whisper_provider,
|
||||
)
|
||||
print(f" Transkripcija: {len(transcript['segments'])} segmentov", file=sys.stderr)
|
||||
|
||||
# 3. Energy profile
|
||||
|
||||
Loading…
Reference in New Issue
Block a user