Auto-detect language from filename for Scribe (no manual UI selection needed)

Problem: Scribe was failing on Slovenian narodno-zabavna songs (Avseniki,
Modrijani) because:
- User doesn't manually pick language (everything is auto)
- Scribe auto-detect had low confidence (0.58) on harmonika-heavy polka
- Result: only 37s transcribed instead of full 186s song

Solution: detect_language_from_filename() function:
- Recognizes ~35 Slovenian artists and genre terms (Avseniki, Modrijani, Veseli Dolenjci, ...)
- Recognizes ~25 German artists and genre terms (Ben Zucker, Helene Fischer, ...)
- Recognizes ~17 Croatian/Serbian artists (Thompson, Severina, Lepa Brena, ...)
- Falls back to keyword matching (volim, liebe, srce, herz, ...)
- Detects character set (č/ž/š → SL, ä/ö/ü/ß → DE, đ → HR)
- Score-based: 5pts for artist match, 1-2pts for keywords/chars

When detected, sends language_code to Scribe explicitly:
- Avseniki → 'slv' lock → no more half-transcribed songs
- Ben Zucker → 'deu' lock → consistent German transcription
- User still doesn't need to manually pick anything

filename_hint flows: main.py → analyze.py CLI → transcribe_full → Scribe
This commit is contained in:
Sebastjan Artič 2026-04-29 12:57:19 +00:00
parent 40acad26f3
commit 7d00730051

View File

@ -46,10 +46,110 @@ def extract_audio(video_path):
return audio.name
def transcribe_with_elevenlabs(audio_path, lang=None, model="scribe_v1"):
def detect_language_from_filename(filename_hint):
    """Guess the song language from a file name using known artists and words.

    The name is lower-cased, Unicode-normalized (NFC) and has ``_``, ``.``
    and ``-`` treated as word separators, so "ben_zucker_-_na_und" scores
    exactly like "ben zucker - na und". Other punctuation (brackets, commas)
    stays attached to its word, so "(official" is not counted as the keyword
    "official".

    Scoring, per language:
      * +5 for every known artist / genre term found as a substring,
      * +1 for every keyword that appears as a whole word,
      * +2 for language-specific letters (č/ž/š without đ -> sl,
        ä/ö/ü/ß -> de, đ -> hr).

    Args:
        filename_hint: File name or stem as a string; may be None or empty.

    Returns:
        The ISO 639-1 code with the highest score ('sl', 'de', 'hr' or 'en')
        when that score is at least 2, otherwise None. Ties are resolved in
        the order sl, de, hr, en.
    """
    import unicodedata

    if not filename_hint:
        return None

    separators = str.maketrans("_.-", "   ")

    def _normalize(text):
        # NFC: file names may arrive decomposed ("c" + combining caron), which
        # would otherwise never compare equal to the precomposed "č" used in
        # the tables below.
        text = unicodedata.normalize("NFC", text.lower())
        # Treat common file-name separators as spaces and collapse runs.
        return " ".join(text.translate(separators).split())

    name = _normalize(filename_hint)

    artists_by_lang = {
        # Slovenian artists (folk-pop "narodno-zabavna", pop, rock)
        "sl": (
            "avseniki", "avsenik", "modrijani", "veseli dolenjci",
            "čuki", "atomik harmonik", "alfi nipič", "helena blagne",
            "siddharta", "magnifico", "vlado kreslin", "zaklonišče prepeva",
            "perpetuum jazzile", "tabu", "natalija verboten", "klavdija",
            "iztok mlakar", "rok'n'band", "okrog cele zemlje", "ansambel",
            "miran rudan", "andrej šifrer", "mi2", "elvis jackson",
            "tanja žagar", "manca špik", "saša lendero", "rebeka dremelj",
            "nuša derenda", "alenka godec", "prifarski muzikanti",
            "nova generacija", "polka", "narodno-zabavna",
        ),
        # German artists (Schlager, Volksmusik)
        "de": (
            "ben zucker", "andrea berg", "helene fischer", "andreas gabalier",
            "amigos", "kastelruther spatzen", "florian silbereisen", "voxxclub",
            "wolfgang petry", "mickie krause", "die toten hosen", "rammstein",
            "udo lindenberg", "die ärzte", "westernhagen", "peter maffay",
            "matthias reim", "die zillertaler", "die jungen zillertaler",
            "stefan mross", "marianne", "michael wendler", "vincent gross",
            "schlager", "volksmusik",
        ),
        # Croatian / Serbian artists
        "hr": (
            "thompson", "miroslav škoro", "oliver dragojević", "gibonni",
            "severina", "tony cetinski", "psihomodo pop", "prljavo kazalište",
            "parni valjak", "lepa brena", "ceca", "aca lukas", "mile kitić",
            "halid bešlić", "dino merlin", "zdravko čolić", "magazin",
        ),
    }
    keywords_by_lang = {
        "sl": ("pazi", "morju", "zveza", "domovina", "ljubim", "srce", "majhna",
               "prav", "nazaj", "noč", "dom", "pomoč", "bolha", "preko"),
        "de": ("liebe", "herz", "ohne", "dich", "leben", "nacht", "tag",
               "schön", "mädchen", "sonne", "himmel", "wenn", "nur",
               "bist", "hast", "dass", "weiß", "kann", "auch"),
        "hr": ("volim", "ljubav", "srce", "danas", "noćas", "more",
               "majka", "domovina", "zauvijek", "samo", "ćemo"),
        # No English artist list (there are too many); keywords only.
        "en": ("love", "song", "feat", "remix", "official", "music", "video",
               "remastered", "lyrics", "by", "with", "tonight", "forever",
               "heart", "dance", "party", "summer"),
    }

    # Insertion order matters: it is the tie-break order used by max() below.
    scores = {"sl": 0, "de": 0, "hr": 0, "en": 0}

    # Artist / genre matches carry the most weight. Entries go through the
    # same normalization as the name so "narodno-zabavna" still matches.
    for lang, artists in artists_by_lang.items():
        scores[lang] += 5 * sum(_normalize(artist) in name for artist in artists)

    # Keywords must match a whole word; the token set is built once.
    tokens = set(name.split())
    for lang, keywords in keywords_by_lang.items():
        scores[lang] += sum(kw in tokens for kw in keywords)

    # Language-specific letters. č/ž/š count for Slovenian only when the
    # Croatian-specific đ is absent.
    # NOTE(review): ć is also not a Slovenian letter; consider treating it
    # like đ here - confirm with the author before changing the heuristic.
    has_dj = "đ" in name
    if not has_dj and any(ch in name for ch in "čžš"):
        scores["sl"] += 2
    if any(ch in name for ch in "äöüß"):
        scores["de"] += 2
    if has_dj:
        scores["hr"] += 2

    # max() returns the first maximal entry, so ties keep dict order.
    best_lang, best_score = max(scores.items(), key=lambda item: item[1])
    # Below the threshold the evidence is too weak to lock a language.
    return best_lang if best_score >= 2 else None
def transcribe_with_elevenlabs(audio_path, lang=None, model="scribe_v1", filename_hint=None):
"""ElevenLabs Scribe transkripcija (najboljša multilingual accuracy 2026).
Lang accepted in ISO 639-1 ('de', 'sl', 'hr') auto-converted to ISO 639-3.
lang: ISO 639-1 ('de', 'sl', 'hr') če None, probamo iz filename_hint
Pricing: ~$0.40/h (~$0.022 per 200s pesem).
"""
import urllib.request
@ -61,6 +161,13 @@ def transcribe_with_elevenlabs(audio_path, lang=None, model="scribe_v1"):
print(" ⚠️ ELEVENLABS_API_KEY ni nastavljen", file=sys.stderr)
return None
# Auto-detect lang from filename če uporabnik ni eksplicitno izbral
if not lang and filename_hint:
guessed = detect_language_from_filename(filename_hint)
if guessed:
lang = guessed
print(f" 🔍 Lang iz filename '{filename_hint}': {lang}", file=sys.stderr)
# ISO 639-1 → 639-3 mapping (Scribe uses 639-3)
LANG_1_TO_3 = {
"en": "eng", "de": "deu", "sl": "slv", "hr": "hrv", "bs": "bos",
@ -70,7 +177,6 @@ def transcribe_with_elevenlabs(audio_path, lang=None, model="scribe_v1"):
"fi": "fin", "tr": "tur", "ar": "ara", "uk": "ukr", "bg": "bul",
"el": "ell", "he": "heb", "ja": "jpn", "ko": "kor", "zh": "zho",
}
# Reverse mapping for parsing response
LANG_3_TO_1 = {v: k for k, v in LANG_1_TO_3.items()}
# Multipart upload
@ -212,16 +318,18 @@ def transcribe_with_elevenlabs(audio_path, lang=None, model="scribe_v1"):
}
def transcribe_full(audio_path, lang=None, model_size="small", provider="auto"):
def transcribe_full(audio_path, lang=None, model_size="small", provider="auto", filename_hint=None):
"""Whisper/Scribe transcript dispatcher.
provider:
- "elevenlabs" ElevenLabs Scribe (najboljša kvaliteta, $0.40/h, ~10s na 200s pesem)
- "local" faster-whisper na CPU (brezplačno, počasi, halucinacije)
- "auto" Scribe če ELEVENLABS_API_KEY obstaja, sicer local
filename_hint: ime datoteke (uporablja za auto-detect jezika če lang=None)
"""
if provider in ("elevenlabs", "auto") and os.environ.get("ELEVENLABS_API_KEY"):
result = transcribe_with_elevenlabs(audio_path, lang=lang)
result = transcribe_with_elevenlabs(audio_path, lang=lang, filename_hint=filename_hint)
if result and result.get("segments"):
return result
if provider == "elevenlabs":
@ -1070,9 +1178,12 @@ def main():
try:
# 2. Whisper transcript
lang = None if args.lang in (None, "auto", "") else args.lang
# Filename hint pomaga Scribu detektirati jezik (Avseniki → SL, Ben Zucker → DE)
fname_hint = args.filename_hint or video.stem
transcript = transcribe_full(
audio, lang=lang, model_size=args.model,
provider=args.whisper_provider,
filename_hint=fname_hint,
)
print(f" Transkripcija: {len(transcript['segments'])} segmentov", file=sys.stderr)