Auto-detect language from filename for Scribe (no manual UI selection needed)
Problem: Scribe was failing on Slovenian narodno-zabavna (folk-pop) songs (Avseniki, Modrijani) because:
- The user doesn't manually pick a language (everything is automatic)
- Scribe's auto-detect had low confidence (0.58) on accordion-heavy ("harmonika") polka
- Result: only 37s transcribed instead of the full 186s song

Solution: a detect_language_from_filename() function that:
- Recognizes 34 Slovenian artist/genre entries (Avseniki, Modrijani, Veseli Dolenjci, ...)
- Recognizes 25 German artist/genre entries (Ben Zucker, Helene Fischer, ...)
- Recognizes 17 Croatian/Serbian artists (Thompson, Severina, Lepa Brena, ...)
- Falls back to keyword matching (volim, liebe, srce, herz, ...)
- Detects the character set (č/ž/š → SL, ä/ö/ü/ß → DE, đ → HR)
- Is score-based: 5 pts per artist match, 1 pt per keyword, 2 pts per character-set hit

When a language is detected, language_code is sent to Scribe explicitly:
- Avseniki → 'slv' lock → no more half-transcribed songs
- Ben Zucker → 'deu' lock → consistent German transcription
- The user still doesn't need to pick anything manually

filename_hint flows: main.py → analyze.py CLI → transcribe_full → Scribe
This commit is contained in:
parent
40acad26f3
commit
7d00730051
@ -46,10 +46,110 @@ def extract_audio(video_path):
|
||||
return audio.name
|
||||
|
||||
|
||||
def transcribe_with_elevenlabs(audio_path, lang=None, model="scribe_v1"):
|
||||
def detect_language_from_filename(filename_hint):
    """Guess the song language from a file name using known artists and words.

    The file name is scored per language: 5 points for every known artist or
    genre term found, 1 point for every language-specific keyword that occurs
    as a whole word, and 2 points for language-specific letters.

    Args:
        filename_hint: File name or title (str). Separators such as ``_``,
            ``-``, ``.`` and brackets are treated as word boundaries, and
            decomposed Unicode (NFD) is accepted.

    Returns:
        An ISO 639-1 code ('sl', 'de', 'hr' or 'en'), or None when the hint is
        empty, the evidence is too weak (best score below 2), or two languages
        tie for the best score. None means "let the transcriber auto-detect".
    """
    # Function-scope imports: the surrounding file may not import these.
    import re
    import unicodedata

    if not filename_hint:
        return None

    def flatten(text):
        # Collapse every run of non-alphanumeric characters into one space so
        # "ben_zucker-song.mp3" and "ben zucker song mp3" compare equal.
        return re.sub(r"[\W_]+", " ", text).strip()

    # Some file systems (notably macOS) hand out decomposed names where "č" is
    # "c" + a combining caron; compose first so letter/artist checks can match.
    name = unicodedata.normalize("NFC", filename_hint).lower()
    flat = flatten(name)
    tokens = set(flat.split())

    artists = {
        # Slovenian performers (folk-pop, pop, rock) and genre terms
        "sl": (
            "avseniki", "avsenik", "modrijani", "veseli dolenjci",
            "čuki", "atomik harmonik", "alfi nipič", "helena blagne",
            "siddharta", "magnifico", "vlado kreslin", "zaklonišče prepeva",
            "perpetuum jazzile", "tabu", "natalija verboten", "klavdija",
            "iztok mlakar", "rok'n'band", "okrog cele zemlje", "ansambel",
            "miran rudan", "andrej šifrer", "mi2", "elvis jackson",
            "tanja žagar", "manca špik", "saša lendero", "rebeka dremelj",
            "nuša derenda", "alenka godec", "prifarski muzikanti",
            "nova generacija", "polka", "narodno-zabavna",
        ),
        # German performers (Schlager, Volksmusik) and genre terms
        "de": (
            "ben zucker", "andrea berg", "helene fischer", "andreas gabalier",
            "amigos", "kastelruther spatzen", "florian silbereisen", "voxxclub",
            "wolfgang petry", "mickie krause", "die toten hosen", "rammstein",
            "udo lindenberg", "die ärzte", "westernhagen", "peter maffay",
            "matthias reim", "die zillertaler", "die jungen zillertaler",
            "stefan mross", "marianne", "michael wendler", "vincent gross",
            "schlager", "volksmusik",
        ),
        # Croatian/Serbian performers
        "hr": (
            "thompson", "miroslav škoro", "oliver dragojević", "gibonni",
            "severina", "tony cetinski", "psihomodo pop", "prljavo kazalište",
            "parni valjak", "lepa brena", "ceca", "aca lukas", "mile kitić",
            "halid bešlić", "dino merlin", "zdravko čolić", "magazin",
        ),
    }

    keywords = {
        "sl": ("pazi", "morju", "zveza", "domovina", "ljubim", "srce", "majhna",
               "prav", "nazaj", "noč", "dom", "pomoč", "bolha", "preko"),
        "de": ("liebe", "herz", "ohne", "dich", "leben", "nacht", "tag",
               "schön", "mädchen", "sonne", "himmel", "wenn", "nur",
               "bist", "hast", "dass", "weiß", "kann", "auch"),
        "hr": ("volim", "ljubav", "srce", "danas", "noćas", "more",
               "majka", "domovina", "zauvijek", "samo", "ćemo"),
        # English has too many artists to list, so only keywords are used.
        # Upload boilerplate ("official", "video", "remix", "feat", "lyrics",
        # "music", "remastered") is deliberately NOT counted: it is attached to
        # songs in every language and would wrongly lock the language to "en".
        "en": ("love", "song", "by", "with", "tonight", "forever",
               "heart", "dance", "party", "summer"),
    }

    score = {"sl": 0, "de": 0, "hr": 0, "en": 0}

    # Artist matches carry the most weight. Substring matching is kept on
    # purpose so inflected forms still hit (e.g. "avsenikov" contains "avsenik").
    for code, known in artists.items():
        score[code] += 5 * sum(1 for artist in known if flatten(artist) in flat)

    # Keyword matches: whole words only, each keyword counted once.
    for code, words in keywords.items():
        score[code] += len(tokens.intersection(words))

    # Letter evidence. "đ" and "ć" are not part of the Slovenian alphabet, so
    # their presence points to Croatian/Serbian even though č/š/ž are shared.
    if any(ch in name for ch in "đć"):
        score["hr"] += 2
    elif any(ch in name for ch in "čžš"):
        score["sl"] += 2
    if any(ch in name for ch in "äöüß"):
        score["de"] += 2

    best = max(score.values())
    if best < 2:  # threshold: a single keyword is not enough to lock a language
        return None

    leaders = [code for code, points in score.items() if points == best]
    # A tie is ambiguous; locking the wrong language is worse than auto-detect.
    return leaders[0] if len(leaders) == 1 else None
|
||||
|
||||
|
||||
def transcribe_with_elevenlabs(audio_path, lang=None, model="scribe_v1", filename_hint=None):
|
||||
"""ElevenLabs Scribe transkripcija (najboljša multilingual accuracy 2026).
|
||||
|
||||
Lang accepted in ISO 639-1 ('de', 'sl', 'hr') — auto-converted to ISO 639-3.
|
||||
lang: ISO 639-1 ('de', 'sl', 'hr') — če None, probamo iz filename_hint
|
||||
Pricing: ~$0.40/h (~$0.022 per 200s pesem).
|
||||
"""
|
||||
import urllib.request
|
||||
@ -61,6 +161,13 @@ def transcribe_with_elevenlabs(audio_path, lang=None, model="scribe_v1"):
|
||||
print(" ⚠️ ELEVENLABS_API_KEY ni nastavljen", file=sys.stderr)
|
||||
return None
|
||||
|
||||
# Auto-detect lang from filename če uporabnik ni eksplicitno izbral
|
||||
if not lang and filename_hint:
|
||||
guessed = detect_language_from_filename(filename_hint)
|
||||
if guessed:
|
||||
lang = guessed
|
||||
print(f" 🔍 Lang iz filename '{filename_hint}': {lang}", file=sys.stderr)
|
||||
|
||||
# ISO 639-1 → 639-3 mapping (Scribe uses 639-3)
|
||||
LANG_1_TO_3 = {
|
||||
"en": "eng", "de": "deu", "sl": "slv", "hr": "hrv", "bs": "bos",
|
||||
@ -70,7 +177,6 @@ def transcribe_with_elevenlabs(audio_path, lang=None, model="scribe_v1"):
|
||||
"fi": "fin", "tr": "tur", "ar": "ara", "uk": "ukr", "bg": "bul",
|
||||
"el": "ell", "he": "heb", "ja": "jpn", "ko": "kor", "zh": "zho",
|
||||
}
|
||||
# Reverse mapping for parsing response
|
||||
LANG_3_TO_1 = {v: k for k, v in LANG_1_TO_3.items()}
|
||||
|
||||
# Multipart upload
|
||||
@ -212,16 +318,18 @@ def transcribe_with_elevenlabs(audio_path, lang=None, model="scribe_v1"):
|
||||
}
|
||||
|
||||
|
||||
def transcribe_full(audio_path, lang=None, model_size="small", provider="auto"):
|
||||
def transcribe_full(audio_path, lang=None, model_size="small", provider="auto", filename_hint=None):
|
||||
"""Whisper/Scribe transcript dispatcher.
|
||||
|
||||
provider:
|
||||
- "elevenlabs" → ElevenLabs Scribe (najboljša kvaliteta, $0.40/h, ~10s na 200s pesem)
|
||||
- "local" → faster-whisper na CPU (brezplačno, počasi, halucinacije)
|
||||
- "auto" → Scribe če ELEVENLABS_API_KEY obstaja, sicer local
|
||||
|
||||
filename_hint: ime datoteke (uporablja za auto-detect jezika če lang=None)
|
||||
"""
|
||||
if provider in ("elevenlabs", "auto") and os.environ.get("ELEVENLABS_API_KEY"):
|
||||
result = transcribe_with_elevenlabs(audio_path, lang=lang)
|
||||
result = transcribe_with_elevenlabs(audio_path, lang=lang, filename_hint=filename_hint)
|
||||
if result and result.get("segments"):
|
||||
return result
|
||||
if provider == "elevenlabs":
|
||||
@ -1070,9 +1178,12 @@ def main():
|
||||
try:
|
||||
# 2. Whisper transcript
|
||||
lang = None if args.lang in (None, "auto", "") else args.lang
|
||||
# Filename hint pomaga Scribu detektirati jezik (Avseniki → SL, Ben Zucker → DE)
|
||||
fname_hint = args.filename_hint or video.stem
|
||||
transcript = transcribe_full(
|
||||
audio, lang=lang, model_size=args.model,
|
||||
provider=args.whisper_provider,
|
||||
filename_hint=fname_hint,
|
||||
)
|
||||
print(f" Transkripcija: {len(transcript['segments'])} segmentov", file=sys.stderr)
|
||||
|
||||
|
||||
Loading…
Reference in New Issue
Block a user