User feedback: 'Odmakni da se zacne refren na besedo ki je v naslovu' Problem: Many Slovenian folk-pop songs have the title in the VERSE, not in the chorus: - 'Cvetele so maline' → title is in verse, real chorus is 'Naj veter zdaj...' - 'Domotožje v pomladi' → title is theme, real chorus is 'Bele breze...' Old prompt forced LLM to find title phrase in chorus, leading it to pick verse parts (mid-line, wrong timing) just because they contained the title. Changes: 1. REMOVED forced rule: 'Naslov pesmi = REFREN HOOK (80-90% primerov)' 2. NEW guidance: 'Naslov pesmi je VČASIH v refrenu, VČASIH v verzu. NE silujte!' 3. NEW principle: 'Refren je tisti del ki se PONAVLJA 2-3x z ENAKIM besedilom' 4. Fixed CVETELE example: chorus is 'Naj veter zdaj ponese...' (not Cvetele) with explicit warning that title is in VERSE 2 at ~125s 5. Added: 'NE izberi outro/3. nastop — izberi PRVI nastop refrena' This should let LLM find the actual repeating chorus instead of chasing the title phrase into verses.
2304 lines
102 KiB
Python
2304 lines
102 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
analyze.py — Predhodna analiza CELEGA videa pred trim-anjem.
|
|
|
|
Naredi:
|
|
1. Whisper transcript celega videa (auto-detect jezika ali user-specified)
|
|
2. Energy profile (RMS dB na 1s windows)
|
|
3. Structural detection (vocal/instrumental sections, energy peaks)
|
|
4. Pametno izbere clip range (lahko >30s, vključi pre-chorus)
|
|
5. Detekcija instrumentalnih pesmi (no_subs auto)
|
|
|
|
Output: JSON s podatki za clip.py
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import os
|
|
import re
|
|
import subprocess
|
|
import sys
|
|
import tempfile
|
|
import time
|
|
from pathlib import Path
|
|
|
|
|
|
def get_video_duration(path):
    """Return the duration of the media file *path* in seconds.

    The value is read from ``ffprobe``'s ``format=duration`` entry.

    Args:
        path: Location of the media file (anything ``str()`` can render).

    Returns:
        The duration as a float, or ``0.0`` when it cannot be determined:
        ffprobe could not be started, or it printed something that is not a
        number (for example because the file is missing or unreadable).
    """
    cmd = [
        "ffprobe", "-v", "error", "-show_entries", "format=duration",
        "-of", "default=nw=1:nokey=1", str(path),
    ]
    try:
        probe = subprocess.run(cmd, capture_output=True, text=True)
    except OSError as exc:
        # ffprobe is not installed / not executable. Every other probe failure
        # already maps to 0.0, so keep the same "unknown duration" contract
        # instead of crashing the caller.
        print(f" ⚠️ ffprobe unavailable: {exc}", file=sys.stderr)
        return 0.0

    try:
        return float(probe.stdout.strip())
    except ValueError:
        # Empty output or a non-numeric value such as "N/A".
        return 0.0
|
|
|
|
|
|
def extract_audio(video_path):
    """Extract the audio track of *video_path* into a 16 kHz mono PCM WAV file.

    The WAV is written to a freshly created temporary file that this function
    does not delete on success; its path is handed back to the caller.

    Args:
        video_path: Location of the input media file.

    Returns:
        Path (str) of the temporary ``.wav`` file.

    Raises:
        subprocess.CalledProcessError: ffmpeg exited with a non-zero status.
        OSError: ffmpeg could not be started (e.g. it is not installed).
    """
    tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    tmp.close()  # ffmpeg reopens the path itself; we only needed a unique name
    cmd = [
        "ffmpeg", "-y", "-i", str(video_path), "-vn",
        "-ac", "1", "-ar", "16000", "-c:a", "pcm_s16le", tmp.name,
    ]
    try:
        subprocess.run(cmd, check=True, capture_output=True)
    except BaseException:
        # The file was created with delete=False, so a failed conversion would
        # otherwise leave an orphaned temp file behind on every error.
        try:
            os.unlink(tmp.name)
        except OSError:
            pass
        raise
    return tmp.name
|
|
|
|
|
|
def detect_language_from_filename(filename_hint):
    """Guess a song's language from its file name.

    The name is lower-cased and scored per language: +5 for every known
    artist/genre substring, +1 for every whole-word keyword and +2 for
    language-specific letters. The best-scoring language wins when it reaches
    at least 2 points; ties go to the language listed first in the score table
    (sl, de, hr, en, ...).

    Args:
        filename_hint: File name or title to inspect; may be ``None`` or empty.

    Returns:
        An ISO 639-1 code (``'sl'``, ``'de'``, ``'hr'``, ``'en'``) or ``None``
        when there is not enough evidence.
    """
    import unicodedata

    if not filename_hint:
        return None

    # Compose the name first: a decomposed (NFD) file name spells "č" as
    # "c" + combining caron, which would never match the precomposed letters
    # used in the tables below.
    name = unicodedata.normalize("NFC", filename_hint).lower()

    # Slovenian artists (folk-pop, pop, rock)
    SLO_ARTISTS = [
        "avseniki", "avsenik", "modrijani", "veseli dolenjci",
        "čuki", "atomik harmonik", "alfi nipič", "helena blagne",
        "siddharta", "magnifico", "vlado kreslin", "zaklonišče prepeva",
        "perpetuum jazzile", "tabu", "natalija verboten", "klavdija",
        "iztok mlakar", "rok'n'band", "okrog cele zemlje", "ansambel",
        "miran rudan", "andrej šifrer", "mi2", "elvis jackson",
        "tanja žagar", "manca špik", "saša lendero", "rebeka dremelj",
        "nuša derenda", "alenka godec", "prifarski muzikanti",
        "nova generacija", "polka", "narodno-zabavna",
    ]
    SLO_KEYWORDS = ["pazi", "morju", "zveza", "domovina", "ljubim", "srce", "majhna",
                    "prav", "nazaj", "noč", "dom", "pomoč", "bolha", "preko"]

    # German artists (Schlager, Volksmusik)
    DE_ARTISTS = [
        "ben zucker", "andrea berg", "helene fischer", "andreas gabalier",
        "amigos", "kastelruther spatzen", "florian silbereisen", "voxxclub",
        "wolfgang petry", "mickie krause", "die toten hosen", "rammstein",
        "udo lindenberg", "die ärzte", "westernhagen", "peter maffay",
        "matthias reim", "die zillertaler", "die jungen zillertaler",
        "stefan mross", "marianne", "michael wendler", "vincent gross",
        "schlager", "volksmusik",
    ]
    DE_KEYWORDS = ["liebe", "herz", "ohne", "dich", "leben", "nacht", "tag",
                   "schön", "mädchen", "sonne", "himmel", "wenn", "nur",
                   "bist", "hast", "dass", "weiß", "kann", "auch"]

    # Croatian/Serbian artists
    HR_ARTISTS = [
        "thompson", "miroslav škoro", "oliver dragojević", "gibonni",
        "severina", "tony cetinski", "psihomodo pop", "prljavo kazalište",
        "parni valjak", "lepa brena", "ceca", "aca lukas", "mile kitić",
        "halid bešlić", "dino merlin", "zdravko čolić", "magazin",
    ]
    HR_KEYWORDS = ["volim", "ljubav", "srce", "danas", "noćas", "more",
                   "majka", "domovina", "zauvijek", "samo", "ćemo"]

    # English: too many artists to list, so only generic title words are used.
    EN_KEYWORDS = ["love", "song", "feat", "remix", "official", "music", "video",
                   "remastered", "lyrics", "by", "with", "tonight", "forever",
                   "heart", "dance", "party", "summer"]

    score = {"sl": 0, "de": 0, "hr": 0, "en": 0, "it": 0, "es": 0, "fr": 0}

    # Artist matches are plain substring hits and weigh the most.
    for code, artists in (("sl", SLO_ARTISTS), ("de", DE_ARTISTS), ("hr", HR_ARTISTS)):
        score[code] += 5 * sum(artist in name for artist in artists)

    # Keyword matches must be whole, whitespace-delimited words.
    tokens = set(name.split())
    padded = f" {name} "
    keyword_tables = (
        ("sl", SLO_KEYWORDS), ("de", DE_KEYWORDS),
        ("hr", HR_KEYWORDS), ("en", EN_KEYWORDS),
    )
    for code, keywords in keyword_tables:
        score[code] += sum(kw in tokens or f" {kw} " in padded for kw in keywords)

    # "đ" and "ć" are not Slovenian letters, so either one marks the name as
    # Croatian/Serbian and cancels the Slovenian bonus that the shared
    # "č/ž/š" letters would otherwise grant.
    has_hr_only_letter = any(ch in name for ch in "đć")
    if any(ch in name for ch in "čžš") and not has_hr_only_letter:
        score["sl"] += 2
    # German-specific letters
    if any(ch in name for ch in "äöüß"):
        score["de"] += 2
    if has_hr_only_letter:
        score["hr"] += 2

    # max() keeps the first language among equal scores (dict insertion order).
    best_lang, best_score = max(score.items(), key=lambda item: item[1])
    return best_lang if best_score >= 2 else None
|
|
|
|
|
|
def _scribe_words_to_segments(words):
    """Group Scribe's flat word list into phrase-sized subtitle segments.

    Entries whose ``type`` is not ``"word"``, blank tokens and parenthesised
    audio-event tags such as "(glasba)" are dropped first. A segment is closed
    after a word when any of the following holds:

    - the pause before the next word is >= 0.4 s (phrase boundary),
    - the segment is >= 4.0 s long and the pause is >= 0.15 s,
    - the word ends with ``.``, ``!`` or ``?`` and the pause is >= 0.2 s,
    - the segment is >= 5.5 s long (hard cap),
    - it is the last word.

    Returns a list of ``{"start", "end", "text", "words"}`` dicts.
    """
    lyric_words = []
    for w in words:
        if w.get("type", "word") != "word":
            continue
        token = (w.get("text") or "").strip()
        if not token or (token.startswith("(") and token.endswith(")")):
            continue
        lyric_words.append(w)

    segments = []
    if not lyric_words:
        return segments

    pending = []
    seg_start = lyric_words[0].get("start", 0)
    last_index = len(lyric_words) - 1

    for i, w in enumerate(lyric_words):
        pending.append(w)
        w_end = w.get("end", w.get("start", 0))

        if i == last_index:
            close = True
        else:
            pause = lyric_words[i + 1].get("start", w_end) - w_end
            seg_duration = w_end - seg_start
            ends_sentence = w.get("text", "").rstrip().endswith((".", "!", "?"))
            close = (
                pause >= 0.4
                or (seg_duration >= 4.0 and pause >= 0.15)
                or (ends_sentence and pause >= 0.2)
                or seg_duration >= 5.5
            )

        if not close:
            continue

        seg_text = " ".join(x.get("text", "") for x in pending).strip()
        if seg_text:
            segments.append({
                "start": seg_start,
                "end": w_end,
                "text": seg_text,
                "words": [
                    {
                        "start": x.get("start", 0),
                        "end": x.get("end", 0),
                        "text": x.get("text", ""),
                    }
                    for x in pending
                ],
            })
        pending = []
        if i != last_index:
            seg_start = lyric_words[i + 1].get("start", 0)

    return segments


def _scribe_hallucination_stats(segments):
    """Split segments into suspected hallucinations and real coverage.

    A segment longer than 15 s that holds fewer than 5 words is treated as a
    hallucination. Returns ``(suspects, covered_seconds, total_seconds,
    coverage_pct)`` where ``total_seconds`` is the end of the last segment.
    """
    total = max((s["end"] for s in segments), default=0)
    suspects = []
    covered = 0
    for s in segments:
        duration = s["end"] - s["start"]
        if duration > 15 and len(s.get("words", [])) < 5:
            suspects.append(s)
        else:
            covered += duration
    pct = covered / total * 100 if total else 0
    return suspects, covered, total, pct


def transcribe_with_elevenlabs(audio_path, lang=None, model="scribe_v1", filename_hint=None):
    """Transcribe *audio_path* with the ElevenLabs Scribe speech-to-text API.

    The file is sent as a multipart upload with word-level timestamps and
    audio-event tagging enabled; the returned flat word list is regrouped into
    subtitle-sized segments.

    Args:
        audio_path: Path of the audio file to upload.
        lang: ISO 639-1 code ('de', 'sl', 'hr', ...). When ``None`` the
            language is guessed from *filename_hint*; if that fails too, no
            language is sent and the service auto-detects it.
        model: Scribe model id.
        filename_hint: Original file name, used only for the language guess.

    Returns:
        A dict with ``language``, ``language_probability``, ``segments``,
        ``_provider``, ``_hallucination_count`` and ``_coverage_pct``; or
        ``None`` when the API key is missing, the file cannot be read or is
        larger than 24 MB, or the request fails. Like the other providers,
        this function reports failures by returning ``None`` rather than
        raising, so a fallback chain can continue.

    Note:
        The original author quotes a price of roughly $0.40 per audio hour.
    """
    import urllib.request
    import urllib.error
    import uuid

    api_key = os.environ.get("ELEVENLABS_API_KEY")
    if not api_key:
        print(" ⚠️ ELEVENLABS_API_KEY ni nastavljen", file=sys.stderr)
        return None

    # Guess the language from the file name unless the caller chose one.
    if not lang and filename_hint:
        guessed = detect_language_from_filename(filename_hint)
        if guessed:
            lang = guessed
            print(f" 🔍 Lang iz filename '{filename_hint}': {lang}", file=sys.stderr)

    # ISO 639-1 → 639-3 mapping (Scribe uses 639-3)
    LANG_1_TO_3 = {
        "en": "eng", "de": "deu", "sl": "slv", "hr": "hrv", "bs": "bos",
        "sr": "srp", "it": "ita", "es": "spa", "fr": "fra", "pt": "por",
        "ru": "rus", "pl": "pol", "cs": "ces", "sk": "slk", "hu": "hun",
        "ro": "ron", "nl": "nld", "sv": "swe", "no": "nor", "da": "dan",
        "fi": "fin", "tr": "tur", "ar": "ara", "uk": "ukr", "bg": "bul",
        "el": "ell", "he": "heb", "ja": "jpn", "ko": "kor", "zh": "zho",
    }
    LANG_3_TO_1 = {code3: code1 for code1, code3 in LANG_1_TO_3.items()}

    # Check the size before loading the file so an oversized input is rejected
    # without reading it into memory; an unreadable file yields None instead
    # of an exception.
    try:
        audio_size = os.path.getsize(audio_path)
        if audio_size > 24 * 1024 * 1024:
            print(f" ⚠️ Audio {audio_size/1024/1024:.1f} MB > 24 MB limit, fallback", file=sys.stderr)
            return None
        with open(audio_path, "rb") as f:
            audio_content = f.read()
    except OSError as e:
        print(f" ❌ Scribe: cannot read audio {audio_path!r}: {e}", file=sys.stderr)
        return None

    # Multipart upload
    boundary = uuid.uuid4().hex
    parts = []

    def add_text(name, value):
        parts.append(
            f"--{boundary}\r\nContent-Disposition: form-data; "
            f"name=\"{name}\"\r\n\r\n{value}\r\n".encode()
        )

    def add_file(name, filename, content, ctype):
        parts.append(
            f"--{boundary}\r\nContent-Disposition: form-data; "
            f"name=\"{name}\"; filename=\"{filename}\"\r\n"
            f"Content-Type: {ctype}\r\n\r\n".encode() + content + b"\r\n"
        )

    add_text("model_id", model)
    add_text("timestamps_granularity", "word")
    # tag_audio_events=true is critical according to the original author:
    # without it Scribe stops transcribing early once an instrumental part
    # takes over; with it, tags like "(glasba)" are inserted and transcription
    # continues. Those tags are filtered out again during segment grouping.
    add_text("tag_audio_events", "true")
    if lang:
        add_text("language_code", LANG_1_TO_3.get(lang, lang))
    add_file("file", "audio.mp3", audio_content, "audio/mpeg")
    parts.append(f"--{boundary}--\r\n".encode())
    body = b"".join(parts)

    print(f" 📡 ElevenLabs Scribe ({model}, {len(audio_content)/1024/1024:.1f} MB, "
          f"lang={lang or 'auto'})...", file=sys.stderr)

    req = urllib.request.Request(
        "https://api.elevenlabs.io/v1/speech-to-text",
        data=body,
        headers={
            "xi-api-key": api_key,
            "Content-Type": f"multipart/form-data; boundary={boundary}",
        },
    )

    try:
        with urllib.request.urlopen(req, timeout=300) as resp:
            data = json.loads(resp.read().decode())
    except urllib.error.HTTPError as e:
        body_err = e.read().decode("utf-8", errors="replace")[:500]
        print(f" ❌ Scribe HTTP {e.code}: {body_err}", file=sys.stderr)
        return None
    except Exception as e:
        print(f" ❌ Scribe exception: {e}", file=sys.stderr)
        return None

    # Convert the response to the module's standard format. `or` also covers
    # fields that are present but null.
    detected_lang_3 = data.get("language_code") or "unknown"
    detected_lang_1 = LANG_3_TO_1.get(detected_lang_3, detected_lang_3[:2])
    detected_prob = data.get("language_probability")
    if detected_prob is None:
        detected_prob = 1.0

    words = data.get("words") or []
    segments = _scribe_words_to_segments(words)

    # Hallucination detection: Scribe occasionally returns one very long
    # segment holding only a word or two during instrumentals.
    hallucination_segs, coverage, total_audio_duration, coverage_pct = (
        _scribe_hallucination_stats(segments)
    )

    if hallucination_segs:
        print(f" ⚠️ Halucinacija(e) zaznana(e): {len(hallucination_segs)} segment(ov) "
              f"daljših od 15s z manj kot 5 besedami:", file=sys.stderr)
        for h in hallucination_segs:
            print(f" [{h['start']:.1f}-{h['end']:.1f}s] = {h['end']-h['start']:.0f}s "
                  f"({len(h.get('words', []))} bes.) text={h.get('text', '')[:50]!r}", file=sys.stderr)
        print(f" 📊 Pravo pokritje: {coverage:.1f}s / {total_audio_duration:.1f}s "
              f"= {coverage_pct:.0f}%", file=sys.stderr)

    print(f" ✅ Scribe: {len(words)} words → {len(segments)} segments, "
          f"lang={detected_lang_1} (p={detected_prob:.2f})", file=sys.stderr)

    return {
        "language": detected_lang_1,
        "language_probability": float(detected_prob),
        "segments": segments,
        "_provider": "elevenlabs",
        "_hallucination_count": len(hallucination_segs),
        "_coverage_pct": coverage_pct,
    }
|
|
|
|
|
|
def _parse_gemini_transcript_json(raw_text):
    """Parse Gemini's text reply into a Python object.

    A surrounding Markdown code fence is stripped first. If the text still is
    not valid JSON, trailing commas before ``}`` / ``]`` are removed and
    parsing is retried once. Returns the parsed object, or ``None`` (after
    logging the error and the first 500 characters) when both attempts fail.
    """
    text = raw_text.strip()
    if text.startswith('```'):
        # Shape: ```json\n...\n```
        lines = text.split('\n')[1:]
        if lines and lines[-1].rstrip() == '```':
            lines = lines[:-1]
        text = '\n'.join(lines)

    try:
        return json.loads(text)
    except json.JSONDecodeError:
        cleaned = re.sub(r',(\s*[}\]])', r'\1', text)

    try:
        parsed = json.loads(cleaned)
    except json.JSONDecodeError as e2:
        print(f" ❌ Gemini JSON parse failed: {e2}", file=sys.stderr)
        print(f" First 500 chars: {text[:500]}", file=sys.stderr)
        return None
    print(f" ✓ Fixed trailing commas in Gemini JSON", file=sys.stderr)
    return parsed


def transcribe_with_gemini(audio_path, lang=None, filename_hint=None):
    """Transcribe *audio_path* with Gemini 3 Pro (used as a fallback provider).

    The audio is uploaded through the Files API (resumable protocol), then the
    model is prompted to return a JSON transcript with word-level timestamps.

    Args:
        audio_path: Path of the audio file to upload.
        lang: Optional language code passed to the model as a hint.
        filename_hint: Optional file name passed to the model as a hint.

    Returns:
        A dict with ``language``, ``language_probability`` (fixed 0.95),
        ``segments``, ``_provider``, ``_hallucination_count`` and
        ``_coverage_pct``; or ``None`` on any failure (missing API key,
        unreadable file, HTTP error, unparsable or empty reply).

    Author's notes (not verifiable from this code): intended for folk-pop
    songs where Scribe hallucinated; good lyrics for Slovenian, Croatian and
    other smaller languages and no hallucination in instrumental parts, but
    slow (~100 s per 2 min of audio), pricier, and timestamps can be off by
    1-2 s.
    """
    import urllib.request
    import urllib.error

    api_key = os.environ.get("GEMINI_API_KEY")
    if not api_key:
        print(f" ❌ Gemini fallback: GEMINI_API_KEY missing", file=sys.stderr)
        return None

    print(f"🧠 Gemini 3 Pro transcribing {audio_path}...", file=sys.stderr)

    try:
        # Reading the file inside the try keeps the "return None on failure"
        # contract for a missing or unreadable path as well.
        with open(audio_path, 'rb') as f:
            audio_bytes = f.read()
        audio_size_mb = len(audio_bytes) / 1024 / 1024
        print(f" 📦 Audio size: {audio_size_mb:.1f} MB", file=sys.stderr)

        # 1. Upload the audio via the Files API (resumable)
        # NOTE(review): the API key travels in the URL query string here and
        # in the generate call below — confirm request URLs are never logged.
        upload_url_base = "https://generativelanguage.googleapis.com/upload/v1beta/files"

        # Step 1: start the upload session
        headers_start = {
            'X-Goog-Upload-Protocol': 'resumable',
            'X-Goog-Upload-Command': 'start',
            'X-Goog-Upload-Header-Content-Length': str(len(audio_bytes)),
            'X-Goog-Upload-Header-Content-Type': 'audio/mp3',
            'Content-Type': 'application/json',
        }
        req_start = urllib.request.Request(
            f"{upload_url_base}?key={api_key}",
            data=json.dumps({"file": {"display_name": "reels_audio"}}).encode(),
            headers=headers_start, method='POST'
        )
        with urllib.request.urlopen(req_start, timeout=30) as resp:
            upload_url = resp.headers.get('X-Goog-Upload-URL')

        # Step 2: send the bytes and finalize
        headers_upload = {
            'Content-Length': str(len(audio_bytes)),
            'X-Goog-Upload-Offset': '0',
            'X-Goog-Upload-Command': 'upload, finalize',
        }
        req_upload = urllib.request.Request(
            upload_url, data=audio_bytes,
            headers=headers_upload, method='POST'
        )
        with urllib.request.urlopen(req_upload, timeout=120) as resp:
            file_info = json.loads(resp.read().decode())
        file_uri = file_info['file']['uri']

        print(f" ✓ Uploaded to Gemini Files API", file=sys.stderr)
        # Short delay so the uploaded file can finish processing
        time.sleep(2)

        # 2. Generate the transcript
        gen_url = (f"https://generativelanguage.googleapis.com/v1beta/"
                   f"models/gemini-3-pro-preview:generateContent?key={api_key}")

        lang_hint = ""
        if filename_hint:
            lang_hint = f"\nFilename hint: {filename_hint}"
        if lang:
            lang_hint += f"\nLanguage: {lang}"

        prompt = f"""Transcribe this song with precise word-level timestamps.{lang_hint}

Return ONLY valid JSON in this EXACT format (no markdown fences, no explanation):
{{
  "language": "sl",
  "segments": [
    {{
      "start": 0.5,
      "end": 4.2,
      "text": "Besedilo segmenta",
      "words": [
        {{"start": 0.5, "end": 0.9, "text": "Besedilo"}},
        {{"start": 1.0, "end": 1.4, "text": "segmenta"}}
      ]
    }}
  ]
}}

Rules:
- Only transcribe vocal singing, NOT instrumental sections
- Each segment is a complete musical phrase (typically 2-4 seconds)
- Include word-level timestamps for EVERY word
- Use proper orthography (š, č, ž for Slavic; ä, ö, ü for German etc.)
- Skip instrumental breaks (don't fill with silence segments)
- Be very accurate with timestamps - this is for video subtitle generation
- DO NOT hallucinate words during instrumental sections
- DO NOT include trailing commas in JSON

Output ONLY the JSON object."""

        payload = {
            "contents": [{
                "parts": [
                    {"text": prompt},
                    {"file_data": {"mime_type": "audio/mp3", "file_uri": file_uri}}
                ]
            }],
            "generationConfig": {
                "temperature": 0.0,
                "maxOutputTokens": 32000,
            }
        }

        req_gen = urllib.request.Request(
            gen_url,
            data=json.dumps(payload).encode(),
            headers={'Content-Type': 'application/json'},
            method='POST'
        )

        t0 = time.time()
        with urllib.request.urlopen(req_gen, timeout=300) as resp:
            result = json.loads(resp.read().decode())
        elapsed = time.time() - t0

        usage = result.get('usageMetadata', {})
        print(f" ✓ Gemini 3 Pro response v {elapsed:.0f}s "
              f"(in: {usage.get('promptTokenCount', 0)}, "
              f"out: {usage.get('candidatesTokenCount', 0)}, "
              f"thoughts: {usage.get('thoughtsTokenCount', 0)})", file=sys.stderr)

        # 3. Parse the JSON output (a missing key raises and is handled by the
        # generic handler below)
        candidate_text = result['candidates'][0]['content']['parts'][0]['text']
        parsed = _parse_gemini_transcript_json(candidate_text)
        if parsed is None:
            return None

        if not parsed or not parsed.get('segments'):
            print(f" ❌ Gemini returned no segments", file=sys.stderr)
            return None

        segments = parsed['segments']
        detected_lang = parsed.get('language', lang or 'unknown')

        # Coverage stats: a segment longer than 15 s with fewer than 5 words
        # counts as a hallucination and is excluded from the covered time.
        hallucination_count = 0
        coverage = 0
        total_dur = max((s.get('end', 0) for s in segments), default=0)
        for s in segments:
            seg_dur = s.get('end', 0) - s.get('start', 0)
            if seg_dur > 15 and len(s.get('words', [])) < 5:
                hallucination_count += 1
            else:
                coverage += seg_dur
        coverage_pct = (coverage / total_dur * 100) if total_dur else 0

        total_words = sum(len(s.get('words', [])) for s in segments)
        print(f" ✅ Gemini 3 Pro: {total_words} words → {len(segments)} segments, "
              f"lang={detected_lang}, coverage={coverage_pct:.0f}%", file=sys.stderr)

        return {
            "language": detected_lang,
            "language_probability": 0.95,
            "segments": segments,
            "_provider": "gemini-3-pro",
            "_hallucination_count": hallucination_count,
            "_coverage_pct": coverage_pct,
        }

    except urllib.error.HTTPError as e:
        # Lenient decoding: a strict decode failing here would replace the
        # HTTP error with a UnicodeDecodeError raised from the handler.
        err_body = e.read().decode("utf-8", errors="replace")[:500] if hasattr(e, 'read') else ''
        print(f" ❌ Gemini HTTP {e.code}: {err_body}", file=sys.stderr)
        return None
    except Exception as e:
        print(f" ❌ Gemini fallback exception: {e}", file=sys.stderr)
        import traceback
        traceback.print_exc(file=sys.stderr)
        return None
|
|
|
|
|
|
def _soniox_tokens_to_words(tokens, default_lang):
    """Merge Soniox sub-word tokens into whole words.

    A token whose text starts with a space opens a new word; the special
    tokens ``<end>`` / ``<fin>`` close the current word and are dropped; any
    other token is appended to the word in progress. Times are converted from
    milliseconds to seconds. Words that are blank after stripping are skipped.
    """
    words = []
    current = None
    for tok in tokens:
        text = tok.get('text', '')
        is_marker = text in ('<end>', '<fin>')

        if current is not None and not (is_marker or text.startswith(' ')):
            # Continuation of the word in progress.
            current['text'] += text
            current['end'] = tok.get('end_ms', 0) / 1000
            continue

        # Word boundary (or very first token): flush what we have so far.
        if current and current['text'].strip():
            words.append(current)
        if is_marker:
            current = None
            continue
        current = {
            'text': text,
            'start': tok.get('start_ms', 0) / 1000,
            'end': tok.get('end_ms', 0) / 1000,
            'language': tok.get('language', default_lang),
        }

    if current and current['text'].strip():
        words.append(current)
    return words


def _soniox_words_to_segments(words):
    """Group words into segments, splitting on pauses.

    A new segment starts when the gap to the previous word exceeds 0.6 s and
    the current segment already holds at least 3 words.
    """
    segments = []
    current = None
    for w in words:
        entry = {'start': w['start'], 'end': w['end'], 'text': w['text'].strip()}
        if current is not None:
            gap = w['start'] - current['end']
            if gap > 0.6 and len(current['words']) >= 3:
                segments.append(current)
                current = None
        if current is None:
            current = {'start': entry['start'], 'end': entry['end'],
                       'text': entry['text'], 'words': [entry]}
        else:
            current['end'] = entry['end']
            current['text'] = (current['text'] + ' ' + entry['text']).strip()
            current['words'].append(entry)
    if current is not None:
        segments.append(current)
    return segments


def transcribe_with_soniox(audio_path, lang=None, filename_hint=None):
    """Transcribe *audio_path* with Soniox ``stt-async-v4`` (primary provider).

    Steps: upload the file, create a transcription job, poll it every 2 s
    (giving up after 180 s), fetch the token list and regroup it into words
    and segments. The uploaded file and the job are deleted afterwards on a
    best-effort basis, whether or not the transcription succeeded.

    Args:
        audio_path: Path of the audio file to upload.
        lang: Optional language code used as the only language hint.
        filename_hint: Optional file name; a few Slovenian folk-music markers
            in it select a Slovenian-only hint, otherwise a multilingual hint
            list is sent.

    Returns:
        A dict with ``language``, ``language_probability`` (fixed 0.95),
        ``segments``, ``_provider``, ``_hallucination_count`` (always 0) and
        ``_coverage_pct``; or ``None`` on any failure (missing API key,
        unreadable file, HTTP error, job error, timeout, empty transcript).

    Author's notes (not verifiable from this code): best accuracy across 60+
    languages including Slovenian, word-level timestamps with punctuation and
    diacritics, about $0.10 per audio hour and 4-13 s per 180 s of audio.
    """
    import urllib.request
    import urllib.error

    api_key = os.environ.get("SONIOX_API_KEY")
    if not api_key:
        print(f" ❌ SONIOX_API_KEY missing", file=sys.stderr)
        return None

    BASE = "https://api.soniox.com"
    print(f"🎤 Soniox stt-async-v4 transcribing {audio_path}...", file=sys.stderr)

    # Remote resources to delete in the finally block below.
    file_id = None
    trans_id = None

    def api_call(method, path, data=None):
        """Send an authenticated JSON request; return the decoded body or {}."""
        headers = {'Authorization': f'Bearer {api_key}'}
        if isinstance(data, dict):
            data = json.dumps(data).encode()
            headers['Content-Type'] = 'application/json'
        req = urllib.request.Request(f"{BASE}{path}", data=data, headers=headers, method=method)
        with urllib.request.urlopen(req, timeout=120) as resp:
            content = resp.read().decode()
        return json.loads(content) if content else {}

    try:
        # 1. Upload file (multipart)
        boundary = "----WebKitFormBoundary7MA4YWxkTrZu0gW"
        with open(audio_path, 'rb') as f:
            audio_bytes = f.read()
        body = b''.join([
            f"--{boundary}\r\n".encode(),
            b'Content-Disposition: form-data; name="file"; filename="audio.mp3"\r\n',
            b'Content-Type: audio/mpeg\r\n\r\n',
            audio_bytes,
            f"\r\n--{boundary}--\r\n".encode()
        ])
        req = urllib.request.Request(
            f"{BASE}/v1/files",
            data=body,
            headers={
                'Authorization': f'Bearer {api_key}',
                'Content-Type': f'multipart/form-data; boundary={boundary}',
            },
            method='POST'
        )
        with urllib.request.urlopen(req, timeout=120) as resp:
            file_data = json.loads(resp.read().decode())
        file_id = file_data['id']
        size_mb = len(audio_bytes) / 1024 / 1024
        print(f" ✓ Uploaded {size_mb:.1f}MB → file_id={file_id}", file=sys.stderr)

        # 2. Create transcription
        config = {
            "model": "stt-async-v4",
            "file_id": file_id,
            "enable_language_identification": True,
        }
        # Language hints come from the explicit parameter or the file name.
        if lang:
            config["language_hints"] = [lang]
        else:
            fn_lower = (filename_hint or "").lower()
            slovenian_markers = ["ansambel", "avsenik", "fehtar", "modrijan", "polka", "valček", "slovensk"]
            if any(k in fn_lower for k in slovenian_markers):
                config["language_hints"] = ["sl"]
            else:
                # Multilingual default
                config["language_hints"] = ["en", "sl", "de", "hr", "es", "fr", "it"]

        trans_data = api_call("POST", "/v1/transcriptions", data=config)
        trans_id = trans_data['id']
        print(f" ✓ Transcription started: {trans_id}", file=sys.stderr)

        # 3. Poll status
        t0 = time.time()
        while True:
            status_data = api_call("GET", f"/v1/transcriptions/{trans_id}")
            status = status_data.get('status', 'unknown')
            elapsed = time.time() - t0
            if status == "completed":
                print(f" ✓ Completed in {elapsed:.0f}s", file=sys.stderr)
                break
            if status == "error":
                print(f" ❌ Soniox error: {status_data.get('error_message', '?')}", file=sys.stderr)
                return None
            if elapsed > 180:
                print(f" ⚠️ Timeout (180s)", file=sys.stderr)
                return None
            time.sleep(2)

        # 4. Get transcript
        transcript_data = api_call("GET", f"/v1/transcriptions/{trans_id}/transcript")

        # Convert the Soniox format to the module's standard segments + words.
        tokens = transcript_data.get('tokens', [])
        if not tokens:
            print(f" ❌ Empty transcript", file=sys.stderr)
            return None

        # Soniox returns sub-words ("Del" + " neb" + "a" = "Del neba").
        words = _soniox_tokens_to_words(tokens, lang or 'sl')
        segments = _soniox_words_to_segments(words)

        # Detected language: the most frequent per-token language, falling
        # back to the requested language (or 'sl') when tokens carry none.
        detected_lang = lang or 'sl'
        lang_counts = {}
        for tok in tokens:
            tl = tok.get('language')
            if tl:
                lang_counts[tl] = lang_counts.get(tl, 0) + 1
        if lang_counts:
            detected_lang = max(lang_counts, key=lang_counts.get)

        # Coverage stats, same keys as the other providers
        total_dur = max((s['end'] for s in segments), default=0)
        coverage = sum(s['end'] - s['start'] for s in segments)
        coverage_pct = (coverage / total_dur * 100) if total_dur else 0

        total_words = sum(len(s.get('words', [])) for s in segments)
        full_text = transcript_data.get('text', '')
        print(f" ✅ Soniox: {total_words} words → {len(segments)} segments, "
              f"lang={detected_lang}, coverage={coverage_pct:.0f}%", file=sys.stderr)
        print(f" 📝 First 200 chars: {full_text[:200]!r}", file=sys.stderr)

        return {
            "language": detected_lang,
            "language_probability": 0.95,
            "segments": segments,
            "_provider": "soniox",
            "_hallucination_count": 0,  # no hallucination check is run for Soniox
            "_coverage_pct": coverage_pct,
        }

    except urllib.error.HTTPError as e:
        # Lenient decoding: a strict decode failing here would replace the
        # HTTP error with a UnicodeDecodeError raised from the handler.
        err_body = e.read().decode("utf-8", errors="replace")[:500] if hasattr(e, 'read') else ''
        print(f" ❌ Soniox HTTP {e.code}: {err_body}", file=sys.stderr)
        return None
    except Exception as e:
        print(f" ❌ Soniox exception: {e}", file=sys.stderr)
        import traceback
        traceback.print_exc(file=sys.stderr)
        return None
    finally:
        # Best-effort cleanup: send DELETE and ignore the outcome (the
        # original author notes Soniox answers with an empty body).
        cleanup_paths = []
        if trans_id:
            cleanup_paths.append(f"/v1/transcriptions/{trans_id}")
        if file_id:
            cleanup_paths.append(f"/v1/files/{file_id}")
        for path in cleanup_paths:
            try:
                req = urllib.request.Request(f"{BASE}{path}",
                                             headers={'Authorization': f'Bearer {api_key}'}, method='DELETE')
                # Use the response as a context manager so it is closed.
                with urllib.request.urlopen(req, timeout=10):
                    pass
            except Exception:
                # Deliberately ignored: a failed cleanup must not mask the
                # transcription result or the original error.
                pass
|
|
|
|
|
|
def _stt_empty_result():
    """Return a fresh 'nothing transcribed' payload in the common STT result shape."""
    return {"language": "unknown", "language_probability": 0.0, "segments": []}


def _stt_quality(result):
    """Return ``(hallucination_count, coverage_pct)`` for an STT result dict.

    Missing keys default to 0 hallucinations / 100 % coverage, so a provider
    that reports no quality metadata is treated as clean.
    """
    return result.get("_hallucination_count", 0), result.get("_coverage_pct", 100)


def _stt_is_clean(result):
    """Quality gate used by every chain: no hallucinations and >= 50 % coverage."""
    hall, cov = _stt_quality(result)
    return hall == 0 and cov >= 50


def _stt_is_better(candidate, incumbent):
    """True when *candidate* has fewer hallucinations or higher coverage than *incumbent*."""
    c_hall, c_cov = _stt_quality(candidate)
    i_hall, i_cov = _stt_quality(incumbent)
    return c_hall < i_hall or c_cov > i_cov


def transcribe_full(audio_path, lang=None, model_size="small", provider="auto", filename_hint=None):
    """STT dispatcher — Soniox primary with a fallback chain.

    provider:
      - "soniox"     → Soniox stt-async-v4 (best, $0.10/h, 5-15s)
      - "elevenlabs" → ElevenLabs Scribe ($0.40/h, 8-15s), retried once when
                       the first pass looks hallucinated / low-coverage
      - "gemini"     → Gemini 3 Pro ($3-5/h, 100-200s, most accurate for music)
      - "local"      → faster-whisper on CPU
      - "auto" / "hybrid" → resolved from the API keys present in the
                       environment: Soniox chain if SONIOX_API_KEY is set,
                       else Scribe→Gemini hybrid if both of those keys are
                       set, else Scribe alone, else local Whisper.

    Any provider value not handled above (and "elevenlabs" without an
    ELEVENLABS_API_KEY) ends up in the local faster-whisper path.

    Returns a dict with at least "language", "language_probability" and
    "segments"; when a remote provider yields nothing usable the segments
    list is empty and language is "unknown".
    """
    has_soniox = bool(os.environ.get("SONIOX_API_KEY"))
    has_scribe = bool(os.environ.get("ELEVENLABS_API_KEY"))
    has_gemini = bool(os.environ.get("GEMINI_API_KEY"))

    # Resolve "auto" → Soniox chain if the key exists, otherwise fallback chain
    if provider in ("auto", "hybrid"):
        if has_soniox:
            provider = "soniox_chain"  # Soniox primary + fallbacks
        elif has_scribe and has_gemini:
            provider = "hybrid"  # legacy hybrid
        elif has_scribe:
            provider = "elevenlabs"
        else:
            provider = "local"

    # ─── SONIOX CHAIN: Soniox primary, Scribe/Gemini fallback ───
    if provider == "soniox_chain":
        print("🎯 Provider chain: Soniox → Scribe → Gemini", file=sys.stderr)
        result = transcribe_with_soniox(audio_path, lang=lang, filename_hint=filename_hint)

        # `best` tracks the most usable *suspicious* transcript seen so far,
        # so that a later, better fallback is not discarded in favour of an
        # earlier poor (or segment-less) one.
        best = None
        if result and result.get("segments"):
            hall, cov = _stt_quality(result)
            if cov >= 50 and hall == 0:
                return result
            print(f" ⚠️ Soniox sumljiv (coverage {cov:.0f}%, hall {hall}) — try fallback", file=sys.stderr)
            best = result
        else:
            print(" ❌ Soniox failed → fallback", file=sys.stderr)

        # Fallback 1: Scribe
        if has_scribe:
            print(" 🔄 Fallback to Scribe...", file=sys.stderr)
            result2 = transcribe_with_elevenlabs(audio_path, lang=lang, filename_hint=filename_hint)
            if result2 and result2.get("segments"):
                if _stt_is_clean(result2):
                    return result2
                # Both are suspicious: keep whichever one looks better.
                if best is None or _stt_is_better(result2, best):
                    best = result2

        # Fallback 2: Gemini (only when everything so far was poor)
        if has_gemini:
            print(" 🔄 Fallback to Gemini 3 Pro (last resort)...", file=sys.stderr)
            result3 = transcribe_with_gemini(audio_path, lang=lang, filename_hint=filename_hint)
            if result3 and result3.get("segments"):
                return result3

        # Return whatever we have (possibly a segment-less Soniox payload).
        return best or result or _stt_empty_result()

    # ─── SONIOX ONLY ───
    if provider == "soniox":
        if not has_soniox:
            print(" ❌ provider=soniox ampak SONIOX_API_KEY missing", file=sys.stderr)
            return _stt_empty_result()
        result = transcribe_with_soniox(audio_path, lang=lang, filename_hint=filename_hint)
        return result or _stt_empty_result()

    # ─── HYBRID (legacy): Scribe primary, Gemini fallback ───
    if provider == "hybrid":
        if not has_scribe:
            # No Scribe key: degrade to the Gemini-only branch further down.
            provider = "gemini"
        else:
            print("🎯 HYBRID mode: Scribe primary, Gemini fallback", file=sys.stderr)
            result = transcribe_with_elevenlabs(audio_path, lang=lang, filename_hint=filename_hint)

            if not (result and result.get("segments")):
                # Scribe failed completely → go straight to Gemini
                if has_gemini:
                    print(" 🔄 Scribe failed → Gemini 3 Pro", file=sys.stderr)
                    gemini_result = transcribe_with_gemini(audio_path, lang=lang, filename_hint=filename_hint)
                    if gemini_result and gemini_result.get("segments"):
                        return gemini_result
                # No fallback available → empty
                return _stt_empty_result()

            hall_count, cov_pct = _stt_quality(result)

            # Quality gate: a good Scribe result is returned as-is
            if hall_count == 0 and cov_pct >= 50:
                print(f" ✅ Scribe OK (coverage {cov_pct:.0f}%) — no fallback needed",
                      file=sys.stderr)
                return result

            # Hallucination or low coverage → retry Scribe once before Gemini
            print(f" ⚠️ Scribe quality issues (coverage {cov_pct:.0f}%, "
                  f"{hall_count} halu) — RETRY Scribe...", file=sys.stderr)
            result2 = transcribe_with_elevenlabs(audio_path, lang=lang, filename_hint=filename_hint)
            if result2 and result2.get("segments"):
                h2, c2 = _stt_quality(result2)
                if h2 == 0 and c2 >= 50:
                    print(f" ✅ Scribe retry uspel: coverage {cov_pct:.0f}% → {c2:.0f}%",
                          file=sys.stderr)
                    return result2
                # Still poor — but keep the second pass if it is the better one
                if h2 < hall_count or c2 > cov_pct:
                    result = result2
                    hall_count = h2
                    cov_pct = c2

            # Still hallucinating → Gemini fallback
            if has_gemini:
                print(f" 🔄 Scribe še vedno slab (coverage {cov_pct:.0f}%, "
                      f"{hall_count} halu) — switching na Gemini 3 Pro...", file=sys.stderr)
                gemini_result = transcribe_with_gemini(audio_path, lang=lang, filename_hint=filename_hint)
                if gemini_result and gemini_result.get("segments"):
                    g_hall, g_cov = _stt_quality(gemini_result)
                    # Take whichever is better
                    if g_hall < hall_count or g_cov > cov_pct:
                        print(f" ✅ Gemini boljši: coverage {cov_pct:.0f}% → {g_cov:.0f}%, "
                              f"hallu {hall_count} → {g_hall}", file=sys.stderr)
                        return gemini_result
                    print(" ⚠️ Gemini ni boljši, ohrani Scribe", file=sys.stderr)
                    return result
            else:
                print(" ⚠️ Gemini fallback ni dosegljiv — vrnem Scribe rezultat",
                      file=sys.stderr)

            return result

    # ─── GEMINI ONLY ───
    if provider == "gemini":
        if not has_gemini:
            print(" ❌ provider=gemini ampak GEMINI_API_KEY missing", file=sys.stderr)
            return _stt_empty_result()
        result = transcribe_with_gemini(audio_path, lang=lang, filename_hint=filename_hint)
        if result and result.get("segments"):
            return result
        return _stt_empty_result()

    # ─── ELEVENLABS / SCRIBE ONLY (with auto-retry) ───
    if provider == "elevenlabs" and has_scribe:
        result = transcribe_with_elevenlabs(audio_path, lang=lang, filename_hint=filename_hint)
        if not (result and result.get("segments")):
            return _stt_empty_result()

        hall_count, cov_pct = _stt_quality(result)
        if hall_count > 0 or cov_pct < 50:
            print(f" 🔄 Halucinacija/nizko pokritje ({cov_pct:.0f}%, "
                  f"{hall_count} hallucination segs) — RETRY Scribe...", file=sys.stderr)
            result2 = transcribe_with_elevenlabs(audio_path, lang=lang, filename_hint=filename_hint)
            if result2 and result2.get("segments"):
                h2, c2 = _stt_quality(result2)
                if h2 < hall_count or c2 > cov_pct:
                    print(f" ✅ Retry boljši: pokritje {cov_pct:.0f}% → {c2:.0f}%",
                          file=sys.stderr)
                    result = result2
        return result

    # ─── LOCAL faster-whisper ───
    return _transcribe_full_local(audio_path, lang=lang, model_size=model_size)
|
|
|
|
|
|
def _transcribe_full_local(audio_path, lang=None, model_size="small"):
    """Transcribe the whole audio file with a local faster-whisper model on CPU.

    lang=None → robust auto-detection: up to three 30 s samples are cut with
    ffmpeg, each is language-detected, and the language with the highest
    summed probability is locked in for the full pass.

    Returns {"language", "language_probability", "segments"} where each
    segment carries "start", "end", "text" and a "words" list. An empty
    transcript (language "unknown") is returned when Whisper raises
    ValueError/RuntimeError, e.g. on fully instrumental audio.
    """
    from faster_whisper import WhisperModel

    print(f"🧠 Whisper LOCAL {model_size}, lang={lang or 'auto'}", file=sys.stderr)
    m = WhisperModel(model_size, device="cpu", compute_type="int8")

    # Auto-detect with 3-sample voting so the full pass is locked to one language
    if not lang:
        print(" 🔍 Robust lang detection (3 samples)...", file=sys.stderr)
        try:
            duration_proc = subprocess.run(
                ["ffprobe", "-v", "error", "-show_entries", "format=duration",
                 "-of", "default=nw=1:nokey=1", audio_path],
                capture_output=True, text=True
            )
            audio_duration = float(duration_proc.stdout.strip())
        except Exception:
            # Best-effort: ffprobe missing/unparseable → assume a 3-minute song.
            audio_duration = 180.0

        lang_votes = {}
        for ss in [max(15, audio_duration * 0.15), audio_duration * 0.45, audio_duration * 0.75]:
            if ss + 5 > audio_duration:
                continue
            sample = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
            sample.close()
            try:
                subprocess.run(
                    ["ffmpeg", "-y", "-ss", str(ss), "-i", audio_path,
                     "-t", "30", "-vn", "-ac", "1", "-ar", "16000",
                     "-c:a", "pcm_s16le", sample.name],
                    check=True, capture_output=True
                )
                _, sample_info = m.transcribe(sample.name, language=None, vad_filter=False)
                sl, sp = sample_info.language, float(sample_info.language_probability)
                # Votes are weighted by detection probability, not counted 1:1.
                lang_votes[sl] = lang_votes.get(sl, 0) + sp
                print(f" sample @ {ss:.0f}s: {sl} (p={sp:.2f})", file=sys.stderr)
            except Exception as e:
                # A failed sample only loses its vote; report why it failed.
                print(f" sample @ {ss:.0f}s: failed ({e})", file=sys.stderr)
            finally:
                try:
                    os.unlink(sample.name)
                except Exception:
                    pass

        if lang_votes:
            lang = max(lang_votes.items(), key=lambda x: x[1])[0]
            print(f" ✅ Lang lock: {lang}", file=sys.stderr)

    try:
        segs, info = m.transcribe(
            audio_path,
            language=lang,
            word_timestamps=True,
            # The VAD filter sometimes drops vocals over music — better off for songs
            vad_filter=False,
            # Anti-hallucination settings
            condition_on_previous_text=False,
            temperature=0.0,
            compression_ratio_threshold=2.4,
            log_prob_threshold=-1.0,
            no_speech_threshold=0.6,
            # Beam search instead of greedy = more reliable decode (fewer hallucinations)
            beam_size=5,
            # Hallucination detection: do not turn long silence into text
            hallucination_silence_threshold=2.0,
        )
        detected_lang = info.language
        detected_prob = float(info.language_probability)
        print(f" Detekcija: {detected_lang} (p={detected_prob:.2f})", file=sys.stderr)

        # faster-whisper yields segments lazily: decoding actually runs while
        # iterating, so the iteration must stay inside this try block for the
        # failure handling below to cover it.
        segments = []
        for s in segs:
            words = [
                {"start": w.start, "end": w.end, "text": w.word}
                for w in (s.words or [])
            ]
            segments.append({
                "start": s.start,
                "end": s.end,
                "text": s.text.strip(),
                "words": words,
            })
    except (ValueError, RuntimeError) as e:
        # Whisper failure (e.g. on fully instrumental files)
        print(f" ⚠️ Whisper transcribe failed: {e}", file=sys.stderr)
        return {
            "language": "unknown",
            "language_probability": 0.0,
            "segments": [],
        }

    return {
        "language": detected_lang,
        "language_probability": detected_prob,
        "segments": segments,
    }
|
|
|
|
|
|
def compute_energy_profile(audio_path, window_sec=1.0):
    """Measure RMS level (dB) per window of roughly ``window_sec`` seconds.

    Runs ffmpeg with ``asetnsamples`` → ``astats`` → ``ametadata=print`` and
    parses the printed metadata. Returns a list of ``(timestamp, rms_db)``
    tuples; silent/invalid readings are clamped to -90.0 dB. The list is
    empty when ffmpeg produced no parsable output.
    """
    # One fixed-size audio frame per window.
    # NOTE(review): the sample count assumes 16 kHz input — confirm against
    # the caller; at other sample rates each frame is not window_sec long.
    frame_samples = int(16000 * window_sec)
    cmd = [
        "ffmpeg", "-i", audio_path,
        "-af", f"asetnsamples=n={frame_samples}:p=0,"
               # astats `reset` counts FRAMES, not seconds: reset after every
               # frame so each reading covers exactly one window, whatever
               # window_sec is.
               "astats=metadata=1:reset=1,"
               "ametadata=print:key=lavfi.astats.Overall.RMS_level:file=-",
        "-f", "null", "-",
    ]
    result = subprocess.run(cmd, capture_output=True, text=True)
    output = result.stdout + "\n" + result.stderr

    pts_pattern = re.compile(r"pts_time:(\S+)")
    energies = []
    current_pts = 0.0
    for line in output.split("\n"):
        line = line.strip()
        m = pts_pattern.search(line)
        if m:
            try:
                current_pts = float(m.group(1))
            except ValueError:
                pass
            continue
        if "RMS_level=" in line:
            val = line.split("RMS_level=")[-1].strip()
            try:
                rms = float(val)
            except ValueError:
                continue
            # -inf (digital silence) and NaN are replaced with -90
            if rms < -90 or rms != rms:
                rms = -90.0
            energies.append((current_pts, rms))
            # Fallback advance in case the next reading has no pts_time line.
            current_pts += window_sec

    return energies
|
|
|
|
|
|
def detect_vocal_sections(segments, max_gap=3.0):
    """Merge consecutive transcript segments into "vocal sections".

    A segment joins the running section unless the silence between the
    section's current end and the segment's start exceeds ``max_gap``
    seconds. Each section is a dict with "start", "end", the list of member
    "segments" and their space-joined "text". Returns [] for no segments.
    """
    if not segments:
        return []

    def _open_section(seg):
        return {
            "start": seg["start"],
            "end": seg["end"],
            "segments": [seg],
            "text": seg["text"],
        }

    merged = [_open_section(segments[0])]
    for seg in segments[1:]:
        running = merged[-1]
        if seg["start"] - running["end"] > max_gap:
            # Pause too long: close the running section, open a new one.
            merged.append(_open_section(seg))
            continue
        running["end"] = seg["end"]
        running["segments"].append(seg)
        running["text"] += " " + seg["text"]
    return merged
|
|
|
|
|
|
def avg_energy_in_range(energies, start, end):
    """Mean RMS of the readings whose timestamp lies in [start, end].

    ``energies`` is an iterable of ``(timestamp, rms_db)`` pairs. Returns
    -90.0 when no reading falls inside the (inclusive) range.
    """
    in_range = [level for stamp, level in energies if start <= stamp <= end]
    return sum(in_range) / len(in_range) if in_range else -90.0
|
|
|
|
|
|
def score_section_as_chorus(section, all_sections, energies, avg_rms):
    """Score a vocal section as a chorus candidate (higher = more likely).

    The score is the sum of four parts:
      - word repetition inside the section (low unique-word ratio), up to 30
      - energy above the song average: 8 points per dB
      - 25 points for every other section sharing at least half of this
        section's distinct words (and at least 3 words)
      - a length bonus: 10 for 3-25 s, 5 for longer, 2 for shorter
    Returns 0 when the section text contains no words.
    """
    find_words = re.compile(r"\b\w+\b").findall

    tokens = find_words(section["text"].lower())
    if not tokens:
        return 0
    vocab = set(tokens)

    # Repetition inside the section itself
    repetition_points = max(0, (1.0 - len(vocab) / len(tokens)) * 30)

    # Loudness relative to the whole song
    loudness = avg_energy_in_range(energies, section["start"], section["end"])
    loudness_points = max(0, loudness - avg_rms) * 8

    # How many other sections carry similar text
    overlap_needed = len(vocab) * 0.5
    recurrences = 0
    for other in all_sections:
        if other is section:
            continue
        shared = vocab & set(find_words(other["text"].lower()))
        if len(shared) >= overlap_needed and len(shared) >= 3:
            recurrences += 1
    recurrence_points = recurrences * 25

    # Section length
    span = section["end"] - section["start"]
    if span > 25:
        span_points = 5
    elif span >= 3:
        span_points = 10
    else:
        span_points = 2

    return repetition_points + loudness_points + recurrence_points + span_points
|
|
|
|
|
|
def find_chorus(transcript, energies, video_duration):
    """Find the most likely chorus among the transcript's vocal sections.

    Every vocal section is scored with ``score_section_as_chorus``. Returns
    None when the transcript has no vocal sections; otherwise a dict with
    the "best" candidate, the top ten "all_candidates" (highest score
    first) and "avg_rms_total", the song-wide mean RMS (-30.0 when there
    is no energy data). ``video_duration`` is accepted but not used.
    """
    sections = detect_vocal_sections(transcript["segments"])
    if not sections:
        return None

    if energies:
        avg_rms = sum(level for (_, level) in energies) / len(energies)
    else:
        avg_rms = -30.0

    def _describe(sec):
        points = score_section_as_chorus(sec, sections, energies, avg_rms)
        return {
            "start": sec["start"],
            "end": sec["end"],
            "duration": sec["end"] - sec["start"],
            "text_preview": sec["text"][:80],
            "score": round(points, 2),
            "avg_rms": round(avg_energy_in_range(energies, sec["start"], sec["end"]), 2),
        }

    ranked = [_describe(sec) for sec in sections]
    # Highest score first; ties keep transcript order (stable sort).
    ranked.sort(key=lambda c: -c["score"])

    return {
        "best": ranked[0],
        "all_candidates": ranked[:10],
        "avg_rms_total": round(avg_rms, 2),
    }
|
|
|
|
|
|
def smart_clip_range(chorus, transcript, video_duration,
                     target_duration=30, max_duration=45, min_duration=20,
                     include_prebuild=False):
    """Return the clip range EXACTLY as the LLM decided it.

    The system neither widens nor shortens the LLM's choice. The LLM has
    the whole transcript plus web-searched lyrics and can reason about:
      - which chorus
      - how many repetitions (1, 2, 3?)
      - whether to include an intro shout ('Ajmo Janezi!')
      - where it ends naturally

    Only safety nets are applied here:
      - no chorus at all → ``target_duration`` seconds around the middle of
        the video (reason "fallback_middle")
      - the range is clamped into [0, video_duration]
      - an empty/reversed range is repaired to ``target_duration`` seconds
        starting at the chosen start, or — when that start lies at/after
        the end of the video — to the last ``target_duration`` seconds

    ``transcript``, ``max_duration`` and ``min_duration`` are accepted for
    interface compatibility and are not used. Returns a dict with "start",
    "end", "duration" and "reason"; chorus-based results also carry the
    raw "chorus_start"/"chorus_end".
    """
    if not chorus or not chorus.get("best"):
        # Fallback: middle of the video
        mid = video_duration / 2
        start = max(0, mid - target_duration / 2)
        end = min(video_duration, start + target_duration)
        return {
            "start": start,
            "end": end,
            # Same key as the chorus-based result so callers can rely on it.
            "duration": round(end - start, 2),
            "reason": "fallback_middle",
        }

    best = chorus["best"]

    # Safety clamp only: never outside the video
    actual_start = min(max(0, best["start"]), video_duration)
    actual_end = min(video_duration, best["end"])

    # Repair an empty or reversed range (start >= end)
    if actual_start >= actual_end:
        actual_end = min(video_duration, actual_start + target_duration)
        if actual_start >= actual_end:
            # The start sat at/after the end of the video, so extending the
            # end cannot help — take the final target_duration seconds.
            actual_start = max(0, actual_end - target_duration)

    return {
        "start": round(actual_start, 2),
        "end": round(actual_end, 2),
        "duration": round(actual_end - actual_start, 2),
        "reason": "smart_chorus_with_prebuild" if include_prebuild else "smart_chorus_only",
        "chorus_start": round(best["start"], 2),
        "chorus_end": round(best["end"], 2),
    }
|
|
|
|
|
|
def detect_audio_fade(clip_range, transcript, video_duration=None):
    """Choose fade-in/fade-out durations and, if needed, extend the clip end
    so the cut does not chop lyrics at the end of the chorus.

    Logic:
      - clip starts inside a transcript segment → 0.05 s fade-in (click
        prevention only, so the first word is not swallowed); otherwise 0.2 s
      - clip ends inside a segment → the end is extended to that segment's
        end + 0.5 s (capped at ``video_duration`` when given) and the
        fade-out is 0.3 s; otherwise the end is kept and the fade-out is 0.4 s

    A clip that ends exactly where a segment *starts* does not cut into that
    segment and is therefore not extended over it.

    Returns {"fade_in", "fade_out", "extended_end", "ends_in_vocal"}.
    """
    cs, ce = clip_range["start"], clip_range["end"]

    starts_in_vocal = False
    end_segment = None
    for seg in transcript["segments"]:
        if seg["start"] <= cs <= seg["end"]:
            starts_in_vocal = True
        # Strict lower bound: with back-to-back segments a clip ending on the
        # shared boundary belongs to the earlier segment only — otherwise the
        # whole following line would be pulled into the clip.
        if seg["start"] < ce <= seg["end"]:
            end_segment = seg
    ends_in_vocal = end_segment is not None

    # If the clip ends inside a segment, extend to its end + 0.5 s buffer
    extended_end = ce
    if end_segment is not None:
        extended_end = end_segment["end"] + 0.5
        if video_duration is not None:
            extended_end = min(extended_end, video_duration)

    # Fade-in: when the clip starts DURING vocals it must be very short so it
    # does not cut the first word — just 0.05 s of click prevention, not an
    # audible fade. An instrumental lead-in can take a longer one.
    fade_in = 0.05 if starts_in_vocal else 0.2
    # Shorter fade-out when the (extended) clip ends right after the vocal
    fade_out = 0.3 if ends_in_vocal else 0.4

    return {
        "fade_in": fade_in,
        "fade_out": fade_out,
        "extended_end": round(extended_end, 2),
        "ends_in_vocal": ends_in_vocal,
    }
|
|
|
|
|
|
def _build_analysis_prompt(transcript, video_duration, target_duration=30, filename_hint=None, include_prebuild=False):
    """Build the single prompt used for the Claude/Gemini chorus analysis.

    transcript: dict whose "segments" each have "start", "end" and "text";
        they are rendered one per line as ``[start-end] text``.
    video_duration / target_duration: seconds, quoted in the prompt.
    filename_hint: when given, a block is added that tells the model to
        identify the song from the file name and web-search its lyrics.
    include_prebuild: if True, the clip may include a pre-chorus before the
        chorus. If False (default), it MUST be the chorus ONLY — strictly.

    Returns the prompt text (Slovenian), ending with the JSON answer schema.
    """
    lines = []
    for seg in transcript["segments"]:
        start = seg["start"]
        end = seg["end"]
        text = seg["text"].strip()
        lines.append(f"[{start:6.1f}-{end:6.1f}] {text}")
    transcript_text = "\n".join(lines)

    hint_block = ""
    if filename_hint:
        hint_block = f"""

🎵 IME DATOTEKE: "{filename_hint}"

🚨 **PRVI KORAK — VEDNO PRED ANALIZO**:
Iz imena datoteke prepoznaj izvajalca + naslov pesmi. Potem **OBVEZNO uporabi web_search tool** da poiščeš pravo besedilo pesmi — TUDI ČE MISLIŠ DA POZNAŠ PESEM.

Razlog: večinoma ne poznaš celotnih besedil pesmi (predvsem ne-angleških). Brez pravega besedila NE MOREŠ:
- Pravilno prepoznati strukture (verz / pre-chorus / chorus / bridge)
- Vedeti kje refren **začne in konča** (vključno z outro frazami)
- Popraviti STT halucinacij

📋 **Search strategija** (univerzalna za vse jezike):
1. Prvo iskanje: `[izvajalec] [naslov] lyrics` ALI `[izvajalec] [naslov] besedilo/Songtext/letra/versuri`
2. Če ni rezultatov: `[del transkripta - 4-5 zaporednih besed] lyrics`
3. Trusted lyrics sajti po jezikih:
- 🇸🇮 SLO: besedila.com, lyricstranslate.com
- 🇩🇪 DE: songtexte.com, lyricstranslate.com
- 🇭🇷🇷🇸 HR/SR/BS: tekstovi.net, lyricstranslate.com
- 🇪🇸 ES: letras.com, musica.com
- 🇷🇴 RO: versuri.ro, lyricstranslate.com
- 🇮🇹 IT: angolotesti.it
- 🇫🇷 FR: paroles.net
- 🇬🇧🇺🇸 EN: genius.com, azlyrics.com
- **Univerzalno**: lyricstranslate.com (vsi jeziki)

Ko najdeš lyrics:
- Identificiraj kateri del je REFREN (ponavlja se)
- Identificiraj VERZE (zgodba, ne ponavlja se)
- Identificiraj BRIDGE / PRE-CHORUS / OUTRO če obstajajo
- Mapiraj transkript timestamp-e na strukturne dele
- Popravi corrected_segments z dejanskim besedilom

🎯 **NASLOV PESMI — KAKO POVEZAN Z REFRENOM**:
Naslov pesmi je **včasih** v refrenu, **včasih** v verzu. NE silujte!
- POGOSTO v refrenu: "Pijan" → "pijan, pijan"; "Žena me tepe" → "Žena me tepe"; "Brajde" → "v brajde"
- VČASIH samo v verzu: "Cvetele so maline" → naslov je v VERZU, refren je drug ("Naj veter zdaj ponese...")
- **NE iskaj naslova v refrenu na silo** — refren je tisti del ki se **PONAVLJA** (2-3x), ne tisti ki vsebuje naslov

🎯 **KAKO RES IDENTIFICIRATI REFREN**:
1. Refren je tisti del besedila, ki se v pesmi **DEJANSKO PONOVI 2-3x z ENAKIM besedilom**
2. Verzi imajo **različno besedilo** vsakič (pripovedujejo zgodbo)
3. Najprej najdi besedilo, ki se ponavlja — TO je refren
4. Ko najdeš PRVI nastop ponavljajočega dela → tam začni clip

⚠️ **PAZI**: prvi verz pesmi se pogosto začne **takoj po intro-u** (5-15s) in je kontekstualen — TO NI REFREN. Refren običajno pride **po prvem verzu** (pri 30-60s, odvisno od pesmi).
"""

    # The two alternative "step 4" sections are kept as plain string locals
    # rather than literals nested inside the f-string below: they contain
    # '#' (markdown headings), which an f-string replacement field may not
    # contain before Python 3.12, and the final template stays readable.
    chorus_only_block = '''4. **🎯 IZBIRA REFRENA — VSEBINSKO RAZMIŠLJANJE**

TI ODLOČAŠ na osnovi vsebine, ritma, energije pesmi.
Sistem ne bo razširil ne skrajšal tvoje izbire — kar vrneš, to se uporabi.

## CILJ: ~30 sekund (tipično za TikTok/Instagram Reel)

Dolžina je SVOBODNA glede na strukturo pesmi:
- 12-15s = en kratki refren (če je pesem zelo kratka)
- **20-35s = dva zaporedna refrena (NAJBOLJŠA opcija!)** — največkrat idealno
- 30-40s = refren + drug refren če sta vsebinsko povezana

## STRATEGIJA — kako razmišljati:

1. **Najdi PRVI nastop refrena** v pesmi
- Refren je tisti del, ki se ponavlja v pesmi (običajno 2-4 vrstice)
- Identificiraj **PRVO vrstico refrena** (ne drugo, ne tretjo!)
- Primer: če refren je "V Ljubljani se obrnem nazaj / saj vrača me pogled / kjer gore se dotikajo neba / gorenjska ljubljena", **prva vrstica = "V Ljubljani se obrnem nazaj"**
- Primer: če refren je "Žena me tepe / mi prazni žepe / da vidi, kje in s kom sem bil", **prva vrstica = "Žena me tepe"**

2. **🎯 KRITIČNO: clip start = TOČNO prva beseda PRVE vrstice refrena**
- Če prva vrstica je "V Ljubljani se obrnem nazaj", clip start = beseda **"V"**, NE "obrnem"
- Če prva vrstica je "Žena me tepe", clip start = beseda **"Žena"**, NE "me"
- **OPOZORILO**: STT (Soniox) lahko združi konec verza z začetkom refrena v en segment, npr. segment "[43.6-47.6] doma. V Ljubljani se" — TUKAJ refren začne sredi segmenta na "V Ljubljani". Uporabi WORD-LEVEL timestampe da najdeš točno začetno besedo "V"!
- **NIKOLI ne začni** sredi besede ali sredi vrstice refrena

3. **Poglej KAJ SLEDI**:
- Če **takoj sledi DRUGI nastop ISTEGA refrena** (gap < 3s) → **vključi oba** = ~30s. ✅ TO JE NAJBOLJ POGOST PRIMER.
- Če sledi **drug refren** (B-refren z drugim besedilom) → samo prvi A-refren
- Če sledi instrumental break → samo prvi refren
- Če sledi takoj verz → samo prvi refren

4. **Vključi naravne intro klice/fraze**:
- "Ajmo Janezi!" pred BRAJDE refrenom = del refrena, vključi
- "Hey!" / "Yeah!" / "Oh!" intro klici = del refrena, vključi
- "Pa-pa!" / "La-la!" v začetku refrena = del refrena, vključi

5. **Naravni konec refrena**:
- Pevec drži zadnji ton 1-3s = **del refrena, vključi**
- Outro filler ("aj aj aj", "yeah yeah", "la la la") = **del refrena, vključi**
- **AMPAK**: če je outro filler **dolg in se ponavlja** (npr. "o o o o o o" 5+ sekund), **konča pred njim** — to je outro v fade-out, ne pomembna vsebina. Primer: refren konča z "podeželski rokenrol", potem "o o o" 5s = **odreži pri "rokenrol"**, ne pri zadnjem "o"
- Ne reži sredi besede ali sredi izpetega tona
- **OPOZORILO**: STT lahko zadnji izpeti ton ("oba", "doma", "srca") razpiše kot 15-20s segment (ker pevec drži ton + instrumental fade-out). To NI del besedila — odreži po prvih 1-2 sekundah po izpevu, NE čakaj 20s.

## PRIMERI — kako se razmišlja:

**BRAJDE (FIRBCI x LIMA LEN):**
- Refren 1: "Ajmo Janezi! Pejd' greva..." 41.8-49.4s + "Da v senci hladni..." 50.2-56.1s
- GAP < 3s
- Refren 2: "Pejd' greva..." 57.1-63.1s + "Da v senci..." 63.8-69.8s
- **Izbira: 41.8-69.8s = 28s** (dva zaporedna refrena z "Ajmo Janezi" intro klicem) ✅

**GORENJSKA LJUBLJENA (Fehtarji):**
- Refren prva vrstica: "V Ljubljani se obrnem nazaj"
- **POZOR**: Soniox segment je "[43.6-47.6] doma. V Ljubljani se" — refren začne **sredi segmenta** pri besedi "V" (~46s)
- Naslednji segment: "[48.2-59.7] obrnem nazaj, saj vrača me pogled..."
- **Pravilna izbira**: start = beseda "V" pri ~46s, **ne 48.2s** (kjer je 'obrnem' sredi vrstice!)
- End: konec refrena pri "gorenjska ljubljena" (~80s)

**CVETELE SO MALINE (Avsenik):**
- **POZOR**: naslov "Cvetele so maline" je v VERZU 2 (~125s), NE v refrenu!
- Pravi refren se začne s "Naj veter zdaj ponese moje sanje čez del neba, tja, kjer je ona doma. Zašepetaj ji bog tja pod kostanje, da sanjam še, kar sva nekoč oba"
- Prvi nastop refrena ~75s, drugi nastop ~150s
- **NE izberi outro/3. nastop** — izberi PRVI nastop refrena pri 75s
- **POZOR**: zadnja beseda refrena ("oba", "doma") lahko traja 15+ sekund v Soniox segmentu zaradi izpetega tona + instrumentala
- **Konec refrena**: 1-2 sekundi po zadnji izgovorjeni besedi, **NE čakaj 20s da ton zamre**

**Lady Gaga "Abracadabra":**
- Refren: "Abracadabra, amor..." 4 vrstice
- Ponavadi se 2x ponovi
- Izbira: oba refrena = ~30s

**Žena Me Tepe:**
- Refren: "Žena me tepe, mi prazni žepe..."
- Ponavadi je dolg (15s) in se ponovi
- Izbira: lahko 1 polni refren ali 2 zaporedna

## 🚫 ČESAR NE DELAJ:
- ❌ NE razširi v VERZE/KITICE (verz pripoveduje zgodbo, ima drugo besedilo)
- ❌ NE meša 2 RAZLIČNA refrena (A-refren + B-refren = napaka)
- ❌ NE začni sredi refrena (vedno na PRVI besedi)
- ❌ NE konča sredi besede ali izpetega tona
- ❌ NE razmišljaj samo o številu sekund — razmišljaj VSEBINSKO
'''

    prebuild_block = '''4. **IZBERI ODSEK — REFREN + PRE-CHORUS:**

Uporabnik je izbral način "**REFREN + PRE-CHORUS**".

## OBVEZNO: cel **PRVI** refren (kot opisano spodaj)

## OPCIJSKO: pre-chorus PRED refrenom
- **Pre-chorus = zadnja 1-2 vrstici verza tik pred refrenom** (slišne, povezane z refrenom)
- **Dodaj samo če**:
- Je tik pred refrenom (brez pavze ali instrumental vmes)
- Vsebinsko vodi v refren (gradnja občutka)
- Je kratek: 4-10 sekund
- **Ne dodajaj** če bi presegel skupno dolžino 35s

## REFREN — kot pri "samo refren":
- Začetek refrena = prva vrstica refrena
- Konec refrena = vključno z vsemi outro frazami in zadnjim držečim tonom
- Naravni izpev (ej-ej-ej, oh oh, la la la, etc.)

## Skupna dolžina: 18-35 sekund
'''

    # Exactly one of the two step-4 variants goes into the prompt.
    selection_block = prebuild_block if include_prebuild else chorus_only_block

    return f"""Tu je transcript pesmi iz STT modela (timestamp v sekundah, besedilo):

{transcript_text}

Cela pesem traja {video_duration:.1f}s. Cilj: izrezati ~{target_duration}s odsek za TikTok/Instagram Reel.{hint_block}

⚠️ POMEMBNO: STT lahko naredi napake v vseh jezikih, posebej:
- Pri narečjih, slovanskih jezikih, romanskih jezikih
- Generira "tipičen" tekst (npr. tekst druge pesmi istega izvajalca)
- Lahko vstavi besede ki se POdoBNO slišijo, ampak imajo ČISTO drug pomen

KAKO PREPOZNATI HALUCINACIJO:
- Tekst nima smisla v kontekstu pesmi
- Različni segmenti imajo nepovezane teme (kot da bi bilo več pesmi)
- Refren je v vsakem ponovitvi različen (refren se MORA ponavljati identično)
- Tekst je premalo **glede na trajanje** (več tišine = manj besed, ne več)

PROSIM:
1. Preberi celoten tekst in razumi strukturo (intro / verz / pre-chorus / refren / bridge / outro)
2. POPRAVI očitne halucinacije:
- Če prepoznaš pesem (po izvajalcu, naslovu, znaku besedila) → **uporabi PRAVO besedilo**
- Če halucinacijo ne moreš popraviti, **odstrani segment** (raje brez podnapisa kot napačen)
- Refren MORA imeti vse pojavitve ENAKE
- Popravi pomešane jezike (vse vrstice v enem jeziku)
- Ohrani timestamp-e nespremenjene
3. Prepoznaj REFREN: del besedila ki se PONAVLJA (ponavadi 2-4 vrstice, ki se v pesmi večkrat ponovijo). To je **univerzalno za vse jezike** — refren je strukturni element pesmi, ne le slovenske/nemške/angleške.

{selection_block}

5. Če transkript je v večini halucinacija (manj kot 30% smiselnih besed), v "reason" napiši "STT_HALLUCINATION_DETECTED"

Odgovori SAMO v JSON formatu (brez markdown, brez razlage):
{{
"start": <sekunde>,
"end": <sekunde>,
"reason": "<kratka razlaga>",
"chorus_text": "<besedilo refrena>",
"structure": "<1 stavek o strukturi pesmi>",
"language": "<jezik: sl/de/hr/bs/sr/en/it/es/fr>",
"hallucination_detected": <true/false>,
"corrected_segments": [
{{"start": <s>, "end": <s>, "text": "<popravljeno besedilo ALI prazno če halucinacija>"}}
]
}}

V "corrected_segments" vključi VSE segmente iz inputa s popravljenim besedilom. Halucinacije nadomesti s pravim besedilom (če veš) ALI pusti prazno besedilo."""
|
|
|
|
|
|
def _parse_llm_response(text, video_duration):
    """Parse the LLM's JSON answer; return None if it is invalid.

    Accepts raw JSON, JSON wrapped in a markdown code fence, or JSON with
    surrounding prose (the outermost ``{ ... }`` span is used). The answer
    is rejected — None is returned and the reason logged to stderr — when
    it is not parseable JSON, lacks a numeric "start"/"end", or the range
    does not satisfy 0 <= start < end <= video_duration.

    On success returns a dict with rounded "start"/"end"/"duration" plus the
    optional "reason", "chorus_text", "structure", "language" and
    "corrected_segments" fields taken from the answer.
    """
    text = text.strip()
    # Strip a markdown fence if present
    if text.startswith("```"):
        text = re.sub(r"^```(?:json)?\s*", "", text)
        text = re.sub(r"\s*```$", "", text)
    # Sometimes there is prose around the JSON: take the outermost { ... } span
    first_brace = text.find("{")
    last_brace = text.rfind("}")
    if first_brace >= 0 and last_brace > first_brace:
        text = text[first_brace:last_brace + 1]

    try:
        result = json.loads(text)
        start = float(result["start"])
        end = float(result["end"])
    except (ValueError, KeyError, TypeError) as e:
        # ValueError covers json.JSONDecodeError and non-numeric strings;
        # TypeError covers a non-object payload or null start/end.
        print(f" ⚠️ LLM response not parseable: {e!r}", file=sys.stderr)
        return None

    # Written as a positive chain so that NaN values are rejected as well
    # (every comparison involving NaN is False).
    if not (0 <= start < end <= video_duration):
        print(f" ⚠️ LLM returned invalid range: {start}-{end}", file=sys.stderr)
        return None

    return {
        "start": round(start, 2),
        "end": round(end, 2),
        "duration": round(end - start, 2),
        "reason": result.get("reason", ""),
        "chorus_text": result.get("chorus_text", ""),
        "structure": result.get("structure", ""),
        "language": result.get("language"),
        "corrected_segments": result.get("corrected_segments"),
    }
|
|
|
|
|
|
def _claude_final_text(content):
    """Return the final answer text from a Messages API ``content`` list.

    When citations are attached, the answer can be split over several
    consecutive text blocks, so the trailing run of text blocks is joined.
    If the last block is not a text block, the last text block found anywhere
    is used. Returns ``None`` when there is no text block at all.
    """
    trailing = []
    for block in reversed(content):
        if block.get("type") != "text":
            break
        trailing.append(block.get("text") or "")
    if trailing:
        return "".join(reversed(trailing)).strip()

    texts = [b.get("text") or "" for b in content if b.get("type") == "text"]
    return texts[-1].strip() if texts else None


def analyze_with_claude(transcript, video_duration, target_duration=30, model="claude-sonnet-4-6", filename_hint=None, include_prebuild=False):
    """Send the transcript to the Anthropic Messages API and parse the clip choice.

    Args:
        transcript: Transcript dict; analysis is skipped when it has no
            ``segments``.
        video_duration: Video length in seconds (used to validate the range).
        target_duration: Desired clip length in seconds, forwarded to the
            prompt builder.
        model: Anthropic model id (default ``claude-sonnet-4-6``; the original
            note also lists ``claude-haiku-4-5-20251001`` and
            ``claude-opus-4-7``).
        filename_hint: Original file name, forwarded to the prompt builder so
            the model can recognise the song.
        include_prebuild: Forwarded to the prompt builder.

    Returns:
        The dict produced by ``_parse_llm_response`` with an added ``source``
        key (``"claude:<model>"``), or ``None`` when the API key is missing,
        the transcript is empty, the request fails, or the answer is unusable.
    """
    import urllib.request
    import urllib.error

    api_key = os.environ.get("ANTHROPIC_API_KEY")
    if not api_key:
        print(" ⚠️ ANTHROPIC_API_KEY ni nastavljen — preskakujem Claude analizo", file=sys.stderr)
        return None

    if not transcript.get("segments"):
        return None

    prompt = _build_analysis_prompt(transcript, video_duration, target_duration, filename_hint=filename_hint, include_prebuild=include_prebuild)

    try:
        messages = [{"role": "user", "content": prompt}]

        # Server-side web search tool so the model can look up the real lyrics
        # for songs it does not know.
        tools = [{
            "type": "web_search_20250305",
            "name": "web_search",
            "max_uses": 3,  # at most 3 searches per job (original cost note: $0.03/job)
        }]

        # Agentic loop: the model may search, receive results and then answer.
        max_iterations = 5
        text = None
        for iteration in range(max_iterations):
            body = json.dumps({
                "model": model,
                "max_tokens": 8192,
                "messages": messages,
                "tools": tools,
            }).encode("utf-8")

            req = urllib.request.Request(
                "https://api.anthropic.com/v1/messages",
                data=body,
                headers={
                    "Content-Type": "application/json",
                    "x-api-key": api_key,
                    "anthropic-version": "2023-06-01",
                },
                method="POST",
            )
            with urllib.request.urlopen(req, timeout=180) as resp:
                data = json.loads(resp.read().decode("utf-8"))

            content = data.get("content", [])
            if not content:
                print(" ⚠️ Claude vrnil prazen odgovor", file=sys.stderr)
                return None

            stop_reason = data.get("stop_reason")
            if stop_reason == "max_tokens":
                # Output was cut off, so the JSON is incomplete.
                usage = data.get("usage", {})
                print(
                    f" ⚠️ Claude odrezan (max_tokens): "
                    f"input={usage.get('input_tokens')} output={usage.get('output_tokens')}",
                    file=sys.stderr,
                )
                return None

            # Finished turn: extract the answer text and leave the loop.
            if stop_reason in ("end_turn", "stop_sequence"):
                text = _claude_final_text(content)
                if text is not None:
                    break
                print(" ⚠️ Claude end_turn brez text bloka", file=sys.stderr)
                return None

            # Unfinished turn. "pause_turn" is what the API reports when a
            # server-side tool loop was paused; the partial assistant content
            # is sent back unchanged so the model can continue. "tool_use" is
            # only continued when the search results are already in the
            # content, because no client-side tool result can be supplied here.
            if stop_reason in ("tool_use", "pause_turn"):
                messages.append({"role": "assistant", "content": content})
                has_results = any(b.get("type") == "web_search_tool_result" for b in content)
                if stop_reason == "pause_turn" or has_results:
                    print(f" 🔍 Claude je iskal lyrics, čakam nadaljevanje (iter {iteration+1})", file=sys.stderr)
                    continue
                print(" ⚠️ tool_use brez results", file=sys.stderr)
                return None

            # Any other stop reason is treated as a failure.
            print(f" ⚠️ Nepričakovan stop_reason: {stop_reason}", file=sys.stderr)
            return None
        else:
            print(f" ⚠️ Presežena max_iterations ({max_iterations})", file=sys.stderr)
            return None

        result = _parse_llm_response(text, video_duration)
        if not result:
            return None

        # str(... or '') keeps logging from raising on null / non-string fields.
        print(f" 🤖 Claude ({model}) izbral: {result['start']:.1f}-{result['end']:.1f}s", file=sys.stderr)
        print(f" Razlog: {str(result.get('reason') or '')[:80]}", file=sys.stderr)
        print(f" Struktura: {str(result.get('structure') or '')[:80]}", file=sys.stderr)
        if result.get("corrected_segments"):
            print(f" Popravljeni segmenti: {len(result['corrected_segments'])}", file=sys.stderr)

        result["source"] = f"claude:{model}"
        return result
    except urllib.error.HTTPError as e:
        body = e.read().decode("utf-8", errors="replace")[:500]
        print(f" ❌ Claude API HTTP {e.code}: {body}", file=sys.stderr)
        return None
    except Exception as e:
        # Best-effort boundary: any failure means "no LLM result" so the
        # caller can fall back to another provider or the local heuristic.
        print(f" ❌ Claude analysis failed: {e}", file=sys.stderr)
        return None
|
|
|
|
|
|
def analyze_with_gemini(transcript, video_duration, target_duration=30, model="gemini-3.1-pro-preview", filename_hint=None, include_prebuild=False):
    """Send the transcript to the Google Gemini API and parse the clip choice.

    The original author's note credits Gemini 3.1 Pro with the best
    multilingual results (MMMLU 92.6%), i.e. a good fit for SLO/HR/BS lyrics.

    Args:
        transcript: Transcript dict; analysis is skipped when it has no
            ``segments``.
        video_duration: Video length in seconds (used to validate the range).
        target_duration: Desired clip length in seconds, forwarded to the
            prompt builder.
        model: Gemini model id used in the request URL.
        filename_hint: Original file name, forwarded to the prompt builder.
        include_prebuild: Forwarded to the prompt builder.

    Returns:
        The dict produced by ``_parse_llm_response`` with an added ``source``
        key (``"gemini:<model>"``), or ``None`` when no API key is set, the
        transcript is empty, the request fails, or the answer is unusable.
    """
    import urllib.request
    import urllib.error

    api_key = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
    if not api_key:
        print(" ⚠️ GEMINI_API_KEY ni nastavljen — preskakujem Gemini analizo", file=sys.stderr)
        return None

    if not transcript.get("segments"):
        return None

    prompt = _build_analysis_prompt(transcript, video_duration, target_duration, filename_hint=filename_hint, include_prebuild=include_prebuild)

    try:
        url = f"https://generativelanguage.googleapis.com/v1beta/models/{model}:generateContent?key={api_key}"
        # Author's note: Gemini 3.x Pro is a THINKING model and spends tokens
        # on internal reasoning (thoughtsTokenCount) as well. 4096 was too low:
        # thinking can take 1500-3000 tokens and the output (corrected_segments
        # for 60+ segments) another 3000-7000, which cut the JSON in half
        # (finishReason: MAX_TOKENS). 32768 leaves room for both.
        body = json.dumps({
            "contents": [{
                "role": "user",
                "parts": [{"text": prompt}],
            }],
            "generationConfig": {
                "temperature": 0.1,
                "maxOutputTokens": 32768,
                "responseMimeType": "application/json",
            },
        }).encode("utf-8")

        req = urllib.request.Request(
            url,
            data=body,
            headers={"Content-Type": "application/json"},
            method="POST",
        )
        with urllib.request.urlopen(req, timeout=180) as resp:
            data = json.loads(resp.read().decode("utf-8"))

        candidates = data.get("candidates", [])
        if not candidates:
            print(" ⚠️ Gemini vrnil 0 candidates", file=sys.stderr)
            return None

        cand0 = candidates[0]
        finish_reason = cand0.get("finishReason", "?")
        usage = data.get("usageMetadata", {})

        # MAX_TOKENS means the output was truncated and the JSON is invalid.
        if finish_reason == "MAX_TOKENS":
            print(
                f" ⚠️ Gemini odrezan (MAX_TOKENS): "
                f"prompt={usage.get('promptTokenCount')} "
                f"thoughts={usage.get('thoughtsTokenCount')} "
                f"output={usage.get('candidatesTokenCount')}",
                file=sys.stderr,
            )
            return None

        parts = cand0.get("content", {}).get("parts", [])
        if not parts:
            print(
                f" ⚠️ Gemini vrnil prazen content (finishReason={finish_reason}, "
                f"thoughts={usage.get('thoughtsTokenCount')})",
                file=sys.stderr,
            )
            return None

        # The answer may arrive in several parts; join every text part instead
        # of reading only the first one. Parts flagged as thought summaries are
        # not part of the answer and are skipped.
        text = "".join(
            (p.get("text") or "") for p in parts if not p.get("thought")
        ).strip()
        if not text:
            print(
                f" ⚠️ Gemini vrnil prazen text (finishReason={finish_reason}, "
                f"thoughts={usage.get('thoughtsTokenCount')}, "
                f"output={usage.get('candidatesTokenCount')})",
                file=sys.stderr,
            )
            return None

        result = _parse_llm_response(text, video_duration)
        if not result:
            return None

        # str(... or '') keeps logging from raising on null / non-string fields.
        print(f" 🤖 Gemini ({model}) izbral: {result['start']:.1f}-{result['end']:.1f}s", file=sys.stderr)
        print(f" Razlog: {str(result.get('reason') or '')[:80]}", file=sys.stderr)
        print(f" Struktura: {str(result.get('structure') or '')[:80]}", file=sys.stderr)
        if result.get("corrected_segments"):
            print(f" Popravljeni segmenti: {len(result['corrected_segments'])}", file=sys.stderr)

        result["source"] = f"gemini:{model}"
        return result
    except urllib.error.HTTPError as e:
        body = e.read().decode("utf-8", errors="replace")[:500]
        print(f" ❌ Gemini API HTTP {e.code}: {body}", file=sys.stderr)
        return None
    except Exception as e:
        # Best-effort boundary: any failure means "no LLM result".
        print(f" ❌ Gemini analysis failed: {e}", file=sys.stderr)
        return None
|
|
|
|
|
|
def analyze_with_llm(transcript, video_duration, target_duration=30, provider="claude", llm_model=None, filename_hint=None, include_prebuild=False):
    """Run the LLM analysis with the selected provider.

    Args:
        transcript: Transcript dict passed through to the provider function.
        video_duration: Video length in seconds.
        target_duration: Desired clip length in seconds.
        provider: ``"claude"``, ``"gemini"`` or ``"auto"`` (Claude first,
            Gemini as fallback).
        llm_model: Optional explicit model id; each provider falls back to its
            own default when it is ``None``.
        filename_hint: Original file name, passed through.
        include_prebuild: Passed through.

    Returns:
        The provider's result dict, or ``None`` when the analysis failed or
        the provider name is unknown.
    """
    default_claude = "claude-sonnet-4-6"
    default_gemini = "gemini-3.1-pro-preview"

    if provider == "gemini":
        return analyze_with_gemini(
            transcript, video_duration, target_duration,
            llm_model or default_gemini,
            filename_hint=filename_hint, include_prebuild=include_prebuild,
        )

    if provider == "claude":
        return analyze_with_claude(
            transcript, video_duration, target_duration,
            llm_model or default_claude,
            filename_hint=filename_hint, include_prebuild=include_prebuild,
        )

    if provider == "auto":
        # One explicit model id cannot be valid for both vendors. A name that
        # clearly belongs to the other vendor is replaced by this vendor's
        # default, so the fallback is not doomed to fail; unrecognised names
        # are still passed to both, as before.
        model_name = (llm_model or "").lower()
        claude_model = llm_model if llm_model and not model_name.startswith("gemini") else default_claude
        gemini_model = llm_model if llm_model and not model_name.startswith("claude") else default_gemini

        result = analyze_with_claude(
            transcript, video_duration, target_duration,
            claude_model,
            filename_hint=filename_hint, include_prebuild=include_prebuild,
        )
        if result:
            return result
        print(" 🔄 Claude ni uspel, probam Gemini...", file=sys.stderr)
        return analyze_with_gemini(
            transcript, video_duration, target_duration,
            gemini_model,
            filename_hint=filename_hint, include_prebuild=include_prebuild,
        )

    print(f" ⚠️ Neznan LLM provider: {provider}", file=sys.stderr)
    return None
|
|
|
|
|
|
|
|
def is_instrumental(transcript, video_duration, threshold=0.1):
    """Decide whether the song counts as instrumental.

    The durations of all transcript segments are summed and divided by the
    video duration (clamped to at least 1 second). The song is instrumental
    when that share is strictly below ``threshold``; a transcript without
    segments is always instrumental.
    """
    segments = transcript.get("segments")
    if not segments:
        return True

    sung_seconds = 0
    for seg in segments:
        sung_seconds += seg["end"] - seg["start"]

    total_seconds = max(video_duration, 1)
    return bool(sung_seconds / total_seconds < threshold)
|
|
|
|
|
|
# Outro fillers (multi-language) that may trail the last chorus word.
_CLIP_OUTRO_FILLER_RE = re.compile(
    r'^[\s\-,.!?]*'
    r'((?:la|na|oh|ah|eh|ej|aj|ja|hey|yeah|yo|ho|wo|hu|mm|nn|uu|oo|aa|ee|ii)'
    r'[\s\-,.!?]*)+'
    r'[\s\-,.!?]*$',
    re.IGNORECASE,
)

# The clip start is pulled back by at most this many words ...
_CLIP_MAX_LOOKBACK_WORDS = 2
# ... and only across gaps shorter than this many seconds.
_CLIP_WORD_GAP_S = 0.5


def _clip_build_arg_parser():
    """Build the command-line parser for the analysis script."""
    ap = argparse.ArgumentParser()
    ap.add_argument("video", help="Vhod video file")
    ap.add_argument("--lang", default=None, help="ISO 639-1 ali 'auto' (default: auto)")
    ap.add_argument("--model", default="large-v3", help="Whisper model")
    ap.add_argument("--target-duration", type=float, default=30.0)
    ap.add_argument("--max-duration", type=float, default=45.0)
    ap.add_argument("--min-duration", type=float, default=20.0)
    ap.add_argument("--include-prebuild", action="store_true",
                    help="Vključi pre-chorus build-up (privzeto: ne)")
    ap.add_argument("--no-claude", action="store_true",
                    help="Preskoči LLM analizo (uporabi samo lokalno heuristiko)")
    ap.add_argument("--llm-provider", default="claude",
                    choices=["claude", "gemini", "auto"],
                    help="Kateri LLM uporabiti za analizo (default: claude)")
    ap.add_argument("--llm-model", default=None,
                    help="Specifičen model (npr. claude-sonnet-4-6, gemini-3.1-pro-preview)")
    ap.add_argument("--filename-hint", default=None,
                    help="Originalno ime datoteke (Claude lahko prepozna pesem)")
    ap.add_argument("--whisper-provider", default="auto",
                    choices=["auto", "soniox", "elevenlabs", "local", "hybrid", "gemini"],
                    help="STT provider: "
                         "soniox=Soniox stt-async-v4 ($0.10/h, 5-15s, najboljši za NZ, PRIPOROČENO), "
                         "elevenlabs=Scribe ($0.40/h, halucinacije pri NZ), "
                         "gemini=Gemini 3 Pro ($3-5/h, počasen), "
                         "auto=Soniox primary + fallback chain (PRIVZETO)")
    ap.add_argument("--json", action="store_true", help="Output JSON")
    ap.add_argument("--output", help="Path za JSON output")
    return ap


def _clip_segment_bounds(seg):
    """Return ``(start, end, text)`` of a segment, or ``None`` if it is malformed.

    LLM-corrected segments are untrusted JSON, so null or non-numeric
    timestamps and null text must not crash the clip refinement.
    """
    try:
        start = float(seg.get("start", 0))
        end = float(seg.get("end", 0))
        text = str(seg.get("text") or "").strip()
    except (AttributeError, TypeError, ValueError):
        return None
    return start, end, text


def _clip_word_timings(transcript):
    """Collect every word that has both a start and an end timestamp."""
    words = []
    for seg in transcript.get("segments", []):
        for w in seg.get("words") or []:
            if w.get("start") is not None and w.get("end") is not None:
                words.append({
                    "start": float(w["start"]),
                    "end": float(w["end"]),
                    "text": w.get("text") or "",
                })
    return words


def _clip_lookback_anchor(words, idx):
    """Return the index of the earliest word to include when starting at ``idx``.

    Walks back over at most ``_CLIP_MAX_LOOKBACK_WORDS`` words and stops at
    the first gap of ``_CLIP_WORD_GAP_S`` seconds or more.
    """
    anchor = idx
    for j in range(idx, max(0, idx - _CLIP_MAX_LOOKBACK_WORDS), -1):
        if words[j]["start"] - words[j - 1]["end"] >= _CLIP_WORD_GAP_S:
            break
        anchor = j - 1
    return anchor


def _clip_extend_start_to_word(clip_start, transcript, fallback_segments):
    """Move the clip start back so it does not cut off a word.

    The STT provider may place a segment boundary AFTER the first word of a
    phrase, and the LLM tends to cut at the segment start, which drops that
    word. With word-level timestamps the start is moved to the word that
    spans it, or that ends up to 0.5 s before it while the next word begins
    at/after the start, plus up to two closely preceding words. Without
    word-level data the start is moved back 0.5 s when it falls inside one of
    ``fallback_segments``.

    Word timings are always read from ``transcript`` because LLM-corrected
    segments carry only segment-level start/end.

    Returns the (possibly earlier) start; equals ``clip_start`` if unchanged.
    """
    words = _clip_word_timings(transcript)

    if not words:
        for seg in fallback_segments:
            bounds = _clip_segment_bounds(seg)
            if bounds is None:
                continue
            seg_start, seg_end, _ = bounds
            if seg_start < clip_start < seg_end:
                new_start = max(0, clip_start - 0.5)
                print(f" 🎵 Razširim clip začetek {clip_start:.2f}s → {new_start:.2f}s "
                      f"(brez word-level, fallback -0.5s)", file=sys.stderr)
                return new_start
        return clip_start

    for i, w in enumerate(words):
        # The clip start falls in the middle of this word.
        if w["start"] < clip_start < w["end"]:
            anchor_idx = _clip_lookback_anchor(words, i)
            new_start = max(0, words[anchor_idx]["start"])  # no extra buffer
            captured = " ".join(w2["text"].strip() for w2 in words[anchor_idx:i + 1])
            print(f" 🎵 Razširim clip začetek {clip_start:.2f}s → {new_start:.2f}s "
                  f"(clip sredi besede; ujamem '{captured}')", file=sys.stderr)
            return new_start

        # This word ended just before the clip start (at most 0.5 s earlier)
        # and the next word starts at/after the clip start.
        if 0 < (clip_start - w["end"]) <= 0.5:
            next_w = words[i + 1] if i + 1 < len(words) else None
            if next_w and next_w["start"] >= clip_start - 0.1:
                anchor_idx = _clip_lookback_anchor(words, i)
                new_start = max(0, words[anchor_idx]["start"])  # no extra buffer
                captured = " ".join(w2["text"].strip() for w2 in words[anchor_idx:i + 1])
                print(f" 🎵 Razširim clip začetek {clip_start:.2f}s → {new_start:.2f}s "
                      f"(beseda '{w['text'].strip()}' tik pred clip start; "
                      f"ujamem celo frazo '{captured}')", file=sys.stderr)
                return new_start

    return clip_start


def _clip_extend_end_over_outro(clip_end, segments, soft_limit):
    """Extend the clip end to the next natural pause, never past ``soft_limit``.

    The end grows only when a segment overlapping the current end finishes
    within the limit, or when the following segment starts less than 0.7 s
    later, lasts at most 2.5 s and consists solely of outro fillers. Anything
    else is treated as the next verse/chorus and stops the extension.
    """
    current_end = clip_end
    for seg in segments:
        bounds = _clip_segment_bounds(seg)
        if bounds is None:
            continue
        seg_start, seg_end, seg_text = bounds

        if seg_start <= current_end:
            # Segment overlaps the clip end: let the last line finish.
            if current_end < seg_end <= soft_limit:
                new_end = min(seg_end + 0.3, soft_limit)
                if new_end > current_end:
                    print(f" 🎵 Podaljšam clip {current_end:.1f}s → {new_end:.1f}s "
                          f"(zadnji segment refrena se zaključi)", file=sys.stderr)
                    current_end = new_end
            continue

        # Segment starts after the clip end: accept only a short outro filler.
        if seg_start - current_end >= 0.7:
            break  # too far away
        if (seg_end - seg_start) > 2.5:
            break  # too long, so it is a new verse/chorus
        if not _CLIP_OUTRO_FILLER_RE.match(seg_text):
            break  # real lyrics, not a filler

        new_end = min(seg_end + 0.2, soft_limit)
        if new_end <= current_end:
            break
        print(f" 🎵 Podaljšam clip {current_end:.1f}s → {new_end:.1f}s "
              f"(outro filler '{seg_text[:40]}')", file=sys.stderr)
        current_end = new_end

    return current_end


def main():
    """CLI entry point: analyse a whole video and choose the clip range.

    Steps: extract audio, transcribe, compute the energy profile, detect
    instrumental songs, ask an LLM for the chorus range (optional), refine
    that range (duration caps, word-aligned start, natural-pause end), fall
    back to the local heuristic when needed, compute fade parameters and emit
    the result as JSON (``--output`` file and/or stdout with ``--json``).
    All progress messages go to stderr. The temporary audio file is removed
    on exit.
    """
    args = _clip_build_arg_parser().parse_args()

    video = Path(args.video)
    if not video.exists():
        print(f"❌ Video ne obstaja: {video}", file=sys.stderr)
        sys.exit(1)

    duration = get_video_duration(video)
    print(f"📹 Video: {video.name}, {duration:.1f}s", file=sys.stderr)

    # 1. Extract audio
    audio = extract_audio(video)

    try:
        # 2. Transcript
        lang = None if args.lang in (None, "auto", "") else args.lang
        # The file name (without extension) helps STT detect the language and
        # lets the LLM recognise the song.
        fname_hint = args.filename_hint or video.stem
        transcript = transcribe_full(
            audio, lang=lang, model_size=args.model,
            provider=args.whisper_provider,
            filename_hint=fname_hint,
        )
        print(f" Transkripcija: {len(transcript['segments'])} segmentov", file=sys.stderr)

        # 3. Energy profile
        print("⚡ Energy profile...", file=sys.stderr)
        energies = compute_energy_profile(audio)
        print(f" Energy samples: {len(energies)}", file=sys.stderr)

        # 4. Instrumental detection
        instrumental = is_instrumental(transcript, duration)
        print(f"🎵 Instrumentalna: {instrumental}", file=sys.stderr)

        # 5a. PRIMARY: LLM analysis (understands the full lyrics + corrections)
        claude_result = None
        if not instrumental and not args.no_claude:
            provider = args.llm_provider
            print(f"🤖 Pošiljam transkript {provider}-u za analizo...", file=sys.stderr)
            claude_result = analyze_with_llm(
                transcript, duration, target_duration=args.target_duration,
                provider=provider, llm_model=args.llm_model,
                filename_hint=fname_hint,
                include_prebuild=args.include_prebuild,
            )

        # 5b. Local chorus detection (fallback and score preview)
        if not instrumental:
            chorus = find_chorus(transcript, energies, duration)
        else:
            # Instrumental: pick the window with the highest average energy.
            window = args.target_duration
            best_start = 0
            best_avg = -100
            t = 0
            while t + window <= duration:
                avg = avg_energy_in_range(energies, t, t + window)
                if avg > best_avg:
                    best_avg = avg
                    best_start = t
                t += 5  # step 5s

            # A video shorter than the window never enters the loop above;
            # clamp so the clip does not run past the end of the video.
            window_end = best_start + window
            window_len = window
            if 0 < duration < window_end:
                window_end = duration
                window_len = duration - best_start

            chorus = {
                "best": {
                    "start": best_start,
                    "end": window_end,
                    "duration": window_len,
                    "text_preview": "(instrumental — energy peak)",
                    "score": 0,
                    "avg_rms": round(best_avg, 2),
                },
                "all_candidates": [],
                "avg_rms_total": round(
                    sum(r for (_, r) in energies) / len(energies) if energies else -30, 2
                ),
            }

        # 6. Clip range: the LLM has priority, otherwise smart_clip_range.
        # NOTE: the variable is historically named claude_result but holds the
        # result of ANY LLM (Claude or Gemini), see analyze_with_llm().
        # llm_source is e.g. "claude:claude-sonnet-4-6".
        if claude_result:
            if claude_result.get("hallucination_detected"):
                # The LLM reported that the transcript does not match the
                # audio, so its clip choice cannot be trusted.
                print("⚠️ HALUCINACIJA DETECT-ANA — fallback na local heuristic "
                      "(Scribe transkript ne ustreza zvočnemu vsebini)", file=sys.stderr)
                clip_range = smart_clip_range(
                    chorus, transcript, duration,
                    target_duration=args.target_duration,
                    max_duration=args.max_duration,
                    min_duration=args.min_duration,
                    include_prebuild=args.include_prebuild,
                )
                clip_range["source"] = "local_fallback_after_hallucination"
                clip_range["reason"] = (
                    "STT halucinacija — local heuristic fallback. "
                    "Refren je iz energy-based detekcije, ne iz transkripta. "
                    + clip_range.get("reason", "")
                )
                claude_result = None  # also disables the segment replacement below
            else:
                llm_source = claude_result.get("source", "llm")
                clip_range = {
                    "start": claude_result["start"],
                    "end": claude_result["end"],
                    "duration": claude_result["duration"],
                    "reason": f"{llm_source}: " + (claude_result.get("reason") or ""),
                    "chorus_text": claude_result.get("chorus_text", ""),
                    "structure": claude_result.get("structure", ""),
                    "source": llm_source,
                }

                # Cap at max_duration if the LLM overshoots.
                if clip_range["duration"] > args.max_duration:
                    clip_range["end"] = clip_range["start"] + args.max_duration
                    clip_range["duration"] = args.max_duration
                    clip_range["reason"] += " (capped at max_duration)"

                # Enforce min_duration by extending the end (up to video end).
                if clip_range["duration"] < args.min_duration:
                    needed = args.min_duration - clip_range["duration"]
                    new_end = min(clip_range["end"] + needed, duration)
                    actual_extension = new_end - clip_range["end"]
                    clip_range["end"] = new_end
                    clip_range["duration"] = clip_range["end"] - clip_range["start"]
                    clip_range["reason"] += f" (extended +{actual_extension:.1f}s to meet min_duration)"

                corrected_segs = claude_result.get("corrected_segments") or transcript["segments"]
                current_end = clip_range["end"]
                # Hard limit for the end extension, computed from the start
                # BEFORE it is moved back: max 5 s above max_duration and
                # never beyond the audio.
                extension_limit = min(
                    clip_range["start"] + args.max_duration + 5,
                    duration,
                )

                # ── Extend the clip START back to a word boundary ──
                new_start = _clip_extend_start_to_word(
                    clip_range["start"], transcript, corrected_segs
                )
                if new_start < clip_range["start"]:
                    clip_range["start"] = round(new_start, 2)
                    clip_range["duration"] = round(clip_range["end"] - new_start, 2)
                    clip_range["reason"] += " (start extended back)"

                # ── LAYER 3: audio amplitude check at the clip start ──
                # Last line of defence when STT missed a word: measure the
                # mean volume of the first 150 ms and, if it is louder than
                # -35 dB, add a 0.5 s buffer before the start.
                # NOTE(review): volumedetect appears to report its statistics
                # at ffmpeg's "info" log level, so with "-loglevel error" the
                # "mean_volume" line is most likely never present and this
                # check never fires. Switching to "-loglevel info" would move
                # nearly every clip start 0.5 s earlier, so the behaviour is
                # deliberately left unchanged here; confirm the intent first.
                try:
                    probe_start = clip_range["start"]
                    probe_dur = 0.15  # first 150 ms
                    if probe_start >= 0.5:  # only if there is room for the buffer
                        cmd_probe = [
                            "ffmpeg", "-hide_banner", "-loglevel", "error",
                            "-ss", str(probe_start), "-t", str(probe_dur),
                            "-i", str(args.video),
                            "-af", "volumedetect",
                            "-f", "null", "-"
                        ]
                        pr = subprocess.run(cmd_probe, capture_output=True, text=True, timeout=10)
                        output = pr.stderr or ""
                        # Looking for "mean_volume: -XX.X dB"
                        m = re.search(r'mean_volume:\s*(-?\d+\.?\d*)\s*dB', output)
                        if m:
                            mean_db = float(m.group(1))
                            if mean_db > -35:
                                # Audio is already loud at the clip start.
                                old_start = clip_range["start"]
                                new_start = max(0, old_start - 0.5)
                                if new_start < old_start:
                                    print(f" 🎵 Audio amplitude check: prvih {probe_dur}s "
                                          f"ima mean_volume {mean_db:.1f} dB (> -35 dB = vokal/glasba). "
                                          f"Razširim clip {old_start:.2f}s → {new_start:.2f}s.", file=sys.stderr)
                                    clip_range["start"] = round(new_start, 2)
                                    clip_range["duration"] = round(clip_range["end"] - new_start, 2)
                                    clip_range["reason"] += " (amplitude defense -0.5s)"
                            else:
                                print(f" 🎵 Audio amplitude check: prvih {probe_dur}s "
                                      f"ima mean_volume {mean_db:.1f} dB (≤ -35 dB = tiho). OK.", file=sys.stderr)
                except Exception as _e:
                    # Best effort: a failed probe must not abort the analysis.
                    print(f" ⚠️ Audio amplitude check skipped: {_e}", file=sys.stderr)

                # ── Extend the clip END to the next natural pause ──
                # The LLM often cuts exactly on the last chorus word while the
                # singer still holds the note. Soft cap: at most 3 s beyond
                # the end chosen so far.
                soft_extension_limit = min(clip_range["end"] + 3.0, extension_limit)
                current_end = _clip_extend_end_over_outro(
                    current_end, corrected_segs, soft_extension_limit
                )
                if current_end > clip_range["end"]:
                    clip_range["end"] = round(current_end, 2)
                    clip_range["duration"] = round(current_end - clip_range["start"], 2)
                    clip_range["reason"] += " (extended to natural pause)"
        else:
            clip_range = smart_clip_range(
                chorus, transcript, duration,
                target_duration=args.target_duration,
                max_duration=args.max_duration,
                min_duration=args.min_duration,
                include_prebuild=args.include_prebuild,
            )
            clip_range["source"] = "local_heuristic"
        print(f"✂ Clip range: {clip_range['start']:.1f}s - {clip_range['end']:.1f}s "
              f"(duration: {clip_range['duration']}s, source: {clip_range.get('source')})",
              file=sys.stderr)

        # If the LLM returned corrected segments, use them (better subtitles).
        if claude_result and claude_result.get("corrected_segments"):
            corrected = claude_result["corrected_segments"]
            # Keep word-level timing from the original; only the text changes.
            orig_by_start = {round(s["start"], 1): s for s in transcript["segments"]}
            new_segments = []
            for cs in corrected:
                try:
                    cs_start = float(cs["start"])
                    cs_end = float(cs["end"])
                    raw_text = cs["text"]
                except (KeyError, ValueError, TypeError):
                    continue
                # A JSON null means "no text", not the literal word "None".
                cs_text = "" if raw_text is None else str(raw_text).strip()

                # Original segment with the same (or nearest, < 1 s) start.
                orig = orig_by_start.get(round(cs_start, 1))
                if not orig:
                    closest_diff = 999
                    for s in transcript["segments"]:
                        diff = abs(s["start"] - cs_start)
                        if diff < closest_diff and diff < 1.0:
                            closest_diff = diff
                            orig = s
                new_segments.append({
                    "start": cs_start,
                    "end": cs_end,
                    "text": cs_text,
                    # The LLM returns no word timings; keep the originals
                    # when a matching segment exists.
                    "words": orig.get("words", []) if orig else [],
                })
            transcript["segments"] = new_segments
            transcript["claude_corrected"] = True  # key name kept for backward compat
            # Also update the language when the LLM disagrees.
            if claude_result.get("language") and claude_result["language"] != transcript["language"]:
                print(f" ✏️ LLM je popravil jezik: {transcript['language']} → {claude_result['language']}", file=sys.stderr)
                transcript["language"] = claude_result["language"]
            llm_label = claude_result.get("source", "LLM")
            print(f" ✏️ Whisper segmenti zamenjani z {llm_label} popravljenimi ({len(new_segments)})", file=sys.stderr)

        # 7. Fade params (may ask for a later end if the clip ends mid-vocal)
        fade = detect_audio_fade(clip_range, transcript, video_duration=duration)
        print(f"🎚 Fade: in={fade['fade_in']}s, out={fade['fade_out']}s", file=sys.stderr)

        if fade.get("extended_end") and fade["extended_end"] > clip_range["end"]:
            old_end = clip_range["end"]
            new_end = min(fade["extended_end"], clip_range["start"] + args.max_duration)
            # The max_duration cap can be below the current end (the steps
            # above may exceed max_duration by a few seconds); applying it
            # would SHORTEN the clip, so only ever move the end forward.
            if new_end > old_end:
                clip_range["end"] = round(new_end, 2)
                clip_range["duration"] = round(new_end - clip_range["start"], 2)
                print(f" ↳ Razširjen za {new_end - old_end:.1f}s (zaključek besedila)",
                      file=sys.stderr)

        result = {
            "video": str(video),
            "video_duration": duration,
            "language": transcript["language"],
            "language_probability": transcript["language_probability"],
            "instrumental": instrumental,
            "transcript": transcript,
            "chorus": chorus,
            "clip_range": clip_range,
            "fade": fade,
            "claude_used": claude_result is not None,
            "claude_corrected_text": bool(claude_result and claude_result.get("corrected_segments")),
        }

        if args.output:
            with open(args.output, "w", encoding="utf-8") as f:
                json.dump(result, f, ensure_ascii=False, indent=2)
            print(f"💾 Saved: {args.output}", file=sys.stderr)

        if args.json:
            print(json.dumps(result, ensure_ascii=False))

    finally:
        # Best-effort cleanup of the temporary audio file; a failure here
        # must not mask the real outcome of the analysis.
        try:
            os.unlink(audio)
        except Exception:
            pass
|
|
|
|
|
|
# Script entry point: run the CLI only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|