User feedback: 'Tikaj more llm razmislat in ineti filing kaj dat notri'.
Now that the Soniox transcript is accurate, the LLM has all the information it needs to decide based on content.
TWO CHANGES:
1. smart_clip_range() — REMOVED forced extension logic:
Before: if duration < min_duration (20s):
- extend to next chorus (40% match) ← WRONG! merged with B-chorus
- extend symmetrically into VERSE ← WRONG! brought in kitica
- cap at max_duration
After: trust LLM completely. Only safety: clamp to video bounds.
2. Prompt rewrite — content-driven instead of number-driven:
Before: 'Skupna dolžina: 12-25 sekund (običajno)' + conflicting '~30s'
'❌ Drugi/tretji nastop refrena — uporabi PRVI'
After: '~30 sekund (NAJBOLJŠA opcija = dva zaporedna refrena)'
'Vključi naravne intro klice (Ajmo Janezi! Hey! Pa-pa!)'
'BRAJDE primer: 41.8-69.8s = 28s (dva refrena z Ajmo Janezi intro)'
'NE meša 2 RAZLIČNA refrena (A + B = napaka)'
'NE razširi v VERZE/KITICE'
For BRAJDE this means:
- Old: Claude picked 57.1-69.8s (12.7s, 2nd chorus, no Ajmo)
Code forced extension to 57.06-82.5s (mixed with B-chorus + verse)
- New: Claude picks 41.8-69.8s (28s, 2 choruses with 'Ajmo Janezi!' intro)
Code returns exactly that — no forced extension.
2274 lines
99 KiB
Python
2274 lines
99 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
analyze.py — Predhodna analiza CELEGA videa pred trim-anjem.
|
|
|
|
Naredi:
|
|
1. Whisper transcript celega videa (auto-detect jezika ali user-specified)
|
|
2. Energy profile (RMS dB na 1s windows)
|
|
3. Structural detection (vocal/instrumental sections, energy peaks)
|
|
4. Pametno izbere clip range (lahko >30s, vključi pre-chorus)
|
|
5. Detekcija instrumentalnih pesmi (no_subs auto)
|
|
|
|
Output: JSON s podatki za clip.py
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import os
|
|
import re
|
|
import subprocess
|
|
import sys
|
|
import tempfile
|
|
import time
|
|
from pathlib import Path
|
|
|
|
|
|
def get_video_duration(path):
    """Return the duration of a media file in seconds, as reported by ffprobe.

    Args:
        path: Path (str or path-like) of the media file to probe.

    Returns:
        The duration as a float, or 0.0 when ffprobe cannot be started or
        when its output is not a number (e.g. missing/unreadable file).
    """
    try:
        r = subprocess.run(
            ["ffprobe", "-v", "error", "-show_entries", "format=duration",
             "-of", "default=nw=1:nokey=1", str(path)],
            capture_output=True, text=True
        )
    except OSError:
        # ffprobe is not installed or not executable: report "unknown" the
        # same way as unparsable output instead of crashing the caller.
        return 0.0

    try:
        return float(r.stdout.strip())
    except ValueError:
        # Empty stdout (ffprobe failed) or a non-numeric value.
        return 0.0
|
|
|
|
|
|
def extract_audio(video_path):
    """Extract the audio track into a temporary 16 kHz mono 16-bit PCM WAV file.

    Args:
        video_path: Path of the input media file handed to ffmpeg.

    Returns:
        Path (str) of the created ``.wav`` file. The file is created with
        ``delete=False``, so the caller is responsible for removing it.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
        OSError: If ffmpeg cannot be started.
    """
    audio = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    # Close our handle right away; ffmpeg overwrites the file by name ("-y").
    audio.close()
    try:
        subprocess.run(
            ["ffmpeg", "-y", "-i", str(video_path), "-vn",
             "-ac", "1", "-ar", "16000", "-c:a", "pcm_s16le", audio.name],
            check=True, capture_output=True
        )
    except BaseException:
        # The temp file was created with delete=False; remove it on failure
        # so a failed extraction does not leave an orphaned file behind.
        try:
            os.unlink(audio.name)
        except OSError:
            pass
        raise
    return audio.name
|
|
|
|
|
|
def detect_language_from_filename(filename_hint):
    """Guess a song's language from its file name using known artists and words.

    Scoring: +5 for every known artist name found as a substring, +1 for every
    keyword that appears as a whitespace-delimited word, +2 for
    language-specific letters. The best-scoring language is returned only if
    its score is at least 2.

    Args:
        filename_hint: File name (or title) to inspect; may be None or empty.

    Returns:
        An ISO 639-1 code ('sl', 'de', 'hr', 'en') or None when there is no
        hint or no language reaches the threshold.
    """
    if not filename_hint:
        return None

    name = filename_hint.lower()
    # File names commonly separate words with "_", "-", "." or "+", so a
    # multi-word artist such as "ben zucker" would never match
    # "ben_zucker.mp4". Artists are therefore matched against both the raw
    # name (keeps entries like "narodno-zabavna" working) and this copy.
    normalized = re.sub(r"[\s_.+-]+", " ", name).strip()

    # Slovenian artists (folk-pop, pop, rock)
    SLO_ARTISTS = [
        "avseniki", "avsenik", "modrijani", "veseli dolenjci",
        "čuki", "atomik harmonik", "alfi nipič", "helena blagne",
        "siddharta", "magnifico", "vlado kreslin", "zaklonišče prepeva",
        "perpetuum jazzile", "tabu", "natalija verboten", "klavdija",
        "iztok mlakar", "rok'n'band", "okrog cele zemlje", "ansambel",
        "miran rudan", "andrej šifrer", "mi2", "elvis jackson",
        "tanja žagar", "manca špik", "saša lendero", "rebeka dremelj",
        "nuša derenda", "alenka godec", "prifarski muzikanti",
        "nova generacija", "polka", "narodno-zabavna",
    ]
    SLO_KEYWORDS = ["pazi", "morju", "zveza", "domovina", "ljubim", "srce", "majhna",
                    "prav", "nazaj", "noč", "dom", "pomoč", "bolha", "preko"]

    # German artists (Schlager, Volksmusik)
    DE_ARTISTS = [
        "ben zucker", "andrea berg", "helene fischer", "andreas gabalier",
        "amigos", "kastelruther spatzen", "florian silbereisen", "voxxclub",
        "wolfgang petry", "mickie krause", "die toten hosen", "rammstein",
        "udo lindenberg", "die ärzte", "westernhagen", "peter maffay",
        "matthias reim", "die zillertaler", "die jungen zillertaler",
        "stefan mross", "marianne", "michael wendler", "vincent gross",
        "schlager", "volksmusik",
    ]
    DE_KEYWORDS = ["liebe", "herz", "ohne", "dich", "leben", "nacht", "tag",
                   "schön", "mädchen", "sonne", "himmel", "wenn", "nur",
                   "bist", "hast", "dass", "weiß", "kann", "auch"]

    # Croatian/Serbian artists
    HR_ARTISTS = [
        "thompson", "miroslav škoro", "oliver dragojević", "gibonni",
        "severina", "tony cetinski", "psihomodo pop", "prljavo kazalište",
        "parni valjak", "lepa brena", "ceca", "aca lukas", "mile kitić",
        "halid bešlić", "dino merlin", "zdravko čolić", "magazin",
    ]
    HR_KEYWORDS = ["volim", "ljubav", "srce", "danas", "noćas", "more",
                   "majka", "domovina", "zauvijek", "samo", "ćemo"]

    # English: too many artists to list, so only generic title words are used.
    EN_KEYWORDS = ["love", "song", "feat", "remix", "official", "music", "video",
                   "remastered", "lyrics", "by", "with", "tonight", "forever",
                   "heart", "dance", "party", "summer"]

    score = {"sl": 0, "de": 0, "hr": 0, "en": 0, "it": 0, "es": 0, "fr": 0}

    # Artist matches (heavier weight); each artist counts at most once.
    for code, artists in (("sl", SLO_ARTISTS), ("de", DE_ARTISTS), ("hr", HR_ARTISTS)):
        for artist in artists:
            if artist in name or artist in normalized:
                score[code] += 5

    # Keyword matches: whole whitespace-delimited words of the raw name only.
    # (Deliberately NOT widened to the normalised name: generic title words
    # like "official"/"video" would then outvote the real language.)
    words_in_name = set(name.split())
    for code, keywords in (("sl", SLO_KEYWORDS), ("de", DE_KEYWORDS),
                           ("hr", HR_KEYWORDS), ("en", EN_KEYWORDS)):
        for kw in keywords:
            if kw in words_in_name:
                score[code] += 1

    # Slovenian alphabet (č, ž, š) without đ (which is Croatian)
    if any(c in name for c in "čžš") and "đ" not in name:
        score["sl"] += 2
    # German alphabet (ä ö ü ß)
    if any(c in name for c in "äöüß"):
        score["de"] += 2
    # Croatian alphabet (đ)
    if "đ" in name:
        score["hr"] += 2

    if not any(score.values()):
        return None

    # On a tie, max() keeps the first language in `score` insertion order.
    best = max(score.items(), key=lambda x: x[1])
    if best[1] >= 2:  # threshold
        return best[0]
    return None
|
|
|
|
|
|
def transcribe_with_elevenlabs(audio_path, lang=None, model="scribe_v1", filename_hint=None):
    """Transcribe an audio file with the ElevenLabs Scribe speech-to-text API.

    The file is uploaded as multipart/form-data with word-level timestamps and
    audio-event tagging enabled. Scribe's flat word list is then grouped into
    phrase-like segments, and segments that look like hallucinations (longer
    than 15 s with fewer than 5 words) are counted and excluded from coverage.

    Args:
        audio_path: Path of the audio file to upload.
        lang: ISO 639-1 code ('de', 'sl', 'hr', ...). When falsy, the language
            is guessed from ``filename_hint``; if that yields nothing, no
            ``language_code`` field is sent.
        model: Value sent as ``model_id``.
        filename_hint: File name used only for the language guess.

    Returns:
        None when ELEVENLABS_API_KEY is unset, the file is larger than 24 MB,
        or the request fails. Otherwise a dict with ``language``,
        ``language_probability``, ``segments`` (dicts with ``start``, ``end``,
        ``text``, ``words``), ``_provider``, ``_hallucination_count`` and
        ``_coverage_pct``.

    Raises:
        OSError: If ``audio_path`` cannot be stat'ed or read.

    Author's note carried over from the original docstring: pricing is about
    $0.40/h (~$0.022 per 200 s song).
    """
    import urllib.request
    import urllib.error
    import uuid

    api_key = os.environ.get("ELEVENLABS_API_KEY")
    if not api_key:
        print(" ⚠️ ELEVENLABS_API_KEY ni nastavljen", file=sys.stderr)
        return None

    # Guess the language from the file name unless the user chose one.
    if not lang and filename_hint:
        guessed = detect_language_from_filename(filename_hint)
        if guessed:
            lang = guessed
            print(f" 🔍 Lang iz filename '{filename_hint}': {lang}", file=sys.stderr)

    # ISO 639-1 → 639-3 mapping (Scribe uses 639-3)
    LANG_1_TO_3 = {
        "en": "eng", "de": "deu", "sl": "slv", "hr": "hrv", "bs": "bos",
        "sr": "srp", "it": "ita", "es": "spa", "fr": "fra", "pt": "por",
        "ru": "rus", "pl": "pol", "cs": "ces", "sk": "slk", "hu": "hun",
        "ro": "ron", "nl": "nld", "sv": "swe", "no": "nor", "da": "dan",
        "fi": "fin", "tr": "tur", "ar": "ara", "uk": "ukr", "bg": "bul",
        "el": "ell", "he": "heb", "ja": "jpn", "ko": "kor", "zh": "zho",
    }
    LANG_3_TO_1 = {v: k for k, v in LANG_1_TO_3.items()}

    # Original note: Scribe supports up to ~25 MB / 4.5h per request.
    # Check the size on disk first so an oversized file is never loaded
    # into memory just to be rejected.
    audio_size = os.path.getsize(audio_path)
    if audio_size > 24 * 1024 * 1024:
        print(f" ⚠️ Audio {audio_size/1024/1024:.1f} MB > 24 MB limit, fallback", file=sys.stderr)
        return None

    with open(audio_path, "rb") as f:
        audio_content = f.read()

    # Multipart upload
    boundary = uuid.uuid4().hex
    parts = []

    def add_text(name, value):
        parts.append(
            f"--{boundary}\r\nContent-Disposition: form-data; "
            f"name=\"{name}\"\r\n\r\n{value}\r\n".encode()
        )

    def add_file(name, filename, content, ctype):
        parts.append(
            f"--{boundary}\r\nContent-Disposition: form-data; "
            f"name=\"{name}\"; filename=\"{filename}\"\r\n"
            f"Content-Type: {ctype}\r\n\r\n".encode() + content + b"\r\n"
        )

    add_text("model_id", model)
    add_text("timestamps_granularity", "word")
    # Author's note: tag_audio_events=true is critical. Without it Scribe
    # stops transcribing early once it detects an instrumental part (e.g. a
    # polka accordion takes over). With it, tags like "(glasba)" are inserted
    # and transcription continues to the end; the tags are filtered out below.
    add_text("tag_audio_events", "true")
    if lang:
        scribe_lang = LANG_1_TO_3.get(lang, lang)
        add_text("language_code", scribe_lang)
    # NOTE(review): the upload is always labelled audio.mp3 / audio/mpeg,
    # whatever the real container is (extract_audio in this file writes WAV)
    # — presumably the API sniffs the content; confirm.
    add_file("file", "audio.mp3", audio_content, "audio/mpeg")
    parts.append(f"--{boundary}--\r\n".encode())
    body = b"".join(parts)

    print(f" 📡 ElevenLabs Scribe ({model}, {len(audio_content)/1024/1024:.1f} MB, "
          f"lang={lang or 'auto'})...", file=sys.stderr)

    req = urllib.request.Request(
        "https://api.elevenlabs.io/v1/speech-to-text",
        data=body,
        headers={
            "xi-api-key": api_key,
            "Content-Type": f"multipart/form-data; boundary={boundary}",
        },
    )

    try:
        with urllib.request.urlopen(req, timeout=300) as resp:
            data = json.loads(resp.read().decode())
    except urllib.error.HTTPError as e:
        body_err = e.read().decode("utf-8", errors="replace")[:500]
        print(f" ❌ Scribe HTTP {e.code}: {body_err}", file=sys.stderr)
        return None
    except Exception as e:
        print(f" ❌ Scribe exception: {e}", file=sys.stderr)
        return None

    # Convert the response to our standard format. `or` / `is None` guards
    # keep explicit JSON nulls from crashing the slicing and float() below.
    detected_lang_3 = data.get("language_code") or "unknown"
    detected_lang_1 = LANG_3_TO_1.get(detected_lang_3, detected_lang_3[:2])
    detected_prob = data.get("language_probability")
    if detected_prob is None:
        detected_prob = 1.0

    # Scribe returns a flat list of words (not segments). Group them into
    # pseudo-segments with phrase-aware rules:
    #   - close on a long pause (>= 0.4s) — natural breath/phrase boundary
    #   - OR after sentence-ending punctuation (. ! ?) followed by >= 0.2s
    #   - OR once the segment is >= 4s and there is a pause >= 0.15s
    #   - OR at a hard cap of 5.5s
    words = data.get("words") or []
    segments = []

    # Keep only real words: drop non-"word" tokens (spacing, audio events),
    # empty text, and parenthesised event tags like "(glasba)" / "(music)".
    real_words = []
    for w in words:
        t = (w.get("text") or "").strip()
        if w.get("type", "word") != "word" or not t:
            continue
        if t.startswith("(") and t.endswith(")"):
            continue
        real_words.append(w)

    if real_words:
        pending = []
        seg_start = real_words[0].get("start", 0)
        last_index = len(real_words) - 1

        for i, w in enumerate(real_words):
            pending.append(w)
            w_end = w.get("end", w.get("start", 0))
            w_text = w.get("text", "")

            if i == last_index:
                close = True  # last word always closes the open segment
            else:
                next_start = real_words[i + 1].get("start", w_end)
                pause = next_start - w_end
                seg_duration = w_end - seg_start
                close = (
                    pause >= 0.4
                    or (seg_duration >= 4.0 and pause >= 0.15)
                    or (w_text.rstrip().endswith(('.', '!', '?')) and pause >= 0.2)
                    or seg_duration >= 5.5  # hard cap
                )

            if not close:
                continue

            seg_text = " ".join(ww.get("text", "") for ww in pending).strip()
            if seg_text:
                segments.append({
                    "start": seg_start,
                    "end": w_end,
                    "text": seg_text,
                    "words": [
                        {
                            "start": ww.get("start", 0),
                            "end": ww.get("end", 0),
                            "text": ww.get("text", ""),
                        }
                        for ww in pending
                    ],
                })
            # Reset for the next segment
            pending = []
            if i < last_index:
                seg_start = real_words[i + 1].get("start", 0)

    # ── HALLUCINATION DETECTION ──
    # Author's note: Scribe occasionally returns a single long segment with
    # 1-2 words (10-100s for one word); that is a hallucination on
    # instrumentals.
    hallucination_segs = []
    # This is the end of the last segment, not the real audio length.
    total_audio_duration = max((s["end"] for s in segments), default=0)
    coverage = 0
    for s in segments:
        seg_dur = s["end"] - s["start"]
        word_count = len(s.get("words", []))
        if seg_dur > 15 and word_count < 5:
            hallucination_segs.append(s)
        else:
            coverage += seg_dur

    coverage_pct = coverage / total_audio_duration * 100 if total_audio_duration else 0

    if hallucination_segs:
        print(f" ⚠️ Halucinacija(e) zaznana(e): {len(hallucination_segs)} segment(ov) "
              f"daljših od 15s z manj kot 5 besedami:", file=sys.stderr)
        for h in hallucination_segs:
            print(f" [{h['start']:.1f}-{h['end']:.1f}s] = {h['end']-h['start']:.0f}s "
                  f"({len(h.get('words', []))} bes.) text={h.get('text', '')[:50]!r}", file=sys.stderr)
        print(f" 📊 Pravo pokritje: {coverage:.1f}s / {total_audio_duration:.1f}s "
              f"= {coverage_pct:.0f}%", file=sys.stderr)

    print(f" ✅ Scribe: {len(words)} words → {len(segments)} segments, "
          f"lang={detected_lang_1} (p={detected_prob:.2f})", file=sys.stderr)

    return {
        "language": detected_lang_1,
        "language_probability": float(detected_prob),
        "segments": segments,
        "_provider": "elevenlabs",
        "_hallucination_count": len(hallucination_segs),
        "_coverage_pct": coverage_pct,
    }
|
|
|
|
|
|
def transcribe_with_gemini(audio_path, lang=None, filename_hint=None):
    """Transcribe a song with Gemini (model ``gemini-3-pro-preview``).

    Steps: upload the audio through the Files API resumable protocol, ask the
    model for a JSON transcript with word-level timestamps, parse the JSON
    (stripping markdown fences and trailing commas if present) and compute
    the same hallucination/coverage statistics as the other providers.

    Args:
        audio_path: Path of the audio file to upload.
        lang: Optional language code added to the prompt as a hint.
        filename_hint: Optional file name added to the prompt as a hint.

    Returns:
        None when GEMINI_API_KEY is unset or any step fails. Otherwise a dict
        with ``language``, ``language_probability`` (fixed 0.95),
        ``segments`` (as returned by the model), ``_provider``,
        ``_hallucination_count`` and ``_coverage_pct``.

    Author's notes carried over from the original docstring: intended as a
    fallback for folk-pop songs where Scribe hallucinated; better lyrics for
    Slovenian, Croatian and other smaller languages and no hallucination on
    instrumental sections, but slow (~100s per 2 min of audio), more
    expensive ($0.20 vs $0.013) and timestamps are sometimes off by 1-2s.
    """
    import urllib.request
    import urllib.error

    api_key = os.environ.get("GEMINI_API_KEY")
    if not api_key:
        print(" ❌ Gemini fallback: GEMINI_API_KEY missing", file=sys.stderr)
        return None

    print(f"🧠 Gemini 3 Pro transcribing {audio_path}...", file=sys.stderr)
    audio_size_mb = os.path.getsize(audio_path) / 1024 / 1024
    print(f" 📦 Audio size: {audio_size_mb:.1f} MB", file=sys.stderr)

    try:
        # 1. Upload the audio through the Files API (resumable protocol)
        upload_url_base = "https://generativelanguage.googleapis.com/upload/v1beta/files"
        with open(audio_path, 'rb') as f:
            audio_bytes = f.read()

        # Step 1: start the upload session
        headers_start = {
            'X-Goog-Upload-Protocol': 'resumable',
            'X-Goog-Upload-Command': 'start',
            'X-Goog-Upload-Header-Content-Length': str(len(audio_bytes)),
            'X-Goog-Upload-Header-Content-Type': 'audio/mp3',
            'Content-Type': 'application/json',
        }
        req_start = urllib.request.Request(
            f"{upload_url_base}?key={api_key}",
            data=json.dumps({"file": {"display_name": "reels_audio"}}).encode(),
            headers=headers_start, method='POST'
        )
        with urllib.request.urlopen(req_start, timeout=30) as resp:
            upload_url = resp.headers.get('X-Goog-Upload-URL')
        if not upload_url:
            # Fail with a clear message instead of an obscure error from
            # building a request with url=None.
            print(" ❌ Gemini upload: missing X-Goog-Upload-URL header", file=sys.stderr)
            return None

        # Step 2: upload the bytes and finalize in one request
        headers_upload = {
            'Content-Length': str(len(audio_bytes)),
            'X-Goog-Upload-Offset': '0',
            'X-Goog-Upload-Command': 'upload, finalize',
        }
        req_upload = urllib.request.Request(
            upload_url, data=audio_bytes,
            headers=headers_upload, method='POST'
        )
        with urllib.request.urlopen(req_upload, timeout=120) as resp:
            file_info = json.loads(resp.read().decode())
        file_uri = file_info['file']['uri']

        print(" ✓ Uploaded to Gemini Files API", file=sys.stderr)
        # Short delay to let the uploaded file get processed
        time.sleep(2)

        # 2. Generate the transcript
        gen_url = (f"https://generativelanguage.googleapis.com/v1beta/"
                   f"models/gemini-3-pro-preview:generateContent?key={api_key}")

        lang_hint = ""
        if filename_hint:
            lang_hint = f"\nFilename hint: {filename_hint}"
        if lang:
            lang_hint += f"\nLanguage: {lang}"

        prompt = f"""Transcribe this song with precise word-level timestamps.{lang_hint}

Return ONLY valid JSON in this EXACT format (no markdown fences, no explanation):
{{
"language": "sl",
"segments": [
{{
"start": 0.5,
"end": 4.2,
"text": "Besedilo segmenta",
"words": [
{{"start": 0.5, "end": 0.9, "text": "Besedilo"}},
{{"start": 1.0, "end": 1.4, "text": "segmenta"}}
]
}}
]
}}

Rules:
- Only transcribe vocal singing, NOT instrumental sections
- Each segment is a complete musical phrase (typically 2-4 seconds)
- Include word-level timestamps for EVERY word
- Use proper orthography (š, č, ž for Slavic; ä, ö, ü for German etc.)
- Skip instrumental breaks (don't fill with silence segments)
- Be very accurate with timestamps - this is for video subtitle generation
- DO NOT hallucinate words during instrumental sections
- DO NOT include trailing commas in JSON

Output ONLY the JSON object."""

        payload = {
            "contents": [{
                "parts": [
                    {"text": prompt},
                    {"file_data": {"mime_type": "audio/mp3", "file_uri": file_uri}}
                ]
            }],
            "generationConfig": {
                "temperature": 0.0,
                "maxOutputTokens": 32000,
            }
        }

        req_gen = urllib.request.Request(
            gen_url,
            data=json.dumps(payload).encode(),
            headers={'Content-Type': 'application/json'},
            method='POST'
        )

        t0 = time.time()
        with urllib.request.urlopen(req_gen, timeout=300) as resp:
            result = json.loads(resp.read().decode())
        elapsed = time.time() - t0

        usage = result.get('usageMetadata', {})
        print(f" ✓ Gemini 3 Pro response v {elapsed:.0f}s "
              f"(in: {usage.get('promptTokenCount', 0)}, "
              f"out: {usage.get('candidatesTokenCount', 0)}, "
              f"thoughts: {usage.get('thoughtsTokenCount', 0)})", file=sys.stderr)

        # 3. Parse the JSON output (a missing candidate/part raises and is
        # handled by the generic handler at the bottom).
        candidate_text = result['candidates'][0]['content']['parts'][0]['text'].strip()

        # Strip markdown code fences if present: ```json\n...\n```
        if candidate_text.startswith('```'):
            lines = candidate_text.split('\n')
            if lines[0].startswith('```'):
                lines = lines[1:]
            if lines and lines[-1].rstrip() == '```':
                lines = lines[:-1]
            candidate_text = '\n'.join(lines)

        # Parse, retrying once with trailing commas removed.
        parsed = None
        try:
            parsed = json.loads(candidate_text)
        except json.JSONDecodeError:
            cleaned = re.sub(r',(\s*[}\]])', r'\1', candidate_text)
            try:
                parsed = json.loads(cleaned)
                print(" ✓ Fixed trailing commas in Gemini JSON", file=sys.stderr)
            except json.JSONDecodeError as e2:
                print(f" ❌ Gemini JSON parse failed: {e2}", file=sys.stderr)
                print(f" First 500 chars: {candidate_text[:500]}", file=sys.stderr)
                return None

        # A non-object result (e.g. a bare list) has no segments either.
        if not isinstance(parsed, dict) or not parsed.get('segments'):
            print(" ❌ Gemini returned no segments", file=sys.stderr)
            return None

        segments = parsed['segments']
        detected_lang = parsed.get('language', lang or 'unknown')

        # Coverage stats: segments longer than 15s with fewer than 5 words
        # count as hallucinations and are excluded from coverage.
        hallucination_count = 0
        coverage = 0
        total_dur = max((s.get('end', 0) for s in segments), default=0)
        for s in segments:
            seg_dur = s.get('end', 0) - s.get('start', 0)
            word_count = len(s.get('words', []))
            if seg_dur > 15 and word_count < 5:
                hallucination_count += 1
            else:
                coverage += seg_dur
        coverage_pct = (coverage / total_dur * 100) if total_dur else 0

        total_words = sum(len(s.get('words', [])) for s in segments)
        print(f" ✅ Gemini 3 Pro: {total_words} words → {len(segments)} segments, "
              f"lang={detected_lang}, coverage={coverage_pct:.0f}%", file=sys.stderr)

        return {
            "language": detected_lang,
            "language_probability": 0.95,
            "segments": segments,
            "_provider": "gemini-3-pro",
            "_hallucination_count": hallucination_count,
            "_coverage_pct": coverage_pct,
        }

    except urllib.error.HTTPError as e:
        # errors="replace": a non-UTF-8 error body must not raise from inside
        # this handler and escape the "return None on failure" contract.
        err_body = e.read().decode("utf-8", errors="replace")[:500] if hasattr(e, 'read') else ''
        print(f" ❌ Gemini HTTP {e.code}: {err_body}", file=sys.stderr)
        return None
    except Exception as e:
        print(f" ❌ Gemini fallback exception: {e}", file=sys.stderr)
        import traceback
        traceback.print_exc(file=sys.stderr)
        return None
|
|
|
|
|
|
def transcribe_with_soniox(audio_path, lang=None, filename_hint=None):
    """Transcribe audio with the Soniox ``stt-async-v4`` model.

    Steps: upload the file, create a transcription job, poll it every 2s
    (giving up after 180s), fetch the token list, merge sub-word tokens into
    words and group words into segments. The uploaded file and the
    transcription job are deleted afterwards on a best-effort basis.

    Args:
        audio_path: Path of the audio file to upload.
        lang: Optional language code; sent as the only language hint.
        filename_hint: File name; when ``lang`` is falsy and it contains one
            of a few Slovenian keywords, the hint is ["sl"], otherwise a
            fixed multilingual hint list is sent.

    Returns:
        None when SONIOX_API_KEY is unset, the job errors or times out, the
        transcript is empty, or any request fails. Otherwise a dict with
        ``language``, ``language_probability`` (fixed 0.95), ``segments``
        (dicts with ``start``, ``end``, ``text``, ``words``), ``_provider``,
        ``_hallucination_count`` (always 0) and ``_coverage_pct``.

    Author's notes carried over from the original docstring: best accuracy
    for 60+ languages including Slovenian, works well for folk-pop music,
    word-level timestamps with punctuation and diacritics, $0.10/h, and
    4-13s for 180s of audio.
    """
    import urllib.request
    import urllib.error

    api_key = os.environ.get("SONIOX_API_KEY")
    if not api_key:
        print(" ❌ SONIOX_API_KEY missing", file=sys.stderr)
        return None

    BASE = "https://api.soniox.com"
    print(f"🎤 Soniox stt-async-v4 transcribing {audio_path}...", file=sys.stderr)

    # Remembered so the finally block can delete whatever was created.
    file_id = None
    trans_id = None

    def api_call(method, path, **kwargs):
        """Send an authorized request; return the parsed JSON body ({} if empty)."""
        headers = kwargs.pop('headers', {})
        headers['Authorization'] = f'Bearer {api_key}'
        data = kwargs.get('data')
        if isinstance(data, dict):
            data = json.dumps(data).encode()
            headers['Content-Type'] = 'application/json'
        req = urllib.request.Request(f"{BASE}{path}", data=data, headers=headers, method=method)
        with urllib.request.urlopen(req, timeout=120) as resp:
            content = resp.read().decode()
        return json.loads(content) if content else {}

    def as_word_entry(w):
        """Public word dict: timings plus whitespace-stripped text."""
        return {'start': w['start'], 'end': w['end'], 'text': w['text'].strip()}

    def open_segment(w):
        """Start a new segment that contains only word ``w``."""
        entry = as_word_entry(w)
        return {'start': entry['start'], 'end': entry['end'],
                'text': entry['text'], 'words': [entry]}

    try:
        # 1. Upload the file (multipart)
        boundary = "----WebKitFormBoundary7MA4YWxkTrZu0gW"
        with open(audio_path, 'rb') as f:
            audio_bytes = f.read()
        body = b''.join([
            f"--{boundary}\r\n".encode(),
            b'Content-Disposition: form-data; name="file"; filename="audio.mp3"\r\n',
            b'Content-Type: audio/mpeg\r\n\r\n',
            audio_bytes,
            f"\r\n--{boundary}--\r\n".encode()
        ])
        req = urllib.request.Request(
            f"{BASE}/v1/files",
            data=body,
            headers={
                'Authorization': f'Bearer {api_key}',
                'Content-Type': f'multipart/form-data; boundary={boundary}',
            },
            method='POST'
        )
        with urllib.request.urlopen(req, timeout=120) as resp:
            file_data = json.loads(resp.read().decode())
        file_id = file_data['id']
        size_mb = len(audio_bytes) / 1024 / 1024
        print(f" ✓ Uploaded {size_mb:.1f}MB → file_id={file_id}", file=sys.stderr)

        # 2. Create the transcription
        config = {
            "model": "stt-async-v4",
            "file_id": file_id,
            "enable_language_identification": True,
        }
        # Language hints: explicit parameter first, then file-name keywords.
        if lang:
            config["language_hints"] = [lang]
        else:
            fn_lower = (filename_hint or "").lower()
            if any(k in fn_lower for k in ["ansambel", "avsenik", "fehtar", "modrijan", "polka", "valček", "slovensk"]):
                config["language_hints"] = ["sl"]
            else:
                # Multilingual default
                config["language_hints"] = ["en", "sl", "de", "hr", "es", "fr", "it"]

        trans_data = api_call("POST", "/v1/transcriptions", data=config)
        trans_id = trans_data['id']
        print(f" ✓ Transcription started: {trans_id}", file=sys.stderr)

        # 3. Poll the job status
        t0 = time.time()
        while True:
            status_data = api_call("GET", f"/v1/transcriptions/{trans_id}")
            status = status_data.get('status', 'unknown')
            elapsed = time.time() - t0
            if status == "completed":
                print(f" ✓ Completed in {elapsed:.0f}s", file=sys.stderr)
                break
            if status == "error":
                print(f" ❌ Soniox error: {status_data.get('error_message', '?')}", file=sys.stderr)
                return None
            if elapsed > 180:
                print(" ⚠️ Timeout (180s)", file=sys.stderr)
                return None
            time.sleep(2)

        # 4. Fetch the transcript
        transcript_data = api_call("GET", f"/v1/transcriptions/{trans_id}/transcript")

        # Convert the Soniox format to our standard format (segments + words)
        tokens = transcript_data.get('tokens', [])
        if not tokens:
            print(" ❌ Empty transcript", file=sys.stderr)
            return None

        # Merge sub-word tokens into words ("Del" + " neb" + "a" = "Del neba").
        # A token starts a new word when its text begins with a space; the
        # markers <end>/<fin> close the current word without starting one.
        words = []
        current_word = None
        for tok in tokens:
            text = tok.get('text', '')
            start_s = tok.get('start_ms', 0) / 1000
            end_s = tok.get('end_ms', 0) / 1000
            is_marker = text in ('<end>', '<fin>')
            if is_marker or text.startswith(' '):
                if current_word and current_word['text'].strip():
                    words.append(current_word)
                current_word = None if is_marker else {'text': text, 'start': start_s, 'end': end_s}
            elif current_word is None:
                current_word = {'text': text, 'start': start_s, 'end': end_s}
            else:
                # Continuation of the current word
                current_word['text'] += text
                current_word['end'] = end_s
        if current_word and current_word['text'].strip():
            words.append(current_word)

        # Group words into segments: split on a pause > 0.6s, but only once
        # the running segment already holds at least 3 words.
        segments = []
        if words:
            current_seg = open_segment(words[0])
            for w in words[1:]:
                gap = w['start'] - current_seg['end']
                if gap > 0.6 and len(current_seg['words']) >= 3:
                    segments.append(current_seg)
                    current_seg = open_segment(w)
                else:
                    entry = as_word_entry(w)
                    current_seg['end'] = entry['end']
                    current_seg['text'] = (current_seg['text'] + ' ' + entry['text']).strip()
                    current_seg['words'].append(entry)
            segments.append(current_seg)

        # Detected language: the most frequent per-token language, falling
        # back to the requested language (or 'sl') when tokens carry none.
        detected_lang = lang or 'sl'
        lang_counts = {}
        for tok in tokens:
            tl = tok.get('language')
            if tl:
                lang_counts[tl] = lang_counts.get(tl, 0) + 1
        if lang_counts:
            detected_lang = max(lang_counts, key=lang_counts.get)

        # Coverage stats (same keys as the other providers)
        total_dur = max((s['end'] for s in segments), default=0)
        coverage = sum(s['end'] - s['start'] for s in segments)
        coverage_pct = (coverage / total_dur * 100) if total_dur else 0

        total_words = sum(len(s.get('words', [])) for s in segments)
        # `or ''`: a null "text" must not turn a good transcript into a
        # failure on this purely informational log line.
        full_text = transcript_data.get('text') or ''
        print(f" ✅ Soniox: {total_words} words → {len(segments)} segments, "
              f"lang={detected_lang}, coverage={coverage_pct:.0f}%", file=sys.stderr)
        print(f" 📝 First 200 chars: {full_text[:200]!r}", file=sys.stderr)

        return {
            "language": detected_lang,
            "language_probability": 0.95,
            "segments": segments,
            "_provider": "soniox",
            "_hallucination_count": 0,  # author's note: Soniox rarely hallucinates
            "_coverage_pct": coverage_pct,
        }

    except urllib.error.HTTPError as e:
        # errors="replace": a non-UTF-8 error body must not raise from inside
        # this handler and escape the "return None on failure" contract.
        err_body = e.read().decode("utf-8", errors="replace")[:500] if hasattr(e, 'read') else ''
        print(f" ❌ Soniox HTTP {e.code}: {err_body}", file=sys.stderr)
        return None
    except Exception as e:
        print(f" ❌ Soniox exception: {e}", file=sys.stderr)
        import traceback
        traceback.print_exc(file=sys.stderr)
        return None
    finally:
        # Best-effort cleanup: send DELETE and ignore the outcome (the
        # original notes that Soniox returns an empty body here).
        cleanup_paths = []
        if trans_id:
            cleanup_paths.append(f"/v1/transcriptions/{trans_id}")
        if file_id:
            cleanup_paths.append(f"/v1/files/{file_id}")
        for path in cleanup_paths:
            try:
                req = urllib.request.Request(f"{BASE}{path}",
                                             headers={'Authorization': f'Bearer {api_key}'}, method='DELETE')
                urllib.request.urlopen(req, timeout=10)
            except Exception:
                pass
|
|
|
|
|
|
def transcribe_full(audio_path, lang=None, model_size="small", provider="auto", filename_hint=None):
    """Dispatch speech-to-text to a provider, with quality-gated fallbacks.

    A provider result counts as "clean" when it has segments, its
    ``_coverage_pct`` (default 100) is at least 50 and its
    ``_hallucination_count`` (default 0) is zero.

    provider:
      - "soniox"      -> Soniox only
      - "elevenlabs"  -> ElevenLabs Scribe with one automatic retry
                         (falls through to local Whisper when the key is missing)
      - "gemini"      -> Gemini only
      - "local"       -> faster-whisper on CPU (also the final fall-through)
      - "auto" / "hybrid" -> resolved from the API keys found in the environment:
            SONIOX_API_KEY                      -> Soniox -> Scribe -> Gemini chain
            ELEVENLABS_API_KEY + GEMINI_API_KEY -> legacy hybrid (Scribe, then Gemini)
            ELEVENLABS_API_KEY only             -> "elevenlabs"
            otherwise                           -> "local"

    Returns a transcript dict with at least "language", "language_probability"
    and "segments"; when nothing usable was produced, "language" is "unknown"
    and "segments" is empty.
    """
    has_soniox = bool(os.environ.get("SONIOX_API_KEY"))
    has_scribe = bool(os.environ.get("ELEVENLABS_API_KEY"))
    has_gemini = bool(os.environ.get("GEMINI_API_KEY"))

    def empty_transcript():
        # Fresh dict on every call so callers may mutate the result freely.
        return {"language": "unknown", "language_probability": 0.0, "segments": []}

    def has_segments(res):
        return bool(res and res.get("segments"))

    # Resolve "auto"/"hybrid" into a concrete strategy based on available keys.
    if provider in ("auto", "hybrid"):
        if has_soniox:
            provider = "soniox_chain"  # Soniox primary + fallbacks
        elif has_scribe and has_gemini:
            provider = "hybrid"  # legacy hybrid
        elif has_scribe:
            provider = "elevenlabs"
        else:
            provider = "local"

    # --- SONIOX CHAIN: Soniox primary, Scribe/Gemini fallback ---
    if provider == "soniox_chain":
        print("🎯 Provider chain: Soniox → Scribe → Gemini", file=sys.stderr)
        result = transcribe_with_soniox(audio_path, lang=lang, filename_hint=filename_hint)

        if has_segments(result):
            cov = result.get("_coverage_pct", 100)
            hall = result.get("_hallucination_count", 0)
            if cov >= 50 and hall == 0:
                return result
            print(f" ⚠️ Soniox sumljiv (coverage {cov:.0f}%, hall {hall}) — try fallback", file=sys.stderr)
        else:
            print(" ❌ Soniox failed → fallback", file=sys.stderr)

        # Fallback 1: Scribe
        if has_scribe:
            print(" 🔄 Fallback to Scribe...", file=sys.stderr)
            result2 = transcribe_with_elevenlabs(audio_path, lang=lang, filename_hint=filename_hint)
            if has_segments(result2):
                cov = result2.get("_coverage_pct", 100)
                hall = result2.get("_hallucination_count", 0)
                if cov >= 50 and hall == 0:
                    return result2
                # Keep the Scribe attempt as the last-resort answer only when
                # Soniox produced no segments at all. A truthy but segment-less
                # Soniox dict must not shadow a transcript that has content.
                if not has_segments(result):
                    result = result2

        # Fallback 2: Gemini (only when everything so far was poor)
        if has_gemini:
            print(" 🔄 Fallback to Gemini 3 Pro (last resort)...", file=sys.stderr)
            result3 = transcribe_with_gemini(audio_path, lang=lang, filename_hint=filename_hint)
            if has_segments(result3):
                return result3

        # Return whatever we have.
        return result or empty_transcript()

    # --- SONIOX ONLY ---
    if provider == "soniox":
        if not has_soniox:
            # Mirror the gemini branch: say why nothing was transcribed.
            print(" ❌ provider=soniox ampak SONIOX_API_KEY missing", file=sys.stderr)
            return empty_transcript()
        result = transcribe_with_soniox(audio_path, lang=lang, filename_hint=filename_hint)
        return result or empty_transcript()

    # --- HYBRID (legacy): Scribe primary, Gemini fallback ---
    if provider == "hybrid" and not has_scribe:
        # Defensive: the resolution step above only yields "hybrid" when a
        # Scribe key exists, so this is not expected to trigger.
        provider = "gemini"
    elif provider == "hybrid":
        print("🎯 HYBRID mode: Scribe primary, Gemini fallback", file=sys.stderr)
        result = transcribe_with_elevenlabs(audio_path, lang=lang, filename_hint=filename_hint)

        if not has_segments(result):
            # Scribe failed completely -> go straight to Gemini.
            if has_gemini:
                print(" 🔄 Scribe failed → Gemini 3 Pro", file=sys.stderr)
                gemini_result = transcribe_with_gemini(audio_path, lang=lang, filename_hint=filename_hint)
                if has_segments(gemini_result):
                    return gemini_result
            # No usable fallback -> empty transcript.
            return empty_transcript()

        hall_count = result.get("_hallucination_count", 0)
        cov_pct = result.get("_coverage_pct", 100)

        # Quality gate: a clean Scribe result is returned as-is.
        if hall_count == 0 and cov_pct >= 50:
            print(f" ✅ Scribe OK (coverage {cov_pct:.0f}%) — no fallback needed",
                  file=sys.stderr)
            return result

        # Hallucination or low coverage -> retry Scribe once before Gemini.
        print(f" ⚠️ Scribe quality issues (coverage {cov_pct:.0f}%, "
              f"{hall_count} halu) — RETRY Scribe...", file=sys.stderr)
        result2 = transcribe_with_elevenlabs(audio_path, lang=lang, filename_hint=filename_hint)
        if has_segments(result2):
            h2 = result2.get("_hallucination_count", 0)
            c2 = result2.get("_coverage_pct", 100)
            if h2 == 0 and c2 >= 50:
                print(f" ✅ Scribe retry uspel: coverage {cov_pct:.0f}% → {c2:.0f}%",
                      file=sys.stderr)
                return result2
            # Still poor; keep the second run if it is better on either metric.
            if h2 < hall_count or c2 > cov_pct:
                result = result2
                hall_count = h2
                cov_pct = c2

        # Still hallucinating -> Gemini fallback.
        if has_gemini:
            print(f" 🔄 Scribe še vedno slab (coverage {cov_pct:.0f}%, "
                  f"{hall_count} halu) — switching na Gemini 3 Pro...", file=sys.stderr)
            gemini_result = transcribe_with_gemini(audio_path, lang=lang, filename_hint=filename_hint)
            if has_segments(gemini_result):
                g_cov = gemini_result.get("_coverage_pct", 100)
                g_hall = gemini_result.get("_hallucination_count", 0)
                # Take whichever is better.
                if g_hall < hall_count or g_cov > cov_pct:
                    print(f" ✅ Gemini boljši: coverage {cov_pct:.0f}% → {g_cov:.0f}%, "
                          f"hallu {hall_count} → {g_hall}", file=sys.stderr)
                    return gemini_result
                print(" ⚠️ Gemini ni boljši, ohrani Scribe", file=sys.stderr)
                return result
        else:
            print(" ⚠️ Gemini fallback ni dosegljiv — vrnem Scribe rezultat",
                  file=sys.stderr)

        return result

    # --- GEMINI ONLY ---
    if provider == "gemini":
        if not has_gemini:
            print(" ❌ provider=gemini ampak GEMINI_API_KEY missing", file=sys.stderr)
            return empty_transcript()
        result = transcribe_with_gemini(audio_path, lang=lang, filename_hint=filename_hint)
        if has_segments(result):
            return result
        return empty_transcript()

    # --- ELEVENLABS / SCRIBE ONLY (with one automatic retry) ---
    if provider == "elevenlabs" and has_scribe:
        result = transcribe_with_elevenlabs(audio_path, lang=lang, filename_hint=filename_hint)

        if has_segments(result):
            hall_count = result.get("_hallucination_count", 0)
            cov_pct = result.get("_coverage_pct", 100)
            if hall_count > 0 or cov_pct < 50:
                print(f" 🔄 Halucinacija/nizko pokritje ({cov_pct:.0f}%, "
                      f"{hall_count} hallucination segs) — RETRY Scribe...", file=sys.stderr)
                result2 = transcribe_with_elevenlabs(audio_path, lang=lang, filename_hint=filename_hint)
                if has_segments(result2):
                    h2 = result2.get("_hallucination_count", 0)
                    c2 = result2.get("_coverage_pct", 100)
                    if h2 < hall_count or c2 > cov_pct:
                        print(f" ✅ Retry boljši: pokritje {cov_pct:.0f}% → {c2:.0f}%",
                              file=sys.stderr)
                        result = result2
            return result
        return empty_transcript()

    # --- LOCAL faster-whisper ---
    return _transcribe_full_local(audio_path, lang=lang, model_size=model_size)
|
|
|
|
|
|
def _transcribe_full_local(audio_path, lang=None, model_size="small"):
    """Transcribe the whole audio file with a local faster-whisper model.

    When ``lang`` is not given, the language is chosen by voting over up to
    three 30-second samples (at roughly 15%, 45% and 75% of the audio) and
    then locked in for the full transcription pass.

    Returns a dict with "language", "language_probability" and "segments"
    (each segment has "start", "end", "text", "words"). If Whisper raises
    ValueError/RuntimeError, an empty transcript with language "unknown"
    is returned instead.
    """
    from faster_whisper import WhisperModel

    print(f"🧠 Whisper LOCAL {model_size}, lang={lang or 'auto'}", file=sys.stderr)
    m = WhisperModel(model_size, device="cpu", compute_type="int8")

    # Auto-detect with 3-sample voting so we lock onto a single language.
    if not lang:
        print(" 🔍 Robust lang detection (3 samples)...", file=sys.stderr)
        try:
            duration_proc = subprocess.run(
                ["ffprobe", "-v", "error", "-show_entries", "format=duration",
                 "-of", "default=nw=1:nokey=1", audio_path],
                capture_output=True, text=True
            )
            audio_duration = float(duration_proc.stdout.strip())
        except Exception:
            # ffprobe missing or unparsable output: assume a 3-minute track.
            audio_duration = 180.0

        lang_votes = {}
        for ss in [max(15, audio_duration * 0.15), audio_duration * 0.45, audio_duration * 0.75]:
            # Skip sample offsets that leave less than 5 s of audio.
            if ss + 5 > audio_duration:
                continue
            sample = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
            sample.close()
            try:
                subprocess.run(
                    ["ffmpeg", "-y", "-ss", str(ss), "-i", audio_path,
                     "-t", "30", "-vn", "-ac", "1", "-ar", "16000",
                     "-c:a", "pcm_s16le", sample.name],
                    check=True, capture_output=True
                )
                _, sample_info = m.transcribe(sample.name, language=None, vad_filter=False)
                sl, sp = sample_info.language, float(sample_info.language_probability)
                # Votes are weighted by the detection probability.
                lang_votes[sl] = lang_votes.get(sl, 0) + sp
                print(f" sample @ {ss:.0f}s: {sl} (p={sp:.2f})", file=sys.stderr)
            except Exception:
                # Best effort: one failed sample must not abort detection.
                print(f" sample @ {ss:.0f}s: failed", file=sys.stderr)
            finally:
                try:
                    os.unlink(sample.name)
                except OSError:
                    pass

        if lang_votes:
            lang = max(lang_votes.items(), key=lambda x: x[1])[0]
            print(f" ✅ Lang lock: {lang}", file=sys.stderr)

    try:
        segs, info = m.transcribe(
            audio_path,
            language=lang,
            word_timestamps=True,
            # The VAD filter sometimes drops vocals over music, so it stays off for songs.
            vad_filter=False,
            # Anti-hallucination settings
            condition_on_previous_text=False,
            temperature=0.0,
            compression_ratio_threshold=2.4,
            log_prob_threshold=-1.0,
            no_speech_threshold=0.6,
            # Beam search instead of greedy decoding (more reliable decode).
            beam_size=5,
            # Do not turn long silences into text.
            hallucination_silence_threshold=2.0,
        )
        detected_lang = info.language
        detected_prob = float(info.language_probability)
        print(f" Detekcija: {detected_lang} (p={detected_prob:.2f})", file=sys.stderr)

        # faster-whisper returns a lazy generator: decoding only runs while it
        # is iterated. Materialise it inside the try so decode-time failures
        # are handled by the except below instead of escaping.
        segments = [
            {
                "start": s.start,
                "end": s.end,
                "text": s.text.strip(),
                "words": [
                    {"start": w.start, "end": w.end, "text": w.word}
                    for w in (s.words or [])
                ],
            }
            for s in segs
        ]
    except (ValueError, RuntimeError) as e:
        # Whisper failure (e.g. on fully instrumental files).
        print(f" ⚠️ Whisper transcribe failed: {e}", file=sys.stderr)
        return {
            "language": "unknown",
            "language_probability": 0.0,
            "segments": [],
        }

    return {
        "language": detected_lang,
        "language_probability": detected_prob,
        "segments": segments,
    }
|
|
|
|
|
|
def _parse_rms_metadata(output, window_sec):
    """Extract (timestamp, rms_db) pairs from ffmpeg ``ametadata=print`` text."""
    energies = []
    current_pts = 0.0
    for raw_line in output.split("\n"):
        line = raw_line.strip()
        pts_match = re.search(r"pts_time:(\S+)", line)
        if pts_match:
            try:
                current_pts = float(pts_match.group(1))
            except ValueError:
                pass
            continue
        if "RMS_level=" not in line:
            continue
        try:
            rms = float(line.split("RMS_level=")[-1].strip())
        except ValueError:
            continue
        # Clamp -inf / very quiet values and NaN (NaN != NaN) to a -90 dB floor.
        if rms < -90 or rms != rms:
            rms = -90.0
        energies.append((current_pts, rms))
        # Advance in case the next value arrives without its own pts_time line.
        current_pts += window_sec
    return energies


def compute_energy_profile(audio_path, window_sec=1.0):
    """Return the RMS level (dB) per ``window_sec`` window as (timestamp, rms_db) pairs.

    Runs ffmpeg with ``asetnsamples`` -> ``astats`` -> ``ametadata=print`` and
    parses the printed ``lavfi.astats.Overall.RMS_level`` values. Values below
    -90 dB (including -inf) and NaN are clamped to -90.0.
    """
    # NOTE(review): the frame size assumes 16 kHz audio; for other sample rates
    # a frame is not exactly window_sec long. Confirm what callers pass in.
    frame_samples = int(16000 * window_sec)
    cmd = [
        "ffmpeg", "-i", audio_path,
        # astats `reset` counts FRAMES, not seconds: reset after every frame so
        # each printed value covers exactly one window for any window_sec.
        "-af", f"asetnsamples=n={frame_samples}:p=0,"
               "astats=metadata=1:reset=1,"
               "ametadata=print:key=lavfi.astats.Overall.RMS_level:file=-",
        "-f", "null", "-",
    ]
    result = subprocess.run(cmd, capture_output=True, text=True)
    output = result.stdout + "\n" + result.stderr
    return _parse_rms_metadata(output, window_sec)
|
|
|
|
|
|
def detect_vocal_sections(segments, max_gap=3.0):
    """Merge consecutive transcript segments into "vocal sections".

    A segment joins the current section when it starts no more than
    ``max_gap`` seconds after the section's end; otherwise it opens a new one.

    Returns a list of dicts with "start", "end", "segments" (the member
    segment dicts) and "text" (member texts joined by single spaces).
    An empty input yields an empty list.
    """
    if not segments:
        return []

    sections = []
    current = None
    for seg in segments:
        if current is not None and seg["start"] - current["end"] <= max_gap:
            # Never let the section end move backwards: a segment nested inside
            # a longer one would otherwise shrink the section and cause a
            # false split at the next segment.
            current["end"] = max(current["end"], seg["end"])
            current["segments"].append(seg)
            current["text"] += " " + seg["text"]
            continue
        if current is not None:
            sections.append(current)
        current = {
            "start": seg["start"],
            "end": seg["end"],
            "segments": [seg],
            "text": seg["text"],
        }
    sections.append(current)
    return sections
|
|
|
|
|
|
def avg_energy_in_range(energies, start, end):
    """Mean RMS of the samples whose timestamp lies in [start, end] (inclusive).

    ``energies`` is an iterable of (timestamp, rms_db) pairs. Returns -90.0
    when no sample falls inside the range.
    """
    total = 0
    count = 0
    for timestamp, rms in energies:
        if start <= timestamp <= end:
            total += rms
            count += 1
    if count == 0:
        return -90.0
    return total / count
|
|
|
|
|
|
def score_section_as_chorus(section, all_sections, energies, avg_rms):
    """Score a vocal section as a chorus candidate (higher = more chorus-like).

    The score is the sum of four parts:
      - repetition inside the section: (1 - unique-word ratio) * 30
      - energy: dB above ``avg_rms`` (never negative) * 8
      - recurrence: 25 per other section that shares at least 3 words and at
        least half of this section's distinct words
      - duration: 10 for 3-25 s, 5 for longer than 25 s, 2 otherwise

    Returns 0 when the section text contains no words.
    """
    text = section["text"].lower()
    words = re.findall(r"\b\w+\b", text)
    if not words:
        return 0

    # Computed once: reused for the unique ratio and every comparison below.
    word_set = set(words)

    # Chorus = low unique ratio (lots of repetition).
    unique_ratio = len(word_set) / len(words)
    chorus_signal = max(0, (1.0 - unique_ratio) * 30)

    # Energy relative to the track average.
    sec_energy = avg_energy_in_range(energies, section["start"], section["end"])
    energy_score = max(0, sec_energy - avg_rms) * 8

    # How many other sections carry similar lyrics.
    min_shared = len(word_set) * 0.5
    repeat_count = 0
    for other in all_sections:
        if other is section:
            continue
        other_words = set(re.findall(r"\b\w+\b", other["text"].lower()))
        shared = len(word_set & other_words)
        # At least half of the words in common -> probably the same chorus.
        if shared >= min_shared and shared >= 3:
            repeat_count += 1
    repeat_score = repeat_count * 25

    # Section duration in seconds.
    duration = section["end"] - section["start"]
    if 3 <= duration <= 25:
        length_score = 10
    elif duration > 25:
        length_score = 5
    else:
        length_score = 2

    return chorus_signal + energy_score + repeat_score + length_score
|
|
|
|
|
|
def find_chorus(transcript, energies, video_duration):
    """Rank the vocal sections of a transcript and pick the likeliest chorus.

    ``video_duration`` is accepted for interface compatibility; this function
    does not read it.

    Returns None when the transcript has no segments. Otherwise returns a dict:
      - "best": the highest-scoring candidate
      - "all_candidates": up to 10 candidates, best first
      - "avg_rms_total": mean RMS over ``energies`` (-30.0 when empty), rounded
    Each candidate has "start", "end", "duration", "text_preview" (first 80
    characters), "score" and "avg_rms".
    """
    sections = detect_vocal_sections(transcript["segments"])
    if not sections:
        return None

    avg_rms = sum(r for (_, r) in energies) / len(energies) if energies else -30.0

    candidates = [
        {
            "start": sec["start"],
            "end": sec["end"],
            "duration": sec["end"] - sec["start"],
            "text_preview": sec["text"][:80],
            "score": round(score_section_as_chorus(sec, sections, energies, avg_rms), 2),
            "avg_rms": round(avg_energy_in_range(energies, sec["start"], sec["end"]), 2),
        }
        for sec in sections
    ]

    # Highest score first. `sections` is non-empty here, so `candidates` is too.
    candidates.sort(key=lambda c: -c["score"])

    return {
        "best": candidates[0],
        "all_candidates": candidates[:10],
        "avg_rms_total": round(avg_rms, 2),
    }
|
|
|
|
|
|
def smart_clip_range(chorus, transcript, video_duration,
                     target_duration=30, max_duration=45, min_duration=20,
                     include_prebuild=False):
    """Return the clip range exactly as chosen upstream (``chorus["best"]``).

    The selection is neither extended nor shortened; it is only clamped to
    the video bounds. ``transcript``, ``max_duration`` and ``min_duration``
    are accepted for interface compatibility and are not read.

    Safety nets:
      - no chorus / no "best" entry -> a ``target_duration`` window centred on
        the middle of the video (reason "fallback_middle")
      - start >= end after clamping -> end becomes start + target_duration
        (capped at the video end)
      - start at or beyond the video end -> the last ``target_duration``
        seconds of the video are used

    Returns a dict with "start", "end", "duration" and "reason"; the
    non-fallback result also carries "chorus_start" and "chorus_end" (the
    unclamped input values, rounded).
    """
    if not chorus or not chorus.get("best"):
        # Fallback: the middle of the video.
        mid = video_duration / 2
        start = max(0, mid - target_duration / 2)
        end = min(video_duration, start + target_duration)
        return {
            "start": start,
            "end": end,
            # Same key as the regular result, so callers can read it unconditionally.
            "duration": round(end - start, 2),
            "reason": "fallback_middle",
        }

    best = chorus["best"]

    # Only a safety clamp: never outside the video.
    actual_start = max(0, best["start"])
    actual_end = min(video_duration, best["end"])

    # Reversed/empty range: rebuild the end from the start.
    if actual_start >= actual_end:
        actual_end = min(video_duration, actual_start + target_duration)

    # Still empty means the start lies at or past the end of the video;
    # fall back to the tail of the video so the clip has positive length.
    if actual_start >= actual_end:
        actual_end = video_duration
        actual_start = max(0, video_duration - target_duration)

    return {
        "start": round(actual_start, 2),
        "end": round(actual_end, 2),
        "duration": round(actual_end - actual_start, 2),
        "reason": "smart_chorus_with_prebuild" if include_prebuild else "smart_chorus_only",
        "chorus_start": round(best["start"], 2),
        "chorus_end": round(best["end"], 2),
    }
|
|
|
|
|
|
def detect_audio_fade(clip_range, transcript, video_duration=None):
    """Choose fade-in/fade-out lengths and, if needed, a later clip end.

    - Clip start inside a transcript segment -> 0.05 s fade-in (click
      prevention only, so the first word is not swallowed); otherwise 0.2 s.
    - Clip end inside a segment -> the end is moved to that segment's end plus
      a 0.5 s buffer (capped at ``video_duration`` when given) and the
      fade-out is 0.3 s; otherwise the end is unchanged and the fade-out is 0.4 s.

    Segment bounds are inclusive. When several segments contain the clip end,
    the last one in transcript order is used.

    Returns a dict with "fade_in", "fade_out", "extended_end" (rounded to 2
    decimals) and "ends_in_vocal".
    """
    clip_start = clip_range["start"]
    clip_end = clip_range["end"]
    all_segments = transcript["segments"]

    starts_in_vocal = any(
        seg["start"] <= clip_start <= seg["end"] for seg in all_segments
    )
    covering_end = [
        seg for seg in all_segments if seg["start"] <= clip_end <= seg["end"]
    ]
    end_segment = covering_end[-1] if covering_end else None
    ends_in_vocal = end_segment is not None

    extended_end = clip_end
    if ends_in_vocal:
        # Let the sung line finish, plus a short buffer.
        extended_end = end_segment["end"] + 0.5
        if video_duration is not None:
            extended_end = min(extended_end, video_duration)

    if starts_in_vocal:
        fade_in = 0.05
    else:
        fade_in = 0.2

    if ends_in_vocal:
        fade_out = 0.3
    else:
        fade_out = 0.4

    return {
        "fade_in": fade_in,
        "fade_out": fade_out,
        "extended_end": round(extended_end, 2),
        "ends_in_vocal": ends_in_vocal,
    }
|
|
|
|
|
|
def _build_analysis_prompt(transcript, video_duration, target_duration=30, filename_hint=None, include_prebuild=False):
    """Build the single analysis prompt sent to the LLM (Claude/Gemini).

    The prompt contains the timestamped transcript, an optional block driven
    by ``filename_hint`` (song identification + lyrics lookup instructions),
    hallucination-repair instructions, exactly one clip-selection section and
    the required JSON answer format.

    include_prebuild: when True the "chorus + pre-chorus" selection section is
        used; when False (default) the "chorus only" section is used.
    """
    transcript_text = "\n".join(
        f"[{seg['start']:6.1f}-{seg['end']:6.1f}] {seg['text'].strip()}"
        for seg in transcript["segments"]
    )

    hint_block = ""
    if filename_hint:
        hint_block = f"""

🎵 IME DATOTEKE: "{filename_hint}"

🚨 **PRVI KORAK — VEDNO PRED ANALIZO**:
Iz imena datoteke prepoznaj izvajalca + naslov pesmi. Potem **OBVEZNO uporabi web_search tool** da poiščeš pravo besedilo pesmi — TUDI ČE MISLIŠ DA POZNAŠ PESEM.

Razlog: večinoma ne poznaš celotnih besedil pesmi (predvsem ne-angleških). Brez pravega besedila NE MOREŠ:
- Pravilno prepoznati strukture (verz / pre-chorus / chorus / bridge)
- Vedeti kje refren **začne in konča** (vključno z outro frazami)
- Popraviti STT halucinacij

📋 **Search strategija** (univerzalna za vse jezike):
1. Prvo iskanje: `[izvajalec] [naslov] lyrics` ALI `[izvajalec] [naslov] besedilo/Songtext/letra/versuri`
2. Če ni rezultatov: `[del transkripta - 4-5 zaporednih besed] lyrics`
3. Trusted lyrics sajti po jezikih:
- 🇸🇮 SLO: besedila.com, lyricstranslate.com
- 🇩🇪 DE: songtexte.com, lyricstranslate.com
- 🇭🇷🇷🇸 HR/SR/BS: tekstovi.net, lyricstranslate.com
- 🇪🇸 ES: letras.com, musica.com
- 🇷🇴 RO: versuri.ro, lyricstranslate.com
- 🇮🇹 IT: angolotesti.it
- 🇫🇷 FR: paroles.net
- 🇬🇧🇺🇸 EN: genius.com, azlyrics.com
- **Univerzalno**: lyricstranslate.com (vsi jeziki)

Ko najdeš lyrics:
- Identificiraj kateri del je REFREN (ponavlja se)
- Identificiraj VERZE (zgodba, ne ponavlja se)
- Identificiraj BRIDGE / PRE-CHORUS / OUTRO če obstajajo
- Mapiraj transkript timestamp-e na strukturne dele
- Popravi corrected_segments z dejanskim besedilom

🎯 **POMEMBNA HEVRISTIKA: NASLOV PESMI = REFREN HOOK**:
Naslov pesmi je v 80-90% primerov **ključna fraza refrena** (hook).
- "Pijan" → refren vsebuje "pijan, pijan"
- "Brajde" → refren vsebuje "brajde" (ne pa pre-chorus o traktorju!)
- "Žena me tepe" → refren = "Žena me tepe"
- "Stisn se k men" → refren = "Stisni se k meni"
- "Cvetele so maline" → refren vsebuje "cvetele so maline" ali povezano

Če iz transkripta ne najdeš naslovne fraze blizu izbranega clipa, **VERJETNO si izbral verz/pre-chorus, ne refrena**. Poišči pravi refren.

⚠️ **PAZI**: prvi verz pesmi se pogosto začne **takoj po intro-u** (5-15s) in je kontekstualen — TO NI REFREN. Refren običajno pride **po prvem verzu** (pri 30-60s, odvisno od pesmi).
"""

    # Selection section for the default "chorus only" mode.
    chorus_only_section = '''4. **🎯 IZBIRA REFRENA — VSEBINSKO RAZMIŠLJANJE**

TI ODLOČAŠ na osnovi vsebine, ritma, energije pesmi.
Sistem ne bo razširil ne skrajšal tvoje izbire — kar vrneš, to se uporabi.

## CILJ: ~30 sekund (tipično za TikTok/Instagram Reel)

Dolžina je SVOBODNA glede na strukturo pesmi:
- 12-15s = en kratki refren (če je pesem zelo kratka)
- **20-35s = dva zaporedna refrena (NAJBOLJŠA opcija!)** — največkrat idealno
- 30-40s = refren + drug refren če sta vsebinsko povezana

## STRATEGIJA — kako razmišljati:

1. **Najdi PRVI nastop refrena** v pesmi
2. **Poglej KAJ SLEDI**:
- Če **takoj sledi DRUGI nastop ISTEGA refrena** (gap < 3s) → **vključi oba** = ~30s. ✅ TO JE NAJBOLJ POGOST PRIMER.
- Če sledi **drug refren** (B-refren z drugim besedilom) → samo prvi A-refren
- Če sledi instrumental break → samo prvi refren
- Če sledi takoj verz → samo prvi refren

3. **Vključi naravne intro klice/fraze**:
- "Ajmo Janezi!" pred BRAJDE refrenom = del refrena, vključi
- "Hey!" / "Yeah!" / "Oh!" intro klici = del refrena, vključi
- "Pa-pa!" / "La-la!" v začetku refrena = del refrena, vključi

4. **Naravni konec refrena**:
- Pevec drži zadnji ton 1-3s = del refrena
- Outro filler ("aj aj aj", "yeah yeah") = del refrena
- Ne reži sredi besede ali izpetega tona

## PRIMERI — kako se razmišlja:

**BRAJDE (FIRBCI x LIMA LEN):**
- Refren 1: "Ajmo Janezi! Pejd' greva..." 41.8-49.4s + "Da v senci hladni..." 50.2-56.1s
- GAP < 3s
- Refren 2: "Pejd' greva..." 57.1-63.1s + "Da v senci..." 63.8-69.8s
- **Izbira: 41.8-69.8s = 28s** (dva zaporedna refrena z "Ajmo Janezi" intro klicem) ✅

**Lady Gaga "Abracadabra":**
- Refren: "Abracadabra, amor..." 4 vrstice
- Ponavadi se 2x ponovi
- Izbira: oba refrena = ~30s

**Žena Me Tepe:**
- Refren: "Žena me tepe, mi prazni žepe..."
- Ponavadi je dolg (15s) in se ponovi
- Izbira: lahko 1 polni refren ali 2 zaporedna

## 🚫 ČESAR NE DELAJ:
- ❌ NE razširi v VERZE/KITICE (verz pripoveduje zgodbo, ima drugo besedilo)
- ❌ NE meša 2 RAZLIČNA refrena (A-refren + B-refren = napaka)
- ❌ NE začni sredi refrena (vedno na PRVI besedi)
- ❌ NE konča sredi besede ali izpetega tona
- ❌ NE razmišljaj samo o številu sekund — razmišljaj VSEBINSKO
'''

    # Selection section for the "chorus + pre-chorus" mode.
    prebuild_section = '''4. **IZBERI ODSEK — REFREN + PRE-CHORUS:**

Uporabnik je izbral način "**REFREN + PRE-CHORUS**".

## OBVEZNO: cel **PRVI** refren (kot opisano spodaj)

## OPCIJSKO: pre-chorus PRED refrenom
- **Pre-chorus = zadnja 1-2 vrstici verza tik pred refrenom** (slišne, povezane z refrenom)
- **Dodaj samo če**:
- Je tik pred refrenom (brez pavze ali instrumental vmes)
- Vsebinsko vodi v refren (gradnja občutka)
- Je kratek: 4-10 sekund
- **Ne dodajaj** če bi presegel skupno dolžino 35s

## REFREN — kot pri "samo refren":
- Začetek refrena = prva vrstica refrena
- Konec refrena = vključno z vsemi outro frazami in zadnjim držečim tonom
- Naravni izpev (ej-ej-ej, oh oh, la la la, etc.)

## Skupna dolžina: 18-35 sekund
'''

    # Exactly one of the two sections goes into the prompt.
    selection_section = prebuild_section if include_prebuild else chorus_only_section

    return f"""Tu je transcript pesmi iz STT modela (timestamp v sekundah, besedilo):

{transcript_text}

Cela pesem traja {video_duration:.1f}s. Cilj: izrezati ~{target_duration}s odsek za TikTok/Instagram Reel.{hint_block}

⚠️ POMEMBNO: STT lahko naredi napake v vseh jezikih, posebej:
- Pri narečjih, slovanskih jezikih, romanskih jezikih
- Generira "tipičen" tekst (npr. tekst druge pesmi istega izvajalca)
- Lahko vstavi besede ki se POdoBNO slišijo, ampak imajo ČISTO drug pomen

KAKO PREPOZNATI HALUCINACIJO:
- Tekst nima smisla v kontekstu pesmi
- Različni segmenti imajo nepovezane teme (kot da bi bilo več pesmi)
- Refren je v vsakem ponovitvi različen (refren se MORA ponavljati identično)
- Tekst je premalo **glede na trajanje** (več tišine = manj besed, ne več)

PROSIM:
1. Preberi celoten tekst in razumi strukturo (intro / verz / pre-chorus / refren / bridge / outro)
2. POPRAVI očitne halucinacije:
- Če prepoznaš pesem (po izvajalcu, naslovu, znaku besedila) → **uporabi PRAVO besedilo**
- Če halucinacijo ne moreš popraviti, **odstrani segment** (raje brez podnapisa kot napačen)
- Refren MORA imeti vse pojavitve ENAKE
- Popravi pomešane jezike (vse vrstice v enem jeziku)
- Ohrani timestamp-e nespremenjene
3. Prepoznaj REFREN: del besedila ki se PONAVLJA (ponavadi 2-4 vrstice, ki se v pesmi večkrat ponovijo). To je **univerzalno za vse jezike** — refren je strukturni element pesmi, ne le slovenske/nemške/angleške.

{selection_section}

5. Če transkript je v večini halucinacija (manj kot 30% smiselnih besed), v "reason" napiši "STT_HALLUCINATION_DETECTED"

Odgovori SAMO v JSON formatu (brez markdown, brez razlage):
{{
"start": <sekunde>,
"end": <sekunde>,
"reason": "<kratka razlaga>",
"chorus_text": "<besedilo refrena>",
"structure": "<1 stavek o strukturi pesmi>",
"language": "<jezik: sl/de/hr/bs/sr/en/it/es/fr>",
"hallucination_detected": <true/false>,
"corrected_segments": [
{{"start": <s>, "end": <s>, "text": "<popravljeno besedilo ALI prazno če halucinacija>"}}
]
}}

V "corrected_segments" vključi VSE segmente iz inputa s popravljenim besedilom. Halucinacije nadomesti s pravim besedilom (če veš) ALI pusti prazno besedilo."""
|
|
|
|
|
|
def _parse_llm_response(text, video_duration):
    """Parse the LLM's JSON answer; return None when it is unusable.

    Accepts raw JSON, JSON wrapped in a markdown code fence, or JSON preceded
    or followed by prose (the span from the first "{" to the last "}" is
    parsed).

    Returns None, after logging to stderr, when the text is not valid JSON,
    when "start"/"end" are missing or not numeric, or when the range does not
    satisfy 0 <= start < end <= video_duration. Otherwise returns a dict with
    "start", "end", "duration" (rounded to 2 decimals), "reason",
    "chorus_text", "structure", "language" and "corrected_segments".
    """
    text = text.strip()
    # Strip a markdown fence if present.
    if text.startswith("```"):
        text = re.sub(r"^```(?:json)?\s*", "", text)
        text = re.sub(r"\s*```$", "", text)
    # There may be an explanation around the JSON; keep the outermost { ... } span.
    first_brace = text.find("{")
    last_brace = text.rfind("}")
    if first_brace >= 0 and last_brace > first_brace:
        text = text[first_brace:last_brace + 1]

    try:
        # json.JSONDecodeError is a ValueError subclass; TypeError covers a
        # non-object top level (e.g. a list) or a null bound.
        result = json.loads(text)
        start = float(result["start"])
        end = float(result["end"])
    except (ValueError, KeyError, TypeError) as e:
        print(f" ⚠️ LLM returned unparseable response: {e!r}", file=sys.stderr)
        return None

    # Written as one chained comparison so NaN bounds are rejected too
    # (every comparison involving NaN is False).
    if not (0 <= start < end <= video_duration):
        print(f" ⚠️ LLM returned invalid range: {start}-{end}", file=sys.stderr)
        return None

    return {
        "start": round(start, 2),
        "end": round(end, 2),
        "duration": round(end - start, 2),
        "reason": result.get("reason", ""),
        "chorus_text": result.get("chorus_text", ""),
        "structure": result.get("structure", ""),
        "language": result.get("language"),
        "corrected_segments": result.get("corrected_segments"),
    }
|
|
|
|
|
|
def analyze_with_claude(transcript, video_duration, target_duration=30, model="claude-sonnet-4-6", filename_hint=None, include_prebuild=False):
    """Send the transcript to the Anthropic Messages API and parse the chosen clip.

    The request enables the server-side ``web_search`` tool (at most 3 uses)
    so the model can look up the real lyrics.

    filename_hint: file name passed into the prompt so the model can identify
        the song and repair STT hallucinations.

    Returns the parsed result dict (see ``_parse_llm_response``) with an added
    "source" key, or None when ANTHROPIC_API_KEY is missing, the transcript
    has no segments, the API call fails, or the answer is unusable.
    """
    # Imported before the try block so the except clauses below can always
    # resolve `urllib.error`.
    import urllib.error
    import urllib.request

    api_key = os.environ.get("ANTHROPIC_API_KEY")
    if not api_key:
        print(" ⚠️ ANTHROPIC_API_KEY ni nastavljen — preskakujem Claude analizo", file=sys.stderr)
        return None

    if not transcript.get("segments"):
        return None

    prompt = _build_analysis_prompt(transcript, video_duration, target_duration, filename_hint=filename_hint, include_prebuild=include_prebuild)

    try:
        messages = [{"role": "user", "content": prompt}]

        # web_search lets the model fetch real lyrics for songs it does not
        # know from training data.
        tools = [{
            "type": "web_search_20250305",
            "name": "web_search",
            "max_uses": 3,  # at most 3 searches per request
        }]

        # Agentic loop: the turn may pause or call tools before the final answer.
        max_iterations = 5
        for iteration in range(max_iterations):
            body = json.dumps({
                "model": model,
                "max_tokens": 8192,
                "messages": messages,
                "tools": tools,
            }).encode("utf-8")

            req = urllib.request.Request(
                "https://api.anthropic.com/v1/messages",
                data=body,
                headers={
                    "Content-Type": "application/json",
                    "x-api-key": api_key,
                    "anthropic-version": "2023-06-01",
                },
                method="POST",
            )
            with urllib.request.urlopen(req, timeout=180) as resp:
                data = json.loads(resp.read().decode("utf-8"))

            content = data.get("content", [])
            if not content:
                print(" ⚠️ Claude vrnil prazen odgovor", file=sys.stderr)
                return None

            stop_reason = data.get("stop_reason")
            if stop_reason == "max_tokens":
                usage = data.get("usage", {})
                print(
                    f" ⚠️ Claude odrezan (max_tokens): "
                    f"input={usage.get('input_tokens')} output={usage.get('output_tokens')}",
                    file=sys.stderr,
                )
                return None

            # Finished turn -> extract the answer text.
            if stop_reason in ("end_turn", "stop_sequence"):
                # With web search the answer can be split across several text
                # blocks (citations break the text up), so join every text
                # block that follows the last search result instead of
                # trusting only the final block.
                last_result_idx = max(
                    (i for i, b in enumerate(content)
                     if b.get("type") == "web_search_tool_result"),
                    default=-1,
                )
                text = "".join(
                    b.get("text", "")
                    for b in content[last_result_idx + 1:]
                    if b.get("type") == "text"
                ).strip()
                if not text:
                    # Nothing after the search results: fall back to the last
                    # text block anywhere in the response.
                    text_blocks = [b for b in content if b.get("type") == "text"]
                    if text_blocks:
                        text = text_blocks[-1].get("text", "").strip()
                if text:
                    break
                print(" ⚠️ Claude end_turn brez text bloka", file=sys.stderr)
                return None

            # A long-running server-tool turn was paused: send the assistant
            # content back unchanged so the model continues where it stopped.
            if stop_reason == "pause_turn":
                messages.append({"role": "assistant", "content": content})
                print(f" ⏸️ Claude pause_turn — nadaljujem (iter {iteration+1})", file=sys.stderr)
                continue

            if stop_reason == "tool_use":
                # web_search runs server-side, so its results are expected to
                # be in `content` already. Echo the content back so the model
                # continues; without results we cannot answer the tool call.
                messages.append({"role": "assistant", "content": content})
                has_results = any(b.get("type") == "web_search_tool_result" for b in content)
                if has_results:
                    print(f" 🔍 Claude je iskal lyrics, čakam nadaljevanje (iter {iteration+1})", file=sys.stderr)
                    continue
                print(" ⚠️ tool_use brez results", file=sys.stderr)
                return None

            # Any other stop reason.
            print(f" ⚠️ Nepričakovan stop_reason: {stop_reason}", file=sys.stderr)
            return None
        else:
            # Loop ran out without reaching a final answer.
            print(f" ⚠️ Presežena max_iterations ({max_iterations})", file=sys.stderr)
            return None

        result = _parse_llm_response(text, video_duration)
        if not result:
            return None

        print(f" 🤖 Claude ({model}) izbral: {result['start']:.1f}-{result['end']:.1f}s", file=sys.stderr)
        print(f" Razlog: {result.get('reason', '')[:80]}", file=sys.stderr)
        print(f" Struktura: {result.get('structure', '')[:80]}", file=sys.stderr)
        if result.get("corrected_segments"):
            print(f" Popravljeni segmenti: {len(result['corrected_segments'])}", file=sys.stderr)

        result["source"] = f"claude:{model}"
        return result

    except urllib.error.HTTPError as e:
        body = e.read().decode("utf-8", errors="replace")[:500]
        print(f" ❌ Claude API HTTP {e.code}: {body}", file=sys.stderr)
        return None
    except Exception as e:
        # Top-level boundary: the analysis is optional, so log and give up.
        print(f" ❌ Claude analysis failed: {e}", file=sys.stderr)
        return None
|
|
|
|
|
|
def analyze_with_gemini(transcript, video_duration, target_duration=30, model="gemini-3.1-pro-preview", filename_hint=None, include_prebuild=False):
    """Send the transcript to the Google Gemini API and return the chosen clip.

    The original author's note: Gemini 3.1 Pro was picked for its multilingual
    quality (MMMLU 92.6% per that note), which suits SLO/HR/BS lyrics.

    Args:
        transcript: Transcript dict; must contain a non-empty "segments" list.
        video_duration: Length of the source video in seconds.
        target_duration: Desired clip length in seconds, forwarded to the prompt.
        model: Gemini model identifier used in the endpoint URL.
        filename_hint: Optional original file name, forwarded to the prompt.
        include_prebuild: Forwarded to the prompt builder.

    Returns:
        The dict produced by ``_parse_llm_response`` with an added "source"
        key (``"gemini:<model>"``), or ``None`` when the API key is missing,
        the transcript has no segments, the request fails, or the response is
        empty, truncated or unparsable.
    """
    api_key = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
    if not api_key:
        print(" ⚠️ GEMINI_API_KEY ni nastavljen — preskakujem Gemini analizo", file=sys.stderr)
        return None

    if not transcript.get("segments"):
        return None

    prompt = _build_analysis_prompt(transcript, video_duration, target_duration, filename_hint=filename_hint, include_prebuild=include_prebuild)

    # Function-scope imports, as in the original implementation.
    import urllib.error
    import urllib.request

    try:
        # The key travels in the x-goog-api-key header rather than in the URL
        # query string, so it cannot end up in logged or echoed URLs.
        url = f"https://generativelanguage.googleapis.com/v1beta/models/{model}:generateContent"

        # Gemini 3.x Pro is a THINKING model: it also spends tokens on internal
        # reasoning (thoughtsTokenCount). 4096 was too low: for large
        # transcripts thinking can use 1500-3000 tokens and the output
        # (corrected_segments for 60+ segments) another 3000-7000, which cut
        # the JSON in half (finishReason: MAX_TOKENS). 32768 leaves room for
        # thinking plus the complete JSON output even for long songs.
        payload = json.dumps({
            "contents": [{
                "role": "user",
                "parts": [{"text": prompt}],
            }],
            "generationConfig": {
                "temperature": 0.1,
                "maxOutputTokens": 32768,
                "responseMimeType": "application/json",
            },
        }).encode("utf-8")

        req = urllib.request.Request(
            url,
            data=payload,
            headers={
                "Content-Type": "application/json",
                "x-goog-api-key": api_key,
            },
            method="POST",
        )
        with urllib.request.urlopen(req, timeout=180) as resp:
            data = json.loads(resp.read().decode("utf-8"))

        candidates = data.get("candidates", [])
        if not candidates:
            print(" ⚠️ Gemini vrnil 0 candidates", file=sys.stderr)
            return None

        cand0 = candidates[0]
        finish_reason = cand0.get("finishReason", "?")
        usage = data.get("usageMetadata", {})

        # Diagnostics: with finishReason == MAX_TOKENS the output is truncated
        # and the JSON is invalid, so there is nothing worth parsing.
        if finish_reason == "MAX_TOKENS":
            print(
                f" ⚠️ Gemini odrezan (MAX_TOKENS): "
                f"prompt={usage.get('promptTokenCount')} "
                f"thoughts={usage.get('thoughtsTokenCount')} "
                f"output={usage.get('candidatesTokenCount')}",
                file=sys.stderr,
            )
            return None

        parts = cand0.get("content", {}).get("parts", [])
        if not parts:
            print(
                f" ⚠️ Gemini vrnil prazen content (finishReason={finish_reason}, "
                f"thoughts={usage.get('thoughtsTokenCount')})",
                file=sys.stderr,
            )
            return None

        # A reply may be split across several parts; join every text part and
        # skip parts flagged as thought summaries so only the answer is parsed.
        text = "".join(
            (part.get("text") or "")
            for part in parts
            if isinstance(part, dict) and not part.get("thought")
        ).strip()
        if not text:
            print(
                f" ⚠️ Gemini vrnil prazen text (finishReason={finish_reason}, "
                f"thoughts={usage.get('thoughtsTokenCount')}, "
                f"output={usage.get('candidatesTokenCount')})",
                file=sys.stderr,
            )
            return None

        result = _parse_llm_response(text, video_duration)
        if not result:
            return None

        print(f" 🤖 Gemini ({model}) izbral: {result['start']:.1f}-{result['end']:.1f}s", file=sys.stderr)
        print(f" Razlog: {result.get('reason', '')[:80]}", file=sys.stderr)
        print(f" Struktura: {result.get('structure', '')[:80]}", file=sys.stderr)
        if result.get("corrected_segments"):
            print(f" Popravljeni segmenti: {len(result['corrected_segments'])}", file=sys.stderr)

        result["source"] = f"gemini:{model}"
        return result
    except urllib.error.HTTPError as e:
        # Only the first 500 characters of the error payload are logged.
        err_body = e.read().decode("utf-8", errors="replace")[:500]
        print(f" ❌ Gemini API HTTP {e.code}: {err_body}", file=sys.stderr)
        return None
    except Exception as e:
        # Boundary handler: any other failure degrades to "no LLM result".
        print(f" ❌ Gemini analysis failed: {e}", file=sys.stderr)
        return None
|
|
|
|
|
|
def analyze_with_llm(transcript, video_duration, target_duration=30, provider="claude", llm_model=None, filename_hint=None, include_prebuild=False):
    """Run the transcript analysis with the selected LLM provider.

    Args:
        transcript: Transcript dict forwarded unchanged to the provider call.
        video_duration: Length of the source video in seconds.
        target_duration: Desired clip length in seconds.
        provider: "claude", "gemini", or "auto" (Claude first, Gemini as the
            fallback when Claude returns nothing).
        llm_model: Optional explicit model name; each provider falls back to
            its default model when this is empty.
        filename_hint: Optional original file name forwarded to the provider.
        include_prebuild: Forwarded to the provider.

    Returns:
        Whatever the provider function returns, or ``None`` for an unknown
        provider.
    """
    claude_default = "claude-sonnet-4-6"
    gemini_default = "gemini-3.1-pro-preview"

    def auto_model(target, default):
        # In "auto" mode one --llm-model value is shared by both providers.
        # A name that clearly belongs to the other vendor would be rejected
        # by this vendor's API, so substitute this vendor's default instead.
        if not llm_model:
            return default
        rival = "gemini" if target == "claude" else "claude"
        if str(llm_model).lower().startswith(rival):
            return default
        return llm_model

    if provider == "gemini":
        return analyze_with_gemini(
            transcript, video_duration, target_duration,
            llm_model or gemini_default,
            filename_hint=filename_hint, include_prebuild=include_prebuild,
        )

    if provider == "claude":
        return analyze_with_claude(
            transcript, video_duration, target_duration,
            llm_model or claude_default,
            filename_hint=filename_hint, include_prebuild=include_prebuild,
        )

    if provider == "auto":
        # Try Claude first, fall back to Gemini.
        result = analyze_with_claude(
            transcript, video_duration, target_duration,
            auto_model("claude", claude_default),
            filename_hint=filename_hint, include_prebuild=include_prebuild,
        )
        if result:
            return result
        print(" 🔄 Claude ni uspel, probam Gemini...", file=sys.stderr)
        return analyze_with_gemini(
            transcript, video_duration, target_duration,
            auto_model("gemini", gemini_default),
            filename_hint=filename_hint, include_prebuild=include_prebuild,
        )

    print(f" ⚠️ Neznan LLM provider: {provider}", file=sys.stderr)
    return None
|
|
|
|
|
|
|
|
def is_instrumental(transcript, video_duration, threshold=0.1):
    """Decide whether the track should be treated as instrumental.

    The durations (end - start) of all transcript segments are added up and
    divided by the video duration (floored at 1 to avoid dividing by zero).
    The track counts as instrumental when that share is below `threshold`,
    or when the transcript has no segments at all.
    """
    segments = transcript.get("segments")
    if not segments:
        return True

    sung_seconds = 0
    for segment in segments:
        sung_seconds += segment["end"] - segment["start"]

    vocal_share = sung_seconds / max(video_duration, 1)
    return bool(vocal_share < threshold)
|
|
|
|
|
|
# Matches text made up solely of sung filler syllables ("la la", "oh", "yeah",
# "ej", ...) and punctuation. Only such short segments may be appended after
# the clip end; anything else is treated as a new verse/chorus.
_OUTRO_FILLER_RE = re.compile(
    r'^[\s\-,.!?]*'
    r'((?:la|na|oh|ah|eh|ej|aj|ja|hey|yeah|yo|ho|wo|hu|mm|nn|uu|oo|aa|ee|ii)'
    r'[\s\-,.!?]*)+'
    r'[\s\-,.!?]*$',
    re.IGNORECASE,
)

# When snapping the clip start to a word boundary, walk back at most this many
# words (a whole phrase would pull in the previous verse).
_MAX_LOOKBACK_WORDS = 2


def _build_arg_parser():
    """Build the command-line parser for the analysis script."""
    ap = argparse.ArgumentParser()
    ap.add_argument("video", help="Vhod video file")
    ap.add_argument("--lang", default=None, help="ISO 639-1 ali 'auto' (default: auto)")
    ap.add_argument("--model", default="large-v3", help="Whisper model")
    ap.add_argument("--target-duration", type=float, default=30.0)
    ap.add_argument("--max-duration", type=float, default=45.0)
    ap.add_argument("--min-duration", type=float, default=20.0)
    ap.add_argument("--include-prebuild", action="store_true",
                    help="Vključi pre-chorus build-up (privzeto: ne)")
    ap.add_argument("--no-claude", action="store_true",
                    help="Preskoči LLM analizo (uporabi samo lokalno heuristiko)")
    ap.add_argument("--llm-provider", default="claude",
                    choices=["claude", "gemini", "auto"],
                    help="Kateri LLM uporabiti za analizo (default: claude)")
    ap.add_argument("--llm-model", default=None,
                    help="Specifičen model (npr. claude-sonnet-4-6, gemini-3.1-pro-preview)")
    ap.add_argument("--filename-hint", default=None,
                    help="Originalno ime datoteke (Claude lahko prepozna pesem)")
    ap.add_argument("--whisper-provider", default="auto",
                    choices=["auto", "soniox", "elevenlabs", "local", "hybrid", "gemini"],
                    help="STT provider: "
                         "soniox=Soniox stt-async-v4 ($0.10/h, 5-15s, najboljši za NZ, PRIPOROČENO), "
                         "elevenlabs=Scribe ($0.40/h, halucinacije pri NZ), "
                         "gemini=Gemini 3 Pro ($3-5/h, počasen), "
                         "auto=Soniox primary + fallback chain (PRIVZETO)")
    ap.add_argument("--json", action="store_true", help="Output JSON")
    ap.add_argument("--output", help="Path za JSON output")
    return ap


def _energy_peak_chorus(energies, duration, target_duration):
    """Pick the loudest window of an instrumental track as the "chorus".

    Slides a window over the track in 5 s steps and keeps the start with the
    highest average energy. The window is clamped to the video length so a
    video shorter than `target_duration` still gets a window inside the file.
    """
    window = target_duration
    if 0 < duration < window:
        window = duration

    best_start = 0
    best_avg = -100
    t = 0
    while t + window <= duration:
        avg = avg_energy_in_range(energies, t, t + window)
        if avg > best_avg:
            best_avg = avg
            best_start = t
        t += 5  # step 5s

    return {
        "best": {
            "start": best_start,
            "end": best_start + window,
            "duration": window,
            "text_preview": "(instrumental — energy peak)",
            "score": 0,
            "avg_rms": round(best_avg, 2),
        },
        "all_candidates": [],
        "avg_rms_total": round(
            sum(r for (_, r) in energies) / len(energies) if energies else -30, 2
        ),
    }


def _local_clip_range(chorus, transcript, duration, args):
    """Call the local heuristic (`smart_clip_range`) with the CLI limits."""
    return smart_clip_range(
        chorus, transcript, duration,
        target_duration=args.target_duration,
        max_duration=args.max_duration,
        min_duration=args.min_duration,
        include_prebuild=args.include_prebuild,
    )


def _collect_word_timings(transcript):
    """Flatten every word that has both timestamps into one list.

    Always reads the ORIGINAL transcript: the LLM-corrected segments carry
    segment start/end only, not word-level timestamps.
    """
    words = []
    for seg in transcript.get("segments", []):
        for w in seg.get("words") or []:
            if w.get("start") is not None and w.get("end") is not None:
                words.append({
                    "start": float(w["start"]),
                    "end": float(w["end"]),
                    "text": w.get("text", ""),
                })
    return words


def _lookback_anchor_index(words, idx, max_lookback):
    """Return the index of the earliest word to keep when extending the start.

    Walks back from `idx` by at most `max_lookback` words and stops at the
    first gap of 0.5 s or more between neighbouring words.
    """
    anchor = idx
    for j in range(idx, max(0, idx - max_lookback), -1):
        if words[j]["start"] - words[j - 1]["end"] >= 0.5:
            break
        anchor = j - 1
    return anchor


def _snap_start_to_word_boundary(clip_start, words, fallback_segments):
    """Return a (possibly earlier) clip start that does not cut a word.

    The STT segment boundary can fall AFTER the first word of the chorus, and
    the LLM tends to cut at the start of the new segment, chopping that word.
    With word timings: if the start falls inside a word, or a word ends at
    most 0.5 s before it while the next word starts in the clip, move the
    start back to that word (plus at most `_MAX_LOOKBACK_WORDS` predecessors).
    Without word timings: if the start falls inside a segment, move it back
    by 0.5 s. Returns `clip_start` unchanged when nothing applies.
    """
    if not words:
        # Fallback: no word-level data (e.g. local Whisper) — use segments.
        for seg in fallback_segments:
            seg_start = float(seg.get("start", 0))
            seg_end = float(seg.get("end", 0))
            if seg_start < clip_start < seg_end:
                new_start = max(0, clip_start - 0.5)
                print(f" 🎵 Razširim clip začetek {clip_start:.2f}s → {new_start:.2f}s "
                      f"(brez word-level, fallback -0.5s)", file=sys.stderr)
                return new_start
        return clip_start

    for i, w in enumerate(words):
        # The word spans the clip start (the cut lands mid-word).
        if w["start"] < clip_start < w["end"]:
            anchor = _lookback_anchor_index(words, i, _MAX_LOOKBACK_WORDS)
            new_start = max(0, words[anchor]["start"])  # no buffer here
            captured = " ".join(w2["text"].strip() for w2 in words[anchor:i + 1])
            print(f" 🎵 Razširim clip začetek {clip_start:.2f}s → {new_start:.2f}s "
                  f"(clip sredi besede; ujamem '{captured}')", file=sys.stderr)
            return new_start

        # The word ended JUST before the clip start (up to 0.5 s earlier).
        if 0 < (clip_start - w["end"]) <= 0.5:
            next_w = words[i + 1] if i + 1 < len(words) else None
            if next_w and next_w["start"] >= clip_start - 0.1:
                anchor = _lookback_anchor_index(words, i, _MAX_LOOKBACK_WORDS)
                new_start = max(0, words[anchor]["start"])  # no buffer here
                captured = " ".join(w2["text"].strip() for w2 in words[anchor:i + 1])
                print(f" 🎵 Razširim clip začetek {clip_start:.2f}s → {new_start:.2f}s "
                      f"(beseda '{w['text'].strip()}' tik pred clip start; "
                      f"ujamem celo frazo '{captured}')", file=sys.stderr)
                return new_start

    return clip_start


def _amplitude_defense(clip_range, video_path):
    """Pull the clip start back 0.5 s when audio is already loud at the start.

    Last line of defence after the word-level snap: measures the mean volume
    of the first 150 ms of the clip with ffmpeg's volumedetect filter. Above
    -35 dB the clip is assumed to start inside vocals/music and gets a 0.5 s
    lead-in. Mutates `clip_range` in place; any failure only logs a warning.
    """
    probe_dur = 0.15  # first 150 ms
    try:
        probe_start = clip_range["start"]
        if probe_start < 0.5:  # no room for the buffer
            return
        cmd_probe = [
            # volumedetect reports its statistics at log level "info"; with
            # "-loglevel error" the mean_volume line is never printed and the
            # check can never fire. "-nostats" keeps the progress line out.
            "ffmpeg", "-hide_banner", "-nostats", "-loglevel", "info",
            "-ss", str(probe_start), "-t", str(probe_dur),
            "-i", str(video_path),
            "-af", "volumedetect",
            "-f", "null", "-",
        ]
        # errors="replace": info-level output echoes file metadata, which is
        # not guaranteed to be valid UTF-8.
        pr = subprocess.run(cmd_probe, capture_output=True, text=True,
                            errors="replace", timeout=10)
        # Looking for "mean_volume: -XX.X dB".
        m = re.search(r'mean_volume:\s*(-?\d+\.?\d*)\s*dB', pr.stderr or "")
        if not m:
            return
        mean_db = float(m.group(1))
        if mean_db > -35:
            old_start = clip_range["start"]
            new_start = max(0, old_start - 0.5)
            if new_start < old_start:
                print(f" 🎵 Audio amplitude check: prvih {probe_dur}s "
                      f"ima mean_volume {mean_db:.1f} dB (> -35 dB = vokal/glasba). "
                      f"Razširim clip {old_start:.2f}s → {new_start:.2f}s.", file=sys.stderr)
                clip_range["start"] = round(new_start, 2)
                clip_range["duration"] = round(clip_range["end"] - new_start, 2)
                clip_range["reason"] += " (amplitude defense -0.5s)"
        else:
            print(f" 🎵 Audio amplitude check: prvih {probe_dur}s "
                  f"ima mean_volume {mean_db:.1f} dB (≤ -35 dB = tiho). OK.", file=sys.stderr)
    except Exception as exc:
        print(f" ⚠️ Audio amplitude check skipped: {exc}", file=sys.stderr)


def _extend_end_to_natural_pause(clip_range, segments, extension_limit):
    """Let the last chorus line finish instead of cutting on its last word.

    Strict rules so the clip never runs into the next chorus/verse:
      * a segment overlapping the clip end may be completed (+0.3 s);
      * a following segment is appended (+0.2 s) only if it starts within
        0.7 s, lasts at most 2.5 s and contains nothing but outro fillers;
      * the end never moves more than 3 s, nor past `extension_limit`.
    Mutates `clip_range` in place.
    """
    current_end = clip_range["end"]
    soft_limit = min(clip_range["end"] + 3.0, extension_limit)

    for seg in segments:
        seg_start = float(seg.get("start", 0))
        seg_end = float(seg.get("end", 0))
        seg_text = seg.get("text", "").strip()

        if seg_start <= current_end:
            # Segment overlaps the clip end (unfinished last chorus line).
            if current_end < seg_end <= soft_limit:
                new_end = min(seg_end + 0.3, soft_limit)
                if new_end > current_end:
                    print(f" 🎵 Podaljšam clip {current_end:.1f}s → {new_end:.1f}s "
                          f"(zadnji segment refrena se zaključi)", file=sys.stderr)
                    current_end = new_end
            continue

        # Segment starts AFTER the clip end — accept only outro fillers.
        if seg_start - current_end >= 0.7:
            break  # too far away
        if (seg_end - seg_start) > 2.5:
            break  # too long: a new verse/chorus
        if not _OUTRO_FILLER_RE.match(seg_text):
            break  # real lyrics: new chorus/verse/post-chorus

        new_end = min(seg_end + 0.2, soft_limit)
        if new_end <= current_end:
            break
        print(f" 🎵 Podaljšam clip {current_end:.1f}s → {new_end:.1f}s "
              f"(outro filler '{seg_text[:40]}')", file=sys.stderr)
        current_end = new_end

    if current_end > clip_range["end"]:
        clip_range["end"] = round(current_end, 2)
        clip_range["duration"] = round(current_end - clip_range["start"], 2)
        clip_range["reason"] += " (extended to natural pause)"


def _refine_llm_clip_range(llm_result, transcript, duration, args):
    """Turn the LLM's pick into the final clip range.

    Applies, in order: the max_duration cap, the min_duration floor, the
    word-boundary start snap, the amplitude defence and the natural-pause end
    extension. Returns the new clip_range dict.
    """
    llm_source = llm_result.get("source", "llm")
    clip_range = {
        "start": llm_result["start"],
        "end": llm_result["end"],
        "duration": llm_result["duration"],
        "reason": f"{llm_source}: " + llm_result.get("reason", ""),
        "chorus_text": llm_result.get("chorus_text", ""),
        "structure": llm_result.get("structure", ""),
        "source": llm_source,
    }

    # Cap at max_duration if the LLM overshoots.
    if clip_range["duration"] > args.max_duration:
        clip_range["end"] = clip_range["start"] + args.max_duration
        clip_range["duration"] = args.max_duration
        clip_range["reason"] += " (capped at max_duration)"

    # Floor at min_duration: lengthen the end, never past the video.
    if clip_range["duration"] < args.min_duration:
        needed = args.min_duration - clip_range["duration"]
        new_end = min(clip_range["end"] + needed, duration)
        actual_extension = new_end - clip_range["end"]
        clip_range["end"] = new_end
        clip_range["duration"] = clip_range["end"] - clip_range["start"]
        clip_range["reason"] += f" (extended +{actual_extension:.1f}s to meet min_duration)"

    corrected_segs = llm_result.get("corrected_segments") or transcript["segments"]
    # Hard ceiling for the end extension: at most 5 s above max_duration and
    # never past the audio. Computed before the start is moved, as originally.
    extension_limit = min(clip_range["start"] + args.max_duration + 5, duration)

    new_start = _snap_start_to_word_boundary(
        clip_range["start"], _collect_word_timings(transcript), corrected_segs
    )
    if new_start < clip_range["start"]:
        clip_range["start"] = round(new_start, 2)
        clip_range["duration"] = round(clip_range["end"] - new_start, 2)
        clip_range["reason"] += " (start extended back)"

    _amplitude_defense(clip_range, args.video)
    _extend_end_to_natural_pause(clip_range, corrected_segs, extension_limit)
    return clip_range


def _merge_corrected_segments(transcript, llm_result):
    """Replace transcript segments with the LLM-corrected ones, in place.

    Text and segment times come from the LLM; word-level timings are copied
    from the original segment with the same start (rounded to 0.1 s) or,
    failing that, the nearest original start within 1 s. Malformed corrected
    segments are skipped. Also adopts the LLM's language when it differs.
    """
    originals = transcript["segments"]
    orig_by_start = {round(s["start"], 1): s for s in originals}
    new_segments = []
    for cs in llm_result["corrected_segments"]:
        try:
            cs_start = float(cs["start"])
            cs_end = float(cs["end"])
            cs_text = str(cs["text"]).strip()
        except (KeyError, ValueError, TypeError):
            continue

        orig = orig_by_start.get(round(cs_start, 1))
        if not orig:
            # No exact match: take the closest original start within 1 s.
            closest_diff = 999
            for s in originals:
                diff = abs(s["start"] - cs_start)
                if diff < closest_diff and diff < 1.0:
                    closest_diff = diff
                    orig = s

        new_segments.append({
            "start": cs_start,
            "end": cs_end,
            "text": cs_text,
            # The LLM returns no per-word timings; keep the originals if any.
            "words": orig.get("words", []) if orig else [],
        })

    transcript["segments"] = new_segments
    transcript["claude_corrected"] = True  # key name kept for backward compat

    if llm_result.get("language") and llm_result["language"] != transcript["language"]:
        print(f" ✏️ LLM je popravil jezik: {transcript['language']} → {llm_result['language']}", file=sys.stderr)
        transcript["language"] = llm_result["language"]

    llm_label = llm_result.get("source", "LLM")
    print(f" ✏️ Whisper segmenti zamenjani z {llm_label} popravljenimi ({len(new_segments)})", file=sys.stderr)


def main():
    """CLI entry point: analyse a whole video and emit clip data as JSON.

    Steps: extract audio, transcribe, compute the energy profile, detect
    instrumental tracks, choose the clip range (LLM first, local heuristic as
    fallback), merge LLM text corrections, compute fade parameters and write
    the result to --output and/or stdout (--json). All progress messages go
    to stderr. Exits with status 1 when the video file does not exist. The
    temporary audio file is removed on every exit path.
    """
    args = _build_arg_parser().parse_args()

    video = Path(args.video)
    if not video.exists():
        print(f"❌ Video ne obstaja: {video}", file=sys.stderr)
        sys.exit(1)

    duration = get_video_duration(video)
    print(f"📹 Video: {video.name}, {duration:.1f}s", file=sys.stderr)

    # 1. Extract audio
    audio = extract_audio(video)

    try:
        # 2. Transcript. The file name doubles as a hint for the STT language
        # detection and for the LLM (it may recognise the song).
        lang = None if args.lang in (None, "auto", "") else args.lang
        fname_hint = args.filename_hint or video.stem
        transcript = transcribe_full(
            audio, lang=lang, model_size=args.model,
            provider=args.whisper_provider,
            filename_hint=fname_hint,
        )
        print(f" Transkripcija: {len(transcript['segments'])} segmentov", file=sys.stderr)

        # 3. Energy profile
        print(f"⚡ Energy profile...", file=sys.stderr)
        energies = compute_energy_profile(audio)
        print(f" Energy samples: {len(energies)}", file=sys.stderr)

        # 4. Instrumental detection
        instrumental = is_instrumental(transcript, duration)
        print(f"🎵 Instrumentalna: {instrumental}", file=sys.stderr)

        # 5a. PRIMARY: LLM analysis (sees the whole lyrics + corrections).
        # `llm_result` holds the result of WHICHEVER LLM ran (Claude or
        # Gemini); its "source" is e.g. "claude:claude-sonnet-4-6".
        llm_result = None
        if not instrumental and not args.no_claude:
            print(f"🤖 Pošiljam transkript {args.llm_provider}-u za analizo...", file=sys.stderr)
            llm_result = analyze_with_llm(
                transcript, duration, target_duration=args.target_duration,
                provider=args.llm_provider, llm_model=args.llm_model,
                filename_hint=fname_hint,
                include_prebuild=args.include_prebuild,
            )

        # 5b. Local chorus detection (fallback and score preview); for
        # instrumental tracks, the section with the highest energy.
        if instrumental:
            chorus = _energy_peak_chorus(energies, duration, args.target_duration)
        else:
            chorus = find_chorus(transcript, energies, duration)

        # 6. Clip range — the LLM wins, otherwise the local heuristic.
        if llm_result and llm_result.get("hallucination_detected"):
            # The LLM flagged the transcript as an STT hallucination, so it
            # could not locate the real chorus: do not trust its clip choice.
            print(f"⚠️ HALUCINACIJA DETECT-ANA — fallback na local heuristic "
                  f"(Scribe transkript ne ustreza zvočnemu vsebini)", file=sys.stderr)
            clip_range = _local_clip_range(chorus, transcript, duration, args)
            clip_range["source"] = "local_fallback_after_hallucination"
            clip_range["reason"] = (
                "STT halucinacija — local heuristic fallback. "
                "Refren je iz energy-based detekcije, ne iz transkripta. "
                + clip_range.get("reason", "")
            )
            llm_result = None  # also disables the text corrections below
        elif llm_result:
            clip_range = _refine_llm_clip_range(llm_result, transcript, duration, args)
        else:
            clip_range = _local_clip_range(chorus, transcript, duration, args)
            clip_range["source"] = "local_heuristic"

        print(f"✂ Clip range: {clip_range['start']:.1f}s - {clip_range['end']:.1f}s "
              f"(duration: {clip_range['duration']}s, source: {clip_range.get('source')})",
              file=sys.stderr)

        # Use the LLM-corrected segments when present (better for subtitles).
        if llm_result and llm_result.get("corrected_segments"):
            _merge_corrected_segments(transcript, llm_result)

        # 7. Fade params (may ask for a later end if the clip stops mid-vocal).
        fade = detect_audio_fade(clip_range, transcript, video_duration=duration)
        print(f"🎚 Fade: in={fade['fade_in']}s, out={fade['fade_out']}s", file=sys.stderr)

        if fade.get("extended_end") and fade["extended_end"] > clip_range["end"]:
            old_end = clip_range["end"]
            new_end = min(fade["extended_end"], clip_range["start"] + args.max_duration)
            # Earlier steps may already have pushed the clip past the
            # max_duration cap; in that case the capped value lies before the
            # current end and applying it would SHORTEN the clip. Only ever
            # move the end later.
            if new_end > old_end:
                clip_range["end"] = round(new_end, 2)
                clip_range["duration"] = round(new_end - clip_range["start"], 2)
                print(f" ↳ Razširjen za {new_end - old_end:.1f}s (zaključek besedila)",
                      file=sys.stderr)

        result = {
            "video": str(video),
            "video_duration": duration,
            "language": transcript["language"],
            "language_probability": transcript["language_probability"],
            "instrumental": instrumental,
            "transcript": transcript,
            "chorus": chorus,
            "clip_range": clip_range,
            "fade": fade,
            "claude_used": llm_result is not None,
            "claude_corrected_text": bool(llm_result and llm_result.get("corrected_segments")),
        }

        if args.output:
            with open(args.output, "w", encoding="utf-8") as f:
                json.dump(result, f, ensure_ascii=False, indent=2)
            print(f"💾 Saved: {args.output}", file=sys.stderr)

        if args.json:
            print(json.dumps(result, ensure_ascii=False))

    finally:
        # Best-effort cleanup of the extracted audio; a failure here must not
        # mask the real outcome of the analysis.
        try:
            os.unlink(audio)
        except Exception:
            pass
|
|
|
|
|
|
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|