Major: smart selection pipeline (analyze.py) + audio fade + multi-lang auto-detect

- New analyze.py: full transcript + energy + structural analysis
- Smart clip range: includes pre-chorus, can exceed 30s up to max_duration (default 45s)
- Audio fade in/out: auto-detected from vocal boundaries
- Instrumental detection: auto-disables subs if vocals < 10% of duration
- Multi-language: auto-detect via Whisper or explicit (DE/SL/HR/BS/SR/EN/IT/ES/FR)
- Frontend: cleaner UX, added bs/it/es/fr languages, smart selection description
- reframe.py: --fade-in --fade-out args
- clip.py: propagates fade params
- app/main.py: replaces find_chorus.py call with analyze.py
2026-04-29 06:21:35 +00:00 · 2026-04-29 06:21:35 +00:00 · 8512076b91
commit 8512076b91
parent 81edd24ca3
5 changed files with 572 additions and 36 deletions
--- a/app/main.py
+++ b/app/main.py
@@ -154,37 +154,57 @@ def process_job(job_id):
        else:
            input_path = Path(job["input_path"])

-        # ── 2. Find chorus (če auto) ──────────────────────────
+        # ── 2. Smart analysis (če auto_chorus) ──────────────────────────
        if job.get("auto_chorus"):
-            update_job(job_id, current_step="Iščem refren (Whisper + energy)")
+            update_job(job_id, current_step="Analiza pesmi (transkript + energija)")
+            analysis_path = OUTPUT_DIR / f"{job_id}.analysis.json"
            cmd = [
-                "python3", str(SCRIPTS_DIR / "find_chorus.py"),
+                "python3", str(SCRIPTS_DIR / "analyze.py"),
                str(input_path),
-                "--duration", str(job.get("duration", 30)),
-                "--json",
+                "--target-duration", str(job.get("duration", 30)),
+                "--max-duration", str(job.get("max_duration", 45)),
+                "--min-duration", str(job.get("min_duration", 20)),
+                "--output", str(analysis_path),
            ]
-            if job.get("lang"):
+            # lang: če None ali 'auto', pusti analyze.py auto-detect
+            if job.get("lang") and job["lang"] not in ("auto", ""):
                cmd += ["--lang", job["lang"]]
            cmd += ["--model", job.get("whisper_model", "small")]

            proc = subprocess.run(cmd, capture_output=True, text=True)
-            if proc.returncode == 0:
+            if proc.returncode == 0 and analysis_path.exists():
                try:
-                    chorus = json.loads(proc.stdout)
-                    if chorus.get("candidates"):
-                        best = chorus["candidates"][0]
-                        update_job(
-                            job_id,
-                            chorus_detection=chorus,
-                            start=best["start"],
-                            duration=best["duration"],
-                        )
-                        # KLJUČNO: reload local job dict, da nove vrednosti pridejo v reframe call
-                        job = load_job(job_id)
-                except json.JSONDecodeError:
-                    update_job(job_id, chorus_error="JSON decode failed")
+                    with open(analysis_path, "r", encoding="utf-8") as f:
+                        analysis = json.load(f)
+                    cr = analysis["clip_range"]
+                    fade = analysis["fade"]
+                    update_job(
+                        job_id,
+                        analysis_summary={
+                            "language": analysis["language"],
+                            "language_probability": analysis["language_probability"],
+                            "instrumental": analysis["instrumental"],
+                            "clip_range": cr,
+                            "fade": fade,
+                            "chorus_preview": analysis["chorus"]["best"]["text_preview"]
+                                if analysis.get("chorus") and analysis["chorus"].get("best") else None,
+                        },
+                        start=cr["start"],
+                        duration=cr["duration"],
+                        fade_in=fade["fade_in"],
+                        fade_out=fade["fade_out"],
+                        detected_language=analysis["language"],
+                        is_instrumental=analysis["instrumental"],
+                    )
+                    # Auto-disable subs za instrumental
+                    if analysis["instrumental"] and not job.get("no_subs"):
+                        update_job(job_id, no_subs=True, auto_disabled_subs=True)
+                    # Reload local dict
+                    job = load_job(job_id)
+                except (json.JSONDecodeError, KeyError) as e:
+                    update_job(job_id, chorus_error=f"Analysis parse: {e}")
            else:
-                update_job(job_id, chorus_error=proc.stderr[-300:])
+                update_job(job_id, chorus_error=(proc.stderr or "")[-500:])

        # ── 3. Reframe + subtitles (clip.py orchestrator) ─────
        output_path = OUTPUT_DIR / f"{job_id}.mp4"
@@ -201,8 +221,16 @@ def process_job(job_id):
            cmd += ["--start", str(job["start"])]
        if job.get("duration") is not None:
            cmd += ["--duration", str(job["duration"])]
-        if job.get("lang"):
-            cmd += ["--lang", job["lang"]]
+        if job.get("fade_in", 0) > 0:
+            cmd += ["--fade-in", str(job["fade_in"])]
+        if job.get("fade_out", 0) > 0:
+            cmd += ["--fade-out", str(job["fade_out"])]
+        # lang: prefer detected_language če auto
+        chosen_lang = job.get("lang")
+        if chosen_lang in (None, "auto", ""):
+            chosen_lang = job.get("detected_language")
+        if chosen_lang:
+            cmd += ["--lang", chosen_lang]
        if job.get("no_subs"):
            cmd += ["--no-subs"]
        cmd += ["--model", job.get("whisper_model", "small")]
@@ -269,10 +297,12 @@ class YouTubeJobIn(BaseModel):
 class StartJobIn(BaseModel):
    job_id: str
    mode: str = "track"
-    lang: Optional[str] = None
+    lang: Optional[str] = None  # None/auto = Whisper auto-detect
    auto_chorus: bool = True
    start: Optional[float] = None
    duration: Optional[float] = 30
+    max_duration: Optional[float] = 45  # Smart selection lahko gre do 45s
+    min_duration: Optional[float] = 20
    no_subs: bool = False
    subtitle_style: str = "reels"
    whisper_model: str = "small"
@@ -373,6 +403,8 @@ async def start_processing(
        auto_chorus=payload.auto_chorus,
        start=payload.start,
        duration=payload.duration,
+        max_duration=payload.max_duration,
+        min_duration=payload.min_duration,
        no_subs=payload.no_subs,
        subtitle_style=payload.subtitle_style,
        whisper_model=payload.whisper_model,
--- a/scripts/analyze.py
+++ b/scripts/analyze.py
@@ -0,0 +1,467 @@
+#!/usr/bin/env python3
+"""
+analyze.py — Predhodna analiza CELEGA videa pred trim-anjem.
+
+Naredi:
+1. Whisper transcript celega videa (auto-detect jezika ali user-specified)
+2. Energy profile (RMS dB na 1s windows)
+3. Structural detection (vocal/instrumental sections, energy peaks)
+4. Pametno izbere clip range (lahko >30s, vključi pre-chorus)
+5. Detekcija instrumentalnih pesmi (no_subs auto)
+
+Output: JSON s podatki za clip.py
+"""
+
+import argparse
+import json
+import os
+import re
+import subprocess
+import sys
+import tempfile
+from pathlib import Path
+
+
+def get_video_duration(path):
+    r = subprocess.run(
+        ["ffprobe", "-v", "error", "-show_entries", "format=duration",
+         "-of", "default=nw=1:nokey=1", str(path)],
+        capture_output=True, text=True
+    )
+    try:
+        return float(r.stdout.strip())
+    except ValueError:
+        return 0.0
+
+
+def extract_audio(video_path):
+    """Extract avdio v 16kHz mono WAV za Whisper + energy."""
+    audio = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
+    audio.close()
+    subprocess.run(
+        ["ffmpeg", "-y", "-i", str(video_path), "-vn",
+         "-ac", "1", "-ar", "16000", "-c:a", "pcm_s16le", audio.name],
+        check=True, capture_output=True
+    )
+    return audio.name
+
+
+def transcribe_full(audio_path, lang=None, model_size="small"):
+    """Whisper transcript celega avdia. lang=None → auto-detect."""
+    from faster_whisper import WhisperModel
+
+    print(f"🧠 Whisper {model_size}, lang={lang or 'auto'}", file=sys.stderr)
+    m = WhisperModel(model_size, device="cpu", compute_type="int8")
+    segs, info = m.transcribe(
+        audio_path,
+        language=lang,
+        word_timestamps=True,
+        vad_filter=True,
+    )
+    detected_lang = info.language
+    detected_prob = info.language_probability
+    print(f"   Detekcija: {detected_lang} (p={detected_prob:.2f})", file=sys.stderr)
+
+    segments = []
+    for s in segs:
+        words = []
+        if s.words:
+            for w in s.words:
+                words.append({
+                    "start": w.start,
+                    "end": w.end,
+                    "text": w.word,
+                })
+        segments.append({
+            "start": s.start,
+            "end": s.end,
+            "text": s.text.strip(),
+            "words": words,
+        })
+
+    return {
+        "language": detected_lang,
+        "language_probability": detected_prob,
+        "segments": segments,
+    }
+
+
+def compute_energy_profile(audio_path, window_sec=1.0):
+    """RMS dB na window_sec sekund. Vrne list (timestamp, rms_db)."""
+    cmd = [
+        "ffmpeg", "-i", audio_path,
+        "-af", f"asetnsamples=n={int(16000 * window_sec)}:p=0,"
+               f"astats=metadata=1:reset={window_sec},"
+               f"ametadata=print:key=lavfi.astats.Overall.RMS_level:file=-",
+        "-f", "null", "-",
+    ]
+    result = subprocess.run(cmd, capture_output=True, text=True)
+    output = result.stdout + "\n" + result.stderr
+
+    energies = []
+    current_pts = 0.0
+    for line in output.split("\n"):
+        line = line.strip()
+        m = re.search(r"pts_time:(\S+)", line)
+        if m:
+            try:
+                current_pts = float(m.group(1))
+            except ValueError:
+                pass
+            continue
+        if "RMS_level=" in line:
+            val = line.split("RMS_level=")[-1].strip()
+            try:
+                rms = float(val)
+                # -inf zamenjamo z -90
+                if rms < -90 or rms != rms:  # NaN check
+                    rms = -90.0
+                energies.append((current_pts, rms))
+                current_pts += window_sec
+            except ValueError:
+                pass
+
+    return energies
+
+
+def detect_vocal_sections(segments, max_gap=3.0):
+    """Združi consecutive segmente v "vokalne sekcije"."""
+    if not segments:
+        return []
+    sections = []
+    current = {
+        "start": segments[0]["start"],
+        "end": segments[0]["end"],
+        "segments": [segments[0]],
+        "text": segments[0]["text"],
+    }
+    for seg in segments[1:]:
+        if seg["start"] - current["end"] > max_gap:
+            sections.append(current)
+            current = {
+                "start": seg["start"],
+                "end": seg["end"],
+                "segments": [seg],
+                "text": seg["text"],
+            }
+        else:
+            current["end"] = seg["end"]
+            current["segments"].append(seg)
+            current["text"] += " " + seg["text"]
+    sections.append(current)
+    return sections
+
+
+def avg_energy_in_range(energies, start, end):
+    """Povprečna RMS v rangeu."""
+    vals = [r for (t, r) in energies if start <= t <= end]
+    if not vals:
+        return -90.0
+    return sum(vals) / len(vals)
+
+
+def score_section_as_chorus(section, all_sections, energies, avg_rms):
+    """Score sekcijo kot kandidat za refren.
+
+    Faktorji:
+    - Ponavljajoče besede (low unique-word-ratio) = refren
+    - Visoka energija
+    - Sekcija se pojavi večkrat v pesmi (refren se ponovi)
+    - Section duration (3-25 s scores highest)
+    """
+    text = section["text"].lower()
+    words = re.findall(r"\b\w+\b", text)
+    if not words:
+        return 0
+
+    unique_ratio = len(set(words)) / len(words)
+    # Refren = nizko unique ratio (ponovitve)
+    chorus_signal = max(0, (1.0 - unique_ratio) * 30)
+
+    # Energija
+    sec_energy = avg_energy_in_range(energies, section["start"], section["end"])
+    energy_above = max(0, sec_energy - avg_rms)
+    energy_score = energy_above * 8
+
+    # Kako pogosto se pojavi podobno besedilo
+    repeat_count = 0
+    for other in all_sections:
+        if other is section:
+            continue
+        other_text = other["text"].lower()
+        other_words = set(re.findall(r"\b\w+\b", other_text))
+        common = set(words) & other_words
+        # Če imata >50% besed skupnih, je verjetno isti refren
+        if len(common) >= len(set(words)) * 0.5 and len(common) >= 3:
+            repeat_count += 1
+    repeat_score = repeat_count * 25
+
+    # Dolžina vrstice
+    duration = section["end"] - section["start"]
+    if 3 <= duration <= 25:
+        length_score = 10
+    elif duration > 25:
+        length_score = 5
+    else:
+        length_score = 2
+
+    return chorus_signal + energy_score + repeat_score + length_score
+
+
+def find_chorus(transcript, energies, video_duration):
+    """Najde najbolj verjeten refren."""
+    sections = detect_vocal_sections(transcript["segments"])
+    if not sections:
+        return None
+
+    avg_rms = sum(r for (_, r) in energies) / len(energies) if energies else -30.0
+
+    candidates = []
+    for sec in sections:
+        score = score_section_as_chorus(sec, sections, energies, avg_rms)
+        candidates.append({
+            "start": sec["start"],
+            "end": sec["end"],
+            "duration": sec["end"] - sec["start"],
+            "text_preview": sec["text"][:80],
+            "score": round(score, 2),
+            "avg_rms": round(avg_energy_in_range(energies, sec["start"], sec["end"]), 2),
+        })
+
+    # Sort by score descending
+    candidates.sort(key=lambda c: -c["score"])
+
+    if not candidates:
+        return None
+
+    return {
+        "best": candidates[0],
+        "all_candidates": candidates[:10],
+        "avg_rms_total": round(avg_rms, 2),
+    }
+
+
+def smart_clip_range(chorus, transcript, video_duration,
+                      target_duration=30, max_duration=45, min_duration=20):
+    """Inteligentno določi clip range.
+
+    Logika:
+    1. Začni z refrenom kot core
+    2. Če je krajši od min_duration, razširi na obeh straneh
+    3. Če imamo prostor, dodaj pre-chorus pred refrenom
+    4. Cap na max_duration
+    """
+    if not chorus or not chorus.get("best"):
+        # Fallback: vzemi sredino videa
+        mid = video_duration / 2
+        start = max(0, mid - target_duration / 2)
+        return {
+            "start": start,
+            "end": min(video_duration, start + target_duration),
+            "reason": "fallback_middle",
+        }
+
+    best = chorus["best"]
+    sections = detect_vocal_sections(transcript["segments"])
+
+    actual_start = best["start"]
+    actual_end = best["end"]
+
+    # 1. Če je core refren prekratek, razširi
+    if actual_end - actual_start < min_duration:
+        # Najdi naslednjo sekcijo (verjetno se refren ponovi)
+        for sec in sections:
+            if sec["start"] > actual_end and sec["start"] - actual_end < 5:
+                # Sekcija blizu, dodaj jo
+                if sec["end"] - actual_start <= max_duration:
+                    actual_end = sec["end"]
+                    if actual_end - actual_start >= min_duration:
+                        break
+
+    # 2. Dodaj pre-chorus pred refrenom (build-up)
+    pre_section = None
+    for sec in sections:
+        if sec["end"] <= actual_start and actual_start - sec["end"] < 8:
+            pre_section = sec  # zadnja pred refrenom
+    if pre_section:
+        candidate_start = pre_section["start"]
+        if actual_end - candidate_start <= max_duration:
+            actual_start = candidate_start
+
+    # 3. Če je res prekratek, razširi simetrično
+    if actual_end - actual_start < min_duration:
+        deficit = min_duration - (actual_end - actual_start)
+        actual_start = max(0, actual_start - deficit / 2)
+        actual_end = min(video_duration, actual_end + deficit / 2)
+
+    # 4. Trim na max
+    if actual_end - actual_start > max_duration:
+        actual_end = actual_start + max_duration
+
+    # Snap to video bounds
+    actual_start = max(0, actual_start)
+    actual_end = min(video_duration, actual_end)
+
+    return {
+        "start": round(actual_start, 2),
+        "end": round(actual_end, 2),
+        "duration": round(actual_end - actual_start, 2),
+        "reason": "smart_chorus_with_prebuild",
+        "chorus_start": round(best["start"], 2),
+        "chorus_end": round(best["end"], 2),
+    }
+
+
+def detect_audio_fade(clip_range, transcript):
+    """Določi fade-in/fade-out trajanje.
+
+    Logika:
+    - Če clip začne sredi vokala → 0.5s fade in
+    - Če se konča sredi vokala → 1.5s fade out
+    - Sicer manj fade
+    """
+    cs, ce = clip_range["start"], clip_range["end"]
+
+    # Vokal pri začetku?
+    starts_in_vocal = False
+    ends_in_vocal = False
+    for seg in transcript["segments"]:
+        # Začetek clip-a znotraj segmenta
+        if seg["start"] <= cs <= seg["end"]:
+            starts_in_vocal = True
+        # Konec clip-a znotraj segmenta
+        if seg["start"] <= ce <= seg["end"]:
+            ends_in_vocal = True
+
+    fade_in = 0.5 if starts_in_vocal else 0.2
+    fade_out = 1.5 if ends_in_vocal else 0.3
+
+    return {"fade_in": fade_in, "fade_out": fade_out}
+
+
+def is_instrumental(transcript, video_duration, threshold=0.1):
+    """Detekcija ali je pesem instrumentalna.
+
+    Če je vsota trajanja vokalnih segmentov < threshold * video_duration,
+    je pesem instrumentalna.
+    """
+    if not transcript.get("segments"):
+        return True
+    vocal_duration = sum(
+        s["end"] - s["start"] for s in transcript["segments"]
+    )
+    ratio = vocal_duration / max(video_duration, 1)
+    return ratio < threshold
+
+
+def main():
+    ap = argparse.ArgumentParser()
+    ap.add_argument("video", help="Vhod video file")
+    ap.add_argument("--lang", default=None, help="ISO 639-1 ali 'auto' (default: auto)")
+    ap.add_argument("--model", default="small", help="Whisper model")
+    ap.add_argument("--target-duration", type=float, default=30.0)
+    ap.add_argument("--max-duration", type=float, default=45.0)
+    ap.add_argument("--min-duration", type=float, default=20.0)
+    ap.add_argument("--json", action="store_true", help="Output JSON")
+    ap.add_argument("--output", help="Path za JSON output")
+    args = ap.parse_args()
+
+    video = Path(args.video)
+    if not video.exists():
+        print(f"❌ Video ne obstaja: {video}", file=sys.stderr)
+        sys.exit(1)
+
+    duration = get_video_duration(video)
+    print(f"📹 Video: {video.name}, {duration:.1f}s", file=sys.stderr)
+
+    # 1. Extract avdio
+    audio = extract_audio(video)
+
+    try:
+        # 2. Whisper transcript
+        lang = None if args.lang in (None, "auto", "") else args.lang
+        transcript = transcribe_full(audio, lang=lang, model_size=args.model)
+        print(f"   Transkripcija: {len(transcript['segments'])} segmentov", file=sys.stderr)
+
+        # 3. Energy profile
+        print(f"⚡ Energy profile...", file=sys.stderr)
+        energies = compute_energy_profile(audio)
+        print(f"   Energy samples: {len(energies)}", file=sys.stderr)
+
+        # 4. Instrumental detection
+        instrumental = is_instrumental(transcript, duration)
+        print(f"🎵 Instrumentalna: {instrumental}", file=sys.stderr)
+
+        # 5. Find chorus (samo če ni instrumental)
+        if not instrumental:
+            chorus = find_chorus(transcript, energies, duration)
+        else:
+            # Za instrumentalne: najdi sekcijo z najvišjo energijo
+            window = args.target_duration
+            best_start = 0
+            best_avg = -100
+            t = 0
+            while t + window <= duration:
+                avg = avg_energy_in_range(energies, t, t + window)
+                if avg > best_avg:
+                    best_avg = avg
+                    best_start = t
+                t += 5  # step 5s
+            chorus = {
+                "best": {
+                    "start": best_start,
+                    "end": best_start + window,
+                    "duration": window,
+                    "text_preview": "(instrumental — energy peak)",
+                    "score": 0,
+                    "avg_rms": round(best_avg, 2),
+                },
+                "all_candidates": [],
+                "avg_rms_total": round(
+                    sum(r for (_, r) in energies) / len(energies) if energies else -30, 2
+                ),
+            }
+
+        # 6. Smart clip range
+        clip_range = smart_clip_range(
+            chorus, transcript, duration,
+            target_duration=args.target_duration,
+            max_duration=args.max_duration,
+            min_duration=args.min_duration,
+        )
+        print(f"✂  Clip range: {clip_range['start']:.1f}s - {clip_range['end']:.1f}s "
+              f"(duration: {clip_range['duration']}s)", file=sys.stderr)
+
+        # 7. Fade params
+        fade = detect_audio_fade(clip_range, transcript)
+        print(f"🎚  Fade: in={fade['fade_in']}s, out={fade['fade_out']}s", file=sys.stderr)
+
+        result = {
+            "video": str(video),
+            "video_duration": duration,
+            "language": transcript["language"],
+            "language_probability": transcript["language_probability"],
+            "instrumental": instrumental,
+            "transcript": transcript,
+            "chorus": chorus,
+            "clip_range": clip_range,
+            "fade": fade,
+        }
+
+        if args.output:
+            with open(args.output, "w", encoding="utf-8") as f:
+                json.dump(result, f, ensure_ascii=False, indent=2)
+            print(f"💾 Saved: {args.output}", file=sys.stderr)
+
+        if args.json:
+            print(json.dumps(result, ensure_ascii=False))
+
+    finally:
+        try:
+            os.unlink(audio)
+        except Exception:
+            pass
+
+
+if __name__ == "__main__":
+    main()
--- a/scripts/clip.py
+++ b/scripts/clip.py
@@ -45,9 +45,11 @@ def parse_clips(spec):
 SCRIPT_DIR = Path(__file__).parent


-def run_clip(src, dst, start, duration, mode, lang, model, style, no_subs, quality):
+def run_clip(src, dst, start, duration, mode, lang, model, style, no_subs, quality,
+             fade_in=0.0, fade_out=0.0):
    """Naredi en klip src → dst."""
-    print(f"🎯 run_clip args: src={src}, dst={dst}, start={start!r}, duration={duration!r}, mode={mode}", file=sys.stderr)
+    print(f"🎯 run_clip args: src={src}, dst={dst}, start={start!r}, duration={duration!r}, "
+          f"mode={mode}, fade_in={fade_in}, fade_out={fade_out}", file=sys.stderr)
    tmp = tempfile.mkdtemp(prefix="reel_")
    try:
        reframed = Path(tmp) / "reframed.mp4"
@@ -63,6 +65,10 @@ def run_clip(src, dst, start, duration, mode, lang, model, style, no_subs, quali
            cmd += ["--start", str(start)]
        if duration is not None:
            cmd += ["--duration", str(duration)]
+        if fade_in > 0:
+            cmd += ["--fade-in", str(fade_in)]
+        if fade_out > 0:
+            cmd += ["--fade-out", str(fade_out)]
        print(f"🔧 REFRAME CMD: {' '.join(cmd)}", file=sys.stderr)
        print(f"\n▶  Klip: {dst.name}")
        r = subprocess.run(cmd)
@@ -97,6 +103,8 @@ def main():
    ap.add_argument("output", help="Datoteka (en klip) ali mapa (več klipov)")
    ap.add_argument("--start", type=str, default=None, help="Začetek (s ali mm:ss)")
    ap.add_argument("--duration", type=float, default=None, help="Trajanje v s")
+    ap.add_argument("--fade-in", type=float, default=0.0, help="Audio fade in (s)")
+    ap.add_argument("--fade-out", type=float, default=0.0, help="Audio fade out (s)")
    ap.add_argument("--clips", type=str, default=None,
                    help="Več klipov: '0:30-1:00,2:15-2:45'")
    ap.add_argument("--mode", default="track", choices=["track", "center", "blur"])
@@ -127,7 +135,8 @@ def main():
    else:
        start = parse_ts(args.start) if args.start else None
        run_clip(src, Path(args.output), start, args.duration, args.mode,
-                 args.lang, args.model, args.style, args.no_subs, args.quality)
+                 args.lang, args.model, args.style, args.no_subs, args.quality,
+                 fade_in=args.fade_in, fade_out=args.fade_out)


 if __name__ == "__main__":
--- a/scripts/reframe.py
+++ b/scripts/reframe.py
@@ -213,6 +213,8 @@ def main():
    ap.add_argument("--target-height", type=int, default=1920)
    ap.add_argument("--start", type=float, default=None, help="Začetek (s)")
    ap.add_argument("--duration", type=float, default=None, help="Trajanje (s)")
+    ap.add_argument("--fade-in", type=float, default=0.0, help="Audio fade in (s)")
+    ap.add_argument("--fade-out", type=float, default=0.0, help="Audio fade out (s)")
    ap.add_argument("--quality", default="medium", choices=["fast", "medium", "high"])
    args = ap.parse_args()

@@ -268,6 +270,16 @@ def main():
    preset = {"fast": "veryfast", "medium": "medium", "high": "slow"}[args.quality]
    crf = {"fast": "26", "medium": "21", "high": "18"}[args.quality]

+    # Audio fade filter (afade)
+    audio_filter = []
+    if args.fade_in > 0:
+        audio_filter.append(f"afade=t=in:st=0:d={args.fade_in}")
+    if args.fade_out > 0:
+        clip_dur = info["duration"]
+        fade_start = max(0, clip_dur - args.fade_out)
+        audio_filter.append(f"afade=t=out:st={fade_start}:d={args.fade_out}")
+    audio_filter_str = ",".join(audio_filter) if audio_filter else None
+
    if args.mode == "blur":
        # blur uporablja filter_complex
        cmd = [
@@ -275,18 +287,20 @@ def main():
            "-filter_complex", vfilter,
            "-c:v", "libx264", "-preset", preset, "-crf", crf,
            "-c:a", "aac", "-b:a", "128k",
-            "-movflags", "+faststart",
-            str(dst),
        ]
+        if audio_filter_str:
+            cmd += ["-af", audio_filter_str]
+        cmd += ["-movflags", "+faststart", str(dst)]
    else:
        cmd = [
            "ffmpeg", "-y", "-i", str(work_input),
            "-vf", vfilter,
            "-c:v", "libx264", "-preset", preset, "-crf", crf,
            "-c:a", "aac", "-b:a", "128k",
-            "-movflags", "+faststart",
-            str(dst),
        ]
+        if audio_filter_str:
+            cmd += ["-af", audio_filter_str]
+        cmd += ["-movflags", "+faststart", str(dst)]

    print(f"🎬 Render ({args.mode})...")
    result = subprocess.run(cmd, capture_output=True, text=True)
--- a/templates/index.html
+++ b/templates/index.html
@@ -216,12 +216,16 @@
        <div>
          <label>Jezik podnapisov</label>
          <select id="lang">
-            <option value="">Auto detect</option>
+            <option value="">Auto detect (Whisper)</option>
            <option value="sl">Slovenščina</option>
            <option value="de">Deutsch</option>
            <option value="en">English</option>
            <option value="hr">Hrvatski</option>
+            <option value="bs">Bosanski</option>
            <option value="sr">Српски</option>
+            <option value="it">Italiano</option>
+            <option value="es">Español</option>
+            <option value="fr">Français</option>
          </select>
        </div>
        <div>
@@ -238,8 +242,13 @@

      <label class="toggle" style="margin-top: 16px;">
        <input type="checkbox" id="auto-chorus" checked>
-        Avto-detekcija refrena (priporočeno za glasbo)
+        Pametna izbira odseka (Whisper + energy → najde refren)
      </label>
+      <div style="font-size: 12px; color: var(--text-dim); margin-top: 4px; margin-left: 26px;">
+        Sistem analizira celoten video, najde refren ter pre-chorus build-up.
+        Lahko traja malo dlje (do 1.5×) če to bolje prikazuje pesem.
+        Audio fade in/out je avtomatsko dodan.
+      </div>

      <div id="manual-times" class="row hidden">
        <div>
@@ -353,13 +362,18 @@

    // ─── Settings collector ─────────────────────────
    function collectSettings() {
+      const auto = $("#auto-chorus").checked;
+      const duration = parseFloat($("#duration").value) || 30;
      return {
        mode: $("#mode").value,
        lang: $("#lang").value || null,
        whisper_model: $("#model").value,
-        auto_chorus: $("#auto-chorus").checked,
-        start: !$("#auto-chorus").checked && $("#start").value ? parseTimestamp($("#start").value) : null,
-        duration: parseFloat($("#duration").value) || 30,
+        auto_chorus: auto,
+        start: !auto && $("#start").value ? parseTimestamp($("#start").value) : null,
+        duration: duration,
+        // Smart selection: max do 1.5x ciljno trajanje, min 0.7x
+        max_duration: auto ? Math.round(duration * 1.5) : duration,
+        min_duration: auto ? Math.round(duration * 0.7) : duration,
        subtitle_style: $("#subtitle-style").value,
        quality: $("#quality").value,
        no_subs: $("#no-subs").checked,