diff --git a/app/main.py b/app/main.py index b35ac2f..b510617 100644 --- a/app/main.py +++ b/app/main.py @@ -154,37 +154,57 @@ def process_job(job_id): else: input_path = Path(job["input_path"]) - # ── 2. Find chorus (če auto) ────────────────────────── + # ── 2. Smart analysis (če auto_chorus) ────────────────────────── if job.get("auto_chorus"): - update_job(job_id, current_step="Iščem refren (Whisper + energy)") + update_job(job_id, current_step="Analiza pesmi (transkript + energija)") + analysis_path = OUTPUT_DIR / f"{job_id}.analysis.json" cmd = [ - "python3", str(SCRIPTS_DIR / "find_chorus.py"), + "python3", str(SCRIPTS_DIR / "analyze.py"), str(input_path), - "--duration", str(job.get("duration", 30)), - "--json", + "--target-duration", str(job.get("duration", 30)), + "--max-duration", str(job.get("max_duration", 45)), + "--min-duration", str(job.get("min_duration", 20)), + "--output", str(analysis_path), ] - if job.get("lang"): + # lang: če None ali 'auto', pusti analyze.py auto-detect + if job.get("lang") and job["lang"] not in ("auto", ""): cmd += ["--lang", job["lang"]] cmd += ["--model", job.get("whisper_model", "small")] proc = subprocess.run(cmd, capture_output=True, text=True) - if proc.returncode == 0: + if proc.returncode == 0 and analysis_path.exists(): try: - chorus = json.loads(proc.stdout) - if chorus.get("candidates"): - best = chorus["candidates"][0] - update_job( - job_id, - chorus_detection=chorus, - start=best["start"], - duration=best["duration"], - ) - # KLJUČNO: reload local job dict, da nove vrednosti pridejo v reframe call - job = load_job(job_id) - except json.JSONDecodeError: - update_job(job_id, chorus_error="JSON decode failed") + with open(analysis_path, "r", encoding="utf-8") as f: + analysis = json.load(f) + cr = analysis["clip_range"] + fade = analysis["fade"] + update_job( + job_id, + analysis_summary={ + "language": analysis["language"], + "language_probability": analysis["language_probability"], + "instrumental": 
analysis["instrumental"], + "clip_range": cr, + "fade": fade, + "chorus_preview": analysis["chorus"]["best"]["text_preview"] + if analysis.get("chorus") and analysis["chorus"].get("best") else None, + }, + start=cr["start"], + duration=cr["duration"], + fade_in=fade["fade_in"], + fade_out=fade["fade_out"], + detected_language=analysis["language"], + is_instrumental=analysis["instrumental"], + ) + # Auto-disable subs za instrumental + if analysis["instrumental"] and not job.get("no_subs"): + update_job(job_id, no_subs=True, auto_disabled_subs=True) + # Reload local dict + job = load_job(job_id) + except (json.JSONDecodeError, KeyError) as e: + update_job(job_id, chorus_error=f"Analysis parse: {e}") else: - update_job(job_id, chorus_error=proc.stderr[-300:]) + update_job(job_id, chorus_error=(proc.stderr or "")[-500:]) # ── 3. Reframe + subtitles (clip.py orchestrator) ───── output_path = OUTPUT_DIR / f"{job_id}.mp4" @@ -201,8 +221,16 @@ def process_job(job_id): cmd += ["--start", str(job["start"])] if job.get("duration") is not None: cmd += ["--duration", str(job["duration"])] - if job.get("lang"): - cmd += ["--lang", job["lang"]] + if job.get("fade_in", 0) > 0: + cmd += ["--fade-in", str(job["fade_in"])] + if job.get("fade_out", 0) > 0: + cmd += ["--fade-out", str(job["fade_out"])] + # lang: prefer detected_language če auto + chosen_lang = job.get("lang") + if chosen_lang in (None, "auto", ""): + chosen_lang = job.get("detected_language") + if chosen_lang: + cmd += ["--lang", chosen_lang] if job.get("no_subs"): cmd += ["--no-subs"] cmd += ["--model", job.get("whisper_model", "small")] @@ -269,10 +297,12 @@ class YouTubeJobIn(BaseModel): class StartJobIn(BaseModel): job_id: str mode: str = "track" - lang: Optional[str] = None + lang: Optional[str] = None # None/auto = Whisper auto-detect auto_chorus: bool = True start: Optional[float] = None duration: Optional[float] = 30 + max_duration: Optional[float] = 45 # Smart selection lahko gre do 45s + min_duration: 
#!/usr/bin/env python3
"""
analyze.py — Pre-analysis of the WHOLE video before trimming.

Steps:
1. Whisper transcript of the full video (auto-detected or user-specified language)
2. Energy profile (RMS dB over 1 s windows)
3. Structural detection (vocal sections, chorus scoring, energy peaks)
4. Smart clip-range selection (may exceed the target duration, includes pre-chorus)
5. Instrumental-track detection

Output: JSON with the data for clip.py
"""

import argparse
import contextlib
import json
import math
import os
import re
import subprocess
import sys
import tempfile
from pathlib import Path

# Sample rate of the intermediate WAV; extract_audio() and
# compute_energy_profile() must agree on it.
SAMPLE_RATE = 16000

_PTS_RE = re.compile(r"pts_time:(\S+)")
_WORD_RE = re.compile(r"\b\w+\b")


def get_video_duration(path):
    """Return the container duration in seconds as reported by ffprobe.

    Returns 0.0 when ffprobe's output cannot be parsed as a float.
    """
    r = subprocess.run(
        ["ffprobe", "-v", "error", "-show_entries", "format=duration",
         "-of", "default=nw=1:nokey=1", str(path)],
        capture_output=True, text=True
    )
    try:
        return float(r.stdout.strip())
    except ValueError:
        return 0.0


def extract_audio(video_path):
    """Extract the audio track to a 16 kHz mono PCM WAV for Whisper + energy.

    Returns the path of a temporary file; the caller is responsible for
    deleting it. Raises subprocess.CalledProcessError when ffmpeg fails.
    """
    audio = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    audio.close()
    try:
        subprocess.run(
            ["ffmpeg", "-y", "-i", str(video_path), "-vn",
             "-ac", "1", "-ar", str(SAMPLE_RATE), "-c:a", "pcm_s16le", audio.name],
            check=True, capture_output=True
        )
    except BaseException:
        # The caller never receives the path on failure, so clean up here
        # instead of leaking the temp file.
        with contextlib.suppress(OSError):
            os.unlink(audio.name)
        raise
    return audio.name


def transcribe_full(audio_path, lang=None, model_size="small"):
    """Transcribe the whole audio file with faster-whisper.

    lang=None lets Whisper auto-detect the language. Returns a dict with
    "language", "language_probability" and "segments"; every segment has
    "start", "end", "text" and a list of "words" (start/end/text).
    """
    # Imported lazily so the rest of the module works without the dependency.
    from faster_whisper import WhisperModel

    print(f"🧠 Whisper {model_size}, lang={lang or 'auto'}", file=sys.stderr)
    m = WhisperModel(model_size, device="cpu", compute_type="int8")
    segs, info = m.transcribe(
        audio_path,
        language=lang,
        word_timestamps=True,
        vad_filter=True,
    )
    detected_lang = info.language
    detected_prob = info.language_probability
    print(f"   Detekcija: {detected_lang} (p={detected_prob:.2f})", file=sys.stderr)

    segments = []
    for s in segs:
        words = [
            {"start": w.start, "end": w.end, "text": w.word}
            for w in (s.words or [])
        ]
        segments.append({
            "start": s.start,
            "end": s.end,
            "text": s.text.strip(),
            "words": words,
        })

    return {
        "language": detected_lang,
        "language_probability": detected_prob,
        "segments": segments,
    }


def _parse_astats_output(output, window_sec):
    """Turn ffmpeg `ametadata=print` text into a list of (timestamp, rms_db)."""
    energies = []
    current_pts = 0.0
    for line in output.split("\n"):
        line = line.strip()
        m = _PTS_RE.search(line)
        if m:
            try:
                current_pts = float(m.group(1))
            except ValueError:
                pass
            continue
        if "RMS_level=" not in line:
            continue
        val = line.split("RMS_level=")[-1].strip()
        try:
            rms = float(val)
        except ValueError:
            continue
        # Silence is reported as -inf (and occasionally nan); clamp to -90 dB.
        if rms < -90 or math.isnan(rms):
            rms = -90.0
        energies.append((current_pts, rms))
        # Fallback timestamp in case the next value has no pts_time line.
        current_pts += window_sec
    return energies


def compute_energy_profile(audio_path, window_sec=1.0):
    """Measure RMS level (dB) per window_sec seconds of audio.

    Returns a list of (timestamp, rms_db) tuples.
    """
    cmd = [
        "ffmpeg", "-i", audio_path,
        # asetnsamples makes every frame exactly one window long, so astats
        # has to reset after ONE frame; `reset` counts frames, not seconds.
        "-af", f"asetnsamples=n={int(SAMPLE_RATE * window_sec)}:p=0,"
               f"astats=metadata=1:reset=1,"
               f"ametadata=print:key=lavfi.astats.Overall.RMS_level:file=-",
        "-f", "null", "-",
    ]
    result = subprocess.run(cmd, capture_output=True, text=True)
    output = result.stdout + "\n" + result.stderr
    return _parse_astats_output(output, window_sec)


def detect_vocal_sections(segments, max_gap=3.0):
    """Merge consecutive transcript segments into "vocal sections".

    A new section starts whenever the silence between two segments exceeds
    max_gap seconds. Each section has "start", "end", "segments" and the
    space-joined "text".
    """
    if not segments:
        return []
    sections = []
    current = {
        "start": segments[0]["start"],
        "end": segments[0]["end"],
        "segments": [segments[0]],
        "text": segments[0]["text"],
    }
    for seg in segments[1:]:
        if seg["start"] - current["end"] > max_gap:
            sections.append(current)
            current = {
                "start": seg["start"],
                "end": seg["end"],
                "segments": [seg],
                "text": seg["text"],
            }
        else:
            current["end"] = seg["end"]
            current["segments"].append(seg)
            current["text"] += " " + seg["text"]
    sections.append(current)
    return sections


def avg_energy_in_range(energies, start, end):
    """Return the mean RMS of samples with start <= t <= end (-90.0 if none)."""
    vals = [r for (t, r) in energies if start <= t <= end]
    if not vals:
        return -90.0
    return sum(vals) / len(vals)


def score_section_as_chorus(section, all_sections, energies, avg_rms):
    """Score a vocal section as a chorus candidate (higher = more likely).

    Factors:
    - repeated words inside the section (low unique-word ratio)
    - energy above the track average
    - other sections sharing most of its vocabulary (the chorus repeats)
    - section duration (3-25 s scores best)

    Returns 0 for a section without any words.
    """
    words = _WORD_RE.findall(section["text"].lower())
    if not words:
        return 0
    unique_words = set(words)

    # Low unique ratio means lots of repetition, typical for a chorus.
    unique_ratio = len(unique_words) / len(words)
    chorus_signal = max(0, (1.0 - unique_ratio) * 30)

    sec_energy = avg_energy_in_range(energies, section["start"], section["end"])
    energy_score = max(0, sec_energy - avg_rms) * 8

    repeat_count = 0
    for other in all_sections:
        if other is section:
            continue
        common = unique_words & set(_WORD_RE.findall(other["text"].lower()))
        # More than half of the vocabulary shared: probably the same chorus.
        if len(common) >= len(unique_words) * 0.5 and len(common) >= 3:
            repeat_count += 1
    repeat_score = repeat_count * 25

    duration = section["end"] - section["start"]
    if 3 <= duration <= 25:
        length_score = 10
    elif duration > 25:
        length_score = 5
    else:
        length_score = 2

    return chorus_signal + energy_score + repeat_score + length_score


def find_chorus(transcript, energies, video_duration):
    """Pick the most chorus-like vocal section.

    Returns None when the transcript has no segments, otherwise a dict with
    "best", "all_candidates" (top 10 by score) and "avg_rms_total".
    video_duration is accepted for interface compatibility and not used.
    """
    sections = detect_vocal_sections(transcript["segments"])
    if not sections:
        return None

    avg_rms = sum(r for (_, r) in energies) / len(energies) if energies else -30.0

    candidates = []
    for sec in sections:
        score = score_section_as_chorus(sec, sections, energies, avg_rms)
        candidates.append({
            "start": sec["start"],
            "end": sec["end"],
            "duration": sec["end"] - sec["start"],
            "text_preview": sec["text"][:80],
            "score": round(score, 2),
            "avg_rms": round(avg_energy_in_range(energies, sec["start"], sec["end"]), 2),
        })

    # Highest score first.
    candidates.sort(key=lambda c: -c["score"])

    return {
        "best": candidates[0],
        "all_candidates": candidates[:10],
        "avg_rms_total": round(avg_rms, 2),
    }


def smart_clip_range(chorus, transcript, video_duration,
                     target_duration=30, max_duration=45, min_duration=20):
    """Choose the clip range around the detected chorus.

    Logic:
    1. Start with the chorus as the core.
    2. If it is shorter than min_duration, append nearby following sections.
    3. Prepend the section right before the chorus (build-up) if it fits.
    4. If still too short, pad both sides; padding that cannot be placed on
       one side (clamped at 0 or at the video end) moves to the other side.
    5. Cap at max_duration and clamp to the video bounds.

    Without a chorus the middle target_duration seconds are used. The result
    always contains "start", "end", "duration" and "reason".
    """
    if not chorus or not chorus.get("best"):
        # Fallback: take the middle of the video.
        mid = video_duration / 2
        start = max(0, mid - target_duration / 2)
        end = min(video_duration, start + target_duration)
        return {
            "start": round(start, 2),
            "end": round(end, 2),
            # Callers read "duration" unconditionally, so it must be present.
            "duration": round(end - start, 2),
            "reason": "fallback_middle",
        }

    best = chorus["best"]
    sections = detect_vocal_sections(transcript["segments"])

    actual_start = best["start"]
    actual_end = best["end"]

    # 1. Core chorus too short: absorb sections that follow within 5 s.
    if actual_end - actual_start < min_duration:
        for sec in sections:
            if sec["start"] > actual_end and sec["start"] - actual_end < 5:
                if sec["end"] - actual_start <= max_duration:
                    actual_end = sec["end"]
                    if actual_end - actual_start >= min_duration:
                        break

    # 2. Prepend the last section ending within 8 s before the chorus.
    pre_section = None
    for sec in sections:
        if sec["end"] <= actual_start and actual_start - sec["end"] < 8:
            pre_section = sec
    if pre_section:
        candidate_start = pre_section["start"]
        if actual_end - candidate_start <= max_duration:
            actual_start = candidate_start

    # 3. Still too short: pad symmetrically, then hand any padding that was
    #    clamped away to the side that still has room.
    if actual_end - actual_start < min_duration:
        deficit = min_duration - (actual_end - actual_start)
        actual_start = max(0, actual_start - deficit / 2)
        actual_end = min(video_duration, actual_end + deficit / 2)
        eps = 1e-9  # ignore float noise from the arithmetic above
        remaining = min_duration - (actual_end - actual_start)
        if remaining > eps:
            actual_end = min(video_duration, actual_end + remaining)
            remaining = min_duration - (actual_end - actual_start)
        if remaining > eps:
            actual_start = max(0, actual_start - remaining)

    # 4. Trim to the maximum length.
    if actual_end - actual_start > max_duration:
        actual_end = actual_start + max_duration

    # Snap to video bounds.
    actual_start = max(0, actual_start)
    actual_end = min(video_duration, actual_end)

    return {
        "start": round(actual_start, 2),
        "end": round(actual_end, 2),
        "duration": round(actual_end - actual_start, 2),
        "reason": "smart_chorus_with_prebuild",
        "chorus_start": round(best["start"], 2),
        "chorus_end": round(best["end"], 2),
    }


def detect_audio_fade(clip_range, transcript):
    """Choose audio fade-in/fade-out lengths for the clip.

    - clip starts inside a transcript segment -> 0.5 s fade-in, else 0.2 s
    - clip ends inside a transcript segment   -> 1.5 s fade-out, else 0.3 s
    """
    cs, ce = clip_range["start"], clip_range["end"]
    segments = transcript["segments"]

    starts_in_vocal = any(seg["start"] <= cs <= seg["end"] for seg in segments)
    ends_in_vocal = any(seg["start"] <= ce <= seg["end"] for seg in segments)

    fade_in = 0.5 if starts_in_vocal else 0.2
    fade_out = 1.5 if ends_in_vocal else 0.3

    return {"fade_in": fade_in, "fade_out": fade_out}


def is_instrumental(transcript, video_duration, threshold=0.1):
    """Tell whether the track is instrumental.

    True when there are no transcript segments, or when their summed duration
    is below threshold * video_duration (the duration is floored at 1 s to
    avoid dividing by zero).
    """
    if not transcript.get("segments"):
        return True
    vocal_duration = sum(
        s["end"] - s["start"] for s in transcript["segments"]
    )
    ratio = vocal_duration / max(video_duration, 1)
    return ratio < threshold


def _energy_peak_window(energies, duration, window, step=5):
    """Return (start, avg_rms) of the loudest `window`-second span.

    Candidate starts advance by `step` seconds. When no full window fits in
    `duration`, (0, -100) is returned.
    """
    best_start = 0
    best_avg = -100
    t = 0
    while t + window <= duration:
        avg = avg_energy_in_range(energies, t, t + window)
        if avg > best_avg:
            best_avg = avg
            best_start = t
        t += step
    return best_start, best_avg


def main():
    """CLI entry point: analyze one video and emit the result as JSON.

    The JSON goes to --output when given; it is printed to stdout when
    --json is given or when no --output was requested, so the analysis is
    never computed and then thrown away.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("video", help="Vhod video file")
    ap.add_argument("--lang", default=None, help="ISO 639-1 ali 'auto' (default: auto)")
    ap.add_argument("--model", default="small", help="Whisper model")
    ap.add_argument("--target-duration", type=float, default=30.0)
    ap.add_argument("--max-duration", type=float, default=45.0)
    ap.add_argument("--min-duration", type=float, default=20.0)
    ap.add_argument("--json", action="store_true", help="Output JSON")
    ap.add_argument("--output", help="Path za JSON output")
    args = ap.parse_args()

    video = Path(args.video)
    if not video.exists():
        print(f"❌ Video ne obstaja: {video}", file=sys.stderr)
        sys.exit(1)

    duration = get_video_duration(video)
    print(f"📹 Video: {video.name}, {duration:.1f}s", file=sys.stderr)
    if duration <= 0:
        # Every later step clamps against the duration; with 0 the clip range
        # would come out empty or negative, so stop with a clear error.
        print(f"❌ Trajanja videa ni mogoče določiti: {video}", file=sys.stderr)
        sys.exit(1)

    # 1. Extract audio
    audio = extract_audio(video)

    try:
        # 2. Whisper transcript
        lang = None if args.lang in (None, "auto", "") else args.lang
        transcript = transcribe_full(audio, lang=lang, model_size=args.model)
        print(f"   Transkripcija: {len(transcript['segments'])} segmentov", file=sys.stderr)

        # 3. Energy profile
        print("⚡ Energy profile...", file=sys.stderr)
        energies = compute_energy_profile(audio)
        print(f"   Energy samples: {len(energies)}", file=sys.stderr)

        # 4. Instrumental detection
        instrumental = is_instrumental(transcript, duration)
        print(f"🎵 Instrumentalna: {instrumental}", file=sys.stderr)

        # 5. Find chorus (lyrics-based unless the track is instrumental)
        if not instrumental:
            chorus = find_chorus(transcript, energies, duration)
        else:
            # Instrumental: use the window with the highest average energy.
            window = args.target_duration
            best_start, best_avg = _energy_peak_window(energies, duration, window)
            chorus = {
                "best": {
                    "start": best_start,
                    "end": best_start + window,
                    "duration": window,
                    "text_preview": "(instrumental — energy peak)",
                    "score": 0,
                    "avg_rms": round(best_avg, 2),
                },
                "all_candidates": [],
                "avg_rms_total": round(
                    sum(r for (_, r) in energies) / len(energies) if energies else -30, 2
                ),
            }

        # 6. Smart clip range
        clip_range = smart_clip_range(
            chorus, transcript, duration,
            target_duration=args.target_duration,
            max_duration=args.max_duration,
            min_duration=args.min_duration,
        )
        print(f"✂ Clip range: {clip_range['start']:.1f}s - {clip_range['end']:.1f}s "
              f"(duration: {clip_range['duration']}s)", file=sys.stderr)

        # 7. Fade params
        fade = detect_audio_fade(clip_range, transcript)
        print(f"🎚 Fade: in={fade['fade_in']}s, out={fade['fade_out']}s", file=sys.stderr)

        result = {
            "video": str(video),
            "video_duration": duration,
            "language": transcript["language"],
            "language_probability": transcript["language_probability"],
            "instrumental": instrumental,
            "transcript": transcript,
            "chorus": chorus,
            "clip_range": clip_range,
            "fade": fade,
        }

        if args.output:
            with open(args.output, "w", encoding="utf-8") as f:
                json.dump(result, f, ensure_ascii=False, indent=2)
            print(f"💾 Saved: {args.output}", file=sys.stderr)

        if args.json or not args.output:
            print(json.dumps(result, ensure_ascii=False))

    finally:
        # Best-effort removal of the temporary WAV.
        with contextlib.suppress(OSError):
            os.unlink(audio)


if __name__ == "__main__":
    main()
subprocess.run(cmd) @@ -97,6 +103,8 @@ def main(): ap.add_argument("output", help="Datoteka (en klip) ali mapa (več klipov)") ap.add_argument("--start", type=str, default=None, help="Začetek (s ali mm:ss)") ap.add_argument("--duration", type=float, default=None, help="Trajanje v s") + ap.add_argument("--fade-in", type=float, default=0.0, help="Audio fade in (s)") + ap.add_argument("--fade-out", type=float, default=0.0, help="Audio fade out (s)") ap.add_argument("--clips", type=str, default=None, help="Več klipov: '0:30-1:00,2:15-2:45'") ap.add_argument("--mode", default="track", choices=["track", "center", "blur"]) @@ -127,7 +135,8 @@ def main(): else: start = parse_ts(args.start) if args.start else None run_clip(src, Path(args.output), start, args.duration, args.mode, - args.lang, args.model, args.style, args.no_subs, args.quality) + args.lang, args.model, args.style, args.no_subs, args.quality, + fade_in=args.fade_in, fade_out=args.fade_out) if __name__ == "__main__": diff --git a/scripts/reframe.py b/scripts/reframe.py index 5e9570e..ff26d18 100644 --- a/scripts/reframe.py +++ b/scripts/reframe.py @@ -213,6 +213,8 @@ def main(): ap.add_argument("--target-height", type=int, default=1920) ap.add_argument("--start", type=float, default=None, help="Začetek (s)") ap.add_argument("--duration", type=float, default=None, help="Trajanje (s)") + ap.add_argument("--fade-in", type=float, default=0.0, help="Audio fade in (s)") + ap.add_argument("--fade-out", type=float, default=0.0, help="Audio fade out (s)") ap.add_argument("--quality", default="medium", choices=["fast", "medium", "high"]) args = ap.parse_args() @@ -268,6 +270,16 @@ def main(): preset = {"fast": "veryfast", "medium": "medium", "high": "slow"}[args.quality] crf = {"fast": "26", "medium": "21", "high": "18"}[args.quality] + # Audio fade filter (afade) + audio_filter = [] + if args.fade_in > 0: + audio_filter.append(f"afade=t=in:st=0:d={args.fade_in}") + if args.fade_out > 0: + clip_dur = info["duration"] + 
fade_start = max(0, clip_dur - args.fade_out) + audio_filter.append(f"afade=t=out:st={fade_start}:d={args.fade_out}") + audio_filter_str = ",".join(audio_filter) if audio_filter else None + if args.mode == "blur": # blur uporablja filter_complex cmd = [ @@ -275,18 +287,20 @@ def main(): "-filter_complex", vfilter, "-c:v", "libx264", "-preset", preset, "-crf", crf, "-c:a", "aac", "-b:a", "128k", - "-movflags", "+faststart", - str(dst), ] + if audio_filter_str: + cmd += ["-af", audio_filter_str] + cmd += ["-movflags", "+faststart", str(dst)] else: cmd = [ "ffmpeg", "-y", "-i", str(work_input), "-vf", vfilter, "-c:v", "libx264", "-preset", preset, "-crf", crf, "-c:a", "aac", "-b:a", "128k", - "-movflags", "+faststart", - str(dst), ] + if audio_filter_str: + cmd += ["-af", audio_filter_str] + cmd += ["-movflags", "+faststart", str(dst)] print(f"🎬 Render ({args.mode})...") result = subprocess.run(cmd, capture_output=True, text=True) diff --git a/templates/index.html b/templates/index.html index bf8ea76..2dd1b86 100644 --- a/templates/index.html +++ b/templates/index.html @@ -216,12 +216,16 @@
@@ -238,8 +242,13 @@ +
+ Sistem analizira celoten video, najde refren ter pre-chorus build-up. + Lahko traja malo dlje (do 1.5×) če to bolje prikazuje pesem. + Audio fade in/out je avtomatsko dodan. +