# reels-app/scripts/reframe.py

#!/usr/bin/env python3
"""
reframe.py — Pretvori 16:9 video v 9:16 (reels/shorts/tiktok format).

Modi:
  --mode track    : Pametno sledi obrazu/osebi (MediaPipe face detection)
                    Crop okno se gladko premika za subjektom.
  --mode center   : Statični center crop (najhitrejše)
  --mode blur     : 9:16 platno z blur ozadjem + 16:9 video v sredini

Primer:
  python3 reframe.py input.mp4 output.mp4 --mode track
  python3 reframe.py input.mp4 output.mp4 --mode track --start 10 --duration 30
"""
import argparse
import subprocess
import sys
import os
import json
import tempfile
from pathlib import Path

import cv2
import numpy as np


def _parse_frame_rate(text):
    """Convert an ffprobe rational such as "30000/1001" to a float, or None if unusable."""
    num, _, den = str(text).partition("/")
    try:
        rate = float(num) / float(den or 1)
    except (ValueError, ZeroDivisionError):
        # Covers a missing value, non-numeric text, and rates like "0/0".
        return None
    return rate if rate > 0 else None


def get_video_info(path):
    """Probe a media file with ffprobe and return its basic video properties.

    Args:
        path: Path (str or Path) of the media file to inspect.

    Returns:
        dict with the keys "width" (int), "height" (int), "fps" (float) and
        "duration" (float, seconds). "fps" is 0.0 when ffprobe reports no
        usable frame rate for the stream.

    Raises:
        subprocess.CalledProcessError: ffprobe exited with a non-zero status.
        ValueError: the file has no video stream or no usable duration.
    """
    cmd = [
        "ffprobe", "-v", "quiet", "-print_format", "json",
        "-show_streams", "-show_format", str(path)
    ]
    data = json.loads(subprocess.check_output(cmd))

    # Use the first video stream; fail with a clear message instead of
    # letting a bare StopIteration escape.
    vstream = next(
        (s for s in data.get("streams", []) if s.get("codec_type") == "video"),
        None,
    )
    if vstream is None:
        raise ValueError(f"No video stream found in {path}")

    # Prefer r_frame_rate (as before); fall back to avg_frame_rate when the
    # former is missing or degenerate (e.g. "0/0").
    fps = None
    for key in ("r_frame_rate", "avg_frame_rate"):
        fps = _parse_frame_rate(vstream.get(key))
        if fps is not None:
            break
    if fps is None:
        fps = 0.0

    # Prefer the container duration, then the stream's own duration.
    raw_duration = data.get("format", {}).get("duration", vstream.get("duration"))
    try:
        duration = float(raw_duration)
    except (TypeError, ValueError):
        raise ValueError(f"ffprobe reported no duration for {path}") from None

    return {
        "width": int(vstream["width"]),
        "height": int(vstream["height"]),
        "fps": fps,
        "duration": duration,
    }


def detect_face_centers(video_path, sample_fps=5):
    """Sample a video and locate the largest face in each sampled frame.

    Uses the OpenCV Haar cascade (frontalface_alt2) shipped with cv2, so no
    external model file is needed.

    Args:
        video_path: Path of the video to analyse.
        sample_fps: Approximate number of frames per second to analyse;
            must be positive.

    Returns:
        A 5-tuple ``(samples, width, height, src_fps, total_frames)``.
        ``samples`` is a list of ``(timestamp, x_center)`` pairs where
        ``timestamp`` is ``frame_index / src_fps`` and ``x_center`` is the
        horizontal centre of the largest detected face divided by the frame
        width (0 = left edge, 1 = right edge), or None when no face was
        detected in that frame. The remaining items are the values reported
        by the capture object.

    Raises:
        ValueError: ``sample_fps`` is not positive, or frames can be read
            but the capture reports no usable frame rate (timestamps could
            not be computed).
    """
    if sample_fps <= 0:
        raise ValueError(f"sample_fps must be positive, got {sample_fps!r}")

    cap = cv2.VideoCapture(str(video_path))
    try:
        src_fps = cap.get(cv2.CAP_PROP_FPS)
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

        # A NaN or zero rate compares False here.
        fps_known = src_fps > 0
        # Analyse every `step`-th frame to approximate sample_fps.
        step = max(1, int(src_fps / sample_fps)) if fps_known else 1

        cascade_path = cv2.data.haarcascades + "haarcascade_frontalface_alt2.xml"
        face_cascade = cv2.CascadeClassifier(cascade_path)

        samples = []
        frame_idx = 0
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            if not fps_known:
                # Previously this surfaced as a bare ZeroDivisionError below.
                raise ValueError(
                    f"Cannot compute timestamps: unknown frame rate for {video_path}"
                )
            if frame_idx % step == 0:
                ts = frame_idx / src_fps
                gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
                faces = face_cascade.detectMultiScale(
                    gray, scaleFactor=1.2, minNeighbors=5, minSize=(60, 60)
                )
                if len(faces) > 0:
                    # Keep the face with the largest bounding-box area.
                    x, y, w, h = max(faces, key=lambda f: f[2] * f[3])
                    x_center = (x + w / 2) / width
                    samples.append((ts, x_center))
                else:
                    samples.append((ts, None))
            frame_idx += 1
    finally:
        # Release the capture even if detection raises part-way through.
        cap.release()

    return samples, width, height, src_fps, total_frames


def smooth_track(samples, total_duration, smoothing_window=2.0):
    """Turn sparse face-centre samples into a smooth, continuous x(t).

    Args:
        samples: Iterable of ``(timestamp, x)`` pairs in ascending timestamp
            order, where ``x`` is a horizontal position or None when nothing
            was detected at that timestamp.
        total_duration: Unused; kept so existing callers keep working.
        smoothing_window: Width, in seconds, of the centred moving-average
            window. Values <= 0 disable smoothing.

    Returns:
        A callable ``x_at(t) -> float``. It interpolates linearly between
        the smoothed samples and holds the first/last smoothed value outside
        the sampled range. With no samples it always returns 0.5.
    """
    # Fill gaps with the most recent detection (0.5 before the first one).
    last_seen = 0.5
    times = []
    positions = []
    for ts, x in samples:
        if x is None:
            x = last_seen
        else:
            last_seen = x
        times.append(ts)
        positions.append(x)

    if not times:
        return lambda t: 0.5

    # Force float arrays: with integer inputs the averages below would
    # otherwise be truncated when stored.
    timestamps = np.asarray(times, dtype=float)
    values = np.asarray(positions, dtype=float)

    # Centred moving average. Clamping the half-width at 0 guarantees every
    # window contains at least its own sample (no mean over an empty mask).
    half_window = max(0.0, smoothing_window) / 2
    smoothed = np.array([
        values[np.abs(timestamps - t) <= half_window].mean()
        for t in timestamps
    ])

    def x_at(t):
        if t <= timestamps[0]:
            return float(smoothed[0])
        if t >= timestamps[-1]:
            return float(smoothed[-1])
        return float(np.interp(t, timestamps, smoothed))

    return x_at


def _piecewise_expr(pieces, bounds):
    """Build an FFmpeg expression that selects one piece by the value of ``t``.

    ``pieces[k]`` applies while ``t < bounds[k]`` (and ``t`` is not below the
    previous bound); the final piece, which has no bound, applies from the
    last bound onward. ``pieces`` therefore has one more element than
    ``bounds``. The ``if(lt(t,..),..,..)`` calls are nested as a balanced
    binary tree, so nesting depth grows logarithmically with the number of
    pieces rather than linearly.
    """
    def build(lo, hi):
        if lo == hi:
            return pieces[lo]
        mid = (lo + hi) // 2
        return f"if(lt(t,{bounds[mid]}),{build(lo, mid)},{build(mid + 1, hi)})"

    return build(0, len(pieces) - 1)


def build_track_filter(info, x_at, target_w, target_h, fps):
    """Build the FFmpeg ``-vf`` filter string for track mode.

    The frame is first scaled so it covers the target size, then cropped to
    ``target_w`` x ``target_h`` with a crop x-offset that follows ``x_at``
    over time. The offset is a piecewise-linear function of the frame
    timestamp ``t`` written with FFmpeg's ``if``/``lt`` expression functions.

    Args:
        info: dict with "width", "height" and "duration" of the source.
        x_at: Callable mapping a time in seconds to the normalised (0..1)
            horizontal centre of the subject.
        target_w: Output width in pixels.
        target_h: Output height in pixels.
        fps: Unused; kept so existing callers keep working.

    Returns:
        The filter string ``scale=...,crop=...``.
    """
    src_w = info["width"]
    src_h = info["height"]

    # Scale so the frame covers the target in both dimensions. Integer
    # arithmetic avoids float truncation; max() guarantees the crop fits.
    if src_w * target_h >= src_h * target_w:
        # Source is at least as wide as the target aspect: match heights.
        scale_h = target_h
        scale_w = max(target_w, int(src_w * target_h // src_h))
    else:
        # Source is narrower than the target aspect: match widths instead,
        # otherwise the crop would be wider than the scaled frame.
        scale_w = target_w
        scale_h = max(target_h, int(src_h * target_w // src_w))
    max_x = scale_w - target_w  # largest valid left edge of the crop window

    # Sample x(t) at 5 per second, or 2 per second for videos over two
    # minutes to keep the expression size in check.
    duration = info["duration"]
    rate = 2 if duration > 120 else 5
    n_samples = max(2, int(duration * rate))
    times = np.linspace(0, duration, n_samples)

    # Normalised centre -> left edge of the crop window in scaled pixels,
    # clamped so the window stays inside the frame.
    x_lefts = [
        min(max_x, max(0.0, x_at(t) * scale_w - target_w / 2))
        for t in times
    ]

    # One linear segment per pair of neighbouring samples.
    pieces = []
    bounds = []
    for t0, t1, x0, x1 in zip(times, times[1:], x_lefts, x_lefts[1:]):
        span = f"{t1 - t0:.3f}"
        slope = f"{x1 - x0:.1f}"
        if float(span) == 0.0 or float(slope) == 0.0:
            # Constant segment; also avoids emitting a division by "0.000".
            pieces.append(f"{x0:.1f}")
        else:
            # lerp = x0 + (x1 - x0) * (t - t0) / (t1 - t0)
            pieces.append(f"({x0:.1f}+({slope})*(t-{t0:.3f})/{span})")
        bounds.append(f"{t1:.3f}")
    # Hold the last position from the final sample time onward.
    pieces.append(f"{x_lefts[-1]:.1f}")

    expr = _piecewise_expr(pieces, bounds)

    # Centre vertically only when the scaled frame is taller than the target.
    y_off = "0" if scale_h == target_h else f"(in_h-{target_h})/2"

    # The expression contains commas, so it is quoted for the filtergraph parser.
    vfilter = (
        f"scale={scale_w}:{scale_h},"
        f"crop={target_w}:{target_h}:'{expr}':{y_off}"
    )
    return vfilter


def build_center_filter(info, target_w, target_h):
    """Build the FFmpeg ``-vf`` filter string for a static centre crop.

    The frame is scaled so it covers the target size in both dimensions and
    then cropped around its centre.

    Args:
        info: dict with the source "width" and "height".
        target_w: Output width in pixels.
        target_h: Output height in pixels.

    Returns:
        The filter string ``scale=...,crop=...``.
    """
    src_w = info["width"]
    src_h = info["height"]

    # Integer arithmetic avoids float truncation; max() guarantees the crop
    # never exceeds the scaled frame.
    if src_w * target_h >= src_h * target_w:
        # Source is at least as wide as the target aspect: match heights.
        scale_h = target_h
        scale_w = max(target_w, int(src_w * target_h // src_h))
    else:
        # Source is narrower than the target aspect: match widths instead,
        # otherwise the crop would be wider than the scaled frame.
        scale_w = target_w
        scale_h = max(target_h, int(src_h * target_w // src_w))

    # Centre vertically only when the scaled frame is taller than the target.
    y_off = "0" if scale_h == target_h else f"(in_h-{target_h})/2"
    return (
        f"scale={scale_w}:{scale_h},"
        f"crop={target_w}:{target_h}:(in_w-{target_w})/2:{y_off}"
    )


def build_blur_filter(info, target_w, target_h):
    """Build the FFmpeg ``-filter_complex`` graph for blur mode.

    The graph has three chains: a background that scales the input to cover
    the target canvas, crops it and applies a Gaussian blur; a foreground
    that scales the input to the full target width; and an overlay that
    places the foreground vertically centred on the background.

    Args:
        info: dict with the source "width" and "height".
        target_w: Canvas width in pixels.
        target_h: Canvas height in pixels.

    Returns:
        The filter graph string.
    """
    source_width, source_height = info["width"], info["height"]
    # Foreground keeps the source aspect ratio at the full canvas width.
    foreground_height = int(target_w * source_height / source_width)

    background_chain = (
        f"[0:v]scale={target_w}:{target_h}:force_original_aspect_ratio=increase,"
        f"crop={target_w}:{target_h},gblur=sigma=30[bg]"
    )
    foreground_chain = f"[0:v]scale={target_w}:{foreground_height}[fg]"
    overlay_chain = "[bg][fg]overlay=0:(H-h)/2"
    return ";".join((background_chain, foreground_chain, overlay_chain))


def main():
    """Command-line entry point: parse arguments, optionally trim, then render.

    Steps:
      1. If --start/--duration is given, stream-copy that range into a
         temporary .mp4 and use it as the working input.
      2. Probe the working input and build the filter for the chosen mode.
      3. Encode with libx264/AAC into the output path.
      4. Print the size of the result.

    Exits with status 1 when the input does not exist or the render fails.
    The temporary trim file is removed on every exit path.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("input")
    ap.add_argument("output")
    ap.add_argument("--mode", choices=["track", "center", "blur"], default="track")
    ap.add_argument("--target-width", type=int, default=1080)
    ap.add_argument("--target-height", type=int, default=1920)
    ap.add_argument("--start", type=float, default=None, help="Začetek (s)")
    ap.add_argument("--duration", type=float, default=None, help="Trajanje (s)")
    ap.add_argument("--quality", default="medium", choices=["fast", "medium", "high"])
    args = ap.parse_args()

    src = Path(args.input)
    dst = Path(args.output)
    if not src.exists():
        print(f"❌ Vhod ne obstaja: {src}", file=sys.stderr)
        sys.exit(1)

    tmp_path = None
    try:
        # With --start/--duration, first trim into a temp file via stream
        # copy so that detection and rendering only touch that range.
        work_input = src
        if args.start is not None or args.duration is not None:
            tmp = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)
            tmp.close()
            tmp_path = tmp.name
            cmd = ["ffmpeg", "-y"]
            if args.start is not None:
                cmd += ["-ss", str(args.start)]
            cmd += ["-i", str(src)]
            if args.duration is not None:
                cmd += ["-t", str(args.duration)]
            cmd += ["-c", "copy", tmp_path]
            subprocess.run(cmd, check=True, stderr=subprocess.DEVNULL)
            work_input = Path(tmp_path)
            print(f"✂  Trim → {work_input}")

        info = get_video_info(work_input)
        print(f"📹 Vhod: {info['width']}x{info['height']} @ {info['fps']:.2f}fps, {info['duration']:.1f}s")

        if args.mode == "track":
            print("🔍 Detektiram obraze (OpenCV)...")
            samples, _, _, _, _ = detect_face_centers(work_input, sample_fps=5)
            n_with_face = sum(1 for _, x in samples if x is not None)
            print(f"   {n_with_face}/{len(samples)} vzorcev z obrazom")
            x_at = smooth_track(samples, info["duration"], smoothing_window=2.0)
            vfilter = build_track_filter(info, x_at, args.target_width, args.target_height, info["fps"])
        elif args.mode == "center":
            vfilter = build_center_filter(info, args.target_width, args.target_height)
        elif args.mode == "blur":
            vfilter = build_blur_filter(info, args.target_width, args.target_height)

        preset = {"fast": "veryfast", "medium": "medium", "high": "slow"}[args.quality]
        crf = {"fast": "26", "medium": "21", "high": "18"}[args.quality]

        # Blur mode builds a multi-chain graph, which needs -filter_complex;
        # the other modes are a single chain and use -vf.
        filter_flag = "-filter_complex" if args.mode == "blur" else "-vf"
        cmd = [
            "ffmpeg", "-y", "-i", str(work_input),
            filter_flag, vfilter,
            "-c:v", "libx264", "-preset", preset, "-crf", crf,
            "-c:a", "aac", "-b:a", "128k",
            "-movflags", "+faststart",
            str(dst),
        ]

        print(f"🎬 Render ({args.mode})...")
        result = subprocess.run(cmd, capture_output=True, text=True)
        if result.returncode != 0:
            print("❌ FFmpeg napaka:", file=sys.stderr)
            # Only the tail of stderr: that is where ffmpeg reports the failure.
            print(result.stderr[-2000:], file=sys.stderr)
            sys.exit(1)
    finally:
        # Runs on success, on sys.exit and on exceptions, so the temporary
        # trim file is never left behind.
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except FileNotFoundError:
                pass

    out_info = get_video_info(dst)
    out_size = dst.stat().st_size / 1024 / 1024
    print(f"✅ {dst} — {out_info['width']}x{out_info['height']}, {out_size:.1f} MB")


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()