reels-app/scripts/reframe.py
Sebastjan Artič 30b969e4b8 Initial: reels clipper app
- FastAPI backend (auth, jobs, SSE, download)
- Frontend: drag&drop + YouTube URL + jobs panel
- Pipeline: yt_download → find_chorus → reframe → subtitle
- Modes: track (face follow), center, blur
- Whisper for SI/DE/EN subtitles
- Auto-chorus detection via Whisper + RMS energy
- Docker + Coolify ready
2026-04-28 15:28:22 +00:00

307 lines
11 KiB
Python

#!/usr/bin/env python3
"""
reframe.py — Pretvori 16:9 video v 9:16 (reels/shorts/tiktok format).
Modi:
--mode track : Pametno sledi obrazu/osebi (MediaPipe face detection)
Crop okno se gladko premika za subjektom.
--mode center : Statični center crop (najhitrejše)
--mode blur : 9:16 platno z blur ozadjem + 16:9 video v sredini
Primer:
python3 reframe.py input.mp4 output.mp4 --mode track
python3 reframe.py input.mp4 output.mp4 --mode track --start 10 --duration 30
"""
import argparse
import subprocess
import sys
import os
import json
import tempfile
from pathlib import Path
import cv2
import numpy as np
def get_video_info(path):
    """Probe a media file with ffprobe and return its basic video properties.

    Args:
        path: Location of the file to inspect (str or Path).

    Returns:
        dict with the keys "width" and "height" (int), "fps" (float) and
        "duration" (float, read from the container's format section).
        "fps" is 0.0 when ffprobe reports no usable frame rate.

    Raises:
        subprocess.CalledProcessError: ffprobe exited with a non-zero status.
        ValueError: the file contains no video stream.
    """
    cmd = [
        "ffprobe", "-v", "quiet", "-print_format", "json",
        "-show_streams", "-show_format", str(path),
    ]
    data = json.loads(subprocess.check_output(cmd))

    vstream = next(
        (s for s in data.get("streams", []) if s.get("codec_type") == "video"),
        None,
    )
    if vstream is None:
        # A bare StopIteration from next() would be a confusing failure mode.
        raise ValueError(f"No video stream found in {path}")

    # Frame rates are reported as "num/den". ffprobe may report "0/0" when the
    # rate is unknown, so try the average rate before giving up.
    fps = 0.0
    for key in ("r_frame_rate", "avg_frame_rate"):
        num, _, den = str(vstream.get(key, "")).partition("/")
        try:
            rate = float(num) / float(den or 1)
        except (ValueError, ZeroDivisionError):
            continue
        if rate > 0:
            fps = rate
            break

    return {
        "width": int(vstream["width"]),
        "height": int(vstream["height"]),
        "fps": fps,
        "duration": float(data["format"]["duration"]),
    }
def detect_face_centers(video_path, sample_fps=5):
    """Sample a video and locate the horizontal center of the largest face.

    Frames are read sequentially and roughly every ``src_fps / sample_fps``-th
    frame is run through OpenCV's Haar cascade (frontalface_alt2).

    Args:
        video_path: Video file to analyse (str or Path).
        sample_fps: Approximate number of frames per second to analyse. A
            non-positive value means every frame is analysed.

    Returns:
        A 5-tuple ``(samples, width, height, src_fps, total_frames)``:
        ``samples`` is a list of ``(timestamp, x_center)`` pairs where
        ``x_center`` is normalized to 0..1 (0 = left edge, 1 = right edge) or
        ``None`` when no face was found in that sample; the remaining items
        are the values reported by OpenCV for the capture.
    """
    # Used only for timestamps/stride when the container reports no frame
    # rate; timestamps are then approximate.
    fallback_fps = 30.0

    cap = cv2.VideoCapture(str(video_path))
    try:
        src_fps = cap.get(cv2.CAP_PROP_FPS)
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

        # `> 0` is also False for NaN, so both "0" and "unknown" fall back.
        timing_fps = src_fps if src_fps > 0 else fallback_fps
        step = max(1, int(timing_fps / sample_fps)) if sample_fps > 0 else 1

        cascade_path = cv2.data.haarcascades + "haarcascade_frontalface_alt2.xml"
        face_cascade = cv2.CascadeClassifier(cascade_path)

        samples = []
        frame_idx = 0
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            if frame_idx % step == 0:
                ts = frame_idx / timing_fps
                gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
                faces = face_cascade.detectMultiScale(
                    gray, scaleFactor=1.2, minNeighbors=5, minSize=(60, 60)
                )
                if len(faces) > 0:
                    # Keep only the largest face (by bounding-box area).
                    x, _y, w, _h = max(faces, key=lambda f: f[2] * f[3])
                    # Trust the decoded frame if the reported width is unusable.
                    frame_w = width if width > 0 else frame.shape[1]
                    samples.append((ts, (x + w / 2) / frame_w))
                else:
                    samples.append((ts, None))
            frame_idx += 1
    finally:
        # Release the decoder even when detection raises part-way through.
        cap.release()
    return samples, width, height, src_fps, total_frames
def smooth_track(samples, total_duration, smoothing_window=2.0):
    """Turn sparse face samples into a smooth, continuous x(t) function.

    Missing detections (``None``) are replaced by the most recent known
    position, or 0.5 (frame center) when nothing has been seen yet. The filled
    series is then smoothed with a centered moving average.

    Args:
        samples: Iterable of ``(timestamp, x)`` pairs with ``x`` normalized to
            0..1 or ``None``. Timestamps are expected in increasing order
            (required by ``np.interp``).
        total_duration: Unused; kept so existing callers keep working.
        smoothing_window: Width of the moving-average window, in the same
            unit as the timestamps.

    Returns:
        A function ``x_at(t) -> float`` that linearly interpolates the
        smoothed positions and holds the first/last value outside the sampled
        range. With no samples it always returns 0.5.
    """
    # Forward-fill gaps with the last known position.
    last = 0.5
    filled = []
    for ts, x in samples:
        if x is None:
            x = last
        else:
            last = x
        filled.append((ts, x))
    if not filled:
        return lambda t: 0.5

    # Force float arrays: with all-integer inputs an integer result buffer
    # would silently truncate the averaged positions.
    timestamps = np.array([t for t, _ in filled], dtype=float)
    values = np.array([v for _, v in filled], dtype=float)

    # Centered moving average over +/- half a window around each sample.
    half_window = smoothing_window / 2
    smoothed = np.array(
        [values[np.abs(timestamps - t) <= half_window].mean() for t in timestamps]
    )

    def x_at(t):
        # np.interp already clamps to the first/last value outside the range.
        return float(np.interp(t, timestamps, smoothed))

    return x_at
def build_track_filter(info, x_at, target_w, target_h, fps):
    """Build the FFmpeg ``-vf`` filter string for track mode.

    The video is first scaled so that it fully covers the target frame, then
    cropped to ``target_w x target_h`` with a horizontal offset that follows
    ``x_at(t)``. Because the crop offset must be an FFmpeg expression, x(t) is
    sampled and encoded as a piecewise-linear chain of nested ``if(lt(t,..))``
    terms.

    Args:
        info: dict with "width", "height" and "duration" of the source.
        x_at: Callable mapping a time to the normalized (0..1) horizontal
            center of the subject.
        target_w: Output width in pixels.
        target_h: Output height in pixels.
        fps: Unused; kept so existing callers keep working.

    Returns:
        The filter string ``scale=...,crop=...``.
    """
    src_w = info["width"]
    src_h = info["height"]
    duration = info["duration"]

    # Scale to *cover* the target. Integer arithmetic avoids float truncation
    # producing a frame one pixel too small for the crop. A source narrower
    # than the target aspect is fitted by width instead of by height, since
    # the crop would otherwise be wider than the scaled frame.
    if src_w * target_h >= target_w * src_h:
        scale_h = target_h
        scale_w = int(src_w * target_h // src_h)
    else:
        scale_w = target_w
        scale_h = int(src_h * target_w // src_w)
    max_x = scale_w - target_w  # largest valid left edge of the crop window
    y_top = (scale_h - target_h) // 2  # 0 unless the source was fitted by width

    # ~5 samples/s is plenty after smoothing; drop to 2/s for long videos to
    # keep the nested expression at a manageable size.
    rate = 5 if duration <= 120 else 2
    n_samples = max(2, int(duration * rate))
    times = np.linspace(0, duration, n_samples)

    # Normalized center -> left edge of the crop window, clamped to the frame.
    x_lefts = [
        max(0, min(max_x, x_at(t) * scale_w - target_w / 2)) for t in times
    ]

    # Build from the last segment backwards:
    #   if(t < t_i, lerp(x_{i-1}, x_i), <expression for later segments>)
    expr = f"{x_lefts[-1]:.1f}"
    for i in range(len(times) - 1, 0, -1):
        t0, t1 = times[i - 1], times[i]
        x0, x1 = x_lefts[i - 1], x_lefts[i]
        dt_txt = f"{t1 - t0:.3f}"
        dx_txt = f"{x1 - x0:.1f}"
        if float(dt_txt) == 0.0 or float(dx_txt) == 0.0:
            # Zero-length segment (would divide by zero once formatted) or no
            # movement: a constant is equivalent and shorter.
            lerp = f"{x0:.1f}"
        else:
            lerp = f"({x0:.1f}+({dx_txt})*(t-{t0:.3f})/{dt_txt})"
        expr = f"if(lt(t,{t1:.3f}),{lerp},{expr})"

    return (
        f"scale={scale_w}:{scale_h},"
        f"crop={target_w}:{target_h}:'{expr}':{y_top}"
    )
def build_center_filter(info, target_w, target_h):
    """Build the FFmpeg ``-vf`` filter string for a static center crop.

    The source is scaled so that it fully covers the target frame and the
    centered ``target_w x target_h`` region is cropped out.

    Args:
        info: dict with the source "width" and "height".
        target_w: Output width in pixels.
        target_h: Output height in pixels.

    Returns:
        The filter string ``scale=...,crop=...``.
    """
    src_w = info["width"]
    src_h = info["height"]
    # Fit by height for sources wider than the target aspect, by width for
    # narrower ones; otherwise the crop would exceed the scaled frame.
    if src_w * target_h >= target_w * src_h:
        scale_w, scale_h = int(src_w * target_h // src_h), target_h
    else:
        scale_w, scale_h = target_w, int(src_h * target_w // src_w)
    y_top = (scale_h - target_h) // 2
    return (
        f"scale={scale_w}:{scale_h},"
        f"crop={target_w}:{target_h}:(in_w-{target_w})/2:{y_top}"
    )
def build_blur_filter(info, target_w, target_h):
    """Build the FFmpeg ``-filter_complex`` graph for blur mode.

    The graph has three chains: a background that fills the whole target
    canvas with a cropped, Gaussian-blurred copy of the input; a foreground
    that is the input scaled to the canvas width; and an overlay that places
    the foreground on the background, vertically centered.

    Args:
        info: dict with the source "width" and "height".
        target_w: Canvas width in pixels.
        target_h: Canvas height in pixels.

    Returns:
        The filter graph as a single string.
    """
    width = info["width"]
    height = info["height"]
    # Foreground keeps the source aspect ratio at full canvas width.
    foreground_h = int(target_w * height / width)
    canvas = f"{target_w}:{target_h}"

    chains = (
        f"[0:v]scale={canvas}:force_original_aspect_ratio=increase,"
        f"crop={canvas},gblur=sigma=30[bg]",
        f"[0:v]scale={target_w}:{foreground_h}[fg]",
        "[bg][fg]overlay=0:(H-h)/2",
    )
    return ";".join(chains)
def main():
    """Command-line entry point: parse arguments, build the filter, render.

    Steps:
      1. Optionally trim the input to ``--start``/``--duration`` into a
         temporary file using a stream copy.
      2. Probe the (trimmed) input and build the filter for the chosen mode.
      3. Encode with libx264/AAC into the output path.

    Exits with status 1 when the input is missing or the render fails. The
    temporary trim file is removed on every exit path.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("input")
    ap.add_argument("output")
    ap.add_argument("--mode", choices=["track", "center", "blur"], default="track")
    ap.add_argument("--target-width", type=int, default=1080)
    ap.add_argument("--target-height", type=int, default=1920)
    ap.add_argument("--start", type=float, default=None, help="Začetek (s)")
    ap.add_argument("--duration", type=float, default=None, help="Trajanje (s)")
    ap.add_argument("--quality", default="medium", choices=["fast", "medium", "high"])
    args = ap.parse_args()

    src = Path(args.input)
    dst = Path(args.output)
    if not src.exists():
        print(f"❌ Vhod ne obstaja: {src}", file=sys.stderr)
        sys.exit(1)

    work_input = src
    tmp_path = None
    try:
        # With --start/--duration, trim into a temp file first (faster than
        # analysing and rendering the whole source).
        if args.start is not None or args.duration is not None:
            tmp = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)
            tmp.close()
            tmp_path = tmp.name
            trim_cmd = ["ffmpeg", "-y"]
            if args.start is not None:
                trim_cmd += ["-ss", str(args.start)]
            trim_cmd += ["-i", str(src)]
            if args.duration is not None:
                trim_cmd += ["-t", str(args.duration)]
            trim_cmd += ["-c", "copy", tmp_path]
            subprocess.run(trim_cmd, check=True, stderr=subprocess.DEVNULL)
            work_input = Path(tmp_path)
            print(f"✂ Trim → {work_input}")

        info = get_video_info(work_input)
        print(f"📹 Vhod: {info['width']}x{info['height']} @ {info['fps']:.2f}fps, {info['duration']:.1f}s")

        if args.mode == "track":
            print("🔍 Detektiram obraze (OpenCV)...")
            samples, _, _, _, _ = detect_face_centers(work_input, sample_fps=5)
            n_with_face = sum(1 for _, x in samples if x is not None)
            print(f" {n_with_face}/{len(samples)} vzorcev z obrazom")
            x_at = smooth_track(samples, info["duration"], smoothing_window=2.0)
            vfilter = build_track_filter(info, x_at, args.target_width, args.target_height, info["fps"])
        elif args.mode == "center":
            vfilter = build_center_filter(info, args.target_width, args.target_height)
        elif args.mode == "blur":
            vfilter = build_blur_filter(info, args.target_width, args.target_height)

        preset = {"fast": "veryfast", "medium": "medium", "high": "slow"}[args.quality]
        crf = {"fast": "26", "medium": "21", "high": "18"}[args.quality]

        # Blur mode produces a multi-chain graph and therefore needs
        # -filter_complex; the other modes are simple -vf chains.
        filter_flag = "-filter_complex" if args.mode == "blur" else "-vf"
        cmd = [
            "ffmpeg", "-y", "-i", str(work_input),
            filter_flag, vfilter,
            "-c:v", "libx264", "-preset", preset, "-crf", crf,
            "-c:a", "aac", "-b:a", "128k",
            "-movflags", "+faststart",
            str(dst),
        ]
        print(f"🎬 Render ({args.mode})...")
        result = subprocess.run(cmd, capture_output=True, text=True)
        if result.returncode != 0:
            print("❌ FFmpeg napaka:", file=sys.stderr)
            print(result.stderr[-2000:], file=sys.stderr)
            sys.exit(1)
    finally:
        # Runs on success, on sys.exit() and on exceptions, so the trimmed
        # copy never outlives the process.
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except FileNotFoundError:
                pass

    out_info = get_video_info(dst)
    out_size = dst.stat().st_size / 1024 / 1024
    print(f"✅ {dst} — {out_info['width']}x{out_info['height']}, {out_size:.1f} MB")


if __name__ == "__main__":
    main()