S3 mirror integration: workfiles auto-mirror to s3://folxspeed/reels-app/

- main.py: 4 helper funcs (_persist_to_s3, _ensure_local, _delete_from_s3, _ffmpeg_then_persist) - no-op fallback when S3 creds missing - save_job(): mirror metadata JSON to S3 - process_job(): mirror YT download + render output + analysis/srt/ass to S3 - upload_video(): mirror direct uploads to S3 - _precache_edit_assets(): Popen->threaded with S3 sync after ffmpeg - read endpoints (download, preview, source_video, waveform, preview_clip, get_transcript, recut_job): _ensure_local() fallback fetch from S3 - delete_job(): cascade delete to S3 (mirror unlink) - cleanup.py: NEW module, deletes local files >48h that exist in S3. Verified by S3 head_object + size match. NOT YET ACTIVATED in cron. Backward compat: lokalna mapa ostane primary. Brez env vars S3_* vsi helperji vrnejo False (no-op). Production behavior identičen, dokler ne dobi S3 creds.
2026-05-03 12:24:18 +00:00 · 2026-05-03 12:24:18 +00:00 · 0d72d70f5d
commit 0d72d70f5d
parent ec1d109e3b
2 changed files with 318 additions and 13 deletions
--- a/app/cleanup.py
+++ b/app/cleanup.py
@ -0,0 +1,167 @@
 """
 Cleanup module: removes local files older than N hours, *only if they exist in S3*.
 Safe by design:
 - Never deletes a local file unless its S3 mirror is verified
 - Never deletes job metadata (jobs/*.json) — those are tiny
 - Default age threshold is conservative (48 h)
 - Dry-run mode for verification
 Usage (cron):
    python3 -m app.cleanup --apply           # actual delete
    python3 -m app.cleanup --dry-run         # preview only (default)
    python3 -m app.cleanup --apply --hours 72  # custom age threshold
 Suggested cron (every night 03:30):
    30 3 * * * cd /app && python3 -m app.cleanup --apply >> /data/cleanup.log 2>&1
 """
 from __future__ import annotations
 import argparse
 import logging
 import os
 import sys
 import time
 from pathlib import Path
 # Konfiguracija — ujema main.py
 DATA_DIR = Path(os.environ.get("DATA_DIR", "/data"))
 UPLOAD_DIR = DATA_DIR / "uploads"
 OUTPUT_DIR = DATA_DIR / "outputs"
 DEFAULT_AGE_HOURS = 48
 logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [cleanup] %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
 )
 log = logging.getLogger(__name__)
 def _file_age_hours(p: Path) -> float:
    """Return file age in hours based on mtime."""
    try:
        return (time.time() - p.stat().st_mtime) / 3600
    except OSError:
        return 0.0
 def _scan_dir(d: Path, kind: str, min_age_h: float):
    """Yield (local_path, kind, age_hours) for files older than threshold."""
    if not d.exists():
        return
    for p in d.iterdir():
        if not p.is_file():
            continue
        # Skip very small files (probably config/state) — only target real workfiles
        if p.stat().st_size < 1024 * 100:  # <100 KB
            continue
        age = _file_age_hours(p)
        if age >= min_age_h:
            yield p, kind, age
 def cleanup(min_age_hours: float, apply: bool) -> dict:
    """Run cleanup pass. Returns stats dict.
    Logic:
      For each file older than min_age_hours in uploads/ and outputs/:
        - Verify S3 mirror exists (s3.exists())
        - If verified: delete local
        - If not verified: skip (warn) — never delete unverified
    """
    from app import s3_storage
    if not s3_storage.is_enabled():
        log.error("S3 not configured — refusing to run cleanup. Aborting.")
        return {"error": "s3_not_configured"}
    stats = {
        "scanned": 0,
        "would_delete": 0,
        "deleted": 0,
        "skipped_no_s3_mirror": 0,
        "freed_mb": 0.0,
        "errors": 0,
    }
    deleted_files = []
    skipped_files = []
    for d, kind in [(UPLOAD_DIR, "upload"), (OUTPUT_DIR, "output")]:
        for p, _, age_h in _scan_dir(d, kind, min_age_hours):
            stats["scanned"] += 1
            size_mb = p.stat().st_size / 1024 / 1024
            folder = "uploads" if kind == "upload" else "outputs"
            s3_key = f"{folder}/{p.name}"
            # Verify S3 mirror
            try:
                s3_size = s3_storage.get_object_size(s3_key)
            except Exception as e:
                log.warning("S3 check failed for %s: %s", s3_key, e)
                stats["errors"] += 1
                continue
            if s3_size is None:
                log.warning("SKIP — no S3 mirror: %s (age %.1fh, %.1f MB)",
                            p.name, age_h, size_mb)
                stats["skipped_no_s3_mirror"] += 1
                skipped_files.append(p.name)
                continue
            # Check size match (sanity)
            local_size = p.stat().st_size
            if abs(s3_size - local_size) > 1024:  # >1 KB delta — suspicious
                log.warning("SKIP — size mismatch %s: local=%d s3=%d",
                            p.name, local_size, s3_size)
                stats["skipped_no_s3_mirror"] += 1
                skipped_files.append(p.name)
                continue
            stats["would_delete"] += 1
            stats["freed_mb"] += size_mb
            if apply:
                try:
                    p.unlink()
                    stats["deleted"] += 1
                    deleted_files.append(p.name)
                    log.info("DEL %s (%.1f MB, age %.1fh, S3 verified)",
                             p.name, size_mb, age_h)
                except OSError as e:
                    log.error("Delete failed %s: %s", p, e)
                    stats["errors"] += 1
            else:
                log.info("DRY %s (%.1f MB, age %.1fh, S3 verified)",
                         p.name, size_mb, age_h)
    log.info("=" * 60)
    log.info("Cleanup pass: %s", "APPLY" if apply else "DRY-RUN")
    log.info("  scanned:               %d", stats["scanned"])
    log.info("  would-delete:          %d", stats["would_delete"])
    log.info("  deleted:               %d", stats["deleted"])
    log.info("  skipped (no S3):       %d", stats["skipped_no_s3_mirror"])
    log.info("  errors:                %d", stats["errors"])
    log.info("  freed:                 %.1f MB", stats["freed_mb"])
    return stats
 def main():
    ap = argparse.ArgumentParser()
    ap.add_argument("--apply", action="store_true",
                    help="Actually delete files (default is dry-run)")
    ap.add_argument("--dry-run", action="store_true",
                    help="Preview only, do not delete (default)")
    ap.add_argument("--hours", type=float, default=DEFAULT_AGE_HOURS,
                    help=f"Min file age in hours (default {DEFAULT_AGE_HOURS})")
    args = ap.parse_args()
    apply = args.apply and not args.dry_run
    stats = cleanup(min_age_hours=args.hours, apply=apply)
    if stats.get("error"):
        sys.exit(2)
 if __name__ == "__main__":
    main()
--- a/app/main.py
+++ b/app/main.py
@ -19,6 +19,7 @@ import os
 import secrets
 import shutil
 import subprocess
 import threading
 import time
 import uuid
 from pathlib import Path
@ -55,6 +56,75 @@ QNET_DIR.mkdir(parents=True, exist_ok=True)
 os.environ.setdefault("QNET_LOOKUP_PATH", str(QNET_DIR / "songs_lookup.json"))
 from app import qnet_match  # noqa: E402
 # S3 storage mirror — uploads/outputs/jobs gredo tudi v s3://folxspeed/reels-app/
 # Lokalna mapa ostane primary, S3 je replica/cache.
 from app import s3_storage  # noqa: E402
 def _persist_to_s3(local_path, kind: str) -> bool:
    """Mirror local file to S3 after producing it (best-effort).
    kind: 'upload' for uploads/, 'output' for outputs/, 'job_meta' for jobs/
    Silent no-op when S3 not configured. Never raises.
    """
    try:
        if not s3_storage.is_enabled():
            return False
        p = Path(local_path)
        if not p.exists() or p.stat().st_size == 0:
            return False
        return s3_storage.upload_job_file("", kind, p)
    except Exception as e:
        print(f"⚠️ S3 mirror failed for {local_path}: {e}", flush=True)
        return False
 def _ensure_local(local_path, kind: str) -> bool:
    """Make sure file is on disk; if missing, fetch from S3.
    Returns True if file is ready locally after the call.
    kind: 'upload' or 'output'
    """
    p = Path(local_path)
    if p.exists() and p.stat().st_size > 0:
        return True
    if not s3_storage.is_enabled():
        return False
    folder = {"upload": "uploads", "output": "outputs", "job_meta": "jobs"}.get(kind, kind)
    key = f"{folder}/{p.name}"
    print(f"📥 Local missing, fetching from S3: {key}", flush=True)
    return s3_storage.download(key, p)
 def _delete_from_s3(filename: str, kind: str) -> bool:
    """Delete object from S3 (mirror local delete). Best-effort, no raise."""
    try:
        if not s3_storage.is_enabled():
            return False
        folder = {"upload": "uploads", "output": "outputs", "job_meta": "jobs"}.get(kind, kind)
        return s3_storage.delete(f"{folder}/{filename}")
    except Exception:
        return False
 def _ffmpeg_then_persist(cmd, out_path, kind: str = "output", timeout: int = 600):
    """Run ffmpeg in background thread, then mirror result to S3.
    Drop-in replacement for subprocess.Popen() when we want S3 sync after.
    """
    def runner():
        try:
            subprocess.run(
                cmd,
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
                timeout=timeout,
            )
            _persist_to_s3(out_path, kind)
        except Exception as e:
            print(f"⚠️ ffmpeg/persist failed for {out_path}: {e}", flush=True)
    threading.Thread(target=runner, daemon=True).start()
 # Dedup DB — sledi že obdelanim/naloženim komadom
 DEDUP_DB = DATA_DIR / "processed.db"
@ -346,7 +416,9 @@ def load_job(job_id):
 def save_job(job):
-    job_path(job["id"]).write_text(json.dumps(job, ensure_ascii=False, indent=2))
+    p = job_path(job["id"])
    p.write_text(json.dumps(job, ensure_ascii=False, indent=2))
    _persist_to_s3(p, "job_meta")
 def update_job(job_id, **kwargs):
@ -593,8 +665,8 @@ def _precache_edit_assets(job_id: str, src_path: str):
            "-loglevel", "error",
            str(low_path),
        ]
-        subprocess.Popen(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+        _ffmpeg_then_persist(cmd, low_path, kind="output")
-        print(f"📦 Pre-cache low-q source za {job_id} (background)", flush=True)
+        print(f"📦 Pre-cache low-q source za {job_id} (background → S3)", flush=True)
    # Waveform PNG (2400x72 — za zoom)
    wave_path = OUTPUT_DIR / f"{job_id}_waveform_2400x72.png"
@ -611,8 +683,8 @@ def _precache_edit_assets(job_id: str, src_path: str):
            "-loglevel", "error",
            str(wave_path),
        ]
-        subprocess.Popen(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+        _ffmpeg_then_persist(cmd, wave_path, kind="output")
-        print(f"📦 Pre-cache waveform za {job_id} (background)", flush=True)
+        print(f"📦 Pre-cache waveform za {job_id} (background → S3)", flush=True)
 def process_job(job_id):
@ -633,6 +705,11 @@ def process_job(job_id):
            if not run_subprocess_logged(cmd, job_id, "YouTube download"):
                return
            update_job(job_id, input_path=str(input_path))
            # S3 mirror — original video + info.json
            _persist_to_s3(input_path, "upload")
            info_json = input_path.with_suffix(".info.json")
            if info_json.exists():
                _persist_to_s3(info_json, "upload")
            # Probaj dobiti YT metadata (če še ni iz submit-a) — title, uploader, id, ...
            # Single video submit ali playlist resolve že nastavi metadata, ampak
@ -926,6 +1003,12 @@ def process_job(job_id):
                output_path=str(output_path),
                output_size_mb=round(output_path.stat().st_size / 1024 / 1024, 2),
            )
            # S3 mirror — final reel + pomožne datoteke (analysis, subtitles)
            _persist_to_s3(output_path, "output")
            for suffix in (".analysis.json", ".subtitles.srt", ".subtitles.ass"):
                aux = OUTPUT_DIR / f"{job_id}{suffix}"
                if aux.exists():
                    _persist_to_s3(aux, "output")
            # Telegram obvestilo
            try:
                from app.telegram import notify_job_done
@ -1375,6 +1458,8 @@ async def upload_video(
            job["has_clean_name"] = bool(a and t)
    save_job(job)
    # S3 mirror — direct upload datoteka
    _persist_to_s3(input_path, "upload")
    return job
@ -1683,6 +1768,7 @@ async def download(job_id: str, user: str = Depends(check_auth)):
    if not job or job.get("status") != "done":
        raise HTTPException(404, "Ne pripravljen")
    out = Path(job["output_path"])
    _ensure_local(out, "output")
    if not out.exists():
        raise HTTPException(404, "Output ne obstaja")
@ -1703,6 +1789,7 @@ async def preview(job_id: str, request: Request, user: str = Depends(check_auth)
    if not job or job.get("status") != "done":
        raise HTTPException(404, "Ne pripravljen")
    out = Path(job["output_path"])
    _ensure_local(out, "output")
    if not out.exists():
        raise HTTPException(404, "Output ne obstaja")
@ -1762,11 +1849,41 @@ async def delete_job(job_id: str, user: str = Depends(check_auth)):
    job = load_job(job_id)
    if not job:
        raise HTTPException(404, "Ne obstaja")
-    for key in ("input_path", "output_path"):
+    # Glavni input + output (po job records)
    for key, kind in (("input_path", "upload"), ("output_path", "output")):
        p = job.get(key)
-        if p and Path(p).exists():
+        if p:
-            Path(p).unlink(missing_ok=True)
+            local_p = Path(p)
-    job_path(job_id).unlink(missing_ok=True)
+            local_p.unlink(missing_ok=True)
            _delete_from_s3(local_p.name, kind)
    # Pomožne datoteke v outputs/ (analysis, subtitles, low-q, waveform)
    for fname in (
        f"{job_id}.mp4",
        f"{job_id}.analysis.json",
        f"{job_id}.subtitles.srt",
        f"{job_id}.subtitles.ass",
        f"{job_id}_source_low.mp4",
    ):
        f = OUTPUT_DIR / fname
        f.unlink(missing_ok=True)
        _delete_from_s3(fname, "output")
    # Waveform PNG-ji (več velikosti) — listanje ker imena niso fiksna
    try:
        for wf in OUTPUT_DIR.glob(f"{job_id}_waveform_*.png"):
            wf_name = wf.name
            wf.unlink(missing_ok=True)
            _delete_from_s3(wf_name, "output")
    except Exception:
        pass
    # YT info.json
    info_json = UPLOAD_DIR / f"{job_id}_yt.info.json"
    if info_json.exists():
        info_json.unlink(missing_ok=True)
    _delete_from_s3(f"{job_id}_yt.info.json", "upload")
    # Job metadata
    jp = job_path(job_id)
    jp.unlink(missing_ok=True)
    _delete_from_s3(f"{job_id}.json", "job_meta")
    return {"deleted": job_id}
@ -1782,12 +1899,17 @@ async def source_video(job_id: str, quality: str = "high", user: str = Depends(c
    if not job:
        raise HTTPException(404, "Ne obstaja")
    src = job.get("input_path")
-    if not src or not Path(src).exists():
+    if not src:
        raise HTTPException(404, "Original video ne obstaja")
    _ensure_local(src, "upload")
    if not Path(src).exists():
        raise HTTPException(404, "Original video ne obstaja")
    if quality == "low":
        # 480p cached za hitro scrubbanje
        cache_path = OUTPUT_DIR / f"{job_id}_source_low.mp4"
        # Najprej probaj fetch iz S3 (po cleanupu lahko manjka lokalno)
        _ensure_local(cache_path, "output")
        cache_valid = cache_path.exists() and cache_path.stat().st_size > 1024
        if not cache_valid:
@ -1812,6 +1934,8 @@ async def source_video(job_id: str, quality: str = "high", user: str = Depends(c
                    if cache_path.exists():
                        cache_path.unlink()
                    raise HTTPException(500, f"FFmpeg failed: {(proc.stderr or 'unknown')[-300:]}")
                # Mirror v S3 po regeneraciji
                _persist_to_s3(cache_path, "output")
            except subprocess.TimeoutExpired:
                if cache_path.exists():
                    cache_path.unlink()
@ -1842,13 +1966,17 @@ async def waveform(job_id: str, width: int = 1200, height: int = 80, user: str =
    if not job:
        raise HTTPException(404, "Ne obstaja")
    src = job.get("input_path")
-    if not src or not Path(src).exists():
+    if not src:
        raise HTTPException(404, "Original video ne obstaja")
    _ensure_local(src, "upload")
    if not Path(src).exists():
        raise HTTPException(404, "Original video ne obstaja")
    width = max(400, min(width, 3000))
    height = max(40, min(height, 200))
    cache_path = OUTPUT_DIR / f"{job_id}_waveform_{width}x{height}.png"
    _ensure_local(cache_path, "output")
    cache_valid = cache_path.exists() and cache_path.stat().st_size > 100
    if not cache_valid:
@ -1872,6 +2000,8 @@ async def waveform(job_id: str, width: int = 1200, height: int = 80, user: str =
                if cache_path.exists():
                    cache_path.unlink()
                raise HTTPException(500, f"Waveform render failed: {(proc.stderr or 'unknown')[-300:]}")
            # Mirror v S3 po regeneraciji
            _persist_to_s3(cache_path, "output")
        except subprocess.TimeoutExpired:
            if cache_path.exists():
                cache_path.unlink()
@ -1903,7 +2033,10 @@ async def preview_clip(
    if not job:
        raise HTTPException(404, "Ne obstaja")
    src = job.get("input_path")
-    if not src or not Path(src).exists():
+    if not src:
        raise HTTPException(404, "Original video ne obstaja")
    _ensure_local(src, "upload")
    if not Path(src).exists():
        raise HTTPException(404, "Original video ne obstaja")
    if end <= start:
@ -1978,6 +2111,7 @@ async def get_transcript(job_id: str, user: str = Depends(check_auth)):
    if not job:
        raise HTTPException(404, "Ne obstaja")
    analysis_path = OUTPUT_DIR / f"{job_id}.analysis.json"
    _ensure_local(analysis_path, "output")
    if not analysis_path.exists():
        raise HTTPException(404, "Analysis ne obstaja")
    try:
@ -2217,7 +2351,10 @@ async def recut_job(job_id: str, payload: RecutRequest, user: str = Depends(chec
        raise HTTPException(404, "Ne obstaja")
    src = job.get("input_path")
-    if not src or not Path(src).exists():
+    if not src:
        raise HTTPException(400, "Original video manjka")
    _ensure_local(src, "upload")
    if not Path(src).exists():
        raise HTTPException(400, "Original video manjka")
    if payload.end <= payload.start:
@ -2232,6 +2369,7 @@ async def recut_job(job_id: str, payload: RecutRequest, user: str = Depends(chec
    # Naloži obstoječi analysis
    analysis_path = OUTPUT_DIR / f"{job_id}.analysis.json"
    _ensure_local(analysis_path, "output")
    if not analysis_path.exists():
        raise HTTPException(500, "Analysis manjka — re-uplad pesmi")