reels-app/app/cleanup.py
Claude 0d72d70f5d S3 mirror integration: workfiles auto-mirror to s3://folxspeed/reels-app/
- main.py: 4 helper funcs (_persist_to_s3, _ensure_local, _delete_from_s3,
  _ffmpeg_then_persist) - no-op fallback when S3 creds missing
- save_job(): mirror metadata JSON to S3
- process_job(): mirror YT download + render output + analysis/srt/ass to S3
- upload_video(): mirror direct uploads to S3
- _precache_edit_assets(): Popen->threaded with S3 sync after ffmpeg
- read endpoints (download, preview, source_video, waveform, preview_clip,
  get_transcript, recut_job): _ensure_local() fallback fetch from S3
- delete_job(): cascade delete to S3 (mirror unlink)
- cleanup.py: NEW module, deletes local files >48h that exist in S3.
  Verified by S3 head_object + size match. NOT YET ACTIVATED in cron.

Backward compat: lokalna mapa ostane primary. Brez env vars S3_* vsi
helperji vrnejo False (no-op). Production behavior identičen, dokler
ne dobi S3 creds.
2026-05-03 12:24:18 +00:00

168 lines
5.5 KiB
Python

"""
Cleanup module: removes local files older than N hours, *only if they exist in S3*.
Safe by design:
- Never deletes a local file unless its S3 mirror is verified
- Never deletes job metadata (jobs/*.json) — those are tiny
- Default age threshold is conservative (48 h)
- Dry-run mode for verification
Usage (cron):
python3 -m app.cleanup --apply # actual delete
python3 -m app.cleanup --dry-run # preview only (default)
python3 -m app.cleanup --apply --hours 72 # custom age threshold
Suggested cron (every night 03:30):
30 3 * * * cd /app && python3 -m app.cleanup --apply >> /data/cleanup.log 2>&1
"""
from __future__ import annotations
import argparse
import logging
import os
import sys
import time
from pathlib import Path
# Konfiguracija — ujema main.py
DATA_DIR = Path(os.environ.get("DATA_DIR", "/data"))
UPLOAD_DIR = DATA_DIR / "uploads"
OUTPUT_DIR = DATA_DIR / "outputs"
DEFAULT_AGE_HOURS = 48
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s [cleanup] %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
)
log = logging.getLogger(__name__)
def _file_age_hours(p: Path) -> float:
"""Return file age in hours based on mtime."""
try:
return (time.time() - p.stat().st_mtime) / 3600
except OSError:
return 0.0
def _scan_dir(d: Path, kind: str, min_age_h: float):
"""Yield (local_path, kind, age_hours) for files older than threshold."""
if not d.exists():
return
for p in d.iterdir():
if not p.is_file():
continue
# Skip very small files (probably config/state) — only target real workfiles
if p.stat().st_size < 1024 * 100: # <100 KB
continue
age = _file_age_hours(p)
if age >= min_age_h:
yield p, kind, age
def cleanup(min_age_hours: float, apply: bool) -> dict:
"""Run cleanup pass. Returns stats dict.
Logic:
For each file older than min_age_hours in uploads/ and outputs/:
- Verify S3 mirror exists (s3.exists())
- If verified: delete local
- If not verified: skip (warn) — never delete unverified
"""
from app import s3_storage
if not s3_storage.is_enabled():
log.error("S3 not configured — refusing to run cleanup. Aborting.")
return {"error": "s3_not_configured"}
stats = {
"scanned": 0,
"would_delete": 0,
"deleted": 0,
"skipped_no_s3_mirror": 0,
"freed_mb": 0.0,
"errors": 0,
}
deleted_files = []
skipped_files = []
for d, kind in [(UPLOAD_DIR, "upload"), (OUTPUT_DIR, "output")]:
for p, _, age_h in _scan_dir(d, kind, min_age_hours):
stats["scanned"] += 1
size_mb = p.stat().st_size / 1024 / 1024
folder = "uploads" if kind == "upload" else "outputs"
s3_key = f"{folder}/{p.name}"
# Verify S3 mirror
try:
s3_size = s3_storage.get_object_size(s3_key)
except Exception as e:
log.warning("S3 check failed for %s: %s", s3_key, e)
stats["errors"] += 1
continue
if s3_size is None:
log.warning("SKIP — no S3 mirror: %s (age %.1fh, %.1f MB)",
p.name, age_h, size_mb)
stats["skipped_no_s3_mirror"] += 1
skipped_files.append(p.name)
continue
# Check size match (sanity)
local_size = p.stat().st_size
if abs(s3_size - local_size) > 1024: # >1 KB delta — suspicious
log.warning("SKIP — size mismatch %s: local=%d s3=%d",
p.name, local_size, s3_size)
stats["skipped_no_s3_mirror"] += 1
skipped_files.append(p.name)
continue
stats["would_delete"] += 1
stats["freed_mb"] += size_mb
if apply:
try:
p.unlink()
stats["deleted"] += 1
deleted_files.append(p.name)
log.info("DEL %s (%.1f MB, age %.1fh, S3 verified)",
p.name, size_mb, age_h)
except OSError as e:
log.error("Delete failed %s: %s", p, e)
stats["errors"] += 1
else:
log.info("DRY %s (%.1f MB, age %.1fh, S3 verified)",
p.name, size_mb, age_h)
log.info("=" * 60)
log.info("Cleanup pass: %s", "APPLY" if apply else "DRY-RUN")
log.info(" scanned: %d", stats["scanned"])
log.info(" would-delete: %d", stats["would_delete"])
log.info(" deleted: %d", stats["deleted"])
log.info(" skipped (no S3): %d", stats["skipped_no_s3_mirror"])
log.info(" errors: %d", stats["errors"])
log.info(" freed: %.1f MB", stats["freed_mb"])
return stats
def main():
ap = argparse.ArgumentParser()
ap.add_argument("--apply", action="store_true",
help="Actually delete files (default is dry-run)")
ap.add_argument("--dry-run", action="store_true",
help="Preview only, do not delete (default)")
ap.add_argument("--hours", type=float, default=DEFAULT_AGE_HOURS,
help=f"Min file age in hours (default {DEFAULT_AGE_HOURS})")
args = ap.parse_args()
apply = args.apply and not args.dry_run
stats = cleanup(min_age_hours=args.hours, apply=apply)
if stats.get("error"):
sys.exit(2)
if __name__ == "__main__":
main()