From 3877b822ff22f86ec5a5997ee4622fb169957097 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sebastjan=20Arti=C4=8D?= <sebastjan@folx.tv>
Date: Wed, 29 Apr 2026 14:15:18 +0000
Subject: [PATCH] Smart download filenames: 'Artist - Title - REEL.mp4' +
 validation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two improvements:

1. DOWNLOAD FILENAME: instead of 'reel_<job-id>.mp4' (e.g. reel_25e076af7600.mp4),
   downloads now have descriptive names like:
   - 'Lady Gaga - Abracadabra - REEL.mp4'
   - 'Modrijani - S teboj - REEL.mp4'
   - 'Sarah Connor - FICKA - REEL.mp4'

2. PRE-UPLOAD VALIDATION: when filename doesn't follow 'Artist - Title' format,
   browser prompts user for both fields. Without them, upload is blocked.
   This prevents files with names like '12345.mp4' or 'video_final.mp4' from
   being processed without identifying info.

Implementation:
- parse_artist_title() helper handles common formats:
  - 'Artist - Title.mp4' / 'Artist – Title' (en dash) / 'Artist — Title' (em dash)
  - 'Artist | Title' / 'Artist : Title'
  - Strips noise: '(Official Music Video)', '(Audio)', '(HD)', '(Lyric Video)', '[Official Audio]'
- Client-side parser mirrors backend (validation before upload)
- Backend accepts artist + title form fields (override parsed)
- Job stored with parsed_artist + parsed_title + has_clean_name fields
- YouTube jobs auto-fetch title via yt_download.py --info-only and parse it
- Filename hint to Scribe/Claude uses parsed values (cleaner than raw filename)
- Download endpoint uses build_download_filename() for content-disposition
- Jobs list shows 'Artist — Title' instead of raw filename

Result: downloaded reels are auto-named correctly for Facebook/Instagram
upload, no more renaming files manually.
---
 app/main.py          | 147 +++++++++++++++++++++++++++++++++++++++++--
 templates/index.html |  91 ++++++++++++++++++++++++---
 2 files changed, 227 insertions(+), 11 deletions(-)
diff --git a/app/main.py b/app/main.py
index 58b6bff..b6fa38e 100644
--- a/app/main.py
+++ b/app/main.py
@@ -73,6 +73,97 @@ def check_auth(creds: HTTPBasicCredentials = Depends(security)):
     return creds.username
 
 
+# ────────────────────────────────────────────────────────────────
+# Artist + title parsing from a file name / YouTube title
+# ────────────────────────────────────────────────────────────────
+import re
+
+_NOISE_PATTERNS = [
+    # Common "noise" fragments that are removed before the name is split
+    r"\(Official\s+(?:Music\s+)?Video\)",
+    r"\(Officia[lk]\s+Audio\)",
+    r"\(Offizielles\s+(?:Musik)?[Vv]ideo\)",
+    r"\(Lyric[s]?\s+Video\)",
+    r"\(Audio\)",
+    r"\(HD\)", r"\(HQ\)", r"\(4K\)",
+    r"\(Live\)", r"\(Remix\)",
+    r"\(Remastered\)", r"\(Remaster(?:ed)?\s*\d{0,4}\)",
+    r"\[Official.*?\]", r"\[Music.*?\]", r"\[Audio.*?\]",
+    r"\bofficial\s+video\b", r"\bofficial\s+audio\b",
+    r"\boriginal\s+(?:video|audio)\b",
+    r"\bMV\b", r"\b4K\b", r"\bHD\b", r"\bHQ\b",
+]
+
+def parse_artist_title(filename_or_title):
+    """Extract ``(artist, title)`` from a file name or a YouTube title.
+
+    Supported patterns:
+      - "Artist - Title.mp4"
+      - "Artist - Title (Official Music Video).mp4"
+      - "Artist – Title" / "Artist — Title" (en dash / em dash)
+      - "Artist | Title" / "Artist : Title"
+
+    Returns ``(artist, title)``, or ``(None, None)`` when no separator yields
+    a non-empty artist (<= 80 chars) and title (<= 100 chars).
+    """
+    if not filename_or_title:
+        return (None, None)
+    # Strip only a real media extension: Path(...).stem treated every dot and
+    # slash as path syntax ("Mr. Probz - Waves" -> "Mr", "AC/DC - T.N.T" -> "DC - T.N").
+    name, is_file = re.subn(
+        r"\.(?:mp4|m4v|mov|mkv|avi|webm|mpe?g|wmv|flv|mp3|m4a|wav|flac|ogg)$",
+        "", filename_or_title.strip(), flags=re.IGNORECASE)
+    if is_file:  # a genuine file name: drop any directory prefix
+        name = re.split(r"[\\/]", name)[-1]
+
+    # Remove noise such as "(Official Video)", then collapse whitespace.
+    for pat in _NOISE_PATTERNS:
+        name = re.sub(pat, "", name, flags=re.IGNORECASE)
+    name = re.sub(r"\s+", " ", name).strip()
+
+    # Try separators in priority order, splitting on the first occurrence only.
+    for sep in [" - ", " – ", " — ", " | ", " : "]:
+        if sep in name:
+            # Trim stray punctuation left at either end of each part.
+            artist, title = (
+                re.sub(r'^[\s\-–—|.:_]+|[\s\-–—|.:_]+$', '', part)
+                for part in name.split(sep, 1))
+            if artist and title and len(artist) <= 80 and len(title) <= 100:
+                return (artist, title)
+    return (None, None)
+
+
+def safe_filename(s, max_len=80):
+    """Return ``s`` reduced to a filesystem-safe name of at most ``max_len`` chars."""
+    if not s:
+        return ""
+    # Drop characters that are invalid in file names, then collapse whitespace.
+    s = re.sub(r'[<>:"/\\|?*\x00-\x1f]', '', s)
+    s = re.sub(r'\s+', ' ', s).strip()
+    return s[:max_len].rstrip()  # the cut may land right after a space
+
+
+def build_download_filename(job):
+    """Build the download file name "Artist - Title - REEL.mp4" for a job.
+
+    Falls back to parsing filename / youtube_title, then to "reel_<id>.mp4".
+    """
+    artist = job.get("parsed_artist")
+    title = job.get("parsed_title")
+    if not artist or not title:
+        source = job.get("filename") or job.get("youtube_title") or ""
+        parsed_artist, parsed_title = parse_artist_title(source)
+        artist = artist or parsed_artist
+        title = title or parsed_title
+    # Sanitize first so a value of only forbidden characters cannot leave " - ".
+    artist, title = safe_filename(artist), safe_filename(title)
+    if artist and title:
+        return f"{artist} - {title} - REEL.mp4"
+    if title:
+        return f"{title} - REEL.mp4"
+    return f"reel_{job['id']}.mp4"
+
+
 # ────────────────────────────────────────────────────────────────
 # Job state (filesystem-based, persistent prek restartov)
 # ────────────────────────────────────────────────────────────────
@@ -220,6 +311,30 @@ def process_job(job_id):
             if not run_subprocess_logged(cmd, job_id, "YouTube download"):
                 return
             update_job(job_id, input_path=str(input_path))
+            
+            # Probaj dobiti YT naslov za artist+title parsing
+            try:
+                info_cmd = [
+                    "python3", str(SCRIPTS_DIR / "yt_download.py"),
+                    job["youtube_url"], "/dev/null", "--info-only",
+                ]
+                proc = subprocess.run(info_cmd, capture_output=True, text=True, timeout=30)
+                if proc.returncode == 0 and proc.stdout:
+                    info = json.loads(proc.stdout)
+                    yt_title = info.get("title", "")
+                    if yt_title:
+                        a, t = parse_artist_title(yt_title)
+                        updates = {"youtube_title": yt_title}
+                        if a:
+                            updates["parsed_artist"] = a
+                        if t:
+                            updates["parsed_title"] = t
+                        updates["has_clean_name"] = bool(a and t)
+                        update_job(job_id, **updates)
+                        # Reload job for downstream use
+                        job = load_job(job_id)
+            except Exception as e:
+                print(f"⚠️ Cannot fetch YT title: {e}", flush=True)
         else:
             input_path = Path(job["input_path"])
 
@@ -242,9 +357,11 @@ def process_job(job_id):
                 cmd += ["--llm-provider", job["llm_provider"]]
             if job.get("llm_model"):
                 cmd += ["--llm-model", job["llm_model"]]
-            # Filename hint = original filename (Claude lahko prepozna pesem)
-            if job.get("filename"):
-                # Brez extension
+            # Filename hint za Claude/Scribe — preferiraj parsed artist+title (čistejše)
+            if job.get("parsed_artist") and job.get("parsed_title"):
+                fn_hint = f"{job['parsed_artist']} - {job['parsed_title']}"
+                cmd += ["--filename-hint", fn_hint]
+            elif job.get("filename"):
                 fn_hint = Path(job["filename"]).stem
                 cmd += ["--filename-hint", fn_hint]
             # STT provider (elevenlabs = Scribe, local = faster-whisper, auto = preferiraj Scribe)
@@ -513,6 +630,8 @@ class StartJobIn(BaseModel):
 @app.post("/api/upload")
 async def upload_video(
     file: UploadFile = File(...),
+    artist: Optional[str] = Form(None),
+    title: Optional[str] = Form(None),
     user: str = Depends(check_auth),
 ):
     if not file.filename:
@@ -543,6 +662,22 @@ async def upload_video(
         "created_at": time.time(),
         "updated_at": time.time(),
     }
+    
+    # Artist + title — najprej user-provided, potem parse iz filename
+    if artist and title:
+        # User je vpisal ali potrdil
+        job["parsed_artist"] = artist.strip()
+        job["parsed_title"] = title.strip()
+        job["has_clean_name"] = True
+    else:
+        # Auto parse iz filename
+        a, t = parse_artist_title(file.filename)
+        if a:
+            job["parsed_artist"] = a
+        if t:
+            job["parsed_title"] = t
+        job["has_clean_name"] = bool(a and t)
+    
     save_job(job)
     return job
 
@@ -670,10 +805,14 @@ async def download(job_id: str, user: str = Depends(check_auth)):
     out = Path(job["output_path"])
     if not out.exists():
         raise HTTPException(404, "Output ne obstaja")
+    
+    # Pametno ime: "Izvajalec - Naslov - REEL.mp4"
+    download_name = build_download_filename(job)
+    
     return FileResponse(
         out,
         media_type="video/mp4",
-        filename=f"reel_{job_id}.mp4",
+        filename=download_name,
     )
 
 
diff --git a/templates/index.html b/templates/index.html
index 2a2c647..0f5fd80 100644
--- a/templates/index.html
+++ b/templates/index.html
@@ -443,11 +443,13 @@
     const dz = $("#dropzone");
     const fileInput = $("#file-input");
     let pendingFile = null;
+    let pendingArtist = null;
+    let pendingTitle = null;
+    
     dz.addEventListener("click", () => fileInput.click());
     fileInput.addEventListener("change", () => {
       if (fileInput.files[0]) {
-        pendingFile = fileInput.files[0];
-        dz.querySelector("div").textContent = `📹 ${pendingFile.name}`;
+        handleFileSelected(fileInput.files[0]);
       }
     });
     ["dragover", "dragenter"].forEach(ev =>
@@ -456,11 +458,82 @@
       dz.addEventListener(ev, e => { e.preventDefault(); dz.classList.remove("drag"); }));
     dz.addEventListener("drop", e => {
       const f = e.dataTransfer.files[0];
-      if (f) {
-        pendingFile = f;
-        dz.querySelector("div").textContent = `📹 ${f.name}`;
-      }
+      if (f) handleFileSelected(f);
     });
+    
+    // Client-side mirror of backend parse_artist_title(): same noise, separators, limits.
+    function parseArtistTitle(filename) {
+      if (!filename) return [null, null];
+      let name = filename.replace(/\.[^.]+$/, "");  // drop the extension
+      const noise = [
+        /\(Official\s+(?:Music\s+)?Video\)/gi,
+        /\(Officia[lk]\s+Audio\)/gi,
+        /\(Offizielles\s+(?:Musik)?[Vv]ideo\)/gi,
+        /\(Lyric[s]?\s+Video\)/gi,
+        /\(Audio\)/gi,
+        /\(HD\)|\(HQ\)|\(4K\)/gi,
+        /\(Live\)|\(Remix\)|\(Remaster(?:ed)?\s*\d{0,4}\)/gi,
+        /\[Official.*?\]|\[Music.*?\]|\[Audio.*?\]/gi,
+        /\bofficial\s+video\b|\bofficial\s+audio\b/gi,
+        /\boriginal\s+(?:video|audio)\b/gi,
+        /\bMV\b|\b4K\b|\bHD\b|\bHQ\b/gi,
+      ];
+      for (const pattern of noise) name = name.replace(pattern, "");
+      name = name.replace(/\s+/g, " ").trim();
+
+      // Stray punctuation/whitespace at either end of a part.
+      const edges = /^[\s\-–—|.:_]+|[\s\-–—|.:_]+$/g;
+      for (const sep of [" - ", " – ", " — ", " | ", " : "]) {
+        const at = name.indexOf(sep);
+        if (at === -1) continue;
+        // Split on the first occurrence only, like Python's split(sep, 1).
+        const artist = name.slice(0, at).trim().replace(edges, "");
+        const title = name.slice(at + sep.length).trim().replace(edges, "");
+        // Backend limits, counted in code points like Python's len().
+        if (artist && title && [...artist].length <= 80 && [...title].length <= 100) {
+          return [artist, title];
+        }
+      }
+      return [null, null];
+    }
+    
+    function handleFileSelected(f) {
+      // Names that cannot be parsed from the file name are requested from the
+      // user; cancelling either prompt aborts and keeps the previous selection.
+      let [artist, title] = parseArtistTitle(f.name);
+      if (!artist || !title) {
+        artist = (prompt(
+          `❗ Iz imena datoteke ni razviden izvajalec in naslov.\n\n` +
+          `Datoteka: "${f.name}"\n\n` +
+          `Vpiši IZVAJALCA (npr. "Lady Gaga"):`,
+          ""
+        ) || "").trim();
+        if (!artist) {
+          alert("⛔ Brez izvajalca ne morem nadaljevati.\n\nPoimenuj datoteko v formatu:\n  Izvajalec - Naslov.mp4");
+          fileInput.value = "";
+          return;
+        }
+        title = (prompt(`Vpiši NASLOV pesmi (npr. "Abracadabra"):`, "") || "").trim();
+        if (!title) {
+          alert("⛔ Brez naslova ne morem nadaljevati.");
+          fileInput.value = "";
+          return;
+        }
+      }
+      pendingArtist = artist;
+      pendingTitle = title;
+      pendingFile = f;
+      // Build the label from DOM nodes: the file name and the typed values are
+      // untrusted text and must never be parsed as HTML.
+      const label = dz.querySelector("div");
+      const heading = document.createElement("b");
+      heading.textContent = `${pendingArtist} — ${pendingTitle}`;
+      const meta = document.createElement("div");
+      meta.style.cssText = "font-size: 11px; color: var(--muted); margin-top: 4px;";
+      meta.textContent = `${f.name} (${(f.size/1024/1024).toFixed(1)} MB)`;
+      label.textContent = "📹 ";
+      label.append(heading, meta);
+    }
 
     // ─── Settings collector ─────────────────────────
     function collectSettings() {
@@ -610,6 +683,8 @@
           }
           const fd = new FormData();
           fd.append("file", pendingFile);
+          if (pendingArtist) fd.append("artist", pendingArtist);
+          if (pendingTitle) fd.append("title", pendingTitle);
 
           showLive("Nalaganje datoteke", `${pendingFile.name} (${(pendingFile.size / 1024 / 1024).toFixed(1)} MB)`, 0);
 
@@ -755,7 +830,9 @@
 
       const title = job.source_type === "youtube"
         ? (job.youtube_url || "YouTube")
-        : (job.filename || job.id);
+        : (job.parsed_artist && job.parsed_title 
+            ? `${job.parsed_artist} — ${job.parsed_title}` 
+            : (job.filename || job.id));
 
       const sizeStr = job.output_size_mb ? `${job.output_size_mb} MB` :
                       job.size_mb ? `${job.size_mb} MB` : "";