diff --git a/app/main.py b/app/main.py index ef13ca4..89d88db 100644 --- a/app/main.py +++ b/app/main.py @@ -79,19 +79,43 @@ def check_auth(creds: HTTPBasicCredentials = Depends(security)): import re _NOISE_PATTERNS = [ - # Pogosti "noise" ki ga je treba odstraniti - r"\(Official\s+(?:Music\s+)?Video\)", - r"\(Officia[lk]\s+Audio\)", - r"\(Offizielles\s+(?:Musik)?[Vv]ideo\)", + # Variacije "official" z možnimi tipkarskimi napakami (offiicial, offical, oficial, ...) + # Match liberalno: "off" + 0-3 dodatnih črk + "icial" + "video"/"audio" + opcijsko številka + r"\(Off[a-z]*icial\s+(?:Music\s+|HD\s+|4K\s+)?(?:Video|Audio)\s*\)", + r"\(Off[a-z]*icia[lk]\s+(?:Music\s+|HD\s+|4K\s+)?(?:Video|Audio)\)", + r"\bOff[a-z]*icial\s+(?:Music\s+|HD\s+|4K\s+)?(?:Video|Audio)\b", + + # Nemške variacije + r"\(Offizielles?\s+(?:Musik)?[Vv]ideo\)", + r"\bOffizielles?\s+(?:Musik)?[Vv]ideo\b", + + # Lyric videos r"\(Lyric[s]?\s+Video\)", + r"\bLyric[s]?\s+Video\b", + + # Audio quality / version markers r"\(Audio\)", - r"\(HD\)", r"\(HQ\)", r"\(4K\)", - r"\(Live\)", r"\(Remix\)", + r"\(HD\)", r"\(HQ\)", r"\(4K\)", r"\(8K\)", r"\(1080p?\)", + r"\(Live\)", r"\(Remix\)", r"\(Cover\)", r"\(Acoustic\)", r"\(Remastered\)", r"\(Remaster(?:ed)?\s*\d{0,4}\)", - r"\[Official.*?\]", r"\[Music.*?\]", r"\[Audio.*?\]", - r"\bofficial\s+video\b", r"\bofficial\s+audio\b", + r"\(Extended(?:\s+Mix)?\)", + r"\(Radio(?:\s+Edit)?\)", + r"\(Clean(?:\s+Version)?\)", + r"\(Explicit\)", + + # Square brackets variations + r"\[Official[^\]]*\]", + r"\[Music[^\]]*\]", + r"\[Audio[^\]]*\]", + r"\[HD\]", r"\[HQ\]", r"\[4K\]", + r"\[Lyric[s]?[^\]]*\]", + + # Bare words (without brackets) r"\boriginal\s+(?:video|audio)\b", - r"\bMV\b", r"\b4K\b", r"\bHD\b", r"\bHQ\b", + r"\bMV\b", + + # Trailing year in parens (npr. "(2024)") + r"\(\d{4}\)\s*$", ] def parse_artist_title(filename_or_title): @@ -143,11 +167,23 @@ def safe_filename(s, max_len=80): return s[:max_len] +def clean_noise(s): + """Odstrani 'noise' (Official Video itd.) iz besedila - tudi že-parsed values.""" + if not s: + return s + cleaned = s + for pat in _NOISE_PATTERNS: + cleaned = re.sub(pat, "", cleaned, flags=re.IGNORECASE) + cleaned = re.sub(r"\s+", " ", cleaned).strip() + cleaned = re.sub(r'^[\s\-–—|.:_]+|[\s\-–—|.:_]+$', '', cleaned) + return cleaned + + def build_download_filename(job): """Sestavi pravilno ime download datoteke iz job metadata.""" # Najprej probaj job-shranjene parsed values - artist = job.get("parsed_artist") - title = job.get("parsed_title") + artist = clean_noise(job.get("parsed_artist")) + title = clean_noise(job.get("parsed_title")) # Fallback: parse from filename if not artist or not title: @@ -192,11 +228,26 @@ def update_job(job_id, **kwargs): return job +def _clean_job_titles(job): + """Očisti 'Official Video' ipd. iz parsed_title v real-time (brez perzistiranja).""" + if not job: + return job + if job.get("parsed_title"): + cleaned = clean_noise(job["parsed_title"]) + if cleaned and cleaned != job["parsed_title"]: + job["parsed_title"] = cleaned + if job.get("parsed_artist"): + cleaned = clean_noise(job["parsed_artist"]) + if cleaned and cleaned != job["parsed_artist"]: + job["parsed_artist"] = cleaned + return job + + def list_jobs(): out = [] for f in sorted(JOBS_DIR.glob("*.json"), reverse=True): try: - out.append(json.loads(f.read_text())) + out.append(_clean_job_titles(json.loads(f.read_text()))) except Exception: pass return out @@ -969,7 +1020,7 @@ async def get_job(job_id: str, user: str = Depends(check_auth)): job = load_job(job_id) if not job: raise HTTPException(404, "Ne obstaja") - return job + return _clean_job_titles(job) @app.get("/api/stream/{job_id}")