Clean noise patterns more aggressively + clean already-stored values

User feedback: 'Ansambel UNIKAT — PA PA (offiicial video)' shows the
'(offiicial video)' suffix everywhere (titles, downloads, UI). The user
wants only 'Artist - Title' without any video format markers.

Two fixes, plus where they are applied:

1. EXPANDED _NOISE_PATTERNS to handle:
   - Typos in 'official' with extra letters, e.g. 'offiicial' (regex Off[a-z]*icial; note that this regex does not match dropped-letter typos such as 'offical' or 'oficial')
   - Variants: '(Official 4K Video)', '(Official HD Video)', '(Official Music Video)'
   - More versions: (Live), (Cover), (Acoustic), (Extended Mix), (Radio Edit), (Clean), (Explicit)
   - Square brackets: [Official...], [HD], [Lyrics...]
   - Bare words without brackets
   - Trailing year markers '(2024)'

2. NEW clean_noise() function applied at READ TIME:
   Even if a job was saved with 'PA PA (offiicial video)' as parsed_title,
   the new code re-cleans it when serving the job to the UI or building
   the download filename. This means existing jobs get fixed too without
   needing re-processing.

3. Applied to:
   - build_download_filename() — clean before formatting
   - list_jobs() — strip noise when serving job list
   - get_job() — strip noise when serving single job

Result: 'Ansambel UNIKAT - PA PA - REEL.mp4' (no more (offiicial video))
This commit is contained in:
Sebastjan Artič 2026-04-29 18:47:19 +00:00
parent 0dd33c16f3
commit 4788a55643

View File

@ -79,19 +79,43 @@ def check_auth(creds: HTTPBasicCredentials = Depends(security)):
import re
# Typo-tolerant spelling of "official". The first alternative covers extra
# letters ("offiicial"); the second covers dropped letters ("offical",
# "oficial", "offcial"). A final "k" instead of "l" is accepted as well.
_OFFICIAL_WORD = r"(?:off[a-z]*icia[lk]|of+i*c+i*a[lk])"
# Optional qualifier that may sit between "official" and "video"/"audio".
_OFFICIAL_QUALIFIER = r"(?:Music\s+|HD\s+|4K\s+)?"

# Common "noise" markers to strip from titles. clean_noise() applies these
# one after another, case-insensitively, so ORDER MATTERS: every bracketed
# form is listed before the bare-word forms. Otherwise a bare-word match
# would eat the text inside "[Official Video]" and leave an empty "[]".
_NOISE_PATTERNS = [
    # --- Parenthesised markers -------------------------------------------
    # "(Official Video)", "(Official 4K Video)", "(Offiicial Audio)", ...
    r"\(" + _OFFICIAL_WORD + r"\s+" + _OFFICIAL_QUALIFIER + r"(?:Video|Audio)\s*\)",
    # German variants
    r"\(Offizielles?\s+(?:Musik)?[Vv]ideo\)",
    # Lyric videos
    r"\(Lyric[s]?\s+Video\)",
    # Audio quality / version markers
    r"\(Audio\)",
    r"\(HD\)", r"\(HQ\)", r"\(4K\)", r"\(8K\)", r"\(1080p?\)",
    r"\(Live\)", r"\(Remix\)", r"\(Cover\)", r"\(Acoustic\)",
    # Also covers the plain "(Remastered)" form.
    r"\(Remaster(?:ed)?\s*\d{0,4}\)",
    r"\(Extended(?:\s+Mix)?\)",
    r"\(Radio(?:\s+Edit)?\)",
    r"\(Clean(?:\s+Version)?\)",
    r"\(Explicit\)",
    # --- Square-bracket markers ------------------------------------------
    r"\[" + _OFFICIAL_WORD + r"[^\]]*\]",
    r"\[Music[^\]]*\]",
    r"\[Audio[^\]]*\]",
    r"\[HD\]", r"\[HQ\]", r"\[4K\]",
    r"\[Lyric[s]?[^\]]*\]",
    # --- Bare words (no brackets) ----------------------------------------
    r"\b" + _OFFICIAL_WORD + r"\s+" + _OFFICIAL_QUALIFIER + r"(?:Video|Audio)\b",
    r"\bOffizielles?\s+(?:Musik)?[Vv]ideo\b",
    r"\bLyric[s]?\s+Video\b",
    r"\boriginal\s+(?:video|audio)\b",
    r"\bMV\b", r"\b4K\b", r"\bHD\b", r"\bHQ\b",
    # --- Trailing year in parentheses, e.g. "(2024)" ---------------------
    r"\(\d{4}\)\s*$",
]
def parse_artist_title(filename_or_title):
@ -143,11 +167,23 @@ def safe_filename(s, max_len=80):
return s[:max_len]
def clean_noise(s):
    """Strip "noise" markers (Official Video etc.) from a piece of text.

    Intended to be usable on values that were already parsed and stored,
    not only on raw titles.

    Args:
        s: Text to clean. Falsy values (``None``, ``""``) are returned
            unchanged.

    Returns:
        The text with every ``_NOISE_PATTERNS`` match removed
        (case-insensitively), emptied bracket pairs dropped, whitespace
        runs collapsed to a single space, and leading/trailing separator
        characters trimmed.
    """
    if not s:
        return s
    cleaned = s
    for pat in _NOISE_PATTERNS:
        cleaned = re.sub(pat, "", cleaned, flags=re.IGNORECASE)
    # A bare-word pattern can remove everything inside a bracketed group
    # (e.g. "(HD 4K)" -> "( )"); drop the empty shell that is left behind,
    # since the separator trim below does not touch brackets.
    cleaned = re.sub(r"\(\s*\)|\[\s*\]", "", cleaned)
    cleaned = re.sub(r"\s+", " ", cleaned).strip()
    # Trim dangling separators such as a trailing " -" or leading "| ".
    cleaned = re.sub(r'^[\s\-–—|.:_]+|[\s\-–—|.:_]+$', '', cleaned)
    return cleaned
def build_download_filename(job):
"""Sestavi pravilno ime download datoteke iz job metadata."""
# Najprej probaj job-shranjene parsed values
artist = job.get("parsed_artist")
title = job.get("parsed_title")
artist = clean_noise(job.get("parsed_artist"))
title = clean_noise(job.get("parsed_title"))
# Fallback: parse from filename
if not artist or not title:
@ -192,11 +228,26 @@ def update_job(job_id, **kwargs):
return job
def _clean_job_titles(job):
    """Strip 'Official Video'-style noise from a job's parsed fields on the fly.

    The ``parsed_title`` and ``parsed_artist`` entries are run through
    ``clean_noise``. The dict is updated in place and returned; this function
    itself writes nothing to storage. A field is only overwritten when the
    cleaned value is non-empty and differs from the stored one, so cleaning
    never blanks a field out. Falsy ``job`` values are returned unchanged.
    """
    if job:
        for field in ("parsed_title", "parsed_artist"):
            current = job.get(field)
            if not current:
                continue
            stripped = clean_noise(current)
            if stripped and stripped != current:
                job[field] = stripped
    return job
def list_jobs():
    """Return every stored job, with noise stripped from its parsed titles.

    Reads each ``*.json`` file in ``JOBS_DIR`` in reverse-sorted path order,
    parses it and passes it through ``_clean_job_titles``. Each job appears
    exactly once in the result.

    Returns:
        A list of job dicts. Files that cannot be read, parsed or cleaned
        are skipped.
    """
    out = []
    for f in sorted(JOBS_DIR.glob("*.json"), reverse=True):
        try:
            # Append only the cleaned job; appending the raw dict as well
            # would list every job twice.
            out.append(_clean_job_titles(json.loads(f.read_text())))
        except Exception:
            # Best effort: one unreadable/corrupt job file must not break
            # the whole listing.
            continue
    return out
@ -969,7 +1020,7 @@ async def get_job(job_id: str, user: str = Depends(check_auth)):
job = load_job(job_id)
if not job:
raise HTTPException(404, "Ne obstaja")
return job
return _clean_job_titles(job)
@app.get("/api/stream/{job_id}")