Clean noise patterns more aggressively + clean already-stored values
User feedback: 'Ansambel UNIKAT — PA PA (offiicial video)' shows the '(offiicial video)' suffix everywhere (titles, downloads, UI). The user wants only 'Artist - Title', without any video format markers.

Three changes:

1. EXPANDED _NOISE_PATTERNS to handle:
   - Typos in 'official': 'offiicial', 'offical', 'oficial' (regex Off[a-z]*icial)
   - Variants: '(Official 4K Video)', '(Official HD Video)', '(Official Music Video)'
   - More versions: (Live), (Cover), (Acoustic), (Extended Mix), (Radio Edit), (Clean), (Explicit)
   - Square brackets: [Official...], [HD], [Lyrics...]
   - Bare words without brackets
   - Trailing year markers such as '(2024)'

2. NEW clean_noise() function applied at READ TIME: even if a job was saved with 'PA PA (offiicial video)' as parsed_title, the new code re-cleans it when serving the job to the UI or building the download filename. Existing jobs are therefore fixed too, without needing re-processing.

3. Applied to:
   - build_download_filename() — clean before formatting
   - list_jobs() — strip noise when serving the job list
   - get_job() — strip noise when serving a single job

Result: 'Ansambel UNIKAT - PA PA - REEL.mp4' (no more '(offiicial video)').
This commit is contained in:
parent
0dd33c16f3
commit
4788a55643
77
app/main.py
77
app/main.py
@ -79,19 +79,43 @@ def check_auth(creds: HTTPBasicCredentials = Depends(security)):
|
||||
import re
|
||||
|
||||
# The word "official", including common misspellings. The first branch is the
# liberal form ("Off" + any letters + "icial"/"iciak", e.g. "offiicial"); the
# second also accepts spellings that drop or double letters ("offical",
# "oficial"), which the first branch cannot match.
_OFFICIAL_WORD = r"(?:Off[a-z]*icia[lk]|Of+i+c+i*a[lk])"
# Optional qualifier plus the media kind: "Video", "Music Video", "4K Audio", ...
_OFFICIAL_MEDIA = r"\s+(?:Music\s+|HD\s+|4K\s+)?(?:Video|Audio)"

# Regex fragments describing title "noise" that should be removed.
# Order matters: bracketed forms come before the bare-word forms so that a
# whole "(Official Video)" group is removed in one piece.
_NOISE_PATTERNS = [
    # "Official video/audio" markers, typo-tolerant, in parentheses and bare.
    r"\(" + _OFFICIAL_WORD + _OFFICIAL_MEDIA + r"\s*\)",
    r"\b" + _OFFICIAL_WORD + _OFFICIAL_MEDIA + r"\b",

    # German variants.
    r"\(Offizielles?\s+(?:Musik)?[Vv]ideo\)",
    r"\bOffizielles?\s+(?:Musik)?[Vv]ideo\b",

    # Lyric videos.
    r"\(Lyric[s]?\s+Video\)",
    r"\bLyric[s]?\s+Video\b",

    # Audio quality / version markers.
    r"\(Audio\)",
    r"\(HD\)", r"\(HQ\)", r"\(4K\)", r"\(8K\)", r"\(1080p?\)",
    r"\(Live\)", r"\(Remix\)", r"\(Cover\)", r"\(Acoustic\)",
    # Also covers plain "(Remastered)" because the year digits are optional.
    r"\(Remaster(?:ed)?\s*\d{0,4}\)",
    r"\bofficial\s+video\b", r"\bofficial\s+audio\b",
    r"\(Extended(?:\s+Mix)?\)",
    r"\(Radio(?:\s+Edit)?\)",
    r"\(Clean(?:\s+Version)?\)",
    r"\(Explicit\)",

    # Square-bracket variations.
    r"\[Official[^\]]*\]",
    r"\[Music[^\]]*\]",
    r"\[Audio[^\]]*\]",
    r"\[HD\]", r"\[HQ\]", r"\[4K\]",
    r"\[Lyric[s]?[^\]]*\]",

    # Bare words (without brackets).
    r"\boriginal\s+(?:video|audio)\b",
    r"\bMV\b", r"\b4K\b", r"\bHD\b", r"\bHQ\b",

    # Trailing year in parentheses, e.g. "(2024)".
    r"\(\d{4}\)\s*$",
]
|
||||
|
||||
def parse_artist_title(filename_or_title):
|
||||
@ -143,11 +167,23 @@ def safe_filename(s, max_len=80):
|
||||
return s[:max_len]
|
||||
|
||||
|
||||
def clean_noise(s):
    """Strip video/audio "noise" markers such as "(Official Video)" from text.

    Works on raw titles as well as on values that were already parsed and
    stored, so callers can re-clean stored values when reading them.

    Args:
        s: Text to clean. Falsy values (``None``, ``""``) are returned as-is.

    Returns:
        The text with every ``_NOISE_PATTERNS`` match removed
        (case-insensitively), empty bracket pairs dropped, whitespace runs
        collapsed to a single space, and leading/trailing separator
        characters stripped.
    """
    if not s:
        return s
    cleaned = s
    for pat in _NOISE_PATTERNS:
        cleaned = re.sub(pat, "", cleaned, flags=re.IGNORECASE)
    # A bare-word pattern can empty a bracket pair ("Song [MV]" -> "Song []");
    # remove the leftover shell so it does not end up in titles or filenames.
    cleaned = re.sub(r"\(\s*\)|\[\s*\]", "", cleaned)
    cleaned = re.sub(r"\s+", " ", cleaned).strip()
    # Trim separators left dangling at either end once the noise is gone.
    cleaned = re.sub(r'^[\s\-–—|.:_]+|[\s\-–—|.:_]+$', '', cleaned)
    return cleaned
|
||||
|
||||
|
||||
def build_download_filename(job):
|
||||
"""Sestavi pravilno ime download datoteke iz job metadata."""
|
||||
# Najprej probaj job-shranjene parsed values
|
||||
artist = job.get("parsed_artist")
|
||||
title = job.get("parsed_title")
|
||||
artist = clean_noise(job.get("parsed_artist"))
|
||||
title = clean_noise(job.get("parsed_title"))
|
||||
|
||||
# Fallback: parse from filename
|
||||
if not artist or not title:
|
||||
@ -192,11 +228,26 @@ def update_job(job_id, **kwargs):
|
||||
return job
|
||||
|
||||
|
||||
def _clean_job_titles(job):
    """Strip noise markers from a job's parsed title and artist at read time.

    The job dict is modified in place and returned; nothing is written back
    to storage here. A field is only replaced when cleaning yields a
    non-empty value that differs from the stored one, so a value consisting
    entirely of noise is left untouched. Falsy ``job`` values are returned
    unchanged.
    """
    if not job:
        return job
    for key in ("parsed_title", "parsed_artist"):
        current = job.get(key)
        if not current:
            continue
        stripped = clean_noise(current)
        if stripped and stripped != current:
            job[key] = stripped
    return job
|
||||
|
||||
|
||||
def list_jobs():
    """Return every stored job with noise markers stripped from its titles.

    Reads each ``*.json`` file in ``JOBS_DIR`` in reverse-sorted path order,
    decodes it, and passes it through ``_clean_job_titles`` before adding it
    to the result. Each job is appended exactly once.

    Returns:
        A list of job dicts. Files that cannot be read, decoded or cleaned
        are skipped.
    """
    out = []
    for f in sorted(JOBS_DIR.glob("*.json"), reverse=True):
        try:
            job = json.loads(f.read_text())
            out.append(_clean_job_titles(job))
        except Exception:
            # Best-effort listing: one unreadable or malformed job file must
            # not break the whole list, so skip it and keep going.
            continue
    return out
|
||||
@ -969,7 +1020,7 @@ async def get_job(job_id: str, user: str = Depends(check_auth)):
|
||||
job = load_job(job_id)
|
||||
if not job:
|
||||
raise HTTPException(404, "Ne obstaja")
|
||||
return job
|
||||
return _clean_job_titles(job)
|
||||
|
||||
|
||||
@app.get("/api/stream/{job_id}")
|
||||
|
||||
Loading…
Reference in New Issue
Block a user