# reels-app/app/qnet_match.py

"""
Qnet match modul — iz uploaded filename najde matching song v Qnet bazah.

Vrne (artist, title, station, confidence, matched_file).

Strategija:
  1. **Exact filename match** (case-insensitive, with normalized punctuation)
     — stem only, brez extension. Confidence: 1.0.
  2. **Normalized basename match** — odstrani noise besede (Official, HD, 4K,
     letnice, številke v oklepajih), normaliziraj presledke, primerjaj.
     Confidence: 0.9.
  3. **Artist+Title fuzzy match** — najprej parsa filename z obstoječim
     parse_artist_title, potem išče v bazi po normalized (artist|title)
     bigram-keywords. Confidence: 0.6–0.85 odvisno od ratio.
  4. Če ni zadetka → None.

Vrne lookup result dict:
    {
      "matched": bool,
      "method": "exact" | "normalized" | "fuzzy" | None,
      "confidence": float,  # 0.0–1.0
      "artist": str,
      "title": str,
      "station": str,
      "file": str,
    }
"""
import json
import re
import os
import time
import unicodedata
from pathlib import Path
from typing import Optional
from difflib import SequenceMatcher

LOOKUP_PATH = Path(os.environ.get("QNET_LOOKUP_PATH", "/data/qnet/songs_lookup.json"))

# In-memory cache — re-load if file is newer than cache
_cache: dict = {"mtime": 0.0, "songs": [], "by_norm_file": {}, "by_norm_stem": {}}


def _norm(s: str) -> str:
    """Aggressive normalize: lowercase, strip diacritics, kill punctuation."""
    if not s:
        return ""
    s = unicodedata.normalize("NFKD", s)
    s = "".join(c for c in s if not unicodedata.combining(c))
    s = s.lower()
    # Replace en-dash / em-dash / various separators with hyphen
    s = s.replace("–", "-").replace("—", "-").replace("|", "-")
    # Drop everything except alnum and space
    s = re.sub(r"[^a-z0-9\s]+", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s


# Noise words pogosto v "Official Video" / YouTube ipd.
_NOISE_WORDS = {
    "official", "officiel", "video", "musikvideo", "musicvideo", "music",
    "lyric", "lyrics", "audio", "hd", "4k", "8k", "uhd", "live", "remix",
    "remaster", "remastered", "version", "ver", "extended", "edit",
    "videoclip", "clip", "premiere", "premiera", "tv", "final", "promo",
    "1080p", "720p", "2160p",
}


def _norm_minus_noise(s: str) -> str:
    """Normalize *s* and strip tokens that carry no song identity.

    After _norm(), drops every token listed in _NOISE_WORDS and every purely
    numeric token of up to four digits (years, version numbers, track
    numbers). Longer numbers are kept.

    Returns:
        The remaining tokens joined by single spaces; "" if nothing is left.
    """
    kept = [
        tok
        for tok in _norm(s).split()
        if tok not in _NOISE_WORDS and not (tok.isdigit() and len(tok) <= 4)
    ]
    return " ".join(kept)


def _load_db():
    """Refresh the in-memory cache from LOOKUP_PATH when the file is newer.

    The file is expected to contain a JSON list of song dicts; entries that
    are not dicts or have no "file" value are not indexed. Three indexes are
    built, each keeping the first song seen for a key:

      * by_norm_file  - normalized full filename (with extension)
      * by_norm_stem  - normalized stem (no extension)
      * by_norm_minus - normalized stem with noise words/short numbers removed

    A file that cannot be read or parsed (for example while it is being
    rewritten) does not raise. A warning is printed and the previously
    loaded cache stays in place. The cached mtime is left untouched, so the
    next call retries.

    Returns:
        True if the cache is current or was just rebuilt, or if the reload
        failed but songs from an earlier load are still cached. False if the
        lookup file does not exist, or the reload failed and nothing is
        cached.
    """
    global _cache
    try:
        mtime = LOOKUP_PATH.stat().st_mtime
    except FileNotFoundError:
        return False
    if mtime <= _cache["mtime"]:
        return True

    try:
        raw = json.loads(LOOKUP_PATH.read_text(encoding="utf-8"))
        if not isinstance(raw, list):
            raise ValueError(f"expected a JSON list, got {type(raw).__name__}")
    except (OSError, ValueError) as exc:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        print(f"[qnet_match] cannot load {LOOKUP_PATH}: {exc}", flush=True)
        return bool(_cache["songs"])
    songs = [s for s in raw if isinstance(s, dict)]

    by_norm_file = {}        # normalized whole filename -> song
    by_norm_stem = {}        # normalized stem (no ext) -> song
    by_norm_minus = {}       # normalized stem minus noise -> song
    for s in songs:
        f = s.get("file") or ""
        if not f:
            continue
        stem = Path(f).stem
        ns = _norm(stem)
        if not ns:
            # Nothing matchable in the stem. Indexing it anyway would key the
            # song on its bare extension (e.g. "mp3") and turn unrelated
            # files into false "exact" matches.
            continue
        # First hit wins (the first station that lists this file); this may
        # later change to "match across all stations".
        by_norm_file.setdefault(_norm(f), s)
        by_norm_stem.setdefault(ns, s)
        nm = _norm_minus_noise(stem)
        if nm:
            by_norm_minus.setdefault(nm, s)
    _cache = {
        "mtime": mtime,
        "songs": songs,
        "by_norm_file": by_norm_file,
        "by_norm_stem": by_norm_stem,
        "by_norm_minus": by_norm_minus,
    }
    print(f"[qnet_match] loaded {len(songs)} songs from {LOOKUP_PATH} (mtime={mtime})", flush=True)
    return True


def _result(song: dict, method: str, confidence: float) -> dict:
    return {
        "matched": True,
        "method": method,
        "confidence": round(confidence, 3),
        "artist": song.get("artist", ""),
        "title": song.get("title", ""),
        "station": song.get("station", ""),
        "file": song.get("file", ""),
    }


def _no_match() -> dict:
    return {
        "matched": False,
        "method": None,
        "confidence": 0.0,
        "artist": "",
        "title": "",
        "station": "",
        "file": "",
    }


def match_filename(filename: str) -> dict:
    """Find the database song matching an uploaded filename (main API).

    Steps run from strictest to loosest and the first hit wins:

      1. normalized full filename          -> method "exact",      1.0
      2. normalized stem (no extension)    -> method "exact_stem", 0.95
      3. stem minus noise words/numbers    -> method "normalized", 0.9
      4. parsed "Artist - Title" compared with each song's artist/title
                                           -> method "fuzzy_at"
      5. similarity of the noise-stripped stem against all indexed stems
         (only for stems of 8+ characters) -> method "fuzzy"

    Args:
        filename: Name of the uploaded file, with or without extension.

    Returns:
        A result dict with keys matched, method, confidence, artist, title,
        station and file. When the filename is empty, the database is not
        available, or nothing matches, "matched" is False.
    """
    if not filename:
        return _no_match()
    if not _load_db():
        return _no_match()  # database is not loaded

    stem = Path(filename).stem
    nf_full = _norm(filename)
    nf_stem = _norm(stem)
    nm_stem = _norm_minus_noise(stem)

    # Steps 1-2 need a stem with matchable characters. Without this guard a
    # name that normalizes to nothing would be looked up by its extension
    # alone (or by "") and could "exactly" match an unrelated song.
    if nf_stem:
        # 1) Exact normalized match including the extension
        if nf_full in _cache["by_norm_file"]:
            return _result(_cache["by_norm_file"][nf_full], "exact", 1.0)

        # 2) Stem match (without extension)
        if nf_stem in _cache["by_norm_stem"]:
            return _result(_cache["by_norm_stem"][nf_stem], "exact_stem", 0.95)

    # Tolerate a cache that does not carry this index.
    by_minus = _cache.get("by_norm_minus", {})

    # 3) Stem match with noise removed
    if nm_stem and nm_stem in by_minus:
        return _result(by_minus[nm_stem], "normalized", 0.9)

    # 4) Parse "Artist - Title" out of the filename and fuzzy-match both parts
    parsed_a, parsed_t = _parse_artist_title_simple(stem)
    if parsed_a and parsed_t:
        cand = _fuzzy_artist_title(parsed_a, parsed_t)
        if cand:
            return cand

    # 5) Last resort: similarity of the stem against every indexed stem.
    # Only for stems of 8+ characters, to avoid random matches on short names.
    if len(nm_stem) >= 8:
        min_ratio = 0.85
        best = None
        best_ratio = 0.0
        for cand_stem, song in by_minus.items():
            matcher = SequenceMatcher(None, nm_stem, cand_stem)
            # Both quick ratios are upper bounds on ratio(), so a candidate
            # below the threshold here can never qualify. Skip the expensive
            # full comparison for it.
            if matcher.real_quick_ratio() < min_ratio or matcher.quick_ratio() < min_ratio:
                continue
            ratio = matcher.ratio()
            if ratio > best_ratio:
                best_ratio = ratio
                best = song
        if best and best_ratio >= min_ratio:
            return _result(best, "fuzzy", best_ratio * 0.85)

    return _no_match()


def _parse_artist_title_simple(name: str):
    """Lightweight parser za 'Artist - Title' / 'Artist – Title'."""
    if not name:
        return (None, None)
    # Probaj najprej en/em-dash (najbolj distinctive), potem hyphen
    for sep in [" – ", " — ", " - ", "_-_"]:
        if sep in name:
            parts = name.split(sep, 1)
            if len(parts) == 2:
                a = parts[0].strip()
                t = parts[1].strip()
                if a and t and len(a) <= 80 and len(t) <= 100:
                    return (a, t)
    return (None, None)


def _fuzzy_artist_title(artist: str, title: str) -> Optional[dict]:
    """Find the cached song whose artist and title best resemble the input.

    Both inputs and every song's artist/title are reduced with
    _norm_minus_noise(). A song's score is the mean of the artist and title
    similarity ratios; songs whose artist or title normalizes to "" are
    skipped. On equal scores the earlier song wins.

    Returns:
        A "fuzzy_at" result with confidence score * 0.9 when the best score
        is at least 0.82; None otherwise, or when either input normalizes
        to "".
    """
    want_artist = _norm_minus_noise(artist)
    want_title = _norm_minus_noise(title)
    if not (want_artist and want_title):
        return None

    def similarity(left: str, right: str) -> float:
        return SequenceMatcher(None, left, right).ratio()

    top_song, top_score = None, 0.0
    for entry in _cache["songs"]:
        have_artist = _norm_minus_noise(entry.get("artist", ""))
        have_title = _norm_minus_noise(entry.get("title", ""))
        if have_artist and have_title:
            combined = (
                similarity(want_artist, have_artist)
                + similarity(want_title, have_title)
            ) / 2
            if combined > top_score:
                top_song, top_score = entry, combined

    if not top_song or top_score < 0.82:
        return None
    return _result(top_song, "fuzzy_at", top_score * 0.9)


def db_stats() -> dict:
    """Return database statistics for a health check / admin endpoint.

    Triggers a cache refresh first. With no songs cached the result is
    {"loaded": False, "total": 0, "stations": {}}. Otherwise it holds the
    total song count, a per-station count (songs without a "station" key
    are counted under "?"), the cached file mtime, and the seconds elapsed
    since that mtime.
    """
    _load_db()
    songs = _cache["songs"]
    if not songs:
        return {"loaded": False, "total": 0, "stations": {}}

    per_station: dict = {}
    for entry in songs:
        station = entry.get("station", "?")
        per_station.setdefault(station, 0)
        per_station[station] += 1

    cached_mtime = _cache["mtime"]
    return {
        "loaded": True,
        "total": len(songs),
        "stations": per_station,
        "mtime": cached_mtime,
        "age_seconds": time.time() - cached_mtime,
    }