Fix find_chorus: RMS energy parser was broken (no pts_time available), now synthesizes timestamps; energy weight raised from 2 to 10 and repetition weight lowered from 10 to 5 (refren je glasnejši)

This commit is contained in:
Sebastjan Artič 2026-04-28 16:55:51 +00:00
parent 64e8854cea
commit c17578521a

View File

@ -119,33 +119,42 @@ def find_repeated_lines(lines, similarity_threshold=0.5):
def compute_energy(audio_path, window_sec=1.0):
"""
Vrni list (timestamp, rms_db) preko FFmpeg ebur128 filter.
Vrni list (timestamp, rms_db) preko FFmpeg astats filter.
Vsako okno window_sec sekund vrne en RMS sample.
"""
# Uporabi ebur128 ali astats za RMS
cmd = [
"ffmpeg", "-i", audio_path,
"-af", f"asetnsamples=n={int(16000 * window_sec)}:p=0,astats=metadata=1:reset={window_sec},"
"ametadata=print:key=lavfi.astats.Overall.RMS_level",
"ametadata=print:key=lavfi.astats.Overall.RMS_level:file=-",
"-f", "null", "-",
]
result = subprocess.run(cmd, capture_output=True, text=True)
output = result.stderr
# ametadata file=- pošilja na stdout
output = result.stdout + "\n" + result.stderr
energies = []
current_pts = None
for line in output.split("\n"):
line = line.strip()
if line.startswith("frame:"):
# frame:N pts:X pts_time:Y
m = re.search(r"pts_time:(\S+)", line)
if m:
# Format A: "frame:N pts:X pts_time:Y"
m = re.search(r"pts_time:(\S+)", line)
if m:
try:
current_pts = float(m.group(1))
elif line.startswith("lavfi.astats.Overall.RMS_level="):
val = line.split("=")[1]
except ValueError:
pass
continue
# Format B: lavfi.astats.Overall.RMS_level=-15.123
if "RMS_level=" in line:
val = line.split("RMS_level=")[-1].strip()
try:
rms = float(val)
if current_pts is not None:
energies.append((current_pts, rms))
# Če nimamo timestamp-a, sintetiziraj na podlagi vrstnega reda
if current_pts is None:
current_pts = len(energies) * window_sec
energies.append((current_pts, rms))
# Increment za naslednji vzorec, če FFmpeg ne pošilja pts
current_pts += window_sec
except ValueError:
pass
@ -210,11 +219,12 @@ def find_chorus(video, lang=None, model_size="small", target_duration=30.0):
avg_e = avg_energy_in_range(energies, start, start + target_duration)
energy_score = max(0, avg_e - avg_overall) # koliko nad povprečjem
# Score: število ponovitev + energy + dolžina vrstice
# Score: ponovitve + energija + dolžina vrstice
# Refren je navadno glasnejši kot verz — energija je močnejši signal
score = (
len(cluster) * 10 # repetition weight
+ energy_score * 2 # energy weight
+ min(len(rep_text.split()), 10) # text richness
len(cluster) * 5 # repetition weight (zmanjšano)
+ energy_score * 10 # energy weight (povečano — refren je glasnejši)
+ min(len(rep_text.split()), 10)
)
candidates.append({
@ -229,11 +239,11 @@ def find_chorus(video, lang=None, model_size="small", target_duration=30.0):
"cluster_id": cluster_idx,
})
# Sort by score, dedupe close candidates
# Sort by score, dedupe close candidates (vsaj 20s narazen)
candidates.sort(key=lambda c: -c["score"])
deduped = []
for c in candidates:
if all(abs(c["start"] - d["start"]) > 5 for d in deduped):
if all(abs(c["start"] - d["start"]) > 20 for d in deduped):
deduped.append(c)
if len(deduped) >= 5:
break