#!/usr/bin/env python3
"""
analyze.py — Pre-analysis of the WHOLE video before trimming.

Steps:
1. Whisper transcript of the whole video (language auto-detected or user-specified)
2. Energy profile (RMS dB over 1 s windows)
3. Structural detection (vocal/instrumental sections, energy peaks)
4. Smart clip-range selection (may exceed 30 s, can include the pre-chorus)
5. Detection of instrumental songs (no_subs auto)

Output: JSON with data for clip.py
"""
|
|
|
|
import argparse
import json
import math
import os
import re
import subprocess
import sys
import tempfile
from pathlib import Path
|
|
|
|
|
|
def get_video_duration(path):
    """Return the media duration in seconds as reported by ffprobe.

    Args:
        path: Path (``str`` or path-like) of the media file to probe.

    Returns:
        The container duration as a float, or ``0.0`` when ffprobe cannot be
        started, prints nothing parseable, or reports a non-finite or
        negative value.
    """
    try:
        probe = subprocess.run(
            ["ffprobe", "-v", "error", "-show_entries", "format=duration",
             "-of", "default=nw=1:nokey=1", str(path)],
            capture_output=True, text=True,
        )
    except OSError:
        # ffprobe is missing or not executable: give the same best-effort
        # answer as for unparseable output instead of crashing the caller.
        return 0.0

    try:
        seconds = float(probe.stdout.strip())
    except ValueError:
        return 0.0

    # float() happily parses "nan" and "inf"; neither is a usable duration.
    if math.isfinite(seconds) and seconds >= 0.0:
        return seconds
    return 0.0
|
|
|
|
|
|
def extract_audio(video_path):
    """Decode the audio track of a video into a temporary 16 kHz mono WAV.

    The file is used both for Whisper transcription and for the energy
    profile. The caller owns the returned file and must delete it.

    Args:
        video_path: Path of the input video.

    Returns:
        Path (``str``) of the newly created WAV file.

    Raises:
        subprocess.CalledProcessError: ffmpeg exited with a non-zero status.
        OSError: ffmpeg could not be started.
    """
    tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    tmp.close()
    try:
        subprocess.run(
            ["ffmpeg", "-y", "-i", str(video_path), "-vn",
             "-ac", "1", "-ar", "16000", "-c:a", "pcm_s16le", tmp.name],
            check=True, capture_output=True,
        )
    except BaseException:
        # The temp file was created with delete=False, so nothing else will
        # remove it if the conversion fails; clean up before re-raising.
        try:
            os.unlink(tmp.name)
        except OSError:
            pass
        raise
    return tmp.name
|
|
|
|
|
|
def _segment_to_dict(segment):
    """Convert one faster-whisper segment into a JSON-serializable dict."""
    words = [
        {"start": w.start, "end": w.end, "text": w.word}
        for w in (segment.words or [])
    ]
    return {
        "start": segment.start,
        "end": segment.end,
        "text": segment.text.strip(),
        "words": words,
    }


def _vote_language(model, audio_path):
    """Detect the language from up to three 30 s samples and return the winner.

    Samples are cut at 15 %, 45 % and 75 % of the audio (the first one no
    earlier than 15 s). Each sample votes with its detection probability;
    the language with the highest summed probability wins.

    Returns:
        The winning language code, or ``None`` when no sample could be
        analysed.
    """
    try:
        audio_duration = get_video_duration(audio_path)
    except OSError:
        audio_duration = 0.0
    if not audio_duration > 0:
        # Unknown length: assume a typical 3-minute song so that samples are
        # still attempted.
        audio_duration = 180.0

    votes = {}
    offsets = (
        max(15, audio_duration * 0.15),
        audio_duration * 0.45,
        audio_duration * 0.75,
    )
    for ss in offsets:
        if ss + 5 > audio_duration:
            continue
        sample = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
        sample.close()
        try:
            subprocess.run(
                ["ffmpeg", "-y", "-ss", str(ss), "-i", audio_path,
                 "-t", "30", "-vn", "-ac", "1", "-ar", "16000",
                 "-c:a", "pcm_s16le", sample.name],
                check=True, capture_output=True,
            )
            _, sample_info = model.transcribe(sample.name, language=None, vad_filter=False)
            sl, sp = sample_info.language, float(sample_info.language_probability)
            votes[sl] = votes.get(sl, 0) + sp
            print(f" sample @ {ss:.0f}s: {sl} (p={sp:.2f})", file=sys.stderr)
        except Exception as e:
            # A single bad sample must not abort detection; report why it
            # failed and let the remaining samples vote.
            print(f" sample @ {ss:.0f}s: failed ({e})", file=sys.stderr)
        finally:
            try:
                os.unlink(sample.name)
            except OSError:
                pass

    if not votes:
        return None
    return max(votes.items(), key=lambda item: item[1])[0]


def transcribe_full(audio_path, lang=None, model_size="small"):
    """Transcribe the whole audio file with Whisper (faster-whisper).

    Args:
        audio_path: Path of the audio file to transcribe.
        lang: Language code to force. When falsy, the language is chosen by
            voting over three samples so the whole file is decoded in a
            single language.
        model_size: Whisper model name passed to ``WhisperModel``.

    Returns:
        A dict with ``language``, ``language_probability`` and ``segments``
        (each segment has ``start``, ``end``, ``text`` and ``words``). When
        Whisper raises ``ValueError``/``RuntimeError`` the result has language
        ``"unknown"``, probability ``0.0`` and no segments.
    """
    from faster_whisper import WhisperModel

    print(f"🧠 Whisper {model_size}, lang={lang or 'auto'}", file=sys.stderr)
    m = WhisperModel(model_size, device="cpu", compute_type="int8")

    # Auto-detect with 3-sample voting so we lock onto one language.
    if not lang:
        print(" 🔍 Robust lang detection (3 samples)...", file=sys.stderr)
        voted = _vote_language(m, audio_path)
        if voted:
            lang = voted
            print(f" ✅ Lang lock: {lang}", file=sys.stderr)

    try:
        segs, info = m.transcribe(
            audio_path,
            language=lang,
            word_timestamps=True,
            # The VAD filter sometimes drops vocals over music — for songs
            # it is better left off.
            vad_filter=False,
            # Anti-hallucination settings.
            condition_on_previous_text=False,
            temperature=0.0,
            compression_ratio_threshold=2.4,
            log_prob_threshold=-1.0,
            no_speech_threshold=0.6,
        )
        detected_lang = info.language
        detected_prob = float(info.language_probability)
        print(f" Detekcija: {detected_lang} (p={detected_prob:.2f})", file=sys.stderr)
        # `segs` is a lazy generator: the actual decoding runs while it is
        # iterated, so it has to be consumed inside this try block for the
        # failure handling below to cover it.
        segments = [_segment_to_dict(s) for s in segs]
    except (ValueError, RuntimeError) as e:
        # Whisper failure (e.g. on fully instrumental files).
        print(f" ⚠️ Whisper transcribe failed: {e}", file=sys.stderr)
        return {
            "language": "unknown",
            "language_probability": 0.0,
            "segments": [],
        }

    return {
        "language": detected_lang,
        "language_probability": detected_prob,
        "segments": segments,
    }
|
|
|
|
|
|
def compute_energy_profile(audio_path, window_sec=1.0):
    """Measure the RMS level (dB) of consecutive windows with ffmpeg.

    The audio is regrouped into frames of ``window_sec`` seconds
    (``asetnsamples``), ``astats`` computes per-frame statistics and
    ``ametadata`` prints the overall RMS level of every frame.

    Args:
        audio_path: Audio file to analyse. The frame size is computed for
            16 kHz audio — NOTE(review): this matches the WAV written by
            ``extract_audio``; other sample rates give a different window.
        window_sec: Window length in seconds.

    Returns:
        A list of ``(timestamp_seconds, rms_db)`` tuples. Levels that are NaN
        or below -90 dB (including ``-inf``) are stored as ``-90.0``. The list
        is empty when ffmpeg cannot be started or prints no levels.
    """
    cmd = [
        "ffmpeg", "-i", audio_path,
        "-af", f"asetnsamples=n={int(16000 * window_sec)}:p=0,"
               # `reset` counts frames, not seconds: 1 restarts the
               # statistics on every window-sized frame.
               "astats=metadata=1:reset=1,"
               "ametadata=print:key=lavfi.astats.Overall.RMS_level:file=-",
        "-f", "null", "-",
    ]
    try:
        # errors="replace": ffmpeg may echo file metadata in arbitrary
        # encodings, which must not abort the analysis.
        result = subprocess.run(cmd, capture_output=True, text=True, errors="replace")
    except OSError as e:
        print(f" ⚠️ ffmpeg unavailable for energy profile: {e}", file=sys.stderr)
        return []

    output = result.stdout + "\n" + result.stderr

    energies = []
    current_pts = 0.0
    for raw_line in output.split("\n"):
        line = raw_line.strip()

        pts_match = re.search(r"pts_time:(\S+)", line)
        if pts_match:
            try:
                current_pts = float(pts_match.group(1))
            except ValueError:
                pass
            continue

        if "RMS_level=" not in line:
            continue
        try:
            rms = float(line.split("RMS_level=")[-1].strip())
        except ValueError:
            continue
        # Silence is reported as -inf (or NaN); clamp it to a finite floor.
        if math.isnan(rms) or rms < -90:
            rms = -90.0
        energies.append((current_pts, rms))
        # Advance in case the next level line arrives without its own
        # pts_time header.
        current_pts += window_sec

    return energies
|
|
|
|
|
|
def detect_vocal_sections(segments, max_gap=3.0):
    """Merge consecutive transcript segments into "vocal sections".

    A segment joins the current section when it starts at most ``max_gap``
    seconds after that section's end; otherwise it opens a new section.

    Args:
        segments: Transcript segments (dicts with ``start``, ``end`` and
            ``text``), in playback order.
        max_gap: Largest pause in seconds that still keeps two segments in
            the same section.

    Returns:
        A list of section dicts with ``start``, ``end``, ``segments`` (the
        member segments) and ``text`` (member texts joined by spaces).
        Empty when ``segments`` is empty.
    """
    if not segments:
        return []

    sections = []
    current = None
    for seg in segments:
        if current is not None and seg["start"] - current["end"] <= max_gap:
            # Never let the section end move backwards: an overlapping
            # segment may finish before the one it follows.
            current["end"] = max(current["end"], seg["end"])
            current["segments"].append(seg)
            current["text"] += " " + seg["text"]
            continue
        current = {
            "start": seg["start"],
            "end": seg["end"],
            "segments": [seg],
            "text": seg["text"],
        }
        sections.append(current)
    return sections
|
|
|
|
|
|
def avg_energy_in_range(energies, start, end):
    """Return the mean RMS level of the samples whose timestamp is in range.

    Args:
        energies: Iterable of ``(timestamp, rms_db)`` pairs.
        start: Lower bound of the range (inclusive).
        end: Upper bound of the range (inclusive).

    Returns:
        The arithmetic mean of the matching levels, or ``-90.0`` when no
        sample falls inside the range.
    """
    total = 0
    count = 0
    for timestamp, level in energies:
        if not (start <= timestamp <= end):
            continue
        total += level
        count += 1
    if count == 0:
        return -90.0
    return total / count
|
|
|
|
|
|
def score_section_as_chorus(section, all_sections, energies, avg_rms):
    """Score a vocal section as a chorus candidate (higher is more likely).

    The score is the sum of four parts:
    - repetition inside the section: ``(1 - unique_word_ratio) * 30``
    - loudness: 8 points per dB that the section is above ``avg_rms``
    - recurrence: 25 points for every other section sharing at least half of
      this section's distinct words (and at least 3 words)
    - duration: 10 points for 3-25 s, 5 for longer, 2 for shorter sections

    Args:
        section: Section dict with ``text``, ``start`` and ``end``.
        all_sections: All sections of the song, including ``section``.
        energies: ``(timestamp, rms_db)`` pairs for the whole audio.
        avg_rms: Average RMS level of the whole audio in dB.

    Returns:
        The numeric score; ``0`` when the section contains no words.
    """
    words = re.findall(r"\b\w+\b", section["text"].lower())
    if not words:
        return 0

    vocabulary = set(words)

    # Chorus = low unique-word ratio (lots of repetition).
    unique_ratio = len(vocabulary) / len(words)
    chorus_signal = max(0, (1.0 - unique_ratio) * 30)

    # Energy relative to the song average.
    sec_energy = avg_energy_in_range(energies, section["start"], section["end"])
    energy_score = max(0, sec_energy - avg_rms) * 8

    # How many other sections carry similar lyrics. The overlap threshold
    # depends only on this section, so it is computed once.
    min_shared = max(3, len(vocabulary) * 0.5)
    repeat_count = 0
    for other in all_sections:
        if other is section:
            continue
        other_words = set(re.findall(r"\b\w+\b", other["text"].lower()))
        if len(vocabulary & other_words) >= min_shared:
            repeat_count += 1
    repeat_score = repeat_count * 25

    # Section duration in seconds.
    duration = section["end"] - section["start"]
    if 3 <= duration <= 25:
        length_score = 10
    elif duration > 25:
        length_score = 5
    else:
        length_score = 2

    return chorus_signal + energy_score + repeat_score + length_score
|
|
|
|
|
|
def find_chorus(transcript, energies, video_duration):
    """Rank the vocal sections of a song by how chorus-like they are.

    Args:
        transcript: Transcript dict with a ``segments`` list.
        energies: ``(timestamp, rms_db)`` pairs for the whole audio.
        video_duration: Unused; kept so existing callers keep working.

    Returns:
        ``None`` when the transcript yields no vocal section. Otherwise a
        dict with ``best`` (top candidate), ``all_candidates`` (top 10, best
        first) and ``avg_rms_total`` (song-wide average level, or -30.0 when
        no energy data is available).
    """
    sections = detect_vocal_sections(transcript["segments"])
    if not sections:
        return None

    avg_rms = sum(r for (_, r) in energies) / len(energies) if energies else -30.0

    candidates = [
        {
            "start": sec["start"],
            "end": sec["end"],
            "duration": sec["end"] - sec["start"],
            "text_preview": sec["text"][:80],
            "score": round(score_section_as_chorus(sec, sections, energies, avg_rms), 2),
            "avg_rms": round(avg_energy_in_range(energies, sec["start"], sec["end"]), 2),
        }
        for sec in sections
    ]

    # Best score first; the sort is stable, so ties stay in song order.
    ranked = sorted(candidates, key=lambda c: c["score"], reverse=True)

    return {
        "best": ranked[0],
        "all_candidates": ranked[:10],
        "avg_rms_total": round(avg_rms, 2),
    }
|
|
|
|
|
|
def smart_clip_range(chorus, transcript, video_duration,
                     target_duration=30, max_duration=45, min_duration=20,
                     include_prebuild=False):
    """Choose the clip range around the detected chorus.

    Steps:
    1. Start with the best chorus candidate as the core.
    2. If it is shorter than ``min_duration``, extend it over following
       chorus-like sections (not verses) that start within 8 s.
    3. Optionally prepend a short pre-chorus section (``include_prebuild``).
    4. If still too short, extend into a nearby chorus-like section and then
       symmetrically, shifting whatever is clamped at one edge of the video
       to the other side.
    5. Cap the result at ``max_duration`` and at the video bounds.

    Args:
        chorus: Result of ``find_chorus`` (or an equivalent dict with a
            ``best`` entry); falsy values trigger the fallback.
        transcript: Transcript dict with a ``segments`` list.
        video_duration: Length of the video in seconds.
        target_duration: Clip length used by the fallback.
        max_duration: Upper bound for the clip length.
        min_duration: Lower bound the clip is extended towards.
        include_prebuild: When true, add a section that ends less than 3 s
            before the clip start and lasts less than 8 s.

    Returns:
        A dict with ``start``, ``end``, ``duration`` and ``reason``. The
        chorus-based result also has ``chorus_start`` and ``chorus_end``.
        Without a chorus the middle ``target_duration`` seconds of the video
        are returned with reason ``"fallback_middle"``.
    """
    if not chorus or not chorus.get("best"):
        # Fallback: take the middle of the video.
        mid = video_duration / 2
        start = max(0, mid - target_duration / 2)
        end = min(video_duration, start + target_duration)
        return {
            "start": round(start, 2),
            "end": round(end, 2),
            # Callers read "duration" from every clip range.
            "duration": round(end - start, 2),
            "reason": "fallback_middle",
        }

    best = chorus["best"]
    sections = detect_vocal_sections(transcript["segments"])

    actual_start = best["start"]
    actual_end = best["end"]

    # Collect ALL sections that resemble the chorus (likely repetitions).
    # NOTE(review): the comparison uses the preview text, which find_chorus
    # truncates to 80 characters, so only the opening words are compared.
    chorus_words = set(re.findall(r"\b\w+\b", best["text_preview"].lower()))
    chorus_sections = []
    if chorus_words:
        needed = len(chorus_words) * 0.4
        for sec in sections:
            sec_words = set(re.findall(r"\b\w+\b", sec["text"].lower()))
            if len(sec_words & chorus_words) >= needed:
                chorus_sections.append(sec)

    # 1. Core chorus too short: extend over the following CHORUS sections.
    if actual_end - actual_start < min_duration:
        for sec in chorus_sections:
            follows_closely = sec["start"] > actual_end and sec["start"] - actual_end < 8
            if follows_closely and sec["end"] - actual_start <= max_duration:
                actual_end = sec["end"]
                if actual_end - actual_start >= min_duration:
                    break

    # 2. Pre-chorus build-up (only when explicitly requested).
    # NOTE(review): sections produced with the default max_gap of 3.0 s are
    # more than 3 s apart, so the "gap < 3" test below can only match when
    # the clip start is not itself a section boundary — confirm the intent.
    if include_prebuild:
        pre_section = None
        for sec in sections:
            sec_duration = sec["end"] - sec["start"]
            if (sec["end"] <= actual_start
                    and actual_start - sec["end"] < 3
                    and sec_duration < 8):
                pre_section = sec
        if pre_section and actual_end - pre_section["start"] <= max_duration:
            actual_start = pre_section["start"]

    # 3. Still too short: grow into a nearby chorus section, then symmetrically.
    if actual_end - actual_start < min_duration:
        deficit = min_duration - (actual_end - actual_start)
        for sec in chorus_sections:
            if sec["start"] > actual_end and sec["start"] - actual_end < 5:
                actual_end = min(sec["end"], actual_end + deficit)
                break

        missing = min_duration - (actual_end - actual_start)
        if missing > 0:
            half = missing / 2
            actual_start = max(0, actual_start - half)
            actual_end = min(video_duration, actual_end + half)
            # One side may have been clamped at the video edge; hand the
            # remainder to the other side so min_duration is still reached
            # whenever the video is long enough.
            missing = min_duration - (actual_end - actual_start)
            if missing > 0:
                actual_end = min(video_duration, actual_end + missing)
                missing = min_duration - (actual_end - actual_start)
            if missing > 0:
                actual_start = max(0, actual_start - missing)

    # 4. Trim to the maximum length and to the video bounds.
    if actual_end - actual_start > max_duration:
        actual_end = actual_start + max_duration

    actual_start = max(0, actual_start)
    actual_end = min(video_duration, actual_end)

    return {
        "start": round(actual_start, 2),
        "end": round(actual_end, 2),
        "duration": round(actual_end - actual_start, 2),
        "reason": "smart_chorus_with_prebuild" if include_prebuild else "smart_chorus_only",
        "chorus_start": round(best["start"], 2),
        "chorus_end": round(best["end"], 2),
    }
|
|
|
|
|
|
def detect_audio_fade(clip_range, transcript, video_duration=None):
    """Pick fade-in/fade-out lengths and, if needed, a later clip end.

    Rules:
    - The clip start lies inside a transcript segment: 0.4 s fade-in,
      otherwise 0.2 s.
    - The clip end lies inside a transcript segment: 0.5 s fade-out and
      ``extended_end`` is moved to the end of that segment plus a 0.5 s
      buffer, so the fade does not cut the lyrics; otherwise 0.3 s fade-out
      and ``extended_end`` equals the clip end.

    Segment boundaries count as "inside".

    Args:
        clip_range: Dict with ``start`` and ``end`` in seconds.
        transcript: Transcript dict with a ``segments`` list.
        video_duration: When given, ``extended_end`` is capped at this value.

    Returns:
        A dict with ``fade_in``, ``fade_out``, ``extended_end`` (rounded to
        2 decimals) and ``ends_in_vocal``.
    """
    clip_start, clip_end = clip_range["start"], clip_range["end"]

    starts_in_vocal = False
    latest_vocal_end = None
    for seg in transcript["segments"]:
        if seg["start"] <= clip_start <= seg["end"]:
            starts_in_vocal = True
        if seg["start"] <= clip_end <= seg["end"]:
            # Several overlapping segments may contain the clip end; the
            # clip has to outlast the one that finishes last.
            if latest_vocal_end is None or seg["end"] > latest_vocal_end:
                latest_vocal_end = seg["end"]
    ends_in_vocal = latest_vocal_end is not None

    # Clip ends inside a segment: extend to the segment end + 0.5 s buffer.
    extended_end = clip_end
    if ends_in_vocal:
        extended_end = latest_vocal_end + 0.5
        if video_duration is not None:
            extended_end = min(extended_end, video_duration)

    fade_in = 0.4 if starts_in_vocal else 0.2
    # The fade-out stays short because the clip now ends after the vocal.
    fade_out = 0.5 if ends_in_vocal else 0.3

    return {
        "fade_in": fade_in,
        "fade_out": fade_out,
        "extended_end": round(extended_end, 2),
        "ends_in_vocal": ends_in_vocal,
    }
|
|
|
|
|
|
def _parse_claude_json(text):
    """Parse the JSON object contained in a Claude reply.

    Accepts a bare object, an object wrapped in a Markdown code fence, and an
    object surrounded by explanatory prose.

    Raises:
        ValueError: The reply contains no JSON object
            (``json.JSONDecodeError`` is a subclass).
    """
    # Claude sometimes wraps the JSON in a Markdown fence.
    if text.startswith("```"):
        text = re.sub(r"^```(?:json)?\s*", "", text)
        text = re.sub(r"\s*```$", "", text)
    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        # Tolerate prose before/after the object: retry on the outermost
        # brace pair.
        first, last = text.find("{"), text.rfind("}")
        if first == -1 or last <= first:
            raise
        parsed = json.loads(text[first:last + 1])
    if not isinstance(parsed, dict):
        raise ValueError("Claude reply is not a JSON object")
    return parsed


def analyze_with_claude(transcript, video_duration, target_duration=30):
    """Ask the Claude API to pick the best reel section from the transcript.

    The whole timestamped transcript is sent in one prompt that asks for the
    song structure, a corrected transcript and the clip range.

    Args:
        transcript: Transcript dict with a ``segments`` list.
        video_duration: Length of the video in seconds; used in the prompt
            and to validate the returned range.
        target_duration: Desired clip length in seconds (prompt only).

    Returns:
        A dict with ``start``, ``end``, ``duration``, ``reason``,
        ``chorus_text``, ``structure``, ``language``, ``corrected_segments``
        and ``source``; or ``None`` when ``ANTHROPIC_API_KEY`` is not set, the
        transcript has no segments, the request fails, or the reply is
        unusable.
    """
    api_key = os.environ.get("ANTHROPIC_API_KEY")
    if not api_key:
        print(" ⚠️ ANTHROPIC_API_KEY ni nastavljen — preskakujem Claude analizo", file=sys.stderr)
        return None

    if not transcript.get("segments"):
        return None

    # Imported before the try block so the HTTPError handler below can
    # always resolve the name.
    import urllib.error
    import urllib.request

    # Text format for Claude: one line per segment, prefixed by timestamps.
    transcript_text = "\n".join(
        f"[{seg['start']:6.1f}-{seg['end']:6.1f}] {seg['text'].strip()}"
        for seg in transcript["segments"]
    )

    prompt = f"""Tu je transcript pesmi iz Whisper modela (timestamp v sekundah, besedilo):

{transcript_text}

Cela pesem traja {video_duration:.1f}s. Cilj: izrezati ~{target_duration}s odsek za TikTok/Instagram Reel.

POMEMBNO: Whisper je avtomatski STT in pogosto naredi napake, posebej pri:
- slovanskih jezikih (slovenščina, hrvaščina, bosanščina, srbščina)
- narečnih izrazih
- ko glasba prevladuje nad vokalom

PROSIM:
1. Preberi celoten tekst in razumi strukturo (intro / verz / pre-chorus / refren / bridge / outro)
2. POPRAVI očitne napake v transkripciji:
- Če pesem ima refren ki se ponavlja, vse pojavitve refrena POPRAVI da imajo ENAKO besedilo (uporabi najjasnejšo varianto)
- Popravi napačne besede ki nimajo smisla v kontekstu
- Popravi pomešane jezike (če pesem je slovenska, vse vrstice naj bodo v slovenščini)
- Ohrani timestamp-e nepriremenjene
3. Prepoznaj REFREN: del besedila, ki se ponavlja v pesmi
4. Izberi najboljši odsek za reel:
- Vključi cel refren (cel verz besedila brez prekinitve)
- Če imaš prostor, dodaj pre-chorus build-up tik pred refrenom
- Lahko traja 20-45 sekund (ne strogo 30s)
- Začni in končaj na smiselni meji (konec stavka, ne sredi besede)
5. Če pesem nima jasnega refrena (instrumental, monolog, govor), izberi najbolj dramatičen ali zaključen del

Odgovori SAMO v JSON formatu (brez markdown, brez razlage):
{{
"start": <sekunde>,
"end": <sekunde>,
"reason": "<kratka razlaga zakaj ta odsek>",
"chorus_text": "<besedilo refrena ali ključni del>",
"structure": "<1 stavek o strukturi pesmi>",
"language": "<jezik: sl/de/hr/bs/sr/en/it/es/fr>",
"corrected_segments": [
{{"start": <s>, "end": <s>, "text": "<popravljeno besedilo>"}}
]
}}

V "corrected_segments" vključi VSE segmente iz inputa s popravljenim besedilom (ohrani timestamp-e)."""

    try:
        payload = json.dumps({
            "model": "claude-haiku-4-5-20251001",
            "max_tokens": 4096,
            "messages": [{"role": "user", "content": prompt}],
        }).encode("utf-8")

        req = urllib.request.Request(
            "https://api.anthropic.com/v1/messages",
            data=payload,
            headers={
                "Content-Type": "application/json",
                "x-api-key": api_key,
                "anthropic-version": "2023-06-01",
            },
            method="POST",
        )
        with urllib.request.urlopen(req, timeout=60) as resp:
            data = json.loads(resp.read().decode("utf-8"))

        content = data.get("content", [])
        if not content:
            print(" ⚠️ Claude vrnil prazen odgovor", file=sys.stderr)
            return None

        if data.get("stop_reason") == "max_tokens":
            # A reply cut off at the token limit usually ends mid-JSON; say
            # so explicitly instead of only reporting a parse error.
            print(" ⚠️ Claude reply hit max_tokens and may be truncated", file=sys.stderr)

        # The reply may consist of several content blocks; use all text ones.
        text = "".join(
            block.get("text", "")
            for block in content
            if isinstance(block, dict) and block.get("type", "text") == "text"
        ).strip()
        result = _parse_claude_json(text)

        # Sanity check of the proposed range.
        start = float(result["start"])
        end = float(result["end"])
        # The prompt states the duration rounded to 0.1 s and segment ends
        # can overshoot it slightly, so a small overshoot is clamped rather
        # than rejected.
        if video_duration < end <= video_duration + 1.0:
            end = video_duration
        if (not (math.isfinite(start) and math.isfinite(end))
                or start >= end or start < 0 or end > video_duration):
            print(f" ⚠️ Claude returned invalid range: {start}-{end}", file=sys.stderr)
            return None

        # A JSON null must not break the slicing below.
        reason = str(result.get("reason") or "")
        structure = str(result.get("structure") or "")

        print(f" 🤖 Claude izbral: {start:.1f}-{end:.1f}s", file=sys.stderr)
        print(f" Razlog: {reason[:80]}", file=sys.stderr)
        print(f" Struktura: {structure[:80]}", file=sys.stderr)
        corrected = result.get("corrected_segments")
        if corrected:
            print(f" Popravljeni segmenti: {len(corrected)}", file=sys.stderr)

        return {
            "start": round(start, 2),
            "end": round(end, 2),
            "duration": round(end - start, 2),
            "reason": reason,
            "chorus_text": result.get("chorus_text", ""),
            "structure": structure,
            "language": result.get("language"),
            "corrected_segments": corrected,
            "source": "claude_llm",
        }
    except urllib.error.HTTPError as e:
        detail = e.read().decode("utf-8", errors="replace")[:500]
        print(f" ❌ Claude API HTTP {e.code}: {detail}", file=sys.stderr)
        return None
    except Exception as e:
        # Best-effort step: any other failure falls back to the local
        # heuristic in the caller.
        print(f" ❌ Claude analysis failed: {e}", file=sys.stderr)
        return None
|
|
|
|
|
|
def is_instrumental(transcript, video_duration, threshold=0.1):
    """Decide whether a song should be treated as instrumental.

    A song is instrumental when it has no transcript segments, or when the
    summed length of its segments is below ``threshold`` times the video
    duration (the duration is floored at 1 to avoid dividing by zero).

    Args:
        transcript: Transcript dict; ``segments`` may be missing or empty.
        video_duration: Length of the video in seconds.
        threshold: Minimum vocal share for a song to count as vocal.

    Returns:
        ``True`` for instrumental songs, ``False`` otherwise.
    """
    segments = transcript.get("segments")
    if not segments:
        return True

    sung = 0
    for seg in segments:
        sung += seg["end"] - seg["start"]

    vocal_share = sung / max(video_duration, 1)
    return bool(vocal_share < threshold)
|
|
|
|
|
|
def _energy_peak_chorus(energies, duration, window):
    """Build a chorus-like result for instrumentals from the loudest window.

    Slides a ``window``-second range over the audio in 5 s steps and keeps
    the first range with the highest average level.
    """
    best_start = 0
    best_avg = -100
    t = 0
    while t + window <= duration:
        avg = avg_energy_in_range(energies, t, t + window)
        if avg > best_avg:
            best_avg = avg
            best_start = t
        t += 5  # step 5s
    return {
        "best": {
            "start": best_start,
            "end": best_start + window,
            "duration": window,
            "text_preview": "(instrumental — energy peak)",
            "score": 0,
            "avg_rms": round(best_avg, 2),
        },
        "all_candidates": [],
        "avg_rms_total": round(
            sum(r for (_, r) in energies) / len(energies) if energies else -30, 2
        ),
    }


def _apply_corrected_segments(transcript, corrected):
    """Replace the transcript segments with Claude's corrected texts.

    Word-level timing is copied from the original segment whose start matches
    the corrected one (same value rounded to 0.1 s, otherwise the closest
    start within 1 s). Malformed entries are skipped.

    Returns:
        The number of segments written; ``0`` means the transcript was left
        untouched because no usable correction was found.
    """
    if not isinstance(corrected, list):
        return 0

    originals = transcript["segments"]
    orig_by_start = {round(s["start"], 1): s for s in originals}

    new_segments = []
    for item in corrected:
        try:
            cs_start = float(item["start"])
            cs_end = float(item["end"])
            cs_text = str(item["text"]).strip()
        except (KeyError, ValueError, TypeError):
            continue

        orig = orig_by_start.get(round(cs_start, 1))
        if not orig:
            nearby = [s for s in originals if abs(s["start"] - cs_start) < 1.0]
            orig = min(nearby, key=lambda s: abs(s["start"] - cs_start), default=None)

        new_segments.append({
            "start": cs_start,
            "end": cs_end,
            "text": cs_text,
            # Claude does not return words, so word timing cannot be
            # updated; keep the original one when available.
            "words": orig.get("words", []) if orig else [],
        })

    # Never trade a usable transcript for an empty one.
    if not new_segments:
        return 0
    transcript["segments"] = new_segments
    return len(new_segments)


def main():
    """Command-line entry point: analyse a video and emit the clip plan.

    Runs transcription, the energy profile, instrumental detection, chorus
    detection (Claude first, local heuristic as fallback) and fade detection.
    The result is written to ``--output`` and/or printed as JSON to stdout
    with ``--json``; progress messages go to stderr. Exits with status 1 when
    the video is missing or its audio cannot be extracted.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("video", help="Vhod video file")
    ap.add_argument("--lang", default=None, help="ISO 639-1 ali 'auto' (default: auto)")
    ap.add_argument("--model", default="large-v3", help="Whisper model")
    ap.add_argument("--target-duration", type=float, default=30.0)
    ap.add_argument("--max-duration", type=float, default=45.0)
    ap.add_argument("--min-duration", type=float, default=20.0)
    ap.add_argument("--include-prebuild", action="store_true",
                    help="Vključi pre-chorus build-up (privzeto: ne)")
    ap.add_argument("--no-claude", action="store_true",
                    help="Preskoči Claude LLM analizo (uporabi samo lokalno heuristiko)")
    ap.add_argument("--json", action="store_true", help="Output JSON")
    ap.add_argument("--output", help="Path za JSON output")
    args = ap.parse_args()

    video = Path(args.video)
    if not video.exists():
        print(f"❌ Video ne obstaja: {video}", file=sys.stderr)
        sys.exit(1)

    duration = get_video_duration(video)
    print(f"📹 Video: {video.name}, {duration:.1f}s", file=sys.stderr)

    # 1. Extract audio
    try:
        audio = extract_audio(video)
    except (subprocess.CalledProcessError, OSError) as e:
        # Report a readable error instead of a traceback.
        print(f"❌ Audio extraction failed: {e}", file=sys.stderr)
        detail = getattr(e, "stderr", None)
        if isinstance(detail, bytes):
            detail = detail.decode("utf-8", errors="replace")
        if detail:
            print(detail.strip()[-500:], file=sys.stderr)
        sys.exit(1)

    try:
        # 2. Whisper transcript
        lang = None if args.lang in (None, "auto", "") else args.lang
        transcript = transcribe_full(audio, lang=lang, model_size=args.model)
        print(f" Transkripcija: {len(transcript['segments'])} segmentov", file=sys.stderr)

        # 3. Energy profile
        print("⚡ Energy profile...", file=sys.stderr)
        energies = compute_energy_profile(audio)
        print(f" Energy samples: {len(energies)}", file=sys.stderr)

        # 4. Instrumental detection
        instrumental = is_instrumental(transcript, duration)
        print(f"🎵 Instrumentalna: {instrumental}", file=sys.stderr)

        # 5a. PRIMARY: Claude LLM analysis (understands the whole lyrics)
        claude_result = None
        if not instrumental and not args.no_claude:
            print("🤖 Pošiljam transkript Claude-u za analizo strukture...", file=sys.stderr)
            claude_result = analyze_with_claude(
                transcript, duration, target_duration=args.target_duration
            )

        # 5b. Local chorus detection (fallback and score preview); for
        # instrumentals use the loudest window instead.
        if not instrumental:
            chorus = find_chorus(transcript, energies, duration)
        else:
            chorus = _energy_peak_chorus(energies, duration, args.target_duration)

        # 6. Clip range — Claude wins, otherwise the local heuristic.
        if claude_result:
            clip_range = {
                "start": claude_result["start"],
                "end": claude_result["end"],
                "duration": claude_result["duration"],
                "reason": "claude_llm: " + claude_result.get("reason", ""),
                "chorus_text": claude_result.get("chorus_text", ""),
                "structure": claude_result.get("structure", ""),
                "source": "claude",
            }
            # Apply the max_duration cap when Claude overshoots.
            if clip_range["duration"] > args.max_duration:
                clip_range["end"] = clip_range["start"] + args.max_duration
                clip_range["duration"] = args.max_duration
                clip_range["reason"] += " (capped at max_duration)"
        else:
            clip_range = smart_clip_range(
                chorus, transcript, duration,
                target_duration=args.target_duration,
                max_duration=args.max_duration,
                min_duration=args.min_duration,
                include_prebuild=args.include_prebuild,
            )
            clip_range["source"] = "local_heuristic"
        # The fallback range of smart_clip_range may come without a
        # duration; everything below expects one.
        clip_range.setdefault(
            "duration", round(clip_range["end"] - clip_range["start"], 2)
        )
        print(f"✂ Clip range: {clip_range['start']:.1f}s - {clip_range['end']:.1f}s "
              f"(duration: {clip_range['duration']}s, source: {clip_range.get('source')})",
              file=sys.stderr)

        # Use Claude's corrected segments when available (better subtitles).
        corrected_count = 0
        if claude_result and claude_result.get("corrected_segments"):
            corrected_count = _apply_corrected_segments(
                transcript, claude_result["corrected_segments"]
            )
        if corrected_count:
            transcript["claude_corrected"] = True
            # Also update the language when Claude disagrees.
            if claude_result.get("language") and claude_result["language"] != transcript["language"]:
                print(f" ✏️ Claude je popravil jezik: {transcript['language']} → {claude_result['language']}", file=sys.stderr)
                transcript["language"] = claude_result["language"]
            print(f" ✏️ Whisper segmenti zamenjani s Claude popravljenimi ({corrected_count})", file=sys.stderr)

        # 7. Fade params (may move the clip end when it falls mid-vocal)
        fade = detect_audio_fade(clip_range, transcript, video_duration=duration)
        print(f"🎚 Fade: in={fade['fade_in']}s, out={fade['fade_out']}s", file=sys.stderr)

        # Apply the extended end, still respecting max_duration.
        if fade.get("extended_end") and fade["extended_end"] > clip_range["end"]:
            old_end = clip_range["end"]
            new_end = min(fade["extended_end"], clip_range["start"] + args.max_duration)
            clip_range["end"] = round(new_end, 2)
            clip_range["duration"] = round(new_end - clip_range["start"], 2)
            print(f" ↳ Razširjen za {new_end - old_end:.1f}s (zaključek besedila)",
                  file=sys.stderr)

        result = {
            "video": str(video),
            "video_duration": duration,
            "language": transcript["language"],
            "language_probability": transcript["language_probability"],
            "instrumental": instrumental,
            "transcript": transcript,
            "chorus": chorus,
            "clip_range": clip_range,
            "fade": fade,
            "claude_used": claude_result is not None,
            # True only when corrected segments were actually applied.
            "claude_corrected_text": bool(corrected_count),
        }

        if args.output:
            with open(args.output, "w", encoding="utf-8") as f:
                json.dump(result, f, ensure_ascii=False, indent=2)
            print(f"💾 Saved: {args.output}", file=sys.stderr)

        if args.json:
            print(json.dumps(result, ensure_ascii=False))

    finally:
        # The extracted WAV is a temp file owned by this function.
        try:
            os.unlink(audio)
        except OSError:
            pass


if __name__ == "__main__":
    main()
|