def generate_srt_from_segments(segments, clip_start, clip_end, output_path,
                               max_chunk_duration=2.5):
    """Write an SRT file covering only the part of a transcript inside a clip.

    Segments that overlap ``[clip_start, clip_end]`` are clipped to that
    window and their timestamps are shifted so the clip starts at 0 (matching
    the trimmed video). Segments longer than ``max_chunk_duration`` seconds
    are split into equal-length cues for fast, reels-style pacing, with the
    words spread as evenly as possible across the cues. All text is
    upper-cased.

    Args:
        segments: Iterable of mappings with ``"start"``, ``"end"`` (seconds,
            anything ``float()`` accepts) and ``"text"`` keys.
        clip_start: Clip start in seconds on the original timeline.
        clip_end: Clip end in seconds on the original timeline.
        output_path: Destination path of the ``.srt`` file (UTF-8).
        max_chunk_duration: Longest cue, in seconds, before a segment is
            split. A segment is never split into more cues than it has
            words, so a cue may exceed this when the segment has few words.

    Returns:
        ``output_path``, unchanged.

    Raises:
        KeyError, ValueError, TypeError: If a segment is missing a key or a
            timestamp cannot be converted to ``float``.
        OSError: If ``output_path`` cannot be written.
    """
    import math  # local import: keeps the block self-contained in this module

    MIN_CUE_DURATION = 0.2  # cues shorter than this are unreadable; drop them

    def fmt_ts(seconds):
        # Round once to integer milliseconds and derive every field from that
        # integer; formatting "seconds % 60" with %.3f can round up to
        # "60,000", which is not a valid SRT timestamp.
        total_ms = max(0, int(round(seconds * 1000)))
        hours, rest = divmod(total_ms, 3_600_000)
        minutes, rest = divmod(rest, 60_000)
        secs, millis = divmod(rest, 1000)
        return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"

    clip_start = float(clip_start)
    clip_end = float(clip_end)

    cues = []

    def add_cue(start, end, cue_text):
        cues.append(f"{len(cues) + 1}\n{fmt_ts(start)} --> {fmt_ts(end)}\n{cue_text}\n")

    for seg in segments:
        seg_start = float(seg["start"])
        seg_end = float(seg["end"])
        text = str(seg["text"]).strip()

        # Skip segments entirely outside the clip window.
        if seg_end <= clip_start or seg_start >= clip_end:
            continue

        # Clip to the window.
        seg_start = max(seg_start, clip_start)
        seg_end = min(seg_end, clip_end)
        if seg_end - seg_start < MIN_CUE_DURATION:
            continue
        if not text:
            continue

        # Re-map onto the 0-based timeline of the trimmed video.
        rel_start = seg_start - clip_start
        rel_end = seg_end - clip_start
        text_upper = text.upper()

        duration = rel_end - rel_start
        if duration <= max_chunk_duration:
            add_cue(rel_start, rel_end, text_upper)
            continue

        # Split into equal-duration cues. No word-level timing is used here,
        # so words are simply distributed evenly. Cap the number of cues at
        # the number of words so no cue is empty (or repeats the whole text).
        words = text_upper.split()
        n_parts = math.ceil(duration / max_chunk_duration)
        n_parts = max(1, min(n_parts, len(words)))
        chunk_dur = duration / n_parts
        base, extra = divmod(len(words), n_parts)

        cursor = 0
        for i in range(n_parts):
            take = base + (1 if i < extra else 0)
            chunk_words = words[cursor:cursor + take]
            cursor += take
            cue_start = rel_start + i * chunk_dur
            # End the last cue exactly at the segment end (no float drift).
            cue_end = rel_end if i == n_parts - 1 else rel_start + (i + 1) * chunk_dur
            add_cue(cue_start, cue_end, " ".join(chunk_words))

    with open(output_path, "w", encoding="utf-8") as f:
        # Each cue already ends with "\n"; joining with "\n" yields the blank
        # separator line SRT requires between cues.
        f.write("\n".join(cues))
    return output_path
def process_job(job_id): cmd += ["--fade-in", str(job["fade_in"])] if job.get("fade_out", 0) > 0: cmd += ["--fade-out", str(job["fade_out"])] + # SRT iz Claude (boljše besedilo) — preda direktno v subtitle.py + if job.get("claude_srt_path") and Path(job["claude_srt_path"]).exists() and not job.get("no_subs"): + cmd += ["--srt", job["claude_srt_path"]] # lang: prefer detected_language če auto chosen_lang = job.get("lang") if chosen_lang in (None, "auto", ""): diff --git a/scripts/analyze.py b/scripts/analyze.py index 9402d2b..9ed4b48 100644 --- a/scripts/analyze.py +++ b/scripts/analyze.py @@ -464,21 +464,31 @@ def analyze_with_claude(transcript, video_duration, target_duration=30): lines.append(f"[{start:6.1f}-{end:6.1f}] {text}") transcript_text = "\n".join(lines) - prompt = f"""Tu je transcript pesmi (timestamp v sekundah, besedilo): + prompt = f"""Tu je transcript pesmi iz Whisper modela (timestamp v sekundah, besedilo): {transcript_text} Cela pesem traja {video_duration:.1f}s. Cilj: izrezati ~{target_duration}s odsek za TikTok/Instagram Reel. +POMEMBNO: Whisper je avtomatski STT in pogosto naredi napake, posebej pri: +- slovanskih jezikih (slovenščina, hrvaščina, bosanščina, srbščina) +- narečnih izrazih +- ko glasba prevladuje nad vokalom + PROSIM: 1. Preberi celoten tekst in razumi strukturo (intro / verz / pre-chorus / refren / bridge / outro) -2. Prepoznaj REFREN: del besedila, ki se ponavlja v pesmi (običajno 2-3x z istim ali zelo podobnim besedilom) -3. Izberi najboljši odsek za reel: +2. POPRAVI očitne napake v transkripciji: + - Če pesem ima refren ki se ponavlja, vse pojavitve refrena POPRAVI da imajo ENAKO besedilo (uporabi najjasnejšo varianto) + - Popravi napačne besede ki nimajo smisla v kontekstu + - Popravi pomešane jezike (če pesem je slovenska, vse vrstice naj bodo v slovenščini) + - Ohrani timestamp-e nepriremenjene +3. Prepoznaj REFREN: del besedila, ki se ponavlja v pesmi +4. 
Izberi najboljši odsek za reel: - Vključi cel refren (cel verz besedila brez prekinitve) - Če imaš prostor, dodaj pre-chorus build-up tik pred refrenom - Lahko traja 20-45 sekund (ne strogo 30s) - Začni in končaj na smiselni meji (konec stavka, ne sredi besede) -4. Če pesem nima jasnega refrena (instrumental, monolog, govor), izberi najbolj dramatičen ali zaključen del +5. Če pesem nima jasnega refrena (instrumental, monolog, govor), izberi najbolj dramatičen ali zaključen del Odgovori SAMO v JSON formatu (brez markdown, brez razlage): {{ @@ -486,15 +496,21 @@ Odgovori SAMO v JSON formatu (brez markdown, brez razlage): "end": , "reason": "", "chorus_text": "", - "structure": "<1 stavek o strukturi pesmi>" -}}""" + "structure": "<1 stavek o strukturi pesmi>", + "language": "", + "corrected_segments": [ + {{"start": , "end": , "text": ""}} + ] +}} + +V "corrected_segments" vključi VSE segmente iz inputa s popravljenim besedilom (ohrani timestamp-e).""" try: import urllib.request import urllib.error body = json.dumps({ "model": "claude-haiku-4-5-20251001", - "max_tokens": 1024, + "max_tokens": 4096, "messages": [{"role": "user", "content": prompt}], }).encode("utf-8") @@ -533,6 +549,9 @@ Odgovori SAMO v JSON formatu (brez markdown, brez razlage): print(f" 🤖 Claude izbral: {start:.1f}-{end:.1f}s", file=sys.stderr) print(f" Razlog: {result.get('reason', '')[:80]}", file=sys.stderr) print(f" Struktura: {result.get('structure', '')[:80]}", file=sys.stderr) + cs = result.get("corrected_segments") + if cs: + print(f" Popravljeni segmenti: {len(cs)}", file=sys.stderr) return { "start": round(start, 2), @@ -541,6 +560,8 @@ Odgovori SAMO v JSON formatu (brez markdown, brez razlage): "reason": result.get("reason", ""), "chorus_text": result.get("chorus_text", ""), "structure": result.get("structure", ""), + "language": result.get("language"), + "corrected_segments": result.get("corrected_segments"), "source": "claude_llm", } except urllib.error.HTTPError as e: @@ -676,6 +697,45 
@@ def main(): f"(duration: {clip_range['duration']}s, source: {clip_range.get('source')})", file=sys.stderr) + # Če Claude je vrnil popravljene segmente, jih uporabi (boljši za podnapise) + if claude_result and claude_result.get("corrected_segments"): + corrected = claude_result["corrected_segments"] + # Ohrani word-level timing iz originala, posodobi samo text + orig_by_start = {round(s["start"], 1): s for s in transcript["segments"]} + new_segments = [] + for cs in corrected: + try: + cs_start = float(cs["start"]) + cs_end = float(cs["end"]) + cs_text = str(cs["text"]).strip() + except (KeyError, ValueError, TypeError): + continue + # Najdi originalni segment z istim start (ali blizu) za word-level timing + orig = orig_by_start.get(round(cs_start, 1)) + if not orig: + # Najdi najbližji + closest_diff = 999 + for s in transcript["segments"]: + diff = abs(s["start"] - cs_start) + if diff < closest_diff and diff < 1.0: + closest_diff = diff + orig = s + new_segments.append({ + "start": cs_start, + "end": cs_end, + "text": cs_text, + # Word-level timing ne moremo posodabljati ker Claude ne vrača besede, + # ampak ohranimo če imamo + "words": orig.get("words", []) if orig else [], + }) + transcript["segments"] = new_segments + transcript["claude_corrected"] = True + # Posodobi tudi jezik če Claude je drugačnega mnenja + if claude_result.get("language") and claude_result["language"] != transcript["language"]: + print(f" ✏️ Claude je popravil jezik: {transcript['language']} → {claude_result['language']}", file=sys.stderr) + transcript["language"] = claude_result["language"] + print(f" ✏️ Whisper segmenti zamenjani s Claude popravljenimi ({len(new_segments)})", file=sys.stderr) + # 7. 
Fade params (lahko razširi clip end če konča sredi vokala) fade = detect_audio_fade(clip_range, transcript, video_duration=duration) print(f"🎚 Fade: in={fade['fade_in']}s, out={fade['fade_out']}s", file=sys.stderr) @@ -699,6 +759,8 @@ def main(): "chorus": chorus, "clip_range": clip_range, "fade": fade, + "claude_used": claude_result is not None, + "claude_corrected_text": bool(claude_result and claude_result.get("corrected_segments")), } if args.output: diff --git a/scripts/clip.py b/scripts/clip.py index a26d868..2848591 100644 --- a/scripts/clip.py +++ b/scripts/clip.py @@ -46,10 +46,11 @@ SCRIPT_DIR = Path(__file__).parent def run_clip(src, dst, start, duration, mode, lang, model, style, no_subs, quality, - fade_in=0.0, fade_out=0.0): + fade_in=0.0, fade_out=0.0, srt_path=None): """Naredi en klip src → dst.""" print(f"🎯 run_clip args: src={src}, dst={dst}, start={start!r}, duration={duration!r}, " - f"mode={mode}, fade_in={fade_in}, fade_out={fade_out}", file=sys.stderr) + f"mode={mode}, fade_in={fade_in}, fade_out={fade_out}, " + f"srt={'yes' if srt_path else 'no'}", file=sys.stderr) tmp = tempfile.mkdtemp(prefix="reel_") try: reframed = Path(tmp) / "reframed.mp4" @@ -88,6 +89,8 @@ def run_clip(src, dst, start, duration, mode, lang, model, style, no_subs, quali ] if lang: cmd += ["--lang", lang] + if srt_path: + cmd += ["--srt", str(srt_path)] r = subprocess.run(cmd) if r.returncode != 0: print(f"❌ Subtitle napaka — shranim brez", file=sys.stderr) @@ -114,6 +117,7 @@ def main(): ap.add_argument("--style", default="reels", choices=["reels", "yellow", "minimal"]) ap.add_argument("--no-subs", action="store_true") ap.add_argument("--quality", default="medium", choices=["fast", "medium", "high"]) + ap.add_argument("--srt", default=None, help="Že-pripravljen SRT (preskoči Whisper)") args = ap.parse_args() src = Path(args.input) @@ -136,7 +140,8 @@ def main(): start = parse_ts(args.start) if args.start else None run_clip(src, Path(args.output), start, args.duration, 
args.mode, args.lang, args.model, args.style, args.no_subs, args.quality, - fade_in=args.fade_in, fade_out=args.fade_out) + fade_in=args.fade_in, fade_out=args.fade_out, + srt_path=args.srt) if __name__ == "__main__": diff --git a/scripts/subtitle.py b/scripts/subtitle.py index 4539712..3611ec9 100644 --- a/scripts/subtitle.py +++ b/scripts/subtitle.py @@ -282,6 +282,7 @@ def main(): ap.add_argument("--model", default="small", choices=["tiny", "base", "small", "medium", "large-v3"]) ap.add_argument("--style", default="reels", choices=list(SUBTITLE_STYLES.keys())) ap.add_argument("--keep-srt", action="store_true", help="Ohrani .srt poleg output") + ap.add_argument("--srt", default=None, help="Že-pripravljen SRT (preskoči Whisper transkripcijo)") args = ap.parse_args() src = Path(args.input) @@ -289,14 +290,21 @@ def main(): print(f"❌ {src} ne obstaja", file=sys.stderr) sys.exit(1) - srt = transcribe(src, lang=args.lang, model_size=args.model) + if args.srt and Path(args.srt).exists(): + print(f"📄 Uporabljam že-pripravljen SRT: {args.srt}") + srt = args.srt + srt_was_provided = True + else: + srt = transcribe(src, lang=args.lang, model_size=args.model) + srt_was_provided = False + burn_subtitles(src, srt, args.output, style=args.style) - if args.keep_srt: + if args.keep_srt and not srt_was_provided: keep_path = Path(args.output).with_suffix(".srt") os.rename(srt, keep_path) print(f"💾 SRT shranjen: {keep_path}") - else: + elif not srt_was_provided: os.unlink(srt)