#!/usr/bin/env python3
"""
subtitle.py — Generate subtitles from a video and burn them into the output.

Uses faster-whisper for transcription and FFmpeg for the burn-in.

Examples:
  python3 subtitle.py video.mp4 video_sub.mp4
  python3 subtitle.py video.mp4 video_sub.mp4 --lang sl --model small
  python3 subtitle.py video.mp4 video_sub.mp4 --style reels  # large white centred text
"""
import argparse
import os
import re
import shutil
import subprocess
import sys
import tempfile
from pathlib import Path

# Longest time (seconds) a single subtitle cue is emitted for; longer cues are
# split into equal parts that repeat the same text.
MAX_CHUNK_DURATION = 2.5

# Number of words grouped into one cue (a sentence-ending word closes a group
# early).
WORDS_PER_CHUNK = 4

# Length (seconds) of each audio sample used for language detection.
SAMPLE_SECONDS = 30

# Duration (seconds) assumed when ffprobe cannot report one.
FALLBACK_DURATION = 180.0

# One SRT timestamp, e.g. "00:00:09,520". A "." millisecond separator is
# accepted as well because hand-made SRT files sometimes use it.
_SRT_TIME = r"\d+:\d{1,2}:\d{1,2}[,.]\d{3}"
_SRT_TIME_RE = re.compile(r"(\d+):(\d{1,2}):(\d{1,2})[,.](\d{3})")
_SRT_TIMECODE_RE = re.compile("(" + _SRT_TIME + r")\s*-->\s*(" + _SRT_TIME + ")")
_BLANK_LINE_RE = re.compile(r"\n\s*\n")


def normalize_lang(lang):
    """Return a language code for Whisper, or None for auto-detection.

    ``None``, an empty string and the literal ``"auto"`` (advertised by the
    ``--lang`` help text) all mean "auto-detect".
    """
    if lang is None:
        return None
    cleaned = lang.strip().lower()
    if cleaned in ("", "auto"):
        return None
    return cleaned


def probe_duration(video):
    """Return the media duration in seconds as reported by ffprobe.

    Falls back to FALLBACK_DURATION when ffprobe is missing or its output
    cannot be parsed as a number.
    """
    try:
        proc = subprocess.run(
            ["ffprobe", "-v", "error", "-show_entries", "format=duration",
             "-of", "default=nw=1:nokey=1", str(video)],
            capture_output=True, text=True,
        )
        return float(proc.stdout.strip())
    except (OSError, ValueError):
        return FALLBACK_DURATION


def sample_start_offsets(duration):
    """Return the start offsets (seconds) of the language-detection samples.

    Three candidates are considered: after the intro, around the middle and
    towards the end. A candidate is dropped when fewer than 5 seconds of
    media remain after it.
    """
    candidates = [
        max(15, duration * 0.15),  # after the intro, probably verse 1
        duration * 0.45,           # roughly the middle, chorus
        duration * 0.75,           # last chorus
    ]
    return [ss for ss in candidates if ss + 5 <= duration]


def detect_language_robust(video, model):
    """Detect the language by voting over several audio samples.

    1. Take up to three 30 s samples from different parts of the song.
    2. Run each sample through ``model.transcribe`` with auto-detection.
    3. Return the language with the highest cumulative probability.

    Locking one language up front prevents Whisper from switching language
    in the middle of a song.

    Args:
        video: Path of the input media file.
        model: Object exposing ``transcribe(path, language=..., vad_filter=...)``
            that returns ``(segments, info)``, where ``info`` has ``language``
            and ``language_probability`` attributes.

    Returns:
        The winning language code, or None when no sample could be analysed.
    """
    duration = probe_duration(video)

    lang_votes = {}  # lang -> cumulative probability
    for ss in sample_start_offsets(duration):
        sample = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
        sample.close()
        try:
            # Extraction lives inside the try so that one failed sample is
            # reported and skipped (and its temp file removed) instead of
            # aborting the whole run.
            subprocess.run(
                ["ffmpeg", "-y", "-ss", str(ss), "-i", str(video),
                 "-t", str(SAMPLE_SECONDS), "-vn", "-ac", "1", "-ar", "16000",
                 "-c:a", "pcm_s16le", sample.name],
                check=True, capture_output=True,
            )
            _, sample_info = model.transcribe(
                sample.name, language=None, vad_filter=False
            )
            lang = sample_info.language
            prob = float(sample_info.language_probability)
            lang_votes[lang] = lang_votes.get(lang, 0) + prob
            print(f" sample @ {ss:.0f}s: {lang} (p={prob:.2f})")
        except Exception as e:  # best effort: any failing sample is skipped
            print(f" sample @ {ss:.0f}s: failed ({e})")
        finally:
            try:
                os.unlink(sample.name)
            except OSError:
                pass

    if not lang_votes:
        return None
    best_lang = max(lang_votes.items(), key=lambda x: x[1])
    print(f" 🎯 Locked language: {best_lang[0]} (cumulative p={best_lang[1]:.2f})")
    return best_lang[0]


def format_srt_timestamp(seconds):
    """Format a time in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    The value is rounded to whole milliseconds first, so a value such as
    59.9996 carries into the minutes field instead of producing an invalid
    ``00:00:60,000``. Negative values are clamped to zero.
    """
    total_ms = max(0, int(round(seconds * 1000)))
    hours, rest = divmod(total_ms, 3600000)
    minutes, rest = divmod(rest, 60000)
    secs, millis = divmod(rest, 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"


def split_cue(start, end, max_duration=MAX_CHUNK_DURATION):
    """Split the interval ``[start, end]`` into parts of at most ``max_duration``.

    Returns a list of ``(start, end)`` tuples. An interval that already fits
    is returned unchanged; a longer one is cut into equal parts.
    """
    duration = end - start
    if duration <= max_duration:
        return [(start, end)]
    n_parts = int(duration / max_duration) + 1
    part = duration / n_parts
    return [(start + i * part, start + (i + 1) * part) for i in range(n_parts)]


def segment_cues(seg):
    """Yield ``(start, end, text)`` cues for one transcription segment.

    Words are grouped WORDS_PER_CHUNK at a time; a word ending in ".", "?"
    or "!" closes the current group early. The text is upper-cased. A segment
    without word timestamps yields a single cue covering the whole segment.

    ``seg`` must expose ``start``, ``end``, ``text`` and ``words``; each word
    must expose ``start``, ``end`` and ``word``.
    """
    words = seg.words or []
    if not words:
        yield seg.start, seg.end, seg.text.strip().upper()
        return

    group = []
    for w in words:
        group.append(w)
        if len(group) >= WORDS_PER_CHUNK or w.word.strip().endswith((".", "?", "!")):
            yield (group[0].start, group[-1].end,
                   "".join(g.word for g in group).strip().upper())
            group = []
    if group:
        yield (group[0].start, group[-1].end,
               "".join(g.word for g in group).strip().upper())


def write_srt(segments, out):
    """Write ``segments`` to the text stream ``out`` in SRT format.

    Cues with empty text are skipped. Returns the number of cues written.
    """
    idx = 1
    for seg in segments:
        for start, end, text in segment_cues(seg):
            if not text:
                continue
            for part_start, part_end in split_cue(start, end):
                out.write(
                    f"{idx}\n{format_srt_timestamp(part_start)} --> "
                    f"{format_srt_timestamp(part_end)}\n{text}\n\n"
                )
                idx += 1
    return idx - 1


def transcribe(video, lang=None, model_size="small"):
    """Transcribe ``video`` with faster-whisper and return the path of an SRT file.

    Args:
        video: Path of the input media file.
        lang: Language code, or None / "auto" to detect it with
            ``detect_language_robust``.
        model_size: Whisper model name passed to ``WhisperModel``.

    Returns:
        Path (str) of a temporary ``.srt`` file. The caller owns the file and
        is responsible for deleting it.
    """
    from faster_whisper import WhisperModel

    lang = normalize_lang(lang)
    print(f"🧠 Whisper model: {model_size}, lang={lang or 'auto'}")
    model = WhisperModel(model_size, device="cpu", compute_type="int8")

    # Auto-detect with 3-sample voting (prevents a language switch mid-song).
    if not lang:
        print(" 🔍 Robust auto-detect (3 sampli)...")
        lang = detect_language_robust(video, model)
        if lang:
            print(f" ✅ Lang lock: {lang}")
        else:
            print(" ⚠️ Detection failed, fallback na auto per-segment")

    segments, info = model.transcribe(
        str(video),
        language=lang,  # fixed for the whole video
        word_timestamps=True,
        # The VAD filter sometimes drops vocals sung over music, so it stays
        # off for songs.
        vad_filter=False,
        # Anti-hallucination settings:
        # - condition_on_previous_text: do not carry errors forward
        # - temperature=0: deterministic (no "creative" guessing)
        # - compression_ratio_threshold: catches repetitive hallucinations
        # - log_prob_threshold: rejects low-probability segments
        # - no_speech_threshold: skips silent parts more aggressively
        condition_on_previous_text=False,
        temperature=0.0,
        compression_ratio_threshold=2.4,
        log_prob_threshold=-1.0,
        no_speech_threshold=0.6,
    )
    print(f" Detekcija: {info.language} (p={info.language_probability:.2f})")

    # Word-level chunked subtitles: a few words at a time, at most
    # MAX_CHUNK_DURATION seconds per cue for fast reels-style pacing, all in
    # upper case.
    srt_file = tempfile.NamedTemporaryFile(
        suffix=".srt", delete=False, mode="w", encoding="utf-8"
    )
    try:
        # `segments` is consumed here while writing, so a failure can surface
        # in this block; the handle is closed by the `with` either way.
        with srt_file:
            count = write_srt(segments, srt_file)
    except BaseException:
        # Do not leave a half-written temp file behind (also on Ctrl-C).
        try:
            os.unlink(srt_file.name)
        except OSError:
            pass
        raise

    print(f"📝 SRT: {srt_file.name} ({count} segmentov)")
    return srt_file.name


# libass ``force_style`` strings. They define the style names accepted by
# ``--style``; the actual rendering parameters used for the burn-in live in
# ASS_STYLE_PARAMS below.
SUBTITLE_STYLES = {
    "reels": (
        # Large white letters with a thick black outline, in the lower third
        "FontName=Arial,FontSize=42,Bold=1,"
        "PrimaryColour=&HFFFFFF,OutlineColour=&H000000,"
        "Outline=4,Shadow=1,Alignment=2,MarginV=120,BorderStyle=1"
    ),
    "yellow": (
        "FontName=Arial,FontSize=42,Bold=1,"
        "PrimaryColour=&H00FFFF,OutlineColour=&H000000,"
        "Outline=4,Shadow=1,Alignment=2,MarginV=120,BorderStyle=1"
    ),
    "minimal": (
        "FontName=Arial,FontSize=28,"
        "PrimaryColour=&HFFFFFF,OutlineColour=&H000000,"
        "Outline=2,Shadow=0,Alignment=2,MarginV=80,BorderStyle=1"
    ),
}

# Per-style values for the generated ASS header (PlayRes 1080x1920):
# (PrimaryColour, Fontsize, Bold, Outline, Shadow, MarginV).
# "minimal" is scaled from "reels" by the same ratios SUBTITLE_STYLES uses
# between the two (FontSize 28/42, Outline 2/4, MarginV 80/120, no bold, no
# shadow).
ASS_STYLE_PARAMS = {
    "reels": ("&H00FFFFFF", 56, 1, 5, 1, 400),    # white
    "yellow": ("&H0000FFFF", 56, 1, 5, 1, 400),   # yellow
    "minimal": ("&H00FFFFFF", 37, 0, 2.5, 0, 267),
}


def build_ass_header(style="reels"):
    """Return the ASS header (script info, one style, events format line).

    An unknown ``style`` falls back to the "reels" look.
    """
    primary, fontsize, bold, outline, shadow, margin_v = ASS_STYLE_PARAMS.get(
        style, ASS_STYLE_PARAMS["reels"]
    )
    # PlayResY is 1920, so MarginV maps 1:1 to pixels on a 1080x1920 video.
    # WrapStyle=0 is smart wrapping; MarginL/MarginR=80 is ~7.4% per side.
    return f"""[Script Info]
ScriptType: v4.00+
PlayResX: 1080
PlayResY: 1920
WrapStyle: 0
ScaledBorderAndShadow: yes

[V4+ Styles]
Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
Style: Default,DejaVu Sans,{fontsize},{primary},&H00FFFFFF,&H00000000,&H00000000,{bold},0,0,0,100,100,0,0,1,{outline},{shadow},2,80,80,{margin_v},1

[Events]
Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
"""


def srt_time_to_ass(t):
    """Convert an SRT timestamp to ASS form, e.g. 00:00:09,520 -> 0:00:09.52.

    Milliseconds are truncated to centiseconds.

    Raises:
        ValueError: if ``t`` is not a valid SRT timestamp.
    """
    match = _SRT_TIME_RE.fullmatch(t.strip())
    if match is None:
        raise ValueError(f"invalid SRT timestamp: {t!r}")
    h, m, s, ms = (int(part) for part in match.groups())
    return f"{h}:{m:02d}:{s:02d}.{ms // 10:02d}"


def srt_to_ass_dialogues(srt_content):
    """Convert SRT text into a list of ASS ``Dialogue:`` lines.

    Each SRT block is expected to be: index line, timecode line, one or more
    text lines. Multi-line text is joined with spaces. Blocks that are too
    short or whose timecode line cannot be parsed are skipped.
    """
    dialogues = []
    for block in _BLANK_LINE_RE.split(srt_content.strip()):
        lines = block.strip().split("\n")
        if len(lines) < 3:
            continue
        # lines[0] = index, lines[1] = timecode, lines[2:] = text
        match = _SRT_TIMECODE_RE.search(lines[1])
        if match is None:
            continue
        ass_start = srt_time_to_ass(match.group(1))
        ass_end = srt_time_to_ass(match.group(2))
        text = " ".join(line.strip() for line in lines[2:])
        dialogues.append(
            f"Dialogue: 0,{ass_start},{ass_end},Default,,0,0,0,,{text}"
        )
    return dialogues


def burn_subtitles(video, srt, output, style="reels"):
    """Burn the subtitles from ``srt`` into ``video`` and write ``output``.

    The SRT is first converted to a temporary ASS file with an explicit
    style, because FFmpeg's ``force_style`` is unreliable and often silently
    ignored. The temporary ASS file is removed once FFmpeg has finished.

    Args:
        video: Path of the input video.
        srt: Path of the SRT file to burn in.
        output: Path of the video file to create (overwritten if present).
        style: One of the SUBTITLE_STYLES keys; unknown values render as
            "reels".

    Exits the process with status 1 when FFmpeg reports an error.
    """
    # utf-8-sig also accepts files that start with a byte-order mark.
    with open(srt, "r", encoding="utf-8-sig") as f:
        srt_content = f.read()
    dialogue_lines = srt_to_ass_dialogues(srt_content)

    ass_file = tempfile.NamedTemporaryFile(
        suffix=".ass", delete=False, mode="w", encoding="utf-8"
    )
    ass_path = ass_file.name
    try:
        with ass_file:
            ass_file.write(build_ass_header(style))
            ass_file.write("\n".join(dialogue_lines))
            ass_file.write("\n")
        print(f"📝 ASS: {ass_path} ({len(dialogue_lines)} dialogov)")

        # Burn in with the `ass` filter. The path is escaped for the
        # filtergraph syntax.
        ass_escaped = ass_path.replace("\\", "\\\\").replace(":", "\\:").replace("'", r"\'")
        # setsar=1 sets the sample aspect ratio to 1 in the final output.
        vf = f"ass='{ass_escaped}',setsar=1"

        cmd = [
            "ffmpeg", "-y",
            "-i", str(video),
            "-vf", vf,
            "-c:v", "libx264", "-preset", "medium", "-crf", "21",
            "-pix_fmt", "yuv420p",  # web/mobile compat (Instagram/FB/web players)
            "-c:a", "copy",
            "-movflags", "+faststart",
            str(output),
        ]
        print("🔥 Burn-in podnapisov...")
        result = subprocess.run(cmd, capture_output=True, text=True)
    finally:
        try:
            os.unlink(ass_path)
        except OSError:
            pass

    if result.returncode != 0:
        print("❌ FFmpeg napaka:", file=sys.stderr)
        print(result.stderr[-2000:], file=sys.stderr)
        sys.exit(1)
    print(f"✅ {output}")


def main():
    """Command-line entry point: transcribe (or load an SRT) and burn it in."""
    ap = argparse.ArgumentParser()
    ap.add_argument("input")
    ap.add_argument("output")
    ap.add_argument("--lang", default=None, help="Jezik (sl, de, en, ...) ali auto")
    ap.add_argument("--model", default="large-v3",
                    choices=["tiny", "base", "small", "medium", "large-v3"])
    ap.add_argument("--style", default="reels", choices=list(SUBTITLE_STYLES.keys()))
    ap.add_argument("--keep-srt", action="store_true", help="Ohrani .srt poleg output")
    ap.add_argument("--srt", default=None,
                    help="Že-pripravljen SRT (preskoči Whisper transkripcijo)")
    args = ap.parse_args()

    src = Path(args.input)
    if not src.exists():
        print(f"❌ {src} ne obstaja", file=sys.stderr)
        sys.exit(1)

    srt_was_provided = bool(args.srt) and Path(args.srt).exists()
    if srt_was_provided:
        print(f"📄 Uporabljam že-pripravljen SRT: {args.srt}")
        srt = args.srt
    else:
        if args.srt:
            # Keep the fallback to transcription, but do not do it silently.
            print(f"⚠️ SRT {args.srt} ne obstaja, uporabljam Whisper transkripcijo",
                  file=sys.stderr)
        srt = transcribe(src, lang=args.lang, model_size=args.model)

    try:
        burn_subtitles(src, srt, args.output, style=args.style)
    finally:
        # Runs on failure too (burn_subtitles exits via SystemExit), so the
        # generated SRT is either kept as requested or removed, never leaked.
        if not srt_was_provided:
            if args.keep_srt:
                keep_path = Path(args.output).with_suffix(".srt")
                # shutil.move also works when the temp directory and the
                # output are on different filesystems, where os.rename fails.
                shutil.move(str(srt), str(keep_path))
                print(f"💾 SRT shranjen: {keep_path}")
            else:
                try:
                    os.unlink(srt)
                except OSError:
                    pass


if __name__ == "__main__":
    main()