Major: Claude post-processing of Whisper transcript

- Claude now corrects transcription errors (Slavic languages, dialects, mixed languages)
- Returns corrected_segments with same timestamps but cleaner text
- Pipeline generates SRT from Claude-corrected transcript and passes to subtitle.py via --srt
- subtitle.py supports --srt to skip Whisper re-transcription on the trimmed clip
- clip.py propagates --srt through to subtitle.py
- Whisper still runs once (in analyze.py); subtitle.py reuses corrected output instead of re-running
- This means: Whisper's mistakes (mixed languages, hallucinations, wrong words) are fixed by Claude before they become visible subtitles
This commit is contained in:
Sebastjan Artič 2026-04-29 08:13:33 +00:00
parent 4e123bdabc
commit 4bc5ac6756
4 changed files with 179 additions and 13 deletions

View File

@ -111,6 +111,75 @@ def list_jobs():
return out
def generate_srt_from_segments(segments, clip_start, clip_end, output_path):
    """Write an SRT file for the transcript segments that overlap a clip.

    Segments are clamped to ``[clip_start, clip_end]`` and their timestamps
    are re-based so that the clip starts at 0 (matching the trimmed video).
    Segments longer than ``MAX_CHUNK_DURATION`` seconds are split into
    equal-duration cues for fast, reels-style pacing, with the words spread
    evenly across those cues. All cue text is upper-cased.

    Args:
        segments: Iterable of mappings with ``"start"`` and ``"end"``
            (seconds; anything ``float()`` accepts) and ``"text"`` keys.
        clip_start: Clip start in seconds on the source timeline.
        clip_end: Clip end in seconds on the source timeline.
        output_path: Destination path of the SRT file (written as UTF-8).

    Returns:
        ``output_path``, unchanged.

    Raises:
        KeyError, ValueError, TypeError: If a segment lacks a required key
            or its ``start``/``end`` cannot be converted to ``float``.
        OSError: If the output file cannot be written.
    """
    MAX_CHUNK_DURATION = 2.5

    def fmt_ts(s):
        # Round to whole milliseconds first so the carry propagates into
        # minutes/hours; formatting the seconds on their own can yield an
        # invalid "60,000" seconds field (e.g. for 59.9996 s).
        total_ms = int(round(s * 1000))
        h, rem = divmod(total_ms, 3600 * 1000)
        m, rem = divmod(rem, 60 * 1000)
        sec, ms = divmod(rem, 1000)
        return f"{h:02d}:{m:02d}:{sec:02d},{ms:03d}"

    lines = []
    idx = 1
    for seg in segments:
        s_start = float(seg["start"])
        s_end = float(seg["end"])
        text = str(seg["text"]).strip()

        # Skip segments entirely outside the clip range.
        if s_end <= clip_start or s_start >= clip_end:
            continue

        # Clamp to the clip and drop slivers too short to be readable.
        s_start = max(s_start, clip_start)
        s_end = min(s_end, clip_end)
        if s_end - s_start < 0.2:
            continue
        if not text:
            continue

        # Re-base onto the trimmed clip's 0-based timeline.
        rel_start = s_start - clip_start
        rel_end = s_end - clip_start
        text_upper = text.upper()
        duration = rel_end - rel_start

        if duration <= MAX_CHUNK_DURATION:
            lines.append(
                f"{idx}\n{fmt_ts(rel_start)} --> {fmt_ts(rel_end)}\n{text_upper}\n"
            )
            idx += 1
            continue

        # Too long for one cue: split into equal-duration parts. No word-level
        # timing is used here, so words are simply distributed evenly.
        words = text_upper.split()
        # Never create more cues than there are words; otherwise the surplus
        # cues would have nothing of their own to show.
        n_parts = min(int(duration / MAX_CHUNK_DURATION) + 1, len(words))
        base, extra = divmod(len(words), n_parts)
        chunk_dur = duration / n_parts
        wpos = 0
        for i in range(n_parts):
            # The first `extra` cues take one additional word each so the
            # remainder is not piled onto the final cue.
            take = base + (1 if i < extra else 0)
            chunk_text = " ".join(words[wpos:wpos + take])
            wpos += take
            cs = rel_start + i * chunk_dur
            ce = rel_start + (i + 1) * chunk_dur
            lines.append(f"{idx}\n{fmt_ts(cs)} --> {fmt_ts(ce)}\n{chunk_text}\n")
            idx += 1

    # Each entry already ends with a newline; joining with another newline
    # yields the blank separator line SRT expects between cues.
    with open(output_path, "w", encoding="utf-8") as f:
        f.write("\n".join(lines))
    return output_path
# ────────────────────────────────────────────────────────────────
# Pipeline runner (background task)
# ────────────────────────────────────────────────────────────────
@ -174,12 +243,29 @@ def process_job(job_id):
cmd += ["--model", job.get("whisper_model", "small")]
proc = subprocess.run(cmd, capture_output=True, text=True)
srt_from_claude = None # Pot do SRT iz Claude-popravljenega transcript-a
if proc.returncode == 0 and analysis_path.exists():
try:
with open(analysis_path, "r", encoding="utf-8") as f:
analysis = json.load(f)
cr = analysis["clip_range"]
fade = analysis["fade"]
# Generiraj SRT iz transcript-a TRIM-ANEGA na clip_range
# (Claude je morda popravil besedilo — uporabi popravljeno)
if analysis.get("transcript", {}).get("segments"):
srt_path_out = OUTPUT_DIR / f"{job_id}.subtitles.srt"
try:
generate_srt_from_segments(
analysis["transcript"]["segments"],
cr["start"], cr["end"],
srt_path_out,
)
srt_from_claude = str(srt_path_out)
print(f"📝 Generated SRT from Claude transcript: {srt_path_out}")
except Exception as e:
print(f"⚠️ SRT generation failed: {e}")
update_job(
job_id,
analysis_summary={
@ -193,6 +279,7 @@ def process_job(job_id):
"video_duration": analysis.get("video_duration"),
"candidates": analysis["chorus"].get("all_candidates", [])[:5]
if analysis.get("chorus") else [],
"claude_corrected_text": analysis.get("claude_corrected_text", False),
},
# Cel transkript shranimo za UI prikaz
full_transcript=[
@ -205,6 +292,7 @@ def process_job(job_id):
fade_out=fade["fade_out"],
detected_language=analysis["language"],
is_instrumental=analysis["instrumental"],
claude_srt_path=srt_from_claude,
)
# Auto-disable subs za instrumental
if analysis["instrumental"] and not job.get("no_subs"):
@ -235,6 +323,9 @@ def process_job(job_id):
cmd += ["--fade-in", str(job["fade_in"])]
if job.get("fade_out", 0) > 0:
cmd += ["--fade-out", str(job["fade_out"])]
# SRT iz Claude (boljše besedilo) — preda direktno v subtitle.py
if job.get("claude_srt_path") and Path(job["claude_srt_path"]).exists() and not job.get("no_subs"):
cmd += ["--srt", job["claude_srt_path"]]
# lang: prefer detected_language če auto
chosen_lang = job.get("lang")
if chosen_lang in (None, "auto", ""):

View File

@ -464,21 +464,31 @@ def analyze_with_claude(transcript, video_duration, target_duration=30):
lines.append(f"[{start:6.1f}-{end:6.1f}] {text}")
transcript_text = "\n".join(lines)
prompt = f"""Tu je transcript pesmi (timestamp v sekundah, besedilo):
prompt = f"""Tu je transcript pesmi iz Whisper modela (timestamp v sekundah, besedilo):
{transcript_text}
Cela pesem traja {video_duration:.1f}s. Cilj: izrezati ~{target_duration}s odsek za TikTok/Instagram Reel.
POMEMBNO: Whisper je avtomatski STT in pogosto naredi napake, posebej pri:
- slovanskih jezikih (slovenščina, hrvaščina, bosanščina, srbščina)
- narečnih izrazih
- ko glasba prevladuje nad vokalom
PROSIM:
1. Preberi celoten tekst in razumi strukturo (intro / verz / pre-chorus / refren / bridge / outro)
2. Prepoznaj REFREN: del besedila, ki se ponavlja v pesmi (običajno 2-3x z istim ali zelo podobnim besedilom)
3. Izberi najboljši odsek za reel:
2. POPRAVI očitne napake v transkripciji:
- Če pesem ima refren ki se ponavlja, vse pojavitve refrena POPRAVI da imajo ENAKO besedilo (uporabi najjasnejšo varianto)
- Popravi napačne besede ki nimajo smisla v kontekstu
- Popravi pomešane jezike (če pesem je slovenska, vse vrstice naj bodo v slovenščini)
- Ohrani timestamp-e nepriremenjene
3. Prepoznaj REFREN: del besedila, ki se ponavlja v pesmi
4. Izberi najboljši odsek za reel:
- Vključi cel refren (cel verz besedila brez prekinitve)
- Če imaš prostor, dodaj pre-chorus build-up tik pred refrenom
- Lahko traja 20-45 sekund (ne strogo 30s)
- Začni in končaj na smiselni meji (konec stavka, ne sredi besede)
4. Če pesem nima jasnega refrena (instrumental, monolog, govor), izberi najbolj dramatičen ali zaključen del
5. Če pesem nima jasnega refrena (instrumental, monolog, govor), izberi najbolj dramatičen ali zaključen del
Odgovori SAMO v JSON formatu (brez markdown, brez razlage):
{{
@ -486,15 +496,21 @@ Odgovori SAMO v JSON formatu (brez markdown, brez razlage):
"end": <sekunde>,
"reason": "<kratka razlaga zakaj ta odsek>",
"chorus_text": "<besedilo refrena ali ključni del>",
"structure": "<1 stavek o strukturi pesmi>"
}}"""
"structure": "<1 stavek o strukturi pesmi>",
"language": "<jezik: sl/de/hr/bs/sr/en/it/es/fr>",
"corrected_segments": [
{{"start": <s>, "end": <s>, "text": "<popravljeno besedilo>"}}
]
}}
V "corrected_segments" vključi VSE segmente iz inputa s popravljenim besedilom (ohrani timestamp-e)."""
try:
import urllib.request
import urllib.error
body = json.dumps({
"model": "claude-haiku-4-5-20251001",
"max_tokens": 1024,
"max_tokens": 4096,
"messages": [{"role": "user", "content": prompt}],
}).encode("utf-8")
@ -533,6 +549,9 @@ Odgovori SAMO v JSON formatu (brez markdown, brez razlage):
print(f" 🤖 Claude izbral: {start:.1f}-{end:.1f}s", file=sys.stderr)
print(f" Razlog: {result.get('reason', '')[:80]}", file=sys.stderr)
print(f" Struktura: {result.get('structure', '')[:80]}", file=sys.stderr)
cs = result.get("corrected_segments")
if cs:
print(f" Popravljeni segmenti: {len(cs)}", file=sys.stderr)
return {
"start": round(start, 2),
@ -541,6 +560,8 @@ Odgovori SAMO v JSON formatu (brez markdown, brez razlage):
"reason": result.get("reason", ""),
"chorus_text": result.get("chorus_text", ""),
"structure": result.get("structure", ""),
"language": result.get("language"),
"corrected_segments": result.get("corrected_segments"),
"source": "claude_llm",
}
except urllib.error.HTTPError as e:
@ -676,6 +697,45 @@ def main():
f"(duration: {clip_range['duration']}s, source: {clip_range.get('source')})",
file=sys.stderr)
# Če Claude je vrnil popravljene segmente, jih uporabi (boljši za podnapise)
if claude_result and claude_result.get("corrected_segments"):
corrected = claude_result["corrected_segments"]
# Ohrani word-level timing iz originala, posodobi samo text
orig_by_start = {round(s["start"], 1): s for s in transcript["segments"]}
new_segments = []
for cs in corrected:
try:
cs_start = float(cs["start"])
cs_end = float(cs["end"])
cs_text = str(cs["text"]).strip()
except (KeyError, ValueError, TypeError):
continue
# Najdi originalni segment z istim start (ali blizu) za word-level timing
orig = orig_by_start.get(round(cs_start, 1))
if not orig:
# Najdi najbližji
closest_diff = 999
for s in transcript["segments"]:
diff = abs(s["start"] - cs_start)
if diff < closest_diff and diff < 1.0:
closest_diff = diff
orig = s
new_segments.append({
"start": cs_start,
"end": cs_end,
"text": cs_text,
# Word-level timing ne moremo posodabljati ker Claude ne vrača besede,
# ampak ohranimo če imamo
"words": orig.get("words", []) if orig else [],
})
transcript["segments"] = new_segments
transcript["claude_corrected"] = True
# Posodobi tudi jezik če Claude je drugačnega mnenja
if claude_result.get("language") and claude_result["language"] != transcript["language"]:
print(f" ✏️ Claude je popravil jezik: {transcript['language']}{claude_result['language']}", file=sys.stderr)
transcript["language"] = claude_result["language"]
print(f" ✏️ Whisper segmenti zamenjani s Claude popravljenimi ({len(new_segments)})", file=sys.stderr)
# 7. Fade params (lahko razširi clip end če konča sredi vokala)
fade = detect_audio_fade(clip_range, transcript, video_duration=duration)
print(f"🎚 Fade: in={fade['fade_in']}s, out={fade['fade_out']}s", file=sys.stderr)
@ -699,6 +759,8 @@ def main():
"chorus": chorus,
"clip_range": clip_range,
"fade": fade,
"claude_used": claude_result is not None,
"claude_corrected_text": bool(claude_result and claude_result.get("corrected_segments")),
}
if args.output:

View File

@ -46,10 +46,11 @@ SCRIPT_DIR = Path(__file__).parent
def run_clip(src, dst, start, duration, mode, lang, model, style, no_subs, quality,
fade_in=0.0, fade_out=0.0):
fade_in=0.0, fade_out=0.0, srt_path=None):
"""Naredi en klip src → dst."""
print(f"🎯 run_clip args: src={src}, dst={dst}, start={start!r}, duration={duration!r}, "
f"mode={mode}, fade_in={fade_in}, fade_out={fade_out}", file=sys.stderr)
f"mode={mode}, fade_in={fade_in}, fade_out={fade_out}, "
f"srt={'yes' if srt_path else 'no'}", file=sys.stderr)
tmp = tempfile.mkdtemp(prefix="reel_")
try:
reframed = Path(tmp) / "reframed.mp4"
@ -88,6 +89,8 @@ def run_clip(src, dst, start, duration, mode, lang, model, style, no_subs, quali
]
if lang:
cmd += ["--lang", lang]
if srt_path:
cmd += ["--srt", str(srt_path)]
r = subprocess.run(cmd)
if r.returncode != 0:
print(f"❌ Subtitle napaka — shranim brez", file=sys.stderr)
@ -114,6 +117,7 @@ def main():
ap.add_argument("--style", default="reels", choices=["reels", "yellow", "minimal"])
ap.add_argument("--no-subs", action="store_true")
ap.add_argument("--quality", default="medium", choices=["fast", "medium", "high"])
ap.add_argument("--srt", default=None, help="Že-pripravljen SRT (preskoči Whisper)")
args = ap.parse_args()
src = Path(args.input)
@ -136,7 +140,8 @@ def main():
start = parse_ts(args.start) if args.start else None
run_clip(src, Path(args.output), start, args.duration, args.mode,
args.lang, args.model, args.style, args.no_subs, args.quality,
fade_in=args.fade_in, fade_out=args.fade_out)
fade_in=args.fade_in, fade_out=args.fade_out,
srt_path=args.srt)
if __name__ == "__main__":

View File

@ -282,6 +282,7 @@ def main():
ap.add_argument("--model", default="small", choices=["tiny", "base", "small", "medium", "large-v3"])
ap.add_argument("--style", default="reels", choices=list(SUBTITLE_STYLES.keys()))
ap.add_argument("--keep-srt", action="store_true", help="Ohrani .srt poleg output")
ap.add_argument("--srt", default=None, help="Že-pripravljen SRT (preskoči Whisper transkripcijo)")
args = ap.parse_args()
src = Path(args.input)
@ -289,14 +290,21 @@ def main():
print(f"{src} ne obstaja", file=sys.stderr)
sys.exit(1)
srt = transcribe(src, lang=args.lang, model_size=args.model)
if args.srt and Path(args.srt).exists():
print(f"📄 Uporabljam že-pripravljen SRT: {args.srt}")
srt = args.srt
srt_was_provided = True
else:
srt = transcribe(src, lang=args.lang, model_size=args.model)
srt_was_provided = False
burn_subtitles(src, srt, args.output, style=args.style)
if args.keep_srt:
if args.keep_srt and not srt_was_provided:
keep_path = Path(args.output).with_suffix(".srt")
os.rename(srt, keep_path)
print(f"💾 SRT shranjen: {keep_path}")
else:
elif not srt_was_provided:
os.unlink(srt)