Major: Claude post-processing of Whisper transcript
- Claude now corrects transcription errors (Slavic languages, dialects, mixed languages)
- Returns corrected_segments with the same timestamps but cleaner text
- Pipeline generates an SRT from the Claude-corrected transcript and passes it to subtitle.py via --srt
- subtitle.py supports --srt to skip Whisper re-transcription on the trimmed clip
- clip.py propagates --srt through to subtitle.py
- Whisper still runs once (in analyze.py); subtitle.py reuses the corrected output instead of re-running it
- This means Whisper's mistakes (mixed languages, hallucinations, wrong words) are fixed by Claude before they become visible subtitles
This commit is contained in:
parent
4e123bdabc
commit
4bc5ac6756
91
app/main.py
91
app/main.py
@ -111,6 +111,75 @@ def list_jobs():
|
||||
return out
|
||||
|
||||
|
||||
def generate_srt_from_segments(segments, clip_start, clip_end, output_path):
    """Write an SRT file for the transcript segments that fall inside a clip.

    Only segments overlapping ``[clip_start, clip_end]`` are emitted. Each one
    is clipped to that window and its timestamps are shifted so the clip
    starts at 0 (matching the trimmed video). Segments longer than 2.5 s are
    split into equal-length time slices with the words spread evenly across
    them, for fast reels-style pacing. All caption text is upper-cased.

    Args:
        segments: Iterable of mappings with ``"start"``, ``"end"`` (seconds,
            anything ``float()`` accepts) and ``"text"`` keys.
        clip_start: Clip start in seconds, on the same timeline as the segments.
        clip_end: Clip end in seconds, on the same timeline as the segments.
        output_path: Destination path of the SRT file (written as UTF-8).

    Returns:
        ``output_path``, unchanged.

    Raises:
        KeyError, TypeError, ValueError: If a segment lacks a required key or
            its times are not numeric.
        OSError: If the output file cannot be written.
    """
    MAX_CHUNK_DURATION = 2.5

    def fmt_ts(seconds):
        # Convert to whole milliseconds first so rounding carries into the
        # seconds/minutes/hours fields. Formatting the seconds part on its own
        # would turn 59.9996 into the invalid "00:00:60,000".
        total_ms = int(round(seconds * 1000))
        hours, rest = divmod(total_ms, 3_600_000)
        minutes, rest = divmod(rest, 60_000)
        secs, millis = divmod(rest, 1000)
        return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"

    lines = []
    idx = 1

    for seg in segments:
        s_start = float(seg["start"])
        s_end = float(seg["end"])
        text = str(seg["text"]).strip()

        # Skip segments entirely outside the clip window.
        if s_end <= clip_start or s_start >= clip_end:
            continue
        # Clip the segment to the window.
        s_start = max(s_start, clip_start)
        s_end = min(s_end, clip_end)
        # Drop slivers too short to be readable.
        if s_end - s_start < 0.2:
            continue
        if not text:
            continue

        # Re-map onto the 0-based timeline of the trimmed clip.
        rel_start = s_start - clip_start
        rel_end = s_end - clip_start
        text_upper = text.upper()
        duration = rel_end - rel_start

        if duration <= MAX_CHUNK_DURATION:
            lines.append(f"{idx}\n{fmt_ts(rel_start)} --> {fmt_ts(rel_end)}\n{text_upper}\n")
            idx += 1
            continue

        # Too long: split into equal time slices. Word-level timing is not
        # used here; the words are simply distributed across the slices.
        words = text_upper.split()
        # Never create more slices than words, otherwise a slice would have
        # no text of its own and the full caption would have to be repeated.
        n_parts = min(int(duration / MAX_CHUNK_DURATION) + 1, len(words))
        chunk_dur = duration / n_parts
        # Spread the remainder one word at a time over the first slices
        # instead of piling it all onto the last one.
        base, extra = divmod(len(words), n_parts)
        wstart = 0
        for i in range(n_parts):
            wend = wstart + base + (1 if i < extra else 0)
            cs = rel_start + i * chunk_dur
            ce = rel_start + (i + 1) * chunk_dur
            chunk_text = " ".join(words[wstart:wend])
            lines.append(f"{idx}\n{fmt_ts(cs)} --> {fmt_ts(ce)}\n{chunk_text}\n")
            idx += 1
            wstart = wend

    with open(output_path, "w", encoding="utf-8") as f:
        # Each entry already ends with a newline; joining with "\n" yields the
        # blank separator line between SRT entries.
        f.write("\n".join(lines))
    return output_path
|
||||
|
||||
|
||||
# ────────────────────────────────────────────────────────────────
|
||||
# Pipeline runner (background task)
|
||||
# ────────────────────────────────────────────────────────────────
|
||||
@ -174,12 +243,29 @@ def process_job(job_id):
|
||||
cmd += ["--model", job.get("whisper_model", "small")]
|
||||
|
||||
proc = subprocess.run(cmd, capture_output=True, text=True)
|
||||
srt_from_claude = None # Pot do SRT iz Claude-popravljenega transcript-a
|
||||
if proc.returncode == 0 and analysis_path.exists():
|
||||
try:
|
||||
with open(analysis_path, "r", encoding="utf-8") as f:
|
||||
analysis = json.load(f)
|
||||
cr = analysis["clip_range"]
|
||||
fade = analysis["fade"]
|
||||
|
||||
# Generiraj SRT iz transcript-a TRIM-ANEGA na clip_range
|
||||
# (Claude je morda popravil besedilo — uporabi popravljeno)
|
||||
if analysis.get("transcript", {}).get("segments"):
|
||||
srt_path_out = OUTPUT_DIR / f"{job_id}.subtitles.srt"
|
||||
try:
|
||||
generate_srt_from_segments(
|
||||
analysis["transcript"]["segments"],
|
||||
cr["start"], cr["end"],
|
||||
srt_path_out,
|
||||
)
|
||||
srt_from_claude = str(srt_path_out)
|
||||
print(f"📝 Generated SRT from Claude transcript: {srt_path_out}")
|
||||
except Exception as e:
|
||||
print(f"⚠️ SRT generation failed: {e}")
|
||||
|
||||
update_job(
|
||||
job_id,
|
||||
analysis_summary={
|
||||
@ -193,6 +279,7 @@ def process_job(job_id):
|
||||
"video_duration": analysis.get("video_duration"),
|
||||
"candidates": analysis["chorus"].get("all_candidates", [])[:5]
|
||||
if analysis.get("chorus") else [],
|
||||
"claude_corrected_text": analysis.get("claude_corrected_text", False),
|
||||
},
|
||||
# Cel transkript shranimo za UI prikaz
|
||||
full_transcript=[
|
||||
@ -205,6 +292,7 @@ def process_job(job_id):
|
||||
fade_out=fade["fade_out"],
|
||||
detected_language=analysis["language"],
|
||||
is_instrumental=analysis["instrumental"],
|
||||
claude_srt_path=srt_from_claude,
|
||||
)
|
||||
# Auto-disable subs za instrumental
|
||||
if analysis["instrumental"] and not job.get("no_subs"):
|
||||
@ -235,6 +323,9 @@ def process_job(job_id):
|
||||
cmd += ["--fade-in", str(job["fade_in"])]
|
||||
if job.get("fade_out", 0) > 0:
|
||||
cmd += ["--fade-out", str(job["fade_out"])]
|
||||
# SRT iz Claude (boljše besedilo) — preda direktno v subtitle.py
|
||||
if job.get("claude_srt_path") and Path(job["claude_srt_path"]).exists() and not job.get("no_subs"):
|
||||
cmd += ["--srt", job["claude_srt_path"]]
|
||||
# lang: prefer detected_language če auto
|
||||
chosen_lang = job.get("lang")
|
||||
if chosen_lang in (None, "auto", ""):
|
||||
|
||||
@ -464,21 +464,31 @@ def analyze_with_claude(transcript, video_duration, target_duration=30):
|
||||
lines.append(f"[{start:6.1f}-{end:6.1f}] {text}")
|
||||
transcript_text = "\n".join(lines)
|
||||
|
||||
prompt = f"""Tu je transcript pesmi (timestamp v sekundah, besedilo):
|
||||
prompt = f"""Tu je transcript pesmi iz Whisper modela (timestamp v sekundah, besedilo):
|
||||
|
||||
{transcript_text}
|
||||
|
||||
Cela pesem traja {video_duration:.1f}s. Cilj: izrezati ~{target_duration}s odsek za TikTok/Instagram Reel.
|
||||
|
||||
POMEMBNO: Whisper je avtomatski STT in pogosto naredi napake, posebej pri:
|
||||
- slovanskih jezikih (slovenščina, hrvaščina, bosanščina, srbščina)
|
||||
- narečnih izrazih
|
||||
- ko glasba prevladuje nad vokalom
|
||||
|
||||
PROSIM:
|
||||
1. Preberi celoten tekst in razumi strukturo (intro / verz / pre-chorus / refren / bridge / outro)
|
||||
2. Prepoznaj REFREN: del besedila, ki se ponavlja v pesmi (običajno 2-3x z istim ali zelo podobnim besedilom)
|
||||
3. Izberi najboljši odsek za reel:
|
||||
2. POPRAVI očitne napake v transkripciji:
|
||||
- Če pesem ima refren ki se ponavlja, vse pojavitve refrena POPRAVI da imajo ENAKO besedilo (uporabi najjasnejšo varianto)
|
||||
- Popravi napačne besede ki nimajo smisla v kontekstu
|
||||
- Popravi pomešane jezike (če pesem je slovenska, vse vrstice naj bodo v slovenščini)
|
||||
- Ohrani timestamp-e nepriremenjene
|
||||
3. Prepoznaj REFREN: del besedila, ki se ponavlja v pesmi
|
||||
4. Izberi najboljši odsek za reel:
|
||||
- Vključi cel refren (cel verz besedila brez prekinitve)
|
||||
- Če imaš prostor, dodaj pre-chorus build-up tik pred refrenom
|
||||
- Lahko traja 20-45 sekund (ne strogo 30s)
|
||||
- Začni in končaj na smiselni meji (konec stavka, ne sredi besede)
|
||||
4. Če pesem nima jasnega refrena (instrumental, monolog, govor), izberi najbolj dramatičen ali zaključen del
|
||||
5. Če pesem nima jasnega refrena (instrumental, monolog, govor), izberi najbolj dramatičen ali zaključen del
|
||||
|
||||
Odgovori SAMO v JSON formatu (brez markdown, brez razlage):
|
||||
{{
|
||||
@ -486,15 +496,21 @@ Odgovori SAMO v JSON formatu (brez markdown, brez razlage):
|
||||
"end": <sekunde>,
|
||||
"reason": "<kratka razlaga zakaj ta odsek>",
|
||||
"chorus_text": "<besedilo refrena ali ključni del>",
|
||||
"structure": "<1 stavek o strukturi pesmi>"
|
||||
}}"""
|
||||
"structure": "<1 stavek o strukturi pesmi>",
|
||||
"language": "<jezik: sl/de/hr/bs/sr/en/it/es/fr>",
|
||||
"corrected_segments": [
|
||||
{{"start": <s>, "end": <s>, "text": "<popravljeno besedilo>"}}
|
||||
]
|
||||
}}
|
||||
|
||||
V "corrected_segments" vključi VSE segmente iz inputa s popravljenim besedilom (ohrani timestamp-e)."""
|
||||
|
||||
try:
|
||||
import urllib.request
|
||||
import urllib.error
|
||||
body = json.dumps({
|
||||
"model": "claude-haiku-4-5-20251001",
|
||||
"max_tokens": 1024,
|
||||
"max_tokens": 4096,
|
||||
"messages": [{"role": "user", "content": prompt}],
|
||||
}).encode("utf-8")
|
||||
|
||||
@ -533,6 +549,9 @@ Odgovori SAMO v JSON formatu (brez markdown, brez razlage):
|
||||
print(f" 🤖 Claude izbral: {start:.1f}-{end:.1f}s", file=sys.stderr)
|
||||
print(f" Razlog: {result.get('reason', '')[:80]}", file=sys.stderr)
|
||||
print(f" Struktura: {result.get('structure', '')[:80]}", file=sys.stderr)
|
||||
cs = result.get("corrected_segments")
|
||||
if cs:
|
||||
print(f" Popravljeni segmenti: {len(cs)}", file=sys.stderr)
|
||||
|
||||
return {
|
||||
"start": round(start, 2),
|
||||
@ -541,6 +560,8 @@ Odgovori SAMO v JSON formatu (brez markdown, brez razlage):
|
||||
"reason": result.get("reason", ""),
|
||||
"chorus_text": result.get("chorus_text", ""),
|
||||
"structure": result.get("structure", ""),
|
||||
"language": result.get("language"),
|
||||
"corrected_segments": result.get("corrected_segments"),
|
||||
"source": "claude_llm",
|
||||
}
|
||||
except urllib.error.HTTPError as e:
|
||||
@ -676,6 +697,45 @@ def main():
|
||||
f"(duration: {clip_range['duration']}s, source: {clip_range.get('source')})",
|
||||
file=sys.stderr)
|
||||
|
||||
# Če Claude je vrnil popravljene segmente, jih uporabi (boljši za podnapise)
|
||||
if claude_result and claude_result.get("corrected_segments"):
|
||||
corrected = claude_result["corrected_segments"]
|
||||
# Ohrani word-level timing iz originala, posodobi samo text
|
||||
orig_by_start = {round(s["start"], 1): s for s in transcript["segments"]}
|
||||
new_segments = []
|
||||
for cs in corrected:
|
||||
try:
|
||||
cs_start = float(cs["start"])
|
||||
cs_end = float(cs["end"])
|
||||
cs_text = str(cs["text"]).strip()
|
||||
except (KeyError, ValueError, TypeError):
|
||||
continue
|
||||
# Najdi originalni segment z istim start (ali blizu) za word-level timing
|
||||
orig = orig_by_start.get(round(cs_start, 1))
|
||||
if not orig:
|
||||
# Najdi najbližji
|
||||
closest_diff = 999
|
||||
for s in transcript["segments"]:
|
||||
diff = abs(s["start"] - cs_start)
|
||||
if diff < closest_diff and diff < 1.0:
|
||||
closest_diff = diff
|
||||
orig = s
|
||||
new_segments.append({
|
||||
"start": cs_start,
|
||||
"end": cs_end,
|
||||
"text": cs_text,
|
||||
# Word-level timing ne moremo posodabljati ker Claude ne vrača besede,
|
||||
# ampak ohranimo če imamo
|
||||
"words": orig.get("words", []) if orig else [],
|
||||
})
|
||||
transcript["segments"] = new_segments
|
||||
transcript["claude_corrected"] = True
|
||||
# Posodobi tudi jezik če Claude je drugačnega mnenja
|
||||
if claude_result.get("language") and claude_result["language"] != transcript["language"]:
|
||||
print(f" ✏️ Claude je popravil jezik: {transcript['language']} → {claude_result['language']}", file=sys.stderr)
|
||||
transcript["language"] = claude_result["language"]
|
||||
print(f" ✏️ Whisper segmenti zamenjani s Claude popravljenimi ({len(new_segments)})", file=sys.stderr)
|
||||
|
||||
# 7. Fade params (lahko razširi clip end če konča sredi vokala)
|
||||
fade = detect_audio_fade(clip_range, transcript, video_duration=duration)
|
||||
print(f"🎚 Fade: in={fade['fade_in']}s, out={fade['fade_out']}s", file=sys.stderr)
|
||||
@ -699,6 +759,8 @@ def main():
|
||||
"chorus": chorus,
|
||||
"clip_range": clip_range,
|
||||
"fade": fade,
|
||||
"claude_used": claude_result is not None,
|
||||
"claude_corrected_text": bool(claude_result and claude_result.get("corrected_segments")),
|
||||
}
|
||||
|
||||
if args.output:
|
||||
|
||||
@ -46,10 +46,11 @@ SCRIPT_DIR = Path(__file__).parent
|
||||
|
||||
|
||||
def run_clip(src, dst, start, duration, mode, lang, model, style, no_subs, quality,
|
||||
fade_in=0.0, fade_out=0.0):
|
||||
fade_in=0.0, fade_out=0.0, srt_path=None):
|
||||
"""Naredi en klip src → dst."""
|
||||
print(f"🎯 run_clip args: src={src}, dst={dst}, start={start!r}, duration={duration!r}, "
|
||||
f"mode={mode}, fade_in={fade_in}, fade_out={fade_out}", file=sys.stderr)
|
||||
f"mode={mode}, fade_in={fade_in}, fade_out={fade_out}, "
|
||||
f"srt={'yes' if srt_path else 'no'}", file=sys.stderr)
|
||||
tmp = tempfile.mkdtemp(prefix="reel_")
|
||||
try:
|
||||
reframed = Path(tmp) / "reframed.mp4"
|
||||
@ -88,6 +89,8 @@ def run_clip(src, dst, start, duration, mode, lang, model, style, no_subs, quali
|
||||
]
|
||||
if lang:
|
||||
cmd += ["--lang", lang]
|
||||
if srt_path:
|
||||
cmd += ["--srt", str(srt_path)]
|
||||
r = subprocess.run(cmd)
|
||||
if r.returncode != 0:
|
||||
print(f"❌ Subtitle napaka — shranim brez", file=sys.stderr)
|
||||
@ -114,6 +117,7 @@ def main():
|
||||
ap.add_argument("--style", default="reels", choices=["reels", "yellow", "minimal"])
|
||||
ap.add_argument("--no-subs", action="store_true")
|
||||
ap.add_argument("--quality", default="medium", choices=["fast", "medium", "high"])
|
||||
ap.add_argument("--srt", default=None, help="Že-pripravljen SRT (preskoči Whisper)")
|
||||
args = ap.parse_args()
|
||||
|
||||
src = Path(args.input)
|
||||
@ -136,7 +140,8 @@ def main():
|
||||
start = parse_ts(args.start) if args.start else None
|
||||
run_clip(src, Path(args.output), start, args.duration, args.mode,
|
||||
args.lang, args.model, args.style, args.no_subs, args.quality,
|
||||
fade_in=args.fade_in, fade_out=args.fade_out)
|
||||
fade_in=args.fade_in, fade_out=args.fade_out,
|
||||
srt_path=args.srt)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@ -282,6 +282,7 @@ def main():
|
||||
ap.add_argument("--model", default="small", choices=["tiny", "base", "small", "medium", "large-v3"])
|
||||
ap.add_argument("--style", default="reels", choices=list(SUBTITLE_STYLES.keys()))
|
||||
ap.add_argument("--keep-srt", action="store_true", help="Ohrani .srt poleg output")
|
||||
ap.add_argument("--srt", default=None, help="Že-pripravljen SRT (preskoči Whisper transkripcijo)")
|
||||
args = ap.parse_args()
|
||||
|
||||
src = Path(args.input)
|
||||
@ -289,14 +290,21 @@ def main():
|
||||
print(f"❌ {src} ne obstaja", file=sys.stderr)
|
||||
sys.exit(1)
|
||||
|
||||
srt = transcribe(src, lang=args.lang, model_size=args.model)
|
||||
if args.srt and Path(args.srt).exists():
|
||||
print(f"📄 Uporabljam že-pripravljen SRT: {args.srt}")
|
||||
srt = args.srt
|
||||
srt_was_provided = True
|
||||
else:
|
||||
srt = transcribe(src, lang=args.lang, model_size=args.model)
|
||||
srt_was_provided = False
|
||||
|
||||
burn_subtitles(src, srt, args.output, style=args.style)
|
||||
|
||||
if args.keep_srt:
|
||||
if args.keep_srt and not srt_was_provided:
|
||||
keep_path = Path(args.output).with_suffix(".srt")
|
||||
os.rename(srt, keep_path)
|
||||
print(f"💾 SRT shranjen: {keep_path}")
|
||||
else:
|
||||
elif not srt_was_provided:
|
||||
os.unlink(srt)
|
||||
|
||||
|
||||
|
||||
Loading…
Reference in New Issue
Block a user