Add Claude web_search tool for lyrics lookup + tighter subtitle timing
1. Claude API web_search tool integration:
- Claude can now search the web for the actual lyrics when the STT text is wrong
- Especially useful for SLO/HR/BS/SR songs (Modrijani, Veseli Dolenjci),
where Claude doesn't know the lyrics from its training data
- Agentic loop: tool_use → server-side search → continuation → final text
- Max 3 searches per job ($0.03 cost limit)
- Hint sources: besedila.com, lyricstranslate.com, genius.com, tekstovi.net, songtexte.com
2. Tighter subtitle segmentation from Scribe word timestamps:
- Phrase boundaries on shorter pauses (0.4s vs 0.6s)
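- Sentence-ending punctuation (. ! ?) triggers a segment break when followed by a pause of at least 0.2s
- Soft max segment length of 4s (was 6s): the segment closes at the next pause of at least 0.15s, for natural, readable subtitles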
- Hard cap at 5.5s to prevent very long lines
This fixes cases such as 'ples to noč' → 'ples pojoč' in Modrijani songs, where
Scribe transcribed the words phonetically wrong but Claude can correct them via a web lookup.
This commit is contained in:
parent
68247bb84c
commit
5f90085981
@ -137,7 +137,11 @@ def transcribe_with_elevenlabs(audio_path, lang=None, model="scribe_v1"):
|
|||||||
detected_prob = data.get("language_probability", 1.0)
|
detected_prob = data.get("language_probability", 1.0)
|
||||||
|
|
||||||
# Scribe returns flat list of words (not segments)
|
# Scribe returns flat list of words (not segments)
|
||||||
# We need to group words into pseudo-segments (~10s each, breaking on long pauses)
|
# We group words into pseudo-segments using **smart phrase-aware segmentation**:
|
||||||
|
# - Close on long pause (>= 0.4s) — natural breath/phrase boundary
|
||||||
|
# - OR after sentence-ending punctuation (. ! ?)
|
||||||
|
# - OR after 4 seconds (max segment length for readable subtitle)
|
||||||
|
# This gives ~3-7 word segments matching natural sung phrases.
|
||||||
words = data.get("words", [])
|
words = data.get("words", [])
|
||||||
segments = []
|
segments = []
|
||||||
|
|
||||||
@ -152,16 +156,26 @@ def transcribe_with_elevenlabs(audio_path, lang=None, model="scribe_v1"):
|
|||||||
for i, w in enumerate(real_words):
|
for i, w in enumerate(real_words):
|
||||||
current_seg_words.append(w)
|
current_seg_words.append(w)
|
||||||
w_end = w.get("end", w.get("start", 0))
|
w_end = w.get("end", w.get("start", 0))
|
||||||
|
w_text = w.get("text", "")
|
||||||
|
|
||||||
# Decide if we should close the segment
|
|
||||||
close = False
|
close = False
|
||||||
# Close on long pause (>= 0.6s)
|
# Decide if we should close the segment
|
||||||
if i + 1 < len(real_words):
|
if i + 1 < len(real_words):
|
||||||
next_start = real_words[i + 1].get("start", w_end)
|
next_start = real_words[i + 1].get("start", w_end)
|
||||||
pause = next_start - w_end
|
pause = next_start - w_end
|
||||||
seg_duration = w_end - seg_start
|
seg_duration = w_end - seg_start
|
||||||
# Long pause OR segment is long enough (>= 4s)
|
|
||||||
if pause >= 0.6 or seg_duration >= 6.0:
|
# Trigger close on:
|
||||||
|
# 1. Long pause (>= 0.4s) = phrase boundary
|
||||||
|
# 2. Sentence-ending punctuation
|
||||||
|
# 3. Segment is long enough (>= 4s)
|
||||||
|
if pause >= 0.4:
|
||||||
|
close = True
|
||||||
|
elif seg_duration >= 4.0 and pause >= 0.15:
|
||||||
|
close = True
|
||||||
|
elif w_text.rstrip().endswith(('.', '!', '?')) and pause >= 0.2:
|
||||||
|
close = True
|
||||||
|
elif seg_duration >= 5.5: # hard cap
|
||||||
close = True
|
close = True
|
||||||
else:
|
else:
|
||||||
close = True # last word
|
close = True # last word
|
||||||
@ -631,18 +645,24 @@ def _build_analysis_prompt(transcript, video_duration, target_duration=30, filen
|
|||||||
🎵 IME DATOTEKE: "{filename_hint}"
|
🎵 IME DATOTEKE: "{filename_hint}"
|
||||||
Iz imena datoteke morda lahko prepoznaš naslov pesmi ali izvajalca. Če je tako:
|
Iz imena datoteke morda lahko prepoznaš naslov pesmi ali izvajalca. Če je tako:
|
||||||
- Uporabi svoje znanje o **dejanskem besedilu** te pesmi
|
- Uporabi svoje znanje o **dejanskem besedilu** te pesmi
|
||||||
- Če Whisper transkript ne ustreza znanemu besedilu pesmi (halucinacija), POPRAVI besedilo na **dejansko besedilo pesmi**
|
- Če Whisper transkript ne ustreza znanemu besedilu pesmi, POPRAVI besedilo na **dejansko besedilo pesmi**
|
||||||
- Ohrani timestamp-e iz Whisper-ja (časovne meje so pravilne, samo besede so napačne)
|
- Ohrani timestamp-e iz Whisper-ja (časovne meje so pravilne, samo besede so morda napačne)
|
||||||
|
|
||||||
|
🔍 ČE NE POZNAŠ PESMI (npr. slovenske narodno-zabavne, manj znane pesmi) → **UPORABI web_search tool** da poiščeš pravo besedilo!
|
||||||
|
Primeri search queryjev:
|
||||||
|
- "[ime izvajalca] [naslov pesmi] besedilo" (slovenske: Modrijani, Veseli Dolenjci, Avseniki, Čuki, Atomik Harmonik)
|
||||||
|
- "[artist] [title] lyrics" (angleške/nemške)
|
||||||
|
- Pogosto so besedila na: besedila.com, lyricstranslate.com, genius.com, tekstovi.net (HR/SR), songtexte.com (DE)
|
||||||
|
Ko najdeš pravo besedilo, uporabi to za popravljanje "corrected_segments" — **transkript bo veliko bolj točen** kot če le ugibaš.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
return f"""Tu je transcript pesmi iz Whisper modela (timestamp v sekundah, besedilo):
|
return f"""Tu je transcript pesmi iz STT modela (timestamp v sekundah, besedilo):
|
||||||
|
|
||||||
{transcript_text}
|
{transcript_text}
|
||||||
|
|
||||||
Cela pesem traja {video_duration:.1f}s. Cilj: izrezati ~{target_duration}s odsek za TikTok/Instagram Reel.{hint_block}
|
Cela pesem traja {video_duration:.1f}s. Cilj: izrezati ~{target_duration}s odsek za TikTok/Instagram Reel.{hint_block}
|
||||||
|
|
||||||
⚠️ POMEMBNO: Whisper si IZMIŠLJA besede ko ne razume jasno (HALLUCINACIJA). Posebej:
|
⚠️ POMEMBNO: STT lahko naredi napake — narečne besede, slovanski jeziki, ko glasba prevladuje:
|
||||||
- Ko glasba prevladuje nad vokalom
|
|
||||||
- Pri narečjih in slovanskih jezikih
|
- Pri narečjih in slovanskih jezikih
|
||||||
- Generira "tipičen" tekst (npr. tekst druge pesmi istega izvajalca)
|
- Generira "tipičen" tekst (npr. tekst druge pesmi istega izvajalca)
|
||||||
- Lahko vstavi besede ki se POdoBNO slišijo, ampak imajo ČISTO drug pomen
|
- Lahko vstavi besede ki se POdoBNO slišijo, ampak imajo ČISTO drug pomen
|
||||||
@ -739,44 +759,95 @@ def analyze_with_claude(transcript, video_duration, target_duration=30, model="c
|
|||||||
try:
|
try:
|
||||||
import urllib.request
|
import urllib.request
|
||||||
import urllib.error
|
import urllib.error
|
||||||
body = json.dumps({
|
|
||||||
"model": model,
|
|
||||||
# 8192 je dovolj za ~250 corrected_segments + ostali metadata pri dolgih pesmih.
|
|
||||||
# Sonnet 4.6 podpira precej več, ampak 8192 je varen default.
|
|
||||||
"max_tokens": 8192,
|
|
||||||
"messages": [{"role": "user", "content": prompt}],
|
|
||||||
}).encode("utf-8")
|
|
||||||
|
|
||||||
req = urllib.request.Request(
|
# Initial messages
|
||||||
"https://api.anthropic.com/v1/messages",
|
messages = [{"role": "user", "content": prompt}]
|
||||||
data=body,
|
|
||||||
headers={
|
|
||||||
"Content-Type": "application/json",
|
|
||||||
"x-api-key": api_key,
|
|
||||||
"anthropic-version": "2023-06-01",
|
|
||||||
},
|
|
||||||
method="POST",
|
|
||||||
)
|
|
||||||
with urllib.request.urlopen(req, timeout=120) as resp:
|
|
||||||
data = json.loads(resp.read().decode("utf-8"))
|
|
||||||
|
|
||||||
content = data.get("content", [])
|
# Sonnet 4.6 podpira web_search tool — Claude lahko poišče prave lyrics
|
||||||
if not content:
|
# za pesmi v slovenščini/hrvaščini/itd., če jih ne pozna iz training data.
|
||||||
print(" ⚠️ Claude vrnil prazen odgovor", file=sys.stderr)
|
tools = [{
|
||||||
return None
|
"type": "web_search_20250305",
|
||||||
|
"name": "web_search",
|
||||||
|
"max_uses": 3, # Maksimalno 3 search-i = $0.03/job
|
||||||
|
}]
|
||||||
|
|
||||||
# Diagnostika: če je bil response odrezan, je JSON nepopoln
|
# Agentic loop: Claude lahko kliče web_search, dobi rezultate, vrne final answer
|
||||||
stop_reason = data.get("stop_reason")
|
max_iterations = 5
|
||||||
if stop_reason == "max_tokens":
|
for iteration in range(max_iterations):
|
||||||
usage = data.get("usage", {})
|
body = json.dumps({
|
||||||
print(
|
"model": model,
|
||||||
f" ⚠️ Claude odrezan (max_tokens): "
|
"max_tokens": 8192,
|
||||||
f"input={usage.get('input_tokens')} output={usage.get('output_tokens')}",
|
"messages": messages,
|
||||||
file=sys.stderr,
|
"tools": tools,
|
||||||
|
}).encode("utf-8")
|
||||||
|
|
||||||
|
req = urllib.request.Request(
|
||||||
|
"https://api.anthropic.com/v1/messages",
|
||||||
|
data=body,
|
||||||
|
headers={
|
||||||
|
"Content-Type": "application/json",
|
||||||
|
"x-api-key": api_key,
|
||||||
|
"anthropic-version": "2023-06-01",
|
||||||
|
},
|
||||||
|
method="POST",
|
||||||
)
|
)
|
||||||
return None
|
with urllib.request.urlopen(req, timeout=180) as resp:
|
||||||
|
data = json.loads(resp.read().decode("utf-8"))
|
||||||
|
|
||||||
text = content[0].get("text", "").strip()
|
content = data.get("content", [])
|
||||||
|
if not content:
|
||||||
|
print(" ⚠️ Claude vrnil prazen odgovor", file=sys.stderr)
|
||||||
|
return None
|
||||||
|
|
||||||
|
stop_reason = data.get("stop_reason")
|
||||||
|
if stop_reason == "max_tokens":
|
||||||
|
usage = data.get("usage", {})
|
||||||
|
print(
|
||||||
|
f" ⚠️ Claude odrezan (max_tokens): "
|
||||||
|
f"input={usage.get('input_tokens')} output={usage.get('output_tokens')}",
|
||||||
|
file=sys.stderr,
|
||||||
|
)
|
||||||
|
return None
|
||||||
|
|
||||||
|
# Če je end_turn → smo končali, parsiraj text
|
||||||
|
if stop_reason in ("end_turn", "stop_sequence"):
|
||||||
|
# Najdem zadnji text block
|
||||||
|
text_blocks = [b for b in content if b.get("type") == "text"]
|
||||||
|
if text_blocks:
|
||||||
|
text = text_blocks[-1].get("text", "").strip()
|
||||||
|
break
|
||||||
|
print(" ⚠️ Claude end_turn brez text bloka", file=sys.stderr)
|
||||||
|
return None
|
||||||
|
|
||||||
|
# Če je tool_use → Claude kliče web_search; appendamo response in nadaljujemo
|
||||||
|
if stop_reason == "tool_use":
|
||||||
|
# Anthropic web_search tool je server-side — sami obdela searches in vrne web_search_tool_result
|
||||||
|
# Ampak v API odgovoru so OBA: tool_use IN web_search_tool_result kot del content
|
||||||
|
# Torej končni text že obstaja v naslednji iteraciji
|
||||||
|
# Appendamo content do messages in pošljem nazaj (Claude bo nadaljeval)
|
||||||
|
messages.append({"role": "assistant", "content": content})
|
||||||
|
# Claude server-side že obdela search, samo nadaljujemo s pustim user msg
|
||||||
|
# Ampak server-side tools NE potrebujejo follow-up tool_result
|
||||||
|
# Pravilen flow: če stop_reason=tool_use ampak web_search_tool_result je že v content,
|
||||||
|
# potem Claude sam nadaljuje. Drugače moramo poslati tool_result.
|
||||||
|
|
||||||
|
# Preverim ali so že rezultati v content
|
||||||
|
has_results = any(b.get("type") == "web_search_tool_result" for b in content)
|
||||||
|
if has_results:
|
||||||
|
# Server-side: Anthropic je sam obdelal search, čakamo nadaljevanje
|
||||||
|
# Pošlji nazaj brez sprememb da Claude nadaljuje
|
||||||
|
print(f" 🔍 Claude je iskal lyrics, čakam nadaljevanje (iter {iteration+1})", file=sys.stderr)
|
||||||
|
continue
|
||||||
|
else:
|
||||||
|
print(f" ⚠️ tool_use brez results", file=sys.stderr)
|
||||||
|
return None
|
||||||
|
|
||||||
|
# Drugi stop reasons
|
||||||
|
print(f" ⚠️ Nepričakovan stop_reason: {stop_reason}", file=sys.stderr)
|
||||||
|
return None
|
||||||
|
else:
|
||||||
|
print(f" ⚠️ Presežena max_iterations ({max_iterations})", file=sys.stderr)
|
||||||
|
return None
|
||||||
|
|
||||||
result = _parse_llm_response(text, video_duration)
|
result = _parse_llm_response(text, video_duration)
|
||||||
if not result:
|
if not result:
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user