From 5f90085981badae345323b0658cc0cf52ea68120 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sebastjan=20Arti=C4=8D?= Date: Wed, 29 Apr 2026 12:24:17 +0000 Subject: [PATCH] Add Claude web_search tool for lyrics lookup + tighter subtitle timing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. Claude API web_search tool integration: - Claude can now search web for actual lyrics when STT text is wrong - Especially useful for SLO/HR/BS/SR songs (Modrijani, Veseli Dolenjci) where Claude doesn't know lyrics from training data - Agentic loop: tool_use → server-side search → continuation → final text - Max 3 searches per job ($0.03 cost limit) - Hint sources: besedila.com, lyricstranslate.com, tekstovi.net, songtexte.com 2. Tighter subtitle segmentation from Scribe word timestamps: - Phrase boundaries on shorter pauses (0.4s vs 0.6s) - Sentence-ending punctuation triggers segment break - Max segment 4s (was 6s) for natural readable subtitles - Hard cap at 5.5s to prevent very long lines This fixes 'ples to noč' → 'ples pojoč' for Modrijani songs that Scribe transcribed phonetically wrong but Claude can fix via web lookup. --- scripts/analyze.py | 157 ++++++++++++++++++++++++++++++++------------- 1 file changed, 114 insertions(+), 43 deletions(-) diff --git a/scripts/analyze.py b/scripts/analyze.py index 9a964bf..b821801 100644 --- a/scripts/analyze.py +++ b/scripts/analyze.py @@ -137,7 +137,11 @@ def transcribe_with_elevenlabs(audio_path, lang=None, model="scribe_v1"): detected_prob = data.get("language_probability", 1.0) # Scribe returns flat list of words (not segments) - # We need to group words into pseudo-segments (~10s each, breaking on long pauses) + # We group words into pseudo-segments using **smart phrase-aware segmentation**: + # - Close on long pause (>= 0.4s) — natural breath/phrase boundary + # - OR after sentence-ending punctuation (. ! ?) + # - OR after 4 seconds (max segment length for readable subtitle) + # This gives ~3-7 word segments matching natural sung phrases. words = data.get("words", []) segments = [] @@ -152,16 +156,26 @@ def transcribe_with_elevenlabs(audio_path, lang=None, model="scribe_v1"): for i, w in enumerate(real_words): current_seg_words.append(w) w_end = w.get("end", w.get("start", 0)) + w_text = w.get("text", "") - # Decide if we should close the segment close = False - # Close on long pause (>= 0.6s) + # Decide if we should close the segment if i + 1 < len(real_words): next_start = real_words[i + 1].get("start", w_end) pause = next_start - w_end seg_duration = w_end - seg_start - # Long pause OR segment is long enough (>= 4s) - if pause >= 0.6 or seg_duration >= 6.0: + + # Trigger close on: + # 1. Long pause (>= 0.4s) = phrase boundary + # 2. Sentence-ending punctuation + # 3. Segment is long enough (>= 4s) + if pause >= 0.4: + close = True + elif seg_duration >= 4.0 and pause >= 0.15: + close = True + elif w_text.rstrip().endswith(('.', '!', '?')) and pause >= 0.2: + close = True + elif seg_duration >= 5.5: # hard cap close = True else: close = True # last word @@ -631,18 +645,24 @@ def _build_analysis_prompt(transcript, video_duration, target_duration=30, filen 🎵 IME DATOTEKE: "{filename_hint}" Iz imena datoteke morda lahko prepoznaš naslov pesmi ali izvajalca. Če je tako: - Uporabi svoje znanje o **dejanskem besedilu** te pesmi -- Če Whisper transkript ne ustreza znanemu besedilu pesmi (halucinacija), POPRAVI besedilo na **dejansko besedilo pesmi** -- Ohrani timestamp-e iz Whisper-ja (časovne meje so pravilne, samo besede so napačne) +- Če Whisper transkript ne ustreza znanemu besedilu pesmi, POPRAVI besedilo na **dejansko besedilo pesmi** +- Ohrani timestamp-e iz Whisper-ja (časovne meje so pravilne, samo besede so morda napačne) + +🔍 ČE NE POZNAŠ PESMI (npr. slovenske narodno-zabavne, manj znane pesmi) → **UPORABI web_search tool** da poiščeš pravo besedilo! + Primeri search queryjev: + - "[ime izvajalca] [naslov pesmi] besedilo" (slovenske: Modrijani, Veseli Dolenjci, Avseniki, Čuki, Atomik Harmonik) + - "[artist] [title] lyrics" (angleške/nemške) + - Pogosto so besedila na: besedila.com, lyricstranslate.com, genius.com, tekstovi.net (HR/SR), songtexte.com (DE) + Ko najdeš pravo besedilo, uporabi to za popravljanje "corrected_segments" — **transkript bo veliko bolj točen** kot če le ugibaš. """ - return f"""Tu je transcript pesmi iz Whisper modela (timestamp v sekundah, besedilo): + return f"""Tu je transcript pesmi iz STT modela (timestamp v sekundah, besedilo): {transcript_text} Cela pesem traja {video_duration:.1f}s. Cilj: izrezati ~{target_duration}s odsek za TikTok/Instagram Reel.{hint_block} -⚠️ POMEMBNO: Whisper si IZMIŠLJA besede ko ne razume jasno (HALLUCINACIJA). Posebej: -- Ko glasba prevladuje nad vokalom +⚠️ POMEMBNO: STT lahko naredi napake — narečne besede, slovanski jeziki, ko glasba prevladuje: - Pri narečjih in slovanskih jezikih - Generira "tipičen" tekst (npr. tekst druge pesmi istega izvajalca) - Lahko vstavi besede ki se POdoBNO slišijo, ampak imajo ČISTO drug pomen @@ -739,44 +759,95 @@ def analyze_with_claude(transcript, video_duration, target_duration=30, model="c try: import urllib.request import urllib.error - body = json.dumps({ - "model": model, - # 8192 je dovolj za ~250 corrected_segments + ostali metadata pri dolgih pesmih. - # Sonnet 4.6 podpira precej več, ampak 8192 je varen default. - "max_tokens": 8192, - "messages": [{"role": "user", "content": prompt}], - }).encode("utf-8") - req = urllib.request.Request( - "https://api.anthropic.com/v1/messages", - data=body, - headers={ - "Content-Type": "application/json", - "x-api-key": api_key, - "anthropic-version": "2023-06-01", - }, - method="POST", - ) - with urllib.request.urlopen(req, timeout=120) as resp: - data = json.loads(resp.read().decode("utf-8")) + # Initial messages + messages = [{"role": "user", "content": prompt}] - content = data.get("content", []) - if not content: - print(" ⚠️ Claude vrnil prazen odgovor", file=sys.stderr) - return None + # Sonnet 4.6 podpira web_search tool — Claude lahko poišče prave lyrics + # za pesmi v slovenščini/hrvaščini/itd., če jih ne pozna iz training data. + tools = [{ + "type": "web_search_20250305", + "name": "web_search", + "max_uses": 3, # Maksimalno 3 search-i = $0.03/job + }] - # Diagnostika: če je bil response odrezan, je JSON nepopoln - stop_reason = data.get("stop_reason") - if stop_reason == "max_tokens": - usage = data.get("usage", {}) - print( - f" ⚠️ Claude odrezan (max_tokens): " - f"input={usage.get('input_tokens')} output={usage.get('output_tokens')}", - file=sys.stderr, + # Agentic loop: Claude lahko kliče web_search, dobi rezultate, vrne final answer + max_iterations = 5 + for iteration in range(max_iterations): + body = json.dumps({ + "model": model, + "max_tokens": 8192, + "messages": messages, + "tools": tools, + }).encode("utf-8") + + req = urllib.request.Request( + "https://api.anthropic.com/v1/messages", + data=body, + headers={ + "Content-Type": "application/json", + "x-api-key": api_key, + "anthropic-version": "2023-06-01", + }, + method="POST", ) - return None + with urllib.request.urlopen(req, timeout=180) as resp: + data = json.loads(resp.read().decode("utf-8")) - text = content[0].get("text", "").strip() + content = data.get("content", []) + if not content: + print(" ⚠️ Claude vrnil prazen odgovor", file=sys.stderr) + return None + + stop_reason = data.get("stop_reason") + if stop_reason == "max_tokens": + usage = data.get("usage", {}) + print( + f" ⚠️ Claude odrezan (max_tokens): " + f"input={usage.get('input_tokens')} output={usage.get('output_tokens')}", + file=sys.stderr, + ) + return None + + # Če je end_turn → smo končali, parsiraj text + if stop_reason in ("end_turn", "stop_sequence"): + # Najdem zadnji text block + text_blocks = [b for b in content if b.get("type") == "text"] + if text_blocks: + text = text_blocks[-1].get("text", "").strip() + break + print(" ⚠️ Claude end_turn brez text bloka", file=sys.stderr) + return None + + # Če je tool_use → Claude kliče web_search; appendamo response in nadaljujemo + if stop_reason == "tool_use": + # Anthropic web_search tool je server-side — sami obdela searches in vrne web_search_tool_result + # Ampak v API odgovoru so OBA: tool_use IN web_search_tool_result kot del content + # Torej končni text že obstaja v naslednji iteraciji + # Appendamo content do messages in pošljem nazaj (Claude bo nadaljeval) + messages.append({"role": "assistant", "content": content}) + # Claude server-side že obdela search, samo nadaljujemo s pustim user msg + # Ampak server-side tools NE potrebujejo follow-up tool_result + # Pravilen flow: če stop_reason=tool_use ampak web_search_tool_result je že v content, + # potem Claude sam nadaljuje. Drugače moramo poslati tool_result. + + # Preverim ali so že rezultati v content + has_results = any(b.get("type") == "web_search_tool_result" for b in content) + if has_results: + # Server-side: Anthropic je sam obdelal search, čakamo nadaljevanje + # Pošlji nazaj brez sprememb da Claude nadaljuje + print(f" 🔍 Claude je iskal lyrics, čakam nadaljevanje (iter {iteration+1})", file=sys.stderr) + continue + else: + print(f" ⚠️ tool_use brez results", file=sys.stderr) + return None + + # Drugi stop reasons + print(f" ⚠️ Nepričakovan stop_reason: {stop_reason}", file=sys.stderr) + return None + else: + print(f" ⚠️ Presežena max_iterations ({max_iterations})", file=sys.stderr) + return None result = _parse_llm_response(text, video_duration) if not result: