Fix: Gemini 3.1 Pro thinking model needs 32k maxOutputTokens (was 4096 → MAX_TOKENS truncation)
Diagnoza:
- Gemini 3.x Pro je thinking model (ima internal reasoning, thoughtsTokenCount).
- Pri velikih transkriptih (60+ segmentov pesmi):
  * thoughts ~ 1500-3000 tokens
  * output JSON s corrected_segments ~ 3000-7000 tokens
  * total ~ 4500-10000 tokens
- Z maxOutputTokens=4096 je bil response prekinjen (finishReason: MAX_TOKENS), JSON odrezan na pol, _parse_llm_response je vrgel json.JSONDecodeError.
- Rezultat: 'Gemini vrnil prazen string' v logih.

Popravki:
1. Gemini maxOutputTokens 4096 → 32768 (dovolj za thinking + dolg JSON)
2. Diagnostika finishReason==MAX_TOKENS in usage tokens v logih
3. Detekcija praznega text-a (ne samo praznega parts array-a)
4. Claude max_tokens 4096 → 8192 (rezerva za dolge pesmi)
5. Claude detekcija stop_reason==max_tokens
6. Gemini urlopen timeout 120 → 180 s

Test (60 segmentov, 5631 char prompt):
- 4096 → finishReason=MAX_TOKENS, thoughts=2594, output=1488, JSON odrezan ❌
- 16384 → finishReason=STOP, thoughts=1445, output=3040, JSON popoln ✅
- 32768 → varen default ✅
This commit is contained in:
parent
534d710e8a
commit
e350352883
@ -542,7 +542,9 @@ def analyze_with_claude(transcript, video_duration, target_duration=30, model="c
|
||||
import urllib.error
|
||||
body = json.dumps({
|
||||
"model": model,
|
||||
"max_tokens": 4096,
|
||||
# 8192 je dovolj za ~250 corrected_segments + ostali metadata pri dolgih pesmih.
|
||||
# Sonnet 4.6 podpira precej več, ampak 8192 je varen default.
|
||||
"max_tokens": 8192,
|
||||
"messages": [{"role": "user", "content": prompt}],
|
||||
}).encode("utf-8")
|
||||
|
||||
@ -563,6 +565,18 @@ def analyze_with_claude(transcript, video_duration, target_duration=30, model="c
|
||||
if not content:
|
||||
print(" ⚠️ Claude vrnil prazen odgovor", file=sys.stderr)
|
||||
return None
|
||||
|
||||
# Diagnostika: če je bil response odrezan, je JSON nepopoln
|
||||
stop_reason = data.get("stop_reason")
|
||||
if stop_reason == "max_tokens":
|
||||
usage = data.get("usage", {})
|
||||
print(
|
||||
f" ⚠️ Claude odrezan (max_tokens): "
|
||||
f"input={usage.get('input_tokens')} output={usage.get('output_tokens')}",
|
||||
file=sys.stderr,
|
||||
)
|
||||
return None
|
||||
|
||||
text = content[0].get("text", "").strip()
|
||||
|
||||
result = _parse_llm_response(text, video_duration)
|
||||
@ -606,6 +620,11 @@ def analyze_with_gemini(transcript, video_duration, target_duration=30, model="g
|
||||
import urllib.error
|
||||
|
||||
url = f"https://generativelanguage.googleapis.com/v1beta/models/{model}:generateContent?key={api_key}"
|
||||
# Gemini 3.x Pro je THINKING model — porabi tokene tudi za internal reasoning (thoughtsTokenCount).
|
||||
# 4096 je prenizko: pri velikih transkriptih thinking lahko porabi 1500-3000 tokenov,
|
||||
# output (corrected_segments za 60+ segmentov) pa še dodatnih 3000-7000 → odreže JSON na pol
|
||||
# (finishReason: MAX_TOKENS) in vrne nepopoln, neveljaven JSON.
|
||||
# 32768 daje dovolj prostora za thinking + cel JSON output tudi pri dolgih pesmih.
|
||||
body = json.dumps({
|
||||
"contents": [{
|
||||
"role": "user",
|
||||
@ -613,7 +632,7 @@ def analyze_with_gemini(transcript, video_duration, target_duration=30, model="g
|
||||
}],
|
||||
"generationConfig": {
|
||||
"temperature": 0.1,
|
||||
"maxOutputTokens": 4096,
|
||||
"maxOutputTokens": 32768,
|
||||
"responseMimeType": "application/json",
|
||||
},
|
||||
}).encode("utf-8")
|
||||
@ -624,18 +643,46 @@ def analyze_with_gemini(transcript, video_duration, target_duration=30, model="g
|
||||
headers={"Content-Type": "application/json"},
|
||||
method="POST",
|
||||
)
|
||||
with urllib.request.urlopen(req, timeout=120) as resp:
|
||||
with urllib.request.urlopen(req, timeout=180) as resp:
|
||||
data = json.loads(resp.read().decode("utf-8"))
|
||||
|
||||
candidates = data.get("candidates", [])
|
||||
if not candidates:
|
||||
print(" ⚠️ Gemini vrnil 0 candidates", file=sys.stderr)
|
||||
return None
|
||||
parts = candidates[0].get("content", {}).get("parts", [])
|
||||
|
||||
cand0 = candidates[0]
|
||||
finish_reason = cand0.get("finishReason", "?")
|
||||
usage = data.get("usageMetadata", {})
|
||||
|
||||
# Diagnostika: če je finishReason == MAX_TOKENS, je output odrezan in JSON je invalid
|
||||
if finish_reason == "MAX_TOKENS":
|
||||
print(
|
||||
f" ⚠️ Gemini odrezan (MAX_TOKENS): "
|
||||
f"prompt={usage.get('promptTokenCount')} "
|
||||
f"thoughts={usage.get('thoughtsTokenCount')} "
|
||||
f"output={usage.get('candidatesTokenCount')}",
|
||||
file=sys.stderr,
|
||||
)
|
||||
return None
|
||||
|
||||
parts = cand0.get("content", {}).get("parts", [])
|
||||
if not parts:
|
||||
print(" ⚠️ Gemini vrnil prazen content", file=sys.stderr)
|
||||
print(
|
||||
f" ⚠️ Gemini vrnil prazen content (finishReason={finish_reason}, "
|
||||
f"thoughts={usage.get('thoughtsTokenCount')})",
|
||||
file=sys.stderr,
|
||||
)
|
||||
return None
|
||||
text = parts[0].get("text", "").strip()
|
||||
if not text:
|
||||
print(
|
||||
f" ⚠️ Gemini vrnil prazen text (finishReason={finish_reason}, "
|
||||
f"thoughts={usage.get('thoughtsTokenCount')}, "
|
||||
f"output={usage.get('candidatesTokenCount')})",
|
||||
file=sys.stderr,
|
||||
)
|
||||
return None
|
||||
|
||||
result = _parse_llm_response(text, video_duration)
|
||||
if not result:
|
||||
|
||||
Loading…
Reference in New Issue
Block a user