Fix: Gemini 3.1 Pro thinking model needs 32k maxOutputTokens (was 4096 → MAX_TOKENS truncation)

Diagnoza:
- Gemini 3.x Pro je thinking model (ima internal reasoning, thoughtsTokenCount)
- Pri velikih transkriptih (60+ segmentov pesmi):
  * thoughts ~ 1500-3000 tokens
  * output JSON s corrected_segments ~ 3000-7000 tokens
  * total ~ 4500-10000 tokens
- Z maxOutputTokens=4096 je bil response prekinjen (finishReason: MAX_TOKENS),
  JSON je bil odrezan na pol, _parse_llm_response pa je vrgel json.JSONDecodeError
- Rezultat: 'Gemini vrnil prazen string' v logih

Popravki:
1. Gemini maxOutputTokens 4096 → 32768 (dovolj za thinking + dolg JSON)
2. Diagnostika finishReason==MAX_TOKENS in usage tokens v logih
3. Detekcija praznega text-a (ne samo praznega parts array-a)
4. Claude max_tokens 4096 → 8192 (rezerva za dolge pesmi)
5. Claude detekcija stop_reason==max_tokens

Test (60 segmentov, 5631 char prompt):
- 4096 → finishReason=MAX_TOKENS, thoughts=2594, output=1488, JSON odrezan 
- 16384 → finishReason=STOP, thoughts=1445, output=3040, JSON popoln 
- 32768 → varen default 
This commit is contained in:
OpenClaw Agent 2026-04-29 09:03:53 +00:00
parent 534d710e8a
commit e350352883

View File

@ -542,7 +542,9 @@ def analyze_with_claude(transcript, video_duration, target_duration=30, model="c
import urllib.error
body = json.dumps({
"model": model,
"max_tokens": 4096,
# 8192 je dovolj za ~250 corrected_segments + ostali metadata pri dolgih pesmih.
# Sonnet 4.6 podpira precej več, ampak 8192 je varen default.
"max_tokens": 8192,
"messages": [{"role": "user", "content": prompt}],
}).encode("utf-8")
@ -563,6 +565,18 @@ def analyze_with_claude(transcript, video_duration, target_duration=30, model="c
if not content:
print(" ⚠️ Claude vrnil prazen odgovor", file=sys.stderr)
return None
# Diagnostika: če je bil response odrezan, je JSON nepopoln
stop_reason = data.get("stop_reason")
if stop_reason == "max_tokens":
usage = data.get("usage", {})
print(
f" ⚠️ Claude odrezan (max_tokens): "
f"input={usage.get('input_tokens')} output={usage.get('output_tokens')}",
file=sys.stderr,
)
return None
text = content[0].get("text", "").strip()
result = _parse_llm_response(text, video_duration)
@ -606,6 +620,11 @@ def analyze_with_gemini(transcript, video_duration, target_duration=30, model="g
import urllib.error
url = f"https://generativelanguage.googleapis.com/v1beta/models/{model}:generateContent?key={api_key}"
# Gemini 3.x Pro je THINKING model — porabi tokene tudi za internal reasoning (thoughtsTokenCount).
# 4096 je prenizko: pri velikih transkriptih thinking lahko porabi 1500-3000 tokenov,
# output (corrected_segments za 60+ segmentov) pa še dodatnih 3000-7000 → odreže JSON na pol
# (finishReason: MAX_TOKENS) in vrne nepopolen, neveljaven JSON.
# 32768 daje dovolj prostora za thinking + cel JSON output tudi pri dolgih pesmih.
body = json.dumps({
"contents": [{
"role": "user",
@ -613,7 +632,7 @@ def analyze_with_gemini(transcript, video_duration, target_duration=30, model="g
}],
"generationConfig": {
"temperature": 0.1,
"maxOutputTokens": 4096,
"maxOutputTokens": 32768,
"responseMimeType": "application/json",
},
}).encode("utf-8")
@ -624,18 +643,46 @@ def analyze_with_gemini(transcript, video_duration, target_duration=30, model="g
headers={"Content-Type": "application/json"},
method="POST",
)
with urllib.request.urlopen(req, timeout=120) as resp:
with urllib.request.urlopen(req, timeout=180) as resp:
data = json.loads(resp.read().decode("utf-8"))
candidates = data.get("candidates", [])
if not candidates:
print(" ⚠️ Gemini vrnil 0 candidates", file=sys.stderr)
return None
parts = candidates[0].get("content", {}).get("parts", [])
cand0 = candidates[0]
finish_reason = cand0.get("finishReason", "?")
usage = data.get("usageMetadata", {})
# Diagnostika: če je finishReason == MAX_TOKENS, je output odrezan in JSON je invalid
if finish_reason == "MAX_TOKENS":
print(
f" ⚠️ Gemini odrezan (MAX_TOKENS): "
f"prompt={usage.get('promptTokenCount')} "
f"thoughts={usage.get('thoughtsTokenCount')} "
f"output={usage.get('candidatesTokenCount')}",
file=sys.stderr,
)
return None
parts = cand0.get("content", {}).get("parts", [])
if not parts:
print(" ⚠️ Gemini vrnil prazen content", file=sys.stderr)
print(
f" ⚠️ Gemini vrnil prazen content (finishReason={finish_reason}, "
f"thoughts={usage.get('thoughtsTokenCount')})",
file=sys.stderr,
)
return None
text = parts[0].get("text", "").strip()
if not text:
print(
f" ⚠️ Gemini vrnil prazen text (finishReason={finish_reason}, "
f"thoughts={usage.get('thoughtsTokenCount')}, "
f"output={usage.get('candidatesTokenCount')})",
file=sys.stderr,
)
return None
result = _parse_llm_response(text, video_duration)
if not result: