Compare commits

...

2 Commits

2 changed files with 9 additions and 189 deletions

View File

@ -247,9 +247,6 @@ def process_job(job_id):
# Brez extension # Brez extension
fn_hint = Path(job["filename"]).stem fn_hint = Path(job["filename"]).stem
cmd += ["--filename-hint", fn_hint] cmd += ["--filename-hint", fn_hint]
# Whisper provider (groq = 200x hitreje od lokalnega)
if job.get("whisper_provider"):
cmd += ["--whisper-provider", job["whisper_provider"]]
# lang: če None ali 'auto', pusti analyze.py auto-detect # lang: če None ali 'auto', pusti analyze.py auto-detect
if job.get("lang") and job["lang"] not in ("auto", ""): if job.get("lang") and job["lang"] not in ("auto", ""):
cmd += ["--lang", job["lang"]] cmd += ["--lang", job["lang"]]
@ -503,8 +500,6 @@ class StartJobIn(BaseModel):
# LLM za semantično analizo + popravke # LLM za semantično analizo + popravke
llm_provider: str = "claude" # claude / gemini / auto llm_provider: str = "claude" # claude / gemini / auto
llm_model: Optional[str] = None # specifičen model (privzeto najboljši za provider) llm_model: Optional[str] = None # specifičen model (privzeto najboljši za provider)
# Whisper provider (Groq je 200x hitrejši od lokalnega CPU faster-whisper)
whisper_provider: str = "auto" # auto / groq / local
# ──────────────────────────────────────────────────────────────── # ────────────────────────────────────────────────────────────────
@ -610,7 +605,6 @@ async def start_processing(
quality=payload.quality, quality=payload.quality,
llm_provider=payload.llm_provider, llm_provider=payload.llm_provider,
llm_model=payload.llm_model, llm_model=payload.llm_model,
whisper_provider=payload.whisper_provider,
current_step="V vrsti za obdelavo", current_step="V vrsti za obdelavo",
# Počisti pretekle napake (retry-friendly) # Počisti pretekle napake (retry-friendly)
chorus_error=None, chorus_error=None,

View File

@ -46,152 +46,13 @@ def extract_audio(video_path):
return audio.name return audio.name
# Groq rejects uploads above 25 MB; stay one MB below to leave room for the
# multipart framing around the audio payload.
_GROQ_MAX_UPLOAD_BYTES = 24 * 1024 * 1024

# Groq reports the detected language as an English name ("German",
# "Slovenian"); the rest of the pipeline works with ISO 639-1 codes.
_GROQ_LANG_TO_ISO = {
    "english": "en", "german": "de", "slovenian": "sl", "croatian": "hr",
    "bosnian": "bs", "serbian": "sr", "italian": "it", "spanish": "es",
    "french": "fr", "portuguese": "pt", "russian": "ru", "polish": "pl",
    "czech": "cs", "slovak": "sk", "hungarian": "hu", "romanian": "ro",
    # Names whose first two letters are NOT their ISO code, so the prefix
    # fallback below would mislabel them.
    "dutch": "nl", "swedish": "sv", "danish": "da", "greek": "el",
    "turkish": "tr", "ukrainian": "uk", "albanian": "sq", "chinese": "zh",
    "korean": "ko", "macedonian": "mk",
}


def _groq_multipart_body(boundary, file_content, fields):
    """Encode the audio bytes plus (name, value) text fields as multipart/form-data."""
    parts = [
        f"--{boundary}\r\nContent-Disposition: form-data; "
        f"name=\"file\"; filename=\"audio.wav\"\r\n"
        f"Content-Type: audio/wav\r\n\r\n".encode()
        + file_content + b"\r\n"
    ]
    parts.extend(
        f"--{boundary}\r\nContent-Disposition: form-data; "
        f"name=\"{name}\"\r\n\r\n{value}\r\n".encode()
        for name, value in fields
    )
    parts.append(f"--{boundary}--\r\n".encode())
    return b"".join(parts)


def _groq_language_to_iso(language_name):
    """Map the language name from a Groq response to a two-letter code."""
    if not language_name:
        # Missing/null language: report it as such instead of slicing a
        # placeholder down to a bogus two-letter code.
        return "unknown"
    key = str(language_name).strip().lower()
    # Unmapped names fall back to their two-letter prefix; a response that
    # already carries an ISO code passes through unchanged.
    return _GROQ_LANG_TO_ISO.get(key, key[:2])


def _groq_segments_with_words(raw_segments, raw_words):
    """Convert Groq segments to the local format, giving each word to one segment."""
    words_by_segment = [[] for _ in raw_segments]
    for word in raw_words:
        owner = None
        for idx, seg in enumerate(raw_segments):
            if seg["start"] <= word["start"] <= seg["end"]:
                # Keep scanning: a word that starts exactly on a shared
                # boundary belongs to the later segment, not to both.
                owner = idx
        if owner is not None:
            words_by_segment[owner].append({
                "start": word["start"],
                "end": word["end"],
                "text": word["word"],
            })
    return [
        {
            "start": seg["start"],
            "end": seg["end"],
            "text": seg["text"].strip(),
            "words": seg_words,
        }
        for seg, seg_words in zip(raw_segments, words_by_segment)
    ]


def transcribe_with_groq(audio_path, lang=None, model="whisper-large-v3-turbo"):
    """Transcribe an audio file with Whisper through the Groq API.

    Args:
        audio_path: Path of the audio file to upload.
        lang: Optional language code sent as the ``language`` field; when
            falsy the field is omitted and the service detects the language.
        model: Groq Whisper model name.

    Returns:
        A dict with ``language``, ``language_probability`` and ``segments``
        (each segment has ``start``, ``end``, ``text`` and ``words``), or
        ``None`` when the API key is missing, the file exceeds the upload
        limit, or the request fails - callers treat ``None`` as "use another
        provider".

    Raises:
        OSError: If ``audio_path`` cannot be stat'ed or read.
    """
    import urllib.error
    import urllib.request
    import uuid

    api_key = os.environ.get("GROQ_API_KEY")
    if not api_key:
        print(" ⚠️ GROQ_API_KEY ni nastavljen", file=sys.stderr)
        return None

    # Check the size on disk first so an oversized file is never loaded
    # into memory just to be rejected.
    size_bytes = os.path.getsize(audio_path)
    size_mb = size_bytes / 1024 / 1024
    if size_bytes > _GROQ_MAX_UPLOAD_BYTES:
        print(f" ⚠️ Audio file {size_mb:.1f} MB > 24 MB limit, fallback na lokalno", file=sys.stderr)
        return None

    with open(audio_path, "rb") as f:
        file_content = f.read()

    fields = [
        ("model", model),
        ("response_format", "verbose_json"),
        ("temperature", "0.0"),
        ("timestamp_granularities[]", "segment"),
        ("timestamp_granularities[]", "word"),
    ]
    if lang:
        fields.append(("language", lang))

    boundary = uuid.uuid4().hex
    req = urllib.request.Request(
        "https://api.groq.com/openai/v1/audio/transcriptions",
        data=_groq_multipart_body(boundary, file_content, fields),
        headers={
            "Authorization": f"Bearer {api_key}",
            "Content-Type": f"multipart/form-data; boundary={boundary}",
            "User-Agent": "groq-python/0.11.0",  # Cloudflare bypass
        },
    )

    print(f" 📡 Groq Whisper ({model}, {size_mb:.1f} MB)...", file=sys.stderr)
    try:
        with urllib.request.urlopen(req, timeout=180) as resp:
            data = json.loads(resp.read().decode())
    except urllib.error.HTTPError as e:
        body_err = e.read().decode("utf-8", errors="replace")[:500]
        print(f" ❌ Groq HTTP {e.code}: {body_err}", file=sys.stderr)
        return None
    except Exception as e:
        # Deliberately broad: any transport or decoding failure means
        # "Groq unavailable", and the caller decides whether to fall back.
        print(f" ❌ Groq exception: {e}", file=sys.stderr)
        return None

    # Convert the Groq response into the same shape the local Whisper path returns.
    detected_lang_iso = _groq_language_to_iso(data.get("language"))
    segments = _groq_segments_with_words(
        data.get("segments") or [],
        data.get("words") or [],
    )

    print(f" ✅ Groq: {len(segments)} segmentov, lang={detected_lang_iso}", file=sys.stderr)
    return {
        "language": detected_lang_iso,
        "language_probability": 1.0,  # Groq does not return a confidence value
        "segments": segments,
    }
def transcribe_full(audio_path, lang=None, model_size="small", provider="local"):
    """Produce a Whisper transcript of the whole audio file.

    provider:
      - "local": faster-whisper through ``_transcribe_full_local``
      - "groq":  Groq Whisper API; if the Groq call returns nothing, an empty
                 transcript is returned instead of falling back
      - "auto":  try Groq, fall back to the local path if it returns nothing

    Groq is only attempted when the ``GROQ_API_KEY`` environment variable is
    set; without it both "groq" and "auto" go straight to the local path.

    Returns the transcript dict from whichever provider handled the request.
    """
    groq_requested = provider in ("groq", "auto")
    if groq_requested and os.environ.get("GROQ_API_KEY"):
        # Only "large-v3" maps to the full-size Groq model; every other
        # model_size uses the turbo variant.
        if model_size == "large-v3":
            groq_model = "whisper-large-v3"
        else:
            groq_model = "whisper-large-v3-turbo"

        remote = transcribe_with_groq(audio_path, lang=lang, model=groq_model)
        if remote:
            return remote

        if provider != "groq":
            print(f" 🔄 Groq failed, fallback na lokalno faster-whisper...", file=sys.stderr)
        else:
            # Strict Groq mode: report the failure as an empty transcript.
            print(f" ⚠️ Groq failed, no fallback (provider=groq)", file=sys.stderr)
            return {"language": "unknown", "language_probability": 0.0, "segments": []}

    # Local faster-whisper path (also the fallback for "auto").
    return _transcribe_full_local(audio_path, lang=lang, model_size=model_size)
def _transcribe_full_local(audio_path, lang=None, model_size="small"):
"""Lokalna faster-whisper transkripcija (originalna implementacija)."""
from faster_whisper import WhisperModel from faster_whisper import WhisperModel
print(f"🧠 Whisper LOCAL {model_size}, lang={lang or 'auto'}", file=sys.stderr) print(f"🧠 Whisper {model_size}, lang={lang or 'auto'}", file=sys.stderr)
m = WhisperModel(model_size, device="cpu", compute_type="int8") m = WhisperModel(model_size, device="cpu", compute_type="int8")
# Auto-detect z 3-sample voting da se zaklenemo na en jezik # Auto-detect z 3-sample voting da se zaklenemo na en jezik
@ -629,12 +490,10 @@ PROSIM:
- Ohrani timestamp-e nespremenjene - Ohrani timestamp-e nespremenjene
3. Prepoznaj REFREN: del besedila ki se PONAVLJA 3. Prepoznaj REFREN: del besedila ki se PONAVLJA
4. Izberi najboljši odsek za reel: 4. Izberi najboljši odsek za reel:
- **PREDNOSTNO**: en cel refren + morda kratek pre-chorus (skupaj 20-35s) - Vključi cel refren (brez prekinitve)
- **NIKOLI ne vključi**: "la la la", "ooh ooh", "yeah yeah", instrumentalni medbridge (interludij) - Lahko dodaj pre-chorus build-up
- **NIKOLI ne podaljšaj** clip range zato da bi vključil 2 refrena povezana z la-la-la ali instrumentalom - 20-45 sekund
- Če sta dva refrena ločena z medbridge-om/instrumentalom, izberi **SAMO PRVEGA** - Začni in končaj na smiselni meji
- Začni in končaj na smiselni meji (konec stavka)
- Maksimalno 35 sekund (smartphone reel attention span)
5. Če pesem nima jasnega refrena, izberi najbolj dramatičen ali zaključen del 5. Če pesem nima jasnega refrena, izberi najbolj dramatičen ali zaključen del
6. Če Whisper transkript je v večini halucinacija (manj kot 30% smiselnih besed), v "reason" napiši "WHISPER_HALLUCINATION_DETECTED" in vrni najmanj segmentov (samo tisti ki so smiselni) 6. Če Whisper transkript je v večini halucinacija (manj kot 30% smiselnih besed), v "reason" napiši "WHISPER_HALLUCINATION_DETECTED" in vrni najmanj segmentov (samo tisti ki so smiselni)
@ -929,9 +788,6 @@ def main():
help="Specifičen model (npr. claude-sonnet-4-6, gemini-3.1-pro-preview)") help="Specifičen model (npr. claude-sonnet-4-6, gemini-3.1-pro-preview)")
ap.add_argument("--filename-hint", default=None, ap.add_argument("--filename-hint", default=None,
help="Originalno ime datoteke (Claude lahko prepozna pesem)") help="Originalno ime datoteke (Claude lahko prepozna pesem)")
ap.add_argument("--whisper-provider", default="auto",
choices=["local", "groq", "auto"],
help="Whisper provider: local=faster-whisper na CPU, groq=Groq API (200x hitreje), auto=Groq če ima API key")
ap.add_argument("--json", action="store_true", help="Output JSON") ap.add_argument("--json", action="store_true", help="Output JSON")
ap.add_argument("--output", help="Path za JSON output") ap.add_argument("--output", help="Path za JSON output")
args = ap.parse_args() args = ap.parse_args()
@ -950,10 +806,7 @@ def main():
try: try:
# 2. Whisper transcript # 2. Whisper transcript
lang = None if args.lang in (None, "auto", "") else args.lang lang = None if args.lang in (None, "auto", "") else args.lang
transcript = transcribe_full( transcript = transcribe_full(audio, lang=lang, model_size=args.model)
audio, lang=lang, model_size=args.model,
provider=args.whisper_provider,
)
print(f" Transkripcija: {len(transcript['segments'])} segmentov", file=sys.stderr) print(f" Transkripcija: {len(transcript['segments'])} segmentov", file=sys.stderr)
# 3. Energy profile # 3. Energy profile
@ -1028,33 +881,6 @@ def main():
clip_range["end"] = clip_range["start"] + args.max_duration clip_range["end"] = clip_range["start"] + args.max_duration
clip_range["duration"] = args.max_duration clip_range["duration"] = args.max_duration
clip_range["reason"] += " (capped at max_duration)" clip_range["reason"] += " (capped at max_duration)"
# ── DETEKCIJA "filler" segmentov (la-la-la, ooh, instrumental fillers) ──
# Če clip vsebuje segment kjer je >70% besedila ponovljen token,
# skrajšaj clip tik pred tem segmentom (preprečimo nesmiselno podaljšanje)
corrected_segs = claude_result.get("corrected_segments") or transcript["segments"]
for seg in corrected_segs:
seg_start = float(seg.get("start", 0))
seg_end = float(seg.get("end", 0))
seg_text = str(seg.get("text", "")).lower().strip()
# Samo segmenti znotraj clip range
if seg_start < clip_range["start"] or seg_end > clip_range["end"]:
continue
# Filler detection: ponavljajoče besede
words = seg_text.split()
if len(words) >= 4:
unique_ratio = len(set(words)) / len(words)
# Če je <30% unique besed = repetitive filler
if unique_ratio < 0.3:
# Skrajšaj clip do začetka tega segmenta
if seg_start - clip_range["start"] >= args.min_duration:
print(f" ✂️ Filler detected at {seg_start:.1f}s "
f"('{seg_text[:40]}', unique={unique_ratio:.0%}), "
f"trimming clip", file=sys.stderr)
clip_range["end"] = round(seg_start, 2)
clip_range["duration"] = round(seg_start - clip_range["start"], 2)
clip_range["reason"] += f" (trimmed at filler @ {seg_start:.1f}s)"
break
else: else:
clip_range = smart_clip_range( clip_range = smart_clip_range(
chorus, transcript, duration, chorus, transcript, duration,