Compare commits
2 Commits
4488717f6f
...
3ffa9740f0
| Author | SHA1 | Date | |
|---|---|---|---|
| 3ffa9740f0 | |||
| 6a8f87b4a2 |
@ -247,9 +247,6 @@ def process_job(job_id):
|
|||||||
# Brez extension
|
# Brez extension
|
||||||
fn_hint = Path(job["filename"]).stem
|
fn_hint = Path(job["filename"]).stem
|
||||||
cmd += ["--filename-hint", fn_hint]
|
cmd += ["--filename-hint", fn_hint]
|
||||||
# Whisper provider (groq = 200x hitreje od lokalnega)
|
|
||||||
if job.get("whisper_provider"):
|
|
||||||
cmd += ["--whisper-provider", job["whisper_provider"]]
|
|
||||||
# lang: če None ali 'auto', pusti analyze.py auto-detect
|
# lang: če None ali 'auto', pusti analyze.py auto-detect
|
||||||
if job.get("lang") and job["lang"] not in ("auto", ""):
|
if job.get("lang") and job["lang"] not in ("auto", ""):
|
||||||
cmd += ["--lang", job["lang"]]
|
cmd += ["--lang", job["lang"]]
|
||||||
@ -503,8 +500,6 @@ class StartJobIn(BaseModel):
|
|||||||
# LLM za semantično analizo + popravke
|
# LLM za semantično analizo + popravke
|
||||||
llm_provider: str = "claude" # claude / gemini / auto
|
llm_provider: str = "claude" # claude / gemini / auto
|
||||||
llm_model: Optional[str] = None # specifičen model (privzeto najboljši za provider)
|
llm_model: Optional[str] = None # specifičen model (privzeto najboljši za provider)
|
||||||
# Whisper provider (Groq je 200x hitrejši od lokalnega CPU faster-whisper)
|
|
||||||
whisper_provider: str = "auto" # auto / groq / local
|
|
||||||
|
|
||||||
|
|
||||||
# ────────────────────────────────────────────────────────────────
|
# ────────────────────────────────────────────────────────────────
|
||||||
@ -610,7 +605,6 @@ async def start_processing(
|
|||||||
quality=payload.quality,
|
quality=payload.quality,
|
||||||
llm_provider=payload.llm_provider,
|
llm_provider=payload.llm_provider,
|
||||||
llm_model=payload.llm_model,
|
llm_model=payload.llm_model,
|
||||||
whisper_provider=payload.whisper_provider,
|
|
||||||
current_step="V vrsti za obdelavo",
|
current_step="V vrsti za obdelavo",
|
||||||
# Počisti pretekle napake (retry-friendly)
|
# Počisti pretekle napake (retry-friendly)
|
||||||
chorus_error=None,
|
chorus_error=None,
|
||||||
|
|||||||
@ -46,152 +46,13 @@ def extract_audio(video_path):
|
|||||||
return audio.name
|
return audio.name
|
||||||
|
|
||||||
|
|
||||||
def transcribe_with_groq(audio_path, lang=None, model="whisper-large-v3-turbo"):
    """Transcribe an audio file with Whisper through the Groq HTTP API.

    The file is uploaded as a hand-built ``multipart/form-data`` request
    and the ``verbose_json`` response is converted to the same dict layout
    the local Whisper path produces.

    Original author's notes (not verifiable from this code): Groq runs at
    roughly 216x realtime, and costs about $0.04/h for the turbo model or
    $0.111/h for large-v3.

    Args:
        audio_path: Path of the audio file to upload (sent as ``audio/wav``).
        lang: Optional language code sent as the ``language`` form field;
            the field is omitted when this is falsy.
        model: Groq model name sent as the ``model`` form field.

    Returns:
        ``{"language", "language_probability", "segments"}`` on success,
        where each segment has ``start``, ``end``, ``text`` and ``words``.
        ``None`` on any failure (missing ``GROQ_API_KEY``, unreadable or
        oversized file, HTTP/network error, unexpected response shape) so
        that the caller can fall back to another provider.
    """
    import urllib.error
    import urllib.request
    import uuid

    api_key = os.environ.get("GROQ_API_KEY")
    if not api_key:
        print(" ⚠️ GROQ_API_KEY ni nastavljen", file=sys.stderr)
        return None

    # Groq limits API requests to 25 MB; stay just under it. The size is
    # checked on disk first so an oversized file is never loaded into memory.
    max_bytes = 24 * 1024 * 1024
    try:
        size = os.path.getsize(audio_path)
        if size > max_bytes:
            print(f" ⚠️ Audio file {size/1024/1024:.1f} MB > 24 MB limit, fallback na lokalno", file=sys.stderr)
            return None
        with open(audio_path, "rb") as f:
            file_content = f.read()
    except OSError as e:
        # An unreadable file is reported like every other failure (None)
        # instead of crashing the caller's fallback logic.
        print(f" ❌ Groq: cannot read audio file: {e}", file=sys.stderr)
        return None

    # Build the multipart/form-data body by hand (stdlib only).
    boundary = uuid.uuid4().hex

    def text_part(name, value):
        return (
            f"--{boundary}\r\nContent-Disposition: form-data; "
            f"name=\"{name}\"\r\n\r\n{value}\r\n"
        ).encode()

    def file_part(name, filename, content, content_type="application/octet-stream"):
        header = (
            f"--{boundary}\r\nContent-Disposition: form-data; "
            f"name=\"{name}\"; filename=\"{filename}\"\r\n"
            f"Content-Type: {content_type}\r\n\r\n"
        ).encode()
        return header + content + b"\r\n"

    fields = [
        ("model", model),
        ("response_format", "verbose_json"),
        ("temperature", "0.0"),
        # Request both granularities: segments for structure, words for timing.
        ("timestamp_granularities[]", "segment"),
        ("timestamp_granularities[]", "word"),
    ]
    if lang:
        fields.append(("language", lang))

    parts = [file_part("file", "audio.wav", file_content, "audio/wav")]
    parts.extend(text_part(name, value) for name, value in fields)
    parts.append(f"--{boundary}--\r\n".encode())
    body = b"".join(parts)

    req = urllib.request.Request(
        "https://api.groq.com/openai/v1/audio/transcriptions",
        data=body,
        headers={
            "Authorization": f"Bearer {api_key}",
            "Content-Type": f"multipart/form-data; boundary={boundary}",
            # Original author's note: needed to get past Cloudflare.
            "User-Agent": "groq-python/0.11.0",
        },
    )

    print(f" 📡 Groq Whisper ({model}, {len(file_content)/1024/1024:.1f} MB)...", file=sys.stderr)
    try:
        with urllib.request.urlopen(req, timeout=180) as resp:
            data = json.loads(resp.read().decode())
    except urllib.error.HTTPError as e:
        body_err = e.read().decode("utf-8", errors="replace")[:500]
        print(f" ❌ Groq HTTP {e.code}: {body_err}", file=sys.stderr)
        return None
    except Exception as e:
        # Deliberate best-effort boundary: any failure means "use fallback".
        print(f" ❌ Groq exception: {e}", file=sys.stderr)
        return None

    # Convert the Groq response to the same format as local Whisper.
    try:
        result = _groq_response_to_transcript(data)
    except (AttributeError, KeyError, TypeError) as e:
        print(f" ❌ Groq: unexpected response format: {e!r}", file=sys.stderr)
        return None

    print(f" ✅ Groq: {len(result['segments'])} segmentov, lang={result['language']}", file=sys.stderr)
    return result


# Language names as returned by Groq (lower-cased) mapped to ISO 639-1.
# The second group lists names whose first two letters are NOT their code.
_GROQ_LANGUAGE_TO_ISO = {
    "english": "en", "german": "de", "slovenian": "sl", "croatian": "hr",
    "bosnian": "bs", "serbian": "sr", "italian": "it", "spanish": "es",
    "french": "fr", "portuguese": "pt", "russian": "ru", "polish": "pl",
    "czech": "cs", "slovak": "sk", "hungarian": "hu", "romanian": "ro",
    "dutch": "nl", "swedish": "sv", "greek": "el", "turkish": "tr",
    "bulgarian": "bg", "macedonian": "mk", "albanian": "sq", "chinese": "zh",
}


def _groq_language_to_iso(name):
    """Map a Groq language name (e.g. "German") to an ISO 639-1 code."""
    key = (name or "unknown").strip().lower()
    if key in _GROQ_LANGUAGE_TO_ISO:
        return _GROQ_LANGUAGE_TO_ISO[key]
    if key in ("", "unknown"):
        return "unknown"
    # Unlisted name: keep the original two-letter-prefix heuristic.
    return key[:2]


def _groq_response_to_transcript(data):
    """Convert a Groq ``verbose_json`` payload to the local transcript dict."""
    raw_segments = data.get("segments") or []
    raw_words = data.get("words") or []

    # Each word is attached to exactly one segment. Segments are visited
    # last-to-first so that a word starting exactly on a shared boundary
    # goes to the segment it opens, not also to the previous one.
    claimed = [False] * len(raw_words)
    buckets = []
    for seg in reversed(raw_segments):
        bucket = []
        for idx, word in enumerate(raw_words):
            if claimed[idx] or not (seg["start"] <= word["start"] <= seg["end"]):
                continue
            claimed[idx] = True
            bucket.append({
                "start": word["start"],
                "end": word["end"],
                "text": word["word"],
            })
        buckets.append(bucket)
    buckets.reverse()

    segments = [
        {
            "start": seg["start"],
            "end": seg["end"],
            "text": seg["text"].strip(),
            "words": bucket,
        }
        for seg, bucket in zip(raw_segments, buckets)
    ]

    return {
        "language": _groq_language_to_iso(data.get("language")),
        "language_probability": 1.0,  # Groq does not return a confidence
        "segments": segments,
    }
|
|
||||||
|
|
||||||
|
|
||||||
def transcribe_full(audio_path, lang=None, model_size="small", provider="local"):
    """Produce a Whisper transcript of the whole audio file.

    provider:
      - "local" -> always use the local transcriber.
      - "groq"  -> use the Groq API; if the Groq call fails, return an
                   empty transcript instead of falling back.
      - "auto"  -> try Groq first, fall back to the local transcriber.

    Groq is only attempted when the GROQ_API_KEY environment variable is
    set; without it every provider value (including "groq") ends up on the
    local path.

    Returns the transcript dict of whichever backend ran, or an empty
    transcript (language "unknown", no segments) for a failed strict-groq
    request.
    """
    groq_requested = provider in ("groq", "auto")

    if groq_requested and os.environ.get("GROQ_API_KEY"):
        # Only "large-v3" maps to the full Groq model; every other size
        # uses the turbo variant.
        if model_size == "large-v3":
            remote_model = "whisper-large-v3"
        else:
            remote_model = "whisper-large-v3-turbo"

        remote = transcribe_with_groq(audio_path, lang=lang, model=remote_model)
        if remote:
            return remote

        if provider != "groq":
            print(" 🔄 Groq failed, fallback na lokalno faster-whisper...", file=sys.stderr)
        else:
            # Strict groq mode: report the failure and hand back an empty result.
            print(" ⚠️ Groq failed, no fallback (provider=groq)", file=sys.stderr)
            return {"language": "unknown", "language_probability": 0.0, "segments": []}

    # Local transcription path.
    return _transcribe_full_local(audio_path, lang=lang, model_size=model_size)
|
|
||||||
|
|
||||||
|
|
||||||
def _transcribe_full_local(audio_path, lang=None, model_size="small"):
|
|
||||||
"""Lokalna faster-whisper transkripcija (originalna implementacija)."""
|
|
||||||
from faster_whisper import WhisperModel
|
from faster_whisper import WhisperModel
|
||||||
|
|
||||||
print(f"🧠 Whisper LOCAL {model_size}, lang={lang or 'auto'}", file=sys.stderr)
|
print(f"🧠 Whisper {model_size}, lang={lang or 'auto'}", file=sys.stderr)
|
||||||
m = WhisperModel(model_size, device="cpu", compute_type="int8")
|
m = WhisperModel(model_size, device="cpu", compute_type="int8")
|
||||||
|
|
||||||
# Auto-detect z 3-sample voting da se zaklenemo na en jezik
|
# Auto-detect z 3-sample voting da se zaklenemo na en jezik
|
||||||
@ -629,12 +490,10 @@ PROSIM:
|
|||||||
- Ohrani timestamp-e nespremenjene
|
- Ohrani timestamp-e nespremenjene
|
||||||
3. Prepoznaj REFREN: del besedila ki se PONAVLJA
|
3. Prepoznaj REFREN: del besedila ki se PONAVLJA
|
||||||
4. Izberi najboljši odsek za reel:
|
4. Izberi najboljši odsek za reel:
|
||||||
- **PREDNOSTNO**: en cel refren + morda kratek pre-chorus (skupaj 20-35s)
|
- Vključi cel refren (brez prekinitve)
|
||||||
- **NIKOLI ne vključi**: "la la la", "ooh ooh", "yeah yeah", instrumentalni medbridge (interludij)
|
- Lahko dodaj pre-chorus build-up
|
||||||
- **NIKOLI ne podaljšaj** clip range zato da bi vključil 2 refrena povezana z la-la-la ali instrumentalom
|
- 20-45 sekund
|
||||||
- Če sta dva refrena ločena z medbridge-om/instrumentalom, izberi **SAMO PRVEGA**
|
- Začni in končaj na smiselni meji
|
||||||
- Začni in končaj na smiselni meji (konec stavka)
|
|
||||||
- Maksimalno 35 sekund (smartphone reel attention span)
|
|
||||||
5. Če pesem nima jasnega refrena, izberi najbolj dramatičen ali zaključen del
|
5. Če pesem nima jasnega refrena, izberi najbolj dramatičen ali zaključen del
|
||||||
6. Če Whisper transkript je v večini halucinacija (manj kot 30% smiselnih besed), v "reason" napiši "WHISPER_HALLUCINATION_DETECTED" in vrni najmanj segmentov (samo tisti ki so smiselni)
|
6. Če Whisper transkript je v večini halucinacija (manj kot 30% smiselnih besed), v "reason" napiši "WHISPER_HALLUCINATION_DETECTED" in vrni najmanj segmentov (samo tisti ki so smiselni)
|
||||||
|
|
||||||
@ -929,9 +788,6 @@ def main():
|
|||||||
help="Specifičen model (npr. claude-sonnet-4-6, gemini-3.1-pro-preview)")
|
help="Specifičen model (npr. claude-sonnet-4-6, gemini-3.1-pro-preview)")
|
||||||
ap.add_argument("--filename-hint", default=None,
|
ap.add_argument("--filename-hint", default=None,
|
||||||
help="Originalno ime datoteke (Claude lahko prepozna pesem)")
|
help="Originalno ime datoteke (Claude lahko prepozna pesem)")
|
||||||
ap.add_argument("--whisper-provider", default="auto",
|
|
||||||
choices=["local", "groq", "auto"],
|
|
||||||
help="Whisper provider: local=faster-whisper na CPU, groq=Groq API (200x hitreje), auto=Groq če ima API key")
|
|
||||||
ap.add_argument("--json", action="store_true", help="Output JSON")
|
ap.add_argument("--json", action="store_true", help="Output JSON")
|
||||||
ap.add_argument("--output", help="Path za JSON output")
|
ap.add_argument("--output", help="Path za JSON output")
|
||||||
args = ap.parse_args()
|
args = ap.parse_args()
|
||||||
@ -950,10 +806,7 @@ def main():
|
|||||||
try:
|
try:
|
||||||
# 2. Whisper transcript
|
# 2. Whisper transcript
|
||||||
lang = None if args.lang in (None, "auto", "") else args.lang
|
lang = None if args.lang in (None, "auto", "") else args.lang
|
||||||
transcript = transcribe_full(
|
transcript = transcribe_full(audio, lang=lang, model_size=args.model)
|
||||||
audio, lang=lang, model_size=args.model,
|
|
||||||
provider=args.whisper_provider,
|
|
||||||
)
|
|
||||||
print(f" Transkripcija: {len(transcript['segments'])} segmentov", file=sys.stderr)
|
print(f" Transkripcija: {len(transcript['segments'])} segmentov", file=sys.stderr)
|
||||||
|
|
||||||
# 3. Energy profile
|
# 3. Energy profile
|
||||||
@ -1028,33 +881,6 @@ def main():
|
|||||||
clip_range["end"] = clip_range["start"] + args.max_duration
|
clip_range["end"] = clip_range["start"] + args.max_duration
|
||||||
clip_range["duration"] = args.max_duration
|
clip_range["duration"] = args.max_duration
|
||||||
clip_range["reason"] += " (capped at max_duration)"
|
clip_range["reason"] += " (capped at max_duration)"
|
||||||
|
|
||||||
# ── DETEKCIJA "filler" segmentov (la-la-la, ooh, instrumental fillers) ──
|
|
||||||
# Če clip vsebuje segment kjer je >70% besedila ponovljen token,
|
|
||||||
# skrajšaj clip tik pred tem segmentom (preprečimo nesmiselno podaljšanje)
|
|
||||||
corrected_segs = claude_result.get("corrected_segments") or transcript["segments"]
|
|
||||||
for seg in corrected_segs:
|
|
||||||
seg_start = float(seg.get("start", 0))
|
|
||||||
seg_end = float(seg.get("end", 0))
|
|
||||||
seg_text = str(seg.get("text", "")).lower().strip()
|
|
||||||
# Samo segmenti znotraj clip range
|
|
||||||
if seg_start < clip_range["start"] or seg_end > clip_range["end"]:
|
|
||||||
continue
|
|
||||||
# Filler detection: ponavljajoče besede
|
|
||||||
words = seg_text.split()
|
|
||||||
if len(words) >= 4:
|
|
||||||
unique_ratio = len(set(words)) / len(words)
|
|
||||||
# Če je <30% unique besed = repetitive filler
|
|
||||||
if unique_ratio < 0.3:
|
|
||||||
# Skrajšaj clip do začetka tega segmenta
|
|
||||||
if seg_start - clip_range["start"] >= args.min_duration:
|
|
||||||
print(f" ✂️ Filler detected at {seg_start:.1f}s "
|
|
||||||
f"('{seg_text[:40]}', unique={unique_ratio:.0%}), "
|
|
||||||
f"trimming clip", file=sys.stderr)
|
|
||||||
clip_range["end"] = round(seg_start, 2)
|
|
||||||
clip_range["duration"] = round(seg_start - clip_range["start"], 2)
|
|
||||||
clip_range["reason"] += f" (trimmed at filler @ {seg_start:.1f}s)"
|
|
||||||
break
|
|
||||||
else:
|
else:
|
||||||
clip_range = smart_clip_range(
|
clip_range = smart_clip_range(
|
||||||
chorus, transcript, duration,
|
chorus, transcript, duration,
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user