reels-app/scripts/acr_recognize.py
Sebastjan Artič 1cc8e8be35 MXF/MPG broadcast format support: handle multichannel audio properly
Problem: MXF and MPG files (TV broadcast formats) often contain:
- Multiple audio streams (4-8 streams for different language tracks)
- Multichannel layouts (5.1, 7.1) instead of stereo
- Default ffmpeg behavior was -c:a aac without channel limit, which
  meant multichannel got transcoded as multichannel AAC, overwriting
  what should have been clean stereo

Solution:

1. get_audio_streams() helper probes all audio streams with ffprobe
   - Returns codec, channels, sample_rate, language, layout for each

2. build_audio_args() picks best stream + downmix:
   - Prefers first 2-channel stereo stream (usually main mix)
   - Falls back to first stream if none are 2-ch
   - Always: -ac 2 (force stereo downmix), -ar 48000, -c:a aac, -b:a 192k
   - Bitrate raised from 128k to 192k for music quality

3. Smart trim path now detects broadcast formats:
   - .mxf, .mpg, .mpeg, .ts, .m2ts, .mts → transcode (not stream copy)
   - Standard MP4/MOV → stream copy (faster, lossless)

4. Pre-conversion step for broadcast files without trim:
   - Even without --start/--duration, MXF/MPG get converted to MP4
   - Same audio handling as trim path

5. Main render adds explicit -map 0:v:0 -map 0:a:0? -ac 2 to ensure
   only first video and first audio stream get encoded, with stereo

6. ACR recognize also gets -map 0:a:0 -ac 2 for MXF compatibility

7. UI accepts: video/*,.mxf,.mpg,.mpeg,.ts,.m2ts,.mts

8. Upload limit raised: 2GB → 10GB (MXF files are large)

This means a TV broadcast MXF with [SLO/EN/DE language tracks] now
correctly outputs stereo MP4 with the main language track preserved.
2026-04-29 14:38:48 +00:00

203 lines
6.7 KiB
Python
Raw Permalink Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
acr_recognize.py — Audio recognition prek ACRCloud Audio Fingerprinting API.
Uporabi native binary za fingerprinting (3KB sample namesto cel audio).
Vrne (artist, title) ali (None, None) če pesem ni prepoznana.
Credentials se preberejo iz env vars (ACR_ACCESS_KEY, ACR_SECRET_KEY, ACR_HOST).
"""
import os
import sys
import json
import hmac
import base64
import time
import subprocess
import tempfile
import urllib.request
import urllib.parse
import uuid
def _sign(string_to_sign, secret):
    """Return the base64-encoded HMAC-SHA1 signature of *string_to_sign*.

    Both arguments are text and are encoded as ASCII before signing, so a
    non-ASCII character in either one raises ``UnicodeEncodeError``.
    The result is returned as an ASCII ``str``.
    """
    key_bytes = secret.encode('ascii')
    message_bytes = string_to_sign.encode('ascii')
    mac = hmac.new(key_bytes, message_bytes, digestmod='sha1')
    encoded = base64.b64encode(mac.digest())
    return encoded.decode('ascii')
def _build_multipart(fields, files):
    """Assemble a multipart/form-data request body.

    fields: mapping of form-field name -> value; each value is rendered
        with f-string formatting, so non-string values are stringified.
    files: mapping of form-field name -> (filename, content, content_type),
        where content is bytes and is embedded verbatim.

    Returns (body, boundary): the encoded body as bytes and the randomly
    generated boundary string to place in the Content-Type header.
    """
    boundary = uuid.uuid4().hex
    delimiter = f"--{boundary}\r\n"

    # Plain text fields come first, in mapping order.
    chunks = [
        f"{delimiter}Content-Disposition: form-data; name=\"{name}\"\r\n\r\n{value}\r\n".encode()
        for name, value in fields.items()
    ]

    # File parts carry their own Content-Type and raw bytes payload.
    for name, (filename, payload, content_type) in files.items():
        header = (
            f"{delimiter}Content-Disposition: form-data; name=\"{name}\"; filename=\"{filename}\"\r\n"
            f"Content-Type: {content_type}\r\n\r\n"
        )
        chunks.append(header.encode() + payload + b"\r\n")

    # Closing delimiter terminates the body.
    chunks.append(f"--{boundary}--\r\n".encode())
    return b"".join(chunks), boundary
def recognize_audio_file(audio_path, timeout=30):
    """Send an audio file (or its fingerprint) to ACRCloud and return the raw response.

    Credentials and host are read from the environment variables
    ACR_ACCESS_KEY, ACR_SECRET_KEY and ACR_HOST (the host has a default).

    audio_path: path to the audio file to identify.
    timeout: seconds passed to ``urllib.request.urlopen``.

    Returns the decoded JSON response as a dict, or None on any failure
    (missing credentials, missing or unreadable file, HTTP or parse error).
    Failures are reported on stderr and never raised.
    """
    host = os.environ.get("ACR_HOST", "identify-eu-west-1.acrcloud.com")
    access_key = os.environ.get("ACR_ACCESS_KEY")
    secret_key = os.environ.get("ACR_SECRET_KEY")
    if not access_key or not secret_key:
        print("⚠️ ACR_ACCESS_KEY/SECRET_KEY nista nastavljena", file=sys.stderr)
        return None
    if not os.path.exists(audio_path):
        print(f"⚠️ Audio file ne obstaja: {audio_path}", file=sys.stderr)
        return None

    # Try the native fingerprint extractor first (smaller payload).
    # This is deliberately best-effort: the optional package may be absent
    # or its binary may fail, in which case the raw audio is uploaded.
    # NOTE(review): arguments (0, 30, 0) presumably mean start/length/flag —
    # confirm against the ACRCloud SDK.
    fingerprint = None
    try:
        from acrcloud import acrcloud_extr_tool as acr
        fingerprint = acr.create_fingerprint_by_file(audio_path, 0, 30, 0)
    except Exception as e:
        print(f" Fingerprint binary ni na voljo ({e}), uporabljam audio direktno", file=sys.stderr)

    timestamp = str(int(time.time()))
    http_method = "POST"
    http_uri = "/v1/identify"
    signature_version = "1"

    if fingerprint:
        data_type = "fingerprint"
        sample_data = fingerprint
        sample_name = "sample.fp"
        sample_ctype = "application/octet-stream"
    else:
        data_type = "audio"
        try:
            with open(audio_path, "rb") as f:
                sample_data = f.read()
        except OSError as e:
            # The path may exist yet be unreadable (a directory, permission
            # denied, removed since the check above); honour the "None on
            # error" contract instead of crashing the caller.
            print(f"⚠️ Audio file napaka: {e}", file=sys.stderr)
            return None
        sample_name = "sample.mp3"
        sample_ctype = "audio/mpeg"

    # The signature covers these six values joined by newlines, in this order.
    string_to_sign = f"{http_method}\n{http_uri}\n{access_key}\n{data_type}\n{signature_version}\n{timestamp}"
    signature = _sign(string_to_sign, secret_key)

    fields = {
        "access_key": access_key,
        "sample_bytes": str(len(sample_data)),
        "timestamp": timestamp,
        "signature": signature,
        "data_type": data_type,
        "signature_version": signature_version,
    }
    files = {"sample": (sample_name, sample_data, sample_ctype)}
    body, boundary = _build_multipart(fields, files)

    # Reuse http_uri/http_method so the request always matches what was signed.
    url = f"https://{host}{http_uri}"
    req = urllib.request.Request(
        url, data=body,
        headers={"Content-Type": f"multipart/form-data; boundary={boundary}"},
        method=http_method,
    )
    try:
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            return json.loads(resp.read().decode())
    except Exception as e:
        # Boundary with the network: any transport or decode error means
        # "no answer" for the caller.
        print(f"⚠️ ACR API napaka: {e}", file=sys.stderr)
        return None
def extract_short_audio(video_path, duration=20, start_offset=15):
    """Extract a short stereo MP3 clip from a video for ACR fingerprinting.

    video_path: input media file handed to ffmpeg.
    duration: clip length in seconds (ffmpeg ``-t``).
    start_offset: position in seconds where the clip starts (ffmpeg ``-ss``);
        the default skips the first 15 s of the video.

    Only the first audio stream is used and it is downmixed to stereo, which
    keeps multi-stream / multichannel inputs such as MXF safe.

    Returns the path of a temporary MP3 file that the caller must delete, or
    None if ffmpeg could not be started or exited with an error. In either
    failure case the temporary file is removed and the error goes to stderr.
    """
    tmp_fd, tmp_path = tempfile.mkstemp(suffix=".mp3")
    os.close(tmp_fd)
    cmd = [
        # Global options first, as recommended by the ffmpeg documentation.
        "ffmpeg", "-y", "-loglevel", "error",
        "-i", video_path,
        "-ss", str(start_offset),
        "-t", str(duration),
        "-map", "0:a:0",  # first audio stream only (safe for multi-stream MXF)
        "-vn",
        "-ac", "2",       # downmix to stereo (if multichannel)
        "-ar", "44100",
        "-b:a", "128k",
        "-f", "mp3", tmp_path,
    ]
    try:
        # errors="replace" keeps non-UTF-8 bytes in ffmpeg's stderr from
        # raising while the output is decoded.
        proc = subprocess.run(cmd, capture_output=True, text=True, errors="replace")
    except OSError as e:
        # ffmpeg missing or not executable: report it like any ffmpeg failure.
        failure = str(e)
    else:
        failure = None if proc.returncode == 0 else proc.stderr[:200]

    if failure is None:
        return tmp_path

    # Never leave the placeholder temp file behind on failure.
    try:
        os.unlink(tmp_path)
    except OSError:
        pass
    print(f"⚠️ ffmpeg napaka: {failure}", file=sys.stderr)
    return None
def _first_music_match(result):
    """Return ``(artist, title)`` from the first music entry of an ACR response.

    Missing or JSON-null fields are treated as empty, so either string may be
    empty but is never None. Returns None when the response has no music
    entries at all.
    """
    music = (result.get("metadata") or {}).get("music") or []
    if not music:
        return None
    first = music[0] or {}
    title = (first.get("title") or "").strip()
    artists = first.get("artists") or []
    lead_artist = (artists[0] or {}) if artists else {}
    artist = (lead_artist.get("name") or "").strip()
    return artist, title


def recognize_video(video_path):
    """Recognise the song in a video and return ``(artist, title)``.

    Two 20-second samples are tried, starting at 15 s and at 60 s, so that an
    intro or instrumental passage at one position does not prevent a match.
    The first sample that yields both a non-empty artist and title wins.

    Returns ``(None, None)`` when ACR_ACCESS_KEY is not set or when neither
    sample is recognised. Progress and failures are logged to stderr.
    """
    if not os.environ.get("ACR_ACCESS_KEY"):
        return (None, None)

    for start_offset in [15, 60]:
        audio = extract_short_audio(video_path, duration=20, start_offset=start_offset)
        if not audio:
            continue
        try:
            result = recognize_audio_file(audio)
        finally:
            # The extracted clip is temporary; remove it whatever happened.
            try:
                os.unlink(audio)
            except OSError:
                pass
        if not result:
            continue

        # `or {}` also covers a JSON null, which .get(key, {}) would pass through.
        status_info = result.get("status") or {}
        status = status_info.get("code")
        if status != 0:
            # 1001 = no result, anything else = error (per the original note).
            msg = status_info.get("msg", "")
            print(f" ACR @{start_offset}s: status={status} ({msg})", file=sys.stderr)
            continue

        # Success: take the first music match, if any.
        match = _first_music_match(result)
        if not match:
            continue
        artist, title = match
        if artist and title:
            print(f" ✅ ACR @{start_offset}s prepoznal: {artist} - {title}", file=sys.stderr)
            return (artist, title)

    print(" ⚠️ ACR pesem ni prepoznana", file=sys.stderr)
    return (None, None)
if __name__ == "__main__":
    # CLI entry point: takes exactly one argument, the video path.
    # Prints a JSON object with "artist" and "title" on stdout and exits
    # with 0 when both were recognised, 1 otherwise (including bad usage).
    if len(sys.argv) != 2:
        print("Uporaba: python3 acr_recognize.py <video.mp4>")
        sys.exit(1)

    found_artist, found_title = recognize_video(sys.argv[1])
    recognized = bool(found_artist and found_title)
    if recognized:
        payload = {"artist": found_artist, "title": found_title}
    else:
        # A partial result is reported as "nothing found".
        payload = {"artist": None, "title": None}
    print(json.dumps(payload, ensure_ascii=False))
    sys.exit(0 if recognized else 1)