275 lines
8.9 KiB
TypeScript
275 lines
8.9 KiB
TypeScript
import OpenAI, { toFile } from "openai";
|
|
import { Buffer } from "node:buffer";
|
|
import { spawn } from "child_process";
|
|
import { writeFile, unlink, readFile } from "fs/promises";
|
|
import { randomUUID } from "crypto";
|
|
import { tmpdir } from "os";
|
|
import { join } from "path";
|
|
|
|
/**
 * Shared OpenAI SDK client used by every helper in this module.
 * Both the API key and the base URL are read from the
 * AI_INTEGRATIONS_OPENAI_* environment variables at module load time.
 */
export const openai = new OpenAI({
  baseURL: process.env.AI_INTEGRATIONS_OPENAI_BASE_URL,
  apiKey: process.env.AI_INTEGRATIONS_OPENAI_API_KEY,
});
|
|
|
|
export type AudioFormat = "wav" | "mp3" | "webm" | "mp4" | "ogg" | "unknown";

/**
 * Detect the audio container format from the buffer's leading magic bytes.
 *
 * Recognized signatures:
 * - WAV:  "RIFF" at offset 0 and "WAVE" at offset 8
 * - WebM: EBML header 1A 45 DF A3
 * - MP3:  "ID3" tag, or an MPEG-1/MPEG-2 Layer III frame sync (FF FB/FA/F3/F2)
 * - MP4/M4A/MOV: "ftyp" at offset 4 (Safari/iOS recordings)
 * - OGG:  "OggS"
 *
 * @param buffer - Raw bytes of the uploaded/recorded media.
 * @returns The detected format, or "unknown" when the buffer is shorter than
 *          12 bytes or no signature matches.
 */
export function detectAudioFormat(buffer: Buffer): AudioFormat {
  // 12 bytes are needed to read the RIFF form type at offsets 8..11.
  if (buffer.length < 12) return "unknown";

  // latin1 maps every byte to exactly one character, so tag comparisons are byte-exact.
  const tag = (start: number, end: number): string => buffer.toString("latin1", start, end);

  // WAV: RIFF....WAVE. "RIFF" alone is not enough: AVI, WebP and other RIFF
  // containers share the prefix and must not be reported as WAV.
  if (tag(0, 4) === "RIFF" && tag(8, 12) === "WAVE") return "wav";

  // WebM: EBML header
  if (buffer.readUInt32BE(0) === 0x1a45dfa3) return "webm";

  // MP3: ID3 tag ...
  if (tag(0, 3) === "ID3") return "mp3";

  // ... or Layer III frame sync. 0xFB/0xFA are MPEG-1 without/with CRC,
  // 0xF3/0xF2 are MPEG-2 without/with CRC.
  const second = buffer.readUInt8(1);
  if (
    buffer.readUInt8(0) === 0xff &&
    (second === 0xfb || second === 0xfa || second === 0xf3 || second === 0xf2)
  ) {
    return "mp3";
  }

  // MP4/M4A/MOV: ....ftyp (Safari/iOS records in these containers)
  if (tag(4, 8) === "ftyp") return "mp4";

  // OGG: OggS
  if (tag(0, 4) === "OggS") return "ogg";

  return "unknown";
}
|
|
|
|
/**
 * Convert any audio/video format to WAV using ffmpeg.
 *
 * Uses temp files instead of pipes because video containers (MP4/MOV)
 * require seeking to find the audio track. The output is 16 kHz, mono,
 * signed 16-bit little-endian PCM.
 *
 * @param audioBuffer - Raw bytes of the source media in any format ffmpeg can probe.
 * @returns The converted WAV file contents.
 * @throws Error when ffmpeg cannot be spawned, is terminated by a signal, or
 *         exits with a non-zero code. The message includes the tail of
 *         ffmpeg's stderr when there is any.
 */
export async function convertToWav(audioBuffer: Buffer): Promise<Buffer> {
  // Keep only the end of stderr: ffmpeg prints its banner first and the actual error last.
  const maxStderrChars = 2000;
  const inputPath = join(tmpdir(), `input-${randomUUID()}`);
  const outputPath = join(tmpdir(), `output-${randomUUID()}.wav`);

  try {
    // Write input to temp file (required for video containers that need seeking)
    await writeFile(inputPath, audioBuffer);

    // Run ffmpeg with file paths
    await new Promise<void>((resolve, reject) => {
      const ffmpeg = spawn(
        "ffmpeg",
        [
          "-i", inputPath,
          "-vn", // Extract audio only (ignore video track)
          "-f", "wav",
          "-ar", "16000", // 16kHz sample rate (good for speech)
          "-ac", "1", // Mono
          "-acodec", "pcm_s16le",
          "-y", // Overwrite output
          outputPath,
        ],
        // stdin/stdout are never used; ignoring them avoids leaving open, unread pipes.
        { stdio: ["ignore", "ignore", "pipe"] },
      );

      let stderrTail = "";
      ffmpeg.stderr.on("data", (chunk: Buffer) => {
        stderrTail = (stderrTail + chunk.toString("utf8")).slice(-maxStderrChars);
      });

      // Spawn failures (e.g. ffmpeg not installed) surface here.
      ffmpeg.on("error", reject);

      ffmpeg.on("close", (code, signal) => {
        if (code === 0) {
          resolve();
          return;
        }
        // code is null when the process was terminated by a signal.
        const reason =
          code === null ? `was terminated by signal ${signal}` : `exited with code ${code}`;
        const detail = stderrTail.trim();
        reject(new Error(detail ? `ffmpeg ${reason}: ${detail}` : `ffmpeg ${reason}`));
      });
    });

    // Read converted audio
    return await readFile(outputPath);
  } finally {
    // Clean up temp files; a missing file (e.g. ffmpeg never wrote the output) is not an error.
    await Promise.all([
      unlink(inputPath).catch(() => {}),
      unlink(outputPath).catch(() => {}),
    ]);
  }
}
|
|
|
|
/**
 * Sniff the incoming audio and make sure it is in a format OpenAI accepts.
 *
 * - WAV and MP3 are returned untouched together with their detected format.
 * - Everything else (WebM, MP4, OGG, or an unrecognized signature) is
 *   transcoded to WAV through ffmpeg.
 *
 * @param audioBuffer - Raw bytes of the recorded/uploaded audio.
 * @returns The (possibly converted) bytes and the format they are in.
 */
export async function ensureCompatibleFormat(
  audioBuffer: Buffer
): Promise<{ buffer: Buffer; format: "wav" | "mp3" }> {
  const sniffed = detectAudioFormat(audioBuffer);

  switch (sniffed) {
    case "wav":
    case "mp3":
      // Already compatible: pass through as-is.
      return { buffer: audioBuffer, format: sniffed };
    default:
      return { buffer: await convertToWav(audioBuffer), format: "wav" };
  }
}
|
|
|
|
/**
 * Voice Chat (audio in, audio out): sends the user's recorded speech to the
 * gpt-audio model and returns the spoken reply plus its transcript.
 *
 * Note: browsers record WebM/opus - convert to WAV using ffmpeg before calling this.
 *
 * @param audioBuffer - The user's audio, already in `inputFormat`.
 * @param voice - Voice used for the spoken reply.
 * @param inputFormat - Format of `audioBuffer`.
 * @param outputFormat - Format requested for the reply audio.
 * @returns The reply transcript (audio transcript, else text content, else "")
 *          and the reply audio decoded from base64 (empty when none was returned).
 */
export async function voiceChat(
  audioBuffer: Buffer,
  voice: "alloy" | "echo" | "fable" | "onyx" | "nova" | "shimmer" = "alloy",
  inputFormat: "wav" | "mp3" = "wav",
  outputFormat: "wav" | "mp3" = "mp3"
): Promise<{ transcript: string; audioResponse: Buffer }> {
  const encodedInput = audioBuffer.toString("base64");

  const completion = await openai.chat.completions.create({
    model: "gpt-audio",
    modalities: ["text", "audio"],
    audio: { voice, format: outputFormat },
    messages: [
      {
        role: "user",
        content: [
          {
            type: "input_audio",
            input_audio: { data: encodedInput, format: inputFormat },
          },
        ],
      },
    ],
  });

  // The audio fields are read through `any`, as in the rest of this module.
  const reply = completion.choices[0]?.message as any;
  const spoken = reply?.audio;

  return {
    transcript: spoken?.transcript || reply?.content || "",
    audioResponse: Buffer.from(spoken?.data ?? "", "base64"),
  };
}
|
|
|
|
/**
 * Streaming Voice Chat: For real-time audio responses.
 * Note: Streaming only supports pcm16 output format.
 *
 * Each streamed delta may produce a "transcript" chunk, an "audio" chunk, or
 * both (transcript first). Audio `data` is forwarded exactly as received from
 * the API (presumably base64-encoded pcm16 - confirm against the API docs).
 *
 * @param audioBuffer - The user's audio, already in `inputFormat`.
 * @param voice - Voice used for the spoken reply.
 * @param inputFormat - Format of `audioBuffer`.
 * @returns An async iterable of transcript/audio chunks in arrival order.
 *
 * @example
 * // Converting browser WebM to WAV before calling:
 * const webmBuffer = Buffer.from(req.body.audio, "base64");
 * const wavBuffer = await convertToWav(webmBuffer);
 * for await (const chunk of voiceChatStream(wavBuffer)) { ... }
 */
export async function voiceChatStream(
  audioBuffer: Buffer,
  voice: "alloy" | "echo" | "fable" | "onyx" | "nova" | "shimmer" = "alloy",
  inputFormat: "wav" | "mp3" = "wav"
): Promise<AsyncIterable<{ type: "transcript" | "audio"; data: string }>> {
  const audioBase64 = audioBuffer.toString("base64");
  const stream = await openai.chat.completions.create({
    model: "gpt-audio",
    modalities: ["text", "audio"],
    audio: { voice, format: "pcm16" },
    messages: [{
      role: "user",
      content: [
        { type: "input_audio", input_audio: { data: audioBase64, format: inputFormat } },
      ],
    }],
    stream: true,
  });

  // The explicit generator type keeps `type` as the literal union; without it the
  // yielded object literals widen to `{ type: string }` and no longer match the
  // declared return type.
  return (async function* (): AsyncGenerator<{ type: "transcript" | "audio"; data: string }> {
    for await (const chunk of stream) {
      // The audio fields are read through `any`, as in the rest of this module.
      const delta = chunk.choices?.[0]?.delta as any;
      if (!delta) continue;
      if (delta?.audio?.transcript) {
        yield { type: "transcript", data: delta.audio.transcript };
      }
      if (delta?.audio?.data) {
        yield { type: "audio", data: delta.audio.data };
      }
    }
  })();
}
|
|
|
|
/**
 * Text-to-Speech: asks the gpt-audio model to speak the given text verbatim
 * and returns the resulting audio.
 *
 * @param text - Text to be spoken.
 * @param voice - Voice used for the generated speech.
 * @param format - Audio format requested from the model.
 * @returns The audio decoded from base64; an empty buffer when the response
 *          carries no audio data.
 */
export async function textToSpeech(
  text: string,
  voice: "alloy" | "echo" | "fable" | "onyx" | "nova" | "shimmer" = "alloy",
  format: "wav" | "mp3" | "flac" | "opus" | "pcm16" = "wav"
): Promise<Buffer> {
  const completion = await openai.chat.completions.create({
    model: "gpt-audio",
    modalities: ["text", "audio"],
    audio: { voice, format },
    messages: [
      {
        role: "system",
        content: "You are an assistant that performs text-to-speech.",
      },
      {
        role: "user",
        content: `Repeat the following text verbatim: ${text}`,
      },
    ],
  });

  // The audio payload is read through `any`, as in the rest of this module.
  const reply = completion.choices[0]?.message as any;
  const encodedAudio = reply?.audio?.data ?? "";
  return Buffer.from(encodedAudio, "base64");
}
|
|
|
|
/**
 * Streaming Text-to-Speech: asks the gpt-audio model to speak the given text
 * verbatim and yields the audio as it arrives.
 * Note: Streaming only supports pcm16 output format.
 *
 * @param text - Text to be spoken.
 * @param voice - Voice used for the generated speech.
 * @returns An async iterable of audio data chunks, forwarded exactly as
 *          received; deltas without audio data are skipped.
 */
export async function textToSpeechStream(
  text: string,
  voice: "alloy" | "echo" | "fable" | "onyx" | "nova" | "shimmer" = "alloy"
): Promise<AsyncIterable<string>> {
  const completionStream = await openai.chat.completions.create({
    model: "gpt-audio",
    modalities: ["text", "audio"],
    audio: { voice, format: "pcm16" },
    stream: true,
    messages: [
      {
        role: "system",
        content: "You are an assistant that performs text-to-speech.",
      },
      {
        role: "user",
        content: `Repeat the following text verbatim: ${text}`,
      },
    ],
  });

  async function* audioChunks() {
    for await (const part of completionStream) {
      // The audio payload is read through `any`, as in the rest of this module.
      const delta = part.choices?.[0]?.delta as any;
      if (delta && delta?.audio?.data) {
        yield delta.audio.data;
      }
    }
  }

  return audioChunks();
}
|
|
|
|
/**
 * Speech-to-Text: transcribes audio with the dedicated
 * gpt-4o-mini-transcribe model.
 *
 * @param audioBuffer - Audio bytes to transcribe.
 * @param format - Format of `audioBuffer`; used as the extension of the
 *                 uploaded file name (`audio.<format>`).
 * @returns The transcribed text.
 */
export async function speechToText(
  audioBuffer: Buffer,
  format: "wav" | "mp3" | "webm" = "wav"
): Promise<string> {
  const upload = await toFile(audioBuffer, `audio.${format}`);

  const transcription = await openai.audio.transcriptions.create({
    model: "gpt-4o-mini-transcribe",
    file: upload,
  });

  return transcription.text;
}
|
|
|
|
/**
 * Streaming Speech-to-Text: transcribes audio with gpt-4o-mini-transcribe and
 * yields the text incrementally.
 *
 * @param audioBuffer - Audio bytes to transcribe.
 * @param format - Format of `audioBuffer`; used as the extension of the
 *                 uploaded file name (`audio.<format>`).
 * @returns An async iterable of text deltas; only "transcript.text.delta"
 *          events are forwarded, every other event type is dropped.
 */
export async function speechToTextStream(
  audioBuffer: Buffer,
  format: "wav" | "mp3" | "webm" = "wav"
): Promise<AsyncIterable<string>> {
  const upload = await toFile(audioBuffer, `audio.${format}`);

  const events = await openai.audio.transcriptions.create({
    model: "gpt-4o-mini-transcribe",
    stream: true,
    file: upload,
  });

  async function* textDeltas() {
    for await (const evt of events) {
      if (evt.type !== "transcript.text.delta") continue;
      yield evt.delta;
    }
  }

  return textDeltas();
}
|