mirror of
https://github.com/openclaw/openclaw.git
synced 2026-04-28 20:46:57 +02:00
fix(google): emit opus voice-note tts
This commit is contained in:
@@ -59,6 +59,8 @@ Docs: https://docs.openclaw.ai
|
||||
|
||||
### Fixes
|
||||
|
||||
- Providers/Google: transcode Gemini TTS PCM to Opus for voice-note targets so
|
||||
WhatsApp and other native voice-note replies can play as voice messages.
|
||||
- iOS/macOS Talk Mode: allow `talk.speechLocale` to set the speech
|
||||
recognition locale for non-English voice conversations. Fixes #44688.
|
||||
- Plugins/providers: honor explicit plugin candidate lists instead of reading a
|
||||
|
||||
@@ -1,2 +1,2 @@
|
||||
f813474b1623f06e1465daacd56db970e8e92ab1be122faee0fa2a1dc2d4fc43 plugin-sdk-api-baseline.json
|
||||
b3ea88c0c9b4cf6d9a46f0d34149063303853e78ef9708224608e4da79b23190 plugin-sdk-api-baseline.jsonl
|
||||
c911117176b41eebf26470618274a7e093910e9b36855bc045bc8a92f6856745 plugin-sdk-api-baseline.json
|
||||
ff360635f95beb217b9dd207a87eaf331319a7671aea03acfe05911756741b21 plugin-sdk-api-baseline.jsonl
|
||||
|
||||
@@ -252,8 +252,8 @@ The bundled `google` speech provider uses the Gemini API TTS path with
|
||||
|
||||
- Default voice: `Kore`
|
||||
- Auth: `messages.tts.providers.google.apiKey`, `models.providers.google.apiKey`, `GEMINI_API_KEY`, or `GOOGLE_API_KEY`
|
||||
- Output: WAV for regular TTS attachments, PCM for Talk/telephony
|
||||
- Native voice-note output: not supported on this Gemini API path because the API returns PCM rather than Opus
|
||||
- Output: WAV for regular TTS attachments, Opus for voice-note targets, PCM for Talk/telephony
|
||||
- Voice-note output: Google PCM is wrapped as WAV and transcoded to 48 kHz Opus with `ffmpeg`
|
||||
|
||||
To use Google as the default TTS provider:
|
||||
|
||||
|
||||
@@ -584,7 +584,7 @@ These override `messages.tts.*` for that host.
|
||||
- **Local CLI**: uses the configured `outputFormat`. Voice-note targets are
|
||||
converted to Ogg/Opus and telephony output is converted to raw 16 kHz mono PCM
|
||||
with `ffmpeg`.
|
||||
- **Google Gemini**: Gemini API TTS returns raw 24kHz PCM. OpenClaw wraps it as WAV for audio attachments and returns PCM directly for Talk/telephony. Native Opus voice-note format is not supported by this path.
|
||||
- **Google Gemini**: Gemini API TTS returns raw 24 kHz PCM. OpenClaw wraps it as WAV for audio attachments, transcodes it to 48 kHz Opus for voice-note targets, and returns PCM directly for Talk/telephony.
|
||||
- **Gradium**: WAV for audio attachments, Opus for voice-note targets, and `ulaw_8000` at 8 kHz for telephony.
|
||||
- **xAI**: MP3 by default; `responseFormat` may be `mp3`, `wav`, `pcm`, `mulaw`, or `alaw`. OpenClaw uses xAI's batch REST TTS endpoint and returns a complete audio attachment; xAI's streaming TTS WebSocket is not used by this provider path. Native Opus voice-note format is not supported by this path.
|
||||
- **Microsoft**: uses `microsoft.outputFormat` (default `audio-24khz-48kbitrate-mono-mp3`).
|
||||
|
||||
@@ -38,6 +38,24 @@ describeLive("google plugin live", () => {
|
||||
expect(audioFile.audioBuffer.byteLength).toBeGreaterThan(512);
|
||||
}, 120_000);
|
||||
|
||||
it("transcodes speech to Opus for voice-note targets", async () => {
|
||||
const { speechProviders } = await registerGooglePlugin();
|
||||
const provider = requireRegisteredProvider(speechProviders, "google");
|
||||
|
||||
const audioFile = await provider.synthesize({
|
||||
text: "OpenClaw Google voice note integration test OK.",
|
||||
cfg: { plugins: { enabled: true } } as never,
|
||||
providerConfig: { apiKey: GOOGLE_API_KEY },
|
||||
target: "voice-note",
|
||||
timeoutMs: 90_000,
|
||||
});
|
||||
|
||||
expect(audioFile.outputFormat).toBe("opus");
|
||||
expect(audioFile.fileExtension).toBe(".opus");
|
||||
expect(audioFile.voiceCompatible).toBe(true);
|
||||
expect(audioFile.audioBuffer.byteLength).toBeGreaterThan(128);
|
||||
}, 120_000);
|
||||
|
||||
it("transcribes synthesized speech through the media provider", async () => {
|
||||
const { mediaProviders, speechProviders } = await registerGooglePlugin();
|
||||
const speechProvider = requireRegisteredProvider(speechProviders, "google");
|
||||
|
||||
@@ -1,5 +1,12 @@
|
||||
import * as providerHttp from "openclaw/plugin-sdk/provider-http";
|
||||
import { afterEach, describe, expect, it, vi } from "vitest";
|
||||
|
||||
const transcodeAudioBufferToOpusMock = vi.hoisted(() => vi.fn());
|
||||
|
||||
vi.mock("openclaw/plugin-sdk/media-runtime", () => ({
|
||||
transcodeAudioBufferToOpus: transcodeAudioBufferToOpusMock,
|
||||
}));
|
||||
|
||||
import { buildGoogleSpeechProvider, __testing } from "./speech-provider.js";
|
||||
|
||||
function installGoogleTtsFetchMock(pcm = Buffer.from([1, 0, 2, 0])) {
|
||||
@@ -31,6 +38,7 @@ describe("Google speech provider", () => {
|
||||
vi.restoreAllMocks();
|
||||
vi.unstubAllGlobals();
|
||||
vi.unstubAllEnvs();
|
||||
transcodeAudioBufferToOpusMock.mockReset();
|
||||
});
|
||||
|
||||
it("synthesizes Gemini PCM as WAV and preserves audio tags in the request text", async () => {
|
||||
@@ -82,6 +90,39 @@ describe("Google speech provider", () => {
|
||||
expect(result.audioBuffer.subarray(8, 12).toString("ascii")).toBe("WAVE");
|
||||
expect(result.audioBuffer.readUInt32LE(24)).toBe(__testing.GOOGLE_TTS_SAMPLE_RATE);
|
||||
expect(result.audioBuffer.subarray(44)).toEqual(Buffer.from([1, 0, 2, 0]));
|
||||
expect(transcodeAudioBufferToOpusMock).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it("transcodes Gemini PCM to Opus for voice-note targets", async () => {
|
||||
installGoogleTtsFetchMock(Buffer.from([5, 0, 6, 0]));
|
||||
transcodeAudioBufferToOpusMock.mockResolvedValueOnce(Buffer.from("google-opus"));
|
||||
const provider = buildGoogleSpeechProvider();
|
||||
|
||||
const result = await provider.synthesize({
|
||||
text: "Send this as a voice note.",
|
||||
cfg: {},
|
||||
providerConfig: {
|
||||
apiKey: "google-test-key",
|
||||
},
|
||||
target: "voice-note",
|
||||
timeoutMs: 12_000,
|
||||
});
|
||||
|
||||
expect(result).toEqual({
|
||||
audioBuffer: Buffer.from("google-opus"),
|
||||
outputFormat: "opus",
|
||||
fileExtension: ".opus",
|
||||
voiceCompatible: true,
|
||||
});
|
||||
expect(transcodeAudioBufferToOpusMock).toHaveBeenCalledWith({
|
||||
audioBuffer: expect.any(Buffer),
|
||||
inputExtension: "wav",
|
||||
tempPrefix: "tts-google-",
|
||||
timeoutMs: 12_000,
|
||||
});
|
||||
const [{ audioBuffer }] = transcodeAudioBufferToOpusMock.mock.calls[0];
|
||||
expect(audioBuffer.subarray(0, 4).toString("ascii")).toBe("RIFF");
|
||||
expect(audioBuffer.subarray(8, 12).toString("ascii")).toBe("WAVE");
|
||||
});
|
||||
|
||||
it("falls back to GEMINI_API_KEY and configured Google API base URL", async () => {
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
import { transcodeAudioBufferToOpus } from "openclaw/plugin-sdk/media-runtime";
|
||||
import {
|
||||
assertOkOrThrowProviderError,
|
||||
postJsonRequest,
|
||||
@@ -394,6 +395,19 @@ export function buildGoogleSpeechProvider(): SpeechProviderPlugin {
|
||||
speakerName: overrides.speakerName ?? config.speakerName,
|
||||
timeoutMs: req.timeoutMs,
|
||||
});
|
||||
if (req.target === "voice-note") {
|
||||
return {
|
||||
audioBuffer: await transcodeAudioBufferToOpus({
|
||||
audioBuffer: wrapPcm16MonoToWav(pcm),
|
||||
inputExtension: "wav",
|
||||
tempPrefix: "tts-google-",
|
||||
timeoutMs: req.timeoutMs,
|
||||
}),
|
||||
outputFormat: "opus",
|
||||
fileExtension: ".opus",
|
||||
voiceCompatible: true,
|
||||
};
|
||||
}
|
||||
return {
|
||||
audioBuffer: wrapPcm16MonoToWav(pcm),
|
||||
outputFormat: "wav",
|
||||
|
||||
@@ -3,10 +3,10 @@ import { tmpdir } from "node:os";
|
||||
import path from "node:path";
|
||||
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
|
||||
|
||||
const runFfmpegMock = vi.hoisted(() => vi.fn());
|
||||
const transcodeAudioBufferToOpusMock = vi.hoisted(() => vi.fn());
|
||||
|
||||
vi.mock("openclaw/plugin-sdk/media-runtime", () => ({
|
||||
runFfmpeg: runFfmpegMock,
|
||||
transcodeAudioBufferToOpus: transcodeAudioBufferToOpusMock,
|
||||
}));
|
||||
|
||||
import { buildMinimaxSpeechProvider } from "./speech-provider.js";
|
||||
@@ -293,7 +293,7 @@ describe("buildMinimaxSpeechProvider", () => {
|
||||
};
|
||||
clearMinimaxAuthEnv();
|
||||
vi.stubGlobal("fetch", vi.fn());
|
||||
runFfmpegMock.mockReset();
|
||||
transcodeAudioBufferToOpusMock.mockReset();
|
||||
});
|
||||
|
||||
afterEach(async () => {
|
||||
@@ -333,7 +333,7 @@ describe("buildMinimaxSpeechProvider", () => {
|
||||
expect(body.model).toBe("speech-2.8-hd");
|
||||
expect(body.text).toBe("Hello world");
|
||||
expect(body.voice_setting.voice_id).toBe("English_expressive_narrator");
|
||||
expect(runFfmpegMock).not.toHaveBeenCalled();
|
||||
expect(transcodeAudioBufferToOpusMock).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it("transcodes MiniMax MP3 to Opus for voice-note targets", async () => {
|
||||
@@ -345,15 +345,7 @@ describe("buildMinimaxSpeechProvider", () => {
|
||||
headers: { "Content-Type": "application/json" },
|
||||
}),
|
||||
);
|
||||
runFfmpegMock.mockImplementationOnce(async (args: string[]) => {
|
||||
const outputPath = args.at(-1);
|
||||
if (typeof outputPath !== "string") {
|
||||
throw new Error("missing ffmpeg output path");
|
||||
}
|
||||
await import("node:fs/promises").then((fs) =>
|
||||
fs.writeFile(outputPath, Buffer.from("fake-opus-data")),
|
||||
);
|
||||
});
|
||||
transcodeAudioBufferToOpusMock.mockResolvedValueOnce(Buffer.from("fake-opus-data"));
|
||||
|
||||
const result = await provider.synthesize({
|
||||
text: "Hello world",
|
||||
@@ -367,10 +359,12 @@ describe("buildMinimaxSpeechProvider", () => {
|
||||
expect(result.fileExtension).toBe(".opus");
|
||||
expect(result.voiceCompatible).toBe(true);
|
||||
expect(result.audioBuffer.toString()).toBe("fake-opus-data");
|
||||
expect(runFfmpegMock).toHaveBeenCalledWith(
|
||||
expect.arrayContaining(["-c:a", "libopus", "-ar", "48000"]),
|
||||
{ timeoutMs: 30000 },
|
||||
);
|
||||
expect(transcodeAudioBufferToOpusMock).toHaveBeenCalledWith({
|
||||
audioBuffer: Buffer.from("fake-mp3-data"),
|
||||
inputExtension: "mp3",
|
||||
tempPrefix: "tts-minimax-",
|
||||
timeoutMs: 30000,
|
||||
});
|
||||
});
|
||||
|
||||
it("applies overrides", async () => {
|
||||
|
||||
@@ -1,6 +1,4 @@
|
||||
import { mkdir, mkdtemp, readFile, rm, writeFile } from "node:fs/promises";
|
||||
import path from "node:path";
|
||||
import { runFfmpeg } from "openclaw/plugin-sdk/media-runtime";
|
||||
import { transcodeAudioBufferToOpus } from "openclaw/plugin-sdk/media-runtime";
|
||||
import {
|
||||
isProviderAuthProfileConfigured,
|
||||
type OpenClawConfig,
|
||||
@@ -14,7 +12,6 @@ import type {
|
||||
SpeechProviderPlugin,
|
||||
} from "openclaw/plugin-sdk/speech-core";
|
||||
import { asFiniteNumber, asObject, trimToUndefined } from "openclaw/plugin-sdk/speech-core";
|
||||
import { resolvePreferredOpenClawTmpDir } from "openclaw/plugin-sdk/temp-path";
|
||||
import {
|
||||
DEFAULT_MINIMAX_TTS_BASE_URL,
|
||||
MINIMAX_TTS_MODELS,
|
||||
@@ -209,41 +206,6 @@ function parseDirectiveToken(ctx: SpeechDirectiveTokenParseContext): {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Converts an MP3 buffer into 48 kHz mono Opus by running ffmpeg over temp files.
 *
 * The audio is staged in a fresh `tts-minimax-*` directory under the preferred
 * OpenClaw temp root; that directory is removed again whether or not ffmpeg succeeds.
 *
 * @param audioBuffer - MP3 bytes to transcode.
 * @param timeoutMs - Passed through to `runFfmpeg` as its `timeoutMs` option.
 * @returns The bytes ffmpeg wrote to the Opus output file.
 */
async function transcodeMp3ToOpus(audioBuffer: Buffer, timeoutMs: number | undefined) {
  const scratchRoot = resolvePreferredOpenClawTmpDir();
  await mkdir(scratchRoot, { recursive: true, mode: 0o700 });
  const workDir = await mkdtemp(path.join(scratchRoot, "tts-minimax-"));
  try {
    const mp3Path = path.join(workDir, "input.mp3");
    const opusPath = path.join(workDir, "voice.opus");
    // Audio-only libopus encode at 64 kbit/s, resampled to 48 kHz mono.
    const ffmpegArgs = [
      "-hide_banner",
      "-loglevel",
      "error",
      "-y",
      "-i",
      mp3Path,
      "-vn",
      "-c:a",
      "libopus",
      "-b:a",
      "64k",
      "-ar",
      "48000",
      "-ac",
      "1",
      opusPath,
    ];
    await writeFile(mp3Path, audioBuffer, { mode: 0o600 });
    await runFfmpeg(ffmpegArgs, { timeoutMs });
    return await readFile(opusPath);
  } finally {
    // Always discard the staged input and output, even when ffmpeg fails.
    await rm(workDir, { recursive: true, force: true });
  }
}
|
||||
|
||||
export function buildMinimaxSpeechProvider(): SpeechProviderPlugin {
|
||||
return {
|
||||
id: "minimax",
|
||||
@@ -326,7 +288,12 @@ export function buildMinimaxSpeechProvider(): SpeechProviderPlugin {
|
||||
timeoutMs: req.timeoutMs,
|
||||
});
|
||||
if (req.target === "voice-note") {
|
||||
const opusBuffer = await transcodeMp3ToOpus(audioBuffer, req.timeoutMs);
|
||||
const opusBuffer = await transcodeAudioBufferToOpus({
|
||||
audioBuffer,
|
||||
inputExtension: "mp3",
|
||||
tempPrefix: "tts-minimax-",
|
||||
timeoutMs: req.timeoutMs,
|
||||
});
|
||||
return {
|
||||
audioBuffer: opusBuffer,
|
||||
outputFormat: "opus",
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
import { afterEach, beforeEach, describe, expect, it, vi } from "vitest";
|
||||
|
||||
const runFfmpegMock = vi.hoisted(() => vi.fn());
|
||||
const transcodeAudioBufferToOpusMock = vi.hoisted(() => vi.fn());
|
||||
|
||||
vi.mock("openclaw/plugin-sdk/media-runtime", () => ({
|
||||
runFfmpeg: runFfmpegMock,
|
||||
transcodeAudioBufferToOpus: transcodeAudioBufferToOpusMock,
|
||||
}));
|
||||
|
||||
import { buildXiaomiSpeechProvider } from "./speech-provider.js";
|
||||
@@ -123,7 +123,7 @@ describe("buildXiaomiSpeechProvider", () => {
|
||||
|
||||
beforeEach(() => {
|
||||
vi.stubGlobal("fetch", vi.fn());
|
||||
runFfmpegMock.mockReset();
|
||||
transcodeAudioBufferToOpusMock.mockReset();
|
||||
});
|
||||
|
||||
afterEach(() => {
|
||||
@@ -170,7 +170,7 @@ describe("buildXiaomiSpeechProvider", () => {
|
||||
{ role: "assistant", content: "Hello from OpenClaw." },
|
||||
]);
|
||||
expect(body.audio).toEqual({ format: "mp3", voice: "default_en" });
|
||||
expect(runFfmpegMock).not.toHaveBeenCalled();
|
||||
expect(transcodeAudioBufferToOpusMock).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it("transcodes Xiaomi output to Opus for voice-note targets", async () => {
|
||||
@@ -181,15 +181,7 @@ describe("buildXiaomiSpeechProvider", () => {
|
||||
headers: { "Content-Type": "application/json" },
|
||||
}),
|
||||
);
|
||||
runFfmpegMock.mockImplementationOnce(async (args: string[]) => {
|
||||
const outputPath = args.at(-1);
|
||||
if (typeof outputPath !== "string") {
|
||||
throw new Error("missing ffmpeg output path");
|
||||
}
|
||||
await import("node:fs/promises").then((fs) =>
|
||||
fs.writeFile(outputPath, Buffer.from("fake-opus-audio")),
|
||||
);
|
||||
});
|
||||
transcodeAudioBufferToOpusMock.mockResolvedValueOnce(Buffer.from("fake-opus-audio"));
|
||||
|
||||
const result = await provider.synthesize({
|
||||
text: "Hello from OpenClaw.",
|
||||
@@ -203,10 +195,12 @@ describe("buildXiaomiSpeechProvider", () => {
|
||||
expect(result.fileExtension).toBe(".opus");
|
||||
expect(result.voiceCompatible).toBe(true);
|
||||
expect(result.audioBuffer.toString()).toBe("fake-opus-audio");
|
||||
expect(runFfmpegMock).toHaveBeenCalledWith(
|
||||
expect.arrayContaining(["-c:a", "libopus", "-ar", "48000"]),
|
||||
{ timeoutMs: 30000 },
|
||||
);
|
||||
expect(transcodeAudioBufferToOpusMock).toHaveBeenCalledWith({
|
||||
audioBuffer: Buffer.from("fake-mp3-audio"),
|
||||
inputExtension: "mp3",
|
||||
tempPrefix: "tts-xiaomi-",
|
||||
timeoutMs: 30000,
|
||||
});
|
||||
});
|
||||
|
||||
it("throws when API key is missing", async () => {
|
||||
|
||||
@@ -1,6 +1,4 @@
|
||||
import { mkdir, mkdtemp, readFile, rm, writeFile } from "node:fs/promises";
|
||||
import path from "node:path";
|
||||
import { runFfmpeg } from "openclaw/plugin-sdk/media-runtime";
|
||||
import { transcodeAudioBufferToOpus } from "openclaw/plugin-sdk/media-runtime";
|
||||
import { assertOkOrThrowProviderError } from "openclaw/plugin-sdk/provider-http";
|
||||
import { normalizeResolvedSecretInputString } from "openclaw/plugin-sdk/secret-input";
|
||||
import type {
|
||||
@@ -14,7 +12,6 @@ import {
|
||||
fetchWithSsrFGuard,
|
||||
ssrfPolicyFromHttpBaseUrlAllowedHostname,
|
||||
} from "openclaw/plugin-sdk/ssrf-runtime";
|
||||
import { resolvePreferredOpenClawTmpDir } from "openclaw/plugin-sdk/temp-path";
|
||||
|
||||
export const DEFAULT_XIAOMI_TTS_BASE_URL = "https://api.xiaomimimo.com/v1";
|
||||
export const DEFAULT_XIAOMI_TTS_MODEL = "mimo-v2.5-tts";
|
||||
@@ -242,45 +239,6 @@ export async function xiaomiTTS(params: {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Converts an audio buffer into 48 kHz mono Opus by running ffmpeg over temp files.
 *
 * The audio is staged as `input.<inputExtension>` inside a fresh `tts-xiaomi-*`
 * directory under the preferred OpenClaw temp root; that directory is removed
 * again whether or not ffmpeg succeeds.
 *
 * @param params.audioBuffer - Source audio bytes.
 * @param params.inputExtension - Extension (without a dot) used for the staged
 *   input file name; it is interpolated as-is.
 * @param params.timeoutMs - Passed through to `runFfmpeg` as its `timeoutMs` option.
 * @returns The bytes ffmpeg wrote to the Opus output file.
 */
async function transcodeAudioToOpus(params: {
  audioBuffer: Buffer;
  inputExtension: string;
  timeoutMs: number | undefined;
}) {
  const { audioBuffer, inputExtension, timeoutMs } = params;
  const scratchRoot = resolvePreferredOpenClawTmpDir();
  await mkdir(scratchRoot, { recursive: true, mode: 0o700 });
  const workDir = await mkdtemp(path.join(scratchRoot, "tts-xiaomi-"));
  try {
    const sourcePath = path.join(workDir, `input.${inputExtension}`);
    const opusPath = path.join(workDir, "voice.opus");
    // Audio-only libopus encode at 64 kbit/s, resampled to 48 kHz mono.
    const ffmpegArgs = [
      "-hide_banner",
      "-loglevel",
      "error",
      "-y",
      "-i",
      sourcePath,
      "-vn",
      "-c:a",
      "libopus",
      "-b:a",
      "64k",
      "-ar",
      "48000",
      "-ac",
      "1",
      opusPath,
    ];
    await writeFile(sourcePath, audioBuffer, { mode: 0o600 });
    await runFfmpeg(ffmpegArgs, { timeoutMs });
    return await readFile(opusPath);
  } finally {
    // Always discard the staged input and output, even when ffmpeg fails.
    await rm(workDir, { recursive: true, force: true });
  }
}
|
||||
|
||||
export function buildXiaomiSpeechProvider(): SpeechProviderPlugin {
|
||||
return {
|
||||
id: "xiaomi",
|
||||
@@ -313,9 +271,10 @@ export function buildXiaomiSpeechProvider(): SpeechProviderPlugin {
|
||||
timeoutMs: req.timeoutMs,
|
||||
});
|
||||
if (req.target === "voice-note") {
|
||||
const opusBuffer = await transcodeAudioToOpus({
|
||||
const opusBuffer = await transcodeAudioBufferToOpus({
|
||||
audioBuffer,
|
||||
inputExtension: outputFormat,
|
||||
tempPrefix: "tts-xiaomi-",
|
||||
timeoutMs: req.timeoutMs,
|
||||
});
|
||||
return {
|
||||
|
||||
100
src/media/audio-transcode.test.ts
Normal file
100
src/media/audio-transcode.test.ts
Normal file
@@ -0,0 +1,100 @@
|
||||
import { existsSync } from "node:fs";
|
||||
import { readFile } from "node:fs/promises";
|
||||
import path from "node:path";
|
||||
import { afterEach, describe, expect, it, vi } from "vitest";
|
||||
import { resolvePreferredOpenClawTmpDir } from "../infra/tmp-openclaw-dir.js";
|
||||
|
||||
const runFfmpegMock = vi.hoisted(() => vi.fn());
|
||||
|
||||
vi.mock("./ffmpeg-exec.js", () => ({
|
||||
runFfmpeg: runFfmpegMock,
|
||||
}));
|
||||
|
||||
import { transcodeAudioBufferToOpus } from "./audio-transcode.js";
|
||||
|
||||
describe("transcodeAudioBufferToOpus", () => {
  /** Pulls the `-i` operand and the trailing output operand out of an ffmpeg argv. */
  const pickFfmpegPaths = (
    argv: string[],
  ): { inputPath: string | undefined; outputPath: string | undefined } => ({
    inputPath: argv[argv.indexOf("-i") + 1],
    outputPath: argv.at(-1),
  });

  /** Stands in for ffmpeg by writing a fake Opus payload to the requested output path. */
  const writeFakeOpus = async (outputPath: string): Promise<void> => {
    const { writeFile } = await import("node:fs/promises");
    await writeFile(outputPath, Buffer.from("opus-output"));
  };

  afterEach(() => {
    runFfmpegMock.mockReset();
  });

  it("writes input audio, runs ffmpeg for 48k mono Opus, and cleans temp files", async () => {
    let seenInputPath: string | undefined;
    let seenOutputPath: string | undefined;
    runFfmpegMock.mockImplementationOnce(async (args: string[]) => {
      const { inputPath, outputPath } = pickFfmpegPaths(args);
      seenInputPath = inputPath;
      seenOutputPath = outputPath;
      if (!inputPath || !outputPath) {
        throw new Error("missing ffmpeg paths");
      }
      // The source bytes must already be staged on disk when ffmpeg is invoked.
      await expect(readFile(inputPath)).resolves.toEqual(Buffer.from("source-mp3"));
      await writeFakeOpus(outputPath);
    });

    const transcoded = transcodeAudioBufferToOpus({
      audioBuffer: Buffer.from("source-mp3"),
      inputExtension: "mp3",
      tempPrefix: "tts-test-",
      timeoutMs: 1234,
    });
    await expect(transcoded).resolves.toEqual(Buffer.from("opus-output"));

    expect(runFfmpegMock).toHaveBeenCalledWith(
      expect.arrayContaining(["-c:a", "libopus", "-b:a", "64k", "-ar", "48000", "-ac", "1"]),
      { timeoutMs: 1234 },
    );
    const expectedDirPrefix = path.join(resolvePreferredOpenClawTmpDir(), "tts-test-");
    expect(seenInputPath?.startsWith(expectedDirPrefix)).toBe(true);
    // A path that was never captured counts as a failure, not as "cleaned up".
    expect(seenInputPath ? existsSync(seenInputPath) : true).toBe(false);
    expect(seenOutputPath ? existsSync(seenOutputPath) : true).toBe(false);
  });

  it("sanitizes unsafe input extensions", async () => {
    runFfmpegMock.mockImplementationOnce(async (args: string[]) => {
      const { inputPath, outputPath } = pickFfmpegPaths(args);
      if (!inputPath || !outputPath) {
        throw new Error("missing ffmpeg paths");
      }
      expect(path.basename(inputPath)).toBe("input.audio");
      await writeFakeOpus(outputPath);
    });

    await transcodeAudioBufferToOpus({
      audioBuffer: Buffer.from("source"),
      inputExtension: "../bad",
    });
  });

  it("keeps temp prefixes and output names inside the preferred temp root", async () => {
    let seenInputPath: string | undefined;
    let seenOutputPath: string | undefined;
    runFfmpegMock.mockImplementationOnce(async (args: string[]) => {
      const { inputPath, outputPath } = pickFfmpegPaths(args);
      seenInputPath = inputPath;
      seenOutputPath = outputPath;
      if (!outputPath) {
        throw new Error("missing ffmpeg output path");
      }
      expect(path.basename(outputPath)).toBe("escape.opus");
      await writeFakeOpus(outputPath);
    });

    await transcodeAudioBufferToOpus({
      audioBuffer: Buffer.from("source"),
      inputFileName: "voice.wav",
      outputFileName: "../escape.opus",
      tempPrefix: "../bad-prefix",
    });

    const tempRoot = resolvePreferredOpenClawTmpDir();
    expect(seenInputPath?.startsWith(tempRoot)).toBe(true);
    expect(seenOutputPath?.startsWith(tempRoot)).toBe(true);
  });
});
|
||||
87
src/media/audio-transcode.ts
Normal file
87
src/media/audio-transcode.ts
Normal file
@@ -0,0 +1,87 @@
|
||||
import { mkdir, mkdtemp, readFile, rm, writeFile } from "node:fs/promises";
|
||||
import path from "node:path";
|
||||
import { resolvePreferredOpenClawTmpDir } from "../infra/tmp-openclaw-dir.js";
|
||||
import { runFfmpeg } from "./ffmpeg-exec.js";
|
||||
|
||||
const DEFAULT_OPUS_SAMPLE_RATE_HZ = 48_000;
|
||||
const DEFAULT_OPUS_BITRATE = "64k";
|
||||
const DEFAULT_OPUS_CHANNELS = 1;
|
||||
const DEFAULT_TEMP_PREFIX = "audio-opus-";
|
||||
const DEFAULT_OUTPUT_FILE_NAME = "voice.opus";
|
||||
|
||||
/**
 * Chooses a safe, lower-cased extension (leading dot included) for the staged input file.
 *
 * A non-blank `inputExtension` takes precedence, with or without its leading dot;
 * otherwise the extension of `inputFileName` is used. Anything that is not a dot
 * followed by 1-12 ASCII letters or digits is replaced with the neutral `.audio`.
 *
 * @param params.inputExtension - Preferred extension, e.g. `"mp3"` or `".mp3"`.
 * @param params.inputFileName - File name consulted only when no extension is given.
 * @returns A validated extension such as `.mp3`, or `.audio` as the fallback.
 */
function normalizeAudioExtension(params: {
  inputExtension?: string;
  inputFileName?: string;
}): string {
  const explicit = params.inputExtension?.trim();
  let dotted: string;
  if (explicit) {
    dotted = explicit.startsWith(".") ? explicit : `.${explicit}`;
  } else {
    dotted = path.extname(params.inputFileName ?? "");
  }
  const lowered = dotted.toLowerCase();
  // Strict allow-list keeps path separators and dot segments out of the temp file name.
  if (/^\.[a-z0-9]{1,12}$/.test(lowered)) {
    return lowered;
  }
  return ".audio";
}
|
||||
|
||||
/**
 * Turns a caller-supplied temp-directory prefix into a single safe path segment.
 *
 * Every character outside `[a-zA-Z0-9._-]` becomes `-`, so the result cannot contain
 * a path separator. Blank input and the bare dot segments `.` / `..` fall back to
 * `DEFAULT_TEMP_PREFIX`. The returned prefix always ends with `-`.
 *
 * @param value - Requested prefix, if any.
 * @returns A sanitized prefix ending in `-`.
 */
function normalizeTempPrefix(value?: string): string {
  const cleaned = (value ?? "").trim().replace(/[^a-zA-Z0-9._-]/g, "-");
  const unusable = cleaned === "" || cleaned === "." || cleaned === "..";
  if (unusable) {
    return DEFAULT_TEMP_PREFIX;
  }
  if (cleaned.endsWith("-")) {
    return cleaned;
  }
  return `${cleaned}-`;
}
|
||||
|
||||
/**
 * Reduces a caller-supplied output name to a safe base name inside the temp directory.
 *
 * Only the final path component is considered. It is accepted when it is 1-80
 * characters from `[a-zA-Z0-9._-]` and is not `.` or `..`; otherwise
 * `DEFAULT_OUTPUT_FILE_NAME` is used.
 *
 * @param value - Requested output file name, if any.
 * @returns A base name with no directory components.
 */
function normalizeOutputFileName(value?: string): string {
  const requested = value?.trim();
  const leaf = path.basename(requested ? requested : DEFAULT_OUTPUT_FILE_NAME);
  const isDotSegment = leaf === "." || leaf === "..";
  const hasSafeCharset = /^[a-zA-Z0-9._-]{1,80}$/.test(leaf);
  return hasSafeCharset && !isDotSegment ? leaf : DEFAULT_OUTPUT_FILE_NAME;
}
|
||||
|
||||
/**
 * Transcodes an in-memory audio buffer to Opus by running ffmpeg over temp files.
 *
 * The buffer is written to a freshly created directory under the preferred OpenClaw
 * temp root, ffmpeg encodes it with `libopus`, and the encoded file is read back.
 * The temp directory is removed in all cases, including when ffmpeg fails.
 *
 * @param params.audioBuffer - Source audio bytes in any container ffmpeg can read.
 * @param params.inputExtension - Extension for the staged input file (with or without
 *   a leading dot); unsafe values are replaced with `.audio`.
 * @param params.inputFileName - Used only to derive the extension when
 *   `inputExtension` is blank or omitted.
 * @param params.tempPrefix - Prefix for the temp directory name; sanitized to a single
 *   path segment.
 * @param params.outputFileName - Base name for the encoded file; sanitized, and
 *   `voice.opus` by default.
 * @param params.timeoutMs - Passed through to `runFfmpeg` as its `timeoutMs` option.
 * @param params.sampleRateHz - Output sample rate (`-ar`); defaults to 48000.
 * @param params.bitrate - Output audio bitrate (`-b:a`); defaults to `"64k"`.
 * @param params.channels - Output channel count (`-ac`); defaults to 1.
 * @returns The bytes of the file ffmpeg produced.
 * @throws Whatever the filesystem calls or `runFfmpeg` reject with; nothing is caught here.
 */
export async function transcodeAudioBufferToOpus(params: {
  audioBuffer: Buffer;
  inputExtension?: string;
  inputFileName?: string;
  tempPrefix?: string;
  outputFileName?: string;
  timeoutMs?: number;
  sampleRateHz?: number;
  bitrate?: string;
  channels?: number;
}): Promise<Buffer> {
  const tempRoot = resolvePreferredOpenClawTmpDir();
  await mkdir(tempRoot, { recursive: true, mode: 0o700 });
  const tempDir = await mkdtemp(path.join(tempRoot, normalizeTempPrefix(params.tempPrefix)));
  try {
    const inputExtension = normalizeAudioExtension(params);
    const outputName = normalizeOutputFileName(params.outputFileName);
    // A caller-chosen output name such as "input.wav" would otherwise resolve to the
    // staged input path, asking ffmpeg to read and overwrite the same file. Compare
    // case-insensitively so the guard also holds on case-insensitive filesystems.
    const collides = `input${inputExtension}`.toLowerCase() === outputName.toLowerCase();
    const inputStem = collides ? "source" : "input";
    const inputPath = path.join(tempDir, `${inputStem}${inputExtension}`);
    const outputPath = path.join(tempDir, outputName);
    await writeFile(inputPath, params.audioBuffer, { mode: 0o600 });
    await runFfmpeg(
      [
        "-hide_banner",
        "-loglevel",
        "error",
        "-y",
        "-i",
        inputPath,
        // Keep only the audio stream: drop video, subtitle, and data streams.
        "-vn",
        "-sn",
        "-dn",
        "-c:a",
        "libopus",
        "-b:a",
        params.bitrate ?? DEFAULT_OPUS_BITRATE,
        "-ar",
        String(params.sampleRateHz ?? DEFAULT_OPUS_SAMPLE_RATE_HZ),
        "-ac",
        String(params.channels ?? DEFAULT_OPUS_CHANNELS),
        outputPath,
      ],
      { timeoutMs: params.timeoutMs },
    );
    return await readFile(outputPath);
  } finally {
    // Always discard the staged input and encoded output, even when ffmpeg fails.
    await rm(tempDir, { recursive: true, force: true });
  }
}
|
||||
@@ -1,6 +1,7 @@
|
||||
// Public media/payload helpers for plugins that fetch, transform, or send attachments.
|
||||
|
||||
export * from "../media/audio.js";
|
||||
export * from "../media/audio-transcode.js";
|
||||
export * from "../media/base64.js";
|
||||
export * from "../media/constants.js";
|
||||
export * from "../media/fetch.js";
|
||||
|
||||
Reference in New Issue
Block a user