From e2fd3dcee9cdd37f89d1037b29ec70f9d59eec46 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Sat, 25 Apr 2026 21:33:15 +0100 Subject: [PATCH] fix(google): emit opus voice-note tts --- CHANGELOG.md | 2 + .../.generated/plugin-sdk-api-baseline.sha256 | 4 +- docs/providers/google.md | 4 +- docs/tools/tts.md | 2 +- extensions/google/google.live.test.ts | 18 ++++ extensions/google/speech-provider.test.ts | 41 +++++++ extensions/google/speech-provider.ts | 14 +++ extensions/minimax/speech-provider.test.ts | 28 ++--- extensions/minimax/speech-provider.ts | 47 ++------ extensions/xiaomi/speech-provider.test.ts | 28 ++--- extensions/xiaomi/speech-provider.ts | 47 +------- src/media/audio-transcode.test.ts | 100 ++++++++++++++++++ src/media/audio-transcode.ts | 87 +++++++++++++++ src/plugin-sdk/media-runtime.ts | 1 + 14 files changed, 300 insertions(+), 123 deletions(-) create mode 100644 src/media/audio-transcode.test.ts create mode 100644 src/media/audio-transcode.ts diff --git a/CHANGELOG.md b/CHANGELOG.md index 037408b753d..318278d6f7b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -59,6 +59,8 @@ Docs: https://docs.openclaw.ai ### Fixes +- Providers/Google: transcode Gemini TTS PCM to Opus for voice-note targets so + WhatsApp and other native voice-note replies can play as voice messages. - iOS/macOS Talk Mode: allow `talk.speechLocale` to set the speech recognition locale for non-English voice conversations. Fixes #44688. 
- Plugins/providers: honor explicit plugin candidate lists instead of reading a diff --git a/docs/.generated/plugin-sdk-api-baseline.sha256 b/docs/.generated/plugin-sdk-api-baseline.sha256 index d414d56324f..0ec8dfc602f 100644 --- a/docs/.generated/plugin-sdk-api-baseline.sha256 +++ b/docs/.generated/plugin-sdk-api-baseline.sha256 @@ -1,2 +1,2 @@ -f813474b1623f06e1465daacd56db970e8e92ab1be122faee0fa2a1dc2d4fc43 plugin-sdk-api-baseline.json -b3ea88c0c9b4cf6d9a46f0d34149063303853e78ef9708224608e4da79b23190 plugin-sdk-api-baseline.jsonl +c911117176b41eebf26470618274a7e093910e9b36855bc045bc8a92f6856745 plugin-sdk-api-baseline.json +ff360635f95beb217b9dd207a87eaf331319a7671aea03acfe05911756741b21 plugin-sdk-api-baseline.jsonl diff --git a/docs/providers/google.md b/docs/providers/google.md index 661cc7b7f2c..765439d0a69 100644 --- a/docs/providers/google.md +++ b/docs/providers/google.md @@ -252,8 +252,8 @@ The bundled `google` speech provider uses the Gemini API TTS path with - Default voice: `Kore` - Auth: `messages.tts.providers.google.apiKey`, `models.providers.google.apiKey`, `GEMINI_API_KEY`, or `GOOGLE_API_KEY` -- Output: WAV for regular TTS attachments, PCM for Talk/telephony -- Native voice-note output: not supported on this Gemini API path because the API returns PCM rather than Opus +- Output: WAV for regular TTS attachments, Opus for voice-note targets, PCM for Talk/telephony +- Voice-note output: Google PCM is wrapped as WAV and transcoded to 48 kHz Opus with `ffmpeg` To use Google as the default TTS provider: diff --git a/docs/tools/tts.md b/docs/tools/tts.md index 0af905c65f8..8995e9afafa 100644 --- a/docs/tools/tts.md +++ b/docs/tools/tts.md @@ -584,7 +584,7 @@ These override `messages.tts.*` for that host. - **Local CLI**: uses the configured `outputFormat`. Voice-note targets are converted to Ogg/Opus and telephony output is converted to raw 16 kHz mono PCM with `ffmpeg`. -- **Google Gemini**: Gemini API TTS returns raw 24kHz PCM. 
OpenClaw wraps it as WAV for audio attachments and returns PCM directly for Talk/telephony. Native Opus voice-note format is not supported by this path. +- **Google Gemini**: Gemini API TTS returns raw 24kHz PCM. OpenClaw wraps it as WAV for audio attachments, transcodes it to 48kHz Opus for voice-note targets, and returns PCM directly for Talk/telephony. - **Gradium**: WAV for audio attachments, Opus for voice-note targets, and `ulaw_8000` at 8 kHz for telephony. - **xAI**: MP3 by default; `responseFormat` may be `mp3`, `wav`, `pcm`, `mulaw`, or `alaw`. OpenClaw uses xAI's batch REST TTS endpoint and returns a complete audio attachment; xAI's streaming TTS WebSocket is not used by this provider path. Native Opus voice-note format is not supported by this path. - **Microsoft**: uses `microsoft.outputFormat` (default `audio-24khz-48kbitrate-mono-mp3`). diff --git a/extensions/google/google.live.test.ts b/extensions/google/google.live.test.ts index 39751b5d635..e95d79530bf 100644 --- a/extensions/google/google.live.test.ts +++ b/extensions/google/google.live.test.ts @@ -38,6 +38,24 @@ describeLive("google plugin live", () => { expect(audioFile.audioBuffer.byteLength).toBeGreaterThan(512); }, 120_000); + it("transcodes speech to Opus for voice-note targets", async () => { + const { speechProviders } = await registerGooglePlugin(); + const provider = requireRegisteredProvider(speechProviders, "google"); + + const audioFile = await provider.synthesize({ + text: "OpenClaw Google voice note integration test OK.", + cfg: { plugins: { enabled: true } } as never, + providerConfig: { apiKey: GOOGLE_API_KEY }, + target: "voice-note", + timeoutMs: 90_000, + }); + + expect(audioFile.outputFormat).toBe("opus"); + expect(audioFile.fileExtension).toBe(".opus"); + expect(audioFile.voiceCompatible).toBe(true); + expect(audioFile.audioBuffer.byteLength).toBeGreaterThan(128); + }, 120_000); + it("transcribes synthesized speech through the media provider", async () => { const { 
mediaProviders, speechProviders } = await registerGooglePlugin(); const speechProvider = requireRegisteredProvider(speechProviders, "google"); diff --git a/extensions/google/speech-provider.test.ts b/extensions/google/speech-provider.test.ts index 1c764d19fba..b64cc5bcace 100644 --- a/extensions/google/speech-provider.test.ts +++ b/extensions/google/speech-provider.test.ts @@ -1,5 +1,12 @@ import * as providerHttp from "openclaw/plugin-sdk/provider-http"; import { afterEach, describe, expect, it, vi } from "vitest"; + +const transcodeAudioBufferToOpusMock = vi.hoisted(() => vi.fn()); + +vi.mock("openclaw/plugin-sdk/media-runtime", () => ({ + transcodeAudioBufferToOpus: transcodeAudioBufferToOpusMock, +})); + import { buildGoogleSpeechProvider, __testing } from "./speech-provider.js"; function installGoogleTtsFetchMock(pcm = Buffer.from([1, 0, 2, 0])) { @@ -31,6 +38,7 @@ describe("Google speech provider", () => { vi.restoreAllMocks(); vi.unstubAllGlobals(); vi.unstubAllEnvs(); + transcodeAudioBufferToOpusMock.mockReset(); }); it("synthesizes Gemini PCM as WAV and preserves audio tags in the request text", async () => { @@ -82,6 +90,39 @@ describe("Google speech provider", () => { expect(result.audioBuffer.subarray(8, 12).toString("ascii")).toBe("WAVE"); expect(result.audioBuffer.readUInt32LE(24)).toBe(__testing.GOOGLE_TTS_SAMPLE_RATE); expect(result.audioBuffer.subarray(44)).toEqual(Buffer.from([1, 0, 2, 0])); + expect(transcodeAudioBufferToOpusMock).not.toHaveBeenCalled(); + }); + + it("transcodes Gemini PCM to Opus for voice-note targets", async () => { + installGoogleTtsFetchMock(Buffer.from([5, 0, 6, 0])); + transcodeAudioBufferToOpusMock.mockResolvedValueOnce(Buffer.from("google-opus")); + const provider = buildGoogleSpeechProvider(); + + const result = await provider.synthesize({ + text: "Send this as a voice note.", + cfg: {}, + providerConfig: { + apiKey: "google-test-key", + }, + target: "voice-note", + timeoutMs: 12_000, + }); + + expect(result).toEqual({ 
+ audioBuffer: Buffer.from("google-opus"), + outputFormat: "opus", + fileExtension: ".opus", + voiceCompatible: true, + }); + expect(transcodeAudioBufferToOpusMock).toHaveBeenCalledWith({ + audioBuffer: expect.any(Buffer), + inputExtension: "wav", + tempPrefix: "tts-google-", + timeoutMs: 12_000, + }); + const [{ audioBuffer }] = transcodeAudioBufferToOpusMock.mock.calls[0]; + expect(audioBuffer.subarray(0, 4).toString("ascii")).toBe("RIFF"); + expect(audioBuffer.subarray(8, 12).toString("ascii")).toBe("WAVE"); }); it("falls back to GEMINI_API_KEY and configured Google API base URL", async () => { diff --git a/extensions/google/speech-provider.ts b/extensions/google/speech-provider.ts index fccc7afe7b2..e31686cddcb 100644 --- a/extensions/google/speech-provider.ts +++ b/extensions/google/speech-provider.ts @@ -1,3 +1,4 @@ +import { transcodeAudioBufferToOpus } from "openclaw/plugin-sdk/media-runtime"; import { assertOkOrThrowProviderError, postJsonRequest, @@ -394,6 +395,19 @@ export function buildGoogleSpeechProvider(): SpeechProviderPlugin { speakerName: overrides.speakerName ?? 
config.speakerName, timeoutMs: req.timeoutMs, }); + if (req.target === "voice-note") { + return { + audioBuffer: await transcodeAudioBufferToOpus({ + audioBuffer: wrapPcm16MonoToWav(pcm), + inputExtension: "wav", + tempPrefix: "tts-google-", + timeoutMs: req.timeoutMs, + }), + outputFormat: "opus", + fileExtension: ".opus", + voiceCompatible: true, + }; + } return { audioBuffer: wrapPcm16MonoToWav(pcm), outputFormat: "wav", diff --git a/extensions/minimax/speech-provider.test.ts b/extensions/minimax/speech-provider.test.ts index 84be7f1ddfa..96087adf1ac 100644 --- a/extensions/minimax/speech-provider.test.ts +++ b/extensions/minimax/speech-provider.test.ts @@ -3,10 +3,10 @@ import { tmpdir } from "node:os"; import path from "node:path"; import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; -const runFfmpegMock = vi.hoisted(() => vi.fn()); +const transcodeAudioBufferToOpusMock = vi.hoisted(() => vi.fn()); vi.mock("openclaw/plugin-sdk/media-runtime", () => ({ - runFfmpeg: runFfmpegMock, + transcodeAudioBufferToOpus: transcodeAudioBufferToOpusMock, })); import { buildMinimaxSpeechProvider } from "./speech-provider.js"; @@ -293,7 +293,7 @@ describe("buildMinimaxSpeechProvider", () => { }; clearMinimaxAuthEnv(); vi.stubGlobal("fetch", vi.fn()); - runFfmpegMock.mockReset(); + transcodeAudioBufferToOpusMock.mockReset(); }); afterEach(async () => { @@ -333,7 +333,7 @@ describe("buildMinimaxSpeechProvider", () => { expect(body.model).toBe("speech-2.8-hd"); expect(body.text).toBe("Hello world"); expect(body.voice_setting.voice_id).toBe("English_expressive_narrator"); - expect(runFfmpegMock).not.toHaveBeenCalled(); + expect(transcodeAudioBufferToOpusMock).not.toHaveBeenCalled(); }); it("transcodes MiniMax MP3 to Opus for voice-note targets", async () => { @@ -345,15 +345,7 @@ describe("buildMinimaxSpeechProvider", () => { headers: { "Content-Type": "application/json" }, }), ); - runFfmpegMock.mockImplementationOnce(async (args: string[]) => { - const 
outputPath = args.at(-1); - if (typeof outputPath !== "string") { - throw new Error("missing ffmpeg output path"); - } - await import("node:fs/promises").then((fs) => - fs.writeFile(outputPath, Buffer.from("fake-opus-data")), - ); - }); + transcodeAudioBufferToOpusMock.mockResolvedValueOnce(Buffer.from("fake-opus-data")); const result = await provider.synthesize({ text: "Hello world", @@ -367,10 +359,12 @@ describe("buildMinimaxSpeechProvider", () => { expect(result.fileExtension).toBe(".opus"); expect(result.voiceCompatible).toBe(true); expect(result.audioBuffer.toString()).toBe("fake-opus-data"); - expect(runFfmpegMock).toHaveBeenCalledWith( - expect.arrayContaining(["-c:a", "libopus", "-ar", "48000"]), - { timeoutMs: 30000 }, - ); + expect(transcodeAudioBufferToOpusMock).toHaveBeenCalledWith({ + audioBuffer: Buffer.from("fake-mp3-data"), + inputExtension: "mp3", + tempPrefix: "tts-minimax-", + timeoutMs: 30000, + }); }); it("applies overrides", async () => { diff --git a/extensions/minimax/speech-provider.ts b/extensions/minimax/speech-provider.ts index 9ab723ea94a..7483d9cbcdc 100644 --- a/extensions/minimax/speech-provider.ts +++ b/extensions/minimax/speech-provider.ts @@ -1,6 +1,4 @@ -import { mkdir, mkdtemp, readFile, rm, writeFile } from "node:fs/promises"; -import path from "node:path"; -import { runFfmpeg } from "openclaw/plugin-sdk/media-runtime"; +import { transcodeAudioBufferToOpus } from "openclaw/plugin-sdk/media-runtime"; import { isProviderAuthProfileConfigured, type OpenClawConfig, @@ -14,7 +12,6 @@ import type { SpeechProviderPlugin, } from "openclaw/plugin-sdk/speech-core"; import { asFiniteNumber, asObject, trimToUndefined } from "openclaw/plugin-sdk/speech-core"; -import { resolvePreferredOpenClawTmpDir } from "openclaw/plugin-sdk/temp-path"; import { DEFAULT_MINIMAX_TTS_BASE_URL, MINIMAX_TTS_MODELS, @@ -209,41 +206,6 @@ function parseDirectiveToken(ctx: SpeechDirectiveTokenParseContext): { } } -async function transcodeMp3ToOpus(audioBuffer: 
Buffer, timeoutMs: number | undefined) { - const tempRoot = resolvePreferredOpenClawTmpDir(); - await mkdir(tempRoot, { recursive: true, mode: 0o700 }); - const tempDir = await mkdtemp(path.join(tempRoot, "tts-minimax-")); - try { - const inputPath = path.join(tempDir, "input.mp3"); - const outputPath = path.join(tempDir, "voice.opus"); - await writeFile(inputPath, audioBuffer, { mode: 0o600 }); - await runFfmpeg( - [ - "-hide_banner", - "-loglevel", - "error", - "-y", - "-i", - inputPath, - "-vn", - "-c:a", - "libopus", - "-b:a", - "64k", - "-ar", - "48000", - "-ac", - "1", - outputPath, - ], - { timeoutMs }, - ); - return await readFile(outputPath); - } finally { - await rm(tempDir, { recursive: true, force: true }); - } -} - export function buildMinimaxSpeechProvider(): SpeechProviderPlugin { return { id: "minimax", @@ -326,7 +288,12 @@ export function buildMinimaxSpeechProvider(): SpeechProviderPlugin { timeoutMs: req.timeoutMs, }); if (req.target === "voice-note") { - const opusBuffer = await transcodeMp3ToOpus(audioBuffer, req.timeoutMs); + const opusBuffer = await transcodeAudioBufferToOpus({ + audioBuffer, + inputExtension: "mp3", + tempPrefix: "tts-minimax-", + timeoutMs: req.timeoutMs, + }); return { audioBuffer: opusBuffer, outputFormat: "opus", diff --git a/extensions/xiaomi/speech-provider.test.ts b/extensions/xiaomi/speech-provider.test.ts index f20caccc12d..33ab1f73445 100644 --- a/extensions/xiaomi/speech-provider.test.ts +++ b/extensions/xiaomi/speech-provider.test.ts @@ -1,9 +1,9 @@ import { afterEach, beforeEach, describe, expect, it, vi } from "vitest"; -const runFfmpegMock = vi.hoisted(() => vi.fn()); +const transcodeAudioBufferToOpusMock = vi.hoisted(() => vi.fn()); vi.mock("openclaw/plugin-sdk/media-runtime", () => ({ - runFfmpeg: runFfmpegMock, + transcodeAudioBufferToOpus: transcodeAudioBufferToOpusMock, })); import { buildXiaomiSpeechProvider } from "./speech-provider.js"; @@ -123,7 +123,7 @@ describe("buildXiaomiSpeechProvider", () => { 
beforeEach(() => { vi.stubGlobal("fetch", vi.fn()); - runFfmpegMock.mockReset(); + transcodeAudioBufferToOpusMock.mockReset(); }); afterEach(() => { @@ -170,7 +170,7 @@ describe("buildXiaomiSpeechProvider", () => { { role: "assistant", content: "Hello from OpenClaw." }, ]); expect(body.audio).toEqual({ format: "mp3", voice: "default_en" }); - expect(runFfmpegMock).not.toHaveBeenCalled(); + expect(transcodeAudioBufferToOpusMock).not.toHaveBeenCalled(); }); it("transcodes Xiaomi output to Opus for voice-note targets", async () => { @@ -181,15 +181,7 @@ describe("buildXiaomiSpeechProvider", () => { headers: { "Content-Type": "application/json" }, }), ); - runFfmpegMock.mockImplementationOnce(async (args: string[]) => { - const outputPath = args.at(-1); - if (typeof outputPath !== "string") { - throw new Error("missing ffmpeg output path"); - } - await import("node:fs/promises").then((fs) => - fs.writeFile(outputPath, Buffer.from("fake-opus-audio")), - ); - }); + transcodeAudioBufferToOpusMock.mockResolvedValueOnce(Buffer.from("fake-opus-audio")); const result = await provider.synthesize({ text: "Hello from OpenClaw.", @@ -203,10 +195,12 @@ describe("buildXiaomiSpeechProvider", () => { expect(result.fileExtension).toBe(".opus"); expect(result.voiceCompatible).toBe(true); expect(result.audioBuffer.toString()).toBe("fake-opus-audio"); - expect(runFfmpegMock).toHaveBeenCalledWith( - expect.arrayContaining(["-c:a", "libopus", "-ar", "48000"]), - { timeoutMs: 30000 }, - ); + expect(transcodeAudioBufferToOpusMock).toHaveBeenCalledWith({ + audioBuffer: Buffer.from("fake-mp3-audio"), + inputExtension: "mp3", + tempPrefix: "tts-xiaomi-", + timeoutMs: 30000, + }); }); it("throws when API key is missing", async () => { diff --git a/extensions/xiaomi/speech-provider.ts b/extensions/xiaomi/speech-provider.ts index 748bdbe97df..6f8abc3785f 100644 --- a/extensions/xiaomi/speech-provider.ts +++ b/extensions/xiaomi/speech-provider.ts @@ -1,6 +1,4 @@ -import { mkdir, mkdtemp, readFile, 
rm, writeFile } from "node:fs/promises"; -import path from "node:path"; -import { runFfmpeg } from "openclaw/plugin-sdk/media-runtime"; +import { transcodeAudioBufferToOpus } from "openclaw/plugin-sdk/media-runtime"; import { assertOkOrThrowProviderError } from "openclaw/plugin-sdk/provider-http"; import { normalizeResolvedSecretInputString } from "openclaw/plugin-sdk/secret-input"; import type { @@ -14,7 +12,6 @@ import { fetchWithSsrFGuard, ssrfPolicyFromHttpBaseUrlAllowedHostname, } from "openclaw/plugin-sdk/ssrf-runtime"; -import { resolvePreferredOpenClawTmpDir } from "openclaw/plugin-sdk/temp-path"; export const DEFAULT_XIAOMI_TTS_BASE_URL = "https://api.xiaomimimo.com/v1"; export const DEFAULT_XIAOMI_TTS_MODEL = "mimo-v2.5-tts"; @@ -242,45 +239,6 @@ export async function xiaomiTTS(params: { } } -async function transcodeAudioToOpus(params: { - audioBuffer: Buffer; - inputExtension: string; - timeoutMs: number | undefined; -}) { - const tempRoot = resolvePreferredOpenClawTmpDir(); - await mkdir(tempRoot, { recursive: true, mode: 0o700 }); - const tempDir = await mkdtemp(path.join(tempRoot, "tts-xiaomi-")); - try { - const inputPath = path.join(tempDir, `input.${params.inputExtension}`); - const outputPath = path.join(tempDir, "voice.opus"); - await writeFile(inputPath, params.audioBuffer, { mode: 0o600 }); - await runFfmpeg( - [ - "-hide_banner", - "-loglevel", - "error", - "-y", - "-i", - inputPath, - "-vn", - "-c:a", - "libopus", - "-b:a", - "64k", - "-ar", - "48000", - "-ac", - "1", - outputPath, - ], - { timeoutMs: params.timeoutMs }, - ); - return await readFile(outputPath); - } finally { - await rm(tempDir, { recursive: true, force: true }); - } -} - export function buildXiaomiSpeechProvider(): SpeechProviderPlugin { return { id: "xiaomi", @@ -313,9 +271,10 @@ export function buildXiaomiSpeechProvider(): SpeechProviderPlugin { timeoutMs: req.timeoutMs, }); if (req.target === "voice-note") { - const opusBuffer = await transcodeAudioToOpus({ + const 
opusBuffer = await transcodeAudioBufferToOpus({ audioBuffer, inputExtension: outputFormat, + tempPrefix: "tts-xiaomi-", timeoutMs: req.timeoutMs, }); return { diff --git a/src/media/audio-transcode.test.ts b/src/media/audio-transcode.test.ts new file mode 100644 index 00000000000..171e4fd4ac9 --- /dev/null +++ b/src/media/audio-transcode.test.ts @@ -0,0 +1,100 @@ +import { existsSync } from "node:fs"; +import { readFile } from "node:fs/promises"; +import path from "node:path"; +import { afterEach, describe, expect, it, vi } from "vitest"; +import { resolvePreferredOpenClawTmpDir } from "../infra/tmp-openclaw-dir.js"; + +const runFfmpegMock = vi.hoisted(() => vi.fn()); + +vi.mock("./ffmpeg-exec.js", () => ({ + runFfmpeg: runFfmpegMock, +})); + +import { transcodeAudioBufferToOpus } from "./audio-transcode.js"; + +describe("transcodeAudioBufferToOpus", () => { + afterEach(() => { + runFfmpegMock.mockReset(); + }); + + it("writes input audio, runs ffmpeg for 48k mono Opus, and cleans temp files", async () => { + let capturedInputPath: string | undefined; + let capturedOutputPath: string | undefined; + runFfmpegMock.mockImplementationOnce(async (args: string[]) => { + capturedInputPath = args[args.indexOf("-i") + 1]; + capturedOutputPath = args.at(-1); + if (!capturedInputPath || !capturedOutputPath) { + throw new Error("missing ffmpeg paths"); + } + await expect(readFile(capturedInputPath)).resolves.toEqual(Buffer.from("source-mp3")); + await import("node:fs/promises").then((fs) => + fs.writeFile(capturedOutputPath!, Buffer.from("opus-output")), + ); + }); + + await expect( + transcodeAudioBufferToOpus({ + audioBuffer: Buffer.from("source-mp3"), + inputExtension: "mp3", + tempPrefix: "tts-test-", + timeoutMs: 1234, + }), + ).resolves.toEqual(Buffer.from("opus-output")); + + expect(runFfmpegMock).toHaveBeenCalledWith( + expect.arrayContaining(["-c:a", "libopus", "-b:a", "64k", "-ar", "48000", "-ac", "1"]), + { timeoutMs: 1234 }, + ); + expect( + 
capturedInputPath?.startsWith(path.join(resolvePreferredOpenClawTmpDir(), "tts-test-")), + ).toBe(true); + expect(capturedInputPath ? existsSync(capturedInputPath) : true).toBe(false); + expect(capturedOutputPath ? existsSync(capturedOutputPath) : true).toBe(false); + }); + + it("sanitizes unsafe input extensions", async () => { + runFfmpegMock.mockImplementationOnce(async (args: string[]) => { + const inputPath = args[args.indexOf("-i") + 1]; + const outputPath = args.at(-1); + if (!inputPath || !outputPath) { + throw new Error("missing ffmpeg paths"); + } + expect(path.basename(inputPath)).toBe("input.audio"); + await import("node:fs/promises").then((fs) => + fs.writeFile(outputPath, Buffer.from("opus-output")), + ); + }); + + await transcodeAudioBufferToOpus({ + audioBuffer: Buffer.from("source"), + inputExtension: "../bad", + }); + }); + + it("keeps temp prefixes and output names inside the preferred temp root", async () => { + let capturedInputPath: string | undefined; + let capturedOutputPath: string | undefined; + runFfmpegMock.mockImplementationOnce(async (args: string[]) => { + capturedInputPath = args[args.indexOf("-i") + 1]; + capturedOutputPath = args.at(-1); + if (!capturedOutputPath) { + throw new Error("missing ffmpeg output path"); + } + expect(path.basename(capturedOutputPath)).toBe("escape.opus"); + await import("node:fs/promises").then((fs) => + fs.writeFile(capturedOutputPath!, Buffer.from("opus-output")), + ); + }); + + await transcodeAudioBufferToOpus({ + audioBuffer: Buffer.from("source"), + inputFileName: "voice.wav", + outputFileName: "../escape.opus", + tempPrefix: "../bad-prefix", + }); + + const tempRoot = resolvePreferredOpenClawTmpDir(); + expect(capturedInputPath?.startsWith(tempRoot)).toBe(true); + expect(capturedOutputPath?.startsWith(tempRoot)).toBe(true); + }); +}); diff --git a/src/media/audio-transcode.ts b/src/media/audio-transcode.ts new file mode 100644 index 00000000000..e1631fa4a41 --- /dev/null +++ 
b/src/media/audio-transcode.ts
@@ -0,0 +1,87 @@
+import { mkdir, mkdtemp, readFile, rm, writeFile } from "node:fs/promises";
+import path from "node:path";
+import { resolvePreferredOpenClawTmpDir } from "../infra/tmp-openclaw-dir.js";
+import { runFfmpeg } from "./ffmpeg-exec.js";
+
+const DEFAULT_OPUS_SAMPLE_RATE_HZ = 48_000;
+const DEFAULT_OPUS_BITRATE = "64k";
+const DEFAULT_OPUS_CHANNELS = 1;
+const DEFAULT_TEMP_PREFIX = "audio-opus-";
+const DEFAULT_OUTPUT_FILE_NAME = "voice.opus";
+
+function normalizeAudioExtension(params: {
+  inputExtension?: string;
+  inputFileName?: string;
+}): string {
+  const fromExtension = params.inputExtension?.trim();
+  const candidate = fromExtension
+    ? fromExtension.startsWith(".")
+      ? fromExtension
+      : `.${fromExtension}`
+    : path.extname(params.inputFileName ?? "");
+  const normalized = candidate.toLowerCase();
+  return /^\.[a-z0-9]{1,12}$/.test(normalized) ? normalized : ".audio";
+}
+
+function normalizeTempPrefix(value?: string): string {
+  const sanitized = value?.trim().replace(/[^a-zA-Z0-9._-]/g, "-");
+  if (!sanitized || sanitized === "." || sanitized === "..") {
+    return DEFAULT_TEMP_PREFIX;
+  }
+  return sanitized.endsWith("-") ? sanitized : `${sanitized}-`;
+}
+
+function normalizeOutputFileName(value?: string): string {
+  const baseName = path.basename(value?.trim() || DEFAULT_OUTPUT_FILE_NAME);
+  if (/^[a-zA-Z0-9._-]{1,80}$/.test(baseName) && baseName !== "." && baseName !== "..") {
+    return baseName;
+  }
+  return DEFAULT_OUTPUT_FILE_NAME;
+}
+
+export async function transcodeAudioBufferToOpus(params: {
+  audioBuffer: Buffer;
+  inputExtension?: string;
+  inputFileName?: string;
+  tempPrefix?: string;
+  outputFileName?: string;
+  timeoutMs?: number;
+  sampleRateHz?: number;
+  bitrate?: string;
+  channels?: number;
+}): Promise<Buffer> {
+  const tempRoot = resolvePreferredOpenClawTmpDir();
+  await mkdir(tempRoot, { recursive: true, mode: 0o700 });
+  const tempDir = await mkdtemp(path.join(tempRoot, normalizeTempPrefix(params.tempPrefix)));
+  try {
+    const inputPath = path.join(tempDir, `input${normalizeAudioExtension(params)}`);
+    const outputPath = path.join(tempDir, normalizeOutputFileName(params.outputFileName));
+    await writeFile(inputPath, params.audioBuffer, { mode: 0o600 });
+    await runFfmpeg(
+      [
+        "-hide_banner",
+        "-loglevel",
+        "error",
+        "-y",
+        "-i",
+        inputPath,
+        "-vn",
+        "-sn",
+        "-dn",
+        "-c:a",
+        "libopus",
+        "-b:a",
+        params.bitrate ?? DEFAULT_OPUS_BITRATE,
+        "-ar",
+        String(params.sampleRateHz ?? DEFAULT_OPUS_SAMPLE_RATE_HZ),
+        "-ac",
+        String(params.channels ?? DEFAULT_OPUS_CHANNELS),
+        outputPath,
+      ],
+      { timeoutMs: params.timeoutMs },
+    );
+    return await readFile(outputPath);
+  } finally {
+    await rm(tempDir, { recursive: true, force: true });
+  }
+}
diff --git a/src/plugin-sdk/media-runtime.ts b/src/plugin-sdk/media-runtime.ts
index decc72efd25..9276333a607 100644
--- a/src/plugin-sdk/media-runtime.ts
+++ b/src/plugin-sdk/media-runtime.ts
@@ -1,6 +1,7 @@
 // Public media/payload helpers for plugins that fetch, transform, or send attachments.
 
 export * from "../media/audio.js";
+export * from "../media/audio-transcode.js";
 export * from "../media/base64.js";
 export * from "../media/constants.js";
 export * from "../media/fetch.js";