mirror of
https://github.com/openclaw/openclaw.git
synced 2026-03-29 13:35:28 +02:00
251 lines
9.4 KiB
TypeScript
251 lines
9.4 KiB
TypeScript
import { mkdirSync, mkdtempSync, readFileSync, rmSync } from "node:fs";
|
|
import path from "node:path";
|
|
import {
|
|
CHROMIUM_FULL_VERSION,
|
|
TRUSTED_CLIENT_TOKEN,
|
|
generateSecMsGecToken,
|
|
} from "node-edge-tts/dist/drm.js";
|
|
import { resolvePreferredOpenClawTmpDir } from "openclaw/plugin-sdk/llm-task";
|
|
import { isVoiceCompatibleAudio } from "openclaw/plugin-sdk/media-runtime";
|
|
import type {
|
|
SpeechProviderConfig,
|
|
SpeechProviderPlugin,
|
|
SpeechVoiceOption,
|
|
} from "openclaw/plugin-sdk/speech-core";
|
|
import { edgeTTS, inferEdgeExtension } from "./tts.js";
|
|
|
|
/** Voice used when the configuration does not supply one. */
const DEFAULT_EDGE_VOICE = "en-US-MichelleNeural";

/** Language tag used when the configuration does not supply one. */
const DEFAULT_EDGE_LANG = "en-US";

/**
 * Output format used when none is configured. `synthesize` also retries with
 * this format when a different requested format fails.
 */
const DEFAULT_EDGE_OUTPUT_FORMAT = "audio-24khz-48kbitrate-mono-mp3";
|
|
|
|
/**
 * Fully-resolved settings for the Microsoft speech provider.
 *
 * Produced by `normalizeMicrosoftProviderConfig`, which fills every required
 * field with a default when the raw configuration omits it.
 */
type MicrosoftProviderConfig = {
  /** Whether the provider is enabled; defaults to `true`. */
  enabled: boolean;
  /** Voice identifier; defaults to `DEFAULT_EDGE_VOICE`. */
  voice: string;
  /** Language tag; defaults to `DEFAULT_EDGE_LANG`. */
  lang: string;
  /** Output format name; defaults to `DEFAULT_EDGE_OUTPUT_FORMAT`. */
  outputFormat: string;
  /** `true` when `outputFormat` came from the configuration rather than the default. */
  outputFormatConfigured: boolean;
  /** Optional pitch setting, forwarded unchanged to the synthesis call. */
  pitch?: string;
  /** Optional rate setting, forwarded unchanged to the synthesis call. */
  rate?: string;
  /** Optional volume setting, forwarded unchanged to the synthesis call. */
  volume?: string;
  /** Subtitle flag, forwarded to the synthesis call; defaults to `false`. */
  saveSubtitles: boolean;
  /** Optional proxy setting, forwarded unchanged to the synthesis call. */
  proxy?: string;
  /** Optional timeout; only finite numbers are accepted during normalization. */
  timeoutMs?: number;
};
|
|
|
|
/**
 * Shape of one element of the voices-list response as consumed by
 * `listMicrosoftVoices`. Every field is optional because the response body is
 * cast to this type without validation.
 */
type MicrosoftVoiceListEntry = {
  /** Used as the voice option `id` (and as the `name` fallback). */
  ShortName?: string;
  /** Preferred display name. */
  FriendlyName?: string;
  /** Mapped to the voice option `locale`. */
  Locale?: string;
  /** Mapped to the voice option `gender`. */
  Gender?: string;
  VoiceTag?: {
    /** The first non-blank entry becomes the voice option `category`. */
    ContentCategories?: string[];
    /** Source of the voice option `personalities` and `description`. */
    VoicePersonalities?: string[];
  };
};
|
|
|
|
/**
 * Normalizes an arbitrary value to a trimmed, non-empty string.
 *
 * @param value - Any value; only strings are considered.
 * @returns The trimmed text, or `undefined` when `value` is not a string or
 *   contains nothing but whitespace.
 */
function trimToUndefined(value: unknown): string | undefined {
  if (typeof value !== "string") {
    return undefined;
  }
  const trimmed = value.trim();
  return trimmed.length > 0 ? trimmed : undefined;
}
|
|
|
|
/**
 * Passes through real booleans and rejects everything else.
 *
 * @param value - Any value.
 * @returns `value` when it is `true` or `false`; otherwise `undefined`
 *   (truthy/falsy values such as `1` or `"true"` are not coerced).
 */
function asBoolean(value: unknown): boolean | undefined {
  if (typeof value === "boolean") {
    return value;
  }
  return undefined;
}
|
|
|
|
/**
 * Passes through finite numbers and rejects everything else.
 *
 * @param value - Any value.
 * @returns `value` when it is a finite number; `undefined` for `NaN`,
 *   `±Infinity`, and non-number values (numeric strings are not parsed).
 */
function asNumber(value: unknown): number | undefined {
  if (typeof value !== "number") {
    return undefined;
  }
  return Number.isFinite(value) ? value : undefined;
}
|
|
|
|
/**
 * Narrows an arbitrary value to a plain key/value record.
 *
 * @param value - Any value.
 * @returns The same reference typed as a record when `value` is a non-null,
 *   non-array object; otherwise `undefined`.
 */
function asObject(value: unknown): Record<string, unknown> | undefined {
  if (typeof value !== "object" || value === null) {
    return undefined;
  }
  if (Array.isArray(value)) {
    return undefined;
  }
  return value as Record<string, unknown>;
}
|
|
|
|
/**
 * Builds a fully-populated provider config from a raw TTS configuration object.
 *
 * Settings are collected from three optional object-valued keys and merged so
 * that later sources override earlier ones: `edge`, then `microsoft`, then
 * `providers.microsoft`. Values of the wrong type (and blank strings) are
 * treated as absent and replaced by defaults where a default exists.
 *
 * @param rawConfig - Untyped configuration record.
 * @returns The normalized configuration; `outputFormatConfigured` is `true`
 *   only when a non-blank `outputFormat` was found in the merged settings.
 */
function normalizeMicrosoftProviderConfig(
  rawConfig: Record<string, unknown>,
): MicrosoftProviderConfig {
  // Spread order encodes precedence: the last source to define a key wins.
  const merged: Record<string, unknown> = {
    ...asObject(rawConfig.edge),
    ...asObject(rawConfig.microsoft),
    ...asObject(asObject(rawConfig.providers)?.microsoft),
  };
  const configuredFormat = trimToUndefined(merged.outputFormat);

  return {
    enabled: asBoolean(merged.enabled) ?? true,
    voice: trimToUndefined(merged.voice) ?? DEFAULT_EDGE_VOICE,
    lang: trimToUndefined(merged.lang) ?? DEFAULT_EDGE_LANG,
    outputFormat: configuredFormat ?? DEFAULT_EDGE_OUTPUT_FORMAT,
    outputFormatConfigured: configuredFormat !== undefined,
    pitch: trimToUndefined(merged.pitch),
    rate: trimToUndefined(merged.rate),
    volume: trimToUndefined(merged.volume),
    saveSubtitles: asBoolean(merged.saveSubtitles) ?? false,
    proxy: trimToUndefined(merged.proxy),
    timeoutMs: asNumber(merged.timeoutMs),
  };
}
|
|
|
|
/**
 * Reads a provider config defensively, re-validating every field.
 *
 * Each field is type-checked with the same helpers used during normalization;
 * any field that is missing or of the wrong type falls back to the value
 * `normalizeMicrosoftProviderConfig` produces for an empty configuration.
 *
 * @param config - Provider configuration handed to the plugin callbacks.
 * @returns A complete `MicrosoftProviderConfig`.
 */
function readMicrosoftProviderConfig(config: SpeechProviderConfig): MicrosoftProviderConfig {
  const fallback = normalizeMicrosoftProviderConfig({});

  const enabled = asBoolean(config.enabled) ?? fallback.enabled;
  const voice = trimToUndefined(config.voice) ?? fallback.voice;
  const lang = trimToUndefined(config.lang) ?? fallback.lang;
  const outputFormat = trimToUndefined(config.outputFormat) ?? fallback.outputFormat;
  const outputFormatConfigured =
    asBoolean(config.outputFormatConfigured) ?? fallback.outputFormatConfigured;
  const saveSubtitles = asBoolean(config.saveSubtitles) ?? fallback.saveSubtitles;

  return {
    enabled,
    voice,
    lang,
    outputFormat,
    outputFormatConfigured,
    pitch: trimToUndefined(config.pitch) ?? fallback.pitch,
    rate: trimToUndefined(config.rate) ?? fallback.rate,
    volume: trimToUndefined(config.volume) ?? fallback.volume,
    saveSubtitles,
    proxy: trimToUndefined(config.proxy) ?? fallback.proxy,
    timeoutMs: asNumber(config.timeoutMs) ?? fallback.timeoutMs,
  };
}
|
|
|
|
/**
 * Builds the HTTP headers sent with the voices-list request.
 *
 * The browser major version in the `User-Agent` is the first dot-separated
 * segment of `CHROMIUM_FULL_VERSION` (or `"0"` when that segment is empty).
 * `generateSecMsGecToken()` is invoked on every call, so each header set
 * carries the token value returned at call time.
 *
 * @returns A fresh header record.
 */
function buildMicrosoftVoiceHeaders(): Record<string, string> {
  const [leadingSegment] = CHROMIUM_FULL_VERSION.split(".");
  const major = leadingSegment || "0";
  const userAgent =
    `Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ` +
    `(KHTML, like Gecko) Chrome/${major}.0.0.0 Safari/537.36 Edg/${major}.0.0.0`;

  return {
    Authority: "speech.platform.bing.com",
    Origin: "chrome-extension://jdiccldimpdaibmpdkjnbmckianbfold",
    Accept: "*/*",
    "User-Agent": userAgent,
    "Sec-MS-GEC": generateSecMsGecToken(),
    "Sec-MS-GEC-Version": `1-${CHROMIUM_FULL_VERSION}`,
  };
}
|
|
|
|
/**
 * Produces a human-readable description from a voice's personality tags.
 *
 * @param entry - One voices-list entry.
 * @returns The non-empty personality strings joined with `", "`, or
 *   `undefined` when the entry has no such tags.
 */
function formatMicrosoftVoiceDescription(entry: MicrosoftVoiceListEntry): string | undefined {
  const tags = entry.VoiceTag?.VoicePersonalities;
  if (!tags) {
    return undefined;
  }
  // Truthiness filter: drops empty strings (whitespace-only strings are kept).
  const kept = tags.filter(Boolean);
  if (kept.length === 0) {
    return undefined;
  }
  return kept.join(", ");
}
|
|
|
|
/**
 * Fetches the voice catalogue from the Microsoft read-aloud voices endpoint.
 *
 * @returns One option per listed voice that has a non-blank `ShortName`; an
 *   empty array when the response body is not a JSON array.
 * @throws Error when the HTTP response status is not OK. Network failures and
 *   JSON parse errors from `fetch` / `response.json()` propagate unchanged.
 */
export async function listMicrosoftVoices(): Promise<SpeechVoiceOption[]> {
  const endpoint =
    "https://speech.platform.bing.com/consumer/speech/synthesize/readaloud/voices/list" +
    `?trustedclienttoken=${TRUSTED_CLIENT_TOKEN}`;
  const response = await fetch(endpoint, { headers: buildMicrosoftVoiceHeaders() });
  if (!response.ok) {
    throw new Error(`Microsoft voices API error (${response.status})`);
  }

  const payload: unknown = await response.json();
  if (!Array.isArray(payload)) {
    return [];
  }

  const isNonBlank = (value: string): value is string => value.trim().length > 0;

  return (
    (payload as unknown[])
      // The body is unvalidated JSON: skip null/primitive elements so a single
      // malformed entry cannot fail the whole listing with a TypeError.
      .filter(
        (entry): entry is MicrosoftVoiceListEntry => typeof entry === "object" && entry !== null,
      )
      .map((voice) => {
        const shortName = voice.ShortName?.trim();
        return {
          id: shortName ?? "",
          name: voice.FriendlyName?.trim() || shortName || undefined,
          // First non-blank category, returned as-is (not trimmed).
          category: voice.VoiceTag?.ContentCategories?.find(isNonBlank),
          description: formatMicrosoftVoiceDescription(voice),
          locale: voice.Locale?.trim() || undefined,
          gender: voice.Gender?.trim() || undefined,
          personalities: voice.VoiceTag?.VoicePersonalities?.filter(isNonBlank),
        };
      })
      // Entries without a usable identifier cannot be selected, so drop them.
      .filter((voice) => voice.id.length > 0)
  );
}
|
|
|
|
/**
 * Creates the speech provider plugin for Microsoft text-to-speech
 * (id `"microsoft"`, alias `"edge"`).
 *
 * Plugin callbacks:
 * - `resolveConfig` normalizes the raw TTS configuration.
 * - `resolveTalkConfig` normalizes the base TTS configuration, forces
 *   `enabled: true`, and overlays any valid talk-provider settings.
 * - `resolveTalkOverrides` extracts per-request `voice` / `outputFormat`.
 * - `listVoices` delegates to `listMicrosoftVoices`.
 * - `isConfigured` reports the resolved `enabled` flag.
 * - `synthesize` renders speech into a private temp directory, reads the
 *   result into memory, and always removes the directory afterwards.
 *
 * @returns The provider plugin object.
 */
export function buildMicrosoftSpeechProvider(): SpeechProviderPlugin {
  return {
    id: "microsoft",
    label: "Microsoft",
    aliases: ["edge"],
    autoSelectOrder: 30,

    resolveConfig: ({ rawConfig }) => normalizeMicrosoftProviderConfig(rawConfig),

    resolveTalkConfig: ({ baseTtsConfig, talkProviderConfig }) => {
      const base = normalizeMicrosoftProviderConfig(baseTtsConfig);

      // Validate each talk-level setting once; `undefined` means "not supplied".
      const voice = trimToUndefined(talkProviderConfig.voiceId);
      const lang = trimToUndefined(talkProviderConfig.languageCode);
      const outputFormat = trimToUndefined(talkProviderConfig.outputFormat);
      const pitch = trimToUndefined(talkProviderConfig.pitch);
      const rate = trimToUndefined(talkProviderConfig.rate);
      const volume = trimToUndefined(talkProviderConfig.volume);
      const proxy = trimToUndefined(talkProviderConfig.proxy);
      const timeoutMs = asNumber(talkProviderConfig.timeoutMs);

      // Only supplied settings are spread in, so absent ones never overwrite
      // the base value with `undefined`.
      // NOTE(review): `outputFormatConfigured` keeps the base value even when
      // `outputFormat` is overridden here — confirm that is intended.
      return {
        ...base,
        enabled: true,
        ...(voice === undefined ? {} : { voice }),
        ...(lang === undefined ? {} : { lang }),
        ...(outputFormat === undefined ? {} : { outputFormat }),
        ...(pitch === undefined ? {} : { pitch }),
        ...(rate === undefined ? {} : { rate }),
        ...(volume === undefined ? {} : { volume }),
        ...(proxy === undefined ? {} : { proxy }),
        ...(timeoutMs === undefined ? {} : { timeoutMs }),
      };
    },

    resolveTalkOverrides: ({ params }) => {
      const voice = trimToUndefined(params.voiceId);
      const outputFormat = trimToUndefined(params.outputFormat);
      return {
        ...(voice === undefined ? {} : { voice }),
        ...(outputFormat === undefined ? {} : { outputFormat }),
      };
    },

    listVoices: async () => await listMicrosoftVoices(),

    isConfigured: ({ providerConfig }) => readMicrosoftProviderConfig(providerConfig).enabled,

    synthesize: async (req) => {
      const config = readMicrosoftProviderConfig(req.providerConfig);

      // Work inside a fresh, owner-only directory under the preferred temp root.
      const tempRoot = resolvePreferredOpenClawTmpDir();
      mkdirSync(tempRoot, { recursive: true, mode: 0o700 });
      const tempDir = mkdtempSync(path.join(tempRoot, "tts-microsoft-"));

      const voice = trimToUndefined(req.providerOverrides?.voice) ?? config.voice;
      const requestedFormat =
        trimToUndefined(req.providerOverrides?.outputFormat) ?? config.outputFormat;

      /** Synthesizes `req.text` in the given format and loads the file into memory. */
      const renderWithFormat = async (format: string) => {
        const fileExtension = inferEdgeExtension(format);
        const outputPath = path.join(tempDir, `speech${fileExtension}`);
        await edgeTTS({
          text: req.text,
          outputPath,
          config: { ...config, voice, outputFormat: format },
          timeoutMs: req.timeoutMs,
        });
        return {
          audioBuffer: readFileSync(outputPath),
          outputFormat: format,
          fileExtension,
          voiceCompatible: isVoiceCompatibleAudio({ fileName: outputPath }),
        };
      };

      try {
        try {
          return await renderWithFormat(requestedFormat);
        } catch (error) {
          // Retry once with the default format; if the default itself was the
          // format that failed there is nothing to fall back to.
          if (requestedFormat === DEFAULT_EDGE_OUTPUT_FORMAT) {
            throw error;
          }
          return await renderWithFormat(DEFAULT_EDGE_OUTPUT_FORMAT);
        }
      } finally {
        // The audio is already in memory (or the call failed); never leave
        // temp files behind.
        rmSync(tempDir, { recursive: true, force: true });
      }
    },
  };
}
|