fix(config): accept video and audio model inputs

Preserve configured audio/video model input modalities through provider catalog normalization.

Fixes #20721.

Thanks @alvinttang.
This commit is contained in:
Alvin Tang
2026-04-26 12:18:54 +08:00
committed by GitHub
parent f1eef47839
commit 4428661779
10 changed files with 70 additions and 8 deletions

View File

@@ -74,6 +74,8 @@ Docs: https://docs.openclaw.ai
- Channels/status: keep read-only channel lists on manifest and package metadata by default, loading setup runtime only for explicit fallback callers. Thanks @shakkernerd.
- Plugins/onboarding: defer onboarding install-record index writes until the guarded config commit so setup failures cannot leave the plugin index ahead of `openclaw.json`. Thanks @shakkernerd.
- Plugins/registry: resolve web provider ownership from the installed plugin index instead of broad manifest scans on secret, tool, and pricing paths. Thanks @shakkernerd.
- Config/providers: accept `video` and `audio` in configured model `input` values and
preserve them in provider catalog entries. Fixes #20721. Thanks @alvinttang.
- TTS: strip model-emitted TTS directives from streamed block text before channel
delivery, including directives split across adjacent blocks, while preserving
the accumulated raw reply for final-mode synthesis. Fixes #38937.

View File

@@ -269,7 +269,7 @@ export type LmstudioModelBase = {
trainedForToolUse: boolean;
loaded: boolean;
reasoning: boolean;
input: ModelDefinitionConfig["input"];
input: Array<"text" | "image">;
cost: ModelDefinitionConfig["cost"];
contextWindow: number;
contextTokens: number;

View File

@@ -822,6 +822,9 @@ export async function prepareLmstudioDynamicModels(
provider: PROVIDER_ID,
api: ctx.providerConfig?.api ?? `openai-completions`,
baseUrl,
input: model.input.filter(
(entry): entry is "text" | "image" => entry === "text" || entry === "image",
),
}),
);
}

View File

@@ -1,4 +1,4 @@
export type ModelInputType = "text" | "image" | "document";
export type ModelInputType = "text" | "image" | "audio" | "video" | "document";
export type ModelCatalogEntry = {
id: string;

View File

@@ -2908,6 +2908,14 @@ export const GENERATED_BASE_CONFIG_SCHEMA: BaseConfigSchemaResponse = {
type: "string",
const: "image",
},
{
type: "string",
const: "video",
},
{
type: "string",
const: "audio",
},
],
},
},

View File

@@ -80,7 +80,7 @@ export type ModelDefinitionConfig = {
api?: ModelApi;
baseUrl?: string;
reasoning: boolean;
input: Array<"text" | "image">;
input: Array<"text" | "image" | "video" | "audio">;
cost: {
input: number;
output: number;

View File

@@ -312,7 +312,11 @@ export const ModelDefinitionSchema = z
api: ModelApiSchema.optional(),
baseUrl: z.string().min(1).optional(),
reasoning: z.boolean().optional(),
input: z.array(z.union([z.literal("text"), z.literal("image")])).optional(),
input: z
.array(
z.union([z.literal("text"), z.literal("image"), z.literal("video"), z.literal("audio")]),
)
.optional(),
cost: z
.object({
input: z.number().optional(),

View File

@@ -41,7 +41,7 @@ export type LmstudioModelBase = {
trainedForToolUse: boolean;
loaded: boolean;
reasoning: boolean;
input: ModelDefinitionConfig["input"];
input: Array<"text" | "image">;
cost: ModelDefinitionConfig["cost"];
contextWindow: number;
contextTokens: number;

View File

@@ -1,6 +1,7 @@
import { describe, expect, it } from "vitest";
import {
applyProviderNativeStreamingUsageCompat,
readConfiguredProviderCatalogEntries,
supportsNativeStreamingUsageCompat,
} from "./provider-catalog-shared.js";
import type { ModelDefinitionConfig } from "./provider-model-shared.js";
@@ -54,3 +55,43 @@ describe("provider-catalog-shared native streaming usage compat", () => {
expect(provider.models?.[1]?.compat?.supportsUsageInStreaming).toBe(false);
});
});
describe("provider-catalog-shared configured catalog entries", () => {
  it("preserves configured audio and video input modalities", () => {
    // A configured model whose `input` lists every supported modality,
    // including the `video`/`audio` values accepted since #20721.
    const configuredModel = {
      id: "google/gemini-3-pro-preview",
      name: "Gemini 3 Pro Preview",
      input: ["text", "image", "video", "audio"],
      reasoning: true,
      cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
      contextWindow: 1048576,
      maxTokens: 65536,
    };
    const entries = readConfiguredProviderCatalogEntries({
      providerId: "kilocode",
      config: {
        models: {
          providers: {
            kilocode: {
              baseUrl: "https://api.kilo.ai/api/gateway/",
              api: "openai-completions",
              models: [configuredModel],
            },
          },
        },
      },
    });
    // Catalog normalization must not drop any of the four modalities.
    expect(entries).toEqual([
      {
        provider: "kilocode",
        id: "google/gemini-3-pro-preview",
        name: "Gemini 3 Pro Preview",
        input: ["text", "image", "video", "audio"],
        reasoning: true,
        contextWindow: 1048576,
      },
    ]);
  });
});

View File

@@ -23,7 +23,7 @@ export type ConfiguredProviderCatalogEntry = {
provider: string;
contextWindow?: number;
reasoning?: boolean;
input?: Array<"text" | "image" | "document">;
input?: Array<"text" | "image" | "audio" | "video" | "document">;
};
function normalizeConfiguredCatalogModelInput(
@@ -33,8 +33,12 @@ function normalizeConfiguredCatalogModelInput(
return undefined;
}
const normalized = input.filter(
(item): item is "text" | "image" | "document" =>
item === "text" || item === "image" || item === "document",
(item): item is "text" | "image" | "audio" | "video" | "document" =>
item === "text" ||
item === "image" ||
item === "audio" ||
item === "video" ||
item === "document",
);
return normalized.length > 0 ? normalized : undefined;
}