fix: route comfy music through shared tool

This commit is contained in:
Peter Steinberger
2026-04-06 02:01:03 +01:00
parent 9dfa4db76b
commit a9f491310c
11 changed files with 242 additions and 79 deletions
+4 -5
View File
@@ -13,8 +13,7 @@ OpenClaw ships a bundled `comfy` plugin for workflow-driven ComfyUI runs.
- Provider: `comfy`
- Models: `comfy/workflow`
- Shared surfaces: `image_generate`, `video_generate`
- Plugin tool: `music_generate`
- Shared surfaces: `image_generate`, `video_generate`, `music_generate`
- Auth: none for local ComfyUI; `COMFY_API_KEY` or `COMFY_CLOUD_API_KEY` for Comfy Cloud
- API: ComfyUI `/prompt` / `/history` / `/view` and Comfy Cloud `/api/*`
@@ -24,7 +23,7 @@ OpenClaw ships a bundled `comfy` plugin for workflow-driven ComfyUI runs.
- Image editing with 1 uploaded reference image
- Video generation from a workflow JSON
- Video generation with 1 uploaded reference image
- Music or audio generation through the bundled `music_generate` tool
- Music or audio generation through the shared `music_generate` tool
- Output download from a configured node or all matching output nodes
The bundled plugin is workflow-driven, so OpenClaw does not try to map generic
@@ -162,8 +161,8 @@ the configured graph. OpenClaw does not pass input videos into Comfy workflows.
## Music workflows
The bundled plugin registers a `music_generate` tool for workflow-defined audio
or music outputs:
The bundled plugin registers a music-generation provider for workflow-defined
audio or music outputs, surfaced through the shared `music_generate` tool:
```text
/tool music_generate prompt="Warm ambient synth loop with soft tape texture"
+1 -1
View File
@@ -106,7 +106,7 @@ Plugins can register additional tools. Some examples:
- [Lobster](/tools/lobster) — typed workflow runtime with resumable approvals
- [LLM Task](/tools/llm-task) — JSON-only LLM step for structured output
- [Music Generation](/tools/music-generation) — shared `music_generate` tool plus plugin-provided workflow variants
- [Music Generation](/tools/music-generation) — shared `music_generate` tool with workflow-backed providers
- [Diffs](/tools/diffs) — diff viewer and renderer
- [OpenProse](/prose) — markdown-first workflow orchestration
+14 -24
View File
@@ -1,5 +1,5 @@
---
summary: "Generate music with shared providers or plugin-provided workflows"
summary: "Generate music with shared providers, including workflow-backed plugins"
read_when:
- Generating music or audio via the agent
- Configuring music generation providers and models
@@ -9,10 +9,9 @@ title: "Music Generation"
# Music Generation
The `music_generate` tool lets the agent create music or audio through either:
- the shared music-generation capability with configured providers such as Google and MiniMax
- plugin-provided tool surfaces such as a workflow-configured ComfyUI graph
The `music_generate` tool lets the agent create music or audio through the
shared music-generation capability with configured providers such as Google,
MiniMax, and workflow-configured ComfyUI.
For shared provider-backed agent sessions, OpenClaw starts music generation as a
background task, tracks it in the task ledger, then wakes the agent again when
@@ -23,10 +22,6 @@ original channel.
The built-in shared tool only appears when at least one music-generation provider is available. If you don't see `music_generate` in your agent's tools, configure `agents.defaults.musicGenerationModel` or set up a provider API key.
</Note>
<Note>
Plugin-provided `music_generate` implementations can expose different parameters or runtime behavior. The async task/status flow below applies to the built-in shared provider-backed path.
</Note>
## Quick start
### Shared provider-backed generation
@@ -66,10 +61,10 @@ Generate a cinematic piano track with soft strings and no vocals.
Generate an energetic chiptune loop about launching a rocket at sunrise.
```
### Workflow-driven plugin generation
### Workflow-driven Comfy generation
The bundled `comfy` plugin can also provide `music_generate` using a
workflow-configured ComfyUI graph.
The bundled `comfy` plugin integrates with the shared `music_generate` tool
through the music-generation provider registry.
1. Configure `models.providers.comfy.music` with a workflow JSON and
prompt/output nodes.
@@ -84,16 +79,11 @@ Example:
## Shared bundled provider support
| Provider | Default model | Reference inputs | Supported controls | API key |
| -------- | ---------------------- | ---------------- | --------------------------------------------------------- | ---------------------------------- |
| Google | `lyria-3-clip-preview` | Up to 10 images | `lyrics`, `instrumental`, `format` | `GEMINI_API_KEY`, `GOOGLE_API_KEY` |
| MiniMax | `music-2.5+` | None | `lyrics`, `instrumental`, `durationSeconds`, `format=mp3` | `MINIMAX_API_KEY` |
## Plugin-provided support
| Provider | Model | Notes |
| -------- | ---------- | ------------------------------- |
| ComfyUI | `workflow` | Workflow-defined music or audio |
| Provider | Default model | Reference inputs | Supported controls | API key |
| -------- | ---------------------- | ---------------- | --------------------------------------------------------- | -------------------------------------- |
| ComfyUI | `workflow` | Up to 1 image | Workflow-defined music or audio | `COMFY_API_KEY`, `COMFY_CLOUD_API_KEY` |
| Google | `lyria-3-clip-preview` | Up to 10 images | `lyrics`, `instrumental`, `format` | `GEMINI_API_KEY`, `GOOGLE_API_KEY` |
| MiniMax | `music-2.5+` | None | `lyrics`, `instrumental`, `durationSeconds`, `format=mp3` | `MINIMAX_API_KEY` |
Use `action: "list"` to inspect available shared providers and models at
runtime:
@@ -129,8 +119,8 @@ Direct generation example:
| `format` | string | Output format hint (`mp3` or `wav`) when the provider supports it |
| `filename` | string | Output filename hint |
Not all providers or plugins support all parameters. The shared built-in tool
validates provider capability limits before it submits the request.
Not all providers support all parameters. The shared tool validates provider
capability limits before it submits the request.
## Async behavior for the shared provider-backed path
+20 -43
View File
@@ -1,4 +1,4 @@
import { beforeAll, describe, expect, it, vi } from "vitest";
import { beforeAll, describe, expect, it } from "vitest";
import { resolveOpenClawAgentDir } from "../../src/agents/agent-paths.js";
import { isLiveTestEnabled } from "../../src/agents/live-test-helpers.js";
import { loadConfig } from "../../src/config/config.js";
@@ -10,15 +10,10 @@ const LIVE =
isLiveTestEnabled(["COMFY_LIVE_TEST"]) && (process.env.COMFY_LIVE_TEST ?? "").trim() === "1";
const describeLive = LIVE ? describe : describe.skip;
type RegisteredTool = {
name: string;
execute: (
id: string,
params: Record<string, unknown>,
) => Promise<{
content: Array<{ type: string; text?: string }>;
details?: unknown;
}>;
type RegisteredMusicProvider = {
id: string;
generateMusic: Function;
isConfigured?: Function;
};
function withPluginsEnabled<T>(cfg: T): T {
@@ -40,23 +35,9 @@ describeLive("comfy live", () => {
let agentDir = "";
const imageProviders: Array<{ id: string; generateImage: Function; isConfigured?: Function }> =
[];
const musicProviders: RegisteredMusicProvider[] = [];
const videoProviders: Array<{ id: string; generateVideo: Function; isConfigured?: Function }> =
[];
const tools: RegisteredTool[] = [];
const saveMediaBuffer = vi.fn(
async (
_buffer: Buffer,
_mimeType: string,
_subdir?: string,
_maxBytes?: number,
originalFilename?: string,
) => ({
path: `/tmp/${originalFilename ?? "generated.bin"}`,
id: "saved-1",
mimeType: _mimeType,
bytes: _buffer.byteLength,
}),
);
beforeAll(async () => {
cfg = withPluginsEnabled(loadConfig());
@@ -64,22 +45,15 @@ describeLive("comfy live", () => {
await plugin.register(
createTestPluginApi({
config: cfg as never,
runtime: {
channel: {
media: {
saveMediaBuffer,
},
},
} as never,
registerImageGenerationProvider(provider) {
imageProviders.push(provider as never);
},
registerMusicGenerationProvider(provider) {
musicProviders.push(provider as never);
},
registerVideoGenerationProvider(provider) {
videoProviders.push(provider as never);
},
registerTool(tool) {
tools.push(tool as RegisteredTool);
},
}),
);
});
@@ -123,17 +97,20 @@ describeLive("comfy live", () => {
);
it.skipIf(!isComfyCapabilityConfigured({ cfg: cfg as never, agentDir, capability: "music" }))(
"runs a music workflow tool",
"runs a music workflow",
async () => {
const tool = tools.find((entry) => entry.name === "music_generate");
expect(tool).toBeDefined();
const result = await tool!.execute("music-live", {
const provider = musicProviders.find((entry) => entry.id === "comfy");
expect(provider).toBeDefined();
const result = await provider!.generateMusic({
provider: "comfy",
model: "workflow",
prompt: "A gentle ambient synth loop with warm analog pads.",
filename: "comfy-live.mp3",
cfg: cfg as never,
agentDir,
});
const text = result.content.find((entry) => entry.type === "text")?.text ?? "";
expect(text).toContain("MEDIA:/tmp/comfy-live.mp3");
expect(saveMediaBuffer).toHaveBeenCalled();
expect(result.tracks.length).toBeGreaterThan(0);
expect(result.tracks[0]?.mimeType.startsWith("audio/")).toBe(true);
expect(result.tracks[0]?.buffer.byteLength).toBeGreaterThan(512);
},
180_000,
);
+2 -2
View File
@@ -1,6 +1,6 @@
import { definePluginEntry } from "openclaw/plugin-sdk/plugin-entry";
import { buildComfyImageGenerationProvider } from "./image-generation-provider.js";
import { createComfyMusicGenerateTool } from "./music-generate-tool.js";
import { buildComfyMusicGenerationProvider } from "./music-generation-provider.js";
import { buildComfyVideoGenerationProvider } from "./video-generation-provider.js";
const PROVIDER_ID = "comfy";
@@ -18,7 +18,7 @@ export default definePluginEntry({
auth: [],
});
api.registerImageGenerationProvider(buildComfyImageGenerationProvider());
api.registerMusicGenerationProvider(buildComfyMusicGenerationProvider());
api.registerVideoGenerationProvider(buildComfyVideoGenerationProvider());
api.registerTool(createComfyMusicGenerateTool(api));
},
});
@@ -0,0 +1,93 @@
import { describe, expect, it, vi } from "vitest";
import { buildComfyMusicGenerationProvider } from "./music-generation-provider.js";
import { _setComfyFetchGuardForTesting } from "./workflow-runtime.js";
// Mock standing in for the fetch guard. It is created through vi.hoisted and
// handed to the workflow runtime by the second test below via
// _setComfyFetchGuardForTesting.
const { fetchWithSsrFGuardMock } = vi.hoisted(() => ({
fetchWithSsrFGuardMock: vi.fn(),
}));
// Unit tests for the provider object returned by buildComfyMusicGenerationProvider.
describe("comfy music-generation provider", () => {
// Static metadata: default model, advertised model list, and the
// reference-image limit exposed through capabilities.
it("registers the workflow model", () => {
const provider = buildComfyMusicGenerationProvider();
expect(provider.defaultModel).toBe("workflow");
expect(provider.models).toEqual(["workflow"]);
expect(provider.capabilities.maxInputImages).toBe(1);
});
// End-to-end run of generateMusic against three queued guard responses.
// Presumably these correspond to the submit, history, and download requests
// made by the workflow runtime, in that order — confirm against workflow-runtime.
// NOTE(review): the guard installed here is not restored after the test —
// confirm whether a reset is needed to avoid leaking into other tests.
it("runs a music workflow and returns audio outputs", async () => {
_setComfyFetchGuardForTesting(fetchWithSsrFGuardMock);
fetchWithSsrFGuardMock
// 1st response: JSON body carrying the prompt id of the queued job.
.mockResolvedValueOnce({
response: new Response(JSON.stringify({ prompt_id: "music-job-1" }), {
status: 200,
headers: { "content-type": "application/json" },
}),
release: vi.fn(async () => {}),
})
// 2nd response: JSON keyed by the prompt id, listing one audio output on node "9".
.mockResolvedValueOnce({
response: new Response(
JSON.stringify({
"music-job-1": {
outputs: {
"9": {
audio: [{ filename: "song.mp3", subfolder: "", type: "output" }],
},
},
},
}),
{
status: 200,
headers: { "content-type": "application/json" },
},
),
release: vi.fn(async () => {}),
})
// 3rd response: the raw audio bytes served with an audio/mpeg content type.
.mockResolvedValueOnce({
response: new Response(Buffer.from("music-bytes"), {
status: 200,
headers: { "content-type": "audio/mpeg" },
}),
release: vi.fn(async () => {}),
});
const provider = buildComfyMusicGenerationProvider();
// Minimal inline config: a two-node workflow where node "6" is the prompt
// node and node "9" is the output node.
const result = await provider.generateMusic({
provider: "comfy",
model: "workflow",
prompt: "gentle ambient synth loop",
cfg: {
models: {
providers: {
comfy: {
music: {
workflow: {
"6": { inputs: { text: "" } },
"9": { inputs: {} },
},
promptNodeId: "6",
outputNodeId: "9",
},
},
},
},
} as never,
});
// The result should carry one track plus metadata describing the run;
// no reference image was supplied, so inputImageCount is 0.
expect(result).toMatchObject({
model: "workflow",
tracks: [
{
mimeType: "audio/mpeg",
fileName: "song.mp3",
},
],
metadata: {
promptId: "music-job-1",
outputNodeIds: ["9"],
inputImageCount: 0,
},
});
// The track buffer must be exactly the bytes from the third mocked response.
expect(result.tracks[0]?.buffer).toEqual(Buffer.from("music-bytes"));
});
});
@@ -0,0 +1,84 @@
import type {
GeneratedMusicAsset,
MusicGenerationProvider,
MusicGenerationSourceImage,
} from "openclaw/plugin-sdk/music-generation";
import {
DEFAULT_COMFY_MODEL,
isComfyCapabilityConfigured,
runComfyWorkflow,
} from "./workflow-runtime.js";
const COMFY_MAX_INPUT_IMAGES = 1;
/**
 * Maps a workflow asset onto the `GeneratedMusicAsset` shape.
 *
 * Only `buffer`, `mimeType`, and `fileName` are carried over; any other
 * properties present on the input object are left behind.
 *
 * @param asset - Asset bytes together with their MIME type and file name.
 * @returns A new object holding the same three fields.
 */
function toGeneratedTrack(asset: {
  buffer: Buffer;
  mimeType: string;
  fileName: string;
}): GeneratedMusicAsset {
  const { buffer, mimeType, fileName } = asset;
  return { buffer, mimeType, fileName };
}
/**
 * Normalizes an optional reference image into a plain
 * `{ buffer, mimeType, fileName }` object.
 *
 * @param inputImage - The reference image, or `undefined` when none was given.
 * @returns `undefined` when no image was given; otherwise the image bytes, the
 *   MIME type (falling back to `"image/png"` when absent), and the file name.
 * @throws Error when an image is given but carries no `buffer`.
 */
function resolveInputImage(inputImage: MusicGenerationSourceImage | undefined) {
  if (inputImage) {
    const { buffer, mimeType, fileName } = inputImage;
    if (buffer) {
      return {
        buffer,
        mimeType: mimeType ?? "image/png",
        fileName,
      };
    }
    throw new Error("Comfy music generation requires loaded reference image bytes.");
  }
  return undefined;
}
/**
 * Builds the `comfy` music-generation provider.
 *
 * The provider advertises a single model (`DEFAULT_COMFY_MODEL`), reports
 * itself as configured when the "music" Comfy capability is configured, and
 * accepts at most `COMFY_MAX_INPUT_IMAGES` reference images per request.
 *
 * `generateMusic` runs the Comfy workflow with the "music" capability,
 * requesting audio outputs, and returns the resulting assets as tracks along
 * with the prompt id, output node ids, and the number of input images.
 *
 * @returns The provider object to register with the plugin API.
 */
export function buildComfyMusicGenerationProvider(): MusicGenerationProvider {
  const provider: MusicGenerationProvider = {
    id: "comfy",
    label: "ComfyUI",
    defaultModel: DEFAULT_COMFY_MODEL,
    models: [DEFAULT_COMFY_MODEL],
    isConfigured: ({ cfg, agentDir }) => {
      return isComfyCapabilityConfigured({ cfg, agentDir, capability: "music" });
    },
    capabilities: { maxInputImages: COMFY_MAX_INPUT_IMAGES },
    async generateMusic(req) {
      // Reject oversized requests before touching the workflow runtime.
      const tooManyImages = (req.inputImages?.length ?? 0) > COMFY_MAX_INPUT_IMAGES;
      if (tooManyImages) {
        throw new Error(
          `Comfy music generation supports at most ${COMFY_MAX_INPUT_IMAGES} reference image.`,
        );
      }

      const { assets, model, promptId, outputNodeIds } = await runComfyWorkflow({
        cfg: req.cfg,
        agentDir: req.agentDir,
        authStore: req.authStore,
        prompt: req.prompt,
        model: req.model,
        capability: "music",
        outputKinds: ["audio"],
        // Only the first reference image is forwarded to the workflow.
        inputImage: resolveInputImage(req.inputImages?.[0]),
      });

      const tracks = assets.map(toGeneratedTrack);
      return {
        tracks,
        model,
        metadata: {
          promptId,
          outputNodeIds,
          inputImageCount: req.inputImages?.length ?? 0,
        },
      };
    },
  };
  return provider;
}
+2 -2
View File
@@ -7,8 +7,8 @@
},
"contracts": {
"imageGenerationProviders": ["comfy"],
"videoGenerationProviders": ["comfy"],
"tools": ["music_generate"]
"musicGenerationProviders": ["comfy"],
"videoGenerationProviders": ["comfy"]
},
"configSchema": {
"type": "object",
@@ -4,8 +4,8 @@ describePluginRegistrationContract({
pluginId: "comfy",
providerIds: ["comfy"],
imageGenerationProviderIds: ["comfy"],
musicGenerationProviderIds: ["comfy"],
videoGenerationProviderIds: ["comfy"],
toolNames: ["music_generate"],
requireGenerateImage: true,
requireGenerateVideo: true,
});
@@ -17,8 +17,8 @@ export const pluginRegistrationContractCases = {
pluginId: "comfy",
providerIds: ["comfy"],
imageGenerationProviderIds: ["comfy"],
musicGenerationProviderIds: ["comfy"],
videoGenerationProviderIds: ["comfy"],
toolNames: ["music_generate"],
requireGenerateImage: true,
requireGenerateVideo: true,
},
@@ -2,6 +2,7 @@ import { describe, expect, it } from "vitest";
import {
imageGenerationProviderContractRegistry,
mediaUnderstandingProviderContractRegistry,
musicGenerationProviderContractRegistry,
pluginRegistrationContractRegistry,
speechProviderContractRegistry,
videoGenerationProviderContractRegistry,
@@ -19,6 +20,7 @@ type PluginRegistrationContractParams = {
mediaUnderstandingProviderIds?: string[];
imageGenerationProviderIds?: string[];
videoGenerationProviderIds?: string[];
musicGenerationProviderIds?: string[];
toolNames?: string[];
requireSpeechVoices?: boolean;
requireDescribeImages?: boolean;
@@ -110,6 +112,13 @@ function findVideoGenerationProvider(pluginId: string) {
return entry.provider;
}
/**
 * Collects the ids of all music-generation providers that the contract
 * registry attributes to the given plugin.
 *
 * @param pluginId - Plugin whose registry entries should be selected.
 * @returns The matching provider ids as a new array, ordered by `localeCompare`.
 */
function findMusicGenerationProviderIds(pluginId: string) {
  const ownedEntries = musicGenerationProviderContractRegistry.filter(
    (candidate) => candidate.pluginId === pluginId,
  );
  const providerIds = ownedEntries.map((candidate) => candidate.provider.id);
  return providerIds.toSorted((a, b) => a.localeCompare(b));
}
export function describePluginRegistrationContract(params: PluginRegistrationContractParams) {
describe(`${params.pluginId} plugin registration contract`, () => {
if (params.providerIds) {
@@ -192,6 +201,17 @@ export function describePluginRegistrationContract(params: PluginRegistrationCon
});
}
if (params.musicGenerationProviderIds) {
it("keeps bundled music-generation ownership explicit", () => {
expect(findRegistration(params.pluginId).musicGenerationProviderIds).toEqual(
params.musicGenerationProviderIds,
);
expect(findMusicGenerationProviderIds(params.pluginId)).toEqual(
params.musicGenerationProviderIds,
);
});
}
if (params.toolNames) {
it("keeps bundled tool ownership explicit", () => {
expect(findRegistration(params.pluginId).toolNames).toEqual(params.toolNames);