fix(otel): add agent label to token metrics

This commit is contained in:
Vincent Koc
2026-04-25 20:45:01 -07:00
parent d251932fcf
commit 44da034516
6 changed files with 41 additions and 0 deletions

View File

@@ -42,6 +42,7 @@ Docs: https://docs.openclaw.ai
- Diagnostics/OTEL: export existing tool-loop diagnostics as `openclaw.tool.loop` counters and spans without loop messages, session identifiers, params, or tool output. Thanks @vincentkoc.
- Diagnostics/OTEL: export diagnostic memory samples and pressure as bounded memory histograms, counters, and pressure spans to help spot leak regressions without session or payload data. Thanks @vincentkoc.
- Diagnostics/OTEL: add the GenAI `gen_ai.client.token.usage` histogram for input/output model usage while keeping session identifiers and aggregate cache counters out of the semantic metric. Thanks @vincentkoc.
- Diagnostics/OTEL: add a bounded `openclaw.agent` label to OpenClaw token metrics so per-agent Grafana dashboards can group usage without exporting session identifiers. Thanks @oc-factus.
- Plugins/install: consolidate managed plugin install metadata into the state-managed plugin index at `plugins/installs.json`, replacing the temporary `plugins/installed-index.json` path and removing `plugins.installs` as an authored config surface. Thanks @vincentkoc and @shakkernerd.
- Diagnostics/OTEL: add the GenAI `gen_ai.client.operation.duration` histogram for model-call latency in seconds with bounded provider/model/API and error attributes. Thanks @vincentkoc.
- Diagnostics/OTEL: add GenAI usage token attributes to model-usage spans, including cache read/write input token counts without session identifiers or prompt/response content. Thanks @vincentkoc.

View File

@@ -896,6 +896,7 @@ describe("diagnostics-otel service", () => {
type: "model.usage",
sessionKey: "session-key",
channel: "webchat",
agentId: "ops",
provider: "openai",
model: "gpt-5.4",
usage: {
@@ -919,6 +920,14 @@ describe("diagnostics-otel service", () => {
}),
);
const genAiTokenUsage = telemetryState.histograms.get("gen_ai.client.token.usage");
const tokens = telemetryState.counters.get("openclaw.tokens");
expect(tokens?.add).toHaveBeenCalledWith(12, {
"openclaw.channel": "webchat",
"openclaw.agent": "ops",
"openclaw.provider": "openai",
"openclaw.model": "gpt-5.4",
"openclaw.token": "input",
});
expect(genAiTokenUsage?.record).toHaveBeenCalledTimes(2);
expect(genAiTokenUsage?.record).toHaveBeenCalledWith(12, {
"gen_ai.operation.name": "chat",
@@ -936,6 +945,33 @@ describe("diagnostics-otel service", () => {
await service.stop?.(ctx);
});
// A credential-shaped agentId must never surface in the exported token metric attributes.
test("bounds agent identifiers on model usage metric attributes", async () => {
  const otelService = createDiagnosticsOtelService();
  const otelCtx = createOtelContext(OTEL_TEST_ENDPOINT, { metrics: true });
  await otelService.start(otelCtx);

  // The event deliberately omits `channel` and carries an agentId that looks like a bearer token.
  emitDiagnosticEvent({
    type: "model.usage",
    agentId: "Bearer sk-test-secret-value",
    provider: "openai",
    model: "gpt-5.4",
    usage: { input: 2 },
  });
  await flushDiagnosticEvents();

  const tokenCounter = telemetryState.counters.get("openclaw.tokens");

  // Both the missing channel and the rejected agent id are expected to collapse to "unknown".
  const expectedAttrs = {
    "openclaw.channel": "unknown",
    "openclaw.agent": "unknown",
    "openclaw.provider": "openai",
    "openclaw.model": "gpt-5.4",
    "openclaw.token": "input",
  };
  expect(tokenCounter?.add).toHaveBeenCalledWith(2, expectedAttrs);

  // Scan every recorded call, not just the matched one, for any trace of the secret.
  const recordedCalls = JSON.stringify(tokenCounter?.add.mock.calls);
  expect(recordedCalls).not.toContain("sk-test-secret-value");

  await otelService.stop?.(otelCtx);
});
test("keeps GenAI token usage metric model attribute present when model is unavailable", async () => {
const service = createDiagnosticsOtelService();
const ctx = createOtelContext(OTEL_TEST_ENDPOINT, { metrics: true });

View File

@@ -999,6 +999,7 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
) => {
const attrs = {
"openclaw.channel": evt.channel ?? "unknown",
"openclaw.agent": lowCardinalityAttr(evt.agentId),
"openclaw.provider": evt.provider ?? "unknown",
"openclaw.model": evt.model ?? "unknown",
};

View File

@@ -334,6 +334,7 @@ describe("runReplyAgent auto-compaction token update", () => {
expect(usageEvent).toMatchObject({
type: "model.usage",
agentId: "main",
usage: {
input: 75_000,
output: 5_000,

View File

@@ -1406,6 +1406,7 @@ export async function runReplyAgent(params: {
sessionKey,
sessionId: followupRun.run.sessionId,
channel: replyToChannel,
agentId: followupRun.run.agentId,
provider: providerUsed,
model: modelUsed,
usage: {

View File

@@ -18,6 +18,7 @@ export type DiagnosticUsageEvent = DiagnosticBaseEvent & {
sessionKey?: string;
sessionId?: string;
channel?: string;
agentId?: string;
provider?: string;
model?: string;
usage: {