fix(otel): add agent label to token metrics

2026-05-03 06:57:09 +02:00 · 2026-04-25 20:45:01 -07:00
parent d251932fcf
commit 44da034516
6 changed files with 41 additions and 0 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -42,6 +42,7 @@ Docs: https://docs.openclaw.ai
 - Diagnostics/OTEL: export existing tool-loop diagnostics as `openclaw.tool.loop` counters and spans without loop messages, session identifiers, params, or tool output. Thanks @vincentkoc.
 - Diagnostics/OTEL: export diagnostic memory samples and pressure as bounded memory histograms, counters, and pressure spans to help spot leak regressions without session or payload data. Thanks @vincentkoc.
 - Diagnostics/OTEL: add the GenAI `gen_ai.client.token.usage` histogram for input/output model usage while keeping session identifiers and aggregate cache counters out of the semantic metric. Thanks @vincentkoc.
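+- Diagnostics/OTEL: add a bounded `openclaw.agent` label to OpenClaw token metrics so per-agent Grafana dashboards can group usage without exporting session identifiers; agent ids that are not safe low-cardinality values (for example, credential-like strings) are exported as `unknown`. Thanks @oc-factus.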
 - Plugins/install: consolidate managed plugin install metadata into the state-managed plugin index at `plugins/installs.json`, replacing the temporary `plugins/installed-index.json` path and removing `plugins.installs` as an authored config surface. Thanks @vincentkoc and @shakkernerd.
 - Diagnostics/OTEL: add the GenAI `gen_ai.client.operation.duration` histogram for model-call latency in seconds with bounded provider/model/API and error attributes. Thanks @vincentkoc.
 - Diagnostics/OTEL: add GenAI usage token attributes to model-usage spans, including cache read/write input token counts without session identifiers or prompt/response content. Thanks @vincentkoc.
--- a/extensions/diagnostics-otel/src/service.test.ts
+++ b/extensions/diagnostics-otel/src/service.test.ts
@@ -896,6 +896,7 @@ describe("diagnostics-otel service", () => {
      type: "model.usage",
      sessionKey: "session-key",
      channel: "webchat",
+      agentId: "ops",
      provider: "openai",
      model: "gpt-5.4",
      usage: {
@@ -919,6 +920,14 @@ describe("diagnostics-otel service", () => {
      }),
    );
    const genAiTokenUsage = telemetryState.histograms.get("gen_ai.client.token.usage");
+    const tokens = telemetryState.counters.get("openclaw.tokens");
+    expect(tokens?.add).toHaveBeenCalledWith(12, {
+      "openclaw.channel": "webchat",
+      "openclaw.agent": "ops",
+      "openclaw.provider": "openai",
+      "openclaw.model": "gpt-5.4",
+      "openclaw.token": "input",
+    });
    expect(genAiTokenUsage?.record).toHaveBeenCalledTimes(2);
    expect(genAiTokenUsage?.record).toHaveBeenCalledWith(12, {
      "gen_ai.operation.name": "chat",
@@ -936,6 +945,33 @@ describe("diagnostics-otel service", () => {
    await service.stop?.(ctx);
  });

+  test("bounds agent identifiers on model usage metric attributes", async () => {
+    const service = createDiagnosticsOtelService();
+    const ctx = createOtelContext(OTEL_TEST_ENDPOINT, { metrics: true });
+    await service.start(ctx);
+
+    emitDiagnosticEvent({
+      type: "model.usage",
+      agentId: "Bearer sk-test-secret-value",
+      provider: "openai",
+      model: "gpt-5.4",
+      usage: { input: 2 },
+    });
+    await flushDiagnosticEvents();
+
+    expect(telemetryState.counters.get("openclaw.tokens")?.add).toHaveBeenCalledWith(2, {
+      "openclaw.channel": "unknown",
+      "openclaw.agent": "unknown",
+      "openclaw.provider": "openai",
+      "openclaw.model": "gpt-5.4",
+      "openclaw.token": "input",
+    });
+    expect(
+      JSON.stringify(telemetryState.counters.get("openclaw.tokens")?.add.mock.calls),
+    ).not.toContain("sk-test-secret-value");
+    await service.stop?.(ctx);
+  });
+
  test("keeps GenAI token usage metric model attribute present when model is unavailable", async () => {
    const service = createDiagnosticsOtelService();
    const ctx = createOtelContext(OTEL_TEST_ENDPOINT, { metrics: true });
--- a/extensions/diagnostics-otel/src/service.ts
+++ b/extensions/diagnostics-otel/src/service.ts
@@ -999,6 +999,7 @@ export function createDiagnosticsOtelService(): OpenClawPluginService {
      ) => {
        const attrs = {
          "openclaw.channel": evt.channel ?? "unknown",
+          "openclaw.agent": lowCardinalityAttr(evt.agentId),
          "openclaw.provider": evt.provider ?? "unknown",
          "openclaw.model": evt.model ?? "unknown",
        };
--- a/src/auto-reply/reply/agent-runner.misc.runreplyagent.test.ts
+++ b/src/auto-reply/reply/agent-runner.misc.runreplyagent.test.ts
@@ -334,6 +334,7 @@ describe("runReplyAgent auto-compaction token update", () => {

    expect(usageEvent).toMatchObject({
      type: "model.usage",
+      agentId: "main",
      usage: {
        input: 75_000,
        output: 5_000,
--- a/src/auto-reply/reply/agent-runner.ts
+++ b/src/auto-reply/reply/agent-runner.ts
@@ -1406,6 +1406,7 @@ export async function runReplyAgent(params: {
        sessionKey,
        sessionId: followupRun.run.sessionId,
        channel: replyToChannel,
+        agentId: followupRun.run.agentId,
        provider: providerUsed,
        model: modelUsed,
        usage: {
--- a/src/infra/diagnostic-events.ts
+++ b/src/infra/diagnostic-events.ts
@@ -18,6 +18,7 @@ export type DiagnosticUsageEvent = DiagnosticBaseEvent & {
  sessionKey?: string;
  sessionId?: string;
  channel?: string;
+  agentId?: string;
  provider?: string;
  model?: string;
  usage: {