diff --git a/packages/pi-llm-performance/README.md b/packages/pi-llm-performance/README.md new file mode 100644 index 0000000..d397fe2 --- /dev/null +++ b/packages/pi-llm-performance/README.md @@ -0,0 +1,30 @@ +# pi-llm-performance + +LLM performance metrics extension + +## How to install + +Add to your global pi settings: + +```bash +pi install /Users/willem/dev/spikes/llm/custom-coding-agent/packages/pi-llm-performance +``` + +Or add manually to `~/.pi/agent/settings.json`: + +``` + "packages": [ + "/Users/willem/dev/spikes/llm/custom-coding-agent/packages/pi-llm-performance", + ... + ] +``` + +Then reload pi: + +```bash +/reload +``` + +## License + +MIT diff --git a/packages/pi-llm-performance/deno.lock b/packages/pi-llm-performance/deno.lock new file mode 100644 index 0000000..84f8171 --- /dev/null +++ b/packages/pi-llm-performance/deno.lock @@ -0,0 +1,18 @@ +{ + "version": "5", + "specifiers": { + "jsr:@std/assert@*": "1.0.19", + "jsr:@std/internal@^1.0.12": "1.0.12" + }, + "jsr": { + "@std/assert@1.0.19": { + "integrity": "eaada96ee120cb980bc47e040f82814d786fe8162ecc53c91d8df60b8755991e", + "dependencies": [ + "jsr:@std/internal" + ] + }, + "@std/internal@1.0.12": { + "integrity": "972a634fd5bc34b242024402972cd5143eac68d8dffaca5eaa4dba30ce17b027" + } + } +} diff --git a/packages/pi-llm-performance/package.json b/packages/pi-llm-performance/package.json new file mode 100644 index 0000000..3afe060 --- /dev/null +++ b/packages/pi-llm-performance/package.json @@ -0,0 +1,17 @@ +{ + "name": "pi-llm-performance", + "version": "0.1.0", + "description": "LLM performance metrics extension", + "type": "module", + "exports": { + ".": "./src/llm-performance-metrics.ts" + }, + "keywords": ["pi-package"], + "pi": { + "extensions": ["src/llm-performance-metrics.ts"] + }, + "peerDependencies": { + "@mariozechner/pi-coding-agent": "*" + }, + "license": "MIT" +} diff --git a/packages/pi-llm-performance/src/llm-metrics-core.test.ts 
b/packages/pi-llm-performance/src/llm-metrics-core.test.ts new file mode 100644 index 0000000..18d816e --- /dev/null +++ b/packages/pi-llm-performance/src/llm-metrics-core.test.ts @@ -0,0 +1,398 @@ +import { + calculateTurnMetrics, + aggregatePromptMetrics, + formatMetricsForDisplay, + toLogEntry, + type TurnMetrics, + type PromptMetrics, +} from "./llm-metrics-core.ts"; +import { assertEquals, assertGreaterOrEqual, assertLessOrEqual } from "jsr:@std/assert"; + +Deno.test("calculateTurnMetrics - creates turn metrics object", () => { + const result = calculateTurnMetrics({ + turnId: "turn-1", + inputTokens: 100, + outputTokens: 50, + durationMs: 2000, + timeToFirstTokenMs: 500, + }); + + assertEquals(result.turnId, "turn-1"); + assertEquals(result.inputTokens, 100); + assertEquals(result.outputTokens, 50); + assertEquals(result.durationMs, 2000); + assertEquals(result.timeToFirstTokenMs, 500); +}); + +Deno.test("calculateTurnMetrics - handles missing timeToFirstToken", () => { + const result = calculateTurnMetrics({ + turnId: "turn-1", + inputTokens: 100, + outputTokens: 50, + durationMs: 2000, + }); + + assertEquals(result.timeToFirstTokenMs, undefined); +}); + +Deno.test("aggregatePromptMetrics - aggregates single turn", () => { + const turnMetrics: TurnMetrics[] = [ + { + turnId: "turn-1", + inputTokens: 1000, + outputTokens: 200, + durationMs: 5000, + timeToFirstTokenMs: 800, + }, + ]; + + const result = aggregatePromptMetrics({ + provider: "anthropic", + model: "claude-sonnet-4", + turnMetrics, + }); + + assertEquals(result.provider, "anthropic"); + assertEquals(result.model, "claude-sonnet-4"); + assertEquals(result.turnCount, 1); + assertEquals(result.inputTokens, 1000); + assertEquals(result.outputTokens, 200); + assertEquals(result.totalTokens, 1200); + assertEquals(result.totalDurationMs, 5000); + assertEquals(result.timeToFirstTokenMs, 800); + + // Tokens per second calculations + // prefill: 1000 input tokens / 0.8s TTFT = 1250 tok/s + 
assertEquals(result.prefillTokensPerSec, 1250); + // generation: 200 output tokens / 4.2s (5s - 0.8s) = 47.62 tok/s + assertGreaterOrEqual(result.generationTokensPerSec, 47.6); + assertLessOrEqual(result.generationTokensPerSec, 47.7); + // combined: 1200 total tokens / 5s = 240 tok/s + assertEquals(result.combinedTokensPerSec, 240); +}); + +Deno.test("aggregatePromptMetrics - aggregates multiple turns", () => { + const turnMetrics: TurnMetrics[] = [ + { + turnId: "turn-1", + inputTokens: 1000, + outputTokens: 200, + durationMs: 3000, + timeToFirstTokenMs: 800, + }, + { + turnId: "turn-2", + inputTokens: 500, + outputTokens: 150, + durationMs: 2000, + }, + { + turnId: "turn-3", + inputTokens: 300, + outputTokens: 100, + durationMs: 1500, + }, + ]; + + const result = aggregatePromptMetrics({ + provider: "openai", + model: "gpt-4o", + turnMetrics, + }); + + assertEquals(result.turnCount, 3); + assertEquals(result.inputTokens, 1800); // 1000 + 500 + 300 + assertEquals(result.outputTokens, 450); // 200 + 150 + 100 + assertEquals(result.totalTokens, 2250); + assertEquals(result.totalDurationMs, 6500); // 3000 + 2000 + 1500 + assertEquals(result.timeToFirstTokenMs, 800); // From first turn only + + // Tokens per second: prefill uses TTFT (0.8s), generation uses (total - TTFT) = 5.7s + // prefill: 1800 / 0.8 = 2250 tok/s + assertEquals(result.prefillTokensPerSec, 2250); + // generation: 450 / 5.7 = 78.95 tok/s + assertGreaterOrEqual(result.generationTokensPerSec, 78.9); + assertLessOrEqual(result.generationTokensPerSec, 79.0); + // combined: 2250 / 6.5 = 346.15 tok/s + assertGreaterOrEqual(result.combinedTokensPerSec, 346.1); + assertLessOrEqual(result.combinedTokensPerSec, 346.2); +}); + +Deno.test("aggregatePromptMetrics - handles empty turn list", () => { + const result = aggregatePromptMetrics({ + provider: "anthropic", + model: "claude-sonnet-4", + turnMetrics: [], + }); + + assertEquals(result.turnCount, 0); + assertEquals(result.inputTokens, 0); + 
assertEquals(result.outputTokens, 0); + assertEquals(result.totalTokens, 0); + assertEquals(result.prefillTokensPerSec, 0); + assertEquals(result.generationTokensPerSec, 0); + assertEquals(result.combinedTokensPerSec, 0); + assertEquals(result.totalDurationMs, 0); + assertEquals(result.timeToFirstTokenMs, undefined); +}); + +Deno.test("formatMetricsForDisplay - formats single turn metrics", () => { + const metrics: PromptMetrics = { + provider: "anthropic", + model: "claude-sonnet-4", + turnCount: 1, + inputTokens: 1250, + outputTokens: 342, + totalTokens: 1592, + prefillTokensPerSec: 482.1, + generationTokensPerSec: 18.3, + combinedTokensPerSec: 38.0, + totalDurationMs: 21600, + timeToFirstTokenMs: 850, + turns: [], + }; + + const display = formatMetricsForDisplay(metrics); + + assertEquals(display.includes("anthropic/claude-sonnet-4"), true); + assertEquals(display.includes("1,250 tokens"), true); + assertEquals(display.includes("482.1 tok/s"), true); + assertEquals(display.includes("342 tokens"), true); + assertEquals(display.includes("18.3 tok/s"), true); + assertEquals(display.includes("1,592 tokens"), true); + assertEquals(display.includes("38.0 tok/s"), true); + assertEquals(display.includes("21.6s"), true); + assertEquals(display.includes("TTFT: 850ms"), true); +}); + +Deno.test("formatMetricsForDisplay - formats duration as minutes when over 60s", () => { + const metrics: PromptMetrics = { + provider: "openai", + model: "gpt-4o", + turnCount: 1, + inputTokens: 5000, + outputTokens: 1000, + totalTokens: 6000, + prefillTokensPerSec: 50, + generationTokensPerSec: 10, + combinedTokensPerSec: 60, + totalDurationMs: 120000, // 2 minutes + timeToFirstTokenMs: 1500, + turns: [], + }; + + const display = formatMetricsForDisplay(metrics); + + assertEquals(display.includes("2.0m"), true); +}); + +Deno.test("formatMetricsForDisplay - omits turn count when single turn", () => { + const metrics: PromptMetrics = { + provider: "anthropic", + model: "claude-sonnet-4", + 
turnCount: 1, + inputTokens: 100, + outputTokens: 50, + totalTokens: 150, + prefillTokensPerSec: 20, + generationTokensPerSec: 10, + combinedTokensPerSec: 30, + totalDurationMs: 5000, + timeToFirstTokenMs: 500, + turns: [], + }; + + const display = formatMetricsForDisplay(metrics); + + assertEquals(display.includes("Turns: 1"), false); +}); + +Deno.test("formatMetricsForDisplay - omits prefill/generation when TTFT is unavailable", () => { + const metrics: PromptMetrics = { + provider: "openai", + model: "gpt-4o", + turnCount: 1, + inputTokens: 1000, + outputTokens: 200, + totalTokens: 1200, + prefillTokensPerSec: 0, + generationTokensPerSec: 0, + combinedTokensPerSec: 240, + totalDurationMs: 5000, + timeToFirstTokenMs: undefined, + turns: [], + }; + + const display = formatMetricsForDisplay(metrics); + + assertEquals(display.includes("Prefill:"), false); + assertEquals(display.includes("Generation:"), false); + assertEquals(display.includes("1,200 tokens"), true); + assertEquals(display.includes("240.0 tok/s"), true); +}); + +Deno.test("formatMetricsForDisplay - shows turn count when multiple turns", () => { + const metrics: PromptMetrics = { + provider: "anthropic", + model: "claude-sonnet-4", + turnCount: 3, + inputTokens: 100, + outputTokens: 50, + totalTokens: 150, + prefillTokensPerSec: 20, + generationTokensPerSec: 10, + combinedTokensPerSec: 30, + totalDurationMs: 5000, + timeToFirstTokenMs: 500, + turns: [], + }; + + const display = formatMetricsForDisplay(metrics); + + assertEquals(display.includes("Turns: 3"), true); +}); + +Deno.test("toLogEntry - creates JSON-serializable log entry", () => { + const metrics: PromptMetrics = { + provider: "anthropic", + model: "claude-sonnet-4", + turnCount: 2, + inputTokens: 1250, + outputTokens: 342, + totalTokens: 1592, + prefillTokensPerSec: 482.12345, + generationTokensPerSec: 18.34567, + combinedTokensPerSec: 38.09876, + totalDurationMs: 21600, + timeToFirstTokenMs: 850, + turns: [], + }; + + const logEntry = 
toLogEntry(metrics); + + assertEquals(logEntry.provider, "anthropic"); + assertEquals(logEntry.model, "claude-sonnet-4"); + assertEquals(logEntry.turnCount, 2); + assertEquals(logEntry.inputTokens, 1250); + assertEquals(logEntry.outputTokens, 342); + assertEquals(logEntry.totalTokens, 1592); + // Rounded to 2 decimal places + assertEquals(logEntry.prefillTokensPerSec, 482.12); + assertEquals(logEntry.generationTokensPerSec, 18.35); + assertEquals(logEntry.combinedTokensPerSec, 38.1); + assertEquals(logEntry.totalDurationMs, 21600); + assertEquals(logEntry.timeToFirstTokenMs, 850); + + // Should have ISO timestamp + assertEquals(logEntry.timestamp.includes("T"), true); + assertEquals(logEntry.timestamp.includes("Z"), true); + + // Should be JSON serializable + const json = JSON.stringify(logEntry); + assertEquals(json.length > 0, true); + const parsed = JSON.parse(json); + assertEquals(parsed.provider, "anthropic"); +}); + +Deno.test("aggregatePromptMetrics - uses full duration when TTFT is undefined", () => { + const turnMetrics: TurnMetrics[] = [ + { + turnId: "turn-1", + inputTokens: 1000, + outputTokens: 200, + durationMs: 5000, + // No timeToFirstTokenMs + }, + ]; + + const result = aggregatePromptMetrics({ + provider: "openai", + model: "gpt-4o", + turnMetrics, + }); + + assertEquals(result.turnCount, 1); + assertEquals(result.inputTokens, 1000); + assertEquals(result.outputTokens, 200); + // Without TTFT, prefill and generation rates are 0 (cannot separate phases) + // Only combined rate is meaningful + assertEquals(result.prefillTokensPerSec, 0); + assertEquals(result.generationTokensPerSec, 0); + assertEquals(result.combinedTokensPerSec, 240); +}); + +Deno.test("toLogEntry - handles missing timeToFirstToken", () => { + const metrics: PromptMetrics = { + provider: "anthropic", + model: "claude-sonnet-4", + turnCount: 1, + inputTokens: 100, + outputTokens: 50, + totalTokens: 150, + prefillTokensPerSec: 20, + generationTokensPerSec: 10, + combinedTokensPerSec: 
30, + totalDurationMs: 5000, + timeToFirstTokenMs: undefined, + turns: [], + }; + + const logEntry = toLogEntry(metrics); + + assertEquals(logEntry.timeToFirstTokenMs, undefined); +}); + +Deno.test("Integration - full flow from turns to log entry", () => { + // Simulate a real scenario with multiple turns + const turn1 = calculateTurnMetrics({ + turnId: "turn-1", + inputTokens: 2000, + outputTokens: 500, + durationMs: 8000, + timeToFirstTokenMs: 1200, + }); + + const turn2 = calculateTurnMetrics({ + turnId: "turn-2", + inputTokens: 800, + outputTokens: 200, + durationMs: 3000, + }); + + const promptMetrics = aggregatePromptMetrics({ + provider: "groq", + model: "llama-3.1-70b", + turnMetrics: [turn1, turn2], + }); + + const display = formatMetricsForDisplay(promptMetrics); + const logEntry = toLogEntry(promptMetrics); + + // Verify aggregation + assertEquals(promptMetrics.turnCount, 2); + assertEquals(promptMetrics.inputTokens, 2800); + assertEquals(promptMetrics.outputTokens, 700); + assertEquals(promptMetrics.totalTokens, 3500); + assertEquals(promptMetrics.totalDurationMs, 11000); + assertEquals(promptMetrics.timeToFirstTokenMs, 1200); + + // Verify corrected rate calculations + // prefill: 2800 / 1.2 = 2333.33 tok/s + assertGreaterOrEqual(promptMetrics.prefillTokensPerSec, 2333.3); + assertLessOrEqual(promptMetrics.prefillTokensPerSec, 2333.4); + // generation: 700 / 9.8 = 71.43 tok/s + assertGreaterOrEqual(promptMetrics.generationTokensPerSec, 71.4); + assertLessOrEqual(promptMetrics.generationTokensPerSec, 71.5); + // combined: 3500 / 11 = 318.18 tok/s + assertGreaterOrEqual(promptMetrics.combinedTokensPerSec, 318.1); + assertLessOrEqual(promptMetrics.combinedTokensPerSec, 318.2); + + // Verify display contains key info + assertEquals(display.includes("groq/llama-3.1-70b"), true); + assertEquals(display.includes("TTFT: 1200ms"), true); + + // Verify log entry + assertEquals(logEntry.provider, "groq"); + assertEquals(logEntry.model, "llama-3.1-70b"); + 
assertEquals(logEntry.turnCount, 2); +}); \ No newline at end of file diff --git a/packages/pi-llm-performance/src/llm-metrics-core.ts b/packages/pi-llm-performance/src/llm-metrics-core.ts new file mode 100644 index 0000000..dca82c9 --- /dev/null +++ b/packages/pi-llm-performance/src/llm-metrics-core.ts @@ -0,0 +1,201 @@ +// Functional core for LLM performance metrics calculation + +export interface TurnMetrics { + turnId: string; + inputTokens: number; + outputTokens: number; + durationMs: number; + timeToFirstTokenMs?: number; +} + +export interface PromptMetrics { + provider: string; + model: string; + turnCount: number; + inputTokens: number; + outputTokens: number; + totalTokens: number; + prefillTokensPerSec: number; + generationTokensPerSec: number; + combinedTokensPerSec: number; + totalDurationMs: number; + timeToFirstTokenMs?: number; + turns: TurnMetrics[]; +} + +export interface MetricLogEntry { + timestamp: string; + provider: string; + model: string; + turnCount: number; + inputTokens: number; + outputTokens: number; + totalTokens: number; + prefillTokensPerSec: number; + generationTokensPerSec: number; + combinedTokensPerSec: number; + totalDurationMs: number; + timeToFirstTokenMs?: number; +} + +/** + * Calculate metrics for a single turn + */ +export function calculateTurnMetrics(params: { + turnId: string; + inputTokens: number; + outputTokens: number; + durationMs: number; + timeToFirstTokenMs?: number; +}): TurnMetrics { + return { + turnId: params.turnId, + inputTokens: params.inputTokens, + outputTokens: params.outputTokens, + durationMs: params.durationMs, + timeToFirstTokenMs: params.timeToFirstTokenMs, + }; +} + +/** + * Aggregate multiple turn metrics into prompt-level metrics + */ +export function aggregatePromptMetrics(params: { + provider: string; + model: string; + turnMetrics: TurnMetrics[]; +}): PromptMetrics { + const { provider, model, turnMetrics } = params; + + if (turnMetrics.length === 0) { + return { + provider, + model, + 
turnCount: 0, + inputTokens: 0, + outputTokens: 0, + totalTokens: 0, + prefillTokensPerSec: 0, + generationTokensPerSec: 0, + combinedTokensPerSec: 0, + totalDurationMs: 0, + turns: [], + }; + } + + // Sum tokens across all turns + const inputTokens = turnMetrics.reduce((sum, t) => sum + t.inputTokens, 0); + const outputTokens = turnMetrics.reduce((sum, t) => sum + t.outputTokens, 0); + const totalTokens = inputTokens + outputTokens; + + // Sum duration across all turns + const totalDurationMs = turnMetrics.reduce((sum, t) => sum + t.durationMs, 0); + const totalDurationSec = totalDurationMs / 1000; + + // Time to first token is from the first turn + const timeToFirstTokenMs = turnMetrics[0]?.timeToFirstTokenMs; + + // Calculate tokens per second + // Prefill: input tokens / TTFT duration (prefill phase) + // Generation: output tokens / (totalDuration - TTFT) (generation phase) + // Combined: total tokens / total duration + // When TTFT is unavailable, prefill and generation phases cannot be separated, + // so we set them to 0 and only report combined. + const ttftSec = timeToFirstTokenMs !== undefined ? timeToFirstTokenMs / 1000 : undefined; + const generationDurationSec = timeToFirstTokenMs !== undefined + ? (totalDurationMs - timeToFirstTokenMs) / 1000 + : undefined; + + const prefillTokensPerSec = (ttftSec && ttftSec > 0) ? inputTokens / ttftSec : 0; + const generationTokensPerSec = (generationDurationSec !== undefined && generationDurationSec > 0) + ? outputTokens / generationDurationSec + : 0; + const combinedTokensPerSec = totalDurationSec > 0 ? 
totalTokens / totalDurationSec : 0; + + return { + provider, + model, + turnCount: turnMetrics.length, + inputTokens, + outputTokens, + totalTokens, + prefillTokensPerSec, + generationTokensPerSec, + combinedTokensPerSec, + totalDurationMs, + timeToFirstTokenMs, + turns: turnMetrics, + }; +} + +/** + * Format metrics for TUI display + */ +export function formatMetricsForDisplay(metrics: PromptMetrics): string { + const lines: string[] = []; + + // Header with provider/model + lines.push(`📊 Performance: ${metrics.provider}/${metrics.model}`); + + if (metrics.turnCount === 0) { + lines.push(" No turns recorded"); + return lines.join("\n"); + } + + // Format duration display + const durationSec = metrics.totalDurationMs / 1000; + const durationDisplay = durationSec >= 60 + ? `${(durationSec / 60).toFixed(1)}m` + : `${durationSec.toFixed(1)}s`; + + // Prefill metrics (only when TTFT was available) + if (metrics.prefillTokensPerSec > 0) { + lines.push( + ` Prefill: ${metrics.inputTokens.toLocaleString()} tokens @ ${metrics.prefillTokensPerSec.toFixed(1)} tok/s` + ); + } + + // Generation metrics (only when TTFT was available) + if (metrics.generationTokensPerSec > 0) { + lines.push( + ` Generation: ${metrics.outputTokens.toLocaleString()} tokens @ ${metrics.generationTokensPerSec.toFixed(1)} tok/s` + ); + } + + // Combined metrics + lines.push( + ` Combined: ${metrics.totalTokens.toLocaleString()} tokens @ ${metrics.combinedTokensPerSec.toFixed(1)} tok/s (${durationDisplay} total)` + ); + + // Time to first token + if (metrics.timeToFirstTokenMs !== undefined) { + lines.push(` TTFT: ${metrics.timeToFirstTokenMs.toFixed(0)}ms`); + } + + // Turn count + if (metrics.turnCount > 1) { + lines.push(` Turns: ${metrics.turnCount}`); + } + + return lines.join("\n"); +} + +/** + * Convert PromptMetrics to JSONL log entry + */ +export function toLogEntry(metrics: PromptMetrics): MetricLogEntry { + return { + timestamp: new Date().toISOString(), + provider: metrics.provider, + 
model: metrics.model, + turnCount: metrics.turnCount, + inputTokens: metrics.inputTokens, + outputTokens: metrics.outputTokens, + totalTokens: metrics.totalTokens, + prefillTokensPerSec: Math.round(metrics.prefillTokensPerSec * 100) / 100, + generationTokensPerSec: Math.round(metrics.generationTokensPerSec * 100) / 100, + combinedTokensPerSec: Math.round(metrics.combinedTokensPerSec * 100) / 100, + totalDurationMs: metrics.totalDurationMs, + timeToFirstTokenMs: metrics.timeToFirstTokenMs, + }; +} \ No newline at end of file diff --git a/packages/pi-llm-performance/src/llm-performance-metrics.test.ts b/packages/pi-llm-performance/src/llm-performance-metrics.test.ts new file mode 100644 index 0000000..18d816e --- /dev/null +++ b/packages/pi-llm-performance/src/llm-performance-metrics.test.ts @@ -0,0 +1,398 @@ +import { + calculateTurnMetrics, + aggregatePromptMetrics, + formatMetricsForDisplay, + toLogEntry, + type TurnMetrics, + type PromptMetrics, +} from "./llm-metrics-core.ts"; +import { assertEquals, assertGreaterOrEqual, assertLessOrEqual } from "jsr:@std/assert"; + +Deno.test("calculateTurnMetrics - creates turn metrics object", () => { + const result = calculateTurnMetrics({ + turnId: "turn-1", + inputTokens: 100, + outputTokens: 50, + durationMs: 2000, + timeToFirstTokenMs: 500, + }); + + assertEquals(result.turnId, "turn-1"); + assertEquals(result.inputTokens, 100); + assertEquals(result.outputTokens, 50); + assertEquals(result.durationMs, 2000); + assertEquals(result.timeToFirstTokenMs, 500); +}); + +Deno.test("calculateTurnMetrics - handles missing timeToFirstToken", () => { + const result = calculateTurnMetrics({ + turnId: "turn-1", + inputTokens: 100, + outputTokens: 50, + durationMs: 2000, + }); + + assertEquals(result.timeToFirstTokenMs, undefined); +}); + +Deno.test("aggregatePromptMetrics - aggregates single turn", () => { + const turnMetrics: TurnMetrics[] = [ + { + turnId: "turn-1", + inputTokens: 1000, + outputTokens: 200, + durationMs: 5000, + 
timeToFirstTokenMs: 800, + }, + ]; + + const result = aggregatePromptMetrics({ + provider: "anthropic", + model: "claude-sonnet-4", + turnMetrics, + }); + + assertEquals(result.provider, "anthropic"); + assertEquals(result.model, "claude-sonnet-4"); + assertEquals(result.turnCount, 1); + assertEquals(result.inputTokens, 1000); + assertEquals(result.outputTokens, 200); + assertEquals(result.totalTokens, 1200); + assertEquals(result.totalDurationMs, 5000); + assertEquals(result.timeToFirstTokenMs, 800); + + // Tokens per second calculations + // prefill: 1000 input tokens / 0.8s TTFT = 1250 tok/s + assertEquals(result.prefillTokensPerSec, 1250); + // generation: 200 output tokens / 4.2s (5s - 0.8s) = 47.62 tok/s + assertGreaterOrEqual(result.generationTokensPerSec, 47.6); + assertLessOrEqual(result.generationTokensPerSec, 47.7); + // combined: 1200 total tokens / 5s = 240 tok/s + assertEquals(result.combinedTokensPerSec, 240); +}); + +Deno.test("aggregatePromptMetrics - aggregates multiple turns", () => { + const turnMetrics: TurnMetrics[] = [ + { + turnId: "turn-1", + inputTokens: 1000, + outputTokens: 200, + durationMs: 3000, + timeToFirstTokenMs: 800, + }, + { + turnId: "turn-2", + inputTokens: 500, + outputTokens: 150, + durationMs: 2000, + }, + { + turnId: "turn-3", + inputTokens: 300, + outputTokens: 100, + durationMs: 1500, + }, + ]; + + const result = aggregatePromptMetrics({ + provider: "openai", + model: "gpt-4o", + turnMetrics, + }); + + assertEquals(result.turnCount, 3); + assertEquals(result.inputTokens, 1800); // 1000 + 500 + 300 + assertEquals(result.outputTokens, 450); // 200 + 150 + 100 + assertEquals(result.totalTokens, 2250); + assertEquals(result.totalDurationMs, 6500); // 3000 + 2000 + 1500 + assertEquals(result.timeToFirstTokenMs, 800); // From first turn only + + // Tokens per second: prefill uses TTFT (0.8s), generation uses (total - TTFT) = 5.7s + // prefill: 1800 / 0.8 = 2250 tok/s + assertEquals(result.prefillTokensPerSec, 2250); + // 
generation: 450 / 5.7 = 78.95 tok/s + assertGreaterOrEqual(result.generationTokensPerSec, 78.9); + assertLessOrEqual(result.generationTokensPerSec, 79.0); + // combined: 2250 / 6.5 = 346.15 tok/s + assertGreaterOrEqual(result.combinedTokensPerSec, 346.1); + assertLessOrEqual(result.combinedTokensPerSec, 346.2); +}); + +Deno.test("aggregatePromptMetrics - handles empty turn list", () => { + const result = aggregatePromptMetrics({ + provider: "anthropic", + model: "claude-sonnet-4", + turnMetrics: [], + }); + + assertEquals(result.turnCount, 0); + assertEquals(result.inputTokens, 0); + assertEquals(result.outputTokens, 0); + assertEquals(result.totalTokens, 0); + assertEquals(result.prefillTokensPerSec, 0); + assertEquals(result.generationTokensPerSec, 0); + assertEquals(result.combinedTokensPerSec, 0); + assertEquals(result.totalDurationMs, 0); + assertEquals(result.timeToFirstTokenMs, undefined); +}); + +Deno.test("formatMetricsForDisplay - formats single turn metrics", () => { + const metrics: PromptMetrics = { + provider: "anthropic", + model: "claude-sonnet-4", + turnCount: 1, + inputTokens: 1250, + outputTokens: 342, + totalTokens: 1592, + prefillTokensPerSec: 482.1, + generationTokensPerSec: 18.3, + combinedTokensPerSec: 38.0, + totalDurationMs: 21600, + timeToFirstTokenMs: 850, + turns: [], + }; + + const display = formatMetricsForDisplay(metrics); + + assertEquals(display.includes("anthropic/claude-sonnet-4"), true); + assertEquals(display.includes("1,250 tokens"), true); + assertEquals(display.includes("482.1 tok/s"), true); + assertEquals(display.includes("342 tokens"), true); + assertEquals(display.includes("18.3 tok/s"), true); + assertEquals(display.includes("1,592 tokens"), true); + assertEquals(display.includes("38.0 tok/s"), true); + assertEquals(display.includes("21.6s"), true); + assertEquals(display.includes("TTFT: 850ms"), true); +}); + +Deno.test("formatMetricsForDisplay - formats duration as minutes when over 60s", () => { + const metrics: 
PromptMetrics = { + provider: "openai", + model: "gpt-4o", + turnCount: 1, + inputTokens: 5000, + outputTokens: 1000, + totalTokens: 6000, + prefillTokensPerSec: 50, + generationTokensPerSec: 10, + combinedTokensPerSec: 60, + totalDurationMs: 120000, // 2 minutes + timeToFirstTokenMs: 1500, + turns: [], + }; + + const display = formatMetricsForDisplay(metrics); + + assertEquals(display.includes("2.0m"), true); +}); + +Deno.test("formatMetricsForDisplay - omits turn count when single turn", () => { + const metrics: PromptMetrics = { + provider: "anthropic", + model: "claude-sonnet-4", + turnCount: 1, + inputTokens: 100, + outputTokens: 50, + totalTokens: 150, + prefillTokensPerSec: 20, + generationTokensPerSec: 10, + combinedTokensPerSec: 30, + totalDurationMs: 5000, + timeToFirstTokenMs: 500, + turns: [], + }; + + const display = formatMetricsForDisplay(metrics); + + assertEquals(display.includes("Turns: 1"), false); +}); + +Deno.test("formatMetricsForDisplay - omits prefill/generation when TTFT is unavailable", () => { + const metrics: PromptMetrics = { + provider: "openai", + model: "gpt-4o", + turnCount: 1, + inputTokens: 1000, + outputTokens: 200, + totalTokens: 1200, + prefillTokensPerSec: 0, + generationTokensPerSec: 0, + combinedTokensPerSec: 240, + totalDurationMs: 5000, + timeToFirstTokenMs: undefined, + turns: [], + }; + + const display = formatMetricsForDisplay(metrics); + + assertEquals(display.includes("Prefill:"), false); + assertEquals(display.includes("Generation:"), false); + assertEquals(display.includes("1,200 tokens"), true); + assertEquals(display.includes("240.0 tok/s"), true); +}); + +Deno.test("formatMetricsForDisplay - shows turn count when multiple turns", () => { + const metrics: PromptMetrics = { + provider: "anthropic", + model: "claude-sonnet-4", + turnCount: 3, + inputTokens: 100, + outputTokens: 50, + totalTokens: 150, + prefillTokensPerSec: 20, + generationTokensPerSec: 10, + combinedTokensPerSec: 30, + totalDurationMs: 5000, + 
timeToFirstTokenMs: 500, + turns: [], + }; + + const display = formatMetricsForDisplay(metrics); + + assertEquals(display.includes("Turns: 3"), true); +}); + +Deno.test("toLogEntry - creates JSON-serializable log entry", () => { + const metrics: PromptMetrics = { + provider: "anthropic", + model: "claude-sonnet-4", + turnCount: 2, + inputTokens: 1250, + outputTokens: 342, + totalTokens: 1592, + prefillTokensPerSec: 482.12345, + generationTokensPerSec: 18.34567, + combinedTokensPerSec: 38.09876, + totalDurationMs: 21600, + timeToFirstTokenMs: 850, + turns: [], + }; + + const logEntry = toLogEntry(metrics); + + assertEquals(logEntry.provider, "anthropic"); + assertEquals(logEntry.model, "claude-sonnet-4"); + assertEquals(logEntry.turnCount, 2); + assertEquals(logEntry.inputTokens, 1250); + assertEquals(logEntry.outputTokens, 342); + assertEquals(logEntry.totalTokens, 1592); + // Rounded to 2 decimal places + assertEquals(logEntry.prefillTokensPerSec, 482.12); + assertEquals(logEntry.generationTokensPerSec, 18.35); + assertEquals(logEntry.combinedTokensPerSec, 38.1); + assertEquals(logEntry.totalDurationMs, 21600); + assertEquals(logEntry.timeToFirstTokenMs, 850); + + // Should have ISO timestamp + assertEquals(logEntry.timestamp.includes("T"), true); + assertEquals(logEntry.timestamp.includes("Z"), true); + + // Should be JSON serializable + const json = JSON.stringify(logEntry); + assertEquals(json.length > 0, true); + const parsed = JSON.parse(json); + assertEquals(parsed.provider, "anthropic"); +}); + +Deno.test("aggregatePromptMetrics - uses full duration when TTFT is undefined", () => { + const turnMetrics: TurnMetrics[] = [ + { + turnId: "turn-1", + inputTokens: 1000, + outputTokens: 200, + durationMs: 5000, + // No timeToFirstTokenMs + }, + ]; + + const result = aggregatePromptMetrics({ + provider: "openai", + model: "gpt-4o", + turnMetrics, + }); + + assertEquals(result.turnCount, 1); + assertEquals(result.inputTokens, 1000); + 
assertEquals(result.outputTokens, 200); + // Without TTFT, prefill and generation rates are 0 (cannot separate phases) + // Only combined rate is meaningful + assertEquals(result.prefillTokensPerSec, 0); + assertEquals(result.generationTokensPerSec, 0); + assertEquals(result.combinedTokensPerSec, 240); +}); + +Deno.test("toLogEntry - handles missing timeToFirstToken", () => { + const metrics: PromptMetrics = { + provider: "anthropic", + model: "claude-sonnet-4", + turnCount: 1, + inputTokens: 100, + outputTokens: 50, + totalTokens: 150, + prefillTokensPerSec: 20, + generationTokensPerSec: 10, + combinedTokensPerSec: 30, + totalDurationMs: 5000, + timeToFirstTokenMs: undefined, + turns: [], + }; + + const logEntry = toLogEntry(metrics); + + assertEquals(logEntry.timeToFirstTokenMs, undefined); +}); + +Deno.test("Integration - full flow from turns to log entry", () => { + // Simulate a real scenario with multiple turns + const turn1 = calculateTurnMetrics({ + turnId: "turn-1", + inputTokens: 2000, + outputTokens: 500, + durationMs: 8000, + timeToFirstTokenMs: 1200, + }); + + const turn2 = calculateTurnMetrics({ + turnId: "turn-2", + inputTokens: 800, + outputTokens: 200, + durationMs: 3000, + }); + + const promptMetrics = aggregatePromptMetrics({ + provider: "groq", + model: "llama-3.1-70b", + turnMetrics: [turn1, turn2], + }); + + const display = formatMetricsForDisplay(promptMetrics); + const logEntry = toLogEntry(promptMetrics); + + // Verify aggregation + assertEquals(promptMetrics.turnCount, 2); + assertEquals(promptMetrics.inputTokens, 2800); + assertEquals(promptMetrics.outputTokens, 700); + assertEquals(promptMetrics.totalTokens, 3500); + assertEquals(promptMetrics.totalDurationMs, 11000); + assertEquals(promptMetrics.timeToFirstTokenMs, 1200); + + // Verify corrected rate calculations + // prefill: 2800 / 1.2 = 2333.33 tok/s + assertGreaterOrEqual(promptMetrics.prefillTokensPerSec, 2333.3); + assertLessOrEqual(promptMetrics.prefillTokensPerSec, 2333.4); + // 
generation: 700 / 9.8 = 71.43 tok/s + assertGreaterOrEqual(promptMetrics.generationTokensPerSec, 71.4); + assertLessOrEqual(promptMetrics.generationTokensPerSec, 71.5); + // combined: 3500 / 11 = 318.18 tok/s + assertGreaterOrEqual(promptMetrics.combinedTokensPerSec, 318.1); + assertLessOrEqual(promptMetrics.combinedTokensPerSec, 318.2); + + // Verify display contains key info + assertEquals(display.includes("groq/llama-3.1-70b"), true); + assertEquals(display.includes("TTFT: 1200ms"), true); + + // Verify log entry + assertEquals(logEntry.provider, "groq"); + assertEquals(logEntry.model, "llama-3.1-70b"); + assertEquals(logEntry.turnCount, 2); +}); \ No newline at end of file diff --git a/packages/pi-llm-performance/src/llm-performance-metrics.ts b/packages/pi-llm-performance/src/llm-performance-metrics.ts new file mode 100644 index 0000000..31bde9a --- /dev/null +++ b/packages/pi-llm-performance/src/llm-performance-metrics.ts @@ -0,0 +1,100 @@ +// LLM Performance Metrics Extension +// Captures and displays LLM inference performance metrics + +import type { ExtensionAPI } from "@mariozechner/pi-coding-agent"; +import { appendFileSync, mkdirSync } from "node:fs"; +import { dirname, join } from "node:path"; + +// Re-export core functions from the shared metrics module +import { + calculateTurnMetrics, + aggregatePromptMetrics, + formatMetricsForDisplay, + toLogEntry, + type TurnMetrics, + type PromptMetrics, + type MetricLogEntry, +} from "./llm-metrics-core.ts"; + +// ============================================================================ +// Extension Event Handlers (imperative shell) +// ============================================================================ + +// State tracking +let promptStartMs: number | undefined; +let currentTurnStartMs: number | undefined; +let currentTurnId: string | undefined; +let turnMetrics: TurnMetrics[] = []; +let firstTokenTimeMs: number | undefined; +let provider: string | undefined; +let model: string | undefined; + 
+export default function (pi: ExtensionAPI) { + const logFile = join(process.cwd(), ".pi", "llm-metrics.log"); + + pi.on("agent_start", async (_event, ctx) => { + if (!ctx.model) return; + promptStartMs = Date.now(); + turnMetrics = []; + firstTokenTimeMs = undefined; + provider = ctx.model.provider; + model = ctx.model.id; + }); + + pi.on("turn_start", async (event, _ctx) => { + currentTurnStartMs = Date.now(); + currentTurnId = `turn-${event.turnIndex}`; + }); + + pi.on("message_update", async (event, _ctx) => { + // Capture TTFT on first token + if (firstTokenTimeMs === undefined && event.assistantMessageEvent?.type === "text_delta") { + firstTokenTimeMs = Date.now(); + } + }); + + pi.on("turn_end", async (event, _ctx) => { + if (event.message.role !== "assistant") return; + const inputTokens = event.message.usage?.input ?? 0; + const outputTokens = event.message.usage?.output ?? 0; + const durationMs = currentTurnStartMs ? Date.now() - currentTurnStartMs : 0; + const ttftMs = firstTokenTimeMs !== undefined && currentTurnStartMs !== undefined && firstTokenTimeMs >= currentTurnStartMs // TTFT is only valid if the first token arrived during this turn; later turns would otherwise get a negative value + ? 
firstTokenTimeMs - currentTurnStartMs + : undefined; + + const turnMetric = calculateTurnMetrics({ + turnId: currentTurnId!, + inputTokens, + outputTokens, + durationMs, + timeToFirstTokenMs: ttftMs, + }); + + turnMetrics.push(turnMetric); + }); + + pi.on("agent_end", async (_event, ctx) => { + if (!provider || !model || promptStartMs === undefined) return; + + const promptMetrics = aggregatePromptMetrics({ + provider, + model, + turnMetrics, + }); + + // Display in TUI + const display = formatMetricsForDisplay(promptMetrics); + ctx.ui.notify(display, "info"); + ctx.ui.setStatus("metrics", `📊 ${promptMetrics.combinedTokensPerSec.toFixed(1)} tok/s`); + + // Log to JSONL file + const logEntry = toLogEntry(promptMetrics); + mkdirSync(dirname(logFile), { recursive: true }); + appendFileSync(logFile, JSON.stringify(logEntry) + "\n", "utf8"); + + // Reset state + promptStartMs = undefined; + turnMetrics = []; + firstTokenTimeMs = undefined; + }); +} \ No newline at end of file