From 98e18643c5b33d23f83bc3c2387170e23e15d354 Mon Sep 17 00:00:00 2001 From: Willem van den Ende Date: Tue, 28 Apr 2026 10:52:00 +0100 Subject: [PATCH] pi-performance: Make Time to first token more accurate. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary of changes: ┌──────┬──────────────────────────────────────────────────────────────────┬──────────┐ │ Step │ Change │ Result │ ├──────┼──────────────────────────────────────────────────────────────────┼──────────┤ │ 1 │ Removed duplicate llm-performance-metrics.test.ts │ 14 tests │ ├──────┼──────────────────────────────────────────────────────────────────┼──────────┤ │ 2 │ Added rawTimestamps assertions to toLogEntry test │ 14 tests │ ├──────┼──────────────────────────────────────────────────────────────────┼──────────┤ │ 3 │ Added rawTimestamps assertions to single-turn aggregate test │ 14 tests │ ├──────┼──────────────────────────────────────────────────────────────────┼──────────┤ │ 4 │ Added rawTimestamps assertions to multi-turn aggregate test │ 14 tests │ ├──────┼──────────────────────────────────────────────────────────────────┼──────────┤ │ 5 │ Added negative TTFT filtering test │ 15 tests │ ├──────┼──────────────────────────────────────────────────────────────────┼──────────┤ │ 6 │ Added "first turn missing TTFT, later turns have it" test │ 16 tests │ ├──────┼──────────────────────────────────────────────────────────────────┼──────────┤ │ 7 │ Added sanity check tests (warn on >500 tok/s, no warn otherwise) │ 18 tests │ └──────┴──────────────────────────────────────────────────────────────────┴──────────┘ This is what it looks like now when I run `pi` 📊 Performance: llama.cpp/Qwen3.6-35B-A3B-MXFP4_MOE.gguf Prefill: 15,460 tokens @ 20104.0 tok/s Generation: 12,179 tokens @ 52.6 tok/s Combined: 27,639 tokens @ 118.9 tok/s (3.9m total) TTFT: 769ms Turns: 36 --- .pi/llm-metrics.log | 12 + packages/pi-llm-performance/README.md | 98 ++++- 
packages/pi-llm-performance/deno.lock | 10 +- .../src/llm-metrics-core.test.ts | 160 +++++++ .../src/llm-metrics-core.ts | 51 ++- .../src/llm-performance-metrics.test.ts | 398 ------------------ .../src/llm-performance-metrics.ts | 17 +- 7 files changed, 311 insertions(+), 435 deletions(-) delete mode 100644 packages/pi-llm-performance/src/llm-performance-metrics.test.ts diff --git a/.pi/llm-metrics.log b/.pi/llm-metrics.log index 6523c00..fe1a39d 100644 --- a/.pi/llm-metrics.log +++ b/.pi/llm-metrics.log @@ -1 +1,13 @@ {"timestamp":"2026-04-28T08:58:29.989Z","provider":"llama.cpp","model":"Qwen3.6-35B-A3B-MXFP4_MOE.gguf","turnCount":6,"inputTokens":8294,"outputTokens":1356,"totalTokens":9650,"prefillTokensPerSec":1925.26,"generationTokensPerSec":42.55,"combinedTokensPerSec":266.77,"totalDurationMs":36174,"timeToFirstTokenMs":4308} +{"timestamp":"2026-04-28T09:16:50.070Z","provider":"llama.cpp","model":"Qwen3.6-35B-A3B-MXFP4_MOE.gguf","turnCount":1,"inputTokens":25,"outputTokens":39,"totalTokens":64,"prefillTokensPerSec":46.38,"generationTokensPerSec":89.86,"combinedTokensPerSec":65.78,"totalDurationMs":973,"timeToFirstTokenMs":539} +{"timestamp":"2026-04-28T09:19:10.796Z","provider":"llama.cpp","model":"Qwen3.6-35B-A3B-MXFP4_MOE.gguf","turnCount":2,"inputTokens":1146,"outputTokens":911,"totalTokens":2057,"prefillTokensPerSec":0,"generationTokensPerSec":0,"combinedTokensPerSec":104.65,"totalDurationMs":19656} +{"timestamp":"2026-04-28T09:20:38.777Z","provider":"llama.cpp","model":"Qwen3.6-35B-A3B-MXFP4_MOE.gguf","turnCount":2,"inputTokens":1704,"outputTokens":1466,"totalTokens":3170,"prefillTokensPerSec":0,"generationTokensPerSec":0,"combinedTokensPerSec":98.11,"totalDurationMs":32311} 
+{"timestamp":"2026-04-28T09:26:23.124Z","provider":"llama.cpp","model":"Qwen3.6-35B-A3B-MXFP4_MOE.gguf","turnCount":4,"inputTokens":2258,"outputTokens":2984,"totalTokens":5242,"prefillTokensPerSec":1019.41,"generationTokensPerSec":54.73,"combinedTokensPerSec":92.39,"totalDurationMs":56736,"timeToFirstTokenMs":2215} +{"timestamp":"2026-04-28T09:28:34.906Z","provider":"llama.cpp","model":"Qwen3.6-35B-A3B-MXFP4_MOE.gguf","turnCount":3,"inputTokens":976,"outputTokens":1434,"totalTokens":2410,"prefillTokensPerSec":0,"generationTokensPerSec":0,"combinedTokensPerSec":72.74,"totalDurationMs":33130,"rawTimestamps":{"turns":[{"turnId":"turn-0","durationMs":5261},{"turnId":"turn-1","durationMs":23398,"ttftMs":3008},{"turnId":"turn-2","durationMs":4471,"ttftMs":-20390}]}} +{"timestamp":"2026-04-28T09:32:02.665Z","provider":"llama.cpp","model":"Qwen3.6-35B-A3B-MXFP4_MOE.gguf","turnCount":5,"inputTokens":2780,"outputTokens":4488,"totalTokens":7268,"prefillTokensPerSec":0,"generationTokensPerSec":0,"combinedTokensPerSec":73.68,"totalDurationMs":98638,"rawTimestamps":{"turns":[{"turnId":"turn-0","durationMs":1617},{"turnId":"turn-1","durationMs":23911,"ttftMs":19482},{"turnId":"turn-2","durationMs":62977,"ttftMs":-4429},{"turnId":"turn-3","durationMs":2504,"ttftMs":-67406},{"turnId":"turn-4","durationMs":7629,"ttftMs":-69910}]}} +{"timestamp":"2026-04-28T09:33:30.666Z","provider":"llama.cpp","model":"Qwen3.6-35B-A3B-MXFP4_MOE.gguf","turnCount":2,"inputTokens":1496,"outputTokens":291,"totalTokens":1787,"prefillTokensPerSec":0,"generationTokensPerSec":0,"combinedTokensPerSec":59.38,"totalDurationMs":30092,"rawTimestamps":{"turns":[{"turnId":"turn-0","durationMs":5111},{"turnId":"turn-1","durationMs":24981,"ttftMs":12129}]}} 
+{"timestamp":"2026-04-28T09:36:38.399Z","provider":"llama.cpp","model":"Qwen3.6-35B-A3B-MXFP4_MOE.gguf","turnCount":10,"inputTokens":5923,"outputTokens":4327,"totalTokens":10250,"prefillTokensPerSec":245.08,"generationTokensPerSec":34.94,"combinedTokensPerSec":69.25,"totalDurationMs":148018,"timeToFirstTokenMs":24168,"rawTimestamps":{"ttftMs":24168,"generationDurationMs":123850,"turns":[{"turnId":"turn-0","durationMs":30016,"ttftMs":24168},{"turnId":"turn-1","durationMs":10759},{"turnId":"turn-2","durationMs":6510},{"turnId":"turn-3","durationMs":48120,"ttftMs":21676},{"turnId":"turn-4","durationMs":11553,"ttftMs":5626},{"turnId":"turn-5","durationMs":2866},{"turnId":"turn-6","durationMs":10915,"ttftMs":7202},{"turnId":"turn-7","durationMs":8631,"ttftMs":3181},{"turnId":"turn-8","durationMs":4935},{"turnId":"turn-9","durationMs":13713,"ttftMs":6306}]}} +{"timestamp":"2026-04-28T09:38:54.862Z","provider":"llama.cpp","model":"Qwen3.6-35B-A3B-MXFP4_MOE.gguf","turnCount":2,"inputTokens":8054,"outputTokens":1145,"totalTokens":9199,"prefillTokensPerSec":235.91,"generationTokensPerSec":78.04,"combinedTokensPerSec":188.46,"totalDurationMs":48812,"timeToFirstTokenMs":34140,"rawTimestamps":{"ttftMs":34140,"allTtftMs":[34140],"generationDurationMs":14672,"turns":[{"turnId":"turn-0","durationMs":3559},{"turnId":"turn-1","durationMs":45253,"ttftMs":34140}]}} +{"timestamp":"2026-04-28T09:40:15.439Z","provider":"llama.cpp","model":"Qwen3.6-35B-A3B-MXFP4_MOE.gguf","turnCount":2,"inputTokens":97,"outputTokens":733,"totalTokens":830,"prefillTokensPerSec":63.19,"generationTokensPerSec":46.13,"combinedTokensPerSec":47.63,"totalDurationMs":17425,"timeToFirstTokenMs":1535,"rawTimestamps":{"ttftMs":1535,"allTtftMs":[1535,1095],"generationDurationMs":15890,"turns":[{"turnId":"turn-0","durationMs":8515,"ttftMs":1535},{"turnId":"turn-1","durationMs":8910,"ttftMs":1095}]}} 
+{"timestamp":"2026-04-28T09:41:02.168Z","provider":"llama.cpp","model":"Qwen3.6-35B-A3B-MXFP4_MOE.gguf","turnCount":1,"inputTokens":33,"outputTokens":386,"totalTokens":419,"prefillTokensPerSec":5.11,"generationTokensPerSec":135.87,"combinedTokensPerSec":45.06,"totalDurationMs":9298,"timeToFirstTokenMs":6457,"rawTimestamps":{"ttftMs":6457,"allTtftMs":[6457],"generationDurationMs":2841,"turns":[{"turnId":"turn-0","durationMs":9298,"ttftMs":6457}]}} +{"timestamp":"2026-04-28T09:47:46.401Z","provider":"llama.cpp","model":"Qwen3.6-35B-A3B-MXFP4_MOE.gguf","turnCount":36,"inputTokens":15460,"outputTokens":12179,"totalTokens":27639,"prefillTokensPerSec":20104.03,"generationTokensPerSec":52.58,"combinedTokensPerSec":118.93,"totalDurationMs":232395,"timeToFirstTokenMs":769,"rawTimestamps":{"ttftMs":769,"allTtftMs":[769,3069,6075,3956,3658,3607,3823,3913,9797,0,0,1,13456],"generationDurationMs":231626,"turns":[{"turnId":"turn-0","durationMs":2786,"ttftMs":769},{"turnId":"turn-1","durationMs":2727},{"turnId":"turn-2","durationMs":2103},{"turnId":"turn-3","durationMs":2530},{"turnId":"turn-4","durationMs":2754},{"turnId":"turn-5","durationMs":3357},{"turnId":"turn-6","durationMs":10489,"ttftMs":3069},{"turnId":"turn-7","durationMs":2625},{"turnId":"turn-8","durationMs":15034,"ttftMs":6075},{"turnId":"turn-9","durationMs":2817},{"turnId":"turn-10","durationMs":14176,"ttftMs":3956},{"turnId":"turn-11","durationMs":2537},{"turnId":"turn-12","durationMs":16864,"ttftMs":3658},{"turnId":"turn-13","durationMs":3137},{"turnId":"turn-14","durationMs":18123,"ttftMs":3607},{"turnId":"turn-15","durationMs":2307},{"turnId":"turn-16","durationMs":17842,"ttftMs":3823},{"turnId":"turn-17","durationMs":2287},{"turnId":"turn-18","durationMs":10719,"ttftMs":3913},{"turnId":"turn-19","durationMs":24735},{"turnId":"turn-20","durationMs":2656},{"turnId":"turn-21","durationMs":15351,"ttftMs":9797},{"turnId":"turn-22","durationMs":22251},{"turnId":"turn-23","durationMs":4702},{"turnId":"turn-24","dura
tionMs":2,"ttftMs":0},{"turnId":"turn-25","durationMs":0},{"turnId":"turn-26","durationMs":1},{"turnId":"turn-27","durationMs":0,"ttftMs":0},{"turnId":"turn-28","durationMs":1},{"turnId":"turn-29","durationMs":1},{"turnId":"turn-30","durationMs":0},{"turnId":"turn-31","durationMs":1},{"turnId":"turn-32","durationMs":1,"ttftMs":1},{"turnId":"turn-33","durationMs":1},{"turnId":"turn-34","durationMs":0},{"turnId":"turn-35","durationMs":27478,"ttftMs":13456}]}} diff --git a/packages/pi-llm-performance/README.md b/packages/pi-llm-performance/README.md index a12d9f2..9ea592d 100644 --- a/packages/pi-llm-performance/README.md +++ b/packages/pi-llm-performance/README.md @@ -1,6 +1,84 @@ # pi-llm-performance -LLM performance metrics extension — captures and displays TTFT, prefill, and generation speeds from pi agent turns. +Pi coding agent extension that captures and displays LLM inference performance metrics. + +## Why + +Understanding model performance helps you: + +- **Compare models** — measure throughput differences between providers and model sizes +- **Debug slowdowns** — spot when prefill or generation degrades unexpectedly +- **Validate hardware** — confirm your setup delivers expected token throughput +- **Tune parameters** — evaluate the impact of speculative decoding, context window size, etc. 
+ +## What it measures + +| Metric | Description | +|--------|-------------| +| **TTFT** | Time to first token (ms) — how long before you see output | +| **Prefill speed** | Input tokens processed per second during the prefill phase | +| **Generation speed** | Output tokens generated per second during the generation phase | +| **Combined speed** | Total tokens (input + output) per second across the full prompt | + +## How it works + +The extension hooks into pi's agent lifecycle events: + +| Event | Behavior | +|-------|----------| +| `agent_start` | Records provider/model, resets counters | +| `turn_start` | Marks turn boundary | +| `message_update` | Captures TTFT on first token delta | +| `turn_end` | Records token counts and turn duration | +| `agent_end` | Aggregates metrics, displays in TUI, logs to `.pi/llm-metrics.log` | + +## Output + +### TUI notification + +After each prompt completes, a notification shows: + +``` +📊 Performance: llama.cpp/Qwen3.6-35B-A3B-MXFP4_MOE.gguf + Prefill: 1,240 tokens @ 68.3 tok/s + Generation: 312 tokens @ 89.9 tok/s + Combined: 1,552 tokens @ 78.4 tok/s (19.8s total) + TTFT: 1250ms +``` + +### Status bar + +The footer status shows combined throughput: `📊 78.4 tok/s` + +### Log file + +Each prompt writes a JSONL entry to `.pi/llm-metrics.log`: + +```json +{ + "timestamp": "2026-04-28T10:05:00.000Z", + "provider": "llama.cpp", + "model": "Qwen3.6-35B-A3B-MXFP4_MOE.gguf", + "turnCount": 1, + "inputTokens": 1240, + "outputTokens": 312, + "totalTokens": 1552, + "prefillTokensPerSec": 68.3, + "generationTokensPerSec": 89.9, + "combinedTokensPerSec": 78.4, + "totalDurationMs": 19800, + "timeToFirstTokenMs": 1250, + "rawTimestamps": { + "ttftMs": 1250, + "generationDurationMs": 18550, + "turns": [{"turnId": "turn-0", "durationMs": 19800, "ttftMs": 1250}] + } +} +``` + +## Sanity checks + +The extension warns to the console if generation speed exceeds 500 tok/s (well above what local llama.cpp inference realistically achieves, so almost certainly a measurement artifact). 
This helps catch timing bugs early. ## Development @@ -11,24 +89,6 @@ pnpm install # workspace setup deno test # run tests ``` -## Usage - -Add to your pi settings (`~/.pi/agent/settings.json`): - -```json -{ - "packages": [ - "../dev/spikes/llm/monotonic-pi-extensions/packages/pi-llm-performance" - ] -} -``` - -Then reload pi: - -```bash -/reload -``` - ## License MIT diff --git a/packages/pi-llm-performance/deno.lock b/packages/pi-llm-performance/deno.lock index cba41d8..f362743 100644 --- a/packages/pi-llm-performance/deno.lock +++ b/packages/pi-llm-performance/deno.lock @@ -2,7 +2,9 @@ "version": "5", "specifiers": { "jsr:@std/assert@*": "1.0.19", - "jsr:@std/internal@^1.0.12": "1.0.12" + "jsr:@std/assert@^1.0.19": "1.0.19", + "jsr:@std/internal@^1.0.12": "1.0.12", + "jsr:@std/testing@*": "1.0.18" }, "jsr": { "@std/assert@1.0.19": { @@ -13,6 +15,12 @@ }, "@std/internal@1.0.12": { "integrity": "972a634fd5bc34b242024402972cd5143eac68d8dffaca5eaa4dba30ce17b027" + }, + "@std/testing@1.0.18": { + "integrity": "d3152f57b11666bf6358d0e127c7e3488e91178b0c2d8fbf0793e1c53cd13cb1", + "dependencies": [ + "jsr:@std/assert@^1.0.19" + ] } }, "workspace": { diff --git a/packages/pi-llm-performance/src/llm-metrics-core.test.ts b/packages/pi-llm-performance/src/llm-metrics-core.test.ts index 18d816e..d2ac701 100644 --- a/packages/pi-llm-performance/src/llm-metrics-core.test.ts +++ b/packages/pi-llm-performance/src/llm-metrics-core.test.ts @@ -69,6 +69,11 @@ Deno.test("aggregatePromptMetrics - aggregates single turn", () => { assertLessOrEqual(result.generationTokensPerSec, 47.7); // combined: 1200 total tokens / 5s = 240 tok/s assertEquals(result.combinedTokensPerSec, 240); + + // rawTimestamps + assertEquals(result.rawTimestamps?.ttftMs, 800); + assertEquals(result.rawTimestamps?.allTtftMs, [800]); + assertEquals(result.rawTimestamps?.generationDurationMs, 4200); }); Deno.test("aggregatePromptMetrics - aggregates multiple turns", () => { @@ -116,6 +121,15 @@ 
Deno.test("aggregatePromptMetrics - aggregates multiple turns", () => { // combined: 2250 / 6.5 = 346.15 tok/s assertGreaterOrEqual(result.combinedTokensPerSec, 346.1); assertLessOrEqual(result.combinedTokensPerSec, 346.2); + + // rawTimestamps: only turn-1 has valid TTFT, turns 2+ have none + assertEquals(result.rawTimestamps?.ttftMs, 800); + assertEquals(result.rawTimestamps?.allTtftMs, [800]); + assertEquals(result.rawTimestamps?.generationDurationMs, 5700); + assertEquals(result.rawTimestamps?.turns.length, 3); + assertEquals(result.rawTimestamps?.turns[0].ttftMs, 800); + assertEquals(result.rawTimestamps?.turns[1].ttftMs, undefined); + assertEquals(result.rawTimestamps?.turns[2].ttftMs, undefined); }); Deno.test("aggregatePromptMetrics - handles empty turn list", () => { @@ -252,6 +266,80 @@ Deno.test("formatMetricsForDisplay - shows turn count when multiple turns", () = assertEquals(display.includes("Turns: 3"), true); }); +Deno.test("aggregatePromptMetrics - uses first valid TTFT when turn-0 has none", () => { + // Edge case: turn-0 has no TTFT, turn-1 does. Should use turn-1's TTFT. 
+ const turnMetrics: TurnMetrics[] = [ + { + turnId: "turn-0", + inputTokens: 1000, + outputTokens: 200, + durationMs: 3000, + // No timeToFirstTokenMs + }, + { + turnId: "turn-1", + inputTokens: 500, + outputTokens: 150, + durationMs: 2000, + timeToFirstTokenMs: 600, + }, + ]; + + const result = aggregatePromptMetrics({ + provider: "llama.cpp", + model: "Qwen3.6-35B", + turnMetrics, + }); + + // First valid TTFT is from turn-1 (600ms) + assertEquals(result.rawTimestamps?.allTtftMs, [600]); + assertEquals(result.rawTimestamps?.ttftMs, 600); + // Generation duration = totalDuration - firstValidTTFT = 5000 - 600 = 4400 + assertEquals(result.rawTimestamps?.generationDurationMs, 4400); + // prefill: 1500 / 0.6 = 2500 + assertEquals(result.prefillTokensPerSec, 2500); + // generation: 350 / 4.4 = 79.55 + assertGreaterOrEqual(result.generationTokensPerSec, 79.5); + assertLessOrEqual(result.generationTokensPerSec, 79.6); +}); + +Deno.test("aggregatePromptMetrics - filters out negative TTFT values", () => { + // Simulates the bug where turn-2 got TTFT=-20390 from the old global-firstToken code + const turnMetrics: TurnMetrics[] = [ + { + turnId: "turn-0", + inputTokens: 1000, + outputTokens: 200, + durationMs: 3000, + timeToFirstTokenMs: 800, + }, + { + turnId: "turn-1", + inputTokens: 500, + outputTokens: 150, + durationMs: 2000, + timeToFirstTokenMs: -5000, // Invalid: negative + }, + ]; + + const result = aggregatePromptMetrics({ + provider: "llama.cpp", + model: "Qwen3.6-35B", + turnMetrics, + }); + + // Only turn-0's TTFT (800) should be used; turn-1's negative value is filtered + assertEquals(result.rawTimestamps?.allTtftMs, [800]); + assertEquals(result.rawTimestamps?.ttftMs, 800); + // Generation duration = totalDuration - firstTurnTTFT = 5000 - 800 = 4200 + assertEquals(result.rawTimestamps?.generationDurationMs, 4200); + // prefill: 1500 / 0.8 = 1875 + assertEquals(result.prefillTokensPerSec, 1875); + // generation: 350 / 4.2 = 83.33 + 
assertGreaterOrEqual(result.generationTokensPerSec, 83.3); + assertLessOrEqual(result.generationTokensPerSec, 83.4); +}); + Deno.test("toLogEntry - creates JSON-serializable log entry", () => { const metrics: PromptMetrics = { provider: "anthropic", @@ -265,6 +353,12 @@ Deno.test("toLogEntry - creates JSON-serializable log entry", () => { combinedTokensPerSec: 38.09876, totalDurationMs: 21600, timeToFirstTokenMs: 850, + rawTimestamps: { + ttftMs: 850, + allTtftMs: [850], + generationDurationMs: 20750, + turns: [], + }, turns: [], }; @@ -292,6 +386,72 @@ Deno.test("toLogEntry - creates JSON-serializable log entry", () => { assertEquals(json.length > 0, true); const parsed = JSON.parse(json); assertEquals(parsed.provider, "anthropic"); + + // rawTimestamps should be included + assertEquals(logEntry.rawTimestamps?.ttftMs, 850); + assertEquals(logEntry.rawTimestamps?.allTtftMs, [850]); + assertEquals(logEntry.rawTimestamps?.generationDurationMs, 20750); + assertEquals(logEntry.rawTimestamps?.turns.length, 0); +}); + +Deno.test("aggregatePromptMetrics - warns when generation speed is physically impossible", () => { + const originalWarn = console.warn; + let warnCall: string | undefined; + console.warn = (msg: string) => { warnCall = msg; }; + + try { + const turnMetrics: TurnMetrics[] = [ + { + turnId: "turn-0", + inputTokens: 100, + outputTokens: 1000, + durationMs: 1000, + timeToFirstTokenMs: 100, + }, + ]; + + aggregatePromptMetrics({ + provider: "llama.cpp", + model: "Qwen3.6-35B", + turnMetrics, + }); + + // generation: 1000 / 0.9 = 1111.11 tok/s > 500 + assertGreaterOrEqual(warnCall, ""); + assertEquals(warnCall!.includes("Suspicious generation speed"), true); + assertEquals(warnCall!.includes("1111.1 tok/s"), true); + assertEquals(warnCall!.includes("output=1000"), true); + } finally { + console.warn = originalWarn; + } +}); + +Deno.test("aggregatePromptMetrics - does not warn for normal speeds", () => { + const originalWarn = console.warn; + let warnCall: string 
| undefined; + console.warn = (msg: string) => { warnCall = msg; }; + + try { + const turnMetrics: TurnMetrics[] = [ + { + turnId: "turn-0", + inputTokens: 1000, + outputTokens: 200, + durationMs: 5000, + timeToFirstTokenMs: 800, + }, + ]; + + aggregatePromptMetrics({ + provider: "llama.cpp", + model: "Qwen3.6-35B", + turnMetrics, + }); + + assertEquals(warnCall, undefined); + } finally { + console.warn = originalWarn; + } }); Deno.test("aggregatePromptMetrics - uses full duration when TTFT is undefined", () => { diff --git a/packages/pi-llm-performance/src/llm-metrics-core.ts b/packages/pi-llm-performance/src/llm-metrics-core.ts index dca82c9..dc75b67 100644 --- a/packages/pi-llm-performance/src/llm-metrics-core.ts +++ b/packages/pi-llm-performance/src/llm-metrics-core.ts @@ -1,5 +1,10 @@ // Functional core for LLM performance metrics calculation +// Extracted warning function so tests can mock it without touching console +export function warn(msg: string): void { + console.warn(msg); +} + export interface TurnMetrics { turnId: string; inputTokens: number; @@ -20,6 +25,12 @@ export interface PromptMetrics { combinedTokensPerSec: number; totalDurationMs: number; timeToFirstTokenMs?: number; + rawTimestamps?: { + ttftMs?: number; + allTtftMs?: number[]; + generationDurationMs?: number; + turns: Array<{ turnId: string; durationMs: number; ttftMs?: number }>; + }; turns: TurnMetrics[]; } @@ -36,6 +47,12 @@ export interface MetricLogEntry { combinedTokensPerSec: number; totalDurationMs: number; timeToFirstTokenMs?: number; + rawTimestamps?: { + ttftMs?: number; + allTtftMs?: number[]; + generationDurationMs?: number; + turns: Array<{ turnId: string; durationMs: number; ttftMs?: number }>; + }; } /** @@ -79,6 +96,7 @@ export function aggregatePromptMetrics(params: { generationTokensPerSec: 0, combinedTokensPerSec: 0, totalDurationMs: 0, + rawTimestamps: { turns: [] }, turns: [], }; } @@ -92,18 +110,19 @@ export function aggregatePromptMetrics(params: { const 
totalDurationMs = turnMetrics.reduce((sum, t) => sum + t.durationMs, 0); const totalDurationSec = totalDurationMs / 1000; - // Time to first token is from the first turn - const timeToFirstTokenMs = turnMetrics[0]?.timeToFirstTokenMs; + // Collect per-turn TTFTs; prefill boundary is the first turn's TTFT + const ttftValues = turnMetrics.map(t => t.timeToFirstTokenMs).filter((t): t is number => t !== undefined && t >= 0); + const firstTurnTtftMs = ttftValues.length > 0 ? ttftValues[0] : undefined; // Calculate tokens per second - // Prefill: input tokens / TTFT duration (prefill phase) - // Generation: output tokens / (totalDuration - TTFT) (generation phase) + // Prefill: input tokens / first-turn TTFT (prefill happens once at the start) + // Generation: output tokens / (totalDuration - firstTurnTTFT) (generation phase) // Combined: total tokens / total duration - // When TTFT is unavailable, prefill and generation phases cannot be separated, + // When first-turn TTFT is unavailable, prefill and generation phases cannot be separated, // so we set them to 0 and only report combined. - const ttftSec = timeToFirstTokenMs !== undefined ? timeToFirstTokenMs / 1000 : undefined; - const generationDurationSec = timeToFirstTokenMs !== undefined - ? (totalDurationMs - timeToFirstTokenMs) / 1000 + const ttftSec = firstTurnTtftMs !== undefined ? firstTurnTtftMs / 1000 : undefined; + const generationDurationSec = firstTurnTtftMs !== undefined + ? (totalDurationMs - firstTurnTtftMs) / 1000 : undefined; const prefillTokensPerSec = (ttftSec && ttftSec > 0) ? inputTokens / ttftSec : 0; @@ -112,6 +131,13 @@ export function aggregatePromptMetrics(params: { : 0; const combinedTokensPerSec = totalDurationSec > 0 ? 
totalTokens / totalDurationSec : 0; + // Sanity check: flag physically impossible generation speeds + if (generationTokensPerSec > 500) { + warn( + `[metrics] Suspicious generation speed: ${generationTokensPerSec.toFixed(1)} tok/s (input=${inputTokens}, output=${outputTokens}, totalDuration=${totalDurationMs}ms, TTFT=${firstTurnTtftMs}ms)` + ); + } + return { provider, model, @@ -123,7 +149,13 @@ export function aggregatePromptMetrics(params: { generationTokensPerSec, combinedTokensPerSec, totalDurationMs, - timeToFirstTokenMs, + timeToFirstTokenMs: firstTurnTtftMs, + rawTimestamps: { + ttftMs: firstTurnTtftMs, + allTtftMs: ttftValues, + generationDurationMs: generationDurationSec !== undefined ? generationDurationSec * 1000 : undefined, + turns: turnMetrics.map(t => ({ turnId: t.turnId, durationMs: t.durationMs, ttftMs: t.timeToFirstTokenMs })), + }, turns: turnMetrics, }; } @@ -197,5 +229,6 @@ export function toLogEntry(metrics: PromptMetrics): MetricLogEntry { combinedTokensPerSec: Math.round(metrics.combinedTokensPerSec * 100) / 100, totalDurationMs: metrics.totalDurationMs, timeToFirstTokenMs: metrics.timeToFirstTokenMs, + rawTimestamps: metrics.rawTimestamps, }; } \ No newline at end of file diff --git a/packages/pi-llm-performance/src/llm-performance-metrics.test.ts b/packages/pi-llm-performance/src/llm-performance-metrics.test.ts deleted file mode 100644 index 18d816e..0000000 --- a/packages/pi-llm-performance/src/llm-performance-metrics.test.ts +++ /dev/null @@ -1,398 +0,0 @@ -import { - calculateTurnMetrics, - aggregatePromptMetrics, - formatMetricsForDisplay, - toLogEntry, - type TurnMetrics, - type PromptMetrics, -} from "./llm-metrics-core.ts"; -import { assertEquals, assertGreaterOrEqual, assertLessOrEqual } from "jsr:@std/assert"; - -Deno.test("calculateTurnMetrics - creates turn metrics object", () => { - const result = calculateTurnMetrics({ - turnId: "turn-1", - inputTokens: 100, - outputTokens: 50, - durationMs: 2000, - timeToFirstTokenMs: 500, - 
}); - - assertEquals(result.turnId, "turn-1"); - assertEquals(result.inputTokens, 100); - assertEquals(result.outputTokens, 50); - assertEquals(result.durationMs, 2000); - assertEquals(result.timeToFirstTokenMs, 500); -}); - -Deno.test("calculateTurnMetrics - handles missing timeToFirstToken", () => { - const result = calculateTurnMetrics({ - turnId: "turn-1", - inputTokens: 100, - outputTokens: 50, - durationMs: 2000, - }); - - assertEquals(result.timeToFirstTokenMs, undefined); -}); - -Deno.test("aggregatePromptMetrics - aggregates single turn", () => { - const turnMetrics: TurnMetrics[] = [ - { - turnId: "turn-1", - inputTokens: 1000, - outputTokens: 200, - durationMs: 5000, - timeToFirstTokenMs: 800, - }, - ]; - - const result = aggregatePromptMetrics({ - provider: "anthropic", - model: "claude-sonnet-4", - turnMetrics, - }); - - assertEquals(result.provider, "anthropic"); - assertEquals(result.model, "claude-sonnet-4"); - assertEquals(result.turnCount, 1); - assertEquals(result.inputTokens, 1000); - assertEquals(result.outputTokens, 200); - assertEquals(result.totalTokens, 1200); - assertEquals(result.totalDurationMs, 5000); - assertEquals(result.timeToFirstTokenMs, 800); - - // Tokens per second calculations - // prefill: 1000 input tokens / 0.8s TTFT = 1250 tok/s - assertEquals(result.prefillTokensPerSec, 1250); - // generation: 200 output tokens / 4.2s (5s - 0.8s) = 47.62 tok/s - assertGreaterOrEqual(result.generationTokensPerSec, 47.6); - assertLessOrEqual(result.generationTokensPerSec, 47.7); - // combined: 1200 total tokens / 5s = 240 tok/s - assertEquals(result.combinedTokensPerSec, 240); -}); - -Deno.test("aggregatePromptMetrics - aggregates multiple turns", () => { - const turnMetrics: TurnMetrics[] = [ - { - turnId: "turn-1", - inputTokens: 1000, - outputTokens: 200, - durationMs: 3000, - timeToFirstTokenMs: 800, - }, - { - turnId: "turn-2", - inputTokens: 500, - outputTokens: 150, - durationMs: 2000, - }, - { - turnId: "turn-3", - inputTokens: 300, 
- outputTokens: 100, - durationMs: 1500, - }, - ]; - - const result = aggregatePromptMetrics({ - provider: "openai", - model: "gpt-4o", - turnMetrics, - }); - - assertEquals(result.turnCount, 3); - assertEquals(result.inputTokens, 1800); // 1000 + 500 + 300 - assertEquals(result.outputTokens, 450); // 200 + 150 + 100 - assertEquals(result.totalTokens, 2250); - assertEquals(result.totalDurationMs, 6500); // 3000 + 2000 + 1500 - assertEquals(result.timeToFirstTokenMs, 800); // From first turn only - - // Tokens per second: prefill uses TTFT (0.8s), generation uses (total - TTFT) = 5.7s - // prefill: 1800 / 0.8 = 2250 tok/s - assertEquals(result.prefillTokensPerSec, 2250); - // generation: 450 / 5.7 = 78.95 tok/s - assertGreaterOrEqual(result.generationTokensPerSec, 78.9); - assertLessOrEqual(result.generationTokensPerSec, 79.0); - // combined: 2250 / 6.5 = 346.15 tok/s - assertGreaterOrEqual(result.combinedTokensPerSec, 346.1); - assertLessOrEqual(result.combinedTokensPerSec, 346.2); -}); - -Deno.test("aggregatePromptMetrics - handles empty turn list", () => { - const result = aggregatePromptMetrics({ - provider: "anthropic", - model: "claude-sonnet-4", - turnMetrics: [], - }); - - assertEquals(result.turnCount, 0); - assertEquals(result.inputTokens, 0); - assertEquals(result.outputTokens, 0); - assertEquals(result.totalTokens, 0); - assertEquals(result.prefillTokensPerSec, 0); - assertEquals(result.generationTokensPerSec, 0); - assertEquals(result.combinedTokensPerSec, 0); - assertEquals(result.totalDurationMs, 0); - assertEquals(result.timeToFirstTokenMs, undefined); -}); - -Deno.test("formatMetricsForDisplay - formats single turn metrics", () => { - const metrics: PromptMetrics = { - provider: "anthropic", - model: "claude-sonnet-4", - turnCount: 1, - inputTokens: 1250, - outputTokens: 342, - totalTokens: 1592, - prefillTokensPerSec: 482.1, - generationTokensPerSec: 18.3, - combinedTokensPerSec: 38.0, - totalDurationMs: 21600, - timeToFirstTokenMs: 850, - turns: 
[], - }; - - const display = formatMetricsForDisplay(metrics); - - assertEquals(display.includes("anthropic/claude-sonnet-4"), true); - assertEquals(display.includes("1,250 tokens"), true); - assertEquals(display.includes("482.1 tok/s"), true); - assertEquals(display.includes("342 tokens"), true); - assertEquals(display.includes("18.3 tok/s"), true); - assertEquals(display.includes("1,592 tokens"), true); - assertEquals(display.includes("38.0 tok/s"), true); - assertEquals(display.includes("21.6s"), true); - assertEquals(display.includes("TTFT: 850ms"), true); -}); - -Deno.test("formatMetricsForDisplay - formats duration as minutes when over 60s", () => { - const metrics: PromptMetrics = { - provider: "openai", - model: "gpt-4o", - turnCount: 1, - inputTokens: 5000, - outputTokens: 1000, - totalTokens: 6000, - prefillTokensPerSec: 50, - generationTokensPerSec: 10, - combinedTokensPerSec: 60, - totalDurationMs: 120000, // 2 minutes - timeToFirstTokenMs: 1500, - turns: [], - }; - - const display = formatMetricsForDisplay(metrics); - - assertEquals(display.includes("2.0m"), true); -}); - -Deno.test("formatMetricsForDisplay - omits turn count when single turn", () => { - const metrics: PromptMetrics = { - provider: "anthropic", - model: "claude-sonnet-4", - turnCount: 1, - inputTokens: 100, - outputTokens: 50, - totalTokens: 150, - prefillTokensPerSec: 20, - generationTokensPerSec: 10, - combinedTokensPerSec: 30, - totalDurationMs: 5000, - timeToFirstTokenMs: 500, - turns: [], - }; - - const display = formatMetricsForDisplay(metrics); - - assertEquals(display.includes("Turns: 1"), false); -}); - -Deno.test("formatMetricsForDisplay - omits prefill/generation when TTFT is unavailable", () => { - const metrics: PromptMetrics = { - provider: "openai", - model: "gpt-4o", - turnCount: 1, - inputTokens: 1000, - outputTokens: 200, - totalTokens: 1200, - prefillTokensPerSec: 0, - generationTokensPerSec: 0, - combinedTokensPerSec: 240, - totalDurationMs: 5000, - 
timeToFirstTokenMs: undefined, - turns: [], - }; - - const display = formatMetricsForDisplay(metrics); - - assertEquals(display.includes("Prefill:"), false); - assertEquals(display.includes("Generation:"), false); - assertEquals(display.includes("1,200 tokens"), true); - assertEquals(display.includes("240.0 tok/s"), true); -}); - -Deno.test("formatMetricsForDisplay - shows turn count when multiple turns", () => { - const metrics: PromptMetrics = { - provider: "anthropic", - model: "claude-sonnet-4", - turnCount: 3, - inputTokens: 100, - outputTokens: 50, - totalTokens: 150, - prefillTokensPerSec: 20, - generationTokensPerSec: 10, - combinedTokensPerSec: 30, - totalDurationMs: 5000, - timeToFirstTokenMs: 500, - turns: [], - }; - - const display = formatMetricsForDisplay(metrics); - - assertEquals(display.includes("Turns: 3"), true); -}); - -Deno.test("toLogEntry - creates JSON-serializable log entry", () => { - const metrics: PromptMetrics = { - provider: "anthropic", - model: "claude-sonnet-4", - turnCount: 2, - inputTokens: 1250, - outputTokens: 342, - totalTokens: 1592, - prefillTokensPerSec: 482.12345, - generationTokensPerSec: 18.34567, - combinedTokensPerSec: 38.09876, - totalDurationMs: 21600, - timeToFirstTokenMs: 850, - turns: [], - }; - - const logEntry = toLogEntry(metrics); - - assertEquals(logEntry.provider, "anthropic"); - assertEquals(logEntry.model, "claude-sonnet-4"); - assertEquals(logEntry.turnCount, 2); - assertEquals(logEntry.inputTokens, 1250); - assertEquals(logEntry.outputTokens, 342); - assertEquals(logEntry.totalTokens, 1592); - // Rounded to 2 decimal places - assertEquals(logEntry.prefillTokensPerSec, 482.12); - assertEquals(logEntry.generationTokensPerSec, 18.35); - assertEquals(logEntry.combinedTokensPerSec, 38.1); - assertEquals(logEntry.totalDurationMs, 21600); - assertEquals(logEntry.timeToFirstTokenMs, 850); - - // Should have ISO timestamp - assertEquals(logEntry.timestamp.includes("T"), true); - 
assertEquals(logEntry.timestamp.includes("Z"), true); - - // Should be JSON serializable - const json = JSON.stringify(logEntry); - assertEquals(json.length > 0, true); - const parsed = JSON.parse(json); - assertEquals(parsed.provider, "anthropic"); -}); - -Deno.test("aggregatePromptMetrics - uses full duration when TTFT is undefined", () => { - const turnMetrics: TurnMetrics[] = [ - { - turnId: "turn-1", - inputTokens: 1000, - outputTokens: 200, - durationMs: 5000, - // No timeToFirstTokenMs - }, - ]; - - const result = aggregatePromptMetrics({ - provider: "openai", - model: "gpt-4o", - turnMetrics, - }); - - assertEquals(result.turnCount, 1); - assertEquals(result.inputTokens, 1000); - assertEquals(result.outputTokens, 200); - // Without TTFT, prefill and generation rates are 0 (cannot separate phases) - // Only combined rate is meaningful - assertEquals(result.prefillTokensPerSec, 0); - assertEquals(result.generationTokensPerSec, 0); - assertEquals(result.combinedTokensPerSec, 240); -}); - -Deno.test("toLogEntry - handles missing timeToFirstToken", () => { - const metrics: PromptMetrics = { - provider: "anthropic", - model: "claude-sonnet-4", - turnCount: 1, - inputTokens: 100, - outputTokens: 50, - totalTokens: 150, - prefillTokensPerSec: 20, - generationTokensPerSec: 10, - combinedTokensPerSec: 30, - totalDurationMs: 5000, - timeToFirstTokenMs: undefined, - turns: [], - }; - - const logEntry = toLogEntry(metrics); - - assertEquals(logEntry.timeToFirstTokenMs, undefined); -}); - -Deno.test("Integration - full flow from turns to log entry", () => { - // Simulate a real scenario with multiple turns - const turn1 = calculateTurnMetrics({ - turnId: "turn-1", - inputTokens: 2000, - outputTokens: 500, - durationMs: 8000, - timeToFirstTokenMs: 1200, - }); - - const turn2 = calculateTurnMetrics({ - turnId: "turn-2", - inputTokens: 800, - outputTokens: 200, - durationMs: 3000, - }); - - const promptMetrics = aggregatePromptMetrics({ - provider: "groq", - model: 
"llama-3.1-70b", - turnMetrics: [turn1, turn2], - }); - - const display = formatMetricsForDisplay(promptMetrics); - const logEntry = toLogEntry(promptMetrics); - - // Verify aggregation - assertEquals(promptMetrics.turnCount, 2); - assertEquals(promptMetrics.inputTokens, 2800); - assertEquals(promptMetrics.outputTokens, 700); - assertEquals(promptMetrics.totalTokens, 3500); - assertEquals(promptMetrics.totalDurationMs, 11000); - assertEquals(promptMetrics.timeToFirstTokenMs, 1200); - - // Verify corrected rate calculations - // prefill: 2800 / 1.2 = 2333.33 tok/s - assertGreaterOrEqual(promptMetrics.prefillTokensPerSec, 2333.3); - assertLessOrEqual(promptMetrics.prefillTokensPerSec, 2333.4); - // generation: 700 / 9.8 = 71.43 tok/s - assertGreaterOrEqual(promptMetrics.generationTokensPerSec, 71.4); - assertLessOrEqual(promptMetrics.generationTokensPerSec, 71.5); - // combined: 3500 / 11 = 318.18 tok/s - assertGreaterOrEqual(promptMetrics.combinedTokensPerSec, 318.1); - assertLessOrEqual(promptMetrics.combinedTokensPerSec, 318.2); - - // Verify display contains key info - assertEquals(display.includes("groq/llama-3.1-70b"), true); - assertEquals(display.includes("TTFT: 1200ms"), true); - - // Verify log entry - assertEquals(logEntry.provider, "groq"); - assertEquals(logEntry.model, "llama-3.1-70b"); - assertEquals(logEntry.turnCount, 2); -}); \ No newline at end of file diff --git a/packages/pi-llm-performance/src/llm-performance-metrics.ts b/packages/pi-llm-performance/src/llm-performance-metrics.ts index 31bde9a..309043b 100644 --- a/packages/pi-llm-performance/src/llm-performance-metrics.ts +++ b/packages/pi-llm-performance/src/llm-performance-metrics.ts @@ -25,7 +25,7 @@ let promptStartMs: number | undefined; let currentTurnStartMs: number | undefined; let currentTurnId: string | undefined; let turnMetrics: TurnMetrics[] = []; -let firstTokenTimeMs: number | undefined; +let currentTurnFirstTokenMs: number | undefined; // Per-turn TTFT let provider: string | 
undefined; let model: string | undefined; @@ -36,7 +36,7 @@ export default function (pi: ExtensionAPI) { if (!ctx.model) return; promptStartMs = Date.now(); turnMetrics = []; - firstTokenTimeMs = undefined; + currentTurnFirstTokenMs = undefined; provider = ctx.model.provider; model = ctx.model.id; }); @@ -44,12 +44,13 @@ export default function (pi: ExtensionAPI) { pi.on("turn_start", async (event, _ctx) => { currentTurnStartMs = Date.now(); currentTurnId = `turn-${event.turnIndex}`; + currentTurnFirstTokenMs = undefined; // Reset TTFT for this turn }); pi.on("message_update", async (event, _ctx) => { - // Capture TTFT on first token - if (firstTokenTimeMs === undefined && event.assistantMessageEvent?.type === "text_delta") { - firstTokenTimeMs = Date.now(); + // Capture per-turn TTFT on first token + if (currentTurnFirstTokenMs === undefined && event.assistantMessageEvent?.type === "text_delta") { + currentTurnFirstTokenMs = Date.now(); } }); @@ -58,8 +59,8 @@ export default function (pi: ExtensionAPI) { const inputTokens = event.message.usage?.input ?? 0; const outputTokens = event.message.usage?.output ?? 0; const durationMs = currentTurnStartMs ? Date.now() - currentTurnStartMs : 0; - const ttftMs = currentTurnId === `turn-${event.turnIndex}` && firstTokenTimeMs && currentTurnStartMs - ? firstTokenTimeMs - currentTurnStartMs + const ttftMs = currentTurnFirstTokenMs && currentTurnStartMs + ? currentTurnFirstTokenMs - currentTurnStartMs : undefined; const turnMetric = calculateTurnMetrics({ @@ -95,6 +96,6 @@ export default function (pi: ExtensionAPI) { // Reset state promptStartMs = undefined; turnMetrics = []; - firstTokenTimeMs = undefined; + currentTurnFirstTokenMs = undefined; }); } \ No newline at end of file