pi-performance: Make Time to first token more accurate.
Summary of changes:
┌──────┬──────────────────────────────────────────────────────────────────┬──────────┐
│ Step │ Change │ Result │
├──────┼──────────────────────────────────────────────────────────────────┼──────────┤
│ 1 │ Removed duplicate llm-performance-metrics.test.ts │ 14 tests │
├──────┼──────────────────────────────────────────────────────────────────┼──────────┤
│ 2 │ Added rawTimestamps assertions to toLogEntry test │ 14 tests │
├──────┼──────────────────────────────────────────────────────────────────┼──────────┤
│ 3 │ Added rawTimestamps assertions to single-turn aggregate test │ 14 tests │
├──────┼──────────────────────────────────────────────────────────────────┼──────────┤
│ 4 │ Added rawTimestamps assertions to multi-turn aggregate test │ 14 tests │
├──────┼──────────────────────────────────────────────────────────────────┼──────────┤
│ 5 │ Added negative TTFT filtering test │ 15 tests │
├──────┼──────────────────────────────────────────────────────────────────┼──────────┤
│ 6 │ Added "first turn missing TTFT, later turns have it" test │ 16 tests │
├──────┼──────────────────────────────────────────────────────────────────┼──────────┤
│ 7 │ Added sanity check tests (warn on >500 tok/s, no warn otherwise) │ 18 tests │
└──────┴──────────────────────────────────────────────────────────────────┴──────────┘
This is what it looks like now when I run `pi`
📊 Performance: llama.cpp/Qwen3.6-35B-A3B-MXFP4_MOE.gguf
Prefill: 15,460 tokens @ 20104.0 tok/s
Generation: 12,179 tokens @ 52.6 tok/s
Combined: 27,639 tokens @ 118.9 tok/s (3.9m total)
TTFT: 769ms
Turns: 36
This commit is contained in:
parent
a38c76c65e
commit
98e18643c5
@ -1 +1,13 @@
|
|||||||
{"timestamp":"2026-04-28T08:58:29.989Z","provider":"llama.cpp","model":"Qwen3.6-35B-A3B-MXFP4_MOE.gguf","turnCount":6,"inputTokens":8294,"outputTokens":1356,"totalTokens":9650,"prefillTokensPerSec":1925.26,"generationTokensPerSec":42.55,"combinedTokensPerSec":266.77,"totalDurationMs":36174,"timeToFirstTokenMs":4308}
|
{"timestamp":"2026-04-28T08:58:29.989Z","provider":"llama.cpp","model":"Qwen3.6-35B-A3B-MXFP4_MOE.gguf","turnCount":6,"inputTokens":8294,"outputTokens":1356,"totalTokens":9650,"prefillTokensPerSec":1925.26,"generationTokensPerSec":42.55,"combinedTokensPerSec":266.77,"totalDurationMs":36174,"timeToFirstTokenMs":4308}
|
||||||
|
{"timestamp":"2026-04-28T09:16:50.070Z","provider":"llama.cpp","model":"Qwen3.6-35B-A3B-MXFP4_MOE.gguf","turnCount":1,"inputTokens":25,"outputTokens":39,"totalTokens":64,"prefillTokensPerSec":46.38,"generationTokensPerSec":89.86,"combinedTokensPerSec":65.78,"totalDurationMs":973,"timeToFirstTokenMs":539}
|
||||||
|
{"timestamp":"2026-04-28T09:19:10.796Z","provider":"llama.cpp","model":"Qwen3.6-35B-A3B-MXFP4_MOE.gguf","turnCount":2,"inputTokens":1146,"outputTokens":911,"totalTokens":2057,"prefillTokensPerSec":0,"generationTokensPerSec":0,"combinedTokensPerSec":104.65,"totalDurationMs":19656}
|
||||||
|
{"timestamp":"2026-04-28T09:20:38.777Z","provider":"llama.cpp","model":"Qwen3.6-35B-A3B-MXFP4_MOE.gguf","turnCount":2,"inputTokens":1704,"outputTokens":1466,"totalTokens":3170,"prefillTokensPerSec":0,"generationTokensPerSec":0,"combinedTokensPerSec":98.11,"totalDurationMs":32311}
|
||||||
|
{"timestamp":"2026-04-28T09:26:23.124Z","provider":"llama.cpp","model":"Qwen3.6-35B-A3B-MXFP4_MOE.gguf","turnCount":4,"inputTokens":2258,"outputTokens":2984,"totalTokens":5242,"prefillTokensPerSec":1019.41,"generationTokensPerSec":54.73,"combinedTokensPerSec":92.39,"totalDurationMs":56736,"timeToFirstTokenMs":2215}
|
||||||
|
{"timestamp":"2026-04-28T09:28:34.906Z","provider":"llama.cpp","model":"Qwen3.6-35B-A3B-MXFP4_MOE.gguf","turnCount":3,"inputTokens":976,"outputTokens":1434,"totalTokens":2410,"prefillTokensPerSec":0,"generationTokensPerSec":0,"combinedTokensPerSec":72.74,"totalDurationMs":33130,"rawTimestamps":{"turns":[{"turnId":"turn-0","durationMs":5261},{"turnId":"turn-1","durationMs":23398,"ttftMs":3008},{"turnId":"turn-2","durationMs":4471,"ttftMs":-20390}]}}
|
||||||
|
{"timestamp":"2026-04-28T09:32:02.665Z","provider":"llama.cpp","model":"Qwen3.6-35B-A3B-MXFP4_MOE.gguf","turnCount":5,"inputTokens":2780,"outputTokens":4488,"totalTokens":7268,"prefillTokensPerSec":0,"generationTokensPerSec":0,"combinedTokensPerSec":73.68,"totalDurationMs":98638,"rawTimestamps":{"turns":[{"turnId":"turn-0","durationMs":1617},{"turnId":"turn-1","durationMs":23911,"ttftMs":19482},{"turnId":"turn-2","durationMs":62977,"ttftMs":-4429},{"turnId":"turn-3","durationMs":2504,"ttftMs":-67406},{"turnId":"turn-4","durationMs":7629,"ttftMs":-69910}]}}
|
||||||
|
{"timestamp":"2026-04-28T09:33:30.666Z","provider":"llama.cpp","model":"Qwen3.6-35B-A3B-MXFP4_MOE.gguf","turnCount":2,"inputTokens":1496,"outputTokens":291,"totalTokens":1787,"prefillTokensPerSec":0,"generationTokensPerSec":0,"combinedTokensPerSec":59.38,"totalDurationMs":30092,"rawTimestamps":{"turns":[{"turnId":"turn-0","durationMs":5111},{"turnId":"turn-1","durationMs":24981,"ttftMs":12129}]}}
|
||||||
|
{"timestamp":"2026-04-28T09:36:38.399Z","provider":"llama.cpp","model":"Qwen3.6-35B-A3B-MXFP4_MOE.gguf","turnCount":10,"inputTokens":5923,"outputTokens":4327,"totalTokens":10250,"prefillTokensPerSec":245.08,"generationTokensPerSec":34.94,"combinedTokensPerSec":69.25,"totalDurationMs":148018,"timeToFirstTokenMs":24168,"rawTimestamps":{"ttftMs":24168,"generationDurationMs":123850,"turns":[{"turnId":"turn-0","durationMs":30016,"ttftMs":24168},{"turnId":"turn-1","durationMs":10759},{"turnId":"turn-2","durationMs":6510},{"turnId":"turn-3","durationMs":48120,"ttftMs":21676},{"turnId":"turn-4","durationMs":11553,"ttftMs":5626},{"turnId":"turn-5","durationMs":2866},{"turnId":"turn-6","durationMs":10915,"ttftMs":7202},{"turnId":"turn-7","durationMs":8631,"ttftMs":3181},{"turnId":"turn-8","durationMs":4935},{"turnId":"turn-9","durationMs":13713,"ttftMs":6306}]}}
|
||||||
|
{"timestamp":"2026-04-28T09:38:54.862Z","provider":"llama.cpp","model":"Qwen3.6-35B-A3B-MXFP4_MOE.gguf","turnCount":2,"inputTokens":8054,"outputTokens":1145,"totalTokens":9199,"prefillTokensPerSec":235.91,"generationTokensPerSec":78.04,"combinedTokensPerSec":188.46,"totalDurationMs":48812,"timeToFirstTokenMs":34140,"rawTimestamps":{"ttftMs":34140,"allTtftMs":[34140],"generationDurationMs":14672,"turns":[{"turnId":"turn-0","durationMs":3559},{"turnId":"turn-1","durationMs":45253,"ttftMs":34140}]}}
|
||||||
|
{"timestamp":"2026-04-28T09:40:15.439Z","provider":"llama.cpp","model":"Qwen3.6-35B-A3B-MXFP4_MOE.gguf","turnCount":2,"inputTokens":97,"outputTokens":733,"totalTokens":830,"prefillTokensPerSec":63.19,"generationTokensPerSec":46.13,"combinedTokensPerSec":47.63,"totalDurationMs":17425,"timeToFirstTokenMs":1535,"rawTimestamps":{"ttftMs":1535,"allTtftMs":[1535,1095],"generationDurationMs":15890,"turns":[{"turnId":"turn-0","durationMs":8515,"ttftMs":1535},{"turnId":"turn-1","durationMs":8910,"ttftMs":1095}]}}
|
||||||
|
{"timestamp":"2026-04-28T09:41:02.168Z","provider":"llama.cpp","model":"Qwen3.6-35B-A3B-MXFP4_MOE.gguf","turnCount":1,"inputTokens":33,"outputTokens":386,"totalTokens":419,"prefillTokensPerSec":5.11,"generationTokensPerSec":135.87,"combinedTokensPerSec":45.06,"totalDurationMs":9298,"timeToFirstTokenMs":6457,"rawTimestamps":{"ttftMs":6457,"allTtftMs":[6457],"generationDurationMs":2841,"turns":[{"turnId":"turn-0","durationMs":9298,"ttftMs":6457}]}}
|
||||||
|
{"timestamp":"2026-04-28T09:47:46.401Z","provider":"llama.cpp","model":"Qwen3.6-35B-A3B-MXFP4_MOE.gguf","turnCount":36,"inputTokens":15460,"outputTokens":12179,"totalTokens":27639,"prefillTokensPerSec":20104.03,"generationTokensPerSec":52.58,"combinedTokensPerSec":118.93,"totalDurationMs":232395,"timeToFirstTokenMs":769,"rawTimestamps":{"ttftMs":769,"allTtftMs":[769,3069,6075,3956,3658,3607,3823,3913,9797,0,0,1,13456],"generationDurationMs":231626,"turns":[{"turnId":"turn-0","durationMs":2786,"ttftMs":769},{"turnId":"turn-1","durationMs":2727},{"turnId":"turn-2","durationMs":2103},{"turnId":"turn-3","durationMs":2530},{"turnId":"turn-4","durationMs":2754},{"turnId":"turn-5","durationMs":3357},{"turnId":"turn-6","durationMs":10489,"ttftMs":3069},{"turnId":"turn-7","durationMs":2625},{"turnId":"turn-8","durationMs":15034,"ttftMs":6075},{"turnId":"turn-9","durationMs":2817},{"turnId":"turn-10","durationMs":14176,"ttftMs":3956},{"turnId":"turn-11","durationMs":2537},{"turnId":"turn-12","durationMs":16864,"ttftMs":3658},{"turnId":"turn-13","durationMs":3137},{"turnId":"turn-14","durationMs":18123,"ttftMs":3607},{"turnId":"turn-15","durationMs":2307},{"turnId":"turn-16","durationMs":17842,"ttftMs":3823},{"turnId":"turn-17","durationMs":2287},{"turnId":"turn-18","durationMs":10719,"ttftMs":3913},{"turnId":"turn-19","durationMs":24735},{"turnId":"turn-20","durationMs":2656},{"turnId":"turn-21","durationMs":15351,"ttftMs":9797},{"turnId":"turn-22","durationMs":22251},{"turnId":"turn-23","durationMs":4702},{"turnId":"turn-24","durationMs":2,"ttftMs":0},{"turnId":"turn-25","durationMs":0},{"turnId":"turn-26","durationMs":1},{"turnId":"turn-27","durationMs":0,"ttftMs":0},{"turnId":"turn-28","durationMs":1},{"turnId":"turn-29","durationMs":1},{"turnId":"turn-30","durationMs":0},{"turnId":"turn-31","durationMs":1},{"turnId":"turn-32","durationMs":1,"ttftMs":1},{"turnId":"turn-33","durationMs":1},{"turnId":"turn-34","durationMs":0},{"turnId":"turn-35","durationMs":27478,"ttftMs":13456}]}}
|
||||||
|
|||||||
@ -1,6 +1,84 @@
|
|||||||
# pi-llm-performance
|
# pi-llm-performance
|
||||||
|
|
||||||
LLM performance metrics extension — captures and displays TTFT, prefill, and generation speeds from pi agent turns.
|
Pi coding agent extension that captures and displays LLM inference performance metrics.
|
||||||
|
|
||||||
|
## Why
|
||||||
|
|
||||||
|
Understanding model performance helps you:
|
||||||
|
|
||||||
|
- **Compare models** — measure throughput differences between providers and model sizes
|
||||||
|
- **Debug slowdowns** — spot when prefill or generation degrades unexpectedly
|
||||||
|
- **Validate hardware** — confirm your setup delivers expected token throughput
|
||||||
|
- **Tune parameters** — evaluate the impact of speculative decoding, context window size, etc.
|
||||||
|
|
||||||
|
## What it measures
|
||||||
|
|
||||||
|
| Metric | Description |
|
||||||
|
|--------|-------------|
|
||||||
|
| **TTFT** | Time to first token (ms) — how long before you see output |
|
||||||
|
| **Prefill speed** | Input tokens processed per second during the prefill phase |
|
||||||
|
| **Generation speed** | Output tokens generated per second during the generation phase |
|
||||||
|
| **Combined speed** | Total tokens (input + output) per second across the full prompt |
|
||||||
|
|
||||||
|
## How it works
|
||||||
|
|
||||||
|
The extension hooks into pi's agent lifecycle events:
|
||||||
|
|
||||||
|
| Event | Behavior |
|
||||||
|
|-------|----------|
|
||||||
|
| `agent_start` | Records provider/model, resets counters |
|
||||||
|
| `turn_start` | Marks turn boundary |
|
||||||
|
| `message_update` | Captures TTFT on first token delta |
|
||||||
|
| `turn_end` | Records token counts and turn duration |
|
||||||
|
| `agent_end` | Aggregates metrics, displays in TUI, logs to `.pi/llm-metrics.log` |
|
||||||
|
|
||||||
|
## Output
|
||||||
|
|
||||||
|
### TUI notification
|
||||||
|
|
||||||
|
After each prompt completes, a notification shows:
|
||||||
|
|
||||||
|
```
|
||||||
|
📊 Performance: llama.cpp/Qwen3.6-35B-A3B-MXFP4_MOE.gguf
|
||||||
|
Prefill: 1,240 tokens @ 68.3 tok/s
|
||||||
|
Generation: 312 tokens @ 89.9 tok/s
|
||||||
|
Combined: 1,552 tokens @ 78.4 tok/s (19.8s total)
|
||||||
|
TTFT: 1250ms
|
||||||
|
```
|
||||||
|
|
||||||
|
### Status bar
|
||||||
|
|
||||||
|
The footer status shows combined throughput: `📊 78.4 tok/s`
|
||||||
|
|
||||||
|
### Log file
|
||||||
|
|
||||||
|
Each prompt writes a JSONL entry to `.pi/llm-metrics.log`:
|
||||||
|
|
||||||
|
```json
|
||||||
|
{
|
||||||
|
"timestamp": "2026-04-28T10:05:00.000Z",
|
||||||
|
"provider": "llama.cpp",
|
||||||
|
"model": "Qwen3.6-35B-A3B-MXFP4_MOE.gguf",
|
||||||
|
"turnCount": 1,
|
||||||
|
"inputTokens": 1240,
|
||||||
|
"outputTokens": 312,
|
||||||
|
"totalTokens": 1552,
|
||||||
|
"prefillTokensPerSec": 68.3,
|
||||||
|
"generationTokensPerSec": 89.9,
|
||||||
|
"combinedTokensPerSec": 78.4,
|
||||||
|
"totalDurationMs": 19800,
|
||||||
|
"timeToFirstTokenMs": 1250,
|
||||||
|
"rawTimestamps": {
|
||||||
|
"ttftMs": 1250,
|
||||||
|
"generationDurationMs": 18550,
|
||||||
|
"turns": [{"turnId": "turn-0", "durationMs": 19800, "ttftMs": 1250}]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
## Sanity checks
|
||||||
|
|
||||||
|
The extension warns to the console if generation speed exceeds 500 tok/s (physically impossible for any known model/hardware setup). This helps catch timing bugs early.
|
||||||
|
|
||||||
## Development
|
## Development
|
||||||
|
|
||||||
@ -11,24 +89,6 @@ pnpm install # workspace setup
|
|||||||
deno test # run tests
|
deno test # run tests
|
||||||
```
|
```
|
||||||
|
|
||||||
## Usage
|
|
||||||
|
|
||||||
Add to your pi settings (`~/.pi/agent/settings.json`):
|
|
||||||
|
|
||||||
```json
|
|
||||||
{
|
|
||||||
"packages": [
|
|
||||||
"../dev/spikes/llm/monotonic-pi-extensions/packages/pi-llm-performance"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
Then reload pi:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
/reload
|
|
||||||
```
|
|
||||||
|
|
||||||
## License
|
## License
|
||||||
|
|
||||||
MIT
|
MIT
|
||||||
|
|||||||
10
packages/pi-llm-performance/deno.lock
generated
10
packages/pi-llm-performance/deno.lock
generated
@ -2,7 +2,9 @@
|
|||||||
"version": "5",
|
"version": "5",
|
||||||
"specifiers": {
|
"specifiers": {
|
||||||
"jsr:@std/assert@*": "1.0.19",
|
"jsr:@std/assert@*": "1.0.19",
|
||||||
"jsr:@std/internal@^1.0.12": "1.0.12"
|
"jsr:@std/assert@^1.0.19": "1.0.19",
|
||||||
|
"jsr:@std/internal@^1.0.12": "1.0.12",
|
||||||
|
"jsr:@std/testing@*": "1.0.18"
|
||||||
},
|
},
|
||||||
"jsr": {
|
"jsr": {
|
||||||
"@std/assert@1.0.19": {
|
"@std/assert@1.0.19": {
|
||||||
@ -13,6 +15,12 @@
|
|||||||
},
|
},
|
||||||
"@std/internal@1.0.12": {
|
"@std/internal@1.0.12": {
|
||||||
"integrity": "972a634fd5bc34b242024402972cd5143eac68d8dffaca5eaa4dba30ce17b027"
|
"integrity": "972a634fd5bc34b242024402972cd5143eac68d8dffaca5eaa4dba30ce17b027"
|
||||||
|
},
|
||||||
|
"@std/testing@1.0.18": {
|
||||||
|
"integrity": "d3152f57b11666bf6358d0e127c7e3488e91178b0c2d8fbf0793e1c53cd13cb1",
|
||||||
|
"dependencies": [
|
||||||
|
"jsr:@std/assert@^1.0.19"
|
||||||
|
]
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"workspace": {
|
"workspace": {
|
||||||
|
|||||||
@ -69,6 +69,11 @@ Deno.test("aggregatePromptMetrics - aggregates single turn", () => {
|
|||||||
assertLessOrEqual(result.generationTokensPerSec, 47.7);
|
assertLessOrEqual(result.generationTokensPerSec, 47.7);
|
||||||
// combined: 1200 total tokens / 5s = 240 tok/s
|
// combined: 1200 total tokens / 5s = 240 tok/s
|
||||||
assertEquals(result.combinedTokensPerSec, 240);
|
assertEquals(result.combinedTokensPerSec, 240);
|
||||||
|
|
||||||
|
// rawTimestamps
|
||||||
|
assertEquals(result.rawTimestamps?.ttftMs, 800);
|
||||||
|
assertEquals(result.rawTimestamps?.allTtftMs, [800]);
|
||||||
|
assertEquals(result.rawTimestamps?.generationDurationMs, 4200);
|
||||||
});
|
});
|
||||||
|
|
||||||
Deno.test("aggregatePromptMetrics - aggregates multiple turns", () => {
|
Deno.test("aggregatePromptMetrics - aggregates multiple turns", () => {
|
||||||
@ -116,6 +121,15 @@ Deno.test("aggregatePromptMetrics - aggregates multiple turns", () => {
|
|||||||
// combined: 2250 / 6.5 = 346.15 tok/s
|
// combined: 2250 / 6.5 = 346.15 tok/s
|
||||||
assertGreaterOrEqual(result.combinedTokensPerSec, 346.1);
|
assertGreaterOrEqual(result.combinedTokensPerSec, 346.1);
|
||||||
assertLessOrEqual(result.combinedTokensPerSec, 346.2);
|
assertLessOrEqual(result.combinedTokensPerSec, 346.2);
|
||||||
|
|
||||||
|
// rawTimestamps: only turn-1 has valid TTFT, turns 2+ have none
|
||||||
|
assertEquals(result.rawTimestamps?.ttftMs, 800);
|
||||||
|
assertEquals(result.rawTimestamps?.allTtftMs, [800]);
|
||||||
|
assertEquals(result.rawTimestamps?.generationDurationMs, 5700);
|
||||||
|
assertEquals(result.rawTimestamps?.turns.length, 3);
|
||||||
|
assertEquals(result.rawTimestamps?.turns[0].ttftMs, 800);
|
||||||
|
assertEquals(result.rawTimestamps?.turns[1].ttftMs, undefined);
|
||||||
|
assertEquals(result.rawTimestamps?.turns[2].ttftMs, undefined);
|
||||||
});
|
});
|
||||||
|
|
||||||
Deno.test("aggregatePromptMetrics - handles empty turn list", () => {
|
Deno.test("aggregatePromptMetrics - handles empty turn list", () => {
|
||||||
@ -252,6 +266,80 @@ Deno.test("formatMetricsForDisplay - shows turn count when multiple turns", () =
|
|||||||
assertEquals(display.includes("Turns: 3"), true);
|
assertEquals(display.includes("Turns: 3"), true);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
Deno.test("aggregatePromptMetrics - uses first valid TTFT when turn-0 has none", () => {
|
||||||
|
// Edge case: turn-0 has no TTFT, turn-1 does. Should use turn-1's TTFT.
|
||||||
|
const turnMetrics: TurnMetrics[] = [
|
||||||
|
{
|
||||||
|
turnId: "turn-0",
|
||||||
|
inputTokens: 1000,
|
||||||
|
outputTokens: 200,
|
||||||
|
durationMs: 3000,
|
||||||
|
// No timeToFirstTokenMs
|
||||||
|
},
|
||||||
|
{
|
||||||
|
turnId: "turn-1",
|
||||||
|
inputTokens: 500,
|
||||||
|
outputTokens: 150,
|
||||||
|
durationMs: 2000,
|
||||||
|
timeToFirstTokenMs: 600,
|
||||||
|
},
|
||||||
|
];
|
||||||
|
|
||||||
|
const result = aggregatePromptMetrics({
|
||||||
|
provider: "llama.cpp",
|
||||||
|
model: "Qwen3.6-35B",
|
||||||
|
turnMetrics,
|
||||||
|
});
|
||||||
|
|
||||||
|
// First valid TTFT is from turn-1 (600ms)
|
||||||
|
assertEquals(result.rawTimestamps?.allTtftMs, [600]);
|
||||||
|
assertEquals(result.rawTimestamps?.ttftMs, 600);
|
||||||
|
// Generation duration = totalDuration - firstValidTTFT = 5000 - 600 = 4400
|
||||||
|
assertEquals(result.rawTimestamps?.generationDurationMs, 4400);
|
||||||
|
// prefill: 1500 / 0.6 = 2500
|
||||||
|
assertEquals(result.prefillTokensPerSec, 2500);
|
||||||
|
// generation: 350 / 4.4 = 79.55
|
||||||
|
assertGreaterOrEqual(result.generationTokensPerSec, 79.5);
|
||||||
|
assertLessOrEqual(result.generationTokensPerSec, 79.6);
|
||||||
|
});
|
||||||
|
|
||||||
|
Deno.test("aggregatePromptMetrics - filters out negative TTFT values", () => {
|
||||||
|
// Simulates the bug where turn-2 got TTFT=-20390 from the old global-firstToken code
|
||||||
|
const turnMetrics: TurnMetrics[] = [
|
||||||
|
{
|
||||||
|
turnId: "turn-0",
|
||||||
|
inputTokens: 1000,
|
||||||
|
outputTokens: 200,
|
||||||
|
durationMs: 3000,
|
||||||
|
timeToFirstTokenMs: 800,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
turnId: "turn-1",
|
||||||
|
inputTokens: 500,
|
||||||
|
outputTokens: 150,
|
||||||
|
durationMs: 2000,
|
||||||
|
timeToFirstTokenMs: -5000, // Invalid: negative
|
||||||
|
},
|
||||||
|
];
|
||||||
|
|
||||||
|
const result = aggregatePromptMetrics({
|
||||||
|
provider: "llama.cpp",
|
||||||
|
model: "Qwen3.6-35B",
|
||||||
|
turnMetrics,
|
||||||
|
});
|
||||||
|
|
||||||
|
// Only turn-0's TTFT (800) should be used; turn-1's negative value is filtered
|
||||||
|
assertEquals(result.rawTimestamps?.allTtftMs, [800]);
|
||||||
|
assertEquals(result.rawTimestamps?.ttftMs, 800);
|
||||||
|
// Generation duration = totalDuration - firstTurnTTFT = 5000 - 800 = 4200
|
||||||
|
assertEquals(result.rawTimestamps?.generationDurationMs, 4200);
|
||||||
|
// prefill: 1500 / 0.8 = 1875
|
||||||
|
assertEquals(result.prefillTokensPerSec, 1875);
|
||||||
|
// generation: 350 / 4.2 = 83.33
|
||||||
|
assertGreaterOrEqual(result.generationTokensPerSec, 83.3);
|
||||||
|
assertLessOrEqual(result.generationTokensPerSec, 83.4);
|
||||||
|
});
|
||||||
|
|
||||||
Deno.test("toLogEntry - creates JSON-serializable log entry", () => {
|
Deno.test("toLogEntry - creates JSON-serializable log entry", () => {
|
||||||
const metrics: PromptMetrics = {
|
const metrics: PromptMetrics = {
|
||||||
provider: "anthropic",
|
provider: "anthropic",
|
||||||
@ -265,6 +353,12 @@ Deno.test("toLogEntry - creates JSON-serializable log entry", () => {
|
|||||||
combinedTokensPerSec: 38.09876,
|
combinedTokensPerSec: 38.09876,
|
||||||
totalDurationMs: 21600,
|
totalDurationMs: 21600,
|
||||||
timeToFirstTokenMs: 850,
|
timeToFirstTokenMs: 850,
|
||||||
|
rawTimestamps: {
|
||||||
|
ttftMs: 850,
|
||||||
|
allTtftMs: [850],
|
||||||
|
generationDurationMs: 20750,
|
||||||
|
turns: [],
|
||||||
|
},
|
||||||
turns: [],
|
turns: [],
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -292,6 +386,72 @@ Deno.test("toLogEntry - creates JSON-serializable log entry", () => {
|
|||||||
assertEquals(json.length > 0, true);
|
assertEquals(json.length > 0, true);
|
||||||
const parsed = JSON.parse(json);
|
const parsed = JSON.parse(json);
|
||||||
assertEquals(parsed.provider, "anthropic");
|
assertEquals(parsed.provider, "anthropic");
|
||||||
|
|
||||||
|
// rawTimestamps should be included
|
||||||
|
assertEquals(logEntry.rawTimestamps?.ttftMs, 850);
|
||||||
|
assertEquals(logEntry.rawTimestamps?.allTtftMs, [850]);
|
||||||
|
assertEquals(logEntry.rawTimestamps?.generationDurationMs, 20750);
|
||||||
|
assertEquals(logEntry.rawTimestamps?.turns.length, 0);
|
||||||
|
});
|
||||||
|
|
||||||
|
Deno.test("aggregatePromptMetrics - warns when generation speed is physically impossible", () => {
|
||||||
|
const originalWarn = console.warn;
|
||||||
|
let warnCall: string | undefined;
|
||||||
|
console.warn = (msg: string) => { warnCall = msg; };
|
||||||
|
|
||||||
|
try {
|
||||||
|
const turnMetrics: TurnMetrics[] = [
|
||||||
|
{
|
||||||
|
turnId: "turn-0",
|
||||||
|
inputTokens: 100,
|
||||||
|
outputTokens: 1000,
|
||||||
|
durationMs: 1000,
|
||||||
|
timeToFirstTokenMs: 100,
|
||||||
|
},
|
||||||
|
];
|
||||||
|
|
||||||
|
aggregatePromptMetrics({
|
||||||
|
provider: "llama.cpp",
|
||||||
|
model: "Qwen3.6-35B",
|
||||||
|
turnMetrics,
|
||||||
|
});
|
||||||
|
|
||||||
|
// generation: 1000 / 0.9 = 1111.11 tok/s > 500
|
||||||
|
assertGreaterOrEqual(warnCall, "");
|
||||||
|
assertEquals(warnCall!.includes("Suspicious generation speed"), true);
|
||||||
|
assertEquals(warnCall!.includes("1111.1 tok/s"), true);
|
||||||
|
assertEquals(warnCall!.includes("output=1000"), true);
|
||||||
|
} finally {
|
||||||
|
console.warn = originalWarn;
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
Deno.test("aggregatePromptMetrics - does not warn for normal speeds", () => {
|
||||||
|
const originalWarn = console.warn;
|
||||||
|
let warnCall: string | undefined;
|
||||||
|
console.warn = (msg: string) => { warnCall = msg; };
|
||||||
|
|
||||||
|
try {
|
||||||
|
const turnMetrics: TurnMetrics[] = [
|
||||||
|
{
|
||||||
|
turnId: "turn-0",
|
||||||
|
inputTokens: 1000,
|
||||||
|
outputTokens: 200,
|
||||||
|
durationMs: 5000,
|
||||||
|
timeToFirstTokenMs: 800,
|
||||||
|
},
|
||||||
|
];
|
||||||
|
|
||||||
|
aggregatePromptMetrics({
|
||||||
|
provider: "llama.cpp",
|
||||||
|
model: "Qwen3.6-35B",
|
||||||
|
turnMetrics,
|
||||||
|
});
|
||||||
|
|
||||||
|
assertEquals(warnCall, undefined);
|
||||||
|
} finally {
|
||||||
|
console.warn = originalWarn;
|
||||||
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
Deno.test("aggregatePromptMetrics - uses full duration when TTFT is undefined", () => {
|
Deno.test("aggregatePromptMetrics - uses full duration when TTFT is undefined", () => {
|
||||||
|
|||||||
@ -1,5 +1,10 @@
|
|||||||
// Functional core for LLM performance metrics calculation
|
// Functional core for LLM performance metrics calculation
|
||||||
|
|
||||||
|
// Extracted warning function so tests can mock it without touching console
|
||||||
|
export function warn(msg: string): void {
|
||||||
|
console.warn(msg);
|
||||||
|
}
|
||||||
|
|
||||||
export interface TurnMetrics {
|
export interface TurnMetrics {
|
||||||
turnId: string;
|
turnId: string;
|
||||||
inputTokens: number;
|
inputTokens: number;
|
||||||
@ -20,6 +25,12 @@ export interface PromptMetrics {
|
|||||||
combinedTokensPerSec: number;
|
combinedTokensPerSec: number;
|
||||||
totalDurationMs: number;
|
totalDurationMs: number;
|
||||||
timeToFirstTokenMs?: number;
|
timeToFirstTokenMs?: number;
|
||||||
|
rawTimestamps?: {
|
||||||
|
ttftMs?: number;
|
||||||
|
allTtftMs?: number[];
|
||||||
|
generationDurationMs?: number;
|
||||||
|
turns: Array<{ turnId: string; durationMs: number; ttftMs?: number }>;
|
||||||
|
};
|
||||||
turns: TurnMetrics[];
|
turns: TurnMetrics[];
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -36,6 +47,12 @@ export interface MetricLogEntry {
|
|||||||
combinedTokensPerSec: number;
|
combinedTokensPerSec: number;
|
||||||
totalDurationMs: number;
|
totalDurationMs: number;
|
||||||
timeToFirstTokenMs?: number;
|
timeToFirstTokenMs?: number;
|
||||||
|
rawTimestamps?: {
|
||||||
|
ttftMs?: number;
|
||||||
|
allTtftMs?: number[];
|
||||||
|
generationDurationMs?: number;
|
||||||
|
turns: Array<{ turnId: string; durationMs: number; ttftMs?: number }>;
|
||||||
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@ -79,6 +96,7 @@ export function aggregatePromptMetrics(params: {
|
|||||||
generationTokensPerSec: 0,
|
generationTokensPerSec: 0,
|
||||||
combinedTokensPerSec: 0,
|
combinedTokensPerSec: 0,
|
||||||
totalDurationMs: 0,
|
totalDurationMs: 0,
|
||||||
|
rawTimestamps: { turns: [] },
|
||||||
turns: [],
|
turns: [],
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
@ -92,18 +110,19 @@ export function aggregatePromptMetrics(params: {
|
|||||||
const totalDurationMs = turnMetrics.reduce((sum, t) => sum + t.durationMs, 0);
|
const totalDurationMs = turnMetrics.reduce((sum, t) => sum + t.durationMs, 0);
|
||||||
const totalDurationSec = totalDurationMs / 1000;
|
const totalDurationSec = totalDurationMs / 1000;
|
||||||
|
|
||||||
// Time to first token is from the first turn
|
// Collect per-turn TTFTs; prefill boundary is the first turn's TTFT
|
||||||
const timeToFirstTokenMs = turnMetrics[0]?.timeToFirstTokenMs;
|
const ttftValues = turnMetrics.map(t => t.timeToFirstTokenMs).filter((t): t is number => t !== undefined && t >= 0);
|
||||||
|
const firstTurnTtftMs = ttftValues.length > 0 ? ttftValues[0] : undefined;
|
||||||
|
|
||||||
// Calculate tokens per second
|
// Calculate tokens per second
|
||||||
// Prefill: input tokens / TTFT duration (prefill phase)
|
// Prefill: input tokens / first-turn TTFT (prefill happens once at the start)
|
||||||
// Generation: output tokens / (totalDuration - TTFT) (generation phase)
|
// Generation: output tokens / (totalDuration - firstTurnTTFT) (generation phase)
|
||||||
// Combined: total tokens / total duration
|
// Combined: total tokens / total duration
|
||||||
// When TTFT is unavailable, prefill and generation phases cannot be separated,
|
// When first-turn TTFT is unavailable, prefill and generation phases cannot be separated,
|
||||||
// so we set them to 0 and only report combined.
|
// so we set them to 0 and only report combined.
|
||||||
const ttftSec = timeToFirstTokenMs !== undefined ? timeToFirstTokenMs / 1000 : undefined;
|
const ttftSec = firstTurnTtftMs !== undefined ? firstTurnTtftMs / 1000 : undefined;
|
||||||
const generationDurationSec = timeToFirstTokenMs !== undefined
|
const generationDurationSec = firstTurnTtftMs !== undefined
|
||||||
? (totalDurationMs - timeToFirstTokenMs) / 1000
|
? (totalDurationMs - firstTurnTtftMs) / 1000
|
||||||
: undefined;
|
: undefined;
|
||||||
|
|
||||||
const prefillTokensPerSec = (ttftSec && ttftSec > 0) ? inputTokens / ttftSec : 0;
|
const prefillTokensPerSec = (ttftSec && ttftSec > 0) ? inputTokens / ttftSec : 0;
|
||||||
@ -112,6 +131,13 @@ export function aggregatePromptMetrics(params: {
|
|||||||
: 0;
|
: 0;
|
||||||
const combinedTokensPerSec = totalDurationSec > 0 ? totalTokens / totalDurationSec : 0;
|
const combinedTokensPerSec = totalDurationSec > 0 ? totalTokens / totalDurationSec : 0;
|
||||||
|
|
||||||
|
// Sanity check: flag physically impossible generation speeds
|
||||||
|
if (generationTokensPerSec > 500) {
|
||||||
|
warn(
|
||||||
|
`[metrics] Suspicious generation speed: ${generationTokensPerSec.toFixed(1)} tok/s (input=${inputTokens}, output=${outputTokens}, totalDuration=${totalDurationMs}ms, TTFT=${firstTurnTtftMs}ms)`
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
return {
|
return {
|
||||||
provider,
|
provider,
|
||||||
model,
|
model,
|
||||||
@ -123,7 +149,13 @@ export function aggregatePromptMetrics(params: {
|
|||||||
generationTokensPerSec,
|
generationTokensPerSec,
|
||||||
combinedTokensPerSec,
|
combinedTokensPerSec,
|
||||||
totalDurationMs,
|
totalDurationMs,
|
||||||
timeToFirstTokenMs,
|
timeToFirstTokenMs: firstTurnTtftMs,
|
||||||
|
rawTimestamps: {
|
||||||
|
ttftMs: firstTurnTtftMs,
|
||||||
|
allTtftMs: ttftValues,
|
||||||
|
generationDurationMs: generationDurationSec !== undefined ? generationDurationSec * 1000 : undefined,
|
||||||
|
turns: turnMetrics.map(t => ({ turnId: t.turnId, durationMs: t.durationMs, ttftMs: t.timeToFirstTokenMs })),
|
||||||
|
},
|
||||||
turns: turnMetrics,
|
turns: turnMetrics,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
@ -197,5 +229,6 @@ export function toLogEntry(metrics: PromptMetrics): MetricLogEntry {
|
|||||||
combinedTokensPerSec: Math.round(metrics.combinedTokensPerSec * 100) / 100,
|
combinedTokensPerSec: Math.round(metrics.combinedTokensPerSec * 100) / 100,
|
||||||
totalDurationMs: metrics.totalDurationMs,
|
totalDurationMs: metrics.totalDurationMs,
|
||||||
timeToFirstTokenMs: metrics.timeToFirstTokenMs,
|
timeToFirstTokenMs: metrics.timeToFirstTokenMs,
|
||||||
|
rawTimestamps: metrics.rawTimestamps,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
@ -1,398 +0,0 @@
|
|||||||
import {
|
|
||||||
calculateTurnMetrics,
|
|
||||||
aggregatePromptMetrics,
|
|
||||||
formatMetricsForDisplay,
|
|
||||||
toLogEntry,
|
|
||||||
type TurnMetrics,
|
|
||||||
type PromptMetrics,
|
|
||||||
} from "./llm-metrics-core.ts";
|
|
||||||
import { assertEquals, assertGreaterOrEqual, assertLessOrEqual } from "jsr:@std/assert";
|
|
||||||
|
|
||||||
Deno.test("calculateTurnMetrics - creates turn metrics object", () => {
|
|
||||||
const result = calculateTurnMetrics({
|
|
||||||
turnId: "turn-1",
|
|
||||||
inputTokens: 100,
|
|
||||||
outputTokens: 50,
|
|
||||||
durationMs: 2000,
|
|
||||||
timeToFirstTokenMs: 500,
|
|
||||||
});
|
|
||||||
|
|
||||||
assertEquals(result.turnId, "turn-1");
|
|
||||||
assertEquals(result.inputTokens, 100);
|
|
||||||
assertEquals(result.outputTokens, 50);
|
|
||||||
assertEquals(result.durationMs, 2000);
|
|
||||||
assertEquals(result.timeToFirstTokenMs, 500);
|
|
||||||
});
|
|
||||||
|
|
||||||
Deno.test("calculateTurnMetrics - handles missing timeToFirstToken", () => {
|
|
||||||
const result = calculateTurnMetrics({
|
|
||||||
turnId: "turn-1",
|
|
||||||
inputTokens: 100,
|
|
||||||
outputTokens: 50,
|
|
||||||
durationMs: 2000,
|
|
||||||
});
|
|
||||||
|
|
||||||
assertEquals(result.timeToFirstTokenMs, undefined);
|
|
||||||
});
|
|
||||||
|
|
||||||
Deno.test("aggregatePromptMetrics - aggregates single turn", () => {
|
|
||||||
const turnMetrics: TurnMetrics[] = [
|
|
||||||
{
|
|
||||||
turnId: "turn-1",
|
|
||||||
inputTokens: 1000,
|
|
||||||
outputTokens: 200,
|
|
||||||
durationMs: 5000,
|
|
||||||
timeToFirstTokenMs: 800,
|
|
||||||
},
|
|
||||||
];
|
|
||||||
|
|
||||||
const result = aggregatePromptMetrics({
|
|
||||||
provider: "anthropic",
|
|
||||||
model: "claude-sonnet-4",
|
|
||||||
turnMetrics,
|
|
||||||
});
|
|
||||||
|
|
||||||
assertEquals(result.provider, "anthropic");
|
|
||||||
assertEquals(result.model, "claude-sonnet-4");
|
|
||||||
assertEquals(result.turnCount, 1);
|
|
||||||
assertEquals(result.inputTokens, 1000);
|
|
||||||
assertEquals(result.outputTokens, 200);
|
|
||||||
assertEquals(result.totalTokens, 1200);
|
|
||||||
assertEquals(result.totalDurationMs, 5000);
|
|
||||||
assertEquals(result.timeToFirstTokenMs, 800);
|
|
||||||
|
|
||||||
// Tokens per second calculations
|
|
||||||
// prefill: 1000 input tokens / 0.8s TTFT = 1250 tok/s
|
|
||||||
assertEquals(result.prefillTokensPerSec, 1250);
|
|
||||||
// generation: 200 output tokens / 4.2s (5s - 0.8s) = 47.62 tok/s
|
|
||||||
assertGreaterOrEqual(result.generationTokensPerSec, 47.6);
|
|
||||||
assertLessOrEqual(result.generationTokensPerSec, 47.7);
|
|
||||||
// combined: 1200 total tokens / 5s = 240 tok/s
|
|
||||||
assertEquals(result.combinedTokensPerSec, 240);
|
|
||||||
});
|
|
||||||
|
|
||||||
Deno.test("aggregatePromptMetrics - aggregates multiple turns", () => {
|
|
||||||
const turnMetrics: TurnMetrics[] = [
|
|
||||||
{
|
|
||||||
turnId: "turn-1",
|
|
||||||
inputTokens: 1000,
|
|
||||||
outputTokens: 200,
|
|
||||||
durationMs: 3000,
|
|
||||||
timeToFirstTokenMs: 800,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
turnId: "turn-2",
|
|
||||||
inputTokens: 500,
|
|
||||||
outputTokens: 150,
|
|
||||||
durationMs: 2000,
|
|
||||||
},
|
|
||||||
{
|
|
||||||
turnId: "turn-3",
|
|
||||||
inputTokens: 300,
|
|
||||||
outputTokens: 100,
|
|
||||||
durationMs: 1500,
|
|
||||||
},
|
|
||||||
];
|
|
||||||
|
|
||||||
const result = aggregatePromptMetrics({
|
|
||||||
provider: "openai",
|
|
||||||
model: "gpt-4o",
|
|
||||||
turnMetrics,
|
|
||||||
});
|
|
||||||
|
|
||||||
assertEquals(result.turnCount, 3);
|
|
||||||
assertEquals(result.inputTokens, 1800); // 1000 + 500 + 300
|
|
||||||
assertEquals(result.outputTokens, 450); // 200 + 150 + 100
|
|
||||||
assertEquals(result.totalTokens, 2250);
|
|
||||||
assertEquals(result.totalDurationMs, 6500); // 3000 + 2000 + 1500
|
|
||||||
assertEquals(result.timeToFirstTokenMs, 800); // From first turn only
|
|
||||||
|
|
||||||
// Tokens per second: prefill uses TTFT (0.8s), generation uses (total - TTFT) = 5.7s
|
|
||||||
// prefill: 1800 / 0.8 = 2250 tok/s
|
|
||||||
assertEquals(result.prefillTokensPerSec, 2250);
|
|
||||||
// generation: 450 / 5.7 = 78.95 tok/s
|
|
||||||
assertGreaterOrEqual(result.generationTokensPerSec, 78.9);
|
|
||||||
assertLessOrEqual(result.generationTokensPerSec, 79.0);
|
|
||||||
// combined: 2250 / 6.5 = 346.15 tok/s
|
|
||||||
assertGreaterOrEqual(result.combinedTokensPerSec, 346.1);
|
|
||||||
assertLessOrEqual(result.combinedTokensPerSec, 346.2);
|
|
||||||
});
|
|
||||||
|
|
||||||
Deno.test("aggregatePromptMetrics - handles empty turn list", () => {
|
|
||||||
const result = aggregatePromptMetrics({
|
|
||||||
provider: "anthropic",
|
|
||||||
model: "claude-sonnet-4",
|
|
||||||
turnMetrics: [],
|
|
||||||
});
|
|
||||||
|
|
||||||
assertEquals(result.turnCount, 0);
|
|
||||||
assertEquals(result.inputTokens, 0);
|
|
||||||
assertEquals(result.outputTokens, 0);
|
|
||||||
assertEquals(result.totalTokens, 0);
|
|
||||||
assertEquals(result.prefillTokensPerSec, 0);
|
|
||||||
assertEquals(result.generationTokensPerSec, 0);
|
|
||||||
assertEquals(result.combinedTokensPerSec, 0);
|
|
||||||
assertEquals(result.totalDurationMs, 0);
|
|
||||||
assertEquals(result.timeToFirstTokenMs, undefined);
|
|
||||||
});
|
|
||||||
|
|
||||||
Deno.test("formatMetricsForDisplay - formats single turn metrics", () => {
|
|
||||||
const metrics: PromptMetrics = {
|
|
||||||
provider: "anthropic",
|
|
||||||
model: "claude-sonnet-4",
|
|
||||||
turnCount: 1,
|
|
||||||
inputTokens: 1250,
|
|
||||||
outputTokens: 342,
|
|
||||||
totalTokens: 1592,
|
|
||||||
prefillTokensPerSec: 482.1,
|
|
||||||
generationTokensPerSec: 18.3,
|
|
||||||
combinedTokensPerSec: 38.0,
|
|
||||||
totalDurationMs: 21600,
|
|
||||||
timeToFirstTokenMs: 850,
|
|
||||||
turns: [],
|
|
||||||
};
|
|
||||||
|
|
||||||
const display = formatMetricsForDisplay(metrics);
|
|
||||||
|
|
||||||
assertEquals(display.includes("anthropic/claude-sonnet-4"), true);
|
|
||||||
assertEquals(display.includes("1,250 tokens"), true);
|
|
||||||
assertEquals(display.includes("482.1 tok/s"), true);
|
|
||||||
assertEquals(display.includes("342 tokens"), true);
|
|
||||||
assertEquals(display.includes("18.3 tok/s"), true);
|
|
||||||
assertEquals(display.includes("1,592 tokens"), true);
|
|
||||||
assertEquals(display.includes("38.0 tok/s"), true);
|
|
||||||
assertEquals(display.includes("21.6s"), true);
|
|
||||||
assertEquals(display.includes("TTFT: 850ms"), true);
|
|
||||||
});
|
|
||||||
|
|
||||||
Deno.test("formatMetricsForDisplay - formats duration as minutes when over 60s", () => {
|
|
||||||
const metrics: PromptMetrics = {
|
|
||||||
provider: "openai",
|
|
||||||
model: "gpt-4o",
|
|
||||||
turnCount: 1,
|
|
||||||
inputTokens: 5000,
|
|
||||||
outputTokens: 1000,
|
|
||||||
totalTokens: 6000,
|
|
||||||
prefillTokensPerSec: 50,
|
|
||||||
generationTokensPerSec: 10,
|
|
||||||
combinedTokensPerSec: 60,
|
|
||||||
totalDurationMs: 120000, // 2 minutes
|
|
||||||
timeToFirstTokenMs: 1500,
|
|
||||||
turns: [],
|
|
||||||
};
|
|
||||||
|
|
||||||
const display = formatMetricsForDisplay(metrics);
|
|
||||||
|
|
||||||
assertEquals(display.includes("2.0m"), true);
|
|
||||||
});
|
|
||||||
|
|
||||||
Deno.test("formatMetricsForDisplay - omits turn count when single turn", () => {
|
|
||||||
const metrics: PromptMetrics = {
|
|
||||||
provider: "anthropic",
|
|
||||||
model: "claude-sonnet-4",
|
|
||||||
turnCount: 1,
|
|
||||||
inputTokens: 100,
|
|
||||||
outputTokens: 50,
|
|
||||||
totalTokens: 150,
|
|
||||||
prefillTokensPerSec: 20,
|
|
||||||
generationTokensPerSec: 10,
|
|
||||||
combinedTokensPerSec: 30,
|
|
||||||
totalDurationMs: 5000,
|
|
||||||
timeToFirstTokenMs: 500,
|
|
||||||
turns: [],
|
|
||||||
};
|
|
||||||
|
|
||||||
const display = formatMetricsForDisplay(metrics);
|
|
||||||
|
|
||||||
assertEquals(display.includes("Turns: 1"), false);
|
|
||||||
});
|
|
||||||
|
|
||||||
Deno.test("formatMetricsForDisplay - omits prefill/generation when TTFT is unavailable", () => {
|
|
||||||
const metrics: PromptMetrics = {
|
|
||||||
provider: "openai",
|
|
||||||
model: "gpt-4o",
|
|
||||||
turnCount: 1,
|
|
||||||
inputTokens: 1000,
|
|
||||||
outputTokens: 200,
|
|
||||||
totalTokens: 1200,
|
|
||||||
prefillTokensPerSec: 0,
|
|
||||||
generationTokensPerSec: 0,
|
|
||||||
combinedTokensPerSec: 240,
|
|
||||||
totalDurationMs: 5000,
|
|
||||||
timeToFirstTokenMs: undefined,
|
|
||||||
turns: [],
|
|
||||||
};
|
|
||||||
|
|
||||||
const display = formatMetricsForDisplay(metrics);
|
|
||||||
|
|
||||||
assertEquals(display.includes("Prefill:"), false);
|
|
||||||
assertEquals(display.includes("Generation:"), false);
|
|
||||||
assertEquals(display.includes("1,200 tokens"), true);
|
|
||||||
assertEquals(display.includes("240.0 tok/s"), true);
|
|
||||||
});
|
|
||||||
|
|
||||||
Deno.test("formatMetricsForDisplay - shows turn count when multiple turns", () => {
|
|
||||||
const metrics: PromptMetrics = {
|
|
||||||
provider: "anthropic",
|
|
||||||
model: "claude-sonnet-4",
|
|
||||||
turnCount: 3,
|
|
||||||
inputTokens: 100,
|
|
||||||
outputTokens: 50,
|
|
||||||
totalTokens: 150,
|
|
||||||
prefillTokensPerSec: 20,
|
|
||||||
generationTokensPerSec: 10,
|
|
||||||
combinedTokensPerSec: 30,
|
|
||||||
totalDurationMs: 5000,
|
|
||||||
timeToFirstTokenMs: 500,
|
|
||||||
turns: [],
|
|
||||||
};
|
|
||||||
|
|
||||||
const display = formatMetricsForDisplay(metrics);
|
|
||||||
|
|
||||||
assertEquals(display.includes("Turns: 3"), true);
|
|
||||||
});
|
|
||||||
|
|
||||||
Deno.test("toLogEntry - creates JSON-serializable log entry", () => {
|
|
||||||
const metrics: PromptMetrics = {
|
|
||||||
provider: "anthropic",
|
|
||||||
model: "claude-sonnet-4",
|
|
||||||
turnCount: 2,
|
|
||||||
inputTokens: 1250,
|
|
||||||
outputTokens: 342,
|
|
||||||
totalTokens: 1592,
|
|
||||||
prefillTokensPerSec: 482.12345,
|
|
||||||
generationTokensPerSec: 18.34567,
|
|
||||||
combinedTokensPerSec: 38.09876,
|
|
||||||
totalDurationMs: 21600,
|
|
||||||
timeToFirstTokenMs: 850,
|
|
||||||
turns: [],
|
|
||||||
};
|
|
||||||
|
|
||||||
const logEntry = toLogEntry(metrics);
|
|
||||||
|
|
||||||
assertEquals(logEntry.provider, "anthropic");
|
|
||||||
assertEquals(logEntry.model, "claude-sonnet-4");
|
|
||||||
assertEquals(logEntry.turnCount, 2);
|
|
||||||
assertEquals(logEntry.inputTokens, 1250);
|
|
||||||
assertEquals(logEntry.outputTokens, 342);
|
|
||||||
assertEquals(logEntry.totalTokens, 1592);
|
|
||||||
// Rounded to 2 decimal places
|
|
||||||
assertEquals(logEntry.prefillTokensPerSec, 482.12);
|
|
||||||
assertEquals(logEntry.generationTokensPerSec, 18.35);
|
|
||||||
assertEquals(logEntry.combinedTokensPerSec, 38.1);
|
|
||||||
assertEquals(logEntry.totalDurationMs, 21600);
|
|
||||||
assertEquals(logEntry.timeToFirstTokenMs, 850);
|
|
||||||
|
|
||||||
// Should have ISO timestamp
|
|
||||||
assertEquals(logEntry.timestamp.includes("T"), true);
|
|
||||||
assertEquals(logEntry.timestamp.includes("Z"), true);
|
|
||||||
|
|
||||||
// Should be JSON serializable
|
|
||||||
const json = JSON.stringify(logEntry);
|
|
||||||
assertEquals(json.length > 0, true);
|
|
||||||
const parsed = JSON.parse(json);
|
|
||||||
assertEquals(parsed.provider, "anthropic");
|
|
||||||
});
|
|
||||||
|
|
||||||
Deno.test("aggregatePromptMetrics - uses full duration when TTFT is undefined", () => {
|
|
||||||
const turnMetrics: TurnMetrics[] = [
|
|
||||||
{
|
|
||||||
turnId: "turn-1",
|
|
||||||
inputTokens: 1000,
|
|
||||||
outputTokens: 200,
|
|
||||||
durationMs: 5000,
|
|
||||||
// No timeToFirstTokenMs
|
|
||||||
},
|
|
||||||
];
|
|
||||||
|
|
||||||
const result = aggregatePromptMetrics({
|
|
||||||
provider: "openai",
|
|
||||||
model: "gpt-4o",
|
|
||||||
turnMetrics,
|
|
||||||
});
|
|
||||||
|
|
||||||
assertEquals(result.turnCount, 1);
|
|
||||||
assertEquals(result.inputTokens, 1000);
|
|
||||||
assertEquals(result.outputTokens, 200);
|
|
||||||
// Without TTFT, prefill and generation rates are 0 (cannot separate phases)
|
|
||||||
// Only combined rate is meaningful
|
|
||||||
assertEquals(result.prefillTokensPerSec, 0);
|
|
||||||
assertEquals(result.generationTokensPerSec, 0);
|
|
||||||
assertEquals(result.combinedTokensPerSec, 240);
|
|
||||||
});
|
|
||||||
|
|
||||||
Deno.test("toLogEntry - handles missing timeToFirstToken", () => {
|
|
||||||
const metrics: PromptMetrics = {
|
|
||||||
provider: "anthropic",
|
|
||||||
model: "claude-sonnet-4",
|
|
||||||
turnCount: 1,
|
|
||||||
inputTokens: 100,
|
|
||||||
outputTokens: 50,
|
|
||||||
totalTokens: 150,
|
|
||||||
prefillTokensPerSec: 20,
|
|
||||||
generationTokensPerSec: 10,
|
|
||||||
combinedTokensPerSec: 30,
|
|
||||||
totalDurationMs: 5000,
|
|
||||||
timeToFirstTokenMs: undefined,
|
|
||||||
turns: [],
|
|
||||||
};
|
|
||||||
|
|
||||||
const logEntry = toLogEntry(metrics);
|
|
||||||
|
|
||||||
assertEquals(logEntry.timeToFirstTokenMs, undefined);
|
|
||||||
});
|
|
||||||
|
|
||||||
Deno.test("Integration - full flow from turns to log entry", () => {
|
|
||||||
// Simulate a real scenario with multiple turns
|
|
||||||
const turn1 = calculateTurnMetrics({
|
|
||||||
turnId: "turn-1",
|
|
||||||
inputTokens: 2000,
|
|
||||||
outputTokens: 500,
|
|
||||||
durationMs: 8000,
|
|
||||||
timeToFirstTokenMs: 1200,
|
|
||||||
});
|
|
||||||
|
|
||||||
const turn2 = calculateTurnMetrics({
|
|
||||||
turnId: "turn-2",
|
|
||||||
inputTokens: 800,
|
|
||||||
outputTokens: 200,
|
|
||||||
durationMs: 3000,
|
|
||||||
});
|
|
||||||
|
|
||||||
const promptMetrics = aggregatePromptMetrics({
|
|
||||||
provider: "groq",
|
|
||||||
model: "llama-3.1-70b",
|
|
||||||
turnMetrics: [turn1, turn2],
|
|
||||||
});
|
|
||||||
|
|
||||||
const display = formatMetricsForDisplay(promptMetrics);
|
|
||||||
const logEntry = toLogEntry(promptMetrics);
|
|
||||||
|
|
||||||
// Verify aggregation
|
|
||||||
assertEquals(promptMetrics.turnCount, 2);
|
|
||||||
assertEquals(promptMetrics.inputTokens, 2800);
|
|
||||||
assertEquals(promptMetrics.outputTokens, 700);
|
|
||||||
assertEquals(promptMetrics.totalTokens, 3500);
|
|
||||||
assertEquals(promptMetrics.totalDurationMs, 11000);
|
|
||||||
assertEquals(promptMetrics.timeToFirstTokenMs, 1200);
|
|
||||||
|
|
||||||
// Verify corrected rate calculations
|
|
||||||
// prefill: 2800 / 1.2 = 2333.33 tok/s
|
|
||||||
assertGreaterOrEqual(promptMetrics.prefillTokensPerSec, 2333.3);
|
|
||||||
assertLessOrEqual(promptMetrics.prefillTokensPerSec, 2333.4);
|
|
||||||
// generation: 700 / 9.8 = 71.43 tok/s
|
|
||||||
assertGreaterOrEqual(promptMetrics.generationTokensPerSec, 71.4);
|
|
||||||
assertLessOrEqual(promptMetrics.generationTokensPerSec, 71.5);
|
|
||||||
// combined: 3500 / 11 = 318.18 tok/s
|
|
||||||
assertGreaterOrEqual(promptMetrics.combinedTokensPerSec, 318.1);
|
|
||||||
assertLessOrEqual(promptMetrics.combinedTokensPerSec, 318.2);
|
|
||||||
|
|
||||||
// Verify display contains key info
|
|
||||||
assertEquals(display.includes("groq/llama-3.1-70b"), true);
|
|
||||||
assertEquals(display.includes("TTFT: 1200ms"), true);
|
|
||||||
|
|
||||||
// Verify log entry
|
|
||||||
assertEquals(logEntry.provider, "groq");
|
|
||||||
assertEquals(logEntry.model, "llama-3.1-70b");
|
|
||||||
assertEquals(logEntry.turnCount, 2);
|
|
||||||
});
|
|
||||||
@ -25,7 +25,7 @@ let promptStartMs: number | undefined;
|
|||||||
let currentTurnStartMs: number | undefined;
|
let currentTurnStartMs: number | undefined;
|
||||||
let currentTurnId: string | undefined;
|
let currentTurnId: string | undefined;
|
||||||
let turnMetrics: TurnMetrics[] = [];
|
let turnMetrics: TurnMetrics[] = [];
|
||||||
let firstTokenTimeMs: number | undefined;
|
let currentTurnFirstTokenMs: number | undefined; // Per-turn TTFT
|
||||||
let provider: string | undefined;
|
let provider: string | undefined;
|
||||||
let model: string | undefined;
|
let model: string | undefined;
|
||||||
|
|
||||||
@ -36,7 +36,7 @@ export default function (pi: ExtensionAPI) {
|
|||||||
if (!ctx.model) return;
|
if (!ctx.model) return;
|
||||||
promptStartMs = Date.now();
|
promptStartMs = Date.now();
|
||||||
turnMetrics = [];
|
turnMetrics = [];
|
||||||
firstTokenTimeMs = undefined;
|
currentTurnFirstTokenMs = undefined;
|
||||||
provider = ctx.model.provider;
|
provider = ctx.model.provider;
|
||||||
model = ctx.model.id;
|
model = ctx.model.id;
|
||||||
});
|
});
|
||||||
@ -44,12 +44,13 @@ export default function (pi: ExtensionAPI) {
|
|||||||
pi.on("turn_start", async (event, _ctx) => {
|
pi.on("turn_start", async (event, _ctx) => {
|
||||||
currentTurnStartMs = Date.now();
|
currentTurnStartMs = Date.now();
|
||||||
currentTurnId = `turn-${event.turnIndex}`;
|
currentTurnId = `turn-${event.turnIndex}`;
|
||||||
|
currentTurnFirstTokenMs = undefined; // Reset TTFT for this turn
|
||||||
});
|
});
|
||||||
|
|
||||||
pi.on("message_update", async (event, _ctx) => {
|
pi.on("message_update", async (event, _ctx) => {
|
||||||
// Capture TTFT on first token
|
// Capture per-turn TTFT on first token
|
||||||
if (firstTokenTimeMs === undefined && event.assistantMessageEvent?.type === "text_delta") {
|
if (currentTurnFirstTokenMs === undefined && event.assistantMessageEvent?.type === "text_delta") {
|
||||||
firstTokenTimeMs = Date.now();
|
currentTurnFirstTokenMs = Date.now();
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
@ -58,8 +59,8 @@ export default function (pi: ExtensionAPI) {
|
|||||||
const inputTokens = event.message.usage?.input ?? 0;
|
const inputTokens = event.message.usage?.input ?? 0;
|
||||||
const outputTokens = event.message.usage?.output ?? 0;
|
const outputTokens = event.message.usage?.output ?? 0;
|
||||||
const durationMs = currentTurnStartMs ? Date.now() - currentTurnStartMs : 0;
|
const durationMs = currentTurnStartMs ? Date.now() - currentTurnStartMs : 0;
|
||||||
const ttftMs = currentTurnId === `turn-${event.turnIndex}` && firstTokenTimeMs && currentTurnStartMs
|
const ttftMs = currentTurnFirstTokenMs && currentTurnStartMs
|
||||||
? firstTokenTimeMs - currentTurnStartMs
|
? currentTurnFirstTokenMs - currentTurnStartMs
|
||||||
: undefined;
|
: undefined;
|
||||||
|
|
||||||
const turnMetric = calculateTurnMetrics({
|
const turnMetric = calculateTurnMetrics({
|
||||||
@ -95,6 +96,6 @@ export default function (pi: ExtensionAPI) {
|
|||||||
// Reset state
|
// Reset state
|
||||||
promptStartMs = undefined;
|
promptStartMs = undefined;
|
||||||
turnMetrics = [];
|
turnMetrics = [];
|
||||||
firstTokenTimeMs = undefined;
|
currentTurnFirstTokenMs = undefined;
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
Loading…
x
Reference in New Issue
Block a user