From a38c76c65e13b52346cabcb12fc65ec0da591e88 Mon Sep 17 00:00:00 2001 From: Willem van den Ende Date: Tue, 28 Apr 2026 10:06:03 +0100 Subject: [PATCH] move pi-llm-performance to monorepo, update README and add deno.json --- .pi/llm-metrics.log | 1 + mise.toml | 1 + packages/pi-llm-performance/README.md | 20 ++-- packages/pi-llm-performance/deno.json | 8 ++ packages/pi-llm-performance/deno.lock | 5 + plans/metrics-check.md | 73 +++++++++++++ scoped-packages.md | 76 +++++++++++++ working-with-extensions.md | 152 ++++++++++++++++++++++++++ 8 files changed, 328 insertions(+), 8 deletions(-) create mode 100644 .pi/llm-metrics.log create mode 100644 packages/pi-llm-performance/deno.json create mode 100644 plans/metrics-check.md create mode 100644 scoped-packages.md create mode 100644 working-with-extensions.md diff --git a/.pi/llm-metrics.log b/.pi/llm-metrics.log new file mode 100644 index 0000000..6523c00 --- /dev/null +++ b/.pi/llm-metrics.log @@ -0,0 +1 @@ +{"timestamp":"2026-04-28T08:58:29.989Z","provider":"llama.cpp","model":"Qwen3.6-35B-A3B-MXFP4_MOE.gguf","turnCount":6,"inputTokens":8294,"outputTokens":1356,"totalTokens":9650,"prefillTokensPerSec":1925.26,"generationTokensPerSec":42.55,"combinedTokensPerSec":266.77,"totalDurationMs":36174,"timeToFirstTokenMs":4308} diff --git a/mise.toml b/mise.toml index 14c0e47..7fc8194 100644 --- a/mise.toml +++ b/mise.toml @@ -1,5 +1,6 @@ [tools] bun = "latest" +deno = "latest" elixir = "latest" erlang = "latest" node = "24" diff --git a/packages/pi-llm-performance/README.md b/packages/pi-llm-performance/README.md index d397fe2..a12d9f2 100644 --- a/packages/pi-llm-performance/README.md +++ b/packages/pi-llm-performance/README.md @@ -1,22 +1,26 @@ # pi-llm-performance -LLM performance metrics extension +LLM performance metrics extension — captures and displays TTFT, prefill, and generation speeds from pi agent turns. 
-## How to install +## Development -Add to your global pi settings: +This package lives in the `pi-extensions` monorepo. ```bash -pi install /Users/willem/dev/spikes/llm/custom-coding-agent/packages/pi-llm-performance +pnpm install # workspace setup +deno test # run tests ``` -Or add manually to `~/.pi/agent/settings.json`: +## Usage -``` +Add to your pi settings (`~/.pi/agent/settings.json`): + +```json +{ "packages": [ - "/Users/willem/dev/spikes/llm/custom-coding-agent/packages/pi-llm-performance", - ... + "../dev/spikes/llm/monotonic-pi-extensions/packages/pi-llm-performance" ] +} ``` Then reload pi: diff --git a/packages/pi-llm-performance/deno.json b/packages/pi-llm-performance/deno.json new file mode 100644 index 0000000..f43c6de --- /dev/null +++ b/packages/pi-llm-performance/deno.json @@ -0,0 +1,8 @@ +{ + "imports": { + "@std/assert": "jsr:@std/assert@^1.0.0" + }, + "tasks": { + "test": "deno test src/" + } +} diff --git a/packages/pi-llm-performance/deno.lock b/packages/pi-llm-performance/deno.lock index 84f8171..cba41d8 100644 --- a/packages/pi-llm-performance/deno.lock +++ b/packages/pi-llm-performance/deno.lock @@ -14,5 +14,10 @@ "@std/internal@1.0.12": { "integrity": "972a634fd5bc34b242024402972cd5143eac68d8dffaca5eaa4dba30ce17b027" } + }, + "workspace": { + "dependencies": [ + "jsr:@std/assert@1" + ] } } diff --git a/plans/metrics-check.md b/plans/metrics-check.md new file mode 100644 index 0000000..083c0e0 --- /dev/null +++ b/plans/metrics-check.md @@ -0,0 +1,73 @@ +# Plan: Analyze & Fix `llm-metrics` Extension Timing Bug + +## Problem Statement +The extension reports generation speed as ~2,400–8,000 tok/s (physically impossible) while prefill speed is ~70 tok/s. The math is internally consistent but the underlying phase boundaries are inverted or misaligned. Real generation speed is ~53–70 tok/s (confirmed by earlier runs). + +## Phase 1: Locate & Map the Extension +1. 
**Find the source code** + - Search `~/.pi/extensions/`, `~/.pi/tools/`, and the pi-coding-agent package for files matching `llm`, `metric`, `performance`, `benchmark` + - Check `~/.pi/config` or project `.pi/config` for extension/tool registration + - Look for custom tool definitions in `extensions/`, `tools/`, or `skills/` directories +2. **Identify the provider integration** + - The log shows `"provider":"llama.cpp"` — find where the extension hooks into llama.cpp (likely via subprocess, WebSocket, or callback interception) + - Map the data flow: raw llama.cpp output → extension parsing → JSON log writing + +## Phase 2: Diagnose the Timing Bug +3. **Trace phase boundary detection** + - Find how the extension defines "prefill" vs "generation" start/end times + - Check if it uses: + - `timeToFirstToken` (TTFT) as the split point + - llama.cpp callback hooks (`completion_token_callback`, `prompt_token_callback`) + - Wall-clock timestamps around token streaming +4. **Verify the calculation** + - Confirm the formula: `generationTok/s = outputTokens / (totalDuration - TTFT)` + - Check if `totalDuration` includes only generation, or the full call + - Look for race conditions: async callbacks firing out of order, or generation end timestamp captured before all tokens are flushed +5. **Reproduce the anomaly** + - Run the same model with identical prompt/output length + - Compare TTFT, totalDuration, and per-phase timestamps + - Check if the bug appears only with large prompts, speculative decoding, or certain sampling configs + +## Phase 3: Fix the Implementation +6. **Correct phase boundaries** + - If using callbacks: ensure generation start = TTFT timestamp, generation end = last token callback or explicit `done` event + - If using wall-clock: add a small buffer after last token to account for async flush + - Add validation: reject generation speeds > 500 tok/s (sanity check) +7. 
**Fix label assignment** + - Ensure `prefillTokensPerSec` = `inputTokens / TTFT`, with TTFT in seconds (`timeToFirstTokenMs / 1000`) + - Ensure `generationTokensPerSec` = `outputTokens / (totalDuration - TTFT)`, with both durations in seconds (`(totalDurationMs - timeToFirstTokenMs) / 1000`) + - Add explicit phase logging to debug output +8. **Add telemetry** + - Log raw timestamps: `prefill_start`, `prefill_end`, `gen_start`, `gen_end`, `total_start`, `total_end` + - Log per-phase token counts to catch mismatches + - Write to `.pi/llm-metrics.log` with consistent schema + +## Phase 4: Verify & Deploy +9. **Test cases** + - Small prompt + short output (baseline) + - Large prompt + long output (original failure case) + - Speculative decoding run (if supported) + - Early termination / stop token edge case +10. **Validate output** + - Generation speed should be 40–100 tok/s for this model/hardware + - Prefill speed should be 50–200 tok/s (parallel compute) + - TTFT should match prefill duration + - No negative phase durations +11. **Update schema & docs** + - Add `rawTimestamps` field to log entries for debugging + - Document phase definitions in extension README + - Add unit tests for metric calculation logic + +## Deliverables +- [ ] Extension source located & data flow mapped +- [ ] Root cause identified (callback timing gap, phase boundary misassignment, or async flush race) +- [ ] Fix implemented with sanity checks +- [ ] Test suite covering edge cases +- [ ] Log schema updated with raw timestamps +- [ ] PR or patch ready for review + +## Questions to Answer During Analysis +- Does the extension intercept llama.cpp at the C++ level, via CLI, or through a Python wrapper? +- Are callbacks synchronous or async? +- Is there a `done`/`end` event, or does it rely on empty token streams? +- Could speculative decoding be causing the draft model's batched verification to be misclassified as "generation"? 
diff --git a/scoped-packages.md b/scoped-packages.md new file mode 100644 index 0000000..7f67cec --- /dev/null +++ b/scoped-packages.md @@ -0,0 +1,76 @@ +# Scoped Packages + +## Step 1: Create the npm org + +```bash +npm org create mostalive +``` + +This creates the `@mostalive` scope on npm. You'll need to pay the [org fee](https://docs.npmjs.com/about-organizations) (currently ~$7/month for the basic tier). + +Alternatively, if you already have an account, you can use your username directly — scoped packages can use your personal account too: + +```bash +# No separate org creation needed if @mostalive is your npm username +``` + +Check if the scope exists: + +```bash +npm org list +``` + +## Step 2: Rename the package + +In `packages/pi-turn-limit/package.json`: + +```json +{ + "name": "@mostalive/pi-turn-limit", + "version": "0.1.0", + ... +} +``` + +## Step 3: Publish + +```bash +cd packages/pi-turn-limit +npm publish +``` + +On first publish a scoped package needs `--access public` (npm defaults scoped packages to private), so for the initial release use this instead of the plain `npm publish` above: + +```bash +npm publish --access public +``` + +## Step 4: Users install + +```bash +pi install npm:@mostalive/pi-turn-limit +``` + +--- + +## Cheaper Alternative: Unscoped Git Package + +If you don't want to pay for an npm org, you can ship via git without scoping: + +```bash +pi install git:github.com/mostalive/pi-turn-limit +``` + +No npm org needed. Users install directly from your GitHub repo. You'd still need to publish to npm for the `npm:` install path, but the git path is free. 
+ +--- + +## Summary + +| Approach | Cost | User installs via | +|----------|------|-------------------| +| `npm org create` + scoped npm | ~$7/mo | `pi install npm:@mostalive/pi-turn-limit` | +| GitHub repo (no scope) | Free | `pi install git:github.com/user/repo` | +| Unscoped npm (`pi-turn-limit`) | Free | `pi install npm:pi-turn-limit` | + +If you already have a personal npm account named `mostalive`, the scope is free — scoped packages just use your existing account. The org fee only applies if you create a separate organization entity. diff --git a/working-with-extensions.md b/working-with-extensions.md new file mode 100644 index 0000000..72c18c3 --- /dev/null +++ b/working-with-extensions.md @@ -0,0 +1,152 @@ +# Working with Pi Extensions + +## Installation Options + +### Option 1: Publish to npm + `pi install` (Recommended) + +The cleanest path that replicates the official pi experience. + +**You (publishing):** + +```bash +cd packages/pi-turn-limit +npm publish +``` + +**Users (installing globally):** + +```bash +pi install npm:pi-turn-limit +``` + +This writes to `~/.pi/agent/settings.json` under `packages`. Pi handles the install, runs `npm install`, and auto-discovers the extension from the `pi.extensions` manifest. + +### Option 2: npm global install + settings.json + +**You (publishing):** + +```bash +npm publish +``` + +**Users:** Two steps — install the npm package globally, then tell pi about it: + +```bash +npm install -g pi-turn-limit +``` + +Then in `~/.pi/agent/settings.json`: + +```json +{ + "packages": [ + "npm:pi-turn-limit" + ] +} +``` + +Or use the same command as Option 1 — `pi install npm:pi-turn-limit` does both steps. 
+ +### Option 3: Local directory (for development) + +For local testing without publishing: + +```bash +pi install /Users/willem/dev/spikes/llm/monotonic-pi-extensions/packages/pi-turn-limit +``` + +Or in `~/.pi/agent/settings.json`: + +```json +{ + "packages": [ + "/Users/willem/dev/spikes/llm/monotonic-pi-extensions/packages/pi-turn-limit" + ] +} +``` + +Or as a single-file extension in `~/.pi/agent/extensions/`: + +```bash +cp packages/pi-turn-limit/src/turn-limit.ts ~/.pi/agent/extensions/turn-limit.ts +``` + +### Option 4: Per-repo project-local install + +Users can install an extension only for a specific project: + +```bash +pi install -l npm:pi-turn-limit # -l = project-local +``` + +This writes to `.pi/settings.json` in the project root. Pi auto-installs missing packages on startup per-project. + +--- + +## Disabling Extensions Per-Repo + +Three approaches: + +### A. `pi config` (simplest) + +```bash +pi config turn-limit:off # Disable by extension name +pi config turn-limit:on # Re-enable +``` + +Works for both global and project scope. Per-repo: + +```bash +pi config -l turn-limit:off +``` + +### B. Package filtering in project `settings.json` + +In `.pi/settings.json` (project-local), an empty `extensions` list loads none of the package's extensions: + +```json +{ + "packages": [ + { + "source": "npm:pi-turn-limit", + "extensions": [] + } + ] +} +``` + +Or filter specific files (a leading `!` excludes that file): + +```json +{ + "packages": [ + { + "source": "npm:pi-turn-limit", + "extensions": ["!src/turn-limit.ts"] + } + ] +} +``` + +### C. Remove from settings entirely + +```bash +pi remove npm:pi-turn-limit +``` + +Or manually edit `~/.pi/agent/settings.json` and remove the package entry. 
+ +--- + +## Summary Table + +| Method | Scope | Notes | +|--------|-------|--------------| +| `pi install npm:pkg` | Global | One command, handles everything | +| `npm i -g` + settings.json | Global | Two steps | +| `pi install ./path` | Global (symlink-style) | Local dev | +| `pi install -l npm:pkg` | Project-local | Per-repo | +| `pi config name:off` | Toggle | Enable/disable without uninstalling | +| `pi config -l name:off` | Project-local toggle | Per-repo disable | + +**Recommendation:** Publish to npm, then users run `pi install npm:pi-turn-limit`. For disabling per-repo, `pi config -l turn-limit:off` is the simplest approach — a one-liner that doesn't require editing JSON files.