#!/usr/bin/env bash
set -euo pipefail

# ─── autoresearch.sh ─────────────────────────────────────────────────────────
# Benchmark script for sequence diagram skill optimization.
# Runs every task in benchmark/tasks.jsonl, scores each, outputs METRIC lines.
#
# Files used (relative to this script's directory):
#   scripts/config.env     optional; sourced when present
#   skill/SKILL.md         must exist and be non-empty
#   benchmark/tasks.jsonl  one JSON object per line, with .id and .prompt
#   scripts/run_one.sh     called as: run_one.sh PROMPT OUTPUT_FILE SSH_TARGET SSH_PORT
#   scripts/score.sh       called as: score.sh OUTPUT_FILE; must print a JSON
#                          object with .score, .no_sidetracking,
#                          .diagram_parseable and .char_count
#
# Environment (each may also be set by config.env):
#   SSH_TARGET    default: empty
#   SSH_PORT      default: 2222
#   TASK_TIMEOUT  default: 180; exported for child processes
#
# Exit status: non-zero if a pre-check fails or any step fails (set -e).
# ─────────────────────────────────────────────────────────────────────────────

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Loading the config is best-effort: a missing file is fine, but a file that
# exists and fails to load is reported instead of being silently ignored.
CONFIG_FILE="${SCRIPT_DIR}/scripts/config.env"
if [[ -f "$CONFIG_FILE" ]]; then
  # shellcheck source=/dev/null
  source "$CONFIG_FILE" \
    || echo "WARNING: scripts/config.env did not load cleanly" >&2
fi

# Defaults
SSH_TARGET="${SSH_TARGET:-}"
SSH_PORT="${SSH_PORT:-2222}"
export TASK_TIMEOUT="${TASK_TIMEOUT:-180}"

#######################################
# Print an integer field of a JSON object.
# Arguments: $1 - JSON text, $2 - top-level key
# Outputs:   the integer on stdout; a diagnostic on stderr on failure
# Returns:   0 if the value is an integer, 1 otherwise
#######################################
int_field() {
  local json=$1 key=$2 value
  value=$(jq -r --arg k "$key" '.[$k]' <<<"$json") || return 1
  # Values are fed to shell arithmetic below, so reject anything that is not
  # a plain integer (e.g. "null" when the key is missing).
  if [[ ! "$value" =~ ^-?[0-9]+$ ]]; then
    echo "ERROR: score field '${key}' is not an integer: ${value}" >&2
    return 1
  fi
  printf '%s\n' "$value"
}

# ─── Pre-checks ──────────────────────────────────────────────────────────────

# The two ERROR messages below stay on stdout, exactly as before.
SKILL_FILE="${SCRIPT_DIR}/skill/SKILL.md"
if [[ ! -s "$SKILL_FILE" ]]; then
  echo "ERROR: skill/SKILL.md is missing or empty"
  exit 1
fi

# wc -c reports bytes. Some wc implementations pad the number with spaces;
# arithmetic expansion normalises it so the METRIC line stays clean.
SKILL_CHARS=$(( $(wc -c < "$SKILL_FILE") ))
echo "Skill: ${SKILL_CHARS} chars"

TASKS_FILE="${SCRIPT_DIR}/benchmark/tasks.jsonl"
if [[ ! -f "$TASKS_FILE" ]]; then
  echo "ERROR: benchmark/tasks.jsonl not found"
  exit 1
fi

# Number of non-blank lines, used only for the progress display.
# grep -c exits 1 when it counts zero lines, hence the `|| true`.
TASK_TOTAL=$(grep -c '[^[:space:]]' "$TASKS_FILE" || true)

echo "────────────────────────────────────────────────────"

# ─── Run all tasks ───────────────────────────────────────────────────────────

# Named WORK_DIR rather than TMPDIR so the conventional TMPDIR environment
# variable is left untouched.
WORK_DIR=$(mktemp -d)

# Remove the scratch directory on every exit path, including set -e aborts.
cleanup() {
  rm -rf -- "${WORK_DIR:?}"
}
trap cleanup EXIT

TOTAL_SCORE=0
SIDETRACK_COUNT=0
PARSE_COUNT=0
TASK_COUNT=0
START_TIME=$(date +%s)

# The task list is read on fd 3 so that child commands, which inherit stdin,
# cannot consume the remaining tasks. (run_one.sh is handed SSH parameters, so
# it presumably runs ssh, which reads stdin by default.)
# `|| [[ -n "$line" ]]` keeps a final line that lacks a trailing newline.
while IFS= read -r line <&3 || [[ -n "$line" ]]; do
  # Skip blank lines.
  if [[ -z "${line//[[:space:]]/}" ]]; then
    continue
  fi

  TASK_ID=$(jq -r '.id' <<<"$line")
  TASK_PROMPT=$(jq -r '.prompt' <<<"$line")
  TASK_COUNT=$((TASK_COUNT + 1))

  # Keep the id usable as a file name even if it contains '/' or similar.
  SAFE_ID=${TASK_ID//[^A-Za-z0-9._-]/_}
  OUTPUT_FILE="${WORK_DIR}/${SAFE_ID}.txt"
  SCORE_FILE="${WORK_DIR}/${SAFE_ID}.json"

  echo " [${TASK_COUNT}/${TASK_TOTAL}] ${TASK_ID}..."

  # Run the task
  bash "${SCRIPT_DIR}/scripts/run_one.sh" \
    "$TASK_PROMPT" \
    "$OUTPUT_FILE" \
    "$SSH_TARGET" \
    "$SSH_PORT"

  # Score it
  SCORE_JSON=$(bash "${SCRIPT_DIR}/scripts/score.sh" "$OUTPUT_FILE")
  printf '%s\n' "$SCORE_JSON" > "$SCORE_FILE"

  # Extract scores
  TASK_SCORE=$(int_field "$SCORE_JSON" score)
  TASK_SIDETRACK=$(int_field "$SCORE_JSON" no_sidetracking)
  TASK_PARSE=$(int_field "$SCORE_JSON" diagram_parseable)
  # char_count is only displayed, never used in arithmetic, so it is not
  # validated.
  TASK_CHARS=$(jq -r '.char_count' <<<"$SCORE_JSON")

  TOTAL_SCORE=$((TOTAL_SCORE + TASK_SCORE))

  # no_sidetracking == 0 means the task did sidetrack.
  if (( TASK_SIDETRACK == 0 )); then
    SIDETRACK_COUNT=$((SIDETRACK_COUNT + 1))
  fi
  if (( TASK_PARSE == 1 )); then
    PARSE_COUNT=$((PARSE_COUNT + 1))
  fi

  echo " score=${TASK_SCORE}/6 sidetrack=$(( 1 - TASK_SIDETRACK )) parseable=${TASK_PARSE} chars=${TASK_CHARS}"
done 3< "$TASKS_FILE"

END_TIME=$(date +%s)
TOTAL_SECONDS=$((END_TIME - START_TIME))

# ─── Cleanup ─────────────────────────────────────────────────────────────────
# Handled by the EXIT trap installed above.

# ─── Output METRIC lines ────────────────────────────────────────────────────
echo ""
echo "METRIC score=${TOTAL_SCORE}"
echo "METRIC sidetrack_count=${SIDETRACK_COUNT}"
echo "METRIC parse_count=${PARSE_COUNT}"
echo "METRIC total_seconds=${TOTAL_SECONDS}"
echo "METRIC skill_chars=${SKILL_CHARS}"