102 lines
3.7 KiB
Bash
Executable File

#!/usr/bin/env bash
set -euo pipefail
# ─── autoresearch.sh ─────────────────────────────────────────────────────────
# Benchmark script for sequence diagram skill optimization.
# Runs the benchmark tasks, scores each, outputs METRIC lines.
#
# Optional configuration is read from scripts/config.env next to this script.
# Environment overrides (applied after the config file):
#   SSH_TARGET    default: empty
#   SSH_PORT      default: 2222
#   TASK_TIMEOUT  default: 180 (exported to child processes)
# ─────────────────────────────────────────────────────────────────────────────
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# The config file is optional, so its absence is skipped silently. A file that
# exists but fails to load is reported on stderr instead of being swallowed,
# because silently falling back to the defaults would benchmark the wrong
# target without any hint. The run still continues, as it always has.
config_file="${SCRIPT_DIR}/scripts/config.env"
if [[ -r "$config_file" ]]; then
  # shellcheck source=/dev/null
  if ! source "$config_file"; then
    echo "WARNING: scripts/config.env did not load cleanly; continuing" >&2
  fi
fi

# Defaults
SSH_TARGET="${SSH_TARGET:-}"
SSH_PORT="${SSH_PORT:-2222}"
export TASK_TIMEOUT="${TASK_TIMEOUT:-180}"
# ─── Pre-checks ──────────────────────────────────────────────────────────────
# Abort early when the skill file is missing/empty or the task list is absent.
SKILL_FILE="${SCRIPT_DIR}/skill/SKILL.md"
if [[ ! -s "$SKILL_FILE" ]]; then
  echo "ERROR: skill/SKILL.md is missing or empty"
  exit 1
fi

# Size of the skill file as reported by `wc -c` (a byte count). BSD/macOS wc
# pads its output with leading blanks, which would otherwise leak into the
# "METRIC skill_chars=" line, so all whitespace is stripped from the value.
SKILL_CHARS=$(wc -c < "$SKILL_FILE")
SKILL_CHARS=${SKILL_CHARS//[[:space:]]/}
echo "Skill: ${SKILL_CHARS} chars"

TASKS_FILE="${SCRIPT_DIR}/benchmark/tasks.jsonl"
if [[ ! -f "$TASKS_FILE" ]]; then
  echo "ERROR: benchmark/tasks.jsonl not found"
  exit 1
fi
echo "────────────────────────────────────────────────────"
# ─── Run all tasks ───────────────────────────────────────────────────────────
# Scratch directory for per-task output and score files. It is deliberately not
# called TMPDIR: that name is the standard environment variable telling mktemp
# and other tools where to create temporary files, and overwriting it would
# also change it for child processes whenever it is already exported.
WORK_DIR=$(mktemp -d)
# Remove the scratch directory on every exit path, including a `set -e` abort
# caused by a failing task runner or scorer.
trap 'rm -rf -- "${WORK_DIR:?}"' EXIT

TOTAL_SCORE=0
SIDETRACK_COUNT=0 # tasks whose .no_sidetracking field is 0
PARSE_COUNT=0     # tasks whose .diagram_parseable field is 1
TASK_COUNT=0
# Number of non-blank lines in the task list; used only for the "[n/total]"
# progress display. grep -c exits 1 when the count is zero, hence `|| true`.
TASK_TOTAL=$(grep -c -- '[^[:space:]]' "$TASKS_FILE" || true)
START_TIME=$(date +%s)

# The task list is read on file descriptor 3 so that nothing inside the loop
# can consume it through stdin. `|| [[ -n "$line" ]]` keeps a final line that
# lacks a trailing newline.
while IFS= read -r -u 3 line || [[ -n "$line" ]]; do
  # Blank lines are not tasks.
  if [[ ! "$line" =~ [^[:space:]] ]]; then
    continue
  fi

  TASK_ID=$(jq -r '.id' <<<"$line")
  TASK_PROMPT=$(jq -r '.prompt' <<<"$line")
  TASK_COUNT=$((TASK_COUNT + 1))
  OUTPUT_FILE="${WORK_DIR}/${TASK_ID}.txt"
  SCORE_FILE="${WORK_DIR}/${TASK_ID}.json"
  echo " [${TASK_COUNT}/${TASK_TOTAL}] ${TASK_ID}..."

  # Run the task. stdin is /dev/null so a child that reads stdin (ssh does by
  # default) cannot swallow input that was never meant for it.
  bash "${SCRIPT_DIR}/scripts/run_one.sh" \
    "$TASK_PROMPT" \
    "$OUTPUT_FILE" \
    "$SSH_TARGET" \
    "$SSH_PORT" </dev/null

  # Score it; the scorer's stdout is captured as a JSON document.
  SCORE_JSON=$(bash "${SCRIPT_DIR}/scripts/score.sh" "$OUTPUT_FILE" </dev/null)
  echo "$SCORE_JSON" > "$SCORE_FILE"

  # Extract scores
  TASK_SCORE=$(jq -r '.score' <<<"$SCORE_JSON")
  TASK_SIDETRACK=$(jq -r '.no_sidetracking' <<<"$SCORE_JSON")
  TASK_PARSE=$(jq -r '.diagram_parseable' <<<"$SCORE_JSON")
  TASK_CHARS=$(jq -r '.char_count' <<<"$SCORE_JSON")

  TOTAL_SCORE=$((TOTAL_SCORE + TASK_SCORE))
  if ((TASK_SIDETRACK == 0)); then
    SIDETRACK_COUNT=$((SIDETRACK_COUNT + 1))
  fi
  if ((TASK_PARSE == 1)); then
    PARSE_COUNT=$((PARSE_COUNT + 1))
  fi
  # "sidetrack" is shown inverted relative to .no_sidetracking (1 - value).
  echo " score=${TASK_SCORE}/6 sidetrack=$((1 - TASK_SIDETRACK)) parseable=${TASK_PARSE} chars=${TASK_CHARS}"
done 3< "$TASKS_FILE"

END_TIME=$(date +%s)
TOTAL_SECONDS=$((END_TIME - START_TIME))
# ─── Cleanup ─────────────────────────────────────────────────────────────────
# The scratch directory is removed by the EXIT trap installed above.
# ─── Output METRIC lines ────────────────────────────────────────────────────
# One blank separator line, then one "METRIC key=value" line per aggregate.
# printf reuses its format string for each argument, so every argument below
# becomes its own METRIC line.
printf '\n'
printf 'METRIC %s\n' \
  "score=${TOTAL_SCORE}" \
  "sidetrack_count=${SIDETRACK_COUNT}" \
  "parse_count=${PARSE_COUNT}" \
  "total_seconds=${TOTAL_SECONDS}" \
  "skill_chars=${SKILL_CHARS}"