102 lines
3.7 KiB
Bash
Executable File
102 lines
3.7 KiB
Bash
Executable File
#!/usr/bin/env bash
set -euo pipefail

# ─── autoresearch.sh ─────────────────────────────────────────────────────────
# Benchmark script for sequence diagram skill optimization.
# Runs every task listed in benchmark/tasks.jsonl, scores each one, and
# prints the aggregate results as METRIC lines.
#
# Optional settings (from the environment or scripts/config.env):
#   SSH_TARGET    passed through to scripts/run_one.sh (default: empty)
#   SSH_PORT      passed through to scripts/run_one.sh (default: 2222)
#   TASK_TIMEOUT  exported for child processes         (default: 180)
# ─────────────────────────────────────────────────────────────────────────────

# Absolute directory of this script, so every path below works from any cwd.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Optional local overrides. A missing file is fine; a file that fails to load
# is reported on stderr but does not abort the run (loading stays best-effort).
CONFIG_FILE="${SCRIPT_DIR}/scripts/config.env"
if [[ -r "$CONFIG_FILE" ]]; then
  # shellcheck disable=SC1090
  source "$CONFIG_FILE" || printf 'WARN: could not load %s\n' "$CONFIG_FILE" >&2
fi

# Defaults
SSH_TARGET="${SSH_TARGET:-}"
SSH_PORT="${SSH_PORT:-2222}"
export TASK_TIMEOUT="${TASK_TIMEOUT:-180}"

# ─── Pre-checks ──────────────────────────────────────────────────────────────

# The skill under test must exist and be non-empty.
SKILL_FILE="${SCRIPT_DIR}/skill/SKILL.md"
if [[ ! -s "$SKILL_FILE" ]]; then
  echo "ERROR: skill/SKILL.md is missing or empty" >&2
  exit 1
fi

# Size of the skill file. Note that wc -c counts bytes, not characters.
# Some wc implementations (BSD/macOS) pad the number with leading blanks;
# strip all whitespace so the value is a bare number wherever it is printed.
SKILL_CHARS=$(wc -c < "$SKILL_FILE")
SKILL_CHARS="${SKILL_CHARS//[[:space:]]/}"
echo "Skill: ${SKILL_CHARS} chars"

# The task list must exist before any work starts.
TASKS_FILE="${SCRIPT_DIR}/benchmark/tasks.jsonl"
if [[ ! -f "$TASKS_FILE" ]]; then
  echo "ERROR: benchmark/tasks.jsonl not found" >&2
  exit 1
fi

echo "────────────────────────────────────────────────────"

# ─── Run all tasks ───────────────────────────────────────────────────────────

#######################################
# Succeed only if every argument is a plain decimal integer.
# Used to vet values reported by score.sh before they reach shell arithmetic,
# where a non-numeric string would be evaluated as an expression.
# Arguments: one or more candidate strings
# Returns:   0 if all are integers, 1 otherwise (or if none were given)
#######################################
is_int() {
  local candidate
  (( $# > 0 )) || return 1
  for candidate in "$@"; do
    [[ "$candidate" =~ ^-?[0-9]+$ ]] || return 1
  done
  return 0
}

# Private scratch directory for per-task outputs and scores. It has its own
# name rather than reusing TMPDIR, so an exported TMPDIR is not overwritten
# for child processes. The EXIT trap removes it on every exit path, including
# a set -e abort in the middle of the run.
WORK_DIR=$(mktemp -d)
cleanup() { rm -rf -- "${WORK_DIR:?}"; }
trap cleanup EXIT

TOTAL_SCORE=0
SIDETRACK_COUNT=0
PARSE_COUNT=0
TASK_COUNT=0

# Number of non-blank task lines; used only for the progress display.
TASK_TOTAL=$(grep -c '[^[:space:]]' "$TASKS_FILE" || true)

START_TIME=$(date +%s)

# Tasks are read on fd 3 rather than stdin, so a child process that reads
# stdin (for example ssh, if run_one.sh invokes it) cannot swallow the
# remaining task lines. The `|| [[ -n "$line" ]]` clause keeps a final line
# that lacks a trailing newline.
while IFS= read -r -u 3 line || [[ -n "$line" ]]; do
  # Skip blank lines instead of feeding them to jq.
  [[ "$line" =~ [^[:space:]] ]] || continue

  TASK_ID=$(jq -r '.id' <<<"$line")
  TASK_PROMPT=$(jq -r '.prompt' <<<"$line")
  TASK_COUNT=$((TASK_COUNT + 1))

  OUTPUT_FILE="${WORK_DIR}/${TASK_ID}.txt"
  SCORE_FILE="${WORK_DIR}/${TASK_ID}.json"

  echo " [${TASK_COUNT}/${TASK_TOTAL}] ${TASK_ID}..."

  # Run the task
  bash "${SCRIPT_DIR}/scripts/run_one.sh" \
    "$TASK_PROMPT" \
    "$OUTPUT_FILE" \
    "$SSH_TARGET" \
    "$SSH_PORT"

  # Score it
  SCORE_JSON=$(bash "${SCRIPT_DIR}/scripts/score.sh" "$OUTPUT_FILE")
  printf '%s\n' "$SCORE_JSON" > "$SCORE_FILE"

  # Extract scores
  TASK_SCORE=$(jq -r '.score' <<<"$SCORE_JSON")
  TASK_SIDETRACK=$(jq -r '.no_sidetracking' <<<"$SCORE_JSON")
  TASK_PARSE=$(jq -r '.diagram_parseable' <<<"$SCORE_JSON")
  TASK_CHARS=$(jq -r '.char_count' <<<"$SCORE_JSON")

  # Fail with a clear message rather than a cryptic arithmetic error when a
  # field is missing (jq prints "null") or otherwise not an integer.
  if ! is_int "$TASK_SCORE" "$TASK_SIDETRACK" "$TASK_PARSE"; then
    echo "ERROR: score.sh returned a non-integer score field for ${TASK_ID}: ${SCORE_JSON}" >&2
    exit 1
  fi

  TOTAL_SCORE=$((TOTAL_SCORE + TASK_SCORE))

  # no_sidetracking == 0 means the task did sidetrack.
  if (( TASK_SIDETRACK == 0 )); then
    SIDETRACK_COUNT=$((SIDETRACK_COUNT + 1))
  fi

  if (( TASK_PARSE == 1 )); then
    PARSE_COUNT=$((PARSE_COUNT + 1))
  fi

  echo " score=${TASK_SCORE}/6 sidetrack=$(( 1 - TASK_SIDETRACK )) parseable=${TASK_PARSE} chars=${TASK_CHARS}"

done 3< "$TASKS_FILE"

END_TIME=$(date +%s)
TOTAL_SECONDS=$((END_TIME - START_TIME))

# ─── Cleanup ─────────────────────────────────────────────────────────────────

# Remove the scratch directory now. The EXIT trap stays armed and is harmless
# to run a second time.
cleanup

# ─── Output METRIC lines ────────────────────────────────────────────────────

#######################################
# Print the run summary: one blank line followed by one "METRIC name=value"
# line per aggregate value.
# Globals:   TOTAL_SCORE, SIDETRACK_COUNT, PARSE_COUNT, TOTAL_SECONDS,
#            SKILL_CHARS (all read)
# Outputs:   writes the summary to stdout
#######################################
emit_metrics() {
  printf '\n'
  printf 'METRIC score=%s\n' "${TOTAL_SCORE}"
  printf 'METRIC sidetrack_count=%s\n' "${SIDETRACK_COUNT}"
  printf 'METRIC parse_count=%s\n' "${PARSE_COUNT}"
  printf 'METRIC total_seconds=%s\n' "${TOTAL_SECONDS}"
  printf 'METRIC skill_chars=%s\n' "${SKILL_CHARS}"
}

emit_metrics