Merge pull request #213 from mkimhi/agentbench-lite-suite

Xiao9905 · web-flow · commit d1e4a10db08c · 2026-02-09T01:00:40.000+08:00
Add lite presets for minimal local runs
diff --git a/.github/workflows/lite-configs.yml b/.github/workflows/lite-configs.yml
@@ -0,0 +1,26 @@
+name: validate-lite-configs
+
+on:
+  pull_request:
+  push:
+    branches: [ main ]
+
+jobs:
+  validate:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.9"
+
+      - name: Install minimal deps
+        run: |
+          python -m pip install --upgrade pip
+          python -m pip install pyyaml
+
+      - name: Validate lite configs
+        run: |
+          python scripts/validate_lite_configs.py
diff --git a/README.md b/README.md
@@ -151,6 +151,9 @@ and [Program Entrance Guide](docs/Entrance_en.md).
 
 Clone this repo and install the dependencies.
 
+> **Python version note:** AgentBench pins older scientific Python dependencies (e.g. `numpy` 1.23.x), which may fail to install on newer Python versions.
+> Using the recommended **Python 3.9** (via conda) is the most reliable way to install the dependencies.
+
 ```bash
 cd AgentBench
 conda create -n agent-bench python=3.9
@@ -200,6 +203,14 @@ python -m src.start_task -a
 This will launch five task_workers each for `dbbench-std` and `os-std` tasks and automatically connect them
 to the controller on port 5000. **After executing this command, please allow approximately 1 minute for the task setup to complete.** If the terminal shows ".... 200 OK", you can open another terminal and follow step 4.
 
+#### Lite preset (laptops / limited RAM)
+
+If you want to start with minimal concurrency (1 worker per task), use the lite preset:
+
+```bash
+python -m src.start_task -a --config configs/start_task_lite.yaml
+```
+
 ### Step 4. Start the assigner
 
 This step is to actually start the tasks.
@@ -210,6 +221,12 @@ If everything is correctly configured so far, you can now initiate the task test
 python -m src.assigner
 ```
 
+If you started the task server with the lite preset, you can also run the lite evaluation preset:
+
+```bash
+python -m src.assigner --config configs/assignments/lite.yaml
+```
+
 ## Next Steps
 
 If you wish to launch more tasks or use other models, you can refer to the content
diff --git a/configs/assignments/lite.yaml b/configs/assignments/lite.yaml
@@ -0,0 +1,22 @@
+# Lite preset: evaluate only low-resource tasks with minimal concurrency.
+#
+# Usage:
+#   python -m src.assigner --config configs/assignments/lite.yaml
+
+import: definition.yaml
+
+concurrency:
+  task:
+    dbbench-std: 1
+    os-std: 1
+  agent:
+    gpt-3.5-turbo-0613: 1
+
+assignments:
+  - agent:
+      - gpt-3.5-turbo-0613
+    task:
+      - dbbench-std
+      - os-std
+
+output: "outputs/{TIMESTAMP}"
diff --git a/configs/start_task_lite.yaml b/configs/start_task_lite.yaml
@@ -0,0 +1,15 @@
+# Lite preset: start only low-resource tasks.
+# Intended for laptops / limited RAM.
+#
+# Usage:
+#   python -m src.start_task -a --config configs/start_task_lite.yaml
+#
+# You can still override tasks at runtime:
+#   python -m src.start_task -a --config configs/start_task_lite.yaml -s dbbench-std 2 os-std 2
+
+definition:
+  import: tasks/task_assembly.yaml
+
+start:
+  dbbench-std: 1
+  os-std: 1
diff --git a/scripts/validate_lite_configs.py b/scripts/validate_lite_configs.py
@@ -0,0 +1,107 @@
+#!/usr/bin/env python3
+"""Validate AgentBench lite preset configs.
+
+Goal: a fast smoke check that doesn't require building Docker images or running
+any tasks. It only validates that:
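+- the lite config files and configs/tasks/task_assembly.yaml exist and parse as YAML mappings
+- tasks started by configs/start_task_lite.yaml match an import in configs/tasks/task_assembly.yaml
+- tasks assigned in configs/assignments/lite.yaml match an import in the same task assembly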
+
+Run:
+  python scripts/validate_lite_configs.py
+"""
+
+from __future__ import annotations
+
+import pathlib
+import sys
+from typing import Any, Dict, Set
+
+import yaml
+
+ROOT = pathlib.Path(__file__).resolve().parents[1]
+
+
+def load_yaml(path: pathlib.Path) -> Dict[str, Any]:
+    """Return the top-level mapping of the YAML file *path* ({} if empty); raise RuntimeError otherwise."""
+    try:
+        data = yaml.safe_load(path.read_text(encoding="utf-8"))
+    except Exception as e:
+        raise RuntimeError(f"Failed to parse YAML: {path}: {e}") from e
+    if data is None:
+        return {}
+    if not isinstance(data, dict):
+        raise RuntimeError(f"Expected mapping at top-level in {path}, got {type(data)}")
+    return data
+
+
+def main() -> int:
+    """Check the lite presets against the task assembly and return 0 on success.
+
+    A task such as "dbbench-std" is known when the text before its first "-" ("dbbench")
+    is the file stem of an entry in the ``import`` list of configs/tasks/task_assembly.yaml.
+
+    Raises:
+        RuntimeError: a file is missing or malformed, or a preset names an unknown task.
+    """
+    configs_dir = ROOT / "configs"
+    start_lite = configs_dir / "start_task_lite.yaml"
+    assign_lite = configs_dir / "assignments" / "lite.yaml"
+    task_assembly = configs_dir / "tasks" / "task_assembly.yaml"
+
+    # Confirm every file is present before parsing any of them.
+    for p in (start_lite, assign_lite, task_assembly):
+        if not p.exists():
+            raise RuntimeError(f"Missing required file: {p}")
+    start_cfg, assign_cfg, assembly_cfg = map(load_yaml, (start_lite, assign_lite, task_assembly))
+
+    imports = assembly_cfg.get("import", [])
+    if not (isinstance(imports, list) and all(isinstance(x, str) for x in imports)):
+        raise RuntimeError("configs/tasks/task_assembly.yaml must have a list field: import")
+    # An import such as "webshop.yaml" contributes the known name "webshop".
+    task_names: Set[str] = {pathlib.Path(rel).stem for rel in imports}
+
+    def is_known(task: Any) -> bool:
+        return str(task).split("-")[0] in task_names
+
+    start = start_cfg.get("start", {})
+    if not (isinstance(start, dict) and start):
+        raise RuntimeError("configs/start_task_lite.yaml must have non-empty mapping field: start")
+    # Sort as strings: YAML keys may mix types (e.g. 1 and "os-std"), which sorted() cannot order.
+    unknown_in_start = sorted(str(k) for k in start if not is_known(k))
+    if unknown_in_start:
+        raise RuntimeError(
+            "start_task_lite.yaml references tasks not present in task_assembly imports: "
+            + ", ".join(unknown_in_start)
+        )
+
+    assignments = assign_cfg.get("assignments")
+    if not (isinstance(assignments, list) and assignments):
+        raise RuntimeError("configs/assignments/lite.yaml must have a non-empty list field: assignments")
+
+    unknown_in_assign = []
+    for entry in assignments:
+        if not isinstance(entry, dict):
+            raise RuntimeError("Each assignment must be a mapping")
+        tasks = entry.get("task", [])
+        if isinstance(tasks, str):
+            tasks = [tasks]
+        elif not isinstance(tasks, list):
+            raise RuntimeError("assignment.task must be a string or list")
+        unknown_in_assign.extend(t for t in tasks if not is_known(t))
+    if unknown_in_assign:
+        raise RuntimeError(
+            "assignments/lite.yaml references tasks not present in task_assembly imports: "
+            + ", ".join(map(str, unknown_in_assign))
+        )
+
+    print("OK: lite configs look valid")
+    return 0
+
+
+if __name__ == "__main__":
+    # Top-level boundary: report any failure as "ERROR: ..." on stderr and exit with status 1.
+    try:
+        sys.exit(main())
+    except Exception as err:
+        sys.exit(f"ERROR: {err}")