From e0e9867d399969e5b0df6b30ef38daa6f179647f Mon Sep 17 00:00:00 2001
From: Bharath Sriraam R R <albharath.1305@gmail.com>
Date: Fri, 23 Jan 2026 23:35:07 -0500
Subject: [PATCH 1/4] Add benchmarks evaluation integration via submodule

- Add adityasoni9998/benchmarks as git submodule (agentic_code_search branch)
- Add eval_runner.py that uses sys.path to import benchmarks at runtime
- Add run_eval.sh wrapper script for running evaluations
- Add minimal deps (jinja2, pandas, tqdm, lmnr) needed for benchmarks

This allows running agentic_code_search evaluations using the benchmarks
repo while keeping our existing SDK and training setup intact.
---
 .gitmodules            |  4 ++++
 benchmarks             |  1 +
 pyproject.toml         |  4 ++++
 scripts/eval_runner.py | 33 +++++++++++++++++++++++++++++++++
 scripts/run_eval.sh    | 21 +++++++++++++++++++++
 5 files changed, 63 insertions(+)
 create mode 160000 benchmarks
 create mode 100755 scripts/eval_runner.py
 create mode 100755 scripts/run_eval.sh
diff --git a/.gitmodules b/.gitmodules
index ad55c20..27f5528 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -4,3 +4,7 @@
 [submodule "prime-rl"]
 	path = prime-rl
 	url = https://github.com/PrimeIntellect-ai/prime-rl
+[submodule "benchmarks"]
+	path = benchmarks
+	url = https://github.com/adityasoni9998/benchmarks.git
+	branch = agentic_code_search
diff --git a/benchmarks b/benchmarks
new file mode 160000
index 0000000..160f527
--- /dev/null
+++ b/benchmarks
@@ -0,0 +1 @@
+Subproject commit 160f5279769281d013ce6ccff429a7ca354c2d58
diff --git a/pyproject.toml b/pyproject.toml
index cce1f28..e05a138 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -25,6 +25,10 @@ dependencies = [
     "seaborn>=0.13.2",
     "gcsfs>=2025.3.0",
     "lmcache",
+    "jinja2",
+    "pandas",
+    "tqdm",
+    "lmnr>=0.7.24",
     # "flashinfer-python",
     # "flashinfer-jit-cache",
 ]
diff --git a/scripts/eval_runner.py b/scripts/eval_runner.py
new file mode 100755
index 0000000..f6a6efd
--- /dev/null
+++ b/scripts/eval_runner.py
@@ -0,0 +1,33 @@
+#!/usr/bin/env python3
+"""
+Evaluation runner for agentic code search benchmark.
+
+This script adds the benchmarks submodule to sys.path and runs the
+agentic_code_search evaluation from the benchmarks package.
+
+Usage:
+    python scripts/eval_runner.py --dataset_file <path> --llm-config-path <path> [options]
+
+Example:
+    python scripts/eval_runner.py \
+        --dataset_file ./data/test.jsonl \
+        --llm-config-path ./configs/llm.json \
+        --output-dir ./outputs \
+        --max-iterations 25 \
+        --num-workers 4
+
+For all available options, run:
+    python scripts/eval_runner.py --help
+"""
+
+import sys
+from pathlib import Path
+
+# Add the benchmarks submodule to sys.path so we can import from it
+_benchmarks_path = Path(__file__).parent.parent / "benchmarks"
+sys.path.insert(0, str(_benchmarks_path.resolve()))
+
+from benchmarks.agentic_code_search.run_infer import main
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/run_eval.sh b/scripts/run_eval.sh
new file mode 100755
index 0000000..4a3b2d7
--- /dev/null
+++ b/scripts/run_eval.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+#
+# run_eval.sh - Wrapper script to run the evaluation runner with uv
+#
+# Usage:
+#   ./scripts/run_eval.sh [OPTIONS]
+#
+# Example usage:
+#   ./scripts/run_eval.sh \
+#     --dataset_file benchmarks/gt_location.jsonl \
+#     --llm-config-path configs/llm_config.json \
+#     --max-iterations 10 \
+#     --num-workers 1 \
+#     --tools terminal
+#
+# Options are passed through to eval_runner.py, located next to this script.
+# Run with --help to see all available options:
+#   ./scripts/run_eval.sh --help
+#
+
+exec uv run python "$(dirname "${BASH_SOURCE[0]}")/eval_runner.py" "$@"

From c103eae8a22a814478cacd34731ce732ee2c31af Mon Sep 17 00:00:00 2001
From: Bharath Sriraam R R <albharath.1305@gmail.com>
Date: Fri, 23 Jan 2026 23:55:12 -0500
Subject: [PATCH 2/4] Add evaluation integration documentation

---
 docs/EVAL_INTEGRATION.md | 181 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 181 insertions(+)
 create mode 100644 docs/EVAL_INTEGRATION.md

diff --git a/docs/EVAL_INTEGRATION.md b/docs/EVAL_INTEGRATION.md
new file mode 100644
index 0000000..4135218
--- /dev/null
+++ b/docs/EVAL_INTEGRATION.md
@@ -0,0 +1,181 @@
+# Evaluation Integration Documentation
+
+## Goal
+
+Integrate evaluation code from the [benchmarks repo](https://github.com/adityasoni9998/benchmarks/tree/agentic_code_search) into this repository to enable end-to-end training AND evaluation of code localization agents for the ICML submission.
+
+**Key requirements:**
+- Run trained models on SWE-Bench Pro/Verified benchmarks
+- Use the same `software-agent-sdk` for both training and evaluation
+- No dependency conflicts with existing SkyRL training setup
+
+## Solution Approach
+
+### The Problem
+
+The benchmarks repo is designed as a standalone project with its own workspace pointing to `vendor/software-agent-sdk/`. Directly integrating it as a workspace member caused:
+
+1. **Nested workspace error** - uv doesn't support workspaces inside workspaces
+2. **Dependency conflicts** - `commit0` requires `datasets==3.0.1`, we need `>=4.0.0`
+
+### The Solution: Runtime sys.path Manipulation
+
+Instead of making benchmarks a proper package in our workspace, we use Python's `sys.path` to import it at runtime:
+
+```python
+import sys
+sys.path.insert(0, "/path/to/benchmarks")
+
+# Now imports work - and they use OUR installed SDK
+from benchmarks.agentic_code_search.run_infer import main
+```
+
+**Why this works:**
+- When benchmarks code imports `openhands.sdk`, Python searches `sys.path`
+- Our SDK packages are already installed via uv workspace
+- Python finds our SDK first, not benchmarks' vendor/ (which doesn't exist anyway)
+
+## Files Added/Modified
+
+| File | Description |
+|------|-------------|
+| `benchmarks/` | Git submodule pointing to adityasoni9998/benchmarks@agentic_code_search |
+| `.gitmodules` | Submodule configuration |
+| `pyproject.toml` | Added jinja2, pandas, tqdm, lmnr dependencies |
+| `scripts/eval_runner.py` | Python wrapper that sets up sys.path and runs eval |
+| `scripts/run_eval.sh` | Shell wrapper for `uv run` |
+
+## Architecture
+
+```
+agentic-code-search-oss/
+├── software-agent-sdk/          # Our SDK (used for training AND eval)
+│   ├── openhands-sdk/
+│   ├── openhands-tools/
+│   └── ...
+├── benchmarks/                   # Submodule (NOT in workspace)
+│   └── benchmarks/
+│       └── agentic_code_search/
+│           ├── run_infer.py      # Main eval script
+│           ├── eval_infer.py     # Results aggregator
+│           └── prompts/          # Jinja2 templates
+├── scripts/
+│   ├── eval_runner.py            # sys.path wrapper
+│   └── run_eval.sh               # Shell wrapper
+└── src/                          # Training code (unchanged)
+```
+
+## How Evaluation Works
+
+```
+┌─────────────────┐
+│  run_eval.sh    │
+└────────┬────────┘
+         │ uv run
+         ▼
+┌─────────────────┐
+│ eval_runner.py  │
+│                 │
+│ sys.path.insert │
+│ (benchmarks/)   │
+└────────┬────────┘
+         │ import
+         ▼
+┌─────────────────────────────────┐
+│ benchmarks.agentic_code_search  │
+│                                 │
+│ from openhands.sdk import ...   │──► Uses OUR SDK
+└─────────────────────────────────┘
+```
+
+## Learnings
+
+1. **uv workspaces don't nest** - Can't add a package with its own workspace as a member
+2. **sys.path manipulation is clean** - Keeps submodule pristine, easy to update
+3. **Python import resolution** - First match in sys.path wins, so our installed SDK is used
+4. **Dependency isolation** - We only add deps we actually need, avoiding conflicts
+
+## What to Test
+
+### On Linux with CUDA (training machine)
+
+1. **Sync dependencies:**
+   ```bash
+   uv sync
+   ```
+
+2. **Test import works:**
+   ```bash
+   uv run python -c "
+   import sys
+   sys.path.insert(0, 'benchmarks')
+   from benchmarks.agentic_code_search.run_infer import main
+   print('Import successful!')
+   "
+   ```
+
+3. **Run a minimal evaluation:**
+   ```bash
+   # Create LLM config file first
+   cat > configs/llm_config.json << 'EOF'
+   {
+     "model": "openai/gpt-4o-mini",
+     "api_key": "your-api-key",
+     "base_url": "https://api.openai.com/v1",
+     "temperature": 0.0
+   }
+   EOF
+
+   # Run eval on 1 instance
+   ./scripts/run_eval.sh \
+     --dataset_file benchmarks/gt_location.jsonl \
+     --llm-config-path configs/llm_config.json \
+     --max-iterations 10 \
+     --num-workers 1 \
+     --tools terminal \
+     --n-limit 1
+   ```
+
+4. **Verify training still works:**
+   ```bash
+   # Your existing training command should work unchanged
+   bash scripts/run_async_training.sh -m Qwen/Qwen3-4B -d $DATA_PATH
+   ```
+
+### Expected Output Format
+
+The evaluation produces JSONL output with F1 scores for:
+- **File-level**: Did the agent find the correct files?
+- **Module-level**: Did it find the correct classes?
+- **Entity-level**: Did it find the correct functions/methods?
+
+Example output:
+```json
+{
+  "instance_id": "astropy__astropy-12907",
+  "test_result": {
+    "reward": {
+      "file_reward": 1.0,
+      "module_reward": 0.8,
+      "entity_reward": 0.6
+    },
+    "raw_prediction": "astropy/modeling/separable.py\nfunction: _cstack",
+    "wall_time_seconds": 45.2,
+    "num_steps": 5,
+    "num_tool_calls": 12
+  }
+}
+```
+
+## Next Steps
+
+1. **Test on training machine** - Verify uv sync works with CUDA deps
+2. **Prepare SWE-Bench Pro/Verified datasets** - May need to download separately
+3. **Run base model evals** - Establish baseline before training
+4. **Integrate with training loop** - Optional: run evals at checkpoints
+
+## References
+
+- [Benchmarks repo](https://github.com/adityasoni9998/benchmarks/tree/agentic_code_search)
+- [Original Slack conversation](#) - Aditya's integration instructions
+- [SWE-Bench](https://www.swebench.com/) - Benchmark website

From b5c37dd255a418dfccd0bc3b1c02c4d311be054e Mon Sep 17 00:00:00 2001
From: Bharath Sriraam R R <albharath.1305@gmail.com>
Date: Fri, 23 Jan 2026 23:57:54 -0500
Subject: [PATCH 3/4] Fix markdown formatting in eval integration docs

---
 docs/EVAL_INTEGRATION.md | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/docs/EVAL_INTEGRATION.md b/docs/EVAL_INTEGRATION.md
index 4135218..5e05ab4 100644
--- a/docs/EVAL_INTEGRATION.md
+++ b/docs/EVAL_INTEGRATION.md
@@ -2,9 +2,10 @@
 
 ## Goal
 
-Integrate evaluation code from the [benchmarks repo](https://github.com/adityasoni9998/benchmarks/tree/agentic_code_search) into this repository to enable end-to-end training AND evaluation of code localization agents for the ICML submission.
+Integrate evaluation code from the [benchmarks repo](https://github.com/adityasoni9998/benchmarks/tree/agentic_code_search) into this repository to enable end-to-end training AND evaluation of code localization agents.
 
 **Key requirements:**
+
 - Run trained models on SWE-Bench Pro/Verified benchmarks
 - Use the same `software-agent-sdk` for both training and evaluation
 - No dependency conflicts with existing SkyRL training setup
@@ -31,19 +32,20 @@ from benchmarks.agentic_code_search.run_infer import main
 ```
 
 **Why this works:**
+
 - When benchmarks code imports `openhands.sdk`, Python searches `sys.path`
 - Our SDK packages are already installed via uv workspace
 - Python finds our SDK first, not benchmarks' vendor/ (which doesn't exist anyway)
 
 ## Files Added/Modified
 
-| File | Description |
-|------|-------------|
-| `benchmarks/` | Git submodule pointing to adityasoni9998/benchmarks@agentic_code_search |
-| `.gitmodules` | Submodule configuration |
-| `pyproject.toml` | Added jinja2, pandas, tqdm, lmnr dependencies |
-| `scripts/eval_runner.py` | Python wrapper that sets up sys.path and runs eval |
-| `scripts/run_eval.sh` | Shell wrapper for `uv run` |
+| File                     | Description                                                             |
+| ------------------------ | ----------------------------------------------------------------------- |
+| `benchmarks/`            | Git submodule pointing to adityasoni9998/benchmarks@agentic_code_search |
+| `.gitmodules`            | Submodule configuration                                                 |
+| `pyproject.toml`         | Added jinja2, pandas, tqdm, lmnr dependencies                           |
+| `scripts/eval_runner.py` | Python wrapper that sets up sys.path and runs eval                      |
+| `scripts/run_eval.sh`    | Shell wrapper for `uv run`                                              |
 
 ## Architecture
 
@@ -100,11 +102,13 @@ agentic-code-search-oss/
 ### On Linux with CUDA (training machine)
 
 1. **Sync dependencies:**
+
    ```bash
    uv sync
    ```
 
 2. **Test import works:**
+
    ```bash
    uv run python -c "
    import sys
@@ -115,6 +119,7 @@ agentic-code-search-oss/
    ```
 
 3. **Run a minimal evaluation:**
+
    ```bash
    # Create LLM config file first
    cat > configs/llm_config.json << 'EOF'
@@ -145,11 +150,13 @@ agentic-code-search-oss/
 ### Expected Output Format
 
 The evaluation produces JSONL output with F1 scores for:
+
 - **File-level**: Did the agent find the correct files?
 - **Module-level**: Did it find the correct classes?
 - **Entity-level**: Did it find the correct functions/methods?
 
 Example output:
+
 ```json
 {
   "instance_id": "astropy__astropy-12907",

From 3dba6a50811a1bda6af6e1ad494b50a8528ea2ae Mon Sep 17 00:00:00 2001
From: Bharath Sriraam R R <albharath.1305@gmail.com>
Date: Sat, 24 Jan 2026 05:25:22 +0000
Subject: [PATCH 4/4] Fix eval integration and update documentation

- Fix version module patching in eval_runner.py to use parent repo's SDK SHA
  instead of benchmarks/vendor/ which doesn't exist in our setup
- Restructure EVAL_INTEGRATION.md with practical quick start guide
- Add example LLM config file for vLLM setup
- Add troubleshooting section for common issues (litellm provider prefix,
  vLLM tool calling flags, stale output)
---
 configs/eval_llm_config_example.json |   6 +
 docs/EVAL_INTEGRATION.md             | 223 ++++++++++++++++-----------
 scripts/eval_runner.py               |  30 ++++
 3 files changed, 173 insertions(+), 86 deletions(-)
 create mode 100644 configs/eval_llm_config_example.json

diff --git a/configs/eval_llm_config_example.json b/configs/eval_llm_config_example.json
new file mode 100644
index 0000000..735e911
--- /dev/null
+++ b/configs/eval_llm_config_example.json
@@ -0,0 +1,6 @@
+{
+  "model": "openai/Qwen/Qwen3-4B",
+  "api_key": "dummy",
+  "base_url": "http://localhost:8000/v1",
+  "temperature": 0.0
+}
diff --git a/docs/EVAL_INTEGRATION.md b/docs/EVAL_INTEGRATION.md
index 5e05ab4..6734639 100644
--- a/docs/EVAL_INTEGRATION.md
+++ b/docs/EVAL_INTEGRATION.md
@@ -1,6 +1,98 @@
 # Evaluation Integration Documentation
 
-## Goal
+This document explains how to run evaluations for code localization agents using the integrated benchmarks system.
+
+## Quick Start
+
+### 1. Start a Local Model with vLLM
+
+Start vLLM with tool calling enabled:
+
+```bash
+# For a small model (quick testing)
+uv run vllm serve Qwen/Qwen3-4B \
+  --port 8000 \
+  --max-model-len 32768 \
+  --enable-auto-tool-choice \
+  --tool-call-parser hermes
+```
+### 2. Create LLM Config
+
+```bash
+mkdir -p configs
+cat > configs/llm_config.json << 'EOF'
+{
+  "model": "openai/Qwen/Qwen3-4B",
+  "api_key": "dummy",
+  "base_url": "http://localhost:8000/v1",
+  "temperature": 0.0
+}
+EOF
+```
+
+**Important:** The model name must be prefixed with `openai/` to tell litellm it's an OpenAI-compatible endpoint.
+
+### 3. Run Evaluation
+
+```bash
+./scripts/run_eval.sh \
+    --dataset_file benchmarks/gt_location.jsonl \
+    --llm-config-path configs/llm_config.json \
+    --system_prompt_file benchmarks/benchmarks/agentic_code_search/prompts/system_prompt.j2 \
+    --user_prompt_file benchmarks/benchmarks/agentic_code_search/prompts/file_module_short.j2 \
+    --tools terminal \
+    --max-iterations 10 \
+    --num-workers 1 \
+    --output-dir ./agentic_code_search_outputs \
+    --n-limit 1 \
+    --workspace_base_dir /tmp/testbed/
+```
+
+**Key options:**
+- `--n-limit 1` - Run on 1 instance (remove for full dataset)
+- `--num-workers 1` - Parallel workers (increase for faster eval)
+- `--max-iterations 10` - Max agent steps per instance
+
+### 4. Check Results
+
+```bash
+# View full output
+cat ./agentic_code_search_outputs/agentic_code_search_gt_location/openai/Qwen/Qwen3-4B_sdk_*/output.jsonl | jq .
+
+# View just the reward scores
+cat ./agentic_code_search_outputs/agentic_code_search_gt_location/openai/Qwen/Qwen3-4B_sdk_*/output.jsonl | jq '.test_result.reward'
+```
+
+### Example Output
+
+```json
+{
+  "file_reward": 0.5,
+  "module_reward": 0.5,
+  "entity_reward": 0.4,
+  "prediction": {
+    "files": ["sklearn/calibration.py", "sklearn/_config.py", "sklearn/isotonic.py"],
+    "modules": ["sklearn/calibration.py:_CalibratedClassifier", "sklearn/_config.py:set_config", "sklearn/isotonic.py:IsotonicRegression"],
+    "entities": ["sklearn/isotonic.py:IsotonicRegression.predict", "sklearn/_config.py:set_config", "sklearn/calibration.py:_CalibratedClassifier.predict_proba"]
+  },
+  "ground_truth": {
+    "files": ["sklearn/isotonic.py"],
+    "modules": ["sklearn/isotonic.py:IsotonicRegression"],
+    "entities": ["sklearn/isotonic.py:IsotonicRegression.predict", "sklearn/isotonic.py:IsotonicRegression.transform"]
+  }
+}
+```
+
+**Metrics explained:**
+- **file_reward** - F1 score for file-level localization
+- **module_reward** - F1 score for class-level localization
+- **entity_reward** - F1 score for function/method-level localization
+
+---
+
+## Implementation Details
+
+### Goal
 
 Integrate evaluation code from the [benchmarks repo](https://github.com/adityasoni9998/benchmarks/tree/agentic_code_search) into this repository to enable end-to-end training AND evaluation of code localization agents.
 
@@ -10,8 +102,6 @@ Integrate evaluation code from the [benchmarks repo](https://github.com/adityaso
 - Use the same `software-agent-sdk` for both training and evaluation
 - No dependency conflicts with existing SkyRL training setup
 
-## Solution Approach
-
 ### The Problem
 
 The benchmarks repo is designed as a standalone project with its own workspace pointing to `vendor/software-agent-sdk/`. Directly integrating it as a workspace member caused:
@@ -37,7 +127,20 @@ from benchmarks.agentic_code_search.run_infer import main
 - Our SDK packages are already installed via uv workspace
 - Python finds our SDK first, not benchmarks' vendor/ (which doesn't exist anyway)
 
-## Files Added/Modified
+### Version Module Patching
+
+The benchmarks code has a `version.py` that tries to get the SDK SHA from `vendor/software-agent-sdk` (which doesn't exist in our setup). The `eval_runner.py` script pre-creates this module with the SHA from our repo's SDK:
+
+```python
+# Pre-create the version module with our SDK SHA before benchmarks imports it
+_sdk_sha = _get_sdk_sha_from_parent_repo()
+_version_module = ModuleType("benchmarks.utils.version")
+_version_module.SDK_SHA = _sdk_sha
+_version_module.SDK_SHORT_SHA = _sdk_sha[:7]
+sys.modules["benchmarks.utils.version"] = _version_module
+```
+
+### Files Added/Modified
 
 | File                     | Description                                                             |
 | ------------------------ | ----------------------------------------------------------------------- |
@@ -47,7 +150,7 @@ from benchmarks.agentic_code_search.run_infer import main
 | `scripts/eval_runner.py` | Python wrapper that sets up sys.path and runs eval                      |
 | `scripts/run_eval.sh`    | Shell wrapper for `uv run`                                              |
 
-## Architecture
+### Architecture
 
 ```
 agentic-code-search-oss/
@@ -67,7 +170,7 @@ agentic-code-search-oss/
 └── src/                          # Training code (unchanged)
 ```
 
-## How Evaluation Works
+### How Evaluation Works
 
 ```
 ┌─────────────────┐
@@ -90,99 +193,47 @@ agentic-code-search-oss/
 └─────────────────────────────────┘
 ```
 
-## Learnings
+### Learnings
 
 1. **uv workspaces don't nest** - Can't add a package with its own workspace as a member
 2. **sys.path manipulation is clean** - Keeps submodule pristine, easy to update
 3. **Python import resolution** - First match in sys.path wins, so our installed SDK is used
 4. **Dependency isolation** - We only add deps we actually need, avoiding conflicts
+5. **Version module patching** - Pre-create the version module to use our repo's SDK SHA
+6. **litellm provider prefix** - Local vLLM endpoints need `openai/` prefix in model name
+7. **vLLM tool calling** - Requires `--enable-auto-tool-choice --tool-call-parser hermes` flags
 
-## What to Test
-
-### On Linux with CUDA (training machine)
-
-1. **Sync dependencies:**
-
-   ```bash
-   uv sync
-   ```
-
-2. **Test import works:**
-
-   ```bash
-   uv run python -c "
-   import sys
-   sys.path.insert(0, 'benchmarks')
-   from benchmarks.agentic_code_search.run_infer import main
-   print('Import successful!')
-   "
-   ```
+---
 
-3. **Run a minimal evaluation:**
+## Troubleshooting
 
-   ```bash
-   # Create LLM config file first
-   cat > configs/llm_config.json << 'EOF'
-   {
-     "model": "openai/gpt-4o-mini",
-     "api_key": "your-api-key",
-     "base_url": "https://api.openai.com/v1",
-     "temperature": 0.0
-   }
-   EOF
+### "LLM Provider NOT provided"
 
-   # Run eval on 1 instance
-   ./scripts/run_eval.sh \
-     --dataset_file benchmarks/gt_location.jsonl \
-     --llm-config-path configs/llm_config.json \
-     --max-iterations 10 \
-     --num-workers 1 \
-     --tools terminal \
-     --n-limit 1
-   ```
-
-4. **Verify training still works:**
-   ```bash
-   # Your existing training command should work unchanged
-   bash scripts/run_async_training.sh -m Qwen/Qwen3-4B -d $DATA_PATH
-   ```
-
-### Expected Output Format
-
-The evaluation produces JSONL output with F1 scores for:
-
-- **File-level**: Did the agent find the correct files?
-- **Module-level**: Did it find the correct classes?
-- **Entity-level**: Did it find the correct functions/methods?
+Add `openai/` prefix to your model name in `llm_config.json`:
+```json
+{"model": "openai/Qwen/Qwen3-4B", ...}
+```
 
-Example output:
+### "auto tool choice requires --enable-auto-tool-choice"
 
-```json
-{
-  "instance_id": "astropy__astropy-12907",
-  "test_result": {
-    "reward": {
-      "file_reward": 1.0,
-      "module_reward": 0.8,
-      "entity_reward": 0.6
-    },
-    "raw_prediction": "astropy/modeling/separable.py\nfunction: _cstack",
-    "wall_time_seconds": 45.2,
-    "num_steps": 5,
-    "num_tool_calls": 12
-  }
-}
+Restart vLLM with tool calling flags:
+```bash
+uv run vllm serve Qwen/Qwen3-4B \
+  --port 8000 \
+  --enable-auto-tool-choice \
+  --tool-call-parser hermes
 ```
 
-## Next Steps
+### "Processing 0 instances"
 
-1. **Test on training machine** - Verify uv sync works with CUDA deps
-2. **Prepare SWE-Bench Pro/Verified datasets** - May need to download separately
-3. **Run base model evals** - Establish baseline before training
-4. **Integrate with training loop** - Optional: run evals at checkpoints
+Previous failed runs left stale output. Delete the output directory:
+```bash
+rm -rf ./agentic_code_search_outputs/
+```
 
-## References
+### Import errors from benchmarks
 
-- [Benchmarks repo](https://github.com/adityasoni9998/benchmarks/tree/agentic_code_search)
-- [Original Slack conversation](#) - Aditya's integration instructions
-- [SWE-Bench](https://www.swebench.com/) - Benchmark website
+Ensure the submodule is initialized:
+```bash
+git submodule update --init --recursive
+```
diff --git a/scripts/eval_runner.py b/scripts/eval_runner.py
index f6a6efd..156d290 100755
--- a/scripts/eval_runner.py
+++ b/scripts/eval_runner.py
@@ -20,13 +20,43 @@
     python scripts/eval_runner.py --help
 """
 
+import subprocess
 import sys
 from pathlib import Path
+from types import ModuleType
 
 # Add the benchmarks submodule to sys.path so we can import from it
 _benchmarks_path = Path(__file__).parent.parent / "benchmarks"
+_project_root = Path(__file__).parent.parent
 sys.path.insert(0, str(_benchmarks_path.resolve()))
 
+
+def _get_sdk_sha_from_parent_repo() -> str:
+    """Return the parent repo's software-agent-sdk submodule SHA, or "unknown"."""
+    sdk_path = _project_root / "software-agent-sdk"
+    try:
+        result = subprocess.run(
+            ["git", "submodule", "status", str(sdk_path)],
+            capture_output=True,
+            text=True,
+            check=True,
+            cwd=str(_project_root),
+        )
+        # First field is "<flag><sha>"; git's flag is one of " ", "-", "+", "U".
+        return result.stdout.strip().split()[0].lstrip("+-U")
+    except Exception:
+        # Best-effort fallback: git missing/failed, or it produced no output.
+        return "unknown"
+
+
+# Pre-create the version module with our SDK SHA before benchmarks imports it
+_sdk_sha = _get_sdk_sha_from_parent_repo()
+_version_module = ModuleType("benchmarks.utils.version")
+_version_module.SDK_SHA = _sdk_sha
+_version_module.SDK_SHORT_SHA = _sdk_sha[:7] if _sdk_sha != "unknown" else "unknown"
+_version_module.PROJECT_ROOT = _benchmarks_path
+sys.modules["benchmarks.utils.version"] = _version_module
+
 from benchmarks.agentic_code_search.run_infer import main
 
 if __name__ == "__main__":