From e0e9867d399969e5b0df6b30ef38daa6f179647f Mon Sep 17 00:00:00 2001 From: Bharath Sriraam R R Date: Fri, 23 Jan 2026 23:35:07 -0500 Subject: [PATCH 1/4] Add benchmarks evaluation integration via submodule - Add adityasoni9998/benchmarks as git submodule (agentic_code_search branch) - Add eval_runner.py that uses sys.path to import benchmarks at runtime - Add run_eval.sh wrapper script for running evaluations - Add minimal deps (jinja2, pandas, tqdm, lmnr) needed for benchmarks This allows running agentic_code_search evaluations using the benchmarks repo while keeping our existing SDK and training setup intact. --- .gitmodules | 4 ++++ benchmarks | 1 + pyproject.toml | 4 ++++ scripts/eval_runner.py | 33 +++++++++++++++++++++++++++++++++ scripts/run_eval.sh | 21 +++++++++++++++++++++ 5 files changed, 63 insertions(+) create mode 160000 benchmarks create mode 100755 scripts/eval_runner.py create mode 100755 scripts/run_eval.sh diff --git a/.gitmodules b/.gitmodules index ad55c20..27f5528 100644 --- a/.gitmodules +++ b/.gitmodules @@ -4,3 +4,7 @@ [submodule "prime-rl"] path = prime-rl url = https://github.com/PrimeIntellect-ai/prime-rl +[submodule "benchmarks"] + path = benchmarks + url = https://github.com/adityasoni9998/benchmarks.git + branch = agentic_code_search diff --git a/benchmarks b/benchmarks new file mode 160000 index 0000000..160f527 --- /dev/null +++ b/benchmarks @@ -0,0 +1 @@ +Subproject commit 160f5279769281d013ce6ccff429a7ca354c2d58 diff --git a/pyproject.toml b/pyproject.toml index cce1f28..e05a138 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,6 +25,10 @@ dependencies = [ "seaborn>=0.13.2", "gcsfs>=2025.3.0", "lmcache", + "jinja2", + "pandas", + "tqdm", + "lmnr>=0.7.24", # "flashinfer-python", # "flashinfer-jit-cache", ] diff --git a/scripts/eval_runner.py b/scripts/eval_runner.py new file mode 100755 index 0000000..f6a6efd --- /dev/null +++ b/scripts/eval_runner.py @@ -0,0 +1,33 @@ +#!/usr/bin/env python3 +""" +Evaluation runner for 
agentic code search benchmark. + +This script adds the benchmarks submodule to sys.path and runs the +agentic_code_search evaluation from the benchmarks package. + +Usage: + python scripts/eval_runner.py --dataset_file --llm-config-path [options] + +Example: + python scripts/eval_runner.py \ + --dataset_file ./data/test.jsonl \ + --llm-config-path ./configs/llm.json \ + --output-dir ./outputs \ + --max-iterations 25 \ + --num-workers 4 + +For all available options, run: + python scripts/eval_runner.py --help +""" + +import sys +from pathlib import Path + +# Add the benchmarks submodule to sys.path so we can import from it +_benchmarks_path = Path(__file__).parent.parent / "benchmarks" +sys.path.insert(0, str(_benchmarks_path.resolve())) + +from benchmarks.agentic_code_search.run_infer import main + +if __name__ == "__main__": + main() diff --git a/scripts/run_eval.sh b/scripts/run_eval.sh new file mode 100755 index 0000000..4a3b2d7 --- /dev/null +++ b/scripts/run_eval.sh @@ -0,0 +1,21 @@ +#!/bin/bash +# +# run_eval.sh - Wrapper script to run the evaluation runner with uv +# +# Usage: +# ./scripts/run_eval.sh [OPTIONS] +# +# Example usage: +# ./scripts/run_eval.sh \ +# --dataset_file benchmarks/gt_location.jsonl \ +# --llm-config-path configs/llm_config.json \ +# --max-iterations 10 \ +# --num-workers 1 \ +# --tools terminal +# +# Options are passed through to scripts/eval_runner.py +# Run with --help to see all available options: +# ./scripts/run_eval.sh --help +# + +uv run python scripts/eval_runner.py "$@" From c103eae8a22a814478cacd34731ce732ee2c31af Mon Sep 17 00:00:00 2001 From: Bharath Sriraam R R Date: Fri, 23 Jan 2026 23:55:12 -0500 Subject: [PATCH 2/4] Add evaluation integration documentation --- docs/EVAL_INTEGRATION.md | 181 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 181 insertions(+) create mode 100644 docs/EVAL_INTEGRATION.md diff --git a/docs/EVAL_INTEGRATION.md b/docs/EVAL_INTEGRATION.md new file mode 100644 index 0000000..4135218 --- 
/dev/null +++ b/docs/EVAL_INTEGRATION.md @@ -0,0 +1,181 @@ +# Evaluation Integration Documentation + +## Goal + +Integrate evaluation code from the [benchmarks repo](https://github.com/adityasoni9998/benchmarks/tree/agentic_code_search) into this repository to enable end-to-end training AND evaluation of code localization agents for the ICML submission. + +**Key requirements:** +- Run trained models on SWE-Bench Pro/Verified benchmarks +- Use the same `software-agent-sdk` for both training and evaluation +- No dependency conflicts with existing SkyRL training setup + +## Solution Approach + +### The Problem + +The benchmarks repo is designed as a standalone project with its own workspace pointing to `vendor/software-agent-sdk/`. Directly integrating it as a workspace member caused: + +1. **Nested workspace error** - uv doesn't support workspaces inside workspaces +2. **Dependency conflicts** - `commit0` requires `datasets==3.0.1`, we need `>=4.0.0` + +### The Solution: Runtime sys.path Manipulation + +Instead of making benchmarks a proper package in our workspace, we use Python's `sys.path` to import it at runtime: + +```python +import sys +sys.path.insert(0, "/path/to/benchmarks") + +# Now imports work - and they use OUR installed SDK +from benchmarks.agentic_code_search.run_infer import main +``` + +**Why this works:** +- When benchmarks code imports `openhands.sdk`, Python searches `sys.path` +- Our SDK packages are already installed via uv workspace +- Python finds our SDK first, not benchmarks' vendor/ (which doesn't exist anyway) + +## Files Added/Modified + +| File | Description | +|------|-------------| +| `benchmarks/` | Git submodule pointing to adityasoni9998/benchmarks@agentic_code_search | +| `.gitmodules` | Submodule configuration | +| `pyproject.toml` | Added jinja2, pandas, tqdm, lmnr dependencies | +| `scripts/eval_runner.py` | Python wrapper that sets up sys.path and runs eval | +| `scripts/run_eval.sh` | Shell wrapper for `uv run` | + +## 
Architecture + +``` +agentic-code-search-oss/ +├── software-agent-sdk/ # Our SDK (used for training AND eval) +│ ├── openhands-sdk/ +│ ├── openhands-tools/ +│ └── ... +├── benchmarks/ # Submodule (NOT in workspace) +│ └── benchmarks/ +│ └── agentic_code_search/ +│ ├── run_infer.py # Main eval script +│ ├── eval_infer.py # Results aggregator +│ └── prompts/ # Jinja2 templates +├── scripts/ +│ ├── eval_runner.py # sys.path wrapper +│ └── run_eval.sh # Shell wrapper +└── src/ # Training code (unchanged) +``` + +## How Evaluation Works + +``` +┌─────────────────┐ +│ run_eval.sh │ +└────────┬────────┘ + │ uv run + ▼ +┌─────────────────┐ +│ eval_runner.py │ +│ │ +│ sys.path.insert │ +│ (benchmarks/) │ +└────────┬────────┘ + │ import + ▼ +┌─────────────────────────────────┐ +│ benchmarks.agentic_code_search │ +│ │ +│ from openhands.sdk import ... │──► Uses OUR SDK +└─────────────────────────────────┘ +``` + +## Learnings + +1. **uv workspaces don't nest** - Can't add a package with its own workspace as a member +2. **sys.path manipulation is clean** - Keeps submodule pristine, easy to update +3. **Python import resolution** - First match in sys.path wins, so our installed SDK is used +4. **Dependency isolation** - We only add deps we actually need, avoiding conflicts + +## What to Test + +### On Linux with CUDA (training machine) + +1. **Sync dependencies:** + ```bash + uv sync + ``` + +2. **Test import works:** + ```bash + uv run python -c " + import sys + sys.path.insert(0, 'benchmarks') + from benchmarks.agentic_code_search.run_infer import main + print('Import successful!') + " + ``` + +3. 
**Run a minimal evaluation:** + ```bash + # Create LLM config file first + cat > configs/llm_config.json << 'EOF' + { + "model": "openai/gpt-4o-mini", + "api_key": "your-api-key", + "base_url": "https://api.openai.com/v1", + "temperature": 0.0 + } + EOF + + # Run eval on 1 instance + ./scripts/run_eval.sh \ + --dataset_file benchmarks/gt_location.jsonl \ + --llm-config-path configs/llm_config.json \ + --max-iterations 10 \ + --num-workers 1 \ + --tools terminal \ + --n-limit 1 + ``` + +4. **Verify training still works:** + ```bash + # Your existing training command should work unchanged + bash scripts/run_async_training.sh -m Qwen/Qwen3-4B -d $DATA_PATH + ``` + +### Expected Output Format + +The evaluation produces JSONL output with F1 scores for: +- **File-level**: Did the agent find the correct files? +- **Module-level**: Did it find the correct classes? +- **Entity-level**: Did it find the correct functions/methods? + +Example output: +```json +{ + "instance_id": "astropy__astropy-12907", + "test_result": { + "reward": { + "file_reward": 1.0, + "module_reward": 0.8, + "entity_reward": 0.6 + }, + "raw_prediction": "astropy/modeling/separable.py\nfunction: _cstack", + "wall_time_seconds": 45.2, + "num_steps": 5, + "num_tool_calls": 12 + } +} +``` + +## Next Steps + +1. **Test on training machine** - Verify uv sync works with CUDA deps +2. **Prepare SWE-Bench Pro/Verified datasets** - May need to download separately +3. **Run base model evals** - Establish baseline before training +4. 
**Integrate with training loop** - Optional: run evals at checkpoints + +## References + +- [Benchmarks repo](https://github.com/adityasoni9998/benchmarks/tree/agentic_code_search) +- [Original Slack conversation](#) - Aditya's integration instructions +- [SWE-Bench](https://www.swebench.com/) - Benchmark website From b5c37dd255a418dfccd0bc3b1c02c4d311be054e Mon Sep 17 00:00:00 2001 From: Bharath Sriraam R R Date: Fri, 23 Jan 2026 23:57:54 -0500 Subject: [PATCH 3/4] Fix markdown formatting in eval integration docs --- docs/EVAL_INTEGRATION.md | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/docs/EVAL_INTEGRATION.md b/docs/EVAL_INTEGRATION.md index 4135218..5e05ab4 100644 --- a/docs/EVAL_INTEGRATION.md +++ b/docs/EVAL_INTEGRATION.md @@ -2,9 +2,10 @@ ## Goal -Integrate evaluation code from the [benchmarks repo](https://github.com/adityasoni9998/benchmarks/tree/agentic_code_search) into this repository to enable end-to-end training AND evaluation of code localization agents for the ICML submission. +Integrate evaluation code from the [benchmarks repo](https://github.com/adityasoni9998/benchmarks/tree/agentic_code_search) into this repository to enable end-to-end training AND evaluation of code localization agents. 
**Key requirements:** + - Run trained models on SWE-Bench Pro/Verified benchmarks - Use the same `software-agent-sdk` for both training and evaluation - No dependency conflicts with existing SkyRL training setup @@ -31,19 +32,20 @@ from benchmarks.agentic_code_search.run_infer import main ``` **Why this works:** + - When benchmarks code imports `openhands.sdk`, Python searches `sys.path` - Our SDK packages are already installed via uv workspace - Python finds our SDK first, not benchmarks' vendor/ (which doesn't exist anyway) ## Files Added/Modified -| File | Description | -|------|-------------| -| `benchmarks/` | Git submodule pointing to adityasoni9998/benchmarks@agentic_code_search | -| `.gitmodules` | Submodule configuration | -| `pyproject.toml` | Added jinja2, pandas, tqdm, lmnr dependencies | -| `scripts/eval_runner.py` | Python wrapper that sets up sys.path and runs eval | -| `scripts/run_eval.sh` | Shell wrapper for `uv run` | +| File | Description | +| ------------------------ | ----------------------------------------------------------------------- | +| `benchmarks/` | Git submodule pointing to adityasoni9998/benchmarks@agentic_code_search | +| `.gitmodules` | Submodule configuration | +| `pyproject.toml` | Added jinja2, pandas, tqdm, lmnr dependencies | +| `scripts/eval_runner.py` | Python wrapper that sets up sys.path and runs eval | +| `scripts/run_eval.sh` | Shell wrapper for `uv run` | ## Architecture @@ -100,11 +102,13 @@ agentic-code-search-oss/ ### On Linux with CUDA (training machine) 1. **Sync dependencies:** + ```bash uv sync ``` 2. **Test import works:** + ```bash uv run python -c " import sys @@ -115,6 +119,7 @@ agentic-code-search-oss/ ``` 3. **Run a minimal evaluation:** + ```bash # Create LLM config file first cat > configs/llm_config.json << 'EOF' @@ -145,11 +150,13 @@ agentic-code-search-oss/ ### Expected Output Format The evaluation produces JSONL output with F1 scores for: + - **File-level**: Did the agent find the correct files? 
- **Module-level**: Did it find the correct classes? - **Entity-level**: Did it find the correct functions/methods? Example output: + ```json { "instance_id": "astropy__astropy-12907", From 3dba6a50811a1bda6af6e1ad494b50a8528ea2ae Mon Sep 17 00:00:00 2001 From: Bharath Sriraam R R Date: Sat, 24 Jan 2026 05:25:22 +0000 Subject: [PATCH 4/4] Fix eval integration and update documentation - Fix version module patching in eval_runner.py to use parent repo's SDK SHA instead of benchmarks/vendor/ which doesn't exist in our setup - Restructure EVAL_INTEGRATION.md with practical quick start guide - Add example LLM config file for vLLM setup - Add troubleshooting section for common issues (litellm provider prefix, vLLM tool calling flags, stale output) --- configs/eval_llm_config_example.json | 6 + docs/EVAL_INTEGRATION.md | 223 ++++++++++++++++----------- scripts/eval_runner.py | 30 ++++ 3 files changed, 173 insertions(+), 86 deletions(-) create mode 100644 configs/eval_llm_config_example.json diff --git a/configs/eval_llm_config_example.json b/configs/eval_llm_config_example.json new file mode 100644 index 0000000..735e911 --- /dev/null +++ b/configs/eval_llm_config_example.json @@ -0,0 +1,6 @@ +{ + "model": "openai/Qwen/Qwen3-4B", + "api_key": "dummy", + "base_url": "http://localhost:8000/v1", + "temperature": 0.0 +} diff --git a/docs/EVAL_INTEGRATION.md b/docs/EVAL_INTEGRATION.md index 5e05ab4..6734639 100644 --- a/docs/EVAL_INTEGRATION.md +++ b/docs/EVAL_INTEGRATION.md @@ -1,6 +1,98 @@ # Evaluation Integration Documentation -## Goal +This document explains how to run evaluations for code localization agents using the integrated benchmarks system. + +## Quick Start + +### 1. Start a Local Model with vLLM + +Start vLLM with tool calling enabled: + +```bash +# For a small model (quick testing) +uv run vllm serve Qwen/Qwen3-4B \ + --port 8000 \ + --max-model-len 32768 \ + --enable-auto-tool-choice \ + --tool-call-parser hermes +``` +### 2.
Create LLM Config + +```bash +mkdir -p configs +cat > configs/llm_config.json << 'EOF' +{ + "model": "openai/Qwen/Qwen3-4B", + "api_key": "dummy", + "base_url": "http://localhost:8000/v1", + "temperature": 0.0 +} +EOF +``` + +**Important:** The model name must be prefixed with `openai/` to tell litellm it's an OpenAI-compatible endpoint. + +### 3. Run Evaluation + +```bash +./scripts/run_eval.sh \ + --dataset_file benchmarks/gt_location.jsonl \ + --llm-config-path configs/llm_config.json \ + --system_prompt_file benchmarks/benchmarks/agentic_code_search/prompts/system_prompt.j2 \ + --user_prompt_file benchmarks/benchmarks/agentic_code_search/prompts/file_module_short.j2 \ + --tools terminal \ + --max-iterations 10 \ + --num-workers 1 \ + --output-dir ./agentic_code_search_outputs \ + --n-limit 1 \ + --workspace_base_dir /tmp/testbed/ +``` + +**Key options:** +- `--n-limit 1` - Run on 1 instance (remove for full dataset) +- `--num-workers 1` - Parallel workers (increase for faster eval) +- `--max-iterations 10` - Max agent steps per instance + +### 4. Check Results + +```bash +# View full output +cat ./agentic_code_search_outputs/agentic_code_search_gt_location/openai/Qwen/Qwen3-4B_sdk_*/output.jsonl | jq . 
+ +# View just the reward scores +cat ./agentic_code_search_outputs/agentic_code_search_gt_location/openai/Qwen/Qwen3-4B_sdk_*/output.jsonl | jq '.test_result.reward' +``` + +### Example Output + +```json +{ + "file_reward": 0.5, + "module_reward": 0.5, + "entity_reward": 0.4, + "prediction": { + "files": ["sklearn/calibration.py", "sklearn/_config.py", "sklearn/isotonic.py"], + "modules": ["sklearn/calibration.py:_CalibratedClassifier", "sklearn/_config.py:set_config", "sklearn/isotonic.py:IsotonicRegression"], + "entities": ["sklearn/isotonic.py:IsotonicRegression.predict", "sklearn/_config.py:set_config", "sklearn/calibration.py:_CalibratedClassifier.predict_proba"] + }, + "ground_truth": { + "files": ["sklearn/isotonic.py"], + "modules": ["sklearn/isotonic.py:IsotonicRegression"], + "entities": ["sklearn/isotonic.py:IsotonicRegression.predict", "sklearn/isotonic.py:IsotonicRegression.transform"] + } +} +``` + +**Metrics explained:** +- **file_reward** - F1 score for file-level localization +- **module_reward** - F1 score for class-level localization +- **entity_reward** - F1 score for function/method-level localization + +--- + +## Implementation Details + +### Goal Integrate evaluation code from the [benchmarks repo](https://github.com/adityasoni9998/benchmarks/tree/agentic_code_search) into this repository to enable end-to-end training AND evaluation of code localization agents. @@ -10,8 +102,6 @@ Integrate evaluation code from the [benchmarks repo](https://github.com/adityaso - Use the same `software-agent-sdk` for both training and evaluation - No dependency conflicts with existing SkyRL training setup -## Solution Approach - ### The Problem The benchmarks repo is designed as a standalone project with its own workspace pointing to `vendor/software-agent-sdk/`. 
Directly integrating it as a workspace member caused: @@ -37,7 +127,20 @@ from benchmarks.agentic_code_search.run_infer import main - Our SDK packages are already installed via uv workspace - Python finds our SDK first, not benchmarks' vendor/ (which doesn't exist anyway) -## Files Added/Modified +### Version Module Patching + +The benchmarks code has a `version.py` that tries to get the SDK SHA from `vendor/software-agent-sdk` (which doesn't exist in our setup). The `eval_runner.py` script pre-creates this module with the SHA from our repo's SDK: + +```python +# Pre-create the version module with our SDK SHA before benchmarks imports it +_sdk_sha = _get_sdk_sha_from_parent_repo() +_version_module = ModuleType("benchmarks.utils.version") +_version_module.SDK_SHA = _sdk_sha +_version_module.SDK_SHORT_SHA = _sdk_sha[:7] +sys.modules["benchmarks.utils.version"] = _version_module +``` + +### Files Added/Modified | File | Description | | ------------------------ | ----------------------------------------------------------------------- | @@ -47,7 +150,7 @@ from benchmarks.agentic_code_search.run_infer import main | `scripts/eval_runner.py` | Python wrapper that sets up sys.path and runs eval | | `scripts/run_eval.sh` | Shell wrapper for `uv run` | -## Architecture +### Architecture ``` agentic-code-search-oss/ @@ -67,7 +170,7 @@ agentic-code-search-oss/ └── src/ # Training code (unchanged) ``` -## How Evaluation Works +### How Evaluation Works ``` ┌─────────────────┐ @@ -90,99 +193,47 @@ agentic-code-search-oss/ └─────────────────────────────────┘ ``` -## Learnings +### Learnings 1. **uv workspaces don't nest** - Can't add a package with its own workspace as a member 2. **sys.path manipulation is clean** - Keeps submodule pristine, easy to update 3. **Python import resolution** - First match in sys.path wins, so our installed SDK is used 4. **Dependency isolation** - We only add deps we actually need, avoiding conflicts +5. 
**Version module patching** - Pre-create the version module to use our repo's SDK SHA +6. **litellm provider prefix** - Local vLLM endpoints need `openai/` prefix in model name +7. **vLLM tool calling** - Requires `--enable-auto-tool-choice --tool-call-parser hermes` flags -## What to Test - -### On Linux with CUDA (training machine) - -1. **Sync dependencies:** - - ```bash - uv sync - ``` - -2. **Test import works:** - - ```bash - uv run python -c " - import sys - sys.path.insert(0, 'benchmarks') - from benchmarks.agentic_code_search.run_infer import main - print('Import successful!') - " - ``` +--- -3. **Run a minimal evaluation:** +## Troubleshooting - ```bash - # Create LLM config file first - cat > configs/llm_config.json << 'EOF' - { - "model": "openai/gpt-4o-mini", - "api_key": "your-api-key", - "base_url": "https://api.openai.com/v1", - "temperature": 0.0 - } - EOF +### "LLM Provider NOT provided" - # Run eval on 1 instance - ./scripts/run_eval.sh \ - --dataset_file benchmarks/gt_location.jsonl \ - --llm-config-path configs/llm_config.json \ - --max-iterations 10 \ - --num-workers 1 \ - --tools terminal \ - --n-limit 1 - ``` - -4. **Verify training still works:** - ```bash - # Your existing training command should work unchanged - bash scripts/run_async_training.sh -m Qwen/Qwen3-4B -d $DATA_PATH - ``` - -### Expected Output Format - -The evaluation produces JSONL output with F1 scores for: - -- **File-level**: Did the agent find the correct files? -- **Module-level**: Did it find the correct classes? -- **Entity-level**: Did it find the correct functions/methods? 
+Add `openai/` prefix to your model name in `llm_config.json`: +```json +{"model": "openai/Qwen/Qwen3-4B", ...} +``` -Example output: +### "auto tool choice requires --enable-auto-tool-choice" -```json -{ - "instance_id": "astropy__astropy-12907", - "test_result": { - "reward": { - "file_reward": 1.0, - "module_reward": 0.8, - "entity_reward": 0.6 - }, - "raw_prediction": "astropy/modeling/separable.py\nfunction: _cstack", - "wall_time_seconds": 45.2, - "num_steps": 5, - "num_tool_calls": 12 - } -} +Restart vLLM with tool calling flags: +```bash +uv run vllm serve Qwen/Qwen3-4B \ + --port 8000 \ + --enable-auto-tool-choice \ + --tool-call-parser hermes ``` -## Next Steps +### "Processing 0 instances" -1. **Test on training machine** - Verify uv sync works with CUDA deps -2. **Prepare SWE-Bench Pro/Verified datasets** - May need to download separately -3. **Run base model evals** - Establish baseline before training -4. **Integrate with training loop** - Optional: run evals at checkpoints +Previous failed runs left stale output. 
Delete the output directory: +```bash +rm -rf ./agentic_code_search_outputs/ +``` -## References +### Import errors from benchmarks -- [Benchmarks repo](https://github.com/adityasoni9998/benchmarks/tree/agentic_code_search) -- [Original Slack conversation](#) - Aditya's integration instructions -- [SWE-Bench](https://www.swebench.com/) - Benchmark website +Ensure the submodule is initialized: +```bash +git submodule update --init --recursive +``` diff --git a/scripts/eval_runner.py b/scripts/eval_runner.py index f6a6efd..156d290 100755 --- a/scripts/eval_runner.py +++ b/scripts/eval_runner.py @@ -20,13 +20,43 @@ python scripts/eval_runner.py --help """ +import subprocess import sys from pathlib import Path +from types import ModuleType # Add the benchmarks submodule to sys.path so we can import from it _benchmarks_path = Path(__file__).parent.parent / "benchmarks" +_project_root = Path(__file__).parent.parent sys.path.insert(0, str(_benchmarks_path.resolve())) + + +def _get_sdk_sha_from_parent_repo() -> str: + """Return the software-agent-sdk submodule SHA from the parent repo, or "unknown".""" + sdk_path = _project_root / "software-agent-sdk" + try: + result = subprocess.run( + ["git", "submodule", "status", str(sdk_path)], + capture_output=True, + text=True, + check=True, + cwd=str(_project_root), + ) + sha = result.stdout.strip().split()[0].lstrip("+-U") # drop git's status prefix + return sha + except (subprocess.CalledProcessError, OSError, IndexError): + # Best effort: git missing, command failed, or no status line was printed + return "unknown" + + +# Pre-create the version module with our SDK SHA before benchmarks imports it +_sdk_sha = _get_sdk_sha_from_parent_repo() +_version_module = ModuleType("benchmarks.utils.version") +_version_module.SDK_SHA = _sdk_sha +_version_module.SDK_SHORT_SHA = _sdk_sha[:7] if _sdk_sha != "unknown" else "unknown" +_version_module.PROJECT_ROOT = _benchmarks_path +sys.modules["benchmarks.utils.version"] = _version_module + from benchmarks.agentic_code_search.run_infer import main if __name__ == "__main__":