Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 68 additions & 0 deletions configs/simple.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
# Configuration for standalone FreeEnv + FreeAgent runs.
task_name: free-session
output_path: exps/free_env

llm:
name: frogboss

# Tools to load into the environment toolbox.
tools:
- bash
- submit:
eval_on_submit: False # Here we only terminate after submission, no auto-eval.

task_data:
env_type: FreeEnv
image: ubuntu:22.04
local_path: /home/macote/src/debug-gym/data/mini_nightmare/pandas_dataframe
workspace_dir: /testbed

terminal:
type: docker

agent:
type: simple_agent
max_steps: 20
system_prompt: |-
You are a helpful assistant that can interact with a computer to solve tasks.
<IMPORTANT>
* If user provides a path, you should NOT assume it's relative to the current working directory. Instead, you should explore the file system to find the file before working on it.
</IMPORTANT>

You have access to the following functions:

---- BEGIN FUNCTION #1:
bash ----
Description: Execute a bash command in the terminal.

Parameters:
(1) command (string, required): The bash command to execute. Can be empty to view additional logs when previous exit code is `-1`. Can be `ctrl+c` to interrupt the currently running process.
---- END FUNCTION #1 ----

---- BEGIN FUNCTION #2: submit ----
Description: Finish the interaction when the task is complete OR if the assistant cannot proceed further with the task.
No parameters are required for this function.
---- END FUNCTION #2 ----

If you choose to call a function ONLY reply in the following format with NO suffix:

Provide any reasoning for the function call here.
<function=example_function_name>
<parameter=example_parameter_1>value_1</parameter>
<parameter=example_parameter_2>
This is the value for the second parameter
that can span
multiple lines
</parameter>
</function>

<IMPORTANT>
Reminder:
- Function calls MUST follow the specified format, start with <function= and end with </function>
- Required parameters MUST be specified
- Only call one function at a time
- Always provide reasoning for your function call in natural language BEFORE the function call (not after)
</IMPORTANT>

instance_prompt: >-
Look at the codebase and check that everything is working properly.
2 changes: 2 additions & 0 deletions debug_gym/agents/__init__.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
from debug_gym.agents.base_agent import BaseAgent, register_agent
from debug_gym.agents.froggy_agent import FroggyAgent
from debug_gym.agents.simple_agent import SimpleAgent
from debug_gym.agents.solution_agent import AgentSolution

__all__ = [
"BaseAgent",
"register_agent",
"FroggyAgent",
"AgentSolution",
"SimpleAgent",
]
153 changes: 153 additions & 0 deletions debug_gym/agents/simple_agent.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
import re

from debug_gym.agents.base_agent import BaseAgent, register_agent
from debug_gym.gym.envs.env import EnvInfo, RepoEnv
from debug_gym.gym.tools.tool import ToolCall
from debug_gym.llms.base import LLM


@register_agent
class SimpleAgent(BaseAgent):
    """Agent that prompts the LLM as plain text (no native tool-calling API)
    and parses ``<function=...>`` blocks out of the raw response itself."""

    name: str = "simple_agent"

    def _parse_tool_call(self, response: str) -> list[ToolCall] | None:
        """Extract tool calls from an LLM response.

        Supports multiple tool calls in a single response. Expected format::

            <function=example_function_name>
            <parameter=example_parameter_1>value_1</parameter>
            <parameter=example_parameter_2>
            This is the value for the second parameter
            that can span
            multiple lines
            </parameter>
            </function>

        Returns:
            A non-empty list of ``ToolCall`` objects — a single
            ``unknown_function`` call when nothing matched — or ``None`` when
            parsing raised, in which case the run loop stops.
        """
        try:
            tool_calls = []

            # Extract all function blocks with their content.
            func_pattern = r"<function=([^>]+)>(.*?)</function>"
            for func_match in re.finditer(func_pattern, response, re.DOTALL):
                function_name = func_match.group(1)
                function_content = func_match.group(2)

                # Extract all parameters within this function block only, so
                # parameters stay scoped to their own <function=...> block.
                arguments = {}
                param_pattern = r"<parameter=([^>]+)>(.*?)</parameter>"
                for param_match in re.finditer(
                    param_pattern, function_content, re.DOTALL
                ):
                    param_name = param_match.group(1)
                    # rstrip drops the newline preceding </parameter> for
                    # multi-line values.
                    param_value = param_match.group(2).rstrip()
                    arguments[param_name] = param_value

                tool_calls.append(
                    ToolCall(
                        id="None",
                        name=function_name,
                        arguments=arguments,
                    )
                )

            # Fall back to a sentinel call when no tool calls were found, so
            # the environment can surface an "unknown function" observation
            # instead of the agent crashing.
            if not tool_calls:
                tool_calls.append(
                    ToolCall(id="None", name="unknown_function", arguments={})
                )

            return tool_calls
        except Exception as e:
            self.logger.warning(
                f"Failed to parse tool call from LLM response: {e!r}. "
                f"LLM response was: {response}. "
                "The agent will stop execution."
            )
            return None

    def run(self, env: RepoEnv, llm: LLM, debug: bool = False):
        """Drive ``env`` with ``llm`` until resolved, stopped, or parsing fails.

        Args:
            env: environment to reset and step.
            llm: callable LLM; invoked with messages and ``tools=None`` since
                tool calls are parsed manually from the text response.
            debug: when True, drops into ``breakpoint()`` before each step.

        Returns:
            The trajectory from ``_build_trajectory``.

        Raises:
            Re-raises any exception after reporting status ``"error"``.
        """
        self.env = env
        self.llm = llm
        info = None
        step = 0

        try:
            info = self.env.reset()
            self.history.init(
                self.build_system_prompt(info), self.build_instance_prompt(info), info
            )

            if info.resolved:
                # Task already solved at reset time; report and bail out.
                self.logger.report_progress(
                    problem_id=env.task_name,
                    step=0,
                    total_steps=self.args.max_steps,
                    score=info.score,
                    max_score=info.max_score,
                    status="resolved",
                )
                return self._build_trajectory()

            highscore = info.score
            should_stop = False
            step = 1

            while not should_stop:
                self.logger.info(f"\n{'='*20} STEP {step} {'='*20}\n")

                messages = self.build_prompt(info)
                # tools=None: the toolbox is described in the system prompt and
                # calls are embedded in plain text, parsed below.
                llm_response = self.llm(messages, tools=None)
                tool_calls = self._parse_tool_call(llm_response.response)

                if tool_calls is None:
                    # Parsing raised: _parse_tool_call already warned that the
                    # agent will stop, so honour that contract here instead of
                    # stepping the environment with no tool call.
                    self.logger.report_progress(
                        problem_id=env.task_name,
                        step=step,
                        total_steps=self.args.max_steps,
                        score=info.score,
                        max_score=info.max_score,
                        status="unresolved",
                    )
                    return self._build_trajectory()

                # Handle multiple tool calls - use the first one
                if len(tool_calls) > 1:
                    self.logger.info(
                        f"Multiple tool calls detected ({len(tool_calls)}), using the first one."
                    )

                # TODO: deal with multiple tool calls.
                llm_response.tool_call = tool_calls[0]

                if debug:
                    breakpoint()

                # NOTE(review): env.step is given llm_response.tool while the
                # parsed call was assigned to llm_response.tool_call above —
                # confirm LLMResponse aliases the two, otherwise the parsed
                # call is never forwarded to the environment.
                info = self.env.step(
                    llm_response.tool,
                    llm_response.response,
                    llm_response.reasoning_response,
                )
                self.history.step(info, llm_response)
                should_stop, reason = self.should_stop(step + 1, info)
                status = (
                    "resolved"
                    if info.resolved
                    else ("unresolved" if should_stop else "running")
                )

                highscore = max(highscore, info.score)
                msg = f"[{env.task_name[:10]:<10}] Step {step} | Score: {info.score}/{info.max_score or '-'} [Best: {highscore}]"
                if should_stop:
                    msg += f" | Stopping Reason: {reason}"
                self.logger.info(msg)
                step += 1

                # keep progress bar running until max_steps is reached
                self.logger.report_progress(
                    problem_id=env.task_name,
                    step=step,
                    total_steps=self.args.max_steps,
                    score=info.score,
                    max_score=info.max_score,
                    status=status,
                )
            return self._build_trajectory()
        except Exception as e:
            # report any error that happens during the run
            self.logger.report_progress(
                problem_id=env.task_name,
                step=step,
                total_steps=step,
                score=getattr(info, "score", 0),
                max_score=getattr(info, "max_score", None),
                status="error",
            )
            raise e
21 changes: 14 additions & 7 deletions debug_gym/llms/openai.py
Original file line number Diff line number Diff line change
Expand Up @@ -268,13 +268,20 @@ def generate(self, messages, tools, **kwargs) -> LLMResponse:
self.need_to_be_retried,
)
try:
response = api_call(
model=self.config.model,
messages=messages,
tools=self.define_tools(tools),
tool_choice="auto",
**kwargs,
)
if tools:
response = api_call(
model=self.config.model,
messages=messages,
tools=self.define_tools(tools),
tool_choice="auto",
**kwargs,
)
else:
response = api_call(
model=self.config.model,
messages=messages,
**kwargs,
)
except openai.BadRequestError as e:
# Handle specific error for context length exceeded, otherwise just propagate the error
if self.is_context_length_error(e):
Expand Down
95 changes: 95 additions & 0 deletions tests/agents/test_simple_agent.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
from unittest.mock import Mock

import pytest

from debug_gym.agents.base_agent import AgentArgs
from debug_gym.agents.simple_agent import SimpleAgent


@pytest.fixture
def agent():
    """A SimpleAgent with a 10-step budget and a mocked logger."""
    instance = SimpleAgent(agent_args=AgentArgs(max_steps=10))
    instance.logger = Mock()
    return instance


def test_parse_with_parameters(agent):
    """Main parsing path, including a parameter value spanning multiple lines."""
    llm_output = """
<function=test>
<parameter=x>1</parameter>
<parameter=code>
def hello():
pass
</parameter>
</function>
"""
    parsed = agent._parse_tool_call(llm_output)
    assert len(parsed) == 1
    first = parsed[0]
    assert first.name == "test"
    assert first.arguments["x"] == "1"
    assert "def hello():" in first.arguments["code"]


def test_parse_multiple_and_empty(agent):
    """Two function blocks; parameters stay scoped to their own block."""
    raw = (
        "<function=a><parameter=x>1</parameter></function><function=b></function>"
    )
    calls = agent._parse_tool_call(raw)
    assert len(calls) == 2
    assert calls[0].arguments == {"x": "1"}
    assert calls[1].arguments == {}


def test_parse_fallback_and_exception(agent):
    """No-match sentinel fallback, plus the exception path returning None."""
    # A response with no <function=...> block yields the sentinel call.
    fallback = agent._parse_tool_call("text")
    assert fallback[0].name == "unknown_function"

    # Passing None makes the regex scan raise; parser logs and returns None.
    assert agent._parse_tool_call(None) is None
    agent.logger.warning.assert_called_once()


def test_run_resolved_and_loop(agent):
    """Already-resolved short-circuit, then the main loop with two tool calls."""
    env = Mock(task_name="test")
    llm = Mock()

    # Case 1: the environment reports resolved right after reset.
    env.reset.return_value = Mock(resolved=True, score=100, max_score=100)
    agent.build_system_prompt = Mock(return_value="sys")
    agent.build_instance_prompt = Mock(return_value="inst")
    agent._build_trajectory = Mock(return_value="traj")

    assert agent.run(env, llm) == "traj"

    # Case 2: one loop iteration where the LLM emits two function blocks.
    env.reset.return_value = Mock(resolved=False, score=0, max_score=100)
    env.step.return_value = Mock(resolved=False, score=50, max_score=100)
    llm.return_value = Mock(
        response="<function=a></function><function=b></function>",
        tool=None,
        reasoning_response=None,
    )
    agent.build_prompt = Mock(return_value=[])
    agent.should_stop = Mock(return_value=(True, "done"))

    agent.run(env, llm)
    logged = [str(call) for call in agent.logger.info.call_args_list]
    assert any("Multiple tool calls detected" in entry for entry in logged)


def test_run_exception(agent):
    """A failure during reset is reported with status 'error' and re-raised."""
    broken_env = Mock(task_name="test")
    broken_env.reset.side_effect = Exception("error")

    with pytest.raises(Exception):
        agent.run(broken_env, Mock())

    assert agent.logger.report_progress.call_args[1]["status"] == "error"
Loading