Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 68 additions & 0 deletions configs/simple.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
# Configuration for standalone FreeEnv + FreeAgent runs.
task_name: free-session
output_path: exps/free_env

llm:
name: frogboss

# Tools to load into the environment toolbox.
tools:
- bash
- submit:
eval_on_submit: False # Here we only terminate after submission, no auto-eval.

task_data:
env_type: FreeEnv
image: ubuntu:22.04
local_path: /home/macote/src/debug-gym/data/mini_nightmare/pandas_dataframe
workspace_dir: /testbed

terminal:
type: docker

agent:
type: simple_agent
max_steps: 20
system_prompt: |-
You are a helpful assistant that can interact with a computer to solve tasks.
<IMPORTANT>
* If user provides a path, you should NOT assume it's relative to the current working directory. Instead, you should explore the file system to find the file before working on it.
</IMPORTANT>

You have access to the following functions:

---- BEGIN FUNCTION #1:
bash ----
Description: Execute a bash command in the terminal.

Parameters:
(1) command (string, required): The bash command to execute. Can be empty to view additional logs when previous exit code is `-1`. Can be `ctrl+c` to interrupt the currently running process.
---- END FUNCTION #1 ----

---- BEGIN FUNCTION #2: submit ----
Description: Finish the interaction when the task is complete OR if the assistant cannot proceed further with the task.
No parameters are required for this function.
---- END FUNCTION #2 ----

If you choose to call a function ONLY reply in the following format with NO suffix:

Provide any reasoning for the function call here.
<function=example_function_name>
<parameter=example_parameter_1>value_1</parameter>
<parameter=example_parameter_2>
This is the value for the second parameter
that can span
multiple lines
</parameter>
</function>

<IMPORTANT>
Reminder:
- Function calls MUST follow the specified format, start with <function= and end with </function>
- Required parameters MUST be specified
- Only call one function at a time
- Always provide reasoning for your function call in natural language BEFORE the function call (not after)
</IMPORTANT>

instance_prompt: >-
Look at the codebase and check that everything is working properly.
2 changes: 2 additions & 0 deletions debug_gym/agents/__init__.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
from debug_gym.agents.base_agent import BaseAgent, register_agent
from debug_gym.agents.froggy_agent import FroggyAgent
from debug_gym.agents.simple_agent import SimpleAgent
from debug_gym.agents.solution_agent import AgentSolution

__all__ = [
"BaseAgent",
"register_agent",
"FroggyAgent",
"AgentSolution",
"SimpleAgent",
]
153 changes: 153 additions & 0 deletions debug_gym/agents/simple_agent.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
import re

from debug_gym.agents.base_agent import BaseAgent, register_agent
from debug_gym.gym.envs.env import EnvInfo, RepoEnv
from debug_gym.gym.tools.tool import ToolCall
from debug_gym.llms.base import LLM


@register_agent
class SimpleAgent(BaseAgent):
    """Agent that prompts the LLM as plain text (no native tool-calling API)
    and parses ``<function=...>`` blocks out of the raw response itself."""

    name: str = "simple_agent"

    def _parse_tool_call(self, response: str) -> list[ToolCall] | None:
        """Extract tool calls from an LLM response.

        Supports multiple tool calls in a single response. Expected format::

            <function=example_function_name>
            <parameter=example_parameter_1>value_1</parameter>
            <parameter=example_parameter_2>
            This is the value for the second parameter
            that can span
            multiple lines
            </parameter>
            </function>

        Returns:
            A non-empty list of ``ToolCall`` objects — a single
            ``unknown_function`` call when nothing matched — or ``None`` when
            parsing raised, in which case the run loop stops.
        """
        try:
            tool_calls = []

            # Extract all function blocks with their content.
            func_pattern = r"<function=([^>]+)>(.*?)</function>"
            for func_match in re.finditer(func_pattern, response, re.DOTALL):
                function_name = func_match.group(1)
                function_content = func_match.group(2)

                # Extract all parameters within this function block only, so
                # parameters stay scoped to their own <function=...> block.
                arguments = {}
                param_pattern = r"<parameter=([^>]+)>(.*?)</parameter>"
                for param_match in re.finditer(
                    param_pattern, function_content, re.DOTALL
                ):
                    param_name = param_match.group(1)
                    # rstrip drops the newline preceding </parameter> for
                    # multi-line values.
                    param_value = param_match.group(2).rstrip()
                    arguments[param_name] = param_value

                tool_calls.append(
                    ToolCall(
                        id="None",
                        name=function_name,
                        arguments=arguments,
                    )
                )

            # Fall back to a sentinel call when no tool calls were found, so
            # the environment can surface an "unknown function" observation
            # instead of the agent crashing.
            if not tool_calls:
                tool_calls.append(
                    ToolCall(id="None", name="unknown_function", arguments={})
                )

            return tool_calls
        except Exception as e:
            self.logger.warning(
                f"Failed to parse tool call from LLM response: {e!r}. "
                f"LLM response was: {response}. "
                "The agent will stop execution."
            )
            return None

    def run(self, env: RepoEnv, llm: LLM, debug: bool = False):
        """Drive ``env`` with ``llm`` until resolved, stopped, or parsing fails.

        Args:
            env: environment to reset and step.
            llm: callable LLM; invoked with messages and ``tools=None`` since
                tool calls are parsed manually from the text response.
            debug: when True, drops into ``breakpoint()`` before each step.

        Returns:
            The trajectory from ``_build_trajectory``.

        Raises:
            Re-raises any exception after reporting status ``"error"``.
        """
        self.env = env
        self.llm = llm
        info = None
        step = 0

        try:
            info = self.env.reset()
            self.history.init(
                self.build_system_prompt(info), self.build_instance_prompt(info), info
            )

            if info.resolved:
                # Task already solved at reset time; report and bail out.
                self.logger.report_progress(
                    problem_id=env.task_name,
                    step=0,
                    total_steps=self.args.max_steps,
                    score=info.score,
                    max_score=info.max_score,
                    status="resolved",
                )
                return self._build_trajectory()

            highscore = info.score
            should_stop = False
            step = 1

            while not should_stop:
                self.logger.info(f"\n{'='*20} STEP {step} {'='*20}\n")

                messages = self.build_prompt(info)
                # tools=None: the toolbox is described in the system prompt and
                # calls are embedded in plain text, parsed below.
                llm_response = self.llm(messages, tools=None)
                tool_calls = self._parse_tool_call(llm_response.response)

                if tool_calls is None:
                    # Parsing raised: _parse_tool_call already warned that the
                    # agent will stop, so honour that contract here instead of
                    # stepping the environment with no tool call.
                    self.logger.report_progress(
                        problem_id=env.task_name,
                        step=step,
                        total_steps=self.args.max_steps,
                        score=info.score,
                        max_score=info.max_score,
                        status="unresolved",
                    )
                    return self._build_trajectory()

                # Handle multiple tool calls - use the first one
                if len(tool_calls) > 1:
                    self.logger.info(
                        f"Multiple tool calls detected ({len(tool_calls)}), using the first one."
                    )

                # TODO: deal with multiple tool calls.
                llm_response.tool_call = tool_calls[0]

                if debug:
                    breakpoint()

                # NOTE(review): env.step is given llm_response.tool while the
                # parsed call was assigned to llm_response.tool_call above —
                # confirm LLMResponse aliases the two, otherwise the parsed
                # call is never forwarded to the environment.
                info = self.env.step(
                    llm_response.tool,
                    llm_response.response,
                    llm_response.reasoning_response,
                )
                self.history.step(info, llm_response)
                should_stop, reason = self.should_stop(step + 1, info)
                status = (
                    "resolved"
                    if info.resolved
                    else ("unresolved" if should_stop else "running")
                )

                highscore = max(highscore, info.score)
                msg = f"[{env.task_name[:10]:<10}] Step {step} | Score: {info.score}/{info.max_score or '-'} [Best: {highscore}]"
                if should_stop:
                    msg += f" | Stopping Reason: {reason}"
                self.logger.info(msg)
                step += 1

                # keep progress bar running until max_steps is reached
                self.logger.report_progress(
                    problem_id=env.task_name,
                    step=step,
                    total_steps=self.args.max_steps,
                    score=info.score,
                    max_score=info.max_score,
                    status=status,
                )
            return self._build_trajectory()
        except Exception as e:
            # report any error that happens during the run
            self.logger.report_progress(
                problem_id=env.task_name,
                step=step,
                total_steps=step,
                score=getattr(info, "score", 0),
                max_score=getattr(info, "max_score", None),
                status="error",
            )
            raise e
21 changes: 14 additions & 7 deletions debug_gym/llms/openai.py
Original file line number Diff line number Diff line change
Expand Up @@ -268,13 +268,20 @@ def generate(self, messages, tools, **kwargs) -> LLMResponse:
self.need_to_be_retried,
)
try:
response = api_call(
model=self.config.model,
messages=messages,
tools=self.define_tools(tools),
tool_choice="auto",
**kwargs,
)
if tools:
response = api_call(
model=self.config.model,
messages=messages,
tools=self.define_tools(tools),
tool_choice="auto",
**kwargs,
)
else:
response = api_call(
model=self.config.model,
messages=messages,
**kwargs,
)
except openai.BadRequestError as e:
# Handle specific error for context length exceeded, otherwise just propagate the error
if self.is_context_length_error(e):
Expand Down
95 changes: 95 additions & 0 deletions tests/agents/test_simple_agent.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
from unittest.mock import Mock

import pytest

from debug_gym.agents.base_agent import AgentArgs
from debug_gym.agents.simple_agent import SimpleAgent


@pytest.fixture
def agent():
    """A SimpleAgent with a 10-step budget and a mocked logger."""
    instance = SimpleAgent(agent_args=AgentArgs(max_steps=10))
    instance.logger = Mock()
    return instance


def test_parse_with_parameters(agent):
    """Main parsing path, including a parameter value spanning multiple lines."""
    llm_output = """
<function=test>
<parameter=x>1</parameter>
<parameter=code>
def hello():
pass
</parameter>
</function>
"""
    parsed = agent._parse_tool_call(llm_output)
    assert len(parsed) == 1
    first = parsed[0]
    assert first.name == "test"
    assert first.arguments["x"] == "1"
    assert "def hello():" in first.arguments["code"]


def test_parse_multiple_and_empty(agent):
    """Two function blocks; parameters stay scoped to their own block."""
    raw = (
        "<function=a><parameter=x>1</parameter></function><function=b></function>"
    )
    calls = agent._parse_tool_call(raw)
    assert len(calls) == 2
    assert calls[0].arguments == {"x": "1"}
    assert calls[1].arguments == {}


def test_parse_fallback_and_exception(agent):
    """No-match sentinel fallback, plus the exception path returning None."""
    # A response with no <function=...> block yields the sentinel call.
    fallback = agent._parse_tool_call("text")
    assert fallback[0].name == "unknown_function"

    # Passing None makes the regex scan raise; parser logs and returns None.
    assert agent._parse_tool_call(None) is None
    agent.logger.warning.assert_called_once()


def test_run_resolved_and_loop(agent):
    """Already-resolved short-circuit, then the main loop with two tool calls."""
    env = Mock(task_name="test")
    llm = Mock()

    # Case 1: the environment reports resolved right after reset.
    env.reset.return_value = Mock(resolved=True, score=100, max_score=100)
    agent.build_system_prompt = Mock(return_value="sys")
    agent.build_instance_prompt = Mock(return_value="inst")
    agent._build_trajectory = Mock(return_value="traj")

    assert agent.run(env, llm) == "traj"

    # Case 2: one loop iteration where the LLM emits two function blocks.
    env.reset.return_value = Mock(resolved=False, score=0, max_score=100)
    env.step.return_value = Mock(resolved=False, score=50, max_score=100)
    llm.return_value = Mock(
        response="<function=a></function><function=b></function>",
        tool=None,
        reasoning_response=None,
    )
    agent.build_prompt = Mock(return_value=[])
    agent.should_stop = Mock(return_value=(True, "done"))

    agent.run(env, llm)
    logged = [str(call) for call in agent.logger.info.call_args_list]
    assert any("Multiple tool calls detected" in entry for entry in logged)


def test_run_exception(agent):
    """A failure during reset is reported with status 'error' and re-raised."""
    broken_env = Mock(task_name="test")
    broken_env.reset.side_effect = Exception("error")

    with pytest.raises(Exception):
        agent.run(broken_env, Mock())

    assert agent.logger.report_progress.call_args[1]["status"] == "error"
Loading