Changes from all commits (58 commits):
- 4e30e04: terminate the episode when encountering non-recoverable errors. (xingdi-eric-yuan, Dec 10, 2025)
- d0215f3: Update docker.py (xingdi-eric-yuan, Dec 10, 2025)
- b043f97: test cases (xingdi-eric-yuan, Dec 10, 2025)
- 1d48dc4: Fix edge cases in unrecoverable terminal error handling (xingdi-eric-yuan, Dec 11, 2025)
- b886678: Update run.py (xingdi-eric-yuan, Dec 11, 2025)
- 54f6d30: Update experiment.py (xingdi-eric-yuan, Dec 11, 2025)
- 7f75f2f: Update experiment.py (xingdi-eric-yuan, Dec 11, 2025)
- 70cc39e: Update experiment.py (xingdi-eric-yuan, Dec 11, 2025)
- d63e32a: Update experiment.py (xingdi-eric-yuan, Dec 11, 2025)
- 0fa81e9: Update test_experiment.py (xingdi-eric-yuan, Dec 11, 2025)
- 6aacd22: Update run.py (xingdi-eric-yuan, Dec 11, 2025)
- 841024a: Update test_experiment.py (xingdi-eric-yuan, Dec 11, 2025)
- 8bfa68d: more tests (xingdi-eric-yuan, Dec 11, 2025)
- 8cf0cab: Update mini_nightmare.py (xingdi-eric-yuan, Dec 11, 2025)
- 586a991: multiple processes share the same temp file path (xingdi-eric-yuan, Dec 11, 2025)
- c8da26b: Update run.py (xingdi-eric-yuan, Dec 11, 2025)
- 4e20ae0: Fix pickle error when logging exceptions in worker processes (xingdi-eric-yuan, Dec 11, 2025)
- 863a294: Merge branch 'main' into error_handling (xingdi-eric-yuan, Dec 11, 2025)
- 974b237: Update test_experiment.py (xingdi-eric-yuan, Dec 11, 2025)
- 4d12f88: Update env.py (xingdi-eric-yuan, Dec 12, 2025)
- 1dee2bf: minor (xingdi-eric-yuan, Dec 12, 2025)
- 94f3043: remove listdir (xingdi-eric-yuan, Dec 12, 2025)
- 6809736: add test (xingdi-eric-yuan, Dec 12, 2025)
- 021b240: Update test_env.py (xingdi-eric-yuan, Dec 12, 2025)
- 139959d: Update run.py (xingdi-eric-yuan, Dec 12, 2025)
- 8eedeb6: update readme (xingdi-eric-yuan, Dec 12, 2025)
- 1e855b2: add back listdir, introducing tool dependencies (when defining tools) (xingdi-eric-yuan, Dec 12, 2025)
- 7c00f94: Update test_experiment.py (xingdi-eric-yuan, Dec 12, 2025)
- 5652c99: Update README.md (xingdi-eric-yuan, Dec 12, 2025)
- a56ca76: move setup_commands to tool.py so it's more generic (xingdi-eric-yuan, Dec 12, 2025)
- 4015c74: Update tool.py (xingdi-eric-yuan, Dec 12, 2025)
- 64ebedf: Move tool setup_commands to base EnvironmentTool class (xingdi-eric-yuan, Dec 12, 2025)
- eb2e59b: Simplify tool's setup_commands (MarcCote, Dec 13, 2025)
- 2edd5fc: Better LLM.instantiate (#313) (sordonia, Dec 13, 2025)
- 805a303: Update submit.py (xingdi-eric-yuan, Dec 13, 2025)
- c03a36a: Fix formatting (MarcCote, Dec 13, 2025)
- 49a0e4d: fix copy (Dec 13, 2025)
- de97b55: step / init (Dec 13, 2025)
- 912040f: rename .tool -> .action (Dec 13, 2025)
- d726f6d: merge and rename (Dec 13, 2025)
- 5bf4c6b: merge (Dec 13, 2025)
- 55f8940: agents (Dec 13, 2025)
- 56e1017: precommit (Dec 13, 2025)
- 1526908: finish removing (Dec 13, 2025)
- 3a93442: revert back renaming, too much mess (Dec 13, 2025)
- f050928: llm mock (Dec 13, 2025)
- 9133c8c: fix tests (Dec 13, 2025)
- 81e374f: accept llm as None otw all hell breaks loose (Dec 13, 2025)
- 8f1813a: remove llm from run (Dec 13, 2025)
- 7225a6a: value error (Dec 13, 2025)
- 01fdbde: sys prompt for froggy (Dec 13, 2025)
- ff76f2d: fix order of args (Dec 13, 2025)
- 2bbb7fd: dont require llm (Dec 13, 2025)
- 801e238: info rename (Dec 13, 2025)
- 22334fe: return env info (Dec 13, 2025)
- ab41a84: Refactor step and execute_action methods (sordonia, Dec 13, 2025)
- 44633f7: remove env in config (Dec 13, 2025)
- e5101b1: Merge branch 'refactor_step_init' of github.com:microsoft/debug-gym i… (Dec 13, 2025)
145 changes: 90 additions & 55 deletions debug_gym/agents/base_agent.py
@@ -2,14 +2,14 @@
import os
import uuid
from dataclasses import MISSING, asdict, dataclass, field, fields
from typing import Any, Dict
from typing import Any, Dict, List

from jinja2 import Environment, Template

from debug_gym.agents.history_tracker import HistoryTracker
from debug_gym.gym.envs.env import EnvInfo, RepoEnv
from debug_gym.gym.utils import filter_non_utf8
from debug_gym.llms.base import LLM
from debug_gym.llms.base import LLM, LLMResponse
from debug_gym.llms.utils import trim
from debug_gym.logger import DebugGymLogger

@@ -27,8 +27,8 @@ def register_agent(cls):

@dataclass
class AgentArgs:
system_prompt: str | None = None
instance_prompt: str | None = None
system_prompt: str = ""
instance_prompt: str = "Instructions: {{ info.instructions }}"
max_steps: int = 100
max_history_token_cutoff: int = -1
max_history_steps_cutoff: int = -1
@@ -83,8 +83,6 @@ def to_dict(self) -> Dict[str, Any]:
class BaseAgent:
name: str = None
args_class = AgentArgs
system_prompt: str = ""
instance_prompt: str = "Instructions: {{ info.instructions }}"

def __init__(
self,
@@ -95,14 +93,10 @@ def __init__(
self.args = self.args_class.make(agent_args or {})
self.history = HistoryTracker()
self.logger = logger or DebugGymLogger("debug-gym")
self.llm = None
self.llm = llm
self.env = None

# Override prompts if provided in args
if self.args.system_prompt is not None:
self.system_prompt = str(self.args.system_prompt)
if self.args.instance_prompt is not None:
self.instance_prompt = str(self.args.instance_prompt)
self.system_prompt = str(self.args.system_prompt)
self.instance_prompt = str(self.args.instance_prompt)

@staticmethod
def to_pretty_json(value):
@@ -238,17 +232,89 @@ def should_stop(self, step: int, info: EnvInfo):
reason = "max_steps reached"
return should_stop, reason

def run(self, env: RepoEnv, llm: LLM, debug=False):
self.env = env
self.llm = llm
def init(self, info: EnvInfo) -> None:
"""Initialize the agent with environment

Args:
info: The initial environment info.
"""
self.history.init(
self.build_system_prompt(info), self.build_instance_prompt(info), info
)

self.logger.info(
"Available tools (in LLM's tool calling format):\n"
f"{json.dumps(self.llm.define_tools(info.tools), indent=4)}\n"
)

def step(self, info: EnvInfo) -> LLMResponse | List[LLMResponse]:
"""Execute a single agent step (LLM decision only).

Args:
info: Current environment info.

Returns:
LLMResponse with the agent's decision.
"""
messages = self.build_prompt(info)
return self.llm(messages, info.tools)

def execute_action(self, llm_response: LLMResponse | List[LLMResponse]) -> EnvInfo:
next_info = self.env.step(
llm_response.tool,
llm_response.response,
llm_response.reasoning_response,
)
self.history.step(next_info, llm_response)
return next_info

def build_trajectory(self) -> Dict[str, Any]:
"""Return the trajectory as a JSON-serializable dict without writing it."""
tools = [f"{tool.name}({tool.arguments})" for tool in self.env.tools]
json_output = {
"problem": self.env.task_name,
"config": self.args.to_dict(),
"tools": self.llm.define_tools(self.env.tools) if self.llm else tools,
"uuid": self.args.uuid,
"success": self.env.resolved,
"log": [],
"agent_type": self.__class__.__name__,
"logger": str(self.logger.log_file),
}
for step_id in range(len(self.history)):
step_json = self.history.json(step_id)
json_output["log"].append(step_json)
return json_output

def run(
self,
env: RepoEnv,
debug: bool = False,
reset_env: bool = True,
) -> Dict[str, Any]:
"""Run the agent loop until termination or max steps.

Args:
env: The environment to interact with.
debug: Whether to drop into debugger after each LLM call.
reset_env: Whether to reset the environment (default True).

Returns:
The trajectory as a JSON-serializable dict.
"""
info = None
step = 0

# assign the env
self.env = env

try:
info = self.env.reset()
self.history.init(
self.build_system_prompt(info), self.build_instance_prompt(info), info
)
if reset_env:
info = env.reset()
else:
info = env.info

self.init(info)

if info.resolved:
self.logger.report_progress(
@@ -259,12 +325,7 @@ def run(self, env: RepoEnv, llm: LLM, debug=False):
max_score=info.max_score,
status="resolved",
)
return self._build_trajectory()

self.logger.info(
"Available tools (in LLM's tool calling format):\n"
f"{json.dumps(self.llm.define_tools(info.tools), indent=4)}\n"
)
return self.build_trajectory()

highscore = info.score
should_stop = False
@@ -273,18 +334,12 @@ def run(self, env: RepoEnv, llm: LLM, debug=False):
while not should_stop:
self.logger.info(f"\n{'='*20} STEP {step} {'='*20}\n")

messages = self.build_prompt(info)
llm_response = self.llm(messages, info.tools)
agent_response = self.step(info)
info = self.execute_action(agent_response)

if debug:
breakpoint()

info = self.env.step(
llm_response.tool,
llm_response.response,
llm_response.reasoning_response,
)
self.history.step(info, llm_response)
should_stop, reason = self.should_stop(step + 1, info)
status = (
"resolved"
@@ -299,7 +354,6 @@ def run(self, env: RepoEnv, llm: LLM, debug=False):
self.logger.info(msg)
step += 1

# keep progress bar running until max_steps is reached
self.logger.report_progress(
problem_id=env.task_name,
step=step,
@@ -308,9 +362,8 @@ def run(self, env: RepoEnv, llm: LLM, debug=False):
max_score=info.max_score,
status=status,
)
return self._build_trajectory()
return self.build_trajectory()
except Exception as e:
# report any error that happens during the run
self.logger.report_progress(
problem_id=env.task_name,
step=step,
@@ -321,24 +374,6 @@ def run(self, env: RepoEnv, llm: LLM, debug=False):
)
raise e

def _build_trajectory(self) -> Dict[str, Any]:
"""Return the trajectory as a JSON-serializable dict without writing it."""
tools = [f"{tool.name}({tool.arguments})" for tool in self.env.tools]
json_output = {
"problem": self.env.task_name,
"config": self.args.to_dict(),
"tools": self.llm.define_tools(self.env.tools) if self.llm else tools,
"uuid": self.args.uuid,
"success": self.env.resolved,
"log": [],
"agent_type": self.__class__.__name__,
"logger": str(self.logger.log_file),
}
for step_id in range(len(self.history)):
step_json = self.history.json(step_id)
json_output["log"].append(step_json)
return json_output


def create_agent(config: Dict[str, Any], **kwargs) -> BaseAgent:
"""Create an agent from the config dictionary."""
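Taken together, the base_agent.py changes replace the old monolithic `run(env, llm)` with an LLM injected at construction time and three overridable hooks: `init` (seed history, log the tool schema), `step` (LLM decision only), and `execute_action` (environment transition plus history update). Below is a minimal sketch of both ways to drive the new API; `make_env()` and `make_llm()` are hypothetical factories standing in for whatever builds the environment and LLM client in a given setup, while the method names and call order come from the diff above.

```python
# Minimal sketch of the refactored agent API; make_env()/make_llm() are
# hypothetical factories, everything else follows the diff above.
from debug_gym.agents.base_agent import BaseAgent

env = make_env()            # hypothetical: any RepoEnv instance
llm = make_llm()            # hypothetical: any LLM client
agent = BaseAgent(llm=llm)  # LLM is now injected here, not passed to run()

# High-level entry point: reset -> init -> (step, execute_action)* loop;
# the trajectory dict is returned rather than written to disk.
trajectory = agent.run(env, reset_env=True)
print(trajectory["success"], len(trajectory["log"]))

# The same hooks can be driven manually, e.g. from a custom scheduler:
agent.env = env
info = env.reset()
agent.init(info)                       # seeds history, logs tool schema
response = agent.step(info)            # LLM decision only
info = agent.execute_action(response)  # env transition + history update
```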
2 changes: 1 addition & 1 deletion debug_gym/agents/froggy_agent.py
@@ -12,13 +12,13 @@
@dataclass
class FroggyAgentArgs(AgentArgs):
show_current_breakpoints: bool = False
system_prompt: str = "{{ agent._default_system_prompt(info) }}"


@register_agent
class FroggyAgent(BaseAgent):
name: str = "froggy_agent"
args_class = FroggyAgentArgs
system_prompt: str = "{{ agent._default_system_prompt(info) }}"

def shortcut_features(self):
features = []
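The froggy_agent.py change moves the default system prompt off the class body and onto `FroggyAgentArgs`, so the Jinja template is now ordinary configuration rather than a class attribute. A short sketch of the consequence, assuming `agent_args` accepts a plain dict as `AgentArgs.make` in base_agent.py suggests:

```python
# Sketch: the Jinja default now lives on the args dataclass, so it can be
# overridden per run; dict-style agent_args is assumed from AgentArgs.make().
from debug_gym.agents.froggy_agent import FroggyAgent, FroggyAgentArgs

assert FroggyAgentArgs().system_prompt == "{{ agent._default_system_prompt(info) }}"

agent = FroggyAgent(
    llm=None,  # this PR allows constructing agents without an LLM attached
    agent_args={"system_prompt": "You are a careful debugging assistant."},
)
```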
126 changes: 52 additions & 74 deletions debug_gym/agents/solution_agent.py
@@ -1,88 +1,66 @@
from typing import Any, Dict

from debug_gym.agents.base_agent import BaseAgent, register_agent
from debug_gym.gym.envs.env import EnvInfo, RepoEnv
from debug_gym.gym.tools.tool import ToolCall
from debug_gym.llms.base import LLM, LLMResponse


@register_agent
class AgentSolution(BaseAgent):
"""Agent that applies the gold patch and submits - used for testing environments."""

name: str = "solution_agent"

def _report_progress(self, task_name, info, status):
self.logger.report_progress(
problem_id=task_name,
step=1,
total_steps=1,
score=getattr(info, "score", 0),
max_score=getattr(info, "max_score", 0),
status=status,
)
def __init__(
self,
llm: LLM | None = None,
**kwargs,
):
super().__init__(llm=llm, **kwargs)

def _env_implements_apply_gold_patch(self):
"""Fail early if the environment does not implement apply_gold_patch."""
return hasattr(self.env, "apply_gold_patch")

def run(self, env, llm=None, debug=False):
self.env = env
info = None
try:
if not self._env_implements_apply_gold_patch():
raise NotImplementedError(
f"The environment {type(self.env)} is not compatible with SolutionAgent."
" Check the README.md to see which environments are compatible."
)

info = self.env.reset()

if info.resolved is True:
self._report_progress(env.task_name, info, "resolved")
return True

self.logger.info(f"Score: {info.score}/{info.max_score or '-'}")

if env.has_tool("pdb"):
# Make a simple pdb call to make sure it is working.
action = ToolCall(
name="pdb", id="pdb", arguments={"command": "help help"}
)
pdb_help_info = self.env.step(action, None, None)
assert "h(elp)" in pdb_help_info.step_observation.observation, (
"PDB command did not return expected help message.\n"
f"{pdb_help_info.step_observation.observation}"
)

# Send a pdb continue command, and check the output matches the one from env.reset.
action = ToolCall(
name="pdb", id="pdb", arguments={"command": "continue"}
)
pdb_continue_info = self.env.step(action, None, None)

pdb_observation = pdb_continue_info.step_observation.observation
expected_messages = [
"Reached the end of the program. Restarting the debugging session.",
"Uncaught exception. Entering post mortem debugging",
]
reset_observation = info.step_observation.observation
if reset_observation.splitlines():
expected_messages.append(reset_observation.splitlines()[-1])

assert any(
msg in pdb_observation for msg in expected_messages
), f"PDB command did not return expected continue message.\n{pdb_observation}"

self.env.apply_gold_patch()

if debug:
breakpoint()

action = ToolCall(name="submit", id="submit", arguments={})
info = self.env.step(action, None, None)
def _run_pdb_sanity_checks(self, info: EnvInfo):
"""Run PDB sanity checks if PDB tool is available."""
if not self.env.has_tool("pdb"):
return

# Make a simple pdb call to make sure it is working.
action = ToolCall(name="pdb", id="pdb", arguments={"command": "help help"})
pdb_help_info = self.env.step(action, None, None)
assert "h(elp)" in pdb_help_info.step_observation.observation, (
"PDB command did not return expected help message.\n"
f"{pdb_help_info.step_observation.observation}"
)

self.logger.info(f"Score: {info.score}/{info.max_score or '-'}")
assert info.resolved, (
"The task is not done after applying the gold patch.\n"
f"{info.step_observation.observation}"
)
self._report_progress(env.task_name, info, "resolved")
except Exception:
self._report_progress(env.task_name, info, "error")
raise
return info.resolved
# Send a pdb continue command, and check the output matches the one from env.reset.
action = ToolCall(name="pdb", id="pdb", arguments={"command": "continue"})
pdb_continue_info = self.env.step(action, None, None)

pdb_observation = pdb_continue_info.step_observation.observation
expected_messages = [
"Reached the end of the program. Restarting the debugging session.",
"Uncaught exception. Entering post mortem debugging",
]
reset_observation = info.step_observation.observation
if reset_observation.splitlines():
expected_messages.append(reset_observation.splitlines()[-1])

assert any(
msg in pdb_observation for msg in expected_messages
), f"PDB command did not return expected continue message.\n{pdb_observation}"

def step(self, info: EnvInfo) -> LLMResponse:
tool_call = ToolCall(name="submit", id="submit", arguments={})
return LLMResponse([], tool=tool_call)

def execute_action(self, llm_response, **kwargs):
self.env.apply_gold_patch()
info = self.env.step(llm_response.tool, None, None)
return info

def init(self, info: EnvInfo) -> None:
self._run_pdb_sanity_checks(info)
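With the base loop refactored, solution_agent.py no longer needs its own `run`: `init` performs the pdb sanity checks, `step` deterministically proposes `submit`, and `execute_action` applies the gold patch before stepping the environment. A usage sketch, where `make_swebench_env()` is a hypothetical factory for an environment that implements `apply_gold_patch`:

```python
# Sketch: AgentSolution now rides BaseAgent.run(); make_swebench_env() is a
# hypothetical factory for a RepoEnv that implements apply_gold_patch().
from debug_gym.agents.solution_agent import AgentSolution

env = make_swebench_env()        # hypothetical compatible environment
agent = AgentSolution(llm=None)  # no LLM needed: step() is deterministic
trajectory = agent.run(env)      # base loop: reset -> init -> step -> execute_action
assert trajectory["success"], "gold patch should resolve the task"
```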
2 changes: 1 addition & 1 deletion debug_gym/agents/utils.py
@@ -139,7 +139,7 @@ def save_patch(env, problem_path: Path, logger: DebugGymLogger):
def save_trajectory(agent, problem_path: Path, logger: DebugGymLogger):
"""Persist the agent trajectory to disk."""
problem_path.mkdir(parents=True, exist_ok=True)
trajectory = agent._build_trajectory()
trajectory = agent.build_trajectory()
json_file = problem_path / "trajectory.json"
with open(json_file, "w") as f:
json.dump(trajectory, f, indent=4)