WIP

MarcCote · MarcCote · commit f6ddcadfc3f4 · 2025-07-17T10:30:05.000-07:00
diff --git a/debug_gym/agents/guided_agent.py b/debug_gym/agents/guided_agent.py
@@ -1,9 +1,10 @@
 import logging
 
-from termcolor import colored
-
 from debug_gym.agents.base_agent import register_agent
 from debug_gym.agents.rewrite_agent import RewriteAgent
+from debug_gym.gym.entities import Event
+from debug_gym.gym.tools.tool import ToolCall
+from debug_gym.gym.tools.toolbox import Toolbox
 from debug_gym.llms.base import LLM
 from debug_gym.logger import DebugGymLogger
 
@@ -12,80 +13,188 @@
 class GuidedRewriteAgent(RewriteAgent):
+    """Human-guided agent that first lets a rewrite-only LLM try each step.
+
+    At every step the LLM gets one attempt at solving the task with only the
+    ``rewrite`` tool; that attempt is evaluated and then rolled back. If it
+    does not solve the task, the human chooses the action that is actually
+    applied to the environment.
+    """
+
     name: str = "guided_agent"
 
-    def try_rewrite(self, task_name):
-        # make a copy of the env for the llm
-        from ipdb import set_trace
+    def __init__(self, *args, **kwargs):
+        """Set up distinct loggers for the LLM and for the human interface.
+
+        All arguments are forwarded unchanged to the parent constructor.
+        """
+        super().__init__(*args, **kwargs)
 
-        set_trace()
-        cloned_env = self.env.clone()
+        # Give the LLM its own DEBUG-level logger and icon, writing to the
+        # same log directory as the agent's logger.
+        self.llm.logger = DebugGymLogger(
+            name="LLM",
+            level=logging.DEBUG,
+            log_dir=self.logger.log_file.parent,
+            icon="🤖",
+        )
 
-        # Only keep the rewrite tool in the cloned env
-        for tool in cloned_env.tools:
-            if tool.name != "rewrite":
-                cloned_env.remove_tool(tool.name)
+        # Create a human interface for the guided agent.
+        self.logger.level = logging.DEBUG
+        self.logger.icon = "👤"
+        self.human = LLM.instantiate(llm_name="human", logger=self.logger)
 
-        # Reset the cloned environment and replay the history.
-        info = cloned_env.reset(options={"task_name": task_name})
-        # replay the history up to the current step
-        for step in self.history.get_all():
-            assert not step.done
-            info = cloned_env.step(step.action)
+    def try_rewrite_and_rollback(self, last_info):
+        """Let the LLM attempt a rewrite-only fix, evaluate it, then undo it.
+
+        The working tree is committed first so the attempt can be discarded
+        with ``git reset --hard``. ``REWRITE_SUCCESS`` listeners are muted
+        while the rewrite and the follow-up ``eval`` are stepped.
+
+        Args:
+            last_info: Latest environment info; used to build the prompt and
+                to look up the ``rewrite`` tool.
+
+        Returns:
+            The ``done`` flag reported by the ``eval`` step on the attempt.
+        """
+        prompt = self.build_prompt(last_info)
 
-        prompt = self.build_prompt(info)
-        response = self.llm(prompt, info.tools)
-        info = cloned_env.step(response.response)
+        # Git commit the current state before trying to rewrite.
+        self.env.terminal.run("git add . && git commit -m 'Before rewrite attempt'")
 
-        return info.done
+        # Remove all tools except the rewrite tool.
+        tools = [tool for tool in last_info.tools if tool.name == "rewrite"]
+        response = self.llm(prompt, tools)
+        self.llm.logger.info(f"LLM response: {response.response}")
+        self.llm.logger.info(f"LLM tool: {response.tool}")
 
-    def run(self, task_name=None, debug=False):
-        self.logger.level = logging.DEBUG
-        self.llm.logger = DebugGymLogger(
-            name="LLM", level=logging.ERROR, log_dir=self.logger.log_file.parent
-        )
-        self.human = LLM.instantiate(llm_name="human", logger=self.logger)
+        # Temporarily disable the REWRITE_SUCCESS event.
+        self.env.event_hooks.mute(Event.REWRITE_SUCCESS)
+        try:
+            self.env.step(response.tool)
+            info = self.env.step(ToolCall(id="eval", name="eval", arguments={}))
+        finally:
+            # Even if a step raises, re-enable the listeners and discard
+            # whatever the attempt changed.
+            self.env.event_hooks.unmute(Event.REWRITE_SUCCESS)
+            # Rollback any changes made by the LLM.
+            self.env.terminal.run("git reset --hard HEAD")
 
-        self.history.reset()
-        info = self.env.reset(options={"task_name": task_name})
-        # initial state does not have prompt and response
-        self.history.step(info, None)
+        self.llm.logger.info(f"LLM observation: {info.eval_observation.observation}.")
 
-        if info.done is True:
-            # msg = "Environment started with entrypoint passing without errors."
-            return True
+        return info.done
-
-        highscore = info.score
 
-        for step in self.logger.tqdm(range(self.config["max_steps"])):
-            highscore = max(highscore, info.score)
-            self.logger.info(
-                f"Score: {info.score}/{info.max_score} ({info.score/info.max_score:.1%}) [Best: {highscore}]"
+    def run(self, task_name=None, debug=False):
+        """Run the guided session until the task is solved or steps run out.
+
+        Args:
+            task_name: Task passed to ``env.reset`` and used as the problem
+                id in progress reports.
+            debug: When true, drop into ``breakpoint()`` after each human
+                response, before it is applied.
+
+        Returns:
+            ``True`` if the environment is already done after reset,
+            otherwise the ``done`` flag of the latest environment info.
+        """
+        step = 0
+        # Bound before the ``try`` so the error handler can tell whether
+        # ``env.reset`` ever produced an info object.
+        info = None
+        max_steps = self.config["max_steps"]
+        try:
+            self.history.reset()
+            info = self.env.reset(options={"task_name": task_name})
+            # initial state does not have prompt and response
+            self.history.step(info, None)
+
+            # First make sure git is setup correctly.
+            self.env.terminal.run(
+                "git init && git config user.name 'debug-gym' && git config user.email '<>'"
             )
 
-            llm_done = self.try_rewrite(task_name)
-            if llm_done:
-                msg = f"*** The rewrite-only agent with {self.llm.model_name} managed to solve the task with the current context. ***"
-                self.logger.info(colored(msg, "green"))
-                break
-            else:
-                msg = f"*** The rewrite-only agent with {self.llm.model_name} failed to solve the task with the current context. ***"
-                self.logger.info(colored(msg, "red"))
-
-            # If the LLM did not manage to solve the task, we continue with the guided approach.
-            prompt = self.build_prompt(info)
-            human_response = self.human(prompt, info.tools)
-
-            if debug:
-                breakpoint()
+            if info.done is True:
+                self.logger.report_progress(
+                    problem_id=task_name,
+                    step=1,
+                    total_steps=1,
+                    score=info.score,
+                    max_score=info.max_score,
+                    status="resolved",
+                )
+                return True
 
-            # step the environment with the human response
-            info = self.env.step(human_response.tool)
-            # log the human response
-            self.history.step(info, human_response)
+            highscore = info.score
 
-            if info.done:
+            for step in range(max_steps):
+                self.logger.info(f"\n{'='*20} STEP {step+1} {'='*20}\n")
+                highscore = max(highscore, info.score)
                 self.logger.info(
-                    "You managed to provide the patch that solves the task before the LLM. Congrats!"
+                    f"Step: {step + 1} | Score: {info.score}/{info.max_score} ({info.score/info.max_score:.1%}) [Best: {highscore}]"
                 )
-                break
 
-        return info.done
+                llm_done = self.try_rewrite_and_rollback(info)
+                if llm_done:
+                    msg = f"[green]*** The rewrite-only agent with {self.llm.model_name} managed to solve the task with the current context. ***[/green]"
+                    self.llm.logger.error(msg)
+                    break
+                else:
+                    msg = f"[red]*** The rewrite-only agent with {self.llm.model_name} failed to solve the task with the current context. ***[/red]"
+                    self.llm.logger.error(msg)
+
+                # If the LLM did not manage to solve the task, we continue with the guided approach.
+                prompt = self.build_prompt(info)
+                human_response = self.human(prompt, info.tools)
+
+                if debug:
+                    breakpoint()
+
+                # step the environment with the human response
+                info = self.env.step(human_response.tool)
+                # log the human response
+                self.history.step(info, human_response)
+
+                if info.done:
+                    self.logger.info(
+                        "You managed to provide the patch that solves the task before the LLM. Congrats!"
+                    )
+                    # early stop, set current step and total steps to be the same
+                    self.logger.report_progress(
+                        problem_id=task_name,
+                        step=step + 1,
+                        total_steps=step + 1,
+                        score=info.score,
+                        max_score=info.max_score,
+                        status="resolved" if info.done else "unresolved",
+                    )
+                    break
+                # keep progress bar running until max_steps is reached
+                self.logger.report_progress(
+                    problem_id=task_name,
+                    step=step + 1,
+                    total_steps=max_steps + 1,
+                    score=info.score,
+                    max_score=info.max_score,
+                    status="running",
+                )
+            # max_steps was reached, task was either resolved or unresolved
+            self.logger.report_progress(
+                problem_id=task_name,
+                step=step + 1,
+                total_steps=step + 1,
+                score=info.score,
+                max_score=info.max_score,
+                status="resolved" if info.done else "unresolved",
+            )
+
+            return info.done
+        except Exception:
+            # report any error that happens during the run
+            self.logger.report_progress(
+                problem_id=task_name,
+                step=step + 1,
+                total_steps=step + 1,
+                score=info.score if info else 0,
+                max_score=info.max_score if info else 1,
+                status="error",
+            )
+            raise
diff --git a/debug_gym/gym/envs/env.py b/debug_gym/gym/envs/env.py
@@ -37,6 +37,7 @@ class EnvInfo:
 class EventHooks:
     def __init__(self):
         self.event_listeners = {event: [] for event in Event}
+        self.event_listeners_muted = {event: [] for event in Event}  # listeners parked by mute() until unmute()
 
     def subscribe(self, event: Event, tool: "Tool"):
         if event not in self.event_listeners:
@@ -50,6 +51,39 @@ def subscribe(self, event: Event, tool: "Tool"):
     def unsubscribe(self, event: Event, tool):
         self.event_listeners[event].remove(tool)
 
+    def mute(self, event: Event):
+        """Move every listener of ``event`` out of the active list.
+
+        Muting an already-muted event keeps the listeners parked earlier.
+
+        Raises:
+            ValueError: If ``event`` is not a known event type.
+        """
+        if event not in self.event_listeners_muted:
+            raise ValueError(f"Unknown event type: {event}")
+        # Extend instead of overwriting: a second mute() would otherwise
+        # replace the parked listeners with the (now empty) active list.
+        self.event_listeners_muted[event].extend(self.event_listeners[event])
+        self.event_listeners[event] = []
+
+    def unmute(self, event: Event):
+        """Put the listeners parked by ``mute`` back on ``event``.
+
+        Listeners subscribed while the event was muted are kept, after the
+        restored ones. Unmuting an event that is not muted changes nothing.
+
+        Raises:
+            ValueError: If ``event`` is not a known event type.
+        """
+        if event not in self.event_listeners_muted:
+            raise ValueError(f"Unknown event type: {event}")
+        restored = self.event_listeners_muted[event]
+        added_while_muted = [
+            tool for tool in self.event_listeners[event] if tool not in restored
+        ]
+        self.event_listeners[event] = restored + added_while_muted
+        self.event_listeners_muted[event] = []
+
     def notify(
         self, environment, event: Event, source=None, **kwargs
     ) -> list[Observation]:
@@ -555,23 +570,6 @@ def step(self, action: ToolCall, action_reasoning: str = "") -> EnvInfo:
 
         return self.infos
 
-    def clone(self):
-        # Create a new instance of RepoEnv
-        new_env = RepoEnv(
-            path=self.path,
-            entrypoint=self.entrypoint,
-            debug_entrypoint=self.debug_entrypoint,
-            max_score=self.max_score,
-            readonly_patterns=None,
-            run_timeout=self.run_timeout,
-            dir_tree_depth=self.dir_tree_depth,
-            terminal=Terminal(),
-            logger=self.logger,
-        )
-        for tool in self.tools:
-            new_env.add_tool(tool)
-        return new_env
-
     def post_process_event(self, event: Event, source, kwargs, observations):
         """Post-process the event after it has been handled by the tools."""
         if event in (Event.REWRITE_SUCCESS, Event.REWRITE_FAIL):
diff --git a/debug_gym/logger.py b/debug_gym/logger.py
@@ -405,6 +405,26 @@ def _status_listener(self):
         self.logger.debug("Status listener thread exiting...")
 
 
+class IconFilter(logging.Filter):
+    """Logging filter that makes sure every record has an ``icon`` attribute.
+
+    Records are never rejected; one that lacks ``icon`` is given this
+    filter's icon, and an icon already present on a record is left as is.
+    """
+
+    def __init__(self, *args, icon="🐸", **kwargs):
+        super().__init__(*args, **kwargs)
+        # Fallback used for records that do not carry their own icon.
+        self.icon = icon
+
+    def filter(self, record):
+        """Tag ``record`` with the fallback icon if needed; always accept it."""
+        if hasattr(record, "icon"):
+            return True
+        record.icon = self.icon
+        return True
+
+
 class DebugGymLogger(logging.Logger):
     """A multiprocess friendly logger that integrates with Rich for progress reporting.
     Multiprocess workers can use this logger to log messages and report progress via
@@ -420,6 +434,7 @@ def __init__(
         log_dir: str | None = None,
         level: str | int = logging.INFO,
         mode: str = "a",
+        icon: str = "🐸",
     ):
         super().__init__(name)
         # If var env "DEBUG_GYM_DEBUG" is set, turn on debug mode
@@ -428,6 +443,8 @@ def __init__(
 
         # Prevent the log messages from being propagated to the root logger
         self.propagate = False
+        self.icon_filter = IconFilter(icon=icon)
+        self.addFilter(self.icon_filter)
 
         self.setLevel(level)  # Set logger level, might be overridden by file handler
         self.log_file = None  # File handler for logging to a file
@@ -443,6 +460,16 @@ def __init__(
         if log_dir:
             self._initialize_file_handler(name, log_dir, mode)
 
+    @property
+    def icon(self) -> str:
+        """Icon that this logger's ``IconFilter`` puts on records lacking one."""
+        return self.icon_filter.icon
+
+    @icon.setter
+    def icon(self, icon: str) -> None:
+        """Replace the icon held by this logger's ``IconFilter``."""
+        self.icon_filter.icon = icon
+
     def _initialize_main_logger(self, level):
         self._live = Live(transient=True, refresh_per_second=2)
         rich_handler = RichHandler(
@@ -451,7 +478,9 @@ def _initialize_main_logger(self, level):
             rich_tracebacks=True,
             markup=True,
         )
-        rich_handler.setFormatter(logging.Formatter("🐸 [%(name)-12s]: %(message)s"))
+        rich_handler.setFormatter(
+            logging.Formatter(r"%(icon)s \[%(name)-12s]: %(message)s")
+        )
         rich_handler.setLevel(level)
         self.addHandler(rich_handler)
 
@@ -481,6 +510,7 @@ def handle(self, record):
         record into the log queue for the main process to display
         logs through Rich."""
         if self._is_worker:
+            # record.args.append(self.icon)
             self.LOG_QUEUE.put(record)
         super().handle(record)
 
diff --git a/scripts/config_mini_nightmare.yaml b/scripts/config_mini_nightmare.yaml
@@ -20,7 +20,7 @@ base:
         # session_commands define commands that are always executed before starting a shell session or running a single command in the terminal.
         # session_commands:["conda activate aider"],
         # setup_commands define commands that are executed only once when the terminal is created. This is only supported for Docker terminal.
-        setup_commands: ["pip install pytest pandas"],
+        setup_commands: ["apt update", "apt install -y git", "pip install pytest pandas"],
     }
 
     # LLM configs

Original file line number	Diff line number	Diff line change
`@@ -20,7 +20,7 @@ base:`
`20`	`20`	`# session_commands define commands that are always executed before starting a shell session or running a single command in the terminal.`
`21`	`21`	`# session_commands:["conda activate aider"],`
`22`	`22`	`# setup_commands define commands that are executed only once when the terminal is created. This is only supported for Docker terminal.`
`23`		`- setup_commands: ["pip install pytest pandas"],`
	`23`	`+ setup_commands: ["apt update", "apt install -y git", "pip install pytest pandas"],`
`24`	`24`	`}`
`25`	`25`
`26`	`26`	`# LLM configs`