From 7eeaeab7bb997e07c43cb6d317eac17dd07ad3a8 Mon Sep 17 00:00:00 2001 From: yuanhaonan Date: Fri, 16 Jan 2026 14:24:01 +0800 Subject: [PATCH 1/9] add reward manager --- configs/agents/rl/push_cube/gym_config.json | 117 ++++- configs/agents/rl/push_cube/train_config.json | 2 +- embodichain/lab/gym/envs/base_env.py | 32 +- embodichain/lab/gym/envs/embodied_env.py | 32 ++ embodichain/lab/gym/envs/managers/__init__.py | 2 + embodichain/lab/gym/envs/managers/cfg.py | 23 + .../lab/gym/envs/managers/observations.py | 13 + .../envs/managers/randomization/spatial.py | 83 +++ .../lab/gym/envs/managers/reward_manager.py | 241 +++++++++ embodichain/lab/gym/envs/managers/rewards.py | 482 ++++++++++++++++++ .../lab/gym/envs/tasks/rl/push_cube.py | 91 +--- embodichain/lab/gym/utils/gym_utils.py | 44 ++ 12 files changed, 1056 insertions(+), 106 deletions(-) create mode 100644 embodichain/lab/gym/envs/managers/reward_manager.py create mode 100644 embodichain/lab/gym/envs/managers/rewards.py diff --git a/configs/agents/rl/push_cube/gym_config.json b/configs/agents/rl/push_cube/gym_config.json index b78e7f3..b4a5cd0 100644 --- a/configs/agents/rl/push_cube/gym_config.json +++ b/configs/agents/rl/push_cube/gym_config.json @@ -13,20 +13,107 @@ "position_range": [[-0.2, -0.2, 0.0], [0.2, 0.2, 0.0]], "relative_position": true } + }, + "randomize_goal": { + "func": "randomize_target_pose", + "mode": "reset", + "params": { + "position_range": [[-0.3, -0.3, 0.05], [0.3, 0.3, 0.05]], + "relative_position": false, + "store_key": "goal_pose" + } + } + }, + "observations": { + "robot_qpos": { + "func": "embodichain.lab.gym.envs.managers.observations:normalize_robot_joint_data", + "mode": "concatenate", + "name": "robot/qpos", + "params": { + "robot_uid": "Manipulator", + "joint_ids": [0, 1, 2, 3, 4, 5], + "data_type": "qpos" + } + }, + "robot_ee_pos": { + "func": "embodichain.lab.gym.envs.managers.observations:get_robot_ee_pose", + "mode": "concatenate", + "name": "robot/ee_pos", + "params": { + "robot_uid": "Manipulator", + "part_name": "arm" + } + }, + "cube_pos": { + "func": "embodichain.lab.gym.envs.managers.observations:get_rigid_object_pose", + "mode": "concatenate", + "name": "object/cube_pos", + "params": { + "entity_cfg": {"uid": "cube"} + } + }, + "goal_pos": { + "func": "embodichain.lab.gym.envs.managers.observations:virtual_target_position", + "mode": "concatenate", + "name": "object/goal_pos", + "params": { + "target_pose_key": "goal_pose" + } + } + }, + "rewards": { + "reaching_reward": { + "func": "reaching_behind_object_reward", + "mode": "add", + "name": "reaching", + "params": { + "weight": 0.1, + "end_effector_cfg": {"uid": "Manipulator", "body_ids": "ee_link"}, + "object_cfg": {"uid": "cube"}, + "goal_cfg": {"uid": "goal_sphere"}, + "behind_offset": 0.015, + "height_offset": 0.015, + "distance_scale": 5.0 + } + }, + "place_reward": { + "func": "incremental_distance_to_target", + "mode": "add", + "name": "place", + "params": { + "weight": 1.0, + "source_entity_cfg": {"uid": "cube"}, + "target_pose_key": "goal_pose", + "tanh_scale": 10.0, + "positive_weight": 2.0, + "negative_weight": 0.5, + "use_xy_only": true + } + }, + "action_penalty": { + "func": "action_smoothness_penalty", + "mode": "add", + "name": "action_penalty", + "params": { + "weight": 0.01 + } + }, + "success_bonus": { + "func": "success_reward", + "mode": "add", + "name": "success", + "params": { + "weight": 10.0, + "reward_value": 1.0 + } } }, - "observations": {}, "extensions": { "obs_mode": "state", "episode_length": 100, 
"joint_limits": 0.5, "action_scale": 0.1, - "success_threshold": 0.1, - "reaching_reward_weight": 0.1, - "place_reward_weight": 2.0, - "place_penalty_weight": 0.5, - "action_penalty_weight": 0.01, - "success_bonus_weight": 10.0 + "success_threshold": 0.1 } }, "robot": { @@ -85,21 +172,7 @@ "sensor": [], "light": { }, - "background": [ - { - "uid": "goal_sphere", - "shape": { - "shape_type": "Sphere", - "radius": 0.02 - }, - "body_type": "kinematic", - "init_pos": [-0.9, -0.6, 0.05], - "attrs": { - "enable_collision": false, - "mass": 0.0 - } - } - ], + "background": [], "rigid_object": [ { "uid": "cube", diff --git a/configs/agents/rl/push_cube/train_config.json b/configs/agents/rl/push_cube/train_config.json index f1558fd..7e94155 100644 --- a/configs/agents/rl/push_cube/train_config.json +++ b/configs/agents/rl/push_cube/train_config.json @@ -7,7 +7,7 @@ "headless": true, "iterations": 1000, "rollout_steps": 1024, - "eval_freq": 2, + "eval_freq": 200, "save_freq": 200, "use_wandb": false, "wandb_project_name": "embodychain-push_cube", diff --git a/embodichain/lab/gym/envs/base_env.py b/embodichain/lab/gym/envs/base_env.py index 15f5d3b..72e1f40 100644 --- a/embodichain/lab/gym/envs/base_env.py +++ b/embodichain/lab/gym/envs/base_env.py @@ -397,6 +397,30 @@ def check_truncated(self, obs: EnvObs, info: Dict[str, Any]) -> torch.Tensor: """ return torch.zeros(self.num_envs, dtype=torch.bool, device=self.device) + def _extend_reward( + self, + rewards: torch.Tensor, + obs: EnvObs, + action: EnvAction, + info: Dict[str, Any], + **kwargs, + ) -> torch.Tensor: + """Extend the reward computation. + + Overwrite this function to extend or modify the reward computation. + + Args: + rewards: The base reward tensor. + obs: The observation from the environment. + action: The action applied to the robot agent. + info: The info dictionary. + **kwargs: Additional keyword arguments. + + Returns: + The extended reward tensor. + """ + return rewards + def get_reward( self, obs: EnvObs, @@ -417,7 +441,13 @@ def get_reward( The reward for the current step. """ - return torch.zeros(self.num_envs, dtype=torch.float32, device=self.device) + rewards = torch.zeros(self.num_envs, dtype=torch.float32, device=self.device) + + rewards = self._extend_reward( + rewards=rewards, obs=obs, action=action, info=info + ) + + return rewards def _step_action(self, action: EnvAction) -> EnvAction: """Set action control command into simulation. diff --git a/embodichain/lab/gym/envs/embodied_env.py b/embodichain/lab/gym/envs/embodied_env.py index 8ce3134..c5e04fa 100644 --- a/embodichain/lab/gym/envs/embodied_env.py +++ b/embodichain/lab/gym/envs/embodied_env.py @@ -42,6 +42,7 @@ from embodichain.lab.gym.envs.managers import ( EventManager, ObservationManager, + RewardManager, DatasetManager, ) from embodichain.lab.gym.utils.registration import register_env @@ -91,6 +92,13 @@ class EnvLightCfg: Please refer to the :class:`embodichain.lab.gym.managers.ObservationManager` class for more details. """ + rewards: Union[object, None] = None + """Reward settings. Defaults to None, in which case no reward computation is performed through + the reward manager. + + Please refer to the :class:`embodichain.lab.gym.managers.RewardManager` class for more details. + """ + dataset: Union[object, None] = None """Dataset settings. Defaults to None, in which case no dataset collection is performed. 
@@ -176,6 +184,9 @@ def _init_sim_state(self, **kwargs): if self.cfg.observations: self.observation_manager = ObservationManager(self.cfg.observations, self) + if self.cfg.rewards: + self.reward_manager = RewardManager(self.cfg.rewards, self) + if self.cfg.dataset: self.dataset_manager = DatasetManager(self.cfg.dataset, self) @@ -329,6 +340,23 @@ def _extend_obs(self, obs: EnvObs, **kwargs) -> EnvObs: obs = self.observation_manager.compute(obs) return obs + def _extend_reward( + self, + rewards: torch.Tensor, + obs: EnvObs, + action: EnvAction, + info: Dict[str, Any], + **kwargs, + ) -> torch.Tensor: + if self.reward_manager: + rewards, reward_info = self.reward_manager.compute( + obs=obs, action=action, info=info + ) + # Add individual reward terms to info for logging + for term_name, term_value in reward_info.items(): + info[f"reward/{term_name}"] = term_value + return rewards + def _prepare_scene(self, **kwargs) -> None: self._setup_lights() self._setup_background() @@ -352,6 +380,10 @@ def _initialize_episode( if "reset" in self.event_manager.available_modes: self.event_manager.apply(mode="reset", env_ids=env_ids) + # reset reward manager for environments that need a reset + if self.cfg.rewards: + self.reward_manager.reset(env_ids=env_ids) + def _step_action(self, action: EnvAction) -> EnvAction: """Set action control command into simulation. diff --git a/embodichain/lab/gym/envs/managers/__init__.py b/embodichain/lab/gym/envs/managers/__init__.py index e38f4f2..88c96ef 100644 --- a/embodichain/lab/gym/envs/managers/__init__.py +++ b/embodichain/lab/gym/envs/managers/__init__.py @@ -19,10 +19,12 @@ SceneEntityCfg, EventCfg, ObservationCfg, + RewardCfg, DatasetFunctorCfg, ) from .manager_base import Functor, ManagerBase from .event_manager import EventManager from .observation_manager import ObservationManager +from .reward_manager import RewardManager from .dataset_manager import DatasetManager from .datasets import * diff --git a/embodichain/lab/gym/envs/managers/cfg.py b/embodichain/lab/gym/envs/managers/cfg.py index 07888c9..8da8f4e 100644 --- a/embodichain/lab/gym/envs/managers/cfg.py +++ b/embodichain/lab/gym/envs/managers/cfg.py @@ -311,6 +311,29 @@ def _resolve_body_names(self, scene: SimulationManager): self.body_names = [entity.body_names[i] for i in self.body_ids] +@configclass +class RewardCfg(FunctorCfg): + """Configuration for a reward functor. + + The reward functor is used to compute rewards for the environment. The `mode` attribute + determines how the reward is combined with existing rewards. + """ + + mode: Literal["add", "replace"] = "add" + """The mode for the reward computation. + + - `add`: The reward is added to the existing total reward. + - `replace`: The reward replaces the total reward (useful for single reward functions). + """ + + name: str = MISSING + """The name of the reward term. + + This is used for logging and debugging purposes. The name should be descriptive of what + the reward term represents, e.g., "distance_to_goal", "gripper_close", "collision_penalty". + """ + + @configclass class DatasetFunctorCfg(FunctorCfg): """Configuration for dataset collection functors. 
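[Editor's note] A minimal sketch of how terms configured through `RewardCfg` combine, assuming the semantics stated in the docstrings above (weighted sum for mode "add", overwrite for mode "replace"). `TermCfg`, `combine`, and the two term functions are plain stand-ins for illustration, not the embodichain configclass or manager.

# Illustrative stand-in for RewardCfg-style terms and the manager's combination rule.
from dataclasses import dataclass, field
from typing import Callable
import torch

@dataclass
class TermCfg:
    func: Callable
    name: str
    mode: str = "add"      # "add" -> weighted sum, "replace" -> overwrite total
    weight: float = 1.0
    params: dict = field(default_factory=dict)

def combine(terms, num_envs, **ctx):
    total = torch.zeros(num_envs)
    log = {}
    for cfg in terms:
        term = cfg.func(num_envs=num_envs, **cfg.params, **ctx)
        if cfg.mode == "add":
            total = total + cfg.weight * term
        elif cfg.mode == "replace":
            total = cfg.weight * term
        else:
            raise ValueError(f"Unsupported reward mode '{cfg.mode}'.")
        log[cfg.name] = term  # unweighted value, kept for logging
    return total, log

def distance_term(num_envs, dist, **_):
    return -dist                  # dense shaping

def success_term(num_envs, success, **_):
    return success.float()        # sparse bonus

terms = [
    TermCfg(func=distance_term, name="place", weight=1.0),
    TermCfg(func=success_term, name="success", weight=10.0),
]
dist = torch.tensor([0.30, 0.05])
total, log = combine(terms, num_envs=2, dist=dist, success=dist < 0.1)
print(total, log)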
diff --git a/embodichain/lab/gym/envs/managers/observations.py b/embodichain/lab/gym/envs/managers/observations.py index d7ae016..e54f362 100644 --- a/embodichain/lab/gym/envs/managers/observations.py +++ b/embodichain/lab/gym/envs/managers/observations.py @@ -713,3 +713,16 @@ def __call__( exteroception[sensor_uid] = projected_kpnts return exteroception + + +def virtual_target_position( + env: "EmbodiedEnv", + obs: EnvObs, + target_pose_key: str = "goal_pose", +) -> torch.Tensor: + """Get virtual target position from env state.""" + state_attr = f"_{target_pose_key}s" + if hasattr(env, state_attr): + target_poses = getattr(env, state_attr) + return target_poses[:, :3, 3] + return torch.zeros(env.num_envs, 3, device=env.device) diff --git a/embodichain/lab/gym/envs/managers/randomization/spatial.py b/embodichain/lab/gym/envs/managers/randomization/spatial.py index 2437e88..6c3f92f 100644 --- a/embodichain/lab/gym/envs/managers/randomization/spatial.py +++ b/embodichain/lab/gym/envs/managers/randomization/spatial.py @@ -266,3 +266,86 @@ def randomize_robot_qpos( robot.set_qpos(qpos=current_qpos, env_ids=env_ids, joint_ids=joint_ids) env.sim.update(step=100) + + +def randomize_target_pose( + env: EmbodiedEnv, + env_ids: Union[torch.Tensor, None], + position_range: tuple[list[float], list[float]], + rotation_range: tuple[list[float], list[float]] | None = None, + relative_position: bool = False, + relative_rotation: bool = False, + reference_entity_cfg: SceneEntityCfg | None = None, + store_key: str = "target_pose", +) -> None: + """Randomize a virtual target pose and store in env state for use in get_info(). + + This function generates random target poses without requiring a physical object in the scene. + The generated poses are stored in env and should be exposed in get_info() for reward functors. + + Args: + env (EmbodiedEnv): The environment instance. + env_ids (Union[torch.Tensor, None]): The environment IDs to apply the randomization. + position_range (tuple[list[float], list[float]]): The range for the position randomization. + rotation_range (tuple[list[float], list[float]] | None): The range for the rotation randomization. + The rotation is represented as Euler angles (roll, pitch, yaw) in degree. + relative_position (bool): Whether to randomize the position relative to a reference entity. Default is False. + relative_rotation (bool): Whether to randomize the rotation relative to a reference entity. Default is False. + reference_entity_cfg (SceneEntityCfg | None): The reference entity for relative randomization. + If None and relative mode is True, uses world origin. + store_key (str): The key to store the target pose in env state. Default is "target_pose". + The pose will be stored in env._{store_key}s and should be exposed in info[store_key]. 
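[Editor's note] As a concrete illustration of the storage convention described in the docstring that follows (store_key "goal_pose" leading to a batched 4x4 buffer on env._goal_poses), here is a standalone sketch. The bare SimpleNamespace object stands in for the real environment; only the buffer shape and the xyz read-back are taken from the patch.

# Sketch of the virtual-target storage convention: env._{store_key}s as (num_envs, 4, 4) poses.
import torch
from types import SimpleNamespace

env = SimpleNamespace(num_envs=3, device="cpu")   # stand-in for the real env
store_key = "goal_pose"
state_attr = f"_{store_key}s"

# Lazily create the (num_envs, 4, 4) buffer, as randomize_target_pose does.
if not hasattr(env, state_attr):
    setattr(env, state_attr, torch.eye(4).repeat(env.num_envs, 1, 1))

# Write a randomized translation for a subset of environments.
env_ids = torch.tensor([0, 2])
poses = getattr(env, state_attr)
poses[env_ids, :3, 3] = torch.rand(len(env_ids), 3) * 0.6 - 0.3

# Observation and reward functors read the xyz translation back out.
goal_pos = getattr(env, state_attr)[:, :3, 3]     # shape: (num_envs, 3)
print(goal_pos)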
+ """ + num_instance = len(env_ids) + + # Get reference pose if needed + if relative_position or relative_rotation: + if reference_entity_cfg is not None: + # Get reference entity pose + ref_obj = env.sim.get_rigid_object(reference_entity_cfg.uid) + if ref_obj is not None: + ref_pose = ref_obj.get_local_pose(to_matrix=True)[env_ids] + init_pos = ref_pose[:, :3, 3] + init_rot = ref_pose[:, :3, :3] + else: + # Fallback to world origin + init_pos = torch.zeros(num_instance, 3, device=env.device) + init_rot = ( + torch.eye(3, device=env.device) + .unsqueeze(0) + .repeat(num_instance, 1, 1) + ) + else: + # Use world origin as reference + init_pos = torch.zeros(num_instance, 3, device=env.device) + init_rot = ( + torch.eye(3, device=env.device).unsqueeze(0).repeat(num_instance, 1, 1) + ) + else: + # Absolute randomization, init values won't be used + init_pos = torch.zeros(num_instance, 3, device=env.device) + init_rot = ( + torch.eye(3, device=env.device).unsqueeze(0).repeat(num_instance, 1, 1) + ) + + # Generate random pose + pose = get_random_pose( + init_pos=init_pos, + init_rot=init_rot, + position_range=position_range, + rotation_range=rotation_range, + relative_position=relative_position, + relative_rotation=relative_rotation, + ) + + # Store in env state (to be exposed via get_info) + state_attr = f"_{store_key}s" + if not hasattr(env, state_attr): + setattr( + env, + state_attr, + torch.zeros(env.num_envs, 4, 4, device=env.device, dtype=torch.float32), + ) + + target_poses = getattr(env, state_attr) + target_poses[env_ids] = pose diff --git a/embodichain/lab/gym/envs/managers/reward_manager.py b/embodichain/lab/gym/envs/managers/reward_manager.py new file mode 100644 index 0000000..46d74d3 --- /dev/null +++ b/embodichain/lab/gym/envs/managers/reward_manager.py @@ -0,0 +1,241 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2025 DexForce Technology Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ---------------------------------------------------------------------------- + +"""Reward manager for orchestrating reward computation in reinforcement learning tasks.""" + +from __future__ import annotations + +import inspect +import torch +from collections.abc import Sequence +from prettytable import PrettyTable +from typing import TYPE_CHECKING, Union + +from embodichain.utils import logger +from .manager_base import ManagerBase +from .cfg import RewardCfg + +if TYPE_CHECKING: + from embodichain.lab.gym.envs import EmbodiedEnv + + +class RewardManager(ManagerBase): + """Manager for orchestrating reward computation in reinforcement learning tasks. + + The reward manager computes rewards based on the current state of the environment and actions. + It supports multiple reward terms that can be combined through weighted summation. + + The reward manager offers two modes of operation: + - `add`: This mode computes a reward term and adds it to the total reward (weighted by the term's weight). 
+ - `replace`: This mode replaces the total reward with the computed value (useful for single reward functions). + """ + + _env: EmbodiedEnv + """The environment instance.""" + + def __init__(self, cfg: object, env: EmbodiedEnv): + """Initialize the reward manager. + + Args: + cfg: A configuration object or dictionary (``dict[str, RewardCfg]``). + env: An environment object. + """ + + self._mode_functor_names: dict[str, list[str]] = dict() + self._mode_functor_cfgs: dict[str, list[RewardCfg]] = dict() + self._mode_class_functor_cfgs: dict[str, list[RewardCfg]] = dict() + + # call the base class (this will parse the functors config) + super().__init__(cfg, env) + + def __str__(self) -> str: + """Returns: A string representation for reward manager.""" + functor_num = sum(len(v) for v in self._mode_functor_names.values()) + msg = f" contains {functor_num} active reward terms.\n" + + # add info on each mode + for mode in self._mode_functor_names: + # create table for functor information + table = PrettyTable() + table.title = f"Active Reward Terms in Mode: '{mode}'" + + table.field_names = ["Index", "Name", "Weight"] + table.align["Name"] = "l" + for index, name in enumerate(self._mode_functor_names[mode]): + functor_cfg = self._mode_functor_cfgs[mode][index] + weight = functor_cfg.params.get("weight", 1.0) + table.add_row([index, name, f"{weight:.3f}"]) + + # convert table to string + msg += table.get_string() + msg += "\n" + + return msg + + """ + Properties. + """ + + @property + def active_functors(self) -> dict[str, list[str]]: + """Name of active reward functors. + + The keys are the modes of reward computation and the values are the names of the reward functors. + """ + return self._mode_functor_names + + """ + Operations. + """ + + def reset(self, env_ids: Union[Sequence[int], None] = None) -> dict[str, float]: + """Reset reward terms that are stateful (implemented as classes). + + Args: + env_ids: The environment indices to reset. If None, all environments are reset. + + Returns: + An empty dictionary (no logging needed for reset). + """ + # call all functors that are classes + for mode_cfg in self._mode_class_functor_cfgs.values(): + for functor_cfg in mode_cfg: + functor_cfg.func.reset(env_ids=env_ids) + + # nothing to log here + return {} + + def compute( + self, + obs: "EnvObs", + action: "EnvAction", + info: dict, + ) -> tuple[torch.Tensor, dict[str, torch.Tensor]]: + """Compute the total reward by calling each reward functor. + + This function iterates over all the reward functors and calls them to compute individual + reward terms. The terms are then combined according to their mode and weight. + + Args: + obs: The observation from the environment. + action: The action applied to the robot. + info: Additional information dictionary. + + Returns: + A tuple containing: + - total_reward: The total reward for each environment (shape: [num_envs]). + - reward_info: A dictionary mapping reward term names to their values for logging. + + Raises: + ValueError: If the mode is not supported. 
+ """ + # initialize total reward + total_reward = torch.zeros(self._env.num_envs, device=self._env.device) + reward_info = {} + + # iterate over all the reward functors + for mode, functor_cfgs in self._mode_functor_cfgs.items(): + for functor_name, functor_cfg in zip( + self._mode_functor_names[mode], functor_cfgs + ): + functor_cfg: RewardCfg + + # compute reward term + reward_term = functor_cfg.func( + self._env, obs=obs, action=action, info=info, **functor_cfg.params + ) + + # ensure reward is a tensor + if not isinstance(reward_term, torch.Tensor): + reward_term = torch.tensor( + reward_term, device=self._env.device, dtype=torch.float32 + ) + + # get weight from params or default to 1.0 + weight = functor_cfg.params.get("weight", 1.0) + weighted_reward = reward_term * weight + + # combine reward based on mode + if mode == "add": + total_reward += weighted_reward + elif mode == "replace": + total_reward = weighted_reward + else: + logger.log_error(f"Unsupported reward mode '{mode}'.") + + # store for logging (use unweighted value for clarity) + reward_info[functor_name] = reward_term + + return total_reward, reward_info + + def get_functor_cfg(self, functor_name: str) -> RewardCfg: + """Gets the configuration for the specified functor. + + The method finds the functor by name by searching through all the modes. + It then returns the configuration of the functor with the first matching name. + + Args: + functor_name: The name of the reward functor. + + Returns: + The configuration of the reward functor. + + Raises: + ValueError: If the functor name is not found. + """ + for mode, functors in self._mode_functor_names.items(): + if functor_name in functors: + return self._mode_functor_cfgs[mode][functors.index(functor_name)] + logger.log_error(f"Reward functor '{functor_name}' not found.") + + """ + Helper functions. + """ + + def _prepare_functors(self): + # check if config is dict already + if isinstance(self.cfg, dict): + cfg_items = self.cfg.items() + else: + cfg_items = self.cfg.__dict__.items() + # iterate over all the functors + for functor_name, functor_cfg in cfg_items: + # check for non config + if functor_cfg is None: + continue + # check for valid config type + if not isinstance(functor_cfg, RewardCfg): + raise TypeError( + f"Configuration for the functor '{functor_name}' is not of type RewardCfg." + f" Received: '{type(functor_cfg)}'." + ) + + # resolve common parameters + self._resolve_common_functor_cfg(functor_name, functor_cfg, min_argc=2) + + # check if mode is a new mode + if functor_cfg.mode not in self._mode_functor_names: + # add new mode + self._mode_functor_names[functor_cfg.mode] = list() + self._mode_functor_cfgs[functor_cfg.mode] = list() + self._mode_class_functor_cfgs[functor_cfg.mode] = list() + # add functor name and parameters + self._mode_functor_names[functor_cfg.mode].append(functor_name) + self._mode_functor_cfgs[functor_cfg.mode].append(functor_cfg) + + # check if the functor is a class + if inspect.isclass(functor_cfg.func): + self._mode_class_functor_cfgs[functor_cfg.mode].append(functor_cfg) diff --git a/embodichain/lab/gym/envs/managers/rewards.py b/embodichain/lab/gym/envs/managers/rewards.py new file mode 100644 index 0000000..8023280 --- /dev/null +++ b/embodichain/lab/gym/envs/managers/rewards.py @@ -0,0 +1,482 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2025 DexForce Technology Co., Ltd. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ---------------------------------------------------------------------------- + +"""Common reward functors for reinforcement learning tasks.""" + +from __future__ import annotations + +import torch +from typing import TYPE_CHECKING + +from embodichain.lab.gym.envs.managers.cfg import SceneEntityCfg + +if TYPE_CHECKING: + from embodichain.lab.gym.envs import EmbodiedEnv + + +def reward_from_obs( + env: EmbodiedEnv, + obs: dict, + action: torch.Tensor, + info: dict, + obs_key: str = "robot/qpos", + target_value: float = 0.0, + scale: float = 1.0, +) -> torch.Tensor: + """Reward based on observation values.""" + # Parse nested keys (e.g., "robot/qpos") + keys = obs_key.split("/") + value = obs + for key in keys: + if isinstance(value, dict) and key in value: + value = value[key] + else: + return torch.zeros(env.num_envs, device=env.device) + + # Compute distance to target + if isinstance(value, torch.Tensor): + if value.dim() > 1: + # Multiple values, compute norm + distance = torch.norm(value - target_value, dim=-1) + else: + distance = torch.abs(value - target_value) + reward = -scale * distance + else: + reward = torch.zeros(env.num_envs, device=env.device) + + return reward + + +def distance_between_objects( + env: EmbodiedEnv, + obs: dict, + action: torch.Tensor, + info: dict, + source_entity_cfg: SceneEntityCfg = None, + target_entity_cfg: SceneEntityCfg = None, + exponential: bool = False, + sigma: float = 1.0, +) -> torch.Tensor: + """Reward based on distance between two entities.""" + # get source entity position + source_obj = env.sim[source_entity_cfg.uid] + if hasattr(source_obj, "get_body_pose"): + source_pos = source_obj.get_body_pose(body_ids=source_entity_cfg.body_ids)[ + :, :3, 3 + ] + elif hasattr(source_obj, "get_local_pose"): + source_pos = source_obj.get_local_pose(to_matrix=True)[:, :3, 3] + else: + raise ValueError( + f"Entity '{source_entity_cfg.uid}' does not support position query." + ) + + # get target entity position + target_obj = env.sim[target_entity_cfg.uid] + if hasattr(target_obj, "get_body_pose"): + target_pos = target_obj.get_body_pose(body_ids=target_entity_cfg.body_ids)[ + :, :3, 3 + ] + elif hasattr(target_obj, "get_local_pose"): + target_pos = target_obj.get_local_pose(to_matrix=True)[:, :3, 3] + else: + raise ValueError( + f"Entity '{target_entity_cfg.uid}' does not support position query." 
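[Editor's note] A quick numeric look at the two shaping options that distance_between_objects offers, negative distance versus the Gaussian-style exp(-d^2 / (2 * sigma^2)). This is a standalone check with made-up distances, not project code.

# Compare the two distance shapings (standalone check, values approximate).
import torch

distance = torch.tensor([0.0, 0.1, 0.5, 1.0])
sigma = 1.0

negative = -distance                                        # unbounded, linear in distance
gaussian = torch.exp(-(distance ** 2) / (2 * sigma ** 2))   # bounded in (0, 1], equals 1 at zero distance

print(negative)   # roughly [ 0.0, -0.1, -0.5, -1.0]
print(gaussian)   # roughly [1.00, 0.995, 0.883, 0.607]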
+ ) + + # compute distance + distance = torch.norm(source_pos - target_pos, dim=-1) + + # compute reward + if exponential: + # exponential reward: exp(-distance^2 / (2 * sigma^2)) + reward = torch.exp(-(distance**2) / (2 * sigma**2)) + else: + # negative distance reward + reward = -distance + + return reward + + +def joint_velocity_penalty( + env: EmbodiedEnv, + obs: dict, + action: torch.Tensor, + info: dict, + robot_uid: str = "robot", + joint_ids: slice | list[int] = slice(None), +) -> torch.Tensor: + """Penalize large joint velocities.""" + robot = env.sim[robot_uid] + + # get joint velocities + qvel = robot.body_data.qvel[:, joint_ids] + + # compute L2 norm of joint velocities + velocity_norm = torch.norm(qvel, dim=-1) + + # negative penalty (higher velocity = more negative reward) + return -velocity_norm + + +def action_smoothness_penalty( + env: EmbodiedEnv, + obs: dict, + action: torch.Tensor, + info: dict, +) -> torch.Tensor: + """Penalize large changes in action between steps.""" + # compute difference between current and previous action + if hasattr(env, "_prev_actions"): + action_diff = action - env._prev_actions + penalty = -torch.norm(action_diff, dim=-1) + else: + # no previous action, no penalty + penalty = torch.zeros(env.num_envs, device=env.device) + + # store current action for next step + env._prev_actions = action.clone() + + return penalty + + +def joint_limit_penalty( + env: EmbodiedEnv, + obs: dict, + action: torch.Tensor, + info: dict, + robot_uid: str = "robot", + joint_ids: slice | list[int] = slice(None), + margin: float = 0.1, +) -> torch.Tensor: + """Penalize joints approaching their limits.""" + robot = env.sim[robot_uid] + + # get joint positions and limits + qpos = robot.body_data.qpos[:, joint_ids] + qpos_limits = robot.body_data.qpos_limits[:, joint_ids, :] + + # compute normalized position in range [0, 1] + qpos_normalized = (qpos - qpos_limits[:, :, 0]) / ( + qpos_limits[:, :, 1] - qpos_limits[:, :, 0] + ) + + # compute distance to limits (minimum of distance to lower and upper limit) + dist_to_lower = qpos_normalized + dist_to_upper = 1.0 - qpos_normalized + dist_to_limit = torch.min(dist_to_lower, dist_to_upper) + + # penalize joints within margin of limits + penalty_mask = dist_to_limit < margin + penalty = torch.where( + penalty_mask, + -(margin - dist_to_limit), # negative penalty + torch.zeros_like(dist_to_limit), + ) + + # sum over all joints + return penalty.sum(dim=-1) + + +def collision_penalty( + env: EmbodiedEnv, + obs: dict, + action: torch.Tensor, + info: dict, + robot_uid: str = "robot", + force_threshold: float = 1.0, +) -> torch.Tensor: + """Penalize collisions based on contact forces.""" + robot = env.sim[robot_uid] + + # get joint forces (torques) + qf = robot.body_data.qf + + # check if any joint force exceeds threshold + collision_detected = (torch.abs(qf) > force_threshold).any(dim=-1) + + # return penalty for collisions + penalty = torch.where( + collision_detected, + torch.full((env.num_envs,), -1.0, device=env.device), + torch.zeros(env.num_envs, device=env.device), + ) + + return penalty + + +def orientation_alignment_reward( + env: EmbodiedEnv, + obs: dict, + action: torch.Tensor, + info: dict, + source_entity_cfg: SceneEntityCfg = None, + target_entity_cfg: SceneEntityCfg = None, +) -> torch.Tensor: + """Reward alignment of orientations between two entities.""" + # get source entity rotation matrix + source_obj = env.sim[source_entity_cfg.uid] + if hasattr(source_obj, "get_body_pose"): + source_rot = 
source_obj.get_body_pose(body_ids=source_entity_cfg.body_ids)[ + :, :3, :3 + ] + elif hasattr(source_obj, "get_local_pose"): + source_rot = source_obj.get_local_pose(to_matrix=True)[:, :3, :3] + else: + raise ValueError( + f"Entity '{source_entity_cfg.uid}' does not support orientation query." + ) + + # get target entity rotation matrix + target_obj = env.sim[target_entity_cfg.uid] + if hasattr(target_obj, "get_body_pose"): + target_rot = target_obj.get_body_pose(body_ids=target_entity_cfg.body_ids)[ + :, :3, :3 + ] + elif hasattr(target_obj, "get_local_pose"): + target_rot = target_obj.get_local_pose(to_matrix=True)[:, :3, :3] + else: + raise ValueError( + f"Entity '{target_entity_cfg.uid}' does not support orientation query." + ) + + # compute rotation difference + rot_diff = torch.bmm(source_rot, target_rot.transpose(-1, -2)) + + # trace of rotation matrix difference (measure of alignment) + # trace = 1 + 2*cos(theta) for rotation by angle theta + # normalized to range [0, 1] where 1 is perfect alignment + trace = rot_diff.diagonal(dim1=-2, dim2=-1).sum(-1) + alignment = (trace - 1.0) / 2.0 # normalize to [-1, 1] + + return alignment + + +def success_reward( + env: EmbodiedEnv, + obs: dict, + action: torch.Tensor, + info: dict, + reward_value: float = 1.0, +) -> torch.Tensor: + """Sparse reward for task success.""" + # Check if success info is available in info dict + if "success" in info: + success = info["success"] + if isinstance(success, bool): + success = torch.tensor([success], device=env.device, dtype=torch.bool) + elif not isinstance(success, torch.Tensor): + success = torch.tensor(success, device=env.device, dtype=torch.bool) + else: + # No success info available + return torch.zeros(env.num_envs, device=env.device) + + # return reward + reward = torch.where( + success, + torch.full((env.num_envs,), reward_value, device=env.device), + torch.zeros(env.num_envs, device=env.device), + ) + + return reward + + +def reaching_behind_object_reward( + env: EmbodiedEnv, + obs: dict, + action: torch.Tensor, + info: dict, + end_effector_cfg: SceneEntityCfg = None, + object_cfg: SceneEntityCfg = None, + goal_cfg: SceneEntityCfg = None, + behind_offset: float = 0.015, + height_offset: float = 0.015, + distance_scale: float = 5.0, +) -> torch.Tensor: + """Reward for reaching behind an object along object-to-goal direction.""" + # get end effector position + ee_obj = env.sim[end_effector_cfg.uid] + if hasattr(ee_obj, "get_body_pose"): + ee_pos = ee_obj.get_body_pose(body_ids=end_effector_cfg.body_ids)[:, :3, 3] + elif hasattr(ee_obj, "get_local_pose"): + ee_pos = ee_obj.get_local_pose(to_matrix=True)[:, :3, 3] + else: + raise ValueError( + f"Entity '{end_effector_cfg.uid}' does not support position query." 
+ ) + + # get object position + obj = env.sim[object_cfg.uid] + if hasattr(obj, "get_body_pose"): + obj_pos = obj.get_body_pose(body_ids=object_cfg.body_ids)[:, :3, 3] + elif hasattr(obj, "get_local_pose"): + obj_pos = obj.get_local_pose(to_matrix=True)[:, :3, 3] + else: + raise ValueError(f"Entity '{object_cfg.uid}' does not support position query.") + + # get goal position + goal_obj = env.sim[goal_cfg.uid] + if hasattr(goal_obj, "get_body_pose"): + goal_pos = goal_obj.get_body_pose(body_ids=goal_cfg.body_ids)[:, :3, 3] + elif hasattr(goal_obj, "get_local_pose"): + goal_pos = goal_obj.get_local_pose(to_matrix=True)[:, :3, 3] + else: + raise ValueError(f"Entity '{goal_cfg.uid}' does not support position query.") + + # compute push direction (from object to goal) + push_direction = goal_pos - obj_pos + push_dir_norm = torch.norm(push_direction, dim=-1, keepdim=True) + 1e-6 + push_dir_normalized = push_direction / push_dir_norm + + # compute target "behind" position + height_vec = torch.tensor( + [0, 0, height_offset], device=env.device, dtype=torch.float32 + ) + target_pos = obj_pos - behind_offset * push_dir_normalized + height_vec + + # distance to target position + ee_to_target_dist = torch.norm(ee_pos - target_pos, dim=-1) + + # tanh-shaped reward (1.0 when at target, 0.0 when far) + reward = 1.0 - torch.tanh(distance_scale * ee_to_target_dist) + + return reward + + +def distance_to_target( + env: "EmbodiedEnv", + obs: dict, + action: torch.Tensor, + info: dict, + source_entity_cfg: SceneEntityCfg = None, + target_pose_key: str = "target_pose", + exponential: bool = False, + sigma: float = 1.0, + use_xy_only: bool = False, +) -> torch.Tensor: + """Reward based on distance to a virtual target pose from info.""" + # get source entity position + source_obj = env.sim[source_entity_cfg.uid] + if hasattr(source_obj, "get_body_pose"): + source_pos = source_obj.get_body_pose(body_ids=source_entity_cfg.body_ids)[ + :, :3, 3 + ] + elif hasattr(source_obj, "get_local_pose"): + source_pos = source_obj.get_local_pose(to_matrix=True)[:, :3, 3] + else: + raise ValueError( + f"Entity '{source_entity_cfg.uid}' does not support position query." + ) + + # get target position from info + if target_pose_key not in info: + raise ValueError( + f"Target pose '{target_pose_key}' not found in info dict. " + f"Make sure to provide it in get_info()." 
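[Editor's note] The geometry behind reaching_behind_object_reward and the progress shaping used by incremental_distance_to_target, condensed into a standalone sketch with made-up positions. Pure torch, no scene objects; the offsets and scales are the defaults from the functions above.

# Standalone sketch: "behind the cube" target point plus tanh-shaped reaching and progress rewards.
import torch

obj_pos = torch.tensor([[0.40, 0.00, 0.02]])
goal_pos = torch.tensor([[0.70, 0.10, 0.02]])
ee_pos = torch.tensor([[0.35, -0.02, 0.05]])

behind_offset, height_offset, distance_scale = 0.015, 0.015, 5.0

# Reach a point slightly behind the object along the object-to-goal direction.
push_dir = goal_pos - obj_pos
push_dir = push_dir / (torch.norm(push_dir, dim=-1, keepdim=True) + 1e-6)
target = obj_pos - behind_offset * push_dir + torch.tensor([0.0, 0.0, height_offset])
reaching = 1.0 - torch.tanh(distance_scale * torch.norm(ee_pos - target, dim=-1))

# Incremental progress: reward the change in xy distance to goal, asymmetrically weighted.
prev_dist = torch.tensor([0.32])
curr_dist = torch.norm(obj_pos[:, :2] - goal_pos[:, :2], dim=-1)
delta = torch.tanh(10.0 * (prev_dist - curr_dist))
progress = torch.where(delta >= 0, 2.0 * delta, 0.5 * delta)

print(reaching, progress)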
+ ) + + target_poses = info[target_pose_key] + if target_poses.dim() == 2: # (num_envs, 3) + target_pos = target_poses + else: # (num_envs, 4, 4) + target_pos = target_poses[:, :3, 3] + + # compute distance + if use_xy_only: + distance = torch.norm(source_pos[:, :2] - target_pos[:, :2], dim=-1) + else: + distance = torch.norm(source_pos - target_pos, dim=-1) + + # compute reward + if exponential: + # exponential reward: exp(-distance^2 / (2 * sigma^2)) + reward = torch.exp(-(distance**2) / (2 * sigma**2)) + else: + # negative distance reward + reward = -distance + + return reward + + +def incremental_distance_to_target( + env: "EmbodiedEnv", + obs: dict, + action: torch.Tensor, + info: dict, + source_entity_cfg: SceneEntityCfg = None, + target_pose_key: str = "target_pose", + tanh_scale: float = 10.0, + positive_weight: float = 1.0, + negative_weight: float = 1.0, + use_xy_only: bool = False, +) -> torch.Tensor: + """Incremental reward for progress toward a virtual target pose from info.""" + # get source entity position + source_obj = env.sim[source_entity_cfg.uid] + if hasattr(source_obj, "get_body_pose"): + source_pos = source_obj.get_body_pose(body_ids=source_entity_cfg.body_ids)[ + :, :3, 3 + ] + elif hasattr(source_obj, "get_local_pose"): + source_pos = source_obj.get_local_pose(to_matrix=True)[:, :3, 3] + else: + raise ValueError( + f"Entity '{source_entity_cfg.uid}' does not support position query." + ) + + # get target position from info + if target_pose_key not in info: + raise ValueError( + f"Target pose '{target_pose_key}' not found in info dict. " + f"Make sure to provide it in get_info()." + ) + + target_poses = info[target_pose_key] + if target_poses.dim() == 2: # (num_envs, 3) + target_pos = target_poses + else: # (num_envs, 4, 4) + target_pos = target_poses[:, :3, 3] + + # compute current distance + if use_xy_only: + current_dist = torch.norm(source_pos[:, :2] - target_pos[:, :2], dim=-1) + else: + current_dist = torch.norm(source_pos - target_pos, dim=-1) + + # initialize previous distance on first call + prev_dist_key = f"_prev_dist_{source_entity_cfg.uid}_{target_pose_key}" + if not hasattr(env, prev_dist_key): + setattr(env, prev_dist_key, current_dist.clone()) + return torch.zeros(env.num_envs, device=env.device) + + # compute distance delta (positive = getting closer) + prev_dist = getattr(env, prev_dist_key) + distance_delta = prev_dist - current_dist + + # apply tanh shaping + distance_delta_normalized = torch.tanh(tanh_scale * distance_delta) + + # asymmetric weighting + reward = torch.where( + distance_delta_normalized >= 0, + positive_weight * distance_delta_normalized, + negative_weight * distance_delta_normalized, + ) + + # update previous distance + setattr(env, prev_dist_key, current_dist.clone()) + + return reward diff --git a/embodichain/lab/gym/envs/tasks/rl/push_cube.py b/embodichain/lab/gym/envs/tasks/rl/push_cube.py index 4aef16c..01e5074 100644 --- a/embodichain/lab/gym/envs/tasks/rl/push_cube.py +++ b/embodichain/lab/gym/envs/tasks/rl/push_cube.py @@ -44,19 +44,12 @@ def __init__(self, cfg=None, **kwargs): defaults = { "success_threshold": 0.1, - "reaching_reward_weight": 0.1, - "place_reward_weight": 2.0, - "place_penalty_weight": 0.5, - "action_penalty_weight": 0.01, - "success_bonus_weight": 10.0, } for name, default in defaults.items(): value = extensions.get(name, getattr(cfg, name, default)) setattr(cfg, name, value) setattr(self, name, getattr(cfg, name)) - self.last_cube_goal_dist = None - super().__init__(cfg, **kwargs) def 
_draw_goal_marker(self): @@ -104,15 +97,6 @@ def _initialize_episode( self, env_ids: Sequence[int] | None = None, **kwargs ) -> None: super()._initialize_episode(env_ids=env_ids, **kwargs) - cube = self.sim.get_rigid_object("cube") - - # Calculate previous distance (for incremental reward) based on current (possibly randomized) pose - cube_pos = cube.body_data.pose[:, :3] - goal_sphere = self.sim.get_rigid_object("goal_sphere") - goal_pos = goal_sphere.body_data.pose[ - :, :3 - ] # Get actual goal positions for each environment - self.last_cube_goal_dist = torch.norm(cube_pos[:, :2] - goal_pos[:, :2], dim=1) # Draw marker at goal position # self._draw_goal_marker() @@ -128,84 +112,27 @@ def _step_action(self, action: EnvAction) -> EnvAction: self.robot.set_qpos(qpos=target_qpos) return scaled_action - def get_obs(self, **kwargs) -> EnvObs: - qpos_all = self.robot.body_data.qpos[:, :6] - ee_pose_matrix = self.robot.compute_fk( - name="arm", qpos=qpos_all, to_matrix=True - ) - ee_pos_all = ee_pose_matrix[:, :3, 3] + def get_info(self, **kwargs) -> Dict[str, Any]: cube = self.sim.get_rigid_object("cube") - cube_pos_all = cube.body_data.pose[:, :3] - # Get actual goal positions for each environment - goal_sphere = self.sim.get_rigid_object("goal_sphere") - goal_pos_all = goal_sphere.body_data.pose[:, :3] - if self.obs_mode == "state": - return torch.cat([qpos_all, ee_pos_all, cube_pos_all, goal_pos_all], dim=1) - return { - "robot": {"qpos": qpos_all, "ee_pos": ee_pos_all}, - "object": {"cube_pos": cube_pos_all, "goal_pos": goal_pos_all}, - } + cube_pos = cube.body_data.pose[:, :3] - def get_reward( - self, obs: EnvObs, action: EnvAction, info: Dict[str, Any] - ) -> torch.Tensor: - if self.obs_mode == "state": - ee_pos = obs[:, 6:9] - cube_pos = obs[:, 9:12] - goal_pos = obs[:, 12:15] + # Get virtual goal pose from env state (set by randomize_target_pose event) + if hasattr(self, "_goal_poses"): + goal_pos = self._goal_poses[:, :3, 3] else: - ee_pos = obs["robot"]["ee_pos"] - cube_pos = obs["object"]["cube_pos"] - goal_pos = obs["object"]["goal_pos"] - push_direction = goal_pos - cube_pos - push_dir_norm = torch.norm(push_direction, dim=1, keepdim=True) + 1e-6 - push_dir_normalized = push_direction / push_dir_norm - push_pose = ( - cube_pos - - 0.015 * push_dir_normalized - + torch.tensor([0, 0, 0.015], device=self.device, dtype=torch.float32) - ) - ee_to_push_dist = torch.norm(ee_pos - push_pose, dim=1) - reaching_reward_raw = 1.0 - torch.tanh(5.0 * ee_to_push_dist) - reaching_reward = self.reaching_reward_weight * reaching_reward_raw - cube_to_goal_dist = torch.norm(cube_pos[:, :2] - goal_pos[:, :2], dim=1) - distance_delta = 10.0 * (self.last_cube_goal_dist - cube_to_goal_dist) - distance_delta_normalized = torch.tanh(distance_delta) - place_reward = torch.where( - distance_delta_normalized >= 0, - self.place_reward_weight * distance_delta_normalized, - self.place_penalty_weight * distance_delta_normalized, - ) - self.last_cube_goal_dist = cube_to_goal_dist - action_magnitude = torch.norm(action, dim=1) - action_penalty = -self.action_penalty_weight * action_magnitude - success_bonus_raw = info["success"].float() - success_bonus = self.success_bonus_weight * success_bonus_raw - reward = reaching_reward + place_reward + action_penalty + success_bonus - # Organize reward components in a dedicated "rewards" dict - # This allows trainer to easily identify and log reward components - if "rewards" not in info: - info["rewards"] = {} - info["rewards"]["reaching_reward"] = reaching_reward - 
info["rewards"]["place_reward"] = place_reward - info["rewards"]["action_penalty"] = action_penalty - info["rewards"]["success_bonus"] = success_bonus - return reward + # Fallback: no virtual goal set + goal_pos = torch.zeros_like(cube_pos) - def get_info(self, **kwargs) -> Dict[str, Any]: - cube = self.sim.get_rigid_object("cube") - cube_pos = cube.body_data.pose[:, :3] - # Get actual goal positions for each environment - goal_sphere = self.sim.get_rigid_object("goal_sphere") - goal_pos = goal_sphere.body_data.pose[:, :3] xy_distance = torch.norm(cube_pos[:, :2] - goal_pos[:, :2], dim=1) is_success = xy_distance < self.success_threshold + info = { "success": is_success, "fail": torch.zeros( self.cfg.num_envs, device=self.device, dtype=torch.bool ), "elapsed_steps": self._elapsed_steps, + "goal_pose": self._goal_poses if hasattr(self, "_goal_poses") else None, } info["metrics"] = { "distance_to_goal": xy_distance, diff --git a/embodichain/lab/gym/utils/gym_utils.py b/embodichain/lab/gym/utils/gym_utils.py index 6ebb6d6..aa7123f 100644 --- a/embodichain/lab/gym/utils/gym_utils.py +++ b/embodichain/lab/gym/utils/gym_utils.py @@ -546,6 +546,50 @@ class ComponentCfg: setattr(env_cfg.observations, obs_name, observation) + env_cfg.rewards = ComponentCfg() + if "rewards" in config["env"]: + # Define modules to search for reward functions + reward_modules = [ + "embodichain.lab.gym.envs.managers.rewards", + ] + + for reward_name, reward_params in config["env"]["rewards"].items(): + reward_params_modified = deepcopy(reward_params) + + # Handle entity_cfg parameters + for param_key in [ + "entity_cfg", + "source_entity_cfg", + "target_entity_cfg", + "end_effector_cfg", + "object_cfg", + "goal_cfg", + "reference_entity_cfg", + ]: + if param_key in reward_params["params"]: + entity_cfg = SceneEntityCfg( + **reward_params_modified["params"][param_key] + ) + reward_params_modified["params"][param_key] = entity_cfg + + # Find the function from multiple modules using the utility function + reward_func = find_function_from_modules( + reward_params["func"], + reward_modules, + raise_if_not_found=True, + ) + + from embodichain.lab.gym.envs.managers import RewardCfg + + reward = RewardCfg( + func=reward_func, + mode=reward_params_modified["mode"], + name=reward_params_modified["name"], + params=reward_params_modified["params"], + ) + + setattr(env_cfg.rewards, reward_name, reward) + return env_cfg From f2a1d19d9b946235a627626661583b71d3797ecf Mon Sep 17 00:00:00 2001 From: yuanhaonan Date: Fri, 16 Jan 2026 16:07:47 +0800 Subject: [PATCH 2/9] fix obs --- configs/agents/rl/push_cube/gym_config.json | 32 +++++----- embodichain/lab/gym/envs/managers/cfg.py | 7 +++ .../lab/gym/envs/managers/observations.py | 59 +++++++++++++++---- .../lab/gym/envs/managers/reward_manager.py | 7 +-- 4 files changed, 70 insertions(+), 35 deletions(-) diff --git a/configs/agents/rl/push_cube/gym_config.json b/configs/agents/rl/push_cube/gym_config.json index b4a5cd0..280f86c 100644 --- a/configs/agents/rl/push_cube/gym_config.json +++ b/configs/agents/rl/push_cube/gym_config.json @@ -26,35 +26,32 @@ }, "observations": { "robot_qpos": { - "func": "embodichain.lab.gym.envs.managers.observations:normalize_robot_joint_data", - "mode": "concatenate", + "func": "normalize_robot_joint_data", + "mode": "add", "name": "robot/qpos", "params": { - "robot_uid": "Manipulator", - "joint_ids": [0, 1, 2, 3, 4, 5], - "data_type": "qpos" + "joint_ids": [0, 1, 2, 3, 4, 5] } }, "robot_ee_pos": { - "func": 
"embodichain.lab.gym.envs.managers.observations:get_robot_ee_pose", - "mode": "concatenate", + "func": "get_robot_ee_pose", + "mode": "add", "name": "robot/ee_pos", "params": { - "robot_uid": "Manipulator", "part_name": "arm" } }, "cube_pos": { - "func": "embodichain.lab.gym.envs.managers.observations:get_rigid_object_pose", - "mode": "concatenate", + "func": "get_rigid_object_pose", + "mode": "add", "name": "object/cube_pos", "params": { "entity_cfg": {"uid": "cube"} } }, "goal_pos": { - "func": "embodichain.lab.gym.envs.managers.observations:virtual_target_position", - "mode": "concatenate", + "func": "target_position", + "mode": "add", "name": "object/goal_pos", "params": { "target_pose_key": "goal_pose" @@ -66,8 +63,8 @@ "func": "reaching_behind_object_reward", "mode": "add", "name": "reaching", + "weight": 0.1, "params": { - "weight": 0.1, "end_effector_cfg": {"uid": "Manipulator", "body_ids": "ee_link"}, "object_cfg": {"uid": "cube"}, "goal_cfg": {"uid": "goal_sphere"}, @@ -80,8 +77,8 @@ "func": "incremental_distance_to_target", "mode": "add", "name": "place", + "weight": 1.0, "params": { - "weight": 1.0, "source_entity_cfg": {"uid": "cube"}, "target_pose_key": "goal_pose", "tanh_scale": 10.0, @@ -94,16 +91,15 @@ "func": "action_smoothness_penalty", "mode": "add", "name": "action_penalty", - "params": { - "weight": 0.01 - } + "weight": 0.01, + "params": {} }, "success_bonus": { "func": "success_reward", "mode": "add", "name": "success", + "weight": 10.0, "params": { - "weight": 10.0, "reward_value": 1.0 } } diff --git a/embodichain/lab/gym/envs/managers/cfg.py b/embodichain/lab/gym/envs/managers/cfg.py index 8da8f4e..eeacc86 100644 --- a/embodichain/lab/gym/envs/managers/cfg.py +++ b/embodichain/lab/gym/envs/managers/cfg.py @@ -333,6 +333,13 @@ class RewardCfg(FunctorCfg): the reward term represents, e.g., "distance_to_goal", "gripper_close", "collision_penalty". """ + weight: float = 1.0 + """The weight multiplier for this reward term. + + This value is used to scale the reward before adding it to the total reward. + Default is 1.0 (no scaling). + """ + @configclass class DatasetFunctorCfg(FunctorCfg): diff --git a/embodichain/lab/gym/envs/managers/observations.py b/embodichain/lab/gym/envs/managers/observations.py index e54f362..306d993 100644 --- a/embodichain/lab/gym/envs/managers/observations.py +++ b/embodichain/lab/gym/envs/managers/observations.py @@ -252,6 +252,52 @@ def compute_semantic_mask( return torch.stack(masks, dim=-1) +def get_robot_ee_pose( + env: "EmbodiedEnv", + obs: EnvObs, + part_name: str | None = None, + position_only: bool = False, +) -> torch.Tensor: + """Get robot end-effector pose using forward kinematics. + + Args: + env: The environment instance. + obs: The observation dictionary. + robot_uid: The uid of the robot. If None, uses env.robot. + part_name: The name of the control part. If None, uses default part. + position_only: If True, returns only position (3D). If False, returns full pose (4x4 matrix). + + Returns: + A tensor of shape (num_envs, 3) if position_only=True, or (num_envs, 4, 4) otherwise. 
+ """ + robot = env.robot + + if part_name is not None: + joint_ids = robot.get_joint_ids(part_name) + qpos = robot.body_data.qpos[:, joint_ids] + ee_pose = robot.compute_fk(name=part_name, qpos=qpos, to_matrix=True) + else: + qpos = robot.get_qpos() + ee_pose = robot.compute_fk(qpos=qpos, to_matrix=True) + + if position_only: + return ee_pose[:, :3, 3] + return ee_pose + + +def target_position( + env: "EmbodiedEnv", + obs: EnvObs, + target_pose_key: str = "goal_pose", +) -> torch.Tensor: + """Get virtual target position from env state.""" + state_attr = f"_{target_pose_key}s" + if hasattr(env, state_attr): + target_poses = getattr(env, state_attr) + return target_poses[:, :3, 3] + return torch.zeros(env.num_envs, 3, device=env.device) + + class compute_exteroception(Functor): """Compute the exteroception for the observation space. @@ -713,16 +759,3 @@ def __call__( exteroception[sensor_uid] = projected_kpnts return exteroception - - -def virtual_target_position( - env: "EmbodiedEnv", - obs: EnvObs, - target_pose_key: str = "goal_pose", -) -> torch.Tensor: - """Get virtual target position from env state.""" - state_attr = f"_{target_pose_key}s" - if hasattr(env, state_attr): - target_poses = getattr(env, state_attr) - return target_poses[:, :3, 3] - return torch.zeros(env.num_envs, 3, device=env.device) diff --git a/embodichain/lab/gym/envs/managers/reward_manager.py b/embodichain/lab/gym/envs/managers/reward_manager.py index 46d74d3..8f282df 100644 --- a/embodichain/lab/gym/envs/managers/reward_manager.py +++ b/embodichain/lab/gym/envs/managers/reward_manager.py @@ -164,9 +164,8 @@ def compute( reward_term, device=self._env.device, dtype=torch.float32 ) - # get weight from params or default to 1.0 - weight = functor_cfg.params.get("weight", 1.0) - weighted_reward = reward_term * weight + # apply weight from config + weighted_reward = reward_term * functor_cfg.weight # combine reward based on mode if mode == "add": @@ -224,7 +223,7 @@ def _prepare_functors(self): ) # resolve common parameters - self._resolve_common_functor_cfg(functor_name, functor_cfg, min_argc=2) + self._resolve_common_functor_cfg(functor_name, functor_cfg, min_argc=4) # check if mode is a new mode if functor_cfg.mode not in self._mode_functor_names: From 548c1b10818804f0403a38c2d31c6c44d7af7ac3 Mon Sep 17 00:00:00 2001 From: yuanhaonan Date: Mon, 19 Jan 2026 11:28:47 +0800 Subject: [PATCH 3/9] fix obs_dim with flatten dict input --- configs/agents/rl/push_cube/gym_config.json | 8 +-- embodichain/agents/rl/algo/ppo.py | 6 +- embodichain/agents/rl/train.py | 18 +++--- embodichain/agents/rl/utils/__init__.py | 2 + embodichain/agents/rl/utils/helper.py | 52 ++++++++++++++++ embodichain/agents/rl/utils/trainer.py | 26 ++++---- embodichain/lab/gym/envs/base_env.py | 16 +++++ embodichain/lab/gym/envs/managers/cfg.py | 4 +- embodichain/lab/gym/envs/managers/rewards.py | 62 +++++++------------ embodichain/lab/gym/envs/tasks/rl/__init__.py | 4 +- .../lab/gym/envs/tasks/rl/push_cube.py | 25 -------- embodichain/lab/gym/utils/gym_utils.py | 19 +----- 12 files changed, 133 insertions(+), 109 deletions(-) create mode 100644 embodichain/agents/rl/utils/helper.py diff --git a/configs/agents/rl/push_cube/gym_config.json b/configs/agents/rl/push_cube/gym_config.json index 280f86c..87858e3 100644 --- a/configs/agents/rl/push_cube/gym_config.json +++ b/configs/agents/rl/push_cube/gym_config.json @@ -27,7 +27,7 @@ "observations": { "robot_qpos": { "func": "normalize_robot_joint_data", - "mode": "add", + "mode": "modify", "name": 
"robot/qpos", "params": { "joint_ids": [0, 1, 2, 3, 4, 5] @@ -65,12 +65,12 @@ "name": "reaching", "weight": 0.1, "params": { - "end_effector_cfg": {"uid": "Manipulator", "body_ids": "ee_link"}, "object_cfg": {"uid": "cube"}, - "goal_cfg": {"uid": "goal_sphere"}, + "target_pose_key": "goal_pose", "behind_offset": 0.015, "height_offset": 0.015, - "distance_scale": 5.0 + "distance_scale": 5.0, + "part_name": "arm" } }, "place_reward": { diff --git a/embodichain/agents/rl/algo/ppo.py b/embodichain/agents/rl/algo/ppo.py index afd636e..5253e89 100644 --- a/embodichain/agents/rl/algo/ppo.py +++ b/embodichain/agents/rl/algo/ppo.py @@ -17,7 +17,7 @@ import torch from typing import Dict, Any, Tuple, Callable -from embodichain.agents.rl.utils import AlgorithmCfg +from embodichain.agents.rl.utils import AlgorithmCfg, flatten_dict_observation from embodichain.agents.rl.buffer import RolloutBuffer from embodichain.utils import configclass from .base import BaseAlgorithm @@ -102,6 +102,10 @@ def collect_rollout( reward = reward.float() done = done.bool() + # Flatten dict observation from ObservationManager if needed + if isinstance(next_obs, dict): + next_obs = flatten_dict_observation(next_obs) + # Add to buffer self.buffer.add(current_obs, actions, reward, done, value, log_prob) diff --git a/embodichain/agents/rl/train.py b/embodichain/agents/rl/train.py index e87d462..4d95ff4 100644 --- a/embodichain/agents/rl/train.py +++ b/embodichain/agents/rl/train.py @@ -32,7 +32,7 @@ from embodichain.agents.rl.utils.trainer import Trainer from embodichain.utils import logger from embodichain.lab.gym.envs.tasks.rl import build_env -from embodichain.lab.gym.utils.gym_utils import config_to_rl_cfg +from embodichain.lab.gym.utils.gym_utils import config_to_cfg from embodichain.utils.utility import load_json from embodichain.utils.module_utils import find_function_from_modules from embodichain.lab.sim import SimulationManagerCfg @@ -120,7 +120,7 @@ def main(): logger.log_info(f"Current working directory: {Path.cwd()}") gym_config_data = load_json(str(gym_config_path)) - gym_env_cfg = config_to_rl_cfg(gym_config_data) + gym_env_cfg = config_to_cfg(gym_config_data) # Ensure sim configuration mirrors runtime overrides if gym_env_cfg.sim_cfg is None: @@ -137,22 +137,24 @@ def main(): gym_env_cfg.sim_cfg.headless = headless logger.log_info( - f"Loaded gym_config from {gym_config_path} (env_id={gym_env_cfg.env_id}, headless={gym_env_cfg.sim_cfg.headless}, sim_device={gym_env_cfg.sim_cfg.sim_device})" + f"Loaded gym_config from {gym_config_path} (env_id={gym_config_data['id']}, headless={gym_env_cfg.sim_cfg.headless}, sim_device={gym_env_cfg.sim_cfg.sim_device})" ) - env = build_env(gym_env_cfg.env_id, base_env_cfg=gym_env_cfg) + env = build_env(gym_config_data["id"], base_env_cfg=gym_env_cfg) eval_gym_env_cfg = deepcopy(gym_env_cfg) eval_gym_env_cfg.num_envs = 4 eval_gym_env_cfg.sim_cfg.headless = True - eval_env = build_env(eval_gym_env_cfg.env_id, base_env_cfg=eval_gym_env_cfg) + eval_env = build_env(gym_config_data["id"], base_env_cfg=eval_gym_env_cfg) # Build Policy via registry policy_name = policy_block["name"] # Build Policy via registry (actor/critic must be explicitly defined in JSON when using actor_critic) if policy_name.lower() == "actor_critic": - obs_dim = env.observation_space.shape[-1] + # Get observation dimension from flattened observation space + # flattened_observation_space returns Box space for RL training + obs_dim = env.flattened_observation_space.shape[-1] action_dim = env.action_space.shape[-1] 
actor_cfg = policy_block.get("actor") @@ -167,7 +169,7 @@ def main(): policy = build_policy( policy_block, - env.observation_space, + env.flattened_observation_space, env.action_space, device, actor=actor, @@ -175,7 +177,7 @@ def main(): ) else: policy = build_policy( - policy_block, env.observation_space, env.action_space, device + policy_block, env.flattened_observation_space, env.action_space, device ) # Build Algorithm via factory diff --git a/embodichain/agents/rl/utils/__init__.py b/embodichain/agents/rl/utils/__init__.py index f6f9f4f..e6f9e57 100644 --- a/embodichain/agents/rl/utils/__init__.py +++ b/embodichain/agents/rl/utils/__init__.py @@ -15,7 +15,9 @@ # ---------------------------------------------------------------------------- from .config import AlgorithmCfg +from .helper import flatten_dict_observation __all__ = [ "AlgorithmCfg", + "flatten_dict_observation", ] diff --git a/embodichain/agents/rl/utils/helper.py b/embodichain/agents/rl/utils/helper.py new file mode 100644 index 0000000..3021a31 --- /dev/null +++ b/embodichain/agents/rl/utils/helper.py @@ -0,0 +1,52 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2025 DexForce Technology Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ---------------------------------------------------------------------------- + +import torch + + +def flatten_dict_observation(input_dict: dict) -> torch.Tensor: + """ + Flatten hierarchical dict observations from ObservationManager. + + Recursively traverse nested dicts, collect all tensor values, + flatten each to (num_envs, -1), and concatenate in sorted key order. + + Args: + input_dict: Nested dict structure, e.g. 
{"robot": {"qpos": tensor, "ee_pos": tensor}, "object": {...}} + + Returns: + Concatenated flat tensor of shape (num_envs, total_dim) + """ + obs_list = [] + + def _collect_tensors(d, prefix=""): + """Recursively collect tensors from nested dicts in sorted order.""" + for key in sorted(d.keys()): + full_key = f"{prefix}/{key}" if prefix else key + value = d[key] + if isinstance(value, dict): + _collect_tensors(value, full_key) + elif isinstance(value, torch.Tensor): + # Flatten tensor to (num_envs, -1) shape + obs_list.append(value.flatten(start_dim=1)) + + _collect_tensors(input_dict) + + if not obs_list: + raise ValueError("No tensors found in observation dict") + + result = torch.cat(obs_list, dim=-1) + return result diff --git a/embodichain/agents/rl/utils/trainer.py b/embodichain/agents/rl/utils/trainer.py index 88c0490..6d38800 100644 --- a/embodichain/agents/rl/utils/trainer.py +++ b/embodichain/agents/rl/utils/trainer.py @@ -25,6 +25,7 @@ import wandb from embodichain.lab.gym.envs.managers.event_manager import EventManager +from .helper import flatten_dict_observation class Trainer: @@ -74,22 +75,22 @@ def __init__( # initial obs (assume env returns torch tensors already on target device) obs, _ = self.env.reset() - self.obs = obs # Initialize algorithm's buffer - self.observation_space = getattr(self.env, "observation_space", None) - self.action_space = getattr(self.env, "action_space", None) - obs_dim = ( - self.observation_space.shape[-1] - if self.observation_space - else self.obs.shape[-1] - ) - action_dim = self.action_space.shape[-1] if self.action_space else None + # Flatten dict observations from ObservationManager to tensor for RL algorithms + if isinstance(obs, dict): + obs_tensor = flatten_dict_observation(obs) + obs_dim = obs_tensor.shape[-1] + num_envs = obs_tensor.shape[0] + # Store flattened observation for RL training + self.obs = obs_tensor + + action_space = getattr(self.env, "action_space", None) + action_dim = action_space.shape[-1] if action_space else None if action_dim is None: raise RuntimeError( "Env must expose action_space with shape for buffer initialization." 
) - num_envs = self.obs.shape[0] if self.obs.ndim == 2 else 1 # Algorithm manages its own buffer self.algorithm.initialize_buffer(num_steps, num_envs, obs_dim, action_dim) @@ -160,8 +161,9 @@ def on_step(obs, actions, reward, done, info, next_obs): self.curr_len[done_idx] = 0 # Update global step and observation + # next_obs is already flattened in algorithm's collect_rollout self.obs = next_obs - self.global_step += next_obs.shape[0] if next_obs.ndim == 2 else 1 + self.global_step += next_obs.shape[0] if isinstance(info, dict): rewards_dict = info.get("rewards") @@ -226,6 +228,8 @@ def _eval_once(self, num_episodes: int = 5): returns = [] for _ in range(num_episodes): obs, _ = self.eval_env.reset() + obs = flatten_dict_observation(obs) + done_any = torch.zeros( obs.shape[0] if obs.ndim == 2 else 1, dtype=torch.bool, diff --git a/embodichain/lab/gym/envs/base_env.py b/embodichain/lab/gym/envs/base_env.py index 72e1f40..1c28f51 100644 --- a/embodichain/lab/gym/envs/base_env.py +++ b/embodichain/lab/gym/envs/base_env.py @@ -15,6 +15,7 @@ # ---------------------------------------------------------------------------- import torch +import numpy as np import gymnasium as gym from typing import Dict, List, Union, Tuple, Any, Sequence @@ -173,6 +174,21 @@ def observation_space(self) -> gym.spaces.Space: self.single_observation_space, n=self.num_envs ) + @cached_property + def flattened_observation_space(self) -> gym.spaces.Box: + """Flattened observation space for RL training. + + Returns a Box space by computing total dimensions from nested dict observations. + This is needed because RL algorithms (PPO, SAC, etc.) require flat vector inputs. + """ + from embodichain.agents.rl.utils.helper import flatten_dict_observation + + flattened_obs = flatten_dict_observation(self._init_raw_obs) + total_dim = flattened_obs.shape[-1] + return gym.spaces.Box( + low=-np.inf, high=np.inf, shape=(total_dim,), dtype=np.float32 + ) + @cached_property def action_space(self) -> gym.spaces.Space: if self.num_envs == 1: diff --git a/embodichain/lab/gym/envs/managers/cfg.py b/embodichain/lab/gym/envs/managers/cfg.py index eeacc86..3161fb8 100644 --- a/embodichain/lab/gym/envs/managers/cfg.py +++ b/embodichain/lab/gym/envs/managers/cfg.py @@ -231,7 +231,7 @@ def resolve(self, scene: SimulationManager): def _resolve_joint_names(self, scene: SimulationManager): # convert joint names to indices based on regex if self.joint_names is not None or self.joint_ids != slice(None): - entity: Articulation = scene[self.uid] + entity: Articulation = scene.get_articulation(self.uid) # -- if both are not their default values, check if they are valid if self.joint_names is not None and self.joint_ids != slice(None): if isinstance(self.joint_names, str): @@ -272,7 +272,7 @@ def _resolve_joint_names(self, scene: SimulationManager): def _resolve_body_names(self, scene: SimulationManager): # convert body names to indices based on regex if self.body_names is not None or self.body_ids != slice(None): - entity: RigidObject = scene[self.uid] + entity: RigidObject = scene.get_rigid_object(self.uid) # -- if both are not their default values, check if they are valid if self.body_names is not None and self.body_ids != slice(None): if isinstance(self.body_names, str): diff --git a/embodichain/lab/gym/envs/managers/rewards.py b/embodichain/lab/gym/envs/managers/rewards.py index 8023280..d8892ef 100644 --- a/embodichain/lab/gym/envs/managers/rewards.py +++ b/embodichain/lab/gym/envs/managers/rewards.py @@ -299,42 +299,37 @@ def 
reaching_behind_object_reward( obs: dict, action: torch.Tensor, info: dict, - end_effector_cfg: SceneEntityCfg = None, object_cfg: SceneEntityCfg = None, - goal_cfg: SceneEntityCfg = None, + target_pose_key: str = "goal_pose", behind_offset: float = 0.015, height_offset: float = 0.015, distance_scale: float = 5.0, + part_name: str = None, ) -> torch.Tensor: """Reward for reaching behind an object along object-to-goal direction.""" - # get end effector position - ee_obj = env.sim[end_effector_cfg.uid] - if hasattr(ee_obj, "get_body_pose"): - ee_pos = ee_obj.get_body_pose(body_ids=end_effector_cfg.body_ids)[:, :3, 3] - elif hasattr(ee_obj, "get_local_pose"): - ee_pos = ee_obj.get_local_pose(to_matrix=True)[:, :3, 3] - else: + # get end effector position from robot FK + robot = env.robot + joint_ids = robot.get_joint_ids(part_name) + qpos = robot.get_qpos()[:, joint_ids] + ee_pose = robot.compute_fk(name=part_name, qpos=qpos, to_matrix=True) + ee_pos = ee_pose[:, :3, 3] + + # get object position + obj = env.sim.get_rigid_object(object_cfg.uid) + obj_pos = obj.get_local_pose(to_matrix=True)[:, :3, 3] + + # get goal position from info + if target_pose_key not in info: raise ValueError( - f"Entity '{end_effector_cfg.uid}' does not support position query." + f"Target pose '{target_pose_key}' not found in info dict. " + f"Make sure to provide it in get_info()." ) - # get object position - obj = env.sim[object_cfg.uid] - if hasattr(obj, "get_body_pose"): - obj_pos = obj.get_body_pose(body_ids=object_cfg.body_ids)[:, :3, 3] - elif hasattr(obj, "get_local_pose"): - obj_pos = obj.get_local_pose(to_matrix=True)[:, :3, 3] - else: - raise ValueError(f"Entity '{object_cfg.uid}' does not support position query.") - - # get goal position - goal_obj = env.sim[goal_cfg.uid] - if hasattr(goal_obj, "get_body_pose"): - goal_pos = goal_obj.get_body_pose(body_ids=goal_cfg.body_ids)[:, :3, 3] - elif hasattr(goal_obj, "get_local_pose"): - goal_pos = goal_obj.get_local_pose(to_matrix=True)[:, :3, 3] - else: - raise ValueError(f"Entity '{goal_cfg.uid}' does not support position query.") + target_poses = info[target_pose_key] + if target_poses.dim() == 2: # (num_envs, 3) + goal_pos = target_poses + else: # (num_envs, 4, 4) + goal_pos = target_poses[:, :3, 3] # compute push direction (from object to goal) push_direction = goal_pos - obj_pos @@ -425,17 +420,8 @@ def incremental_distance_to_target( ) -> torch.Tensor: """Incremental reward for progress toward a virtual target pose from info.""" # get source entity position - source_obj = env.sim[source_entity_cfg.uid] - if hasattr(source_obj, "get_body_pose"): - source_pos = source_obj.get_body_pose(body_ids=source_entity_cfg.body_ids)[ - :, :3, 3 - ] - elif hasattr(source_obj, "get_local_pose"): - source_pos = source_obj.get_local_pose(to_matrix=True)[:, :3, 3] - else: - raise ValueError( - f"Entity '{source_entity_cfg.uid}' does not support position query." 
- ) + source_obj = env.sim.get_rigid_object(source_entity_cfg.uid) + source_pos = source_obj.get_local_pose(to_matrix=True)[:, :3, 3] # get target position from info if target_pose_key not in info: diff --git a/embodichain/lab/gym/envs/tasks/rl/__init__.py b/embodichain/lab/gym/envs/tasks/rl/__init__.py index f8cf303..be52afc 100644 --- a/embodichain/lab/gym/envs/tasks/rl/__init__.py +++ b/embodichain/lab/gym/envs/tasks/rl/__init__.py @@ -18,10 +18,10 @@ from copy import deepcopy from embodichain.lab.gym.utils import registration as env_registry -from embodichain.lab.gym.envs.rl_env_cfg import RLEnvCfg +from embodichain.lab.gym.envs.embodied_env import EmbodiedEnvCfg -def build_env(env_id: str, base_env_cfg: RLEnvCfg): +def build_env(env_id: str, base_env_cfg: EmbodiedEnvCfg): """Create env from registry id, auto-inferring cfg class (EnvName -> EnvNameCfg).""" env = env_registry.make(env_id, cfg=deepcopy(base_env_cfg)) return env diff --git a/embodichain/lab/gym/envs/tasks/rl/push_cube.py b/embodichain/lab/gym/envs/tasks/rl/push_cube.py index 01e5074..f26a1fc 100644 --- a/embodichain/lab/gym/envs/tasks/rl/push_cube.py +++ b/embodichain/lab/gym/envs/tasks/rl/push_cube.py @@ -38,18 +38,6 @@ def __init__(self, cfg=None, **kwargs): if cfg is None: cfg = EmbodiedEnvCfg() - extensions = getattr(cfg, "extensions", {}) or {} - - # cfg.sim_cfg.enable_rt = True - - defaults = { - "success_threshold": 0.1, - } - for name, default in defaults.items(): - value = extensions.get(name, getattr(cfg, name, default)) - setattr(cfg, name, value) - setattr(self, name, getattr(cfg, name)) - super().__init__(cfg, **kwargs) def _draw_goal_marker(self): @@ -80,19 +68,6 @@ def _draw_goal_marker(self): ) self.sim.draw_marker(cfg=marker_cfg) - def _init_sim_state(self, **kwargs): - super()._init_sim_state(**kwargs) - self.single_action_space = spaces.Box( - low=-self.joint_limits, - high=self.joint_limits, - shape=(6,), - dtype=np.float32, - ) - if self.obs_mode == "state": - self.single_observation_space = spaces.Box( - low=-np.inf, high=np.inf, shape=(15,), dtype=np.float32 - ) - def _initialize_episode( self, env_ids: Sequence[int] | None = None, **kwargs ) -> None: diff --git a/embodichain/lab/gym/utils/gym_utils.py b/embodichain/lab/gym/utils/gym_utils.py index aa7123f..825a83a 100644 --- a/embodichain/lab/gym/utils/gym_utils.py +++ b/embodichain/lab/gym/utils/gym_utils.py @@ -323,24 +323,6 @@ def cat_tensor_with_ids( return out -def config_to_rl_cfg(config: dict) -> "RLEnvCfg": - """Parse gym-style configuration dict into an RL-ready config object.""" - - from embodichain.lab.gym.envs.rl_env_cfg import RLEnvCfg - - # Use config_to_cfg to parse shared fields - env_cfg = config_to_cfg(config) - # Convert to RLEnvCfg if needed - if not isinstance(env_cfg, RLEnvCfg): - env_cfg = RLEnvCfg.from_dict(env_cfg.__dict__) - # RL-specific fields - env_cfg.env_id = config.get("id") - env_cfg.num_envs = config["env"].get("num_envs", env_cfg.num_envs) - env_cfg.extensions = deepcopy(config.get("env", {}).get("extensions", {})) - # Add any RL-specific parsing here - return env_cfg - - def config_to_cfg(config: dict) -> "EmbodiedEnvCfg": """Parser configuration file into cfgs for env initialization. @@ -452,6 +434,7 @@ class ComponentCfg: env_cfg.articulation.append(cfg) env_cfg.sim_steps_per_control = config["env"].get("sim_steps_per_control", 4) + env_cfg.extensions = deepcopy(config.get("env", {}).get("extensions", {})) # TODO: support more env events, eg, grasp pose generation, mesh preprocessing, etc. 
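The two pieces introduced above — the `flatten_dict_observation` helper and the `flattened_observation_space` property — rely on one contract: leaf tensors of the nested observation dict are flattened to `(num_envs, -1)` and concatenated in sorted key order, so the feature layout seen by PPO is deterministic. The snippet below is a minimal, self-contained sketch of that contract using plain `torch` only; the toy tensor shapes are illustrative (they mirror the push-cube state observation of 6 joint positions plus three 3-D positions, assuming the end-effector pose is reported as position only), and the function is a stand-in, not the repository implementation.

```python
import torch


def flatten_obs_sketch(obs: dict) -> torch.Tensor:
    """Concatenate leaf tensors of a nested obs dict in sorted key order."""
    parts = []

    def _collect(d: dict) -> None:
        # Visit keys in sorted order so the flat feature layout is stable
        # across steps, resets, and processes.
        for key in sorted(d.keys()):
            value = d[key]
            if isinstance(value, dict):
                _collect(value)
            elif isinstance(value, torch.Tensor):
                parts.append(value.flatten(start_dim=1))  # keep (num_envs, -1)

    _collect(obs)
    return torch.cat(parts, dim=-1)


# Toy observation for 2 envs: "object" sorts before "robot", so cube/goal
# features precede ee_pos/qpos in the flat vector.
obs = {
    "robot": {"qpos": torch.zeros(2, 6), "ee_pos": torch.zeros(2, 3)},
    "object": {"cube_pos": torch.zeros(2, 3), "goal_pos": torch.zeros(2, 3)},
}
print(flatten_obs_sketch(obs).shape)  # torch.Size([2, 15])
```

Because the ordering is fixed by key sort rather than by config order, `flattened_observation_space` (which measures the flattened initial observation) and the rollout buffer should stay consistent no matter how the observation functors are listed in `gym_config.json`.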
From 4e66b14fa56e464010f024520f45187509157775 Mon Sep 17 00:00:00 2001 From: yuanhaonan Date: Mon, 19 Jan 2026 16:14:57 +0800 Subject: [PATCH 4/9] update review changes --- configs/agents/rl/push_cube/gym_config.json | 381 ++++++++------- configs/agents/rl/push_cube/train_config.json | 125 ++--- embodichain/agents/rl/train.py | 11 +- embodichain/agents/rl/utils/trainer.py | 8 +- embodichain/lab/gym/envs/embodied_env.py | 4 +- .../lab/gym/envs/managers/observations.py | 3 +- .../envs/managers/randomization/spatial.py | 2 +- .../lab/gym/envs/managers/reward_manager.py | 2 +- embodichain/lab/gym/envs/managers/rewards.py | 452 ++++++++++++------ .../lab/gym/envs/tasks/rl/push_cube.py | 25 +- embodichain/lab/gym/utils/gym_utils.py | 5 +- 11 files changed, 599 insertions(+), 419 deletions(-) diff --git a/configs/agents/rl/push_cube/gym_config.json b/configs/agents/rl/push_cube/gym_config.json index 87858e3..cd5113b 100644 --- a/configs/agents/rl/push_cube/gym_config.json +++ b/configs/agents/rl/push_cube/gym_config.json @@ -1,198 +1,197 @@ -{ - "id": "PushCubeRL", - "max_episodes": 5, - "env": { - "num_envs": 128, - "sim_steps_per_control": 4, - "events": { - "randomize_cube": { - "func": "randomize_rigid_object_pose", - "mode": "reset", - "params": { - "entity_cfg": {"uid": "cube"}, - "position_range": [[-0.2, -0.2, 0.0], [0.2, 0.2, 0.0]], - "relative_position": true - } - }, - "randomize_goal": { - "func": "randomize_target_pose", - "mode": "reset", - "params": { - "position_range": [[-0.3, -0.3, 0.05], [0.3, 0.3, 0.05]], - "relative_position": false, - "store_key": "goal_pose" - } - } - }, - "observations": { - "robot_qpos": { - "func": "normalize_robot_joint_data", - "mode": "modify", - "name": "robot/qpos", - "params": { - "joint_ids": [0, 1, 2, 3, 4, 5] - } - }, - "robot_ee_pos": { - "func": "get_robot_ee_pose", - "mode": "add", - "name": "robot/ee_pos", - "params": { - "part_name": "arm" - } - }, - "cube_pos": { - "func": "get_rigid_object_pose", - "mode": "add", - "name": "object/cube_pos", - "params": { - "entity_cfg": {"uid": "cube"} - } - }, - "goal_pos": { - "func": "target_position", - "mode": "add", - "name": "object/goal_pos", - "params": { - "target_pose_key": "goal_pose" +{ + "id": "PushCubeRL", + "max_episodes": 5, + "env": { + "num_envs": 128, + "sim_steps_per_control": 4, + "events": { + "randomize_cube": { + "func": "randomize_rigid_object_pose", + "mode": "reset", + "params": { + "entity_cfg": { + "uid": "cube" + }, + "position_range": [ + [-0.2, -0.2, 0.0], + [0.2, 0.2, 0.0] + ], + "relative_position": true + } + }, + "randomize_goal": { + "func": "randomize_target_pose", + "mode": "reset", + "params": { + "position_range": [ + [-0.3, -0.3, 0.05], + [0.3, 0.3, 0.05] + ], + "relative_position": false, + "store_key": "goal_pose" + } + } + }, + "observations": { + "robot_qpos": { + "func": "normalize_robot_joint_data", + "mode": "modify", + "name": "robot/qpos", + "params": { + "joint_ids": [0, 1, 2, 3, 4, 5] + } + }, + "robot_ee_pos": { + "func": "get_robot_eef_pose", + "mode": "add", + "name": "robot/ee_pos", + "params": { + "part_name": "arm" + } + }, + "cube_pos": { + "func": "get_rigid_object_pose", + "mode": "add", + "name": "object/cube_pos", + "params": { + "entity_cfg": { + "uid": "cube" + } + } + }, + "goal_pos": { + "func": "target_position", + "mode": "add", + "name": "object/goal_pos", + "params": { + "target_pose_key": "goal_pose" + } + } + }, + "rewards": { + "reaching_reward": { + "func": "reaching_behind_object_reward", + "mode": "add", + "name": "reaching", 
+ "weight": 0.1, + "params": { + "object_cfg": { + "uid": "cube" + }, + "target_pose_key": "goal_pose", + "behind_offset": 0.015, + "height_offset": 0.015, + "distance_scale": 5.0, + "part_name": "arm" + } + }, + "place_reward": { + "func": "incremental_distance_to_target", + "mode": "add", + "name": "place", + "weight": 1.0, + "params": { + "source_entity_cfg": { + "uid": "cube" + }, + "target_pose_key": "goal_pose", + "tanh_scale": 10.0, + "positive_weight": 2.0, + "negative_weight": 0.5, + "use_xy_only": true + } + }, + "action_penalty": { + "func": "action_smoothness_penalty", + "mode": "add", + "name": "action_penalty", + "weight": 0.01, + "params": {} + }, + "success_bonus": { + "func": "success_reward", + "mode": "add", + "name": "success", + "weight": 10.0, + "params": {} + } + }, + "extensions": { + "obs_mode": "state", + "episode_length": 100, + "joint_limits": 0.5, + "action_scale": 0.1, + "success_threshold": 0.1 } - } }, - "rewards": { - "reaching_reward": { - "func": "reaching_behind_object_reward", - "mode": "add", - "name": "reaching", - "weight": 0.1, - "params": { - "object_cfg": {"uid": "cube"}, - "target_pose_key": "goal_pose", - "behind_offset": 0.015, - "height_offset": 0.015, - "distance_scale": 5.0, - "part_name": "arm" - } - }, - "place_reward": { - "func": "incremental_distance_to_target", - "mode": "add", - "name": "place", - "weight": 1.0, - "params": { - "source_entity_cfg": {"uid": "cube"}, - "target_pose_key": "goal_pose", - "tanh_scale": 10.0, - "positive_weight": 2.0, - "negative_weight": 0.5, - "use_xy_only": true - } - }, - "action_penalty": { - "func": "action_smoothness_penalty", - "mode": "add", - "name": "action_penalty", - "weight": 0.01, - "params": {} - }, - "success_bonus": { - "func": "success_reward", - "mode": "add", - "name": "success", - "weight": 10.0, - "params": { - "reward_value": 1.0 + "robot": { + "uid": "Manipulator", + "urdf_cfg": { + "components": [ + { + "component_type": "arm", + "urdf_path": "UniversalRobots/UR10/UR10.urdf" + }, + { + "component_type": "hand", + "urdf_path": "DH_PGI_140_80/DH_PGI_140_80.urdf" + } + ] + }, + "init_pos": [0.0, 0.0, 0.0], + "init_rot": [0.0, 0.0, 0.0], + "init_qpos": [0.0, -1.57, 1.57, -1.57, -1.57, 0.0, 0.04, 0.04], + "drive_pros": { + "drive_type": "force", + "stiffness": 100000.0, + "damping": 1000.0, + "max_velocity": 2.0, + "max_effort": 500.0 + }, + "solver_cfg": { + "arm": { + "class_type": "PytorchSolver", + "end_link_name": "ee_link", + "root_link_name": "base_link", + "tcp": [ + [1.0, 0.0, 0.0, 0.0], + [0.0, 1.0, 0.0, 0.0], + [0.0, 0.0, 1.0, 0.16], + [0.0, 0.0, 0.0, 1.0] + ] + } + }, + "control_parts": { + "arm": ["JOINT[1-6]"] } - } }, - "extensions": { - "obs_mode": "state", - "episode_length": 100, - "joint_limits": 0.5, - "action_scale": 0.1, - "success_threshold": 0.1 - } - }, - "robot": { - "uid": "Manipulator", - "urdf_cfg": { - "components": [ + "sensor": [], + "light": {}, + "background": [], + "rigid_object": [ { - "component_type": "arm", - "urdf_path": "UniversalRobots/UR10/UR10.urdf", - "transform": [ - [1.0, 0.0, 0.0, 0.0], - [0.0, 1.0, 0.0, 0.0], - [0.0, 0.0, 1.0, 0.0], - [0.0, 0.0, 0.0, 1.0] - ] - }, - { - "component_type": "hand", - "urdf_path": "DH_PGI_140_80/DH_PGI_140_80.urdf", - "transform": [ - [1.0, 0.0, 0.0, 0.0], - [0.0, 1.0, 0.0, 0.0], - [0.0, 0.0, 1.0, 0.0], - [0.0, 0.0, 0.0, 1.0] - ] + "uid": "cube", + "shape": { + "shape_type": "Cube", + "size": [0.1, 0.1, 0.1] + }, + "body_type": "dynamic", + "init_pos": [-0.6, -0.4, 0.05], + "attrs": { + "mass": 10.0, + 
"static_friction": 3.0, + "dynamic_friction": 2.0, + "linear_damping": 2.0, + "angular_damping": 2.0, + "contact_offset": 0.003, + "rest_offset": 0.001, + "restitution": 0.1, + "max_depenetration_velocity": 10.0, + "max_linear_velocity": 1.0, + "max_angular_velocity": 1.0 + } } - ] - }, - "init_pos": [0.0, 0.0, 0.0], - "init_rot": [0.0, 0.0, 0.0], - "init_qpos": [0.0, -1.57, 1.57, -1.57, -1.57, 0.0, 0.04, 0.04], - "drive_pros": { - "drive_type": "force", - "stiffness": 100000.0, - "damping": 1000.0, - "max_velocity": 2.0, - "max_effort": 500.0 - }, - "solver_cfg": { - "arm": { - "class_type": "PytorchSolver", - "end_link_name": "ee_link", - "root_link_name": "base_link", - "tcp": [ - [1.0, 0.0, 0.0, 0.0], - [0.0, 1.0, 0.0, 0.0], - [0.0, 0.0, 1.0, 0.16], - [0.0, 0.0, 0.0, 1.0] - ] - } - }, - "control_parts": { - "arm": ["JOINT[1-6]"] - } - }, - "sensor": [], - "light": { - }, - "background": [], - "rigid_object": [ - { - "uid": "cube", - "shape": { - "shape_type": "Cube", - "size": [0.1, 0.1, 0.1] - }, - "body_type": "dynamic", - "init_pos": [-0.6, -0.4, 0.05], - "attrs": { - "mass": 10.0, - "static_friction": 3.0, - "dynamic_friction": 2.0, - "linear_damping": 2.0, - "angular_damping": 2.0, - "contact_offset": 0.003, - "rest_offset": 0.001, - "restitution": 0.1, - "max_depenetration_velocity": 10.0, - "max_linear_velocity": 1.0, - "max_angular_velocity": 1.0 - } - } - ], - "rigid_object_group": [], - "articulation": [] + ], + "rigid_object_group": [], + "articulation": [] } diff --git a/configs/agents/rl/push_cube/train_config.json b/configs/agents/rl/push_cube/train_config.json index 7e94155..3c3bda0 100644 --- a/configs/agents/rl/push_cube/train_config.json +++ b/configs/agents/rl/push_cube/train_config.json @@ -1,64 +1,67 @@ -{ - "trainer": { - "exp_name": "push_cube_ppo", - "gym_config": "configs/agents/rl/push_cube/gym_config.json", - "seed": 42, - "device": "cuda:0", - "headless": true, - "iterations": 1000, - "rollout_steps": 1024, - "eval_freq": 200, - "save_freq": 200, - "use_wandb": false, - "wandb_project_name": "embodychain-push_cube", - "events": { - "eval": { - "record_camera": { - "func": "record_camera_data_async", - "mode": "interval", - "interval_step": 1, - "params": { - "name": "main_cam", - "resolution": [640, 480], - "eye": [-1.4, 1.4, 2.0], - "target": [0, 0, 0], - "up": [0, 0, 1], - "intrinsics": [600, 600, 320, 240], - "save_path": "./outputs/videos/eval" - } +{ + "trainer": { + "exp_name": "push_cube_ppo", + "gym_config": "configs/agents/rl/push_cube/gym_config.json", + "seed": 42, + "device": "cuda:0", + "headless": false, + "enable_rt": false, + "gpu_id": 0, + "num_envs": 8, + "iterations": 1000, + "rollout_steps": 1024, + "eval_freq": 200, + "save_freq": 200, + "use_wandb": true, + "wandb_project_name": "embodychain-push_cube", + "events": { + "eval": { + "record_camera": { + "func": "record_camera_data_async", + "mode": "interval", + "interval_step": 1, + "params": { + "name": "main_cam", + "resolution": [640, 480], + "eye": [-1.4, 1.4, 2.0], + "target": [0, 0, 0], + "up": [0, 0, 1], + "intrinsics": [600, 600, 320, 240], + "save_path": "./outputs/videos/eval" + } + } + } } - } - } - }, - "policy": { - "name": "actor_critic", - "actor": { - "type": "mlp", - "network_cfg": { - "hidden_sizes": [256, 256], - "activation": "relu" - } }, - "critic": { - "type": "mlp", - "network_cfg": { - "hidden_sizes": [256, 256], - "activation": "relu" - } - } - }, - "algorithm": { - "name": "ppo", - "cfg": { - "learning_rate": 0.0001, - "n_epochs": 10, - "batch_size": 8192, - 
"gamma": 0.99, - "gae_lambda": 0.95, - "clip_coef": 0.2, - "ent_coef": 0.01, - "vf_coef": 0.5, - "max_grad_norm": 0.5 + "policy": { + "name": "actor_critic", + "actor": { + "type": "mlp", + "network_cfg": { + "hidden_sizes": [256, 256], + "activation": "relu" + } + }, + "critic": { + "type": "mlp", + "network_cfg": { + "hidden_sizes": [256, 256], + "activation": "relu" + } + } + }, + "algorithm": { + "name": "ppo", + "cfg": { + "learning_rate": 0.0001, + "n_epochs": 10, + "batch_size": 8192, + "gamma": 0.99, + "gae_lambda": 0.95, + "clip_coef": 0.2, + "ent_coef": 0.01, + "vf_coef": 0.5, + "max_grad_norm": 0.5 + } } - } -} \ No newline at end of file +} diff --git a/embodichain/agents/rl/train.py b/embodichain/agents/rl/train.py index 4d95ff4..ca3c299 100644 --- a/embodichain/agents/rl/train.py +++ b/embodichain/agents/rl/train.py @@ -60,6 +60,9 @@ def main(): eval_freq = int(trainer_cfg.get("eval_freq", 10000)) save_freq = int(trainer_cfg.get("save_freq", 50000)) headless = bool(trainer_cfg.get("headless", True)) + enable_rt = bool(trainer_cfg.get("enable_rt", False)) + gpu_id = int(trainer_cfg.get("gpu_id", 0)) + num_envs = trainer_cfg.get("num_envs", None) wandb_project_name = trainer_cfg.get("wandb_project_name", "embodychain-generic") # Device @@ -122,6 +125,10 @@ def main(): gym_config_data = load_json(str(gym_config_path)) gym_env_cfg = config_to_cfg(gym_config_data) + # Override num_envs from train config if provided + if num_envs is not None: + gym_env_cfg.num_envs = num_envs + # Ensure sim configuration mirrors runtime overrides if gym_env_cfg.sim_cfg is None: gym_env_cfg.sim_cfg = SimulationManagerCfg() @@ -135,9 +142,11 @@ def main(): else: gym_env_cfg.sim_cfg.sim_device = torch.device("cpu") gym_env_cfg.sim_cfg.headless = headless + gym_env_cfg.sim_cfg.enable_rt = enable_rt + gym_env_cfg.sim_cfg.gpu_id = gpu_id logger.log_info( - f"Loaded gym_config from {gym_config_path} (env_id={gym_config_data['id']}, headless={gym_env_cfg.sim_cfg.headless}, sim_device={gym_env_cfg.sim_cfg.sim_device})" + f"Loaded gym_config from {gym_config_path} (env_id={gym_config_data['id']}, num_envs={gym_env_cfg.num_envs}, headless={gym_env_cfg.sim_cfg.headless}, enable_rt={gym_env_cfg.sim_cfg.enable_rt}, sim_device={gym_env_cfg.sim_cfg.sim_device})" ) env = build_env(gym_config_data["id"], base_env_cfg=gym_env_cfg) diff --git a/embodichain/agents/rl/utils/trainer.py b/embodichain/agents/rl/utils/trainer.py index 6d38800..34dc191 100644 --- a/embodichain/agents/rl/utils/trainer.py +++ b/embodichain/agents/rl/utils/trainer.py @@ -239,8 +239,12 @@ def _eval_once(self, num_episodes: int = 5): ep_ret = torch.zeros(num_envs_eval, dtype=torch.float32, device=self.device) while not done_any.any(): actions, _, _ = self.policy.get_action(obs, deterministic=True) - result = self.eval_env.step(actions) - obs, reward, terminated, truncated, info = result + obs, reward, terminated, truncated, info = self.eval_env.step(actions) + + # Flatten dict observation from step + if isinstance(obs, dict): + obs = flatten_dict_observation(obs) + done = terminated | truncated reward = reward.float() done_any = done diff --git a/embodichain/lab/gym/envs/embodied_env.py b/embodichain/lab/gym/envs/embodied_env.py index c5e04fa..b87dba2 100644 --- a/embodichain/lab/gym/envs/embodied_env.py +++ b/embodichain/lab/gym/envs/embodied_env.py @@ -352,9 +352,7 @@ def _extend_reward( rewards, reward_info = self.reward_manager.compute( obs=obs, action=action, info=info ) - # Add individual reward terms to info for logging - for term_name, 
term_value in reward_info.items(): - info[f"reward/{term_name}"] = term_value + info["rewards"] = reward_info return rewards def _prepare_scene(self, **kwargs) -> None: diff --git a/embodichain/lab/gym/envs/managers/observations.py b/embodichain/lab/gym/envs/managers/observations.py index 306d993..7e500af 100644 --- a/embodichain/lab/gym/envs/managers/observations.py +++ b/embodichain/lab/gym/envs/managers/observations.py @@ -252,7 +252,7 @@ def compute_semantic_mask( return torch.stack(masks, dim=-1) -def get_robot_ee_pose( +def get_robot_eef_pose( env: "EmbodiedEnv", obs: EnvObs, part_name: str | None = None, @@ -263,7 +263,6 @@ def get_robot_ee_pose( Args: env: The environment instance. obs: The observation dictionary. - robot_uid: The uid of the robot. If None, uses env.robot. part_name: The name of the control part. If None, uses default part. position_only: If True, returns only position (3D). If False, returns full pose (4x4 matrix). diff --git a/embodichain/lab/gym/envs/managers/randomization/spatial.py b/embodichain/lab/gym/envs/managers/randomization/spatial.py index 6c3f92f..c3e32d3 100644 --- a/embodichain/lab/gym/envs/managers/randomization/spatial.py +++ b/embodichain/lab/gym/envs/managers/randomization/spatial.py @@ -339,7 +339,7 @@ def randomize_target_pose( ) # Store in env state (to be exposed via get_info) - state_attr = f"_{store_key}s" + state_attr = f"_{store_key}" if not hasattr(env, state_attr): setattr( env, diff --git a/embodichain/lab/gym/envs/managers/reward_manager.py b/embodichain/lab/gym/envs/managers/reward_manager.py index 8f282df..c034a90 100644 --- a/embodichain/lab/gym/envs/managers/reward_manager.py +++ b/embodichain/lab/gym/envs/managers/reward_manager.py @@ -76,7 +76,7 @@ def __str__(self) -> str: table.align["Name"] = "l" for index, name in enumerate(self._mode_functor_names[mode]): functor_cfg = self._mode_functor_cfgs[mode][index] - weight = functor_cfg.params.get("weight", 1.0) + weight = getattr(functor_cfg, "weight", 1.0) table.add_row([index, name, f"{weight:.3f}"]) # convert table to string diff --git a/embodichain/lab/gym/envs/managers/rewards.py b/embodichain/lab/gym/envs/managers/rewards.py index d8892ef..8b61ec6 100644 --- a/embodichain/lab/gym/envs/managers/rewards.py +++ b/embodichain/lab/gym/envs/managers/rewards.py @@ -27,39 +27,6 @@ from embodichain.lab.gym.envs import EmbodiedEnv -def reward_from_obs( - env: EmbodiedEnv, - obs: dict, - action: torch.Tensor, - info: dict, - obs_key: str = "robot/qpos", - target_value: float = 0.0, - scale: float = 1.0, -) -> torch.Tensor: - """Reward based on observation values.""" - # Parse nested keys (e.g., "robot/qpos") - keys = obs_key.split("/") - value = obs - for key in keys: - if isinstance(value, dict) and key in value: - value = value[key] - else: - return torch.zeros(env.num_envs, device=env.device) - - # Compute distance to target - if isinstance(value, torch.Tensor): - if value.dim() > 1: - # Multiple values, compute norm - distance = torch.norm(value - target_value, dim=-1) - else: - distance = torch.abs(value - target_value) - reward = -scale * distance - else: - reward = torch.zeros(env.num_envs, device=env.device) - - return reward - - def distance_between_objects( env: EmbodiedEnv, obs: dict, @@ -70,32 +37,43 @@ def distance_between_objects( exponential: bool = False, sigma: float = 1.0, ) -> torch.Tensor: - """Reward based on distance between two entities.""" + """Reward based on distance between two rigid objects. 
+ + Encourages the source object to get closer to the target object. Can use either + linear negative distance or exponential Gaussian-shaped reward. + + Args: + source_entity_cfg: Configuration for the source object (e.g., {"uid": "cube"}) + target_entity_cfg: Configuration for the target object (e.g., {"uid": "goal_sphere"}) + exponential: If True, use exponential reward exp(-d²/2σ²), else use -distance + sigma: Standard deviation for exponential reward (controls reward spread) + + Returns: + Reward tensor of shape (num_envs,). Higher when objects are closer. + - Linear mode: ranges from -inf to 0 (0 when objects touch) + - Exponential mode: ranges from 0 to 1 (1 when objects touch) + + Example: + ```json + { + "func": "distance_between_objects", + "weight": 0.5, + "params": { + "source_entity_cfg": {"uid": "cube"}, + "target_entity_cfg": {"uid": "target"}, + "exponential": true, + "sigma": 0.2 + } + } + ``` + """ # get source entity position - source_obj = env.sim[source_entity_cfg.uid] - if hasattr(source_obj, "get_body_pose"): - source_pos = source_obj.get_body_pose(body_ids=source_entity_cfg.body_ids)[ - :, :3, 3 - ] - elif hasattr(source_obj, "get_local_pose"): - source_pos = source_obj.get_local_pose(to_matrix=True)[:, :3, 3] - else: - raise ValueError( - f"Entity '{source_entity_cfg.uid}' does not support position query." - ) + source_obj = env.sim.get_rigid_object(source_entity_cfg.uid) + source_pos = source_obj.get_local_pose(to_matrix=True)[:, :3, 3] # get target entity position - target_obj = env.sim[target_entity_cfg.uid] - if hasattr(target_obj, "get_body_pose"): - target_pos = target_obj.get_body_pose(body_ids=target_entity_cfg.body_ids)[ - :, :3, 3 - ] - elif hasattr(target_obj, "get_local_pose"): - target_pos = target_obj.get_local_pose(to_matrix=True)[:, :3, 3] - else: - raise ValueError( - f"Entity '{target_entity_cfg.uid}' does not support position query." - ) + target_obj = env.sim.get_rigid_object(target_entity_cfg.uid) + target_pos = target_obj.get_local_pose(to_matrix=True)[:, :3, 3] # compute distance distance = torch.norm(source_pos - target_pos, dim=-1) @@ -117,13 +95,46 @@ def joint_velocity_penalty( action: torch.Tensor, info: dict, robot_uid: str = "robot", - joint_ids: slice | list[int] = slice(None), + joint_ids: slice | list[int] | None = None, + part_name: str | None = None, ) -> torch.Tensor: - """Penalize large joint velocities.""" - robot = env.sim[robot_uid] + """Penalize high joint velocities to encourage smooth motion. + + Computes the L2 norm of joint velocities and returns negative value as penalty. + Useful for preventing jerky or unstable robot movements. + + Args: + robot_uid: Robot entity UID in simulation (default: "robot") + joint_ids: Specific joint indices to penalize. Takes priority over part_name. + Example: [0, 1, 2] or slice(0, 6) + part_name: Control part name (e.g., "arm"). Used only if joint_ids is None. + Will penalize all joints in the specified part. + + Returns: + Penalty tensor of shape (num_envs,). Always negative or zero. + Magnitude increases with joint velocity (larger velocity = more negative). 
+ + Example: + ```json + { + "func": "joint_velocity_penalty", + "weight": 0.001, + "params": { + "robot_uid": "robot", + "part_name": "arm" + } + } + ``` + """ + robot = env.sim.get_robot(robot_uid) # get joint velocities - qvel = robot.body_data.qvel[:, joint_ids] + if joint_ids is not None: + qvel = robot.get_qvel()[:, joint_ids] + elif part_name is not None: + qvel = robot.get_qvel(name=part_name) + else: + qvel = robot.get_qvel() # compute L2 norm of joint velocities velocity_norm = torch.norm(qvel, dim=-1) @@ -138,17 +149,42 @@ def action_smoothness_penalty( action: torch.Tensor, info: dict, ) -> torch.Tensor: - """Penalize large changes in action between steps.""" + """Penalize large action changes between consecutive timesteps. + + Encourages smooth control commands by penalizing sudden changes in actions. + Stores previous action in env._reward_states for comparison. + + Returns: + Penalty tensor of shape (num_envs,). Zero on first call (no previous action), + negative on subsequent calls (larger change = more negative). + + Note: + This function maintains state across calls using env._reward_states['prev_actions']. + State is automatically reset when the environment resets. + + Example: + ```json + { + "func": "action_smoothness_penalty", + "weight": 0.01, + "params": {} + } + ``` + """ + # Use dictionary-based state management + if not hasattr(env, "_reward_states"): + env._reward_states = {} + # compute difference between current and previous action - if hasattr(env, "_prev_actions"): - action_diff = action - env._prev_actions + if "prev_actions" in env._reward_states: + action_diff = action - env._reward_states["prev_actions"] penalty = -torch.norm(action_diff, dim=-1) else: # no previous action, no penalty penalty = torch.zeros(env.num_envs, device=env.device) # store current action for next step - env._prev_actions = action.clone() + env._reward_states["prev_actions"] = action.clone() return penalty @@ -162,12 +198,40 @@ def joint_limit_penalty( joint_ids: slice | list[int] = slice(None), margin: float = 0.1, ) -> torch.Tensor: - """Penalize joints approaching their limits.""" - robot = env.sim[robot_uid] + """Penalize robot joints that are close to their position limits. + + Prevents joints from reaching their physical limits, which can cause instability + or singularities. Penalty increases as joints approach limits within the margin. + + Args: + robot_uid: Robot entity UID in simulation (default: "robot") + joint_ids: Joint indices to monitor (default: all joints) + margin: Normalized distance threshold (0 to 1). Penalty applied when joint + is within this fraction of its range from either limit. + Example: 0.1 means penalty when within 10% of limits. + + Returns: + Penalty tensor of shape (num_envs,). Always negative or zero. + Sum of penalties across all monitored joints. 
+ + Example: + ```json + { + "func": "joint_limit_penalty", + "weight": 0.01, + "params": { + "robot_uid": "robot", + "joint_ids": [0, 1, 2, 3, 4, 5], + "margin": 0.1 + } + } + ``` + """ + robot = env.sim.get_robot(robot_uid) # get joint positions and limits - qpos = robot.body_data.qpos[:, joint_ids] - qpos_limits = robot.body_data.qpos_limits[:, joint_ids, :] + qpos = robot.get_qpos()[:, joint_ids] + qpos_limits = robot.get_qpos_limits()[:, joint_ids, :] # compute normalized position in range [0, 1] qpos_normalized = (qpos - qpos_limits[:, :, 0]) / ( @@ -191,33 +255,6 @@ def joint_limit_penalty( return penalty.sum(dim=-1) -def collision_penalty( - env: EmbodiedEnv, - obs: dict, - action: torch.Tensor, - info: dict, - robot_uid: str = "robot", - force_threshold: float = 1.0, -) -> torch.Tensor: - """Penalize collisions based on contact forces.""" - robot = env.sim[robot_uid] - - # get joint forces (torques) - qf = robot.body_data.qf - - # check if any joint force exceeds threshold - collision_detected = (torch.abs(qf) > force_threshold).any(dim=-1) - - # return penalty for collisions - penalty = torch.where( - collision_detected, - torch.full((env.num_envs,), -1.0, device=env.device), - torch.zeros(env.num_envs, device=env.device), - ) - - return penalty - - def orientation_alignment_reward( env: EmbodiedEnv, obs: dict, @@ -226,32 +263,40 @@ def orientation_alignment_reward( source_entity_cfg: SceneEntityCfg = None, target_entity_cfg: SceneEntityCfg = None, ) -> torch.Tensor: - """Reward alignment of orientations between two entities.""" + """Reward rotational alignment between two rigid objects. + + Encourages the source object's orientation to match the target object's orientation. + Uses rotation matrix trace to measure alignment. + + Args: + source_entity_cfg: Configuration for the source object (e.g., {"uid": "cube"}) + target_entity_cfg: Configuration for the target object (e.g., {"uid": "reference"}) + + Returns: + Reward tensor of shape (num_envs,). Ranges from -1 to 1. + - 1.0: Perfect alignment (same orientation) + - 0.0: 90° rotation difference + - -1.0: 180° rotation difference (opposite orientation) + + Example: + ```json + { + "func": "orientation_alignment_reward", + "weight": 0.5, + "params": { + "source_entity_cfg": {"uid": "object"}, + "target_entity_cfg": {"uid": "goal_object"} + } + } + ``` + """ # get source entity rotation matrix - source_obj = env.sim[source_entity_cfg.uid] - if hasattr(source_obj, "get_body_pose"): - source_rot = source_obj.get_body_pose(body_ids=source_entity_cfg.body_ids)[ - :, :3, :3 - ] - elif hasattr(source_obj, "get_local_pose"): - source_rot = source_obj.get_local_pose(to_matrix=True)[:, :3, :3] - else: - raise ValueError( - f"Entity '{source_entity_cfg.uid}' does not support orientation query." - ) + source_obj = env.sim.get_rigid_object(source_entity_cfg.uid) + source_rot = source_obj.get_local_pose(to_matrix=True)[:, :3, :3] # get target entity rotation matrix - target_obj = env.sim[target_entity_cfg.uid] - if hasattr(target_obj, "get_body_pose"): - target_rot = target_obj.get_body_pose(body_ids=target_entity_cfg.body_ids)[ - :, :3, :3 - ] - elif hasattr(target_obj, "get_local_pose"): - target_rot = target_obj.get_local_pose(to_matrix=True)[:, :3, :3] - else: - raise ValueError( - f"Entity '{target_entity_cfg.uid}' does not support orientation query." 
- ) + target_obj = env.sim.get_rigid_object(target_entity_cfg.uid) + target_rot = target_obj.get_local_pose(to_matrix=True)[:, :3, :3] # compute rotation difference rot_diff = torch.bmm(source_rot, target_rot.transpose(-1, -2)) @@ -270,9 +315,30 @@ def success_reward( obs: dict, action: torch.Tensor, info: dict, - reward_value: float = 1.0, -) -> torch.Tensor: - """Sparse reward for task success.""" + ) -> torch.Tensor: + """Sparse bonus reward when task succeeds. + + Provides a fixed reward when the task success condition is met. + Reads success status from info['success'] which should be set by the environment. + + Returns: + Reward tensor of shape (num_envs,). + - 1.0 when successful + - 0.0 when not successful or if 'success' key missing + + Note: + The environment's get_info() must populate info['success'] with a boolean + tensor indicating success status for each environment. + + Example: + ```json + { + "func": "success_reward", + "weight": 10.0, + "params": {} + } + ``` + """ # Check if success info is available in info dict if "success" in info: success = info["success"] @@ -285,14 +351,12 @@ def success_reward( return torch.zeros(env.num_envs, device=env.device) # return reward - reward = torch.where( + return torch.where( success, - torch.full((env.num_envs,), reward_value, device=env.device), + torch.ones(env.num_envs, device=env.device), torch.zeros(env.num_envs, device=env.device), ) - return reward - def reaching_behind_object_reward( env: EmbodiedEnv, @@ -306,7 +370,41 @@ def reaching_behind_object_reward( distance_scale: float = 5.0, part_name: str = None, ) -> torch.Tensor: - """Reward for reaching behind an object along object-to-goal direction.""" + """Reward for positioning end-effector behind object for pushing. + + Encourages the robot's end-effector to reach a position behind the object along + the object-to-goal direction. Useful for push manipulation tasks. + + Args: + object_cfg: Configuration for the object to push (e.g., {"uid": "cube"}) + target_pose_key: Key in info dict for goal pose (default: "goal_pose") + Can be (num_envs, 3) position or (num_envs, 4, 4) transform + behind_offset: Distance behind object to reach (in meters, default: 0.015) + height_offset: Additional height above object (in meters, default: 0.015) + distance_scale: Scaling factor for tanh function (higher = steeper, default: 5.0) + part_name: Robot part name for FK computation (e.g., "arm") + + Returns: + Reward tensor of shape (num_envs,). Ranges from 0 to 1. + - 1.0: End-effector at ideal pushing position + - 0.0: End-effector far from ideal position + + Example: + ```json + { + "func": "reaching_behind_object_reward", + "weight": 0.1, + "params": { + "object_cfg": {"uid": "cube"}, + "target_pose_key": "goal_pose", + "behind_offset": 0.015, + "height_offset": 0.015, + "distance_scale": 5.0, + "part_name": "arm" + } + } + ``` + """ # get end effector position from robot FK robot = env.robot joint_ids = robot.get_joint_ids(part_name) @@ -362,19 +460,41 @@ def distance_to_target( sigma: float = 1.0, use_xy_only: bool = False, ) -> torch.Tensor: - """Reward based on distance to a virtual target pose from info.""" + """Reward based on absolute distance to a virtual target pose. + + Encourages an object to get closer to a target pose specified in the info dict. + Unlike incremental_distance_to_target, this provides direct distance-based reward. 
+ + Args: + source_entity_cfg: Configuration for the object (e.g., {"uid": "cube"}) + target_pose_key: Key in info dict for target pose (default: "target_pose") + Can be (num_envs, 3) position or (num_envs, 4, 4) transform + exponential: If True, use exponential reward exp(-d²/2σ²), else use -distance + sigma: Standard deviation for exponential reward (default: 1.0) + use_xy_only: If True, ignore z-axis and only consider horizontal distance + + Returns: + Reward tensor of shape (num_envs,). + - Linear mode: -distance (negative, approaches 0 when close) + - Exponential mode: exp(-d²/2σ²) (0 to 1, approaches 1 when close) + + Example: + ```json + { + "func": "distance_to_target", + "weight": 0.5, + "params": { + "source_entity_cfg": {"uid": "cube"}, + "target_pose_key": "goal_pose", + "exponential": false, + "use_xy_only": true + } + } + ``` + """ # get source entity position - source_obj = env.sim[source_entity_cfg.uid] - if hasattr(source_obj, "get_body_pose"): - source_pos = source_obj.get_body_pose(body_ids=source_entity_cfg.body_ids)[ - :, :3, 3 - ] - elif hasattr(source_obj, "get_local_pose"): - source_pos = source_obj.get_local_pose(to_matrix=True)[:, :3, 3] - else: - raise ValueError( - f"Entity '{source_entity_cfg.uid}' does not support position query." - ) + source_obj = env.sim.get_rigid_object(source_entity_cfg.uid) + source_pos = source_obj.get_local_pose(to_matrix=True)[:, :3, 3] # get target position from info if target_pose_key not in info: @@ -418,7 +538,47 @@ def incremental_distance_to_target( negative_weight: float = 1.0, use_xy_only: bool = False, ) -> torch.Tensor: - """Incremental reward for progress toward a virtual target pose from info.""" + """Incremental reward for progress toward a virtual target pose. + + Rewards the robot for getting closer to the target compared to previous timestep. + Stores previous distance in env._reward_states for comparison. Uses tanh shaping + to normalize rewards and supports asymmetric weighting for approach vs. retreat. + + Args: + source_entity_cfg: Configuration for the object (e.g., {"uid": "cube"}) + target_pose_key: Key in info dict for target pose (default: "target_pose") + Can be (num_envs, 3) position or (num_envs, 4, 4) transform + tanh_scale: Scaling for tanh normalization (higher = more sensitive, default: 10.0) + positive_weight: Multiplier for reward when getting closer (default: 1.0) + negative_weight: Multiplier for penalty when moving away (default: 1.0) + use_xy_only: If True, ignore z-axis and only consider horizontal distance + + Returns: + Reward tensor of shape (num_envs,). Zero on first call, then: + - Positive when getting closer (scaled by positive_weight) + - Negative when moving away (scaled by negative_weight) + - Magnitude bounded by tanh function + + Note: + This function maintains state using env._reward_states[f"prev_dist_{uid}_{key}"]. + State is automatically reset when the environment resets. 
+ + Example: + ```json + { + "func": "incremental_distance_to_target", + "weight": 1.0, + "params": { + "source_entity_cfg": {"uid": "cube"}, + "target_pose_key": "goal_pose", + "tanh_scale": 10.0, + "positive_weight": 2.0, + "negative_weight": 0.5, + "use_xy_only": true + } + } + ``` + """ # get source entity position source_obj = env.sim.get_rigid_object(source_entity_cfg.uid) source_pos = source_obj.get_local_pose(to_matrix=True)[:, :3, 3] @@ -443,13 +603,17 @@ def incremental_distance_to_target( current_dist = torch.norm(source_pos - target_pos, dim=-1) # initialize previous distance on first call - prev_dist_key = f"_prev_dist_{source_entity_cfg.uid}_{target_pose_key}" - if not hasattr(env, prev_dist_key): - setattr(env, prev_dist_key, current_dist.clone()) + # Use dictionary-based state management for better organization + if not hasattr(env, "_reward_states"): + env._reward_states = {} + + state_key = f"prev_dist_{source_entity_cfg.uid}_{target_pose_key}" + if state_key not in env._reward_states: + env._reward_states[state_key] = current_dist.clone() return torch.zeros(env.num_envs, device=env.device) # compute distance delta (positive = getting closer) - prev_dist = getattr(env, prev_dist_key) + prev_dist = env._reward_states[state_key] distance_delta = prev_dist - current_dist # apply tanh shaping @@ -463,6 +627,6 @@ def incremental_distance_to_target( ) # update previous distance - setattr(env, prev_dist_key, current_dist.clone()) + env._reward_states[state_key] = current_dist.clone() return reward diff --git a/embodichain/lab/gym/envs/tasks/rl/push_cube.py b/embodichain/lab/gym/envs/tasks/rl/push_cube.py index f26a1fc..072574f 100644 --- a/embodichain/lab/gym/envs/tasks/rl/push_cube.py +++ b/embodichain/lab/gym/envs/tasks/rl/push_cube.py @@ -40,6 +40,11 @@ def __init__(self, cfg=None, **kwargs): super().__init__(cfg, **kwargs) + @property + def goal_pose(self) -> torch.Tensor: + """Get current goal poses (4x4 matrices) for all environments.""" + return self._goal_pose + def _draw_goal_marker(self): """Draw axis marker at goal position for visualization.""" goal_sphere = self.sim.get_rigid_object("goal_sphere") @@ -91,15 +96,17 @@ def get_info(self, **kwargs) -> Dict[str, Any]: cube = self.sim.get_rigid_object("cube") cube_pos = cube.body_data.pose[:, :3] - # Get virtual goal pose from env state (set by randomize_target_pose event) - if hasattr(self, "_goal_poses"): - goal_pos = self._goal_poses[:, :3, 3] + # Get goal position from event-managed goal pose + if self.goal_pose is not None: + goal_pos = self.goal_pose[:, :3, 3] + xy_distance = torch.norm(cube_pos[:, :2] - goal_pos[:, :2], dim=1) + is_success = xy_distance < self.success_threshold else: - # Fallback: no virtual goal set - goal_pos = torch.zeros_like(cube_pos) - - xy_distance = torch.norm(cube_pos[:, :2] - goal_pos[:, :2], dim=1) - is_success = xy_distance < self.success_threshold + # Goal not yet set by randomize_target_pose event (e.g., before first reset) + xy_distance = torch.zeros(self.cfg.num_envs, device=self.device) + is_success = torch.zeros( + self.cfg.num_envs, device=self.device, dtype=torch.bool + ) info = { "success": is_success, @@ -107,7 +114,7 @@ def get_info(self, **kwargs) -> Dict[str, Any]: self.cfg.num_envs, device=self.device, dtype=torch.bool ), "elapsed_steps": self._elapsed_steps, - "goal_pose": self._goal_poses if hasattr(self, "_goal_poses") else None, + "goal_pose": self.goal_pose, } info["metrics"] = { "distance_to_goal": xy_distance, diff --git a/embodichain/lab/gym/utils/gym_utils.py 
b/embodichain/lab/gym/utils/gym_utils.py index 825a83a..cc6781a 100644 --- a/embodichain/lab/gym/utils/gym_utils.py +++ b/embodichain/lab/gym/utils/gym_utils.py @@ -346,6 +346,7 @@ def config_to_cfg(config: dict) -> "EmbodiedEnvCfg": SceneEntityCfg, EventCfg, ObservationCfg, + RewardCfg, DatasetFunctorCfg, ) from embodichain.utils import configclass @@ -455,8 +456,6 @@ class ComponentCfg: raise_if_not_found=True, ) - from embodichain.lab.gym.envs.managers import DatasetFunctorCfg - dataset = DatasetFunctorCfg( func=dataset_func, mode=dataset_params_modified["mode"], @@ -562,8 +561,6 @@ class ComponentCfg: raise_if_not_found=True, ) - from embodichain.lab.gym.envs.managers import RewardCfg - reward = RewardCfg( func=reward_func, mode=reward_params_modified["mode"], From bea6d5e1b5580aab2bba7e78df5386eac723f2a6 Mon Sep 17 00:00:00 2001 From: yuanhaonan Date: Mon, 19 Jan 2026 16:17:30 +0800 Subject: [PATCH 5/9] delete RLenvCfg --- embodichain/lab/gym/envs/rl_env_cfg.py | 33 -------------------------- 1 file changed, 33 deletions(-) delete mode 100644 embodichain/lab/gym/envs/rl_env_cfg.py diff --git a/embodichain/lab/gym/envs/rl_env_cfg.py b/embodichain/lab/gym/envs/rl_env_cfg.py deleted file mode 100644 index 3444874..0000000 --- a/embodichain/lab/gym/envs/rl_env_cfg.py +++ /dev/null @@ -1,33 +0,0 @@ -# ---------------------------------------------------------------------------- -# Copyright (c) 2021-2025 DexForce Technology Co., Ltd. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ---------------------------------------------------------------------------- - -from typing import Any, Dict - -from embodichain.lab.gym.envs.embodied_env import EmbodiedEnvCfg -from embodichain.utils import configclass - - -@configclass -class RLEnvCfg(EmbodiedEnvCfg): - """Extended configuration for RL environments built from gym-style specs.""" - - env_id: str = "" - extensions: Dict[str, Any] = {} - - @classmethod - def from_dict(cls, d): - """Create an instance from a dictionary.""" - return cls(**d) From 69241fbaf5cb62dcbb4a366ec8ebe1db6b614de3 Mon Sep 17 00:00:00 2001 From: yuanhaonan Date: Mon, 19 Jan 2026 18:15:57 +0800 Subject: [PATCH 6/9] update review changes --- configs/agents/rl/push_cube/gym_config.json | 4 - embodichain/lab/gym/envs/managers/cfg.py | 7 -- .../lab/gym/envs/managers/reward_manager.py | 15 +--- embodichain/lab/gym/envs/managers/rewards.py | 76 +++++++++---------- embodichain/lab/gym/utils/gym_utils.py | 1 - 5 files changed, 40 insertions(+), 63 deletions(-) diff --git a/configs/agents/rl/push_cube/gym_config.json b/configs/agents/rl/push_cube/gym_config.json index cd5113b..54283a2 100644 --- a/configs/agents/rl/push_cube/gym_config.json +++ b/configs/agents/rl/push_cube/gym_config.json @@ -72,7 +72,6 @@ "reaching_reward": { "func": "reaching_behind_object_reward", "mode": "add", - "name": "reaching", "weight": 0.1, "params": { "object_cfg": { @@ -88,7 +87,6 @@ "place_reward": { "func": "incremental_distance_to_target", "mode": "add", - "name": "place", "weight": 1.0, "params": { "source_entity_cfg": { @@ -104,14 +102,12 @@ "action_penalty": { "func": "action_smoothness_penalty", "mode": "add", - "name": "action_penalty", "weight": 0.01, "params": {} }, "success_bonus": { "func": "success_reward", "mode": "add", - "name": "success", "weight": 10.0, "params": {} } diff --git a/embodichain/lab/gym/envs/managers/cfg.py b/embodichain/lab/gym/envs/managers/cfg.py index 3161fb8..6aeb280 100644 --- a/embodichain/lab/gym/envs/managers/cfg.py +++ b/embodichain/lab/gym/envs/managers/cfg.py @@ -326,13 +326,6 @@ class RewardCfg(FunctorCfg): - `replace`: The reward replaces the total reward (useful for single reward functions). """ - name: str = MISSING - """The name of the reward term. - - This is used for logging and debugging purposes. The name should be descriptive of what - the reward term represents, e.g., "distance_to_goal", "gripper_close", "collision_penalty". - """ - weight: float = 1.0 """The weight multiplier for this reward term. diff --git a/embodichain/lab/gym/envs/managers/reward_manager.py b/embodichain/lab/gym/envs/managers/reward_manager.py index c034a90..b3e46de 100644 --- a/embodichain/lab/gym/envs/managers/reward_manager.py +++ b/embodichain/lab/gym/envs/managers/reward_manager.py @@ -41,6 +41,8 @@ class RewardManager(ManagerBase): The reward manager offers two modes of operation: - `add`: This mode computes a reward term and adds it to the total reward (weighted by the term's weight). - `replace`: This mode replaces the total reward with the computed value (useful for single reward functions). + + Note: The config key is used as the unique identifier and display name for each reward functor. """ _env: EmbodiedEnv @@ -82,13 +84,8 @@ def __str__(self) -> str: # convert table to string msg += table.get_string() msg += "\n" - return msg - """ - Properties. - """ - @property def active_functors(self) -> dict[str, list[str]]: """Name of active reward functors. 
@@ -97,10 +94,6 @@ def active_functors(self) -> dict[str, list[str]]: """ return self._mode_functor_names - """ - Operations. - """ - def reset(self, env_ids: Union[Sequence[int], None] = None) -> dict[str, float]: """Reset reward terms that are stateful (implemented as classes). @@ -200,10 +193,6 @@ def get_functor_cfg(self, functor_name: str) -> RewardCfg: return self._mode_functor_cfgs[mode][functors.index(functor_name)] logger.log_error(f"Reward functor '{functor_name}' not found.") - """ - Helper functions. - """ - def _prepare_functors(self): # check if config is dict already if isinstance(self.cfg, dict): diff --git a/embodichain/lab/gym/envs/managers/rewards.py b/embodichain/lab/gym/envs/managers/rewards.py index 8b61ec6..c832313 100644 --- a/embodichain/lab/gym/envs/managers/rewards.py +++ b/embodichain/lab/gym/envs/managers/rewards.py @@ -38,21 +38,21 @@ def distance_between_objects( sigma: float = 1.0, ) -> torch.Tensor: """Reward based on distance between two rigid objects. - + Encourages the source object to get closer to the target object. Can use either linear negative distance or exponential Gaussian-shaped reward. - + Args: source_entity_cfg: Configuration for the source object (e.g., {"uid": "cube"}) target_entity_cfg: Configuration for the target object (e.g., {"uid": "goal_sphere"}) exponential: If True, use exponential reward exp(-d²/2σ²), else use -distance sigma: Standard deviation for exponential reward (controls reward spread) - + Returns: Reward tensor of shape (num_envs,). Higher when objects are closer. - Linear mode: ranges from -inf to 0 (0 when objects touch) - Exponential mode: ranges from 0 to 1 (1 when objects touch) - + Example: ```json { @@ -99,21 +99,21 @@ def joint_velocity_penalty( part_name: str | None = None, ) -> torch.Tensor: """Penalize high joint velocities to encourage smooth motion. - + Computes the L2 norm of joint velocities and returns negative value as penalty. Useful for preventing jerky or unstable robot movements. - + Args: robot_uid: Robot entity UID in simulation (default: "robot") joint_ids: Specific joint indices to penalize. Takes priority over part_name. Example: [0, 1, 2] or slice(0, 6) part_name: Control part name (e.g., "arm"). Used only if joint_ids is None. Will penalize all joints in the specified part. - + Returns: Penalty tensor of shape (num_envs,). Always negative or zero. Magnitude increases with joint velocity (larger velocity = more negative). - + Example: ```json { @@ -150,18 +150,18 @@ def action_smoothness_penalty( info: dict, ) -> torch.Tensor: """Penalize large action changes between consecutive timesteps. - + Encourages smooth control commands by penalizing sudden changes in actions. Stores previous action in env._reward_states for comparison. - + Returns: Penalty tensor of shape (num_envs,). Zero on first call (no previous action), negative on subsequent calls (larger change = more negative). - + Note: This function maintains state across calls using env._reward_states['prev_actions']. State is automatically reset when the environment resets. - + Example: ```json { @@ -199,21 +199,21 @@ def joint_limit_penalty( margin: float = 0.1, ) -> torch.Tensor: """Penalize robot joints that are close to their position limits. - + Prevents joints from reaching their physical limits, which can cause instability or singularities. Penalty increases as joints approach limits within the margin. 
- + Args: robot_uid: Robot entity UID in simulation (default: "robot") joint_ids: Joint indices to monitor (default: all joints) margin: Normalized distance threshold (0 to 1). Penalty applied when joint is within this fraction of its range from either limit. Example: 0.1 means penalty when within 10% of limits. - + Returns: Penalty tensor of shape (num_envs,). Always negative or zero. Sum of penalties across all monitored joints. - + Example: ```json { @@ -264,20 +264,20 @@ def orientation_alignment_reward( target_entity_cfg: SceneEntityCfg = None, ) -> torch.Tensor: """Reward rotational alignment between two rigid objects. - + Encourages the source object's orientation to match the target object's orientation. Uses rotation matrix trace to measure alignment. - + Args: source_entity_cfg: Configuration for the source object (e.g., {"uid": "cube"}) target_entity_cfg: Configuration for the target object (e.g., {"uid": "reference"}) - + Returns: Reward tensor of shape (num_envs,). Ranges from -1 to 1. - 1.0: Perfect alignment (same orientation) - 0.0: 90° rotation difference - -1.0: 180° rotation difference (opposite orientation) - + Example: ```json { @@ -315,21 +315,21 @@ def success_reward( obs: dict, action: torch.Tensor, info: dict, - ) -> torch.Tensor: +) -> torch.Tensor: """Sparse bonus reward when task succeeds. - + Provides a fixed reward when the task success condition is met. Reads success status from info['success'] which should be set by the environment. - + Returns: Reward tensor of shape (num_envs,). - 1.0 when successful - 0.0 when not successful or if 'success' key missing - + Note: The environment's get_info() must populate info['success'] with a boolean tensor indicating success status for each environment. - + Example: ```json { @@ -371,10 +371,10 @@ def reaching_behind_object_reward( part_name: str = None, ) -> torch.Tensor: """Reward for positioning end-effector behind object for pushing. - + Encourages the robot's end-effector to reach a position behind the object along the object-to-goal direction. Useful for push manipulation tasks. - + Args: object_cfg: Configuration for the object to push (e.g., {"uid": "cube"}) target_pose_key: Key in info dict for goal pose (default: "goal_pose") @@ -383,12 +383,12 @@ def reaching_behind_object_reward( height_offset: Additional height above object (in meters, default: 0.015) distance_scale: Scaling factor for tanh function (higher = steeper, default: 5.0) part_name: Robot part name for FK computation (e.g., "arm") - + Returns: Reward tensor of shape (num_envs,). Ranges from 0 to 1. - 1.0: End-effector at ideal pushing position - 0.0: End-effector far from ideal position - + Example: ```json { @@ -461,10 +461,10 @@ def distance_to_target( use_xy_only: bool = False, ) -> torch.Tensor: """Reward based on absolute distance to a virtual target pose. - + Encourages an object to get closer to a target pose specified in the info dict. Unlike incremental_distance_to_target, this provides direct distance-based reward. - + Args: source_entity_cfg: Configuration for the object (e.g., {"uid": "cube"}) target_pose_key: Key in info dict for target pose (default: "target_pose") @@ -472,12 +472,12 @@ def distance_to_target( exponential: If True, use exponential reward exp(-d²/2σ²), else use -distance sigma: Standard deviation for exponential reward (default: 1.0) use_xy_only: If True, ignore z-axis and only consider horizontal distance - + Returns: Reward tensor of shape (num_envs,). 
        - Linear mode: -distance (negative, approaches 0 when close)
        - Exponential mode: exp(-d²/2σ²) (0 to 1, approaches 1 when close)
-
+
     Example:
         ```json
         {
@@ -539,11 +539,11 @@ def incremental_distance_to_target(
     use_xy_only: bool = False,
 ) -> torch.Tensor:
     """Incremental reward for progress toward a virtual target pose.
-
+
     Rewards the robot for getting closer to the target compared to previous timestep.
     Stores previous distance in env._reward_states for comparison. Uses tanh shaping
     to normalize rewards and supports asymmetric weighting for approach vs. retreat.
-
+
     Args:
         source_entity_cfg: Configuration for the object (e.g., {"uid": "cube"})
         target_pose_key: Key in info dict for target pose (default: "target_pose")
@@ -552,17 +552,17 @@ def incremental_distance_to_target(
         positive_weight: Multiplier for reward when getting closer (default: 1.0)
         negative_weight: Multiplier for penalty when moving away (default: 1.0)
         use_xy_only: If True, ignore z-axis and only consider horizontal distance
-
+
     Returns:
         Reward tensor of shape (num_envs,). Zero on first call, then:
         - Positive when getting closer (scaled by positive_weight)
         - Negative when moving away (scaled by negative_weight)
         - Magnitude bounded by tanh function
-
+
     Note:
         This function maintains state using env._reward_states[f"prev_dist_{uid}_{key}"].
         State is automatically reset when the environment resets.
-
+
     Example:
         ```json
         {
diff --git a/embodichain/lab/gym/utils/gym_utils.py b/embodichain/lab/gym/utils/gym_utils.py
index cc6781a..dff96e9 100644
--- a/embodichain/lab/gym/utils/gym_utils.py
+++ b/embodichain/lab/gym/utils/gym_utils.py
@@ -564,7 +564,6 @@ class ComponentCfg:
             reward = RewardCfg(
                 func=reward_func,
                 mode=reward_params_modified["mode"],
-                name=reward_params_modified["name"],
                 params=reward_params_modified["params"],
             )
 

From f05a1eb852f0ea6c0d2ccafedc1545503b5e4fbb Mon Sep 17 00:00:00 2001
From: Chen Jian
Date: Mon, 19 Jan 2026 18:18:25 +0800
Subject: [PATCH 7/9] multiple simulation manager (#74)

Co-authored-by: chenjian
---
 docs/source/overview/sim/sim_manager.md |  4 ++
 embodichain/lab/sim/sim_manager.py      | 86 ++++++++++++++++---------
 2 files changed, 58 insertions(+), 32 deletions(-)

diff --git a/docs/source/overview/sim/sim_manager.md b/docs/source/overview/sim/sim_manager.md
index d2f19be..d49188c 100644
--- a/docs/source/overview/sim/sim_manager.md
+++ b/docs/source/overview/sim/sim_manager.md
@@ -26,6 +26,10 @@ sim_config = SimulationManagerCfg(
 )
 ```
 
+### Multiple Instances
+- `SimulationManager.get_n_instances()` returns the number of `SimulationManager` instances created so far.
+- `SimulationManager.get_instance(instance_id)` returns a specific instance; `instance_id` must be less than `SimulationManager.get_n_instances()`.
+
 ### Configuration Parameters
 
 | Parameter | Type | Default | Description |
diff --git a/embodichain/lab/sim/sim_manager.py b/embodichain/lab/sim/sim_manager.py
index 1f9c662..48343d4 100644
--- a/embodichain/lab/sim/sim_manager.py
+++ b/embodichain/lab/sim/sim_manager.py
@@ -158,14 +158,11 @@ class SimulationManager:
     - physics simulation management, eg. time step, manual update, etc.
     - interactive control via gizmo and window callbacks events.
 
-    This class implements the singleton pattern to ensure only one instance exists at a time.
-
    Args:
        sim_config (SimulationManagerCfg, optional): simulation configuration. Defaults to SimulationManagerCfg(). 
""" - _instance = None - _is_initialized = False + _instances = {} SUPPORTED_SENSOR_TYPES = { "Camera": Camera, @@ -173,26 +170,40 @@ class SimulationManager: "ContactSensor": ContactSensor, } - def __new__(cls, sim_config: SimulationManagerCfg = SimulationManagerCfg()): - """Create or return the singleton instance.""" - if cls._instance is None: - cls._instance = super(SimulationManager, cls).__new__(cls) - return cls._instance + def __new__( + cls, + sim_config: SimulationManagerCfg = SimulationManagerCfg() + ): + """Create or return the instance based on instance_id.""" + n_instance = len(list(cls._instances.keys())) + instance = super(SimulationManager, cls).__new__(cls) + # Store sim_config in the instance for use in __init__ or elsewhere + instance.sim_config = sim_config + cls._instances[str(n_instance + 1)] = instance + return instance def __init__( - self, sim_config: SimulationManagerCfg = SimulationManagerCfg() + self, + sim_config: SimulationManagerCfg = SimulationManagerCfg() ) -> None: + instance_id = SimulationManager.get_n_instances() + 1 # Skip initialization if already initialized - if self._is_initialized: + if hasattr(self, "_is_initialized") and self._is_initialized: logger.log_warning( - "SimulationManager is already initialized. Skipping re-initialization. " - "Use SimulationManager.get_instance() to get the existing instance or " - "SimulationManager.reset() to create a new instance." + f"SimulationManager (id={instance_id}) is already initialized. Skipping re-initialization. " + "Use SimulationManager.get_instance(instance_id) to get the existing instance or " + "SimulationManager.reset(instance_id) to create a new instance." ) return + if sim_config.enable_rt and instance_id > 0: + logger.log_error( + f"Ray Tracing rendering backend is only supported for single instance (instance_id=0). " + ) + # Mark as initialized - SimulationManager._is_initialized = True + self._is_initialized = True + self.instance_id = instance_id # Cache paths self._sim_cache_dir = SIM_CACHE_DIR self._material_cache_dir = MATERIAL_CACHE_DIR @@ -278,41 +289,52 @@ def __init__( self._build_multiple_arenas(sim_config.num_envs) @classmethod - def get_instance(cls) -> SimulationManager: - """Get the singleton instance of SimulationManager. + def get_instance(cls, instance_id: int = 0) -> SimulationManager: + """Get the instance of SimulationManager by id. + + Args: + instance_id (int): The instance id. Defaults to 0. Returns: - SimulationManager: The singleton instance. + SimulationManager: The instance. Raises: RuntimeError: If the instance has not been created yet. """ - if cls._instance is None: + if instance_id not in cls._instances: logger.log_error( - "SimulationManager has not been instantiated yet. " - "Create an instance first using SimulationManager(sim_config)." + f"SimulationManager (id={instance_id}) has not been instantiated yet. " + f"Create an instance first using SimulationManager(sim_config, instance_id={instance_id})." ) - return cls._instance + return cls._instances[instance_id] + + @classmethod + def get_n_instances(cls) -> int: + """Get the number of instantiated SimulationManager instances. + + Returns: + int: The number of instances. + """ + return len(cls._instances) @classmethod - def reset(cls) -> None: - """Reset the singleton instance. + def reset(cls, instance_id: int = 0) -> None: + """Reset the instance. This allows creating a new instance with different configuration. 
""" - if cls._instance is not None: - logger.log_info("Resetting SimulationManager singleton instance.") - cls._instance = None - cls._is_initialized = False + if instance_id in cls._instances: + logger.log_info(f"Resetting SimulationManager instance {instance_id}.") + del cls._instances[instance_id] @classmethod - def is_instantiated(cls) -> bool: - """Check if the singleton instance has been created. + def is_instantiated(cls, instance_id: int = 0) -> bool: + """Check if the instance has been created. Returns: bool: True if the instance exists, False otherwise. """ - return cls._instance is not None + return instance_id in cls._instances @property def num_envs(self) -> int: @@ -1560,4 +1582,4 @@ def destroy(self) -> None: self._env.clean() self._world.quit() - self.reset() + SimulationManager.reset(self.instance_id) \ No newline at end of file From 16004fd33b18510bf065411ad1cb6d1fe48d0b9e Mon Sep 17 00:00:00 2001 From: yuecideng Date: Mon, 19 Jan 2026 19:23:10 +0800 Subject: [PATCH 8/9] wip --- embodichain/lab/sim/sim_manager.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/embodichain/lab/sim/sim_manager.py b/embodichain/lab/sim/sim_manager.py index 48343d4..799013f 100644 --- a/embodichain/lab/sim/sim_manager.py +++ b/embodichain/lab/sim/sim_manager.py @@ -170,10 +170,7 @@ class SimulationManager: "ContactSensor": ContactSensor, } - def __new__( - cls, - sim_config: SimulationManagerCfg = SimulationManagerCfg() - ): + def __new__(cls, sim_config: SimulationManagerCfg = SimulationManagerCfg()): """Create or return the instance based on instance_id.""" n_instance = len(list(cls._instances.keys())) instance = super(SimulationManager, cls).__new__(cls) @@ -183,8 +180,7 @@ def __new__( return instance def __init__( - self, - sim_config: SimulationManagerCfg = SimulationManagerCfg() + self, sim_config: SimulationManagerCfg = SimulationManagerCfg() ) -> None: instance_id = SimulationManager.get_n_instances() + 1 # Skip initialization if already initialized @@ -1582,4 +1578,4 @@ def destroy(self) -> None: self._env.clean() self._world.quit() - SimulationManager.reset(self.instance_id) \ No newline at end of file + SimulationManager.reset(self.instance_id) From ba6ea10aff6a2def561e69471fed4c30adcd356f Mon Sep 17 00:00:00 2001 From: yuecideng Date: Mon, 19 Jan 2026 19:29:04 +0800 Subject: [PATCH 9/9] wip --- configs/agents/rl/push_cube/gym_config.json | 2 +- embodichain/lab/gym/envs/base_env.py | 7 +++---- embodichain/lab/gym/envs/managers/rewards.py | 8 ++++---- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/configs/agents/rl/push_cube/gym_config.json b/configs/agents/rl/push_cube/gym_config.json index 54283a2..766d12c 100644 --- a/configs/agents/rl/push_cube/gym_config.json +++ b/configs/agents/rl/push_cube/gym_config.json @@ -70,7 +70,7 @@ }, "rewards": { "reaching_reward": { - "func": "reaching_behind_object_reward", + "func": "reaching_behind_object", "mode": "add", "weight": 0.1, "params": { diff --git a/embodichain/lab/gym/envs/base_env.py b/embodichain/lab/gym/envs/base_env.py index 2ca18d7..9974360 100644 --- a/embodichain/lab/gym/envs/base_env.py +++ b/embodichain/lab/gym/envs/base_env.py @@ -480,10 +480,6 @@ def get_reward( rewards = torch.zeros(self.num_envs, dtype=torch.float32, device=self.device) - rewards = self._extend_reward( - rewards=rewards, obs=obs, action=action, info=info - ) - return rewards def _step_action(self, action: EnvAction) -> EnvAction: @@ -548,6 +544,9 @@ def step( obs = self.get_obs(**kwargs) info = 
self.get_info(**kwargs) rewards = self.get_reward(obs=obs, action=action, info=info) + rewards = self._extend_reward( + rewards=rewards, obs=obs, action=action, info=info + ) terminateds = torch.logical_or( info.get( diff --git a/embodichain/lab/gym/envs/managers/rewards.py b/embodichain/lab/gym/envs/managers/rewards.py index c832313..5cfbef5 100644 --- a/embodichain/lab/gym/envs/managers/rewards.py +++ b/embodichain/lab/gym/envs/managers/rewards.py @@ -255,7 +255,7 @@ def joint_limit_penalty( return penalty.sum(dim=-1) -def orientation_alignment_reward( +def orientation_alignment( env: EmbodiedEnv, obs: dict, action: torch.Tensor, @@ -281,7 +281,7 @@ def orientation_alignment_reward( Example: ```json { - "func": "orientation_alignment_reward", + "func": "orientation_alignment", "weight": 0.5, "params": { "source_entity_cfg": {"uid": "object"}, @@ -358,7 +358,7 @@ def success_reward( ) -def reaching_behind_object_reward( +def reaching_behind_object( env: EmbodiedEnv, obs: dict, action: torch.Tensor, @@ -392,7 +392,7 @@ def reaching_behind_object_reward( Example: ```json { - "func": "reaching_behind_object_reward", + "func": "reaching_behind_object", "weight": 0.1, "params": { "object_cfg": {"uid": "cube"},