From 7eeaeab7bb997e07c43cb6d317eac17dd07ad3a8 Mon Sep 17 00:00:00 2001 From: yuanhaonan Date: Fri, 16 Jan 2026 14:24:01 +0800 Subject: [PATCH 1/9] add reward manager --- configs/agents/rl/push_cube/gym_config.json | 117 ++++- configs/agents/rl/push_cube/train_config.json | 2 +- embodichain/lab/gym/envs/base_env.py | 32 +- embodichain/lab/gym/envs/embodied_env.py | 32 ++ embodichain/lab/gym/envs/managers/__init__.py | 2 + embodichain/lab/gym/envs/managers/cfg.py | 23 + .../lab/gym/envs/managers/observations.py | 13 + .../envs/managers/randomization/spatial.py | 83 +++ .../lab/gym/envs/managers/reward_manager.py | 241 +++++++++ embodichain/lab/gym/envs/managers/rewards.py | 482 ++++++++++++++++++ .../lab/gym/envs/tasks/rl/push_cube.py | 91 +--- embodichain/lab/gym/utils/gym_utils.py | 44 ++ 12 files changed, 1056 insertions(+), 106 deletions(-) create mode 100644 embodichain/lab/gym/envs/managers/reward_manager.py create mode 100644 embodichain/lab/gym/envs/managers/rewards.py diff --git a/configs/agents/rl/push_cube/gym_config.json b/configs/agents/rl/push_cube/gym_config.json index b78e7f3..b4a5cd0 100644 --- a/configs/agents/rl/push_cube/gym_config.json +++ b/configs/agents/rl/push_cube/gym_config.json @@ -13,20 +13,107 @@ "position_range": [[-0.2, -0.2, 0.0], [0.2, 0.2, 0.0]], "relative_position": true } + }, + "randomize_goal": { + "func": "randomize_target_pose", + "mode": "reset", + "params": { + "position_range": [[-0.3, -0.3, 0.05], [0.3, 0.3, 0.05]], + "relative_position": false, + "store_key": "goal_pose" + } + } + }, + "observations": { + "robot_qpos": { + "func": "embodichain.lab.gym.envs.managers.observations:normalize_robot_joint_data", + "mode": "concatenate", + "name": "robot/qpos", + "params": { + "robot_uid": "Manipulator", + "joint_ids": [0, 1, 2, 3, 4, 5], + "data_type": "qpos" + } + }, + "robot_ee_pos": { + "func": "embodichain.lab.gym.envs.managers.observations:get_robot_ee_pose", + "mode": "concatenate", + "name": "robot/ee_pos", + "params": { + "robot_uid": "Manipulator", + "part_name": "arm" + } + }, + "cube_pos": { + "func": "embodichain.lab.gym.envs.managers.observations:get_rigid_object_pose", + "mode": "concatenate", + "name": "object/cube_pos", + "params": { + "entity_cfg": {"uid": "cube"} + } + }, + "goal_pos": { + "func": "embodichain.lab.gym.envs.managers.observations:virtual_target_position", + "mode": "concatenate", + "name": "object/goal_pos", + "params": { + "target_pose_key": "goal_pose" + } + } + }, + "rewards": { + "reaching_reward": { + "func": "reaching_behind_object_reward", + "mode": "add", + "name": "reaching", + "params": { + "weight": 0.1, + "end_effector_cfg": {"uid": "Manipulator", "body_ids": "ee_link"}, + "object_cfg": {"uid": "cube"}, + "goal_cfg": {"uid": "goal_sphere"}, + "behind_offset": 0.015, + "height_offset": 0.015, + "distance_scale": 5.0 + } + }, + "place_reward": { + "func": "incremental_distance_to_target", + "mode": "add", + "name": "place", + "params": { + "weight": 1.0, + "source_entity_cfg": {"uid": "cube"}, + "target_pose_key": "goal_pose", + "tanh_scale": 10.0, + "positive_weight": 2.0, + "negative_weight": 0.5, + "use_xy_only": true + } + }, + "action_penalty": { + "func": "action_smoothness_penalty", + "mode": "add", + "name": "action_penalty", + "params": { + "weight": 0.01 + } + }, + "success_bonus": { + "func": "success_reward", + "mode": "add", + "name": "success", + "params": { + "weight": 10.0, + "reward_value": 1.0 + } } }, - "observations": {}, "extensions": { "obs_mode": "state", "episode_length": 100, 
"joint_limits": 0.5, "action_scale": 0.1, - "success_threshold": 0.1, - "reaching_reward_weight": 0.1, - "place_reward_weight": 2.0, - "place_penalty_weight": 0.5, - "action_penalty_weight": 0.01, - "success_bonus_weight": 10.0 + "success_threshold": 0.1 } }, "robot": { @@ -85,21 +172,7 @@ "sensor": [], "light": { }, - "background": [ - { - "uid": "goal_sphere", - "shape": { - "shape_type": "Sphere", - "radius": 0.02 - }, - "body_type": "kinematic", - "init_pos": [-0.9, -0.6, 0.05], - "attrs": { - "enable_collision": false, - "mass": 0.0 - } - } - ], + "background": [], "rigid_object": [ { "uid": "cube", diff --git a/configs/agents/rl/push_cube/train_config.json b/configs/agents/rl/push_cube/train_config.json index f1558fd..7e94155 100644 --- a/configs/agents/rl/push_cube/train_config.json +++ b/configs/agents/rl/push_cube/train_config.json @@ -7,7 +7,7 @@ "headless": true, "iterations": 1000, "rollout_steps": 1024, - "eval_freq": 2, + "eval_freq": 200, "save_freq": 200, "use_wandb": false, "wandb_project_name": "embodychain-push_cube", diff --git a/embodichain/lab/gym/envs/base_env.py b/embodichain/lab/gym/envs/base_env.py index 15f5d3b..72e1f40 100644 --- a/embodichain/lab/gym/envs/base_env.py +++ b/embodichain/lab/gym/envs/base_env.py @@ -397,6 +397,30 @@ def check_truncated(self, obs: EnvObs, info: Dict[str, Any]) -> torch.Tensor: """ return torch.zeros(self.num_envs, dtype=torch.bool, device=self.device) + def _extend_reward( + self, + rewards: torch.Tensor, + obs: EnvObs, + action: EnvAction, + info: Dict[str, Any], + **kwargs, + ) -> torch.Tensor: + """Extend the reward computation. + + Overwrite this function to extend or modify the reward computation. + + Args: + rewards: The base reward tensor. + obs: The observation from the environment. + action: The action applied to the robot agent. + info: The info dictionary. + **kwargs: Additional keyword arguments. + + Returns: + The extended reward tensor. + """ + return rewards + def get_reward( self, obs: EnvObs, @@ -417,7 +441,13 @@ def get_reward( The reward for the current step. """ - return torch.zeros(self.num_envs, dtype=torch.float32, device=self.device) + rewards = torch.zeros(self.num_envs, dtype=torch.float32, device=self.device) + + rewards = self._extend_reward( + rewards=rewards, obs=obs, action=action, info=info + ) + + return rewards def _step_action(self, action: EnvAction) -> EnvAction: """Set action control command into simulation. diff --git a/embodichain/lab/gym/envs/embodied_env.py b/embodichain/lab/gym/envs/embodied_env.py index 8ce3134..c5e04fa 100644 --- a/embodichain/lab/gym/envs/embodied_env.py +++ b/embodichain/lab/gym/envs/embodied_env.py @@ -42,6 +42,7 @@ from embodichain.lab.gym.envs.managers import ( EventManager, ObservationManager, + RewardManager, DatasetManager, ) from embodichain.lab.gym.utils.registration import register_env @@ -91,6 +92,13 @@ class EnvLightCfg: Please refer to the :class:`embodichain.lab.gym.managers.ObservationManager` class for more details. """ + rewards: Union[object, None] = None + """Reward settings. Defaults to None, in which case no reward computation is performed through + the reward manager. + + Please refer to the :class:`embodichain.lab.gym.managers.RewardManager` class for more details. + """ + dataset: Union[object, None] = None """Dataset settings. Defaults to None, in which case no dataset collection is performed. 
@@ -176,6 +184,9 @@ def _init_sim_state(self, **kwargs): if self.cfg.observations: self.observation_manager = ObservationManager(self.cfg.observations, self) + if self.cfg.rewards: + self.reward_manager = RewardManager(self.cfg.rewards, self) + if self.cfg.dataset: self.dataset_manager = DatasetManager(self.cfg.dataset, self) @@ -329,6 +340,23 @@ def _extend_obs(self, obs: EnvObs, **kwargs) -> EnvObs: obs = self.observation_manager.compute(obs) return obs + def _extend_reward( + self, + rewards: torch.Tensor, + obs: EnvObs, + action: EnvAction, + info: Dict[str, Any], + **kwargs, + ) -> torch.Tensor: + if self.reward_manager: + rewards, reward_info = self.reward_manager.compute( + obs=obs, action=action, info=info + ) + # Add individual reward terms to info for logging + for term_name, term_value in reward_info.items(): + info[f"reward/{term_name}"] = term_value + return rewards + def _prepare_scene(self, **kwargs) -> None: self._setup_lights() self._setup_background() @@ -352,6 +380,10 @@ def _initialize_episode( if "reset" in self.event_manager.available_modes: self.event_manager.apply(mode="reset", env_ids=env_ids) + # reset reward manager for environments that need a reset + if self.cfg.rewards: + self.reward_manager.reset(env_ids=env_ids) + def _step_action(self, action: EnvAction) -> EnvAction: """Set action control command into simulation. diff --git a/embodichain/lab/gym/envs/managers/__init__.py b/embodichain/lab/gym/envs/managers/__init__.py index e38f4f2..88c96ef 100644 --- a/embodichain/lab/gym/envs/managers/__init__.py +++ b/embodichain/lab/gym/envs/managers/__init__.py @@ -19,10 +19,12 @@ SceneEntityCfg, EventCfg, ObservationCfg, + RewardCfg, DatasetFunctorCfg, ) from .manager_base import Functor, ManagerBase from .event_manager import EventManager from .observation_manager import ObservationManager +from .reward_manager import RewardManager from .dataset_manager import DatasetManager from .datasets import * diff --git a/embodichain/lab/gym/envs/managers/cfg.py b/embodichain/lab/gym/envs/managers/cfg.py index 07888c9..8da8f4e 100644 --- a/embodichain/lab/gym/envs/managers/cfg.py +++ b/embodichain/lab/gym/envs/managers/cfg.py @@ -311,6 +311,29 @@ def _resolve_body_names(self, scene: SimulationManager): self.body_names = [entity.body_names[i] for i in self.body_ids] +@configclass +class RewardCfg(FunctorCfg): + """Configuration for a reward functor. + + The reward functor is used to compute rewards for the environment. The `mode` attribute + determines how the reward is combined with existing rewards. + """ + + mode: Literal["add", "replace"] = "add" + """The mode for the reward computation. + + - `add`: The reward is added to the existing total reward. + - `replace`: The reward replaces the total reward (useful for single reward functions). + """ + + name: str = MISSING + """The name of the reward term. + + This is used for logging and debugging purposes. The name should be descriptive of what + the reward term represents, e.g., "distance_to_goal", "gripper_close", "collision_penalty". + """ + + @configclass class DatasetFunctorCfg(FunctorCfg): """Configuration for dataset collection functors. 
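[Editor's note] A minimal sketch of how terms configured through `RewardCfg` combine, assuming the semantics stated in the docstrings above (weighted sum for mode "add", overwrite for mode "replace"). `TermCfg`, `combine`, and the two term functions are plain stand-ins for illustration, not the embodichain configclass or manager.

# Illustrative stand-in for RewardCfg-style terms and the manager's combination rule.
from dataclasses import dataclass, field
from typing import Callable
import torch

@dataclass
class TermCfg:
    func: Callable
    name: str
    mode: str = "add"      # "add" -> weighted sum, "replace" -> overwrite total
    weight: float = 1.0
    params: dict = field(default_factory=dict)

def combine(terms, num_envs, **ctx):
    total = torch.zeros(num_envs)
    log = {}
    for cfg in terms:
        term = cfg.func(num_envs=num_envs, **cfg.params, **ctx)
        if cfg.mode == "add":
            total = total + cfg.weight * term
        elif cfg.mode == "replace":
            total = cfg.weight * term
        else:
            raise ValueError(f"Unsupported reward mode '{cfg.mode}'.")
        log[cfg.name] = term  # unweighted value, kept for logging
    return total, log

def distance_term(num_envs, dist, **_):
    return -dist                  # dense shaping

def success_term(num_envs, success, **_):
    return success.float()        # sparse bonus

terms = [
    TermCfg(func=distance_term, name="place", weight=1.0),
    TermCfg(func=success_term, name="success", weight=10.0),
]
dist = torch.tensor([0.30, 0.05])
total, log = combine(terms, num_envs=2, dist=dist, success=dist < 0.1)
print(total, log)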
diff --git a/embodichain/lab/gym/envs/managers/observations.py b/embodichain/lab/gym/envs/managers/observations.py index d7ae016..e54f362 100644 --- a/embodichain/lab/gym/envs/managers/observations.py +++ b/embodichain/lab/gym/envs/managers/observations.py @@ -713,3 +713,16 @@ def __call__( exteroception[sensor_uid] = projected_kpnts return exteroception + + +def virtual_target_position( + env: "EmbodiedEnv", + obs: EnvObs, + target_pose_key: str = "goal_pose", +) -> torch.Tensor: + """Get virtual target position from env state.""" + state_attr = f"_{target_pose_key}s" + if hasattr(env, state_attr): + target_poses = getattr(env, state_attr) + return target_poses[:, :3, 3] + return torch.zeros(env.num_envs, 3, device=env.device) diff --git a/embodichain/lab/gym/envs/managers/randomization/spatial.py b/embodichain/lab/gym/envs/managers/randomization/spatial.py index 2437e88..6c3f92f 100644 --- a/embodichain/lab/gym/envs/managers/randomization/spatial.py +++ b/embodichain/lab/gym/envs/managers/randomization/spatial.py @@ -266,3 +266,86 @@ def randomize_robot_qpos( robot.set_qpos(qpos=current_qpos, env_ids=env_ids, joint_ids=joint_ids) env.sim.update(step=100) + + +def randomize_target_pose( + env: EmbodiedEnv, + env_ids: Union[torch.Tensor, None], + position_range: tuple[list[float], list[float]], + rotation_range: tuple[list[float], list[float]] | None = None, + relative_position: bool = False, + relative_rotation: bool = False, + reference_entity_cfg: SceneEntityCfg | None = None, + store_key: str = "target_pose", +) -> None: + """Randomize a virtual target pose and store in env state for use in get_info(). + + This function generates random target poses without requiring a physical object in the scene. + The generated poses are stored in env and should be exposed in get_info() for reward functors. + + Args: + env (EmbodiedEnv): The environment instance. + env_ids (Union[torch.Tensor, None]): The environment IDs to apply the randomization. + position_range (tuple[list[float], list[float]]): The range for the position randomization. + rotation_range (tuple[list[float], list[float]] | None): The range for the rotation randomization. + The rotation is represented as Euler angles (roll, pitch, yaw) in degree. + relative_position (bool): Whether to randomize the position relative to a reference entity. Default is False. + relative_rotation (bool): Whether to randomize the rotation relative to a reference entity. Default is False. + reference_entity_cfg (SceneEntityCfg | None): The reference entity for relative randomization. + If None and relative mode is True, uses world origin. + store_key (str): The key to store the target pose in env state. Default is "target_pose". + The pose will be stored in env._{store_key}s and should be exposed in info[store_key]. 
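[Editor's note] As a concrete illustration of the storage convention described in the docstring that follows (store_key "goal_pose" leading to a batched 4x4 buffer on env._goal_poses), here is a standalone sketch. The bare SimpleNamespace object stands in for the real environment; only the buffer shape and the xyz read-back are taken from the patch.

# Sketch of the virtual-target storage convention: env._{store_key}s as (num_envs, 4, 4) poses.
import torch
from types import SimpleNamespace

env = SimpleNamespace(num_envs=3, device="cpu")   # stand-in for the real env
store_key = "goal_pose"
state_attr = f"_{store_key}s"

# Lazily create the (num_envs, 4, 4) buffer, as randomize_target_pose does.
if not hasattr(env, state_attr):
    setattr(env, state_attr, torch.eye(4).repeat(env.num_envs, 1, 1))

# Write a randomized translation for a subset of environments.
env_ids = torch.tensor([0, 2])
poses = getattr(env, state_attr)
poses[env_ids, :3, 3] = torch.rand(len(env_ids), 3) * 0.6 - 0.3

# Observation and reward functors read the xyz translation back out.
goal_pos = getattr(env, state_attr)[:, :3, 3]     # shape: (num_envs, 3)
print(goal_pos)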
+ """ + num_instance = len(env_ids) + + # Get reference pose if needed + if relative_position or relative_rotation: + if reference_entity_cfg is not None: + # Get reference entity pose + ref_obj = env.sim.get_rigid_object(reference_entity_cfg.uid) + if ref_obj is not None: + ref_pose = ref_obj.get_local_pose(to_matrix=True)[env_ids] + init_pos = ref_pose[:, :3, 3] + init_rot = ref_pose[:, :3, :3] + else: + # Fallback to world origin + init_pos = torch.zeros(num_instance, 3, device=env.device) + init_rot = ( + torch.eye(3, device=env.device) + .unsqueeze(0) + .repeat(num_instance, 1, 1) + ) + else: + # Use world origin as reference + init_pos = torch.zeros(num_instance, 3, device=env.device) + init_rot = ( + torch.eye(3, device=env.device).unsqueeze(0).repeat(num_instance, 1, 1) + ) + else: + # Absolute randomization, init values won't be used + init_pos = torch.zeros(num_instance, 3, device=env.device) + init_rot = ( + torch.eye(3, device=env.device).unsqueeze(0).repeat(num_instance, 1, 1) + ) + + # Generate random pose + pose = get_random_pose( + init_pos=init_pos, + init_rot=init_rot, + position_range=position_range, + rotation_range=rotation_range, + relative_position=relative_position, + relative_rotation=relative_rotation, + ) + + # Store in env state (to be exposed via get_info) + state_attr = f"_{store_key}s" + if not hasattr(env, state_attr): + setattr( + env, + state_attr, + torch.zeros(env.num_envs, 4, 4, device=env.device, dtype=torch.float32), + ) + + target_poses = getattr(env, state_attr) + target_poses[env_ids] = pose diff --git a/embodichain/lab/gym/envs/managers/reward_manager.py b/embodichain/lab/gym/envs/managers/reward_manager.py new file mode 100644 index 0000000..46d74d3 --- /dev/null +++ b/embodichain/lab/gym/envs/managers/reward_manager.py @@ -0,0 +1,241 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2025 DexForce Technology Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ---------------------------------------------------------------------------- + +"""Reward manager for orchestrating reward computation in reinforcement learning tasks.""" + +from __future__ import annotations + +import inspect +import torch +from collections.abc import Sequence +from prettytable import PrettyTable +from typing import TYPE_CHECKING, Union + +from embodichain.utils import logger +from .manager_base import ManagerBase +from .cfg import RewardCfg + +if TYPE_CHECKING: + from embodichain.lab.gym.envs import EmbodiedEnv + + +class RewardManager(ManagerBase): + """Manager for orchestrating reward computation in reinforcement learning tasks. + + The reward manager computes rewards based on the current state of the environment and actions. + It supports multiple reward terms that can be combined through weighted summation. + + The reward manager offers two modes of operation: + - `add`: This mode computes a reward term and adds it to the total reward (weighted by the term's weight). 
+ - `replace`: This mode replaces the total reward with the computed value (useful for single reward functions). + """ + + _env: EmbodiedEnv + """The environment instance.""" + + def __init__(self, cfg: object, env: EmbodiedEnv): + """Initialize the reward manager. + + Args: + cfg: A configuration object or dictionary (``dict[str, RewardCfg]``). + env: An environment object. + """ + + self._mode_functor_names: dict[str, list[str]] = dict() + self._mode_functor_cfgs: dict[str, list[RewardCfg]] = dict() + self._mode_class_functor_cfgs: dict[str, list[RewardCfg]] = dict() + + # call the base class (this will parse the functors config) + super().__init__(cfg, env) + + def __str__(self) -> str: + """Returns: A string representation for reward manager.""" + functor_num = sum(len(v) for v in self._mode_functor_names.values()) + msg = f" contains {functor_num} active reward terms.\n" + + # add info on each mode + for mode in self._mode_functor_names: + # create table for functor information + table = PrettyTable() + table.title = f"Active Reward Terms in Mode: '{mode}'" + + table.field_names = ["Index", "Name", "Weight"] + table.align["Name"] = "l" + for index, name in enumerate(self._mode_functor_names[mode]): + functor_cfg = self._mode_functor_cfgs[mode][index] + weight = functor_cfg.params.get("weight", 1.0) + table.add_row([index, name, f"{weight:.3f}"]) + + # convert table to string + msg += table.get_string() + msg += "\n" + + return msg + + """ + Properties. + """ + + @property + def active_functors(self) -> dict[str, list[str]]: + """Name of active reward functors. + + The keys are the modes of reward computation and the values are the names of the reward functors. + """ + return self._mode_functor_names + + """ + Operations. + """ + + def reset(self, env_ids: Union[Sequence[int], None] = None) -> dict[str, float]: + """Reset reward terms that are stateful (implemented as classes). + + Args: + env_ids: The environment indices to reset. If None, all environments are reset. + + Returns: + An empty dictionary (no logging needed for reset). + """ + # call all functors that are classes + for mode_cfg in self._mode_class_functor_cfgs.values(): + for functor_cfg in mode_cfg: + functor_cfg.func.reset(env_ids=env_ids) + + # nothing to log here + return {} + + def compute( + self, + obs: "EnvObs", + action: "EnvAction", + info: dict, + ) -> tuple[torch.Tensor, dict[str, torch.Tensor]]: + """Compute the total reward by calling each reward functor. + + This function iterates over all the reward functors and calls them to compute individual + reward terms. The terms are then combined according to their mode and weight. + + Args: + obs: The observation from the environment. + action: The action applied to the robot. + info: Additional information dictionary. + + Returns: + A tuple containing: + - total_reward: The total reward for each environment (shape: [num_envs]). + - reward_info: A dictionary mapping reward term names to their values for logging. + + Raises: + ValueError: If the mode is not supported. 
+ """ + # initialize total reward + total_reward = torch.zeros(self._env.num_envs, device=self._env.device) + reward_info = {} + + # iterate over all the reward functors + for mode, functor_cfgs in self._mode_functor_cfgs.items(): + for functor_name, functor_cfg in zip( + self._mode_functor_names[mode], functor_cfgs + ): + functor_cfg: RewardCfg + + # compute reward term + reward_term = functor_cfg.func( + self._env, obs=obs, action=action, info=info, **functor_cfg.params + ) + + # ensure reward is a tensor + if not isinstance(reward_term, torch.Tensor): + reward_term = torch.tensor( + reward_term, device=self._env.device, dtype=torch.float32 + ) + + # get weight from params or default to 1.0 + weight = functor_cfg.params.get("weight", 1.0) + weighted_reward = reward_term * weight + + # combine reward based on mode + if mode == "add": + total_reward += weighted_reward + elif mode == "replace": + total_reward = weighted_reward + else: + logger.log_error(f"Unsupported reward mode '{mode}'.") + + # store for logging (use unweighted value for clarity) + reward_info[functor_name] = reward_term + + return total_reward, reward_info + + def get_functor_cfg(self, functor_name: str) -> RewardCfg: + """Gets the configuration for the specified functor. + + The method finds the functor by name by searching through all the modes. + It then returns the configuration of the functor with the first matching name. + + Args: + functor_name: The name of the reward functor. + + Returns: + The configuration of the reward functor. + + Raises: + ValueError: If the functor name is not found. + """ + for mode, functors in self._mode_functor_names.items(): + if functor_name in functors: + return self._mode_functor_cfgs[mode][functors.index(functor_name)] + logger.log_error(f"Reward functor '{functor_name}' not found.") + + """ + Helper functions. + """ + + def _prepare_functors(self): + # check if config is dict already + if isinstance(self.cfg, dict): + cfg_items = self.cfg.items() + else: + cfg_items = self.cfg.__dict__.items() + # iterate over all the functors + for functor_name, functor_cfg in cfg_items: + # check for non config + if functor_cfg is None: + continue + # check for valid config type + if not isinstance(functor_cfg, RewardCfg): + raise TypeError( + f"Configuration for the functor '{functor_name}' is not of type RewardCfg." + f" Received: '{type(functor_cfg)}'." + ) + + # resolve common parameters + self._resolve_common_functor_cfg(functor_name, functor_cfg, min_argc=2) + + # check if mode is a new mode + if functor_cfg.mode not in self._mode_functor_names: + # add new mode + self._mode_functor_names[functor_cfg.mode] = list() + self._mode_functor_cfgs[functor_cfg.mode] = list() + self._mode_class_functor_cfgs[functor_cfg.mode] = list() + # add functor name and parameters + self._mode_functor_names[functor_cfg.mode].append(functor_name) + self._mode_functor_cfgs[functor_cfg.mode].append(functor_cfg) + + # check if the functor is a class + if inspect.isclass(functor_cfg.func): + self._mode_class_functor_cfgs[functor_cfg.mode].append(functor_cfg) diff --git a/embodichain/lab/gym/envs/managers/rewards.py b/embodichain/lab/gym/envs/managers/rewards.py new file mode 100644 index 0000000..8023280 --- /dev/null +++ b/embodichain/lab/gym/envs/managers/rewards.py @@ -0,0 +1,482 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2025 DexForce Technology Co., Ltd. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ---------------------------------------------------------------------------- + +"""Common reward functors for reinforcement learning tasks.""" + +from __future__ import annotations + +import torch +from typing import TYPE_CHECKING + +from embodichain.lab.gym.envs.managers.cfg import SceneEntityCfg + +if TYPE_CHECKING: + from embodichain.lab.gym.envs import EmbodiedEnv + + +def reward_from_obs( + env: EmbodiedEnv, + obs: dict, + action: torch.Tensor, + info: dict, + obs_key: str = "robot/qpos", + target_value: float = 0.0, + scale: float = 1.0, +) -> torch.Tensor: + """Reward based on observation values.""" + # Parse nested keys (e.g., "robot/qpos") + keys = obs_key.split("/") + value = obs + for key in keys: + if isinstance(value, dict) and key in value: + value = value[key] + else: + return torch.zeros(env.num_envs, device=env.device) + + # Compute distance to target + if isinstance(value, torch.Tensor): + if value.dim() > 1: + # Multiple values, compute norm + distance = torch.norm(value - target_value, dim=-1) + else: + distance = torch.abs(value - target_value) + reward = -scale * distance + else: + reward = torch.zeros(env.num_envs, device=env.device) + + return reward + + +def distance_between_objects( + env: EmbodiedEnv, + obs: dict, + action: torch.Tensor, + info: dict, + source_entity_cfg: SceneEntityCfg = None, + target_entity_cfg: SceneEntityCfg = None, + exponential: bool = False, + sigma: float = 1.0, +) -> torch.Tensor: + """Reward based on distance between two entities.""" + # get source entity position + source_obj = env.sim[source_entity_cfg.uid] + if hasattr(source_obj, "get_body_pose"): + source_pos = source_obj.get_body_pose(body_ids=source_entity_cfg.body_ids)[ + :, :3, 3 + ] + elif hasattr(source_obj, "get_local_pose"): + source_pos = source_obj.get_local_pose(to_matrix=True)[:, :3, 3] + else: + raise ValueError( + f"Entity '{source_entity_cfg.uid}' does not support position query." + ) + + # get target entity position + target_obj = env.sim[target_entity_cfg.uid] + if hasattr(target_obj, "get_body_pose"): + target_pos = target_obj.get_body_pose(body_ids=target_entity_cfg.body_ids)[ + :, :3, 3 + ] + elif hasattr(target_obj, "get_local_pose"): + target_pos = target_obj.get_local_pose(to_matrix=True)[:, :3, 3] + else: + raise ValueError( + f"Entity '{target_entity_cfg.uid}' does not support position query." 
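[Editor's note] A quick numeric look at the two shaping options that distance_between_objects offers, negative distance versus the Gaussian-style exp(-d^2 / (2 * sigma^2)). This is a standalone check with made-up distances, not project code.

# Compare the two distance shapings (standalone check, values approximate).
import torch

distance = torch.tensor([0.0, 0.1, 0.5, 1.0])
sigma = 1.0

negative = -distance                                        # unbounded, linear in distance
gaussian = torch.exp(-(distance ** 2) / (2 * sigma ** 2))   # bounded in (0, 1], equals 1 at zero distance

print(negative)   # roughly [ 0.0, -0.1, -0.5, -1.0]
print(gaussian)   # roughly [1.00, 0.995, 0.883, 0.607]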
+ ) + + # compute distance + distance = torch.norm(source_pos - target_pos, dim=-1) + + # compute reward + if exponential: + # exponential reward: exp(-distance^2 / (2 * sigma^2)) + reward = torch.exp(-(distance**2) / (2 * sigma**2)) + else: + # negative distance reward + reward = -distance + + return reward + + +def joint_velocity_penalty( + env: EmbodiedEnv, + obs: dict, + action: torch.Tensor, + info: dict, + robot_uid: str = "robot", + joint_ids: slice | list[int] = slice(None), +) -> torch.Tensor: + """Penalize large joint velocities.""" + robot = env.sim[robot_uid] + + # get joint velocities + qvel = robot.body_data.qvel[:, joint_ids] + + # compute L2 norm of joint velocities + velocity_norm = torch.norm(qvel, dim=-1) + + # negative penalty (higher velocity = more negative reward) + return -velocity_norm + + +def action_smoothness_penalty( + env: EmbodiedEnv, + obs: dict, + action: torch.Tensor, + info: dict, +) -> torch.Tensor: + """Penalize large changes in action between steps.""" + # compute difference between current and previous action + if hasattr(env, "_prev_actions"): + action_diff = action - env._prev_actions + penalty = -torch.norm(action_diff, dim=-1) + else: + # no previous action, no penalty + penalty = torch.zeros(env.num_envs, device=env.device) + + # store current action for next step + env._prev_actions = action.clone() + + return penalty + + +def joint_limit_penalty( + env: EmbodiedEnv, + obs: dict, + action: torch.Tensor, + info: dict, + robot_uid: str = "robot", + joint_ids: slice | list[int] = slice(None), + margin: float = 0.1, +) -> torch.Tensor: + """Penalize joints approaching their limits.""" + robot = env.sim[robot_uid] + + # get joint positions and limits + qpos = robot.body_data.qpos[:, joint_ids] + qpos_limits = robot.body_data.qpos_limits[:, joint_ids, :] + + # compute normalized position in range [0, 1] + qpos_normalized = (qpos - qpos_limits[:, :, 0]) / ( + qpos_limits[:, :, 1] - qpos_limits[:, :, 0] + ) + + # compute distance to limits (minimum of distance to lower and upper limit) + dist_to_lower = qpos_normalized + dist_to_upper = 1.0 - qpos_normalized + dist_to_limit = torch.min(dist_to_lower, dist_to_upper) + + # penalize joints within margin of limits + penalty_mask = dist_to_limit < margin + penalty = torch.where( + penalty_mask, + -(margin - dist_to_limit), # negative penalty + torch.zeros_like(dist_to_limit), + ) + + # sum over all joints + return penalty.sum(dim=-1) + + +def collision_penalty( + env: EmbodiedEnv, + obs: dict, + action: torch.Tensor, + info: dict, + robot_uid: str = "robot", + force_threshold: float = 1.0, +) -> torch.Tensor: + """Penalize collisions based on contact forces.""" + robot = env.sim[robot_uid] + + # get joint forces (torques) + qf = robot.body_data.qf + + # check if any joint force exceeds threshold + collision_detected = (torch.abs(qf) > force_threshold).any(dim=-1) + + # return penalty for collisions + penalty = torch.where( + collision_detected, + torch.full((env.num_envs,), -1.0, device=env.device), + torch.zeros(env.num_envs, device=env.device), + ) + + return penalty + + +def orientation_alignment_reward( + env: EmbodiedEnv, + obs: dict, + action: torch.Tensor, + info: dict, + source_entity_cfg: SceneEntityCfg = None, + target_entity_cfg: SceneEntityCfg = None, +) -> torch.Tensor: + """Reward alignment of orientations between two entities.""" + # get source entity rotation matrix + source_obj = env.sim[source_entity_cfg.uid] + if hasattr(source_obj, "get_body_pose"): + source_rot = 
source_obj.get_body_pose(body_ids=source_entity_cfg.body_ids)[ + :, :3, :3 + ] + elif hasattr(source_obj, "get_local_pose"): + source_rot = source_obj.get_local_pose(to_matrix=True)[:, :3, :3] + else: + raise ValueError( + f"Entity '{source_entity_cfg.uid}' does not support orientation query." + ) + + # get target entity rotation matrix + target_obj = env.sim[target_entity_cfg.uid] + if hasattr(target_obj, "get_body_pose"): + target_rot = target_obj.get_body_pose(body_ids=target_entity_cfg.body_ids)[ + :, :3, :3 + ] + elif hasattr(target_obj, "get_local_pose"): + target_rot = target_obj.get_local_pose(to_matrix=True)[:, :3, :3] + else: + raise ValueError( + f"Entity '{target_entity_cfg.uid}' does not support orientation query." + ) + + # compute rotation difference + rot_diff = torch.bmm(source_rot, target_rot.transpose(-1, -2)) + + # trace of rotation matrix difference (measure of alignment) + # trace = 1 + 2*cos(theta) for rotation by angle theta + # normalized to range [0, 1] where 1 is perfect alignment + trace = rot_diff.diagonal(dim1=-2, dim2=-1).sum(-1) + alignment = (trace - 1.0) / 2.0 # normalize to [-1, 1] + + return alignment + + +def success_reward( + env: EmbodiedEnv, + obs: dict, + action: torch.Tensor, + info: dict, + reward_value: float = 1.0, +) -> torch.Tensor: + """Sparse reward for task success.""" + # Check if success info is available in info dict + if "success" in info: + success = info["success"] + if isinstance(success, bool): + success = torch.tensor([success], device=env.device, dtype=torch.bool) + elif not isinstance(success, torch.Tensor): + success = torch.tensor(success, device=env.device, dtype=torch.bool) + else: + # No success info available + return torch.zeros(env.num_envs, device=env.device) + + # return reward + reward = torch.where( + success, + torch.full((env.num_envs,), reward_value, device=env.device), + torch.zeros(env.num_envs, device=env.device), + ) + + return reward + + +def reaching_behind_object_reward( + env: EmbodiedEnv, + obs: dict, + action: torch.Tensor, + info: dict, + end_effector_cfg: SceneEntityCfg = None, + object_cfg: SceneEntityCfg = None, + goal_cfg: SceneEntityCfg = None, + behind_offset: float = 0.015, + height_offset: float = 0.015, + distance_scale: float = 5.0, +) -> torch.Tensor: + """Reward for reaching behind an object along object-to-goal direction.""" + # get end effector position + ee_obj = env.sim[end_effector_cfg.uid] + if hasattr(ee_obj, "get_body_pose"): + ee_pos = ee_obj.get_body_pose(body_ids=end_effector_cfg.body_ids)[:, :3, 3] + elif hasattr(ee_obj, "get_local_pose"): + ee_pos = ee_obj.get_local_pose(to_matrix=True)[:, :3, 3] + else: + raise ValueError( + f"Entity '{end_effector_cfg.uid}' does not support position query." 
+ ) + + # get object position + obj = env.sim[object_cfg.uid] + if hasattr(obj, "get_body_pose"): + obj_pos = obj.get_body_pose(body_ids=object_cfg.body_ids)[:, :3, 3] + elif hasattr(obj, "get_local_pose"): + obj_pos = obj.get_local_pose(to_matrix=True)[:, :3, 3] + else: + raise ValueError(f"Entity '{object_cfg.uid}' does not support position query.") + + # get goal position + goal_obj = env.sim[goal_cfg.uid] + if hasattr(goal_obj, "get_body_pose"): + goal_pos = goal_obj.get_body_pose(body_ids=goal_cfg.body_ids)[:, :3, 3] + elif hasattr(goal_obj, "get_local_pose"): + goal_pos = goal_obj.get_local_pose(to_matrix=True)[:, :3, 3] + else: + raise ValueError(f"Entity '{goal_cfg.uid}' does not support position query.") + + # compute push direction (from object to goal) + push_direction = goal_pos - obj_pos + push_dir_norm = torch.norm(push_direction, dim=-1, keepdim=True) + 1e-6 + push_dir_normalized = push_direction / push_dir_norm + + # compute target "behind" position + height_vec = torch.tensor( + [0, 0, height_offset], device=env.device, dtype=torch.float32 + ) + target_pos = obj_pos - behind_offset * push_dir_normalized + height_vec + + # distance to target position + ee_to_target_dist = torch.norm(ee_pos - target_pos, dim=-1) + + # tanh-shaped reward (1.0 when at target, 0.0 when far) + reward = 1.0 - torch.tanh(distance_scale * ee_to_target_dist) + + return reward + + +def distance_to_target( + env: "EmbodiedEnv", + obs: dict, + action: torch.Tensor, + info: dict, + source_entity_cfg: SceneEntityCfg = None, + target_pose_key: str = "target_pose", + exponential: bool = False, + sigma: float = 1.0, + use_xy_only: bool = False, +) -> torch.Tensor: + """Reward based on distance to a virtual target pose from info.""" + # get source entity position + source_obj = env.sim[source_entity_cfg.uid] + if hasattr(source_obj, "get_body_pose"): + source_pos = source_obj.get_body_pose(body_ids=source_entity_cfg.body_ids)[ + :, :3, 3 + ] + elif hasattr(source_obj, "get_local_pose"): + source_pos = source_obj.get_local_pose(to_matrix=True)[:, :3, 3] + else: + raise ValueError( + f"Entity '{source_entity_cfg.uid}' does not support position query." + ) + + # get target position from info + if target_pose_key not in info: + raise ValueError( + f"Target pose '{target_pose_key}' not found in info dict. " + f"Make sure to provide it in get_info()." 
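[Editor's note] The geometry behind reaching_behind_object_reward and the progress shaping used by incremental_distance_to_target, condensed into a standalone sketch with made-up positions. Pure torch, no scene objects; the offsets and scales are the defaults from the functions above.

# Standalone sketch: "behind the cube" target point plus tanh-shaped reaching and progress rewards.
import torch

obj_pos = torch.tensor([[0.40, 0.00, 0.02]])
goal_pos = torch.tensor([[0.70, 0.10, 0.02]])
ee_pos = torch.tensor([[0.35, -0.02, 0.05]])

behind_offset, height_offset, distance_scale = 0.015, 0.015, 5.0

# Reach a point slightly behind the object along the object-to-goal direction.
push_dir = goal_pos - obj_pos
push_dir = push_dir / (torch.norm(push_dir, dim=-1, keepdim=True) + 1e-6)
target = obj_pos - behind_offset * push_dir + torch.tensor([0.0, 0.0, height_offset])
reaching = 1.0 - torch.tanh(distance_scale * torch.norm(ee_pos - target, dim=-1))

# Incremental progress: reward the change in xy distance to goal, asymmetrically weighted.
prev_dist = torch.tensor([0.32])
curr_dist = torch.norm(obj_pos[:, :2] - goal_pos[:, :2], dim=-1)
delta = torch.tanh(10.0 * (prev_dist - curr_dist))
progress = torch.where(delta >= 0, 2.0 * delta, 0.5 * delta)

print(reaching, progress)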
+ ) + + target_poses = info[target_pose_key] + if target_poses.dim() == 2: # (num_envs, 3) + target_pos = target_poses + else: # (num_envs, 4, 4) + target_pos = target_poses[:, :3, 3] + + # compute distance + if use_xy_only: + distance = torch.norm(source_pos[:, :2] - target_pos[:, :2], dim=-1) + else: + distance = torch.norm(source_pos - target_pos, dim=-1) + + # compute reward + if exponential: + # exponential reward: exp(-distance^2 / (2 * sigma^2)) + reward = torch.exp(-(distance**2) / (2 * sigma**2)) + else: + # negative distance reward + reward = -distance + + return reward + + +def incremental_distance_to_target( + env: "EmbodiedEnv", + obs: dict, + action: torch.Tensor, + info: dict, + source_entity_cfg: SceneEntityCfg = None, + target_pose_key: str = "target_pose", + tanh_scale: float = 10.0, + positive_weight: float = 1.0, + negative_weight: float = 1.0, + use_xy_only: bool = False, +) -> torch.Tensor: + """Incremental reward for progress toward a virtual target pose from info.""" + # get source entity position + source_obj = env.sim[source_entity_cfg.uid] + if hasattr(source_obj, "get_body_pose"): + source_pos = source_obj.get_body_pose(body_ids=source_entity_cfg.body_ids)[ + :, :3, 3 + ] + elif hasattr(source_obj, "get_local_pose"): + source_pos = source_obj.get_local_pose(to_matrix=True)[:, :3, 3] + else: + raise ValueError( + f"Entity '{source_entity_cfg.uid}' does not support position query." + ) + + # get target position from info + if target_pose_key not in info: + raise ValueError( + f"Target pose '{target_pose_key}' not found in info dict. " + f"Make sure to provide it in get_info()." + ) + + target_poses = info[target_pose_key] + if target_poses.dim() == 2: # (num_envs, 3) + target_pos = target_poses + else: # (num_envs, 4, 4) + target_pos = target_poses[:, :3, 3] + + # compute current distance + if use_xy_only: + current_dist = torch.norm(source_pos[:, :2] - target_pos[:, :2], dim=-1) + else: + current_dist = torch.norm(source_pos - target_pos, dim=-1) + + # initialize previous distance on first call + prev_dist_key = f"_prev_dist_{source_entity_cfg.uid}_{target_pose_key}" + if not hasattr(env, prev_dist_key): + setattr(env, prev_dist_key, current_dist.clone()) + return torch.zeros(env.num_envs, device=env.device) + + # compute distance delta (positive = getting closer) + prev_dist = getattr(env, prev_dist_key) + distance_delta = prev_dist - current_dist + + # apply tanh shaping + distance_delta_normalized = torch.tanh(tanh_scale * distance_delta) + + # asymmetric weighting + reward = torch.where( + distance_delta_normalized >= 0, + positive_weight * distance_delta_normalized, + negative_weight * distance_delta_normalized, + ) + + # update previous distance + setattr(env, prev_dist_key, current_dist.clone()) + + return reward diff --git a/embodichain/lab/gym/envs/tasks/rl/push_cube.py b/embodichain/lab/gym/envs/tasks/rl/push_cube.py index 4aef16c..01e5074 100644 --- a/embodichain/lab/gym/envs/tasks/rl/push_cube.py +++ b/embodichain/lab/gym/envs/tasks/rl/push_cube.py @@ -44,19 +44,12 @@ def __init__(self, cfg=None, **kwargs): defaults = { "success_threshold": 0.1, - "reaching_reward_weight": 0.1, - "place_reward_weight": 2.0, - "place_penalty_weight": 0.5, - "action_penalty_weight": 0.01, - "success_bonus_weight": 10.0, } for name, default in defaults.items(): value = extensions.get(name, getattr(cfg, name, default)) setattr(cfg, name, value) setattr(self, name, getattr(cfg, name)) - self.last_cube_goal_dist = None - super().__init__(cfg, **kwargs) def 
_draw_goal_marker(self): @@ -104,15 +97,6 @@ def _initialize_episode( self, env_ids: Sequence[int] | None = None, **kwargs ) -> None: super()._initialize_episode(env_ids=env_ids, **kwargs) - cube = self.sim.get_rigid_object("cube") - - # Calculate previous distance (for incremental reward) based on current (possibly randomized) pose - cube_pos = cube.body_data.pose[:, :3] - goal_sphere = self.sim.get_rigid_object("goal_sphere") - goal_pos = goal_sphere.body_data.pose[ - :, :3 - ] # Get actual goal positions for each environment - self.last_cube_goal_dist = torch.norm(cube_pos[:, :2] - goal_pos[:, :2], dim=1) # Draw marker at goal position # self._draw_goal_marker() @@ -128,84 +112,27 @@ def _step_action(self, action: EnvAction) -> EnvAction: self.robot.set_qpos(qpos=target_qpos) return scaled_action - def get_obs(self, **kwargs) -> EnvObs: - qpos_all = self.robot.body_data.qpos[:, :6] - ee_pose_matrix = self.robot.compute_fk( - name="arm", qpos=qpos_all, to_matrix=True - ) - ee_pos_all = ee_pose_matrix[:, :3, 3] + def get_info(self, **kwargs) -> Dict[str, Any]: cube = self.sim.get_rigid_object("cube") - cube_pos_all = cube.body_data.pose[:, :3] - # Get actual goal positions for each environment - goal_sphere = self.sim.get_rigid_object("goal_sphere") - goal_pos_all = goal_sphere.body_data.pose[:, :3] - if self.obs_mode == "state": - return torch.cat([qpos_all, ee_pos_all, cube_pos_all, goal_pos_all], dim=1) - return { - "robot": {"qpos": qpos_all, "ee_pos": ee_pos_all}, - "object": {"cube_pos": cube_pos_all, "goal_pos": goal_pos_all}, - } + cube_pos = cube.body_data.pose[:, :3] - def get_reward( - self, obs: EnvObs, action: EnvAction, info: Dict[str, Any] - ) -> torch.Tensor: - if self.obs_mode == "state": - ee_pos = obs[:, 6:9] - cube_pos = obs[:, 9:12] - goal_pos = obs[:, 12:15] + # Get virtual goal pose from env state (set by randomize_target_pose event) + if hasattr(self, "_goal_poses"): + goal_pos = self._goal_poses[:, :3, 3] else: - ee_pos = obs["robot"]["ee_pos"] - cube_pos = obs["object"]["cube_pos"] - goal_pos = obs["object"]["goal_pos"] - push_direction = goal_pos - cube_pos - push_dir_norm = torch.norm(push_direction, dim=1, keepdim=True) + 1e-6 - push_dir_normalized = push_direction / push_dir_norm - push_pose = ( - cube_pos - - 0.015 * push_dir_normalized - + torch.tensor([0, 0, 0.015], device=self.device, dtype=torch.float32) - ) - ee_to_push_dist = torch.norm(ee_pos - push_pose, dim=1) - reaching_reward_raw = 1.0 - torch.tanh(5.0 * ee_to_push_dist) - reaching_reward = self.reaching_reward_weight * reaching_reward_raw - cube_to_goal_dist = torch.norm(cube_pos[:, :2] - goal_pos[:, :2], dim=1) - distance_delta = 10.0 * (self.last_cube_goal_dist - cube_to_goal_dist) - distance_delta_normalized = torch.tanh(distance_delta) - place_reward = torch.where( - distance_delta_normalized >= 0, - self.place_reward_weight * distance_delta_normalized, - self.place_penalty_weight * distance_delta_normalized, - ) - self.last_cube_goal_dist = cube_to_goal_dist - action_magnitude = torch.norm(action, dim=1) - action_penalty = -self.action_penalty_weight * action_magnitude - success_bonus_raw = info["success"].float() - success_bonus = self.success_bonus_weight * success_bonus_raw - reward = reaching_reward + place_reward + action_penalty + success_bonus - # Organize reward components in a dedicated "rewards" dict - # This allows trainer to easily identify and log reward components - if "rewards" not in info: - info["rewards"] = {} - info["rewards"]["reaching_reward"] = reaching_reward - 
info["rewards"]["place_reward"] = place_reward - info["rewards"]["action_penalty"] = action_penalty - info["rewards"]["success_bonus"] = success_bonus - return reward + # Fallback: no virtual goal set + goal_pos = torch.zeros_like(cube_pos) - def get_info(self, **kwargs) -> Dict[str, Any]: - cube = self.sim.get_rigid_object("cube") - cube_pos = cube.body_data.pose[:, :3] - # Get actual goal positions for each environment - goal_sphere = self.sim.get_rigid_object("goal_sphere") - goal_pos = goal_sphere.body_data.pose[:, :3] xy_distance = torch.norm(cube_pos[:, :2] - goal_pos[:, :2], dim=1) is_success = xy_distance < self.success_threshold + info = { "success": is_success, "fail": torch.zeros( self.cfg.num_envs, device=self.device, dtype=torch.bool ), "elapsed_steps": self._elapsed_steps, + "goal_pose": self._goal_poses if hasattr(self, "_goal_poses") else None, } info["metrics"] = { "distance_to_goal": xy_distance, diff --git a/embodichain/lab/gym/utils/gym_utils.py b/embodichain/lab/gym/utils/gym_utils.py index 6ebb6d6..aa7123f 100644 --- a/embodichain/lab/gym/utils/gym_utils.py +++ b/embodichain/lab/gym/utils/gym_utils.py @@ -546,6 +546,50 @@ class ComponentCfg: setattr(env_cfg.observations, obs_name, observation) + env_cfg.rewards = ComponentCfg() + if "rewards" in config["env"]: + # Define modules to search for reward functions + reward_modules = [ + "embodichain.lab.gym.envs.managers.rewards", + ] + + for reward_name, reward_params in config["env"]["rewards"].items(): + reward_params_modified = deepcopy(reward_params) + + # Handle entity_cfg parameters + for param_key in [ + "entity_cfg", + "source_entity_cfg", + "target_entity_cfg", + "end_effector_cfg", + "object_cfg", + "goal_cfg", + "reference_entity_cfg", + ]: + if param_key in reward_params["params"]: + entity_cfg = SceneEntityCfg( + **reward_params_modified["params"][param_key] + ) + reward_params_modified["params"][param_key] = entity_cfg + + # Find the function from multiple modules using the utility function + reward_func = find_function_from_modules( + reward_params["func"], + reward_modules, + raise_if_not_found=True, + ) + + from embodichain.lab.gym.envs.managers import RewardCfg + + reward = RewardCfg( + func=reward_func, + mode=reward_params_modified["mode"], + name=reward_params_modified["name"], + params=reward_params_modified["params"], + ) + + setattr(env_cfg.rewards, reward_name, reward) + return env_cfg From f2a1d19d9b946235a627626661583b71d3797ecf Mon Sep 17 00:00:00 2001 From: yuanhaonan Date: Fri, 16 Jan 2026 16:07:47 +0800 Subject: [PATCH 2/9] fix obs --- configs/agents/rl/push_cube/gym_config.json | 32 +++++----- embodichain/lab/gym/envs/managers/cfg.py | 7 +++ .../lab/gym/envs/managers/observations.py | 59 +++++++++++++++---- .../lab/gym/envs/managers/reward_manager.py | 7 +-- 4 files changed, 70 insertions(+), 35 deletions(-) diff --git a/configs/agents/rl/push_cube/gym_config.json b/configs/agents/rl/push_cube/gym_config.json index b4a5cd0..280f86c 100644 --- a/configs/agents/rl/push_cube/gym_config.json +++ b/configs/agents/rl/push_cube/gym_config.json @@ -26,35 +26,32 @@ }, "observations": { "robot_qpos": { - "func": "embodichain.lab.gym.envs.managers.observations:normalize_robot_joint_data", - "mode": "concatenate", + "func": "normalize_robot_joint_data", + "mode": "add", "name": "robot/qpos", "params": { - "robot_uid": "Manipulator", - "joint_ids": [0, 1, 2, 3, 4, 5], - "data_type": "qpos" + "joint_ids": [0, 1, 2, 3, 4, 5] } }, "robot_ee_pos": { - "func": 
"embodichain.lab.gym.envs.managers.observations:get_robot_ee_pose", - "mode": "concatenate", + "func": "get_robot_ee_pose", + "mode": "add", "name": "robot/ee_pos", "params": { - "robot_uid": "Manipulator", "part_name": "arm" } }, "cube_pos": { - "func": "embodichain.lab.gym.envs.managers.observations:get_rigid_object_pose", - "mode": "concatenate", + "func": "get_rigid_object_pose", + "mode": "add", "name": "object/cube_pos", "params": { "entity_cfg": {"uid": "cube"} } }, "goal_pos": { - "func": "embodichain.lab.gym.envs.managers.observations:virtual_target_position", - "mode": "concatenate", + "func": "target_position", + "mode": "add", "name": "object/goal_pos", "params": { "target_pose_key": "goal_pose" @@ -66,8 +63,8 @@ "func": "reaching_behind_object_reward", "mode": "add", "name": "reaching", + "weight": 0.1, "params": { - "weight": 0.1, "end_effector_cfg": {"uid": "Manipulator", "body_ids": "ee_link"}, "object_cfg": {"uid": "cube"}, "goal_cfg": {"uid": "goal_sphere"}, @@ -80,8 +77,8 @@ "func": "incremental_distance_to_target", "mode": "add", "name": "place", + "weight": 1.0, "params": { - "weight": 1.0, "source_entity_cfg": {"uid": "cube"}, "target_pose_key": "goal_pose", "tanh_scale": 10.0, @@ -94,16 +91,15 @@ "func": "action_smoothness_penalty", "mode": "add", "name": "action_penalty", - "params": { - "weight": 0.01 - } + "weight": 0.01, + "params": {} }, "success_bonus": { "func": "success_reward", "mode": "add", "name": "success", + "weight": 10.0, "params": { - "weight": 10.0, "reward_value": 1.0 } } diff --git a/embodichain/lab/gym/envs/managers/cfg.py b/embodichain/lab/gym/envs/managers/cfg.py index 8da8f4e..eeacc86 100644 --- a/embodichain/lab/gym/envs/managers/cfg.py +++ b/embodichain/lab/gym/envs/managers/cfg.py @@ -333,6 +333,13 @@ class RewardCfg(FunctorCfg): the reward term represents, e.g., "distance_to_goal", "gripper_close", "collision_penalty". """ + weight: float = 1.0 + """The weight multiplier for this reward term. + + This value is used to scale the reward before adding it to the total reward. + Default is 1.0 (no scaling). + """ + @configclass class DatasetFunctorCfg(FunctorCfg): diff --git a/embodichain/lab/gym/envs/managers/observations.py b/embodichain/lab/gym/envs/managers/observations.py index e54f362..306d993 100644 --- a/embodichain/lab/gym/envs/managers/observations.py +++ b/embodichain/lab/gym/envs/managers/observations.py @@ -252,6 +252,52 @@ def compute_semantic_mask( return torch.stack(masks, dim=-1) +def get_robot_ee_pose( + env: "EmbodiedEnv", + obs: EnvObs, + part_name: str | None = None, + position_only: bool = False, +) -> torch.Tensor: + """Get robot end-effector pose using forward kinematics. + + Args: + env: The environment instance. + obs: The observation dictionary. + robot_uid: The uid of the robot. If None, uses env.robot. + part_name: The name of the control part. If None, uses default part. + position_only: If True, returns only position (3D). If False, returns full pose (4x4 matrix). + + Returns: + A tensor of shape (num_envs, 3) if position_only=True, or (num_envs, 4, 4) otherwise. 
+ """ + robot = env.robot + + if part_name is not None: + joint_ids = robot.get_joint_ids(part_name) + qpos = robot.body_data.qpos[:, joint_ids] + ee_pose = robot.compute_fk(name=part_name, qpos=qpos, to_matrix=True) + else: + qpos = robot.get_qpos() + ee_pose = robot.compute_fk(qpos=qpos, to_matrix=True) + + if position_only: + return ee_pose[:, :3, 3] + return ee_pose + + +def target_position( + env: "EmbodiedEnv", + obs: EnvObs, + target_pose_key: str = "goal_pose", +) -> torch.Tensor: + """Get virtual target position from env state.""" + state_attr = f"_{target_pose_key}s" + if hasattr(env, state_attr): + target_poses = getattr(env, state_attr) + return target_poses[:, :3, 3] + return torch.zeros(env.num_envs, 3, device=env.device) + + class compute_exteroception(Functor): """Compute the exteroception for the observation space. @@ -713,16 +759,3 @@ def __call__( exteroception[sensor_uid] = projected_kpnts return exteroception - - -def virtual_target_position( - env: "EmbodiedEnv", - obs: EnvObs, - target_pose_key: str = "goal_pose", -) -> torch.Tensor: - """Get virtual target position from env state.""" - state_attr = f"_{target_pose_key}s" - if hasattr(env, state_attr): - target_poses = getattr(env, state_attr) - return target_poses[:, :3, 3] - return torch.zeros(env.num_envs, 3, device=env.device) diff --git a/embodichain/lab/gym/envs/managers/reward_manager.py b/embodichain/lab/gym/envs/managers/reward_manager.py index 46d74d3..8f282df 100644 --- a/embodichain/lab/gym/envs/managers/reward_manager.py +++ b/embodichain/lab/gym/envs/managers/reward_manager.py @@ -164,9 +164,8 @@ def compute( reward_term, device=self._env.device, dtype=torch.float32 ) - # get weight from params or default to 1.0 - weight = functor_cfg.params.get("weight", 1.0) - weighted_reward = reward_term * weight + # apply weight from config + weighted_reward = reward_term * functor_cfg.weight # combine reward based on mode if mode == "add": @@ -224,7 +223,7 @@ def _prepare_functors(self): ) # resolve common parameters - self._resolve_common_functor_cfg(functor_name, functor_cfg, min_argc=2) + self._resolve_common_functor_cfg(functor_name, functor_cfg, min_argc=4) # check if mode is a new mode if functor_cfg.mode not in self._mode_functor_names: From 548c1b10818804f0403a38c2d31c6c44d7af7ac3 Mon Sep 17 00:00:00 2001 From: yuanhaonan Date: Mon, 19 Jan 2026 11:28:47 +0800 Subject: [PATCH 3/9] fix obs_dim with flatten dict input --- configs/agents/rl/push_cube/gym_config.json | 8 +-- embodichain/agents/rl/algo/ppo.py | 6 +- embodichain/agents/rl/train.py | 18 +++--- embodichain/agents/rl/utils/__init__.py | 2 + embodichain/agents/rl/utils/helper.py | 52 ++++++++++++++++ embodichain/agents/rl/utils/trainer.py | 26 ++++---- embodichain/lab/gym/envs/base_env.py | 16 +++++ embodichain/lab/gym/envs/managers/cfg.py | 4 +- embodichain/lab/gym/envs/managers/rewards.py | 62 +++++++------------ embodichain/lab/gym/envs/tasks/rl/__init__.py | 4 +- .../lab/gym/envs/tasks/rl/push_cube.py | 25 -------- embodichain/lab/gym/utils/gym_utils.py | 19 +----- 12 files changed, 133 insertions(+), 109 deletions(-) create mode 100644 embodichain/agents/rl/utils/helper.py diff --git a/configs/agents/rl/push_cube/gym_config.json b/configs/agents/rl/push_cube/gym_config.json index 280f86c..87858e3 100644 --- a/configs/agents/rl/push_cube/gym_config.json +++ b/configs/agents/rl/push_cube/gym_config.json @@ -27,7 +27,7 @@ "observations": { "robot_qpos": { "func": "normalize_robot_joint_data", - "mode": "add", + "mode": "modify", "name": 
"robot/qpos", "params": { "joint_ids": [0, 1, 2, 3, 4, 5] @@ -65,12 +65,12 @@ "name": "reaching", "weight": 0.1, "params": { - "end_effector_cfg": {"uid": "Manipulator", "body_ids": "ee_link"}, "object_cfg": {"uid": "cube"}, - "goal_cfg": {"uid": "goal_sphere"}, + "target_pose_key": "goal_pose", "behind_offset": 0.015, "height_offset": 0.015, - "distance_scale": 5.0 + "distance_scale": 5.0, + "part_name": "arm" } }, "place_reward": { diff --git a/embodichain/agents/rl/algo/ppo.py b/embodichain/agents/rl/algo/ppo.py index afd636e..5253e89 100644 --- a/embodichain/agents/rl/algo/ppo.py +++ b/embodichain/agents/rl/algo/ppo.py @@ -17,7 +17,7 @@ import torch from typing import Dict, Any, Tuple, Callable -from embodichain.agents.rl.utils import AlgorithmCfg +from embodichain.agents.rl.utils import AlgorithmCfg, flatten_dict_observation from embodichain.agents.rl.buffer import RolloutBuffer from embodichain.utils import configclass from .base import BaseAlgorithm @@ -102,6 +102,10 @@ def collect_rollout( reward = reward.float() done = done.bool() + # Flatten dict observation from ObservationManager if needed + if isinstance(next_obs, dict): + next_obs = flatten_dict_observation(next_obs) + # Add to buffer self.buffer.add(current_obs, actions, reward, done, value, log_prob) diff --git a/embodichain/agents/rl/train.py b/embodichain/agents/rl/train.py index e87d462..4d95ff4 100644 --- a/embodichain/agents/rl/train.py +++ b/embodichain/agents/rl/train.py @@ -32,7 +32,7 @@ from embodichain.agents.rl.utils.trainer import Trainer from embodichain.utils import logger from embodichain.lab.gym.envs.tasks.rl import build_env -from embodichain.lab.gym.utils.gym_utils import config_to_rl_cfg +from embodichain.lab.gym.utils.gym_utils import config_to_cfg from embodichain.utils.utility import load_json from embodichain.utils.module_utils import find_function_from_modules from embodichain.lab.sim import SimulationManagerCfg @@ -120,7 +120,7 @@ def main(): logger.log_info(f"Current working directory: {Path.cwd()}") gym_config_data = load_json(str(gym_config_path)) - gym_env_cfg = config_to_rl_cfg(gym_config_data) + gym_env_cfg = config_to_cfg(gym_config_data) # Ensure sim configuration mirrors runtime overrides if gym_env_cfg.sim_cfg is None: @@ -137,22 +137,24 @@ def main(): gym_env_cfg.sim_cfg.headless = headless logger.log_info( - f"Loaded gym_config from {gym_config_path} (env_id={gym_env_cfg.env_id}, headless={gym_env_cfg.sim_cfg.headless}, sim_device={gym_env_cfg.sim_cfg.sim_device})" + f"Loaded gym_config from {gym_config_path} (env_id={gym_config_data['id']}, headless={gym_env_cfg.sim_cfg.headless}, sim_device={gym_env_cfg.sim_cfg.sim_device})" ) - env = build_env(gym_env_cfg.env_id, base_env_cfg=gym_env_cfg) + env = build_env(gym_config_data["id"], base_env_cfg=gym_env_cfg) eval_gym_env_cfg = deepcopy(gym_env_cfg) eval_gym_env_cfg.num_envs = 4 eval_gym_env_cfg.sim_cfg.headless = True - eval_env = build_env(eval_gym_env_cfg.env_id, base_env_cfg=eval_gym_env_cfg) + eval_env = build_env(gym_config_data["id"], base_env_cfg=eval_gym_env_cfg) # Build Policy via registry policy_name = policy_block["name"] # Build Policy via registry (actor/critic must be explicitly defined in JSON when using actor_critic) if policy_name.lower() == "actor_critic": - obs_dim = env.observation_space.shape[-1] + # Get observation dimension from flattened observation space + # flattened_observation_space returns Box space for RL training + obs_dim = env.flattened_observation_space.shape[-1] action_dim = env.action_space.shape[-1] 
actor_cfg = policy_block.get("actor") @@ -167,7 +169,7 @@ def main(): policy = build_policy( policy_block, - env.observation_space, + env.flattened_observation_space, env.action_space, device, actor=actor, @@ -175,7 +177,7 @@ def main(): ) else: policy = build_policy( - policy_block, env.observation_space, env.action_space, device + policy_block, env.flattened_observation_space, env.action_space, device ) # Build Algorithm via factory diff --git a/embodichain/agents/rl/utils/__init__.py b/embodichain/agents/rl/utils/__init__.py index f6f9f4f..e6f9e57 100644 --- a/embodichain/agents/rl/utils/__init__.py +++ b/embodichain/agents/rl/utils/__init__.py @@ -15,7 +15,9 @@ # ---------------------------------------------------------------------------- from .config import AlgorithmCfg +from .helper import flatten_dict_observation __all__ = [ "AlgorithmCfg", + "flatten_dict_observation", ] diff --git a/embodichain/agents/rl/utils/helper.py b/embodichain/agents/rl/utils/helper.py new file mode 100644 index 0000000..3021a31 --- /dev/null +++ b/embodichain/agents/rl/utils/helper.py @@ -0,0 +1,52 @@ +# ---------------------------------------------------------------------------- +# Copyright (c) 2021-2025 DexForce Technology Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ---------------------------------------------------------------------------- + +import torch + + +def flatten_dict_observation(input_dict: dict) -> torch.Tensor: + """ + Flatten hierarchical dict observations from ObservationManager. + + Recursively traverse nested dicts, collect all tensor values, + flatten each to (num_envs, -1), and concatenate in sorted key order. + + Args: + input_dict: Nested dict structure, e.g. 
{"robot": {"qpos": tensor, "ee_pos": tensor}, "object": {...}} + + Returns: + Concatenated flat tensor of shape (num_envs, total_dim) + """ + obs_list = [] + + def _collect_tensors(d, prefix=""): + """Recursively collect tensors from nested dicts in sorted order.""" + for key in sorted(d.keys()): + full_key = f"{prefix}/{key}" if prefix else key + value = d[key] + if isinstance(value, dict): + _collect_tensors(value, full_key) + elif isinstance(value, torch.Tensor): + # Flatten tensor to (num_envs, -1) shape + obs_list.append(value.flatten(start_dim=1)) + + _collect_tensors(input_dict) + + if not obs_list: + raise ValueError("No tensors found in observation dict") + + result = torch.cat(obs_list, dim=-1) + return result diff --git a/embodichain/agents/rl/utils/trainer.py b/embodichain/agents/rl/utils/trainer.py index 88c0490..6d38800 100644 --- a/embodichain/agents/rl/utils/trainer.py +++ b/embodichain/agents/rl/utils/trainer.py @@ -25,6 +25,7 @@ import wandb from embodichain.lab.gym.envs.managers.event_manager import EventManager +from .helper import flatten_dict_observation class Trainer: @@ -74,22 +75,22 @@ def __init__( # initial obs (assume env returns torch tensors already on target device) obs, _ = self.env.reset() - self.obs = obs # Initialize algorithm's buffer - self.observation_space = getattr(self.env, "observation_space", None) - self.action_space = getattr(self.env, "action_space", None) - obs_dim = ( - self.observation_space.shape[-1] - if self.observation_space - else self.obs.shape[-1] - ) - action_dim = self.action_space.shape[-1] if self.action_space else None + # Flatten dict observations from ObservationManager to tensor for RL algorithms + if isinstance(obs, dict): + obs_tensor = flatten_dict_observation(obs) + obs_dim = obs_tensor.shape[-1] + num_envs = obs_tensor.shape[0] + # Store flattened observation for RL training + self.obs = obs_tensor + + action_space = getattr(self.env, "action_space", None) + action_dim = action_space.shape[-1] if action_space else None if action_dim is None: raise RuntimeError( "Env must expose action_space with shape for buffer initialization." 
) - num_envs = self.obs.shape[0] if self.obs.ndim == 2 else 1 # Algorithm manages its own buffer self.algorithm.initialize_buffer(num_steps, num_envs, obs_dim, action_dim) @@ -160,8 +161,9 @@ def on_step(obs, actions, reward, done, info, next_obs): self.curr_len[done_idx] = 0 # Update global step and observation + # next_obs is already flattened in algorithm's collect_rollout self.obs = next_obs - self.global_step += next_obs.shape[0] if next_obs.ndim == 2 else 1 + self.global_step += next_obs.shape[0] if isinstance(info, dict): rewards_dict = info.get("rewards") @@ -226,6 +228,8 @@ def _eval_once(self, num_episodes: int = 5): returns = [] for _ in range(num_episodes): obs, _ = self.eval_env.reset() + obs = flatten_dict_observation(obs) + done_any = torch.zeros( obs.shape[0] if obs.ndim == 2 else 1, dtype=torch.bool, diff --git a/embodichain/lab/gym/envs/base_env.py b/embodichain/lab/gym/envs/base_env.py index 72e1f40..1c28f51 100644 --- a/embodichain/lab/gym/envs/base_env.py +++ b/embodichain/lab/gym/envs/base_env.py @@ -15,6 +15,7 @@ # ---------------------------------------------------------------------------- import torch +import numpy as np import gymnasium as gym from typing import Dict, List, Union, Tuple, Any, Sequence @@ -173,6 +174,21 @@ def observation_space(self) -> gym.spaces.Space: self.single_observation_space, n=self.num_envs ) + @cached_property + def flattened_observation_space(self) -> gym.spaces.Box: + """Flattened observation space for RL training. + + Returns a Box space by computing total dimensions from nested dict observations. + This is needed because RL algorithms (PPO, SAC, etc.) require flat vector inputs. + """ + from embodichain.agents.rl.utils.helper import flatten_dict_observation + + flattened_obs = flatten_dict_observation(self._init_raw_obs) + total_dim = flattened_obs.shape[-1] + return gym.spaces.Box( + low=-np.inf, high=np.inf, shape=(total_dim,), dtype=np.float32 + ) + @cached_property def action_space(self) -> gym.spaces.Space: if self.num_envs == 1: diff --git a/embodichain/lab/gym/envs/managers/cfg.py b/embodichain/lab/gym/envs/managers/cfg.py index eeacc86..3161fb8 100644 --- a/embodichain/lab/gym/envs/managers/cfg.py +++ b/embodichain/lab/gym/envs/managers/cfg.py @@ -231,7 +231,7 @@ def resolve(self, scene: SimulationManager): def _resolve_joint_names(self, scene: SimulationManager): # convert joint names to indices based on regex if self.joint_names is not None or self.joint_ids != slice(None): - entity: Articulation = scene[self.uid] + entity: Articulation = scene.get_articulation(self.uid) # -- if both are not their default values, check if they are valid if self.joint_names is not None and self.joint_ids != slice(None): if isinstance(self.joint_names, str): @@ -272,7 +272,7 @@ def _resolve_joint_names(self, scene: SimulationManager): def _resolve_body_names(self, scene: SimulationManager): # convert body names to indices based on regex if self.body_names is not None or self.body_ids != slice(None): - entity: RigidObject = scene[self.uid] + entity: RigidObject = scene.get_rigid_object(self.uid) # -- if both are not their default values, check if they are valid if self.body_names is not None and self.body_ids != slice(None): if isinstance(self.body_names, str): diff --git a/embodichain/lab/gym/envs/managers/rewards.py b/embodichain/lab/gym/envs/managers/rewards.py index 8023280..d8892ef 100644 --- a/embodichain/lab/gym/envs/managers/rewards.py +++ b/embodichain/lab/gym/envs/managers/rewards.py @@ -299,42 +299,37 @@ def 
reaching_behind_object_reward( obs: dict, action: torch.Tensor, info: dict, - end_effector_cfg: SceneEntityCfg = None, object_cfg: SceneEntityCfg = None, - goal_cfg: SceneEntityCfg = None, + target_pose_key: str = "goal_pose", behind_offset: float = 0.015, height_offset: float = 0.015, distance_scale: float = 5.0, + part_name: str = None, ) -> torch.Tensor: """Reward for reaching behind an object along object-to-goal direction.""" - # get end effector position - ee_obj = env.sim[end_effector_cfg.uid] - if hasattr(ee_obj, "get_body_pose"): - ee_pos = ee_obj.get_body_pose(body_ids=end_effector_cfg.body_ids)[:, :3, 3] - elif hasattr(ee_obj, "get_local_pose"): - ee_pos = ee_obj.get_local_pose(to_matrix=True)[:, :3, 3] - else: + # get end effector position from robot FK + robot = env.robot + joint_ids = robot.get_joint_ids(part_name) + qpos = robot.get_qpos()[:, joint_ids] + ee_pose = robot.compute_fk(name=part_name, qpos=qpos, to_matrix=True) + ee_pos = ee_pose[:, :3, 3] + + # get object position + obj = env.sim.get_rigid_object(object_cfg.uid) + obj_pos = obj.get_local_pose(to_matrix=True)[:, :3, 3] + + # get goal position from info + if target_pose_key not in info: raise ValueError( - f"Entity '{end_effector_cfg.uid}' does not support position query." + f"Target pose '{target_pose_key}' not found in info dict. " + f"Make sure to provide it in get_info()." ) - # get object position - obj = env.sim[object_cfg.uid] - if hasattr(obj, "get_body_pose"): - obj_pos = obj.get_body_pose(body_ids=object_cfg.body_ids)[:, :3, 3] - elif hasattr(obj, "get_local_pose"): - obj_pos = obj.get_local_pose(to_matrix=True)[:, :3, 3] - else: - raise ValueError(f"Entity '{object_cfg.uid}' does not support position query.") - - # get goal position - goal_obj = env.sim[goal_cfg.uid] - if hasattr(goal_obj, "get_body_pose"): - goal_pos = goal_obj.get_body_pose(body_ids=goal_cfg.body_ids)[:, :3, 3] - elif hasattr(goal_obj, "get_local_pose"): - goal_pos = goal_obj.get_local_pose(to_matrix=True)[:, :3, 3] - else: - raise ValueError(f"Entity '{goal_cfg.uid}' does not support position query.") + target_poses = info[target_pose_key] + if target_poses.dim() == 2: # (num_envs, 3) + goal_pos = target_poses + else: # (num_envs, 4, 4) + goal_pos = target_poses[:, :3, 3] # compute push direction (from object to goal) push_direction = goal_pos - obj_pos @@ -425,17 +420,8 @@ def incremental_distance_to_target( ) -> torch.Tensor: """Incremental reward for progress toward a virtual target pose from info.""" # get source entity position - source_obj = env.sim[source_entity_cfg.uid] - if hasattr(source_obj, "get_body_pose"): - source_pos = source_obj.get_body_pose(body_ids=source_entity_cfg.body_ids)[ - :, :3, 3 - ] - elif hasattr(source_obj, "get_local_pose"): - source_pos = source_obj.get_local_pose(to_matrix=True)[:, :3, 3] - else: - raise ValueError( - f"Entity '{source_entity_cfg.uid}' does not support position query." 
- ) + source_obj = env.sim.get_rigid_object(source_entity_cfg.uid) + source_pos = source_obj.get_local_pose(to_matrix=True)[:, :3, 3] # get target position from info if target_pose_key not in info: diff --git a/embodichain/lab/gym/envs/tasks/rl/__init__.py b/embodichain/lab/gym/envs/tasks/rl/__init__.py index f8cf303..be52afc 100644 --- a/embodichain/lab/gym/envs/tasks/rl/__init__.py +++ b/embodichain/lab/gym/envs/tasks/rl/__init__.py @@ -18,10 +18,10 @@ from copy import deepcopy from embodichain.lab.gym.utils import registration as env_registry -from embodichain.lab.gym.envs.rl_env_cfg import RLEnvCfg +from embodichain.lab.gym.envs.embodied_env import EmbodiedEnvCfg -def build_env(env_id: str, base_env_cfg: RLEnvCfg): +def build_env(env_id: str, base_env_cfg: EmbodiedEnvCfg): """Create env from registry id, auto-inferring cfg class (EnvName -> EnvNameCfg).""" env = env_registry.make(env_id, cfg=deepcopy(base_env_cfg)) return env diff --git a/embodichain/lab/gym/envs/tasks/rl/push_cube.py b/embodichain/lab/gym/envs/tasks/rl/push_cube.py index 01e5074..f26a1fc 100644 --- a/embodichain/lab/gym/envs/tasks/rl/push_cube.py +++ b/embodichain/lab/gym/envs/tasks/rl/push_cube.py @@ -38,18 +38,6 @@ def __init__(self, cfg=None, **kwargs): if cfg is None: cfg = EmbodiedEnvCfg() - extensions = getattr(cfg, "extensions", {}) or {} - - # cfg.sim_cfg.enable_rt = True - - defaults = { - "success_threshold": 0.1, - } - for name, default in defaults.items(): - value = extensions.get(name, getattr(cfg, name, default)) - setattr(cfg, name, value) - setattr(self, name, getattr(cfg, name)) - super().__init__(cfg, **kwargs) def _draw_goal_marker(self): @@ -80,19 +68,6 @@ def _draw_goal_marker(self): ) self.sim.draw_marker(cfg=marker_cfg) - def _init_sim_state(self, **kwargs): - super()._init_sim_state(**kwargs) - self.single_action_space = spaces.Box( - low=-self.joint_limits, - high=self.joint_limits, - shape=(6,), - dtype=np.float32, - ) - if self.obs_mode == "state": - self.single_observation_space = spaces.Box( - low=-np.inf, high=np.inf, shape=(15,), dtype=np.float32 - ) - def _initialize_episode( self, env_ids: Sequence[int] | None = None, **kwargs ) -> None: diff --git a/embodichain/lab/gym/utils/gym_utils.py b/embodichain/lab/gym/utils/gym_utils.py index aa7123f..825a83a 100644 --- a/embodichain/lab/gym/utils/gym_utils.py +++ b/embodichain/lab/gym/utils/gym_utils.py @@ -323,24 +323,6 @@ def cat_tensor_with_ids( return out -def config_to_rl_cfg(config: dict) -> "RLEnvCfg": - """Parse gym-style configuration dict into an RL-ready config object.""" - - from embodichain.lab.gym.envs.rl_env_cfg import RLEnvCfg - - # Use config_to_cfg to parse shared fields - env_cfg = config_to_cfg(config) - # Convert to RLEnvCfg if needed - if not isinstance(env_cfg, RLEnvCfg): - env_cfg = RLEnvCfg.from_dict(env_cfg.__dict__) - # RL-specific fields - env_cfg.env_id = config.get("id") - env_cfg.num_envs = config["env"].get("num_envs", env_cfg.num_envs) - env_cfg.extensions = deepcopy(config.get("env", {}).get("extensions", {})) - # Add any RL-specific parsing here - return env_cfg - - def config_to_cfg(config: dict) -> "EmbodiedEnvCfg": """Parser configuration file into cfgs for env initialization. @@ -452,6 +434,7 @@ class ComponentCfg: env_cfg.articulation.append(cfg) env_cfg.sim_steps_per_control = config["env"].get("sim_steps_per_control", 4) + env_cfg.extensions = deepcopy(config.get("env", {}).get("extensions", {})) # TODO: support more env events, eg, grasp pose generation, mesh preprocessing, etc. 
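The two pieces introduced above — the `flatten_dict_observation` helper and the `flattened_observation_space` property — rely on one contract: leaf tensors of the nested observation dict are flattened to `(num_envs, -1)` and concatenated in sorted key order, so the feature layout seen by PPO is deterministic. The snippet below is a minimal, self-contained sketch of that contract using plain `torch` only; the toy tensor shapes are illustrative (they mirror the push-cube state observation of 6 joint positions plus three 3-D positions, assuming the end-effector pose is reported as position only), and the function is a stand-in, not the repository implementation.

```python
import torch


def flatten_obs_sketch(obs: dict) -> torch.Tensor:
    """Concatenate leaf tensors of a nested obs dict in sorted key order."""
    parts = []

    def _collect(d: dict) -> None:
        # Visit keys in sorted order so the flat feature layout is stable
        # across steps, resets, and processes.
        for key in sorted(d.keys()):
            value = d[key]
            if isinstance(value, dict):
                _collect(value)
            elif isinstance(value, torch.Tensor):
                parts.append(value.flatten(start_dim=1))  # keep (num_envs, -1)

    _collect(obs)
    return torch.cat(parts, dim=-1)


# Toy observation for 2 envs: "object" sorts before "robot", so cube/goal
# features precede ee_pos/qpos in the flat vector.
obs = {
    "robot": {"qpos": torch.zeros(2, 6), "ee_pos": torch.zeros(2, 3)},
    "object": {"cube_pos": torch.zeros(2, 3), "goal_pos": torch.zeros(2, 3)},
}
print(flatten_obs_sketch(obs).shape)  # torch.Size([2, 15])
```

Because the ordering is fixed by key sort rather than by config order, `flattened_observation_space` (which measures the flattened initial observation) and the rollout buffer should stay consistent no matter how the observation functors are listed in `gym_config.json`.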
From 4e66b14fa56e464010f024520f45187509157775 Mon Sep 17 00:00:00 2001 From: yuanhaonan Date: Mon, 19 Jan 2026 16:14:57 +0800 Subject: [PATCH 4/9] update review changes --- configs/agents/rl/push_cube/gym_config.json | 381 ++++++++------- configs/agents/rl/push_cube/train_config.json | 125 ++--- embodichain/agents/rl/train.py | 11 +- embodichain/agents/rl/utils/trainer.py | 8 +- embodichain/lab/gym/envs/embodied_env.py | 4 +- .../lab/gym/envs/managers/observations.py | 3 +- .../envs/managers/randomization/spatial.py | 2 +- .../lab/gym/envs/managers/reward_manager.py | 2 +- embodichain/lab/gym/envs/managers/rewards.py | 452 ++++++++++++------ .../lab/gym/envs/tasks/rl/push_cube.py | 25 +- embodichain/lab/gym/utils/gym_utils.py | 5 +- 11 files changed, 599 insertions(+), 419 deletions(-) diff --git a/configs/agents/rl/push_cube/gym_config.json b/configs/agents/rl/push_cube/gym_config.json index 87858e3..cd5113b 100644 --- a/configs/agents/rl/push_cube/gym_config.json +++ b/configs/agents/rl/push_cube/gym_config.json @@ -1,198 +1,197 @@ -{ - "id": "PushCubeRL", - "max_episodes": 5, - "env": { - "num_envs": 128, - "sim_steps_per_control": 4, - "events": { - "randomize_cube": { - "func": "randomize_rigid_object_pose", - "mode": "reset", - "params": { - "entity_cfg": {"uid": "cube"}, - "position_range": [[-0.2, -0.2, 0.0], [0.2, 0.2, 0.0]], - "relative_position": true - } - }, - "randomize_goal": { - "func": "randomize_target_pose", - "mode": "reset", - "params": { - "position_range": [[-0.3, -0.3, 0.05], [0.3, 0.3, 0.05]], - "relative_position": false, - "store_key": "goal_pose" - } - } - }, - "observations": { - "robot_qpos": { - "func": "normalize_robot_joint_data", - "mode": "modify", - "name": "robot/qpos", - "params": { - "joint_ids": [0, 1, 2, 3, 4, 5] - } - }, - "robot_ee_pos": { - "func": "get_robot_ee_pose", - "mode": "add", - "name": "robot/ee_pos", - "params": { - "part_name": "arm" - } - }, - "cube_pos": { - "func": "get_rigid_object_pose", - "mode": "add", - "name": "object/cube_pos", - "params": { - "entity_cfg": {"uid": "cube"} - } - }, - "goal_pos": { - "func": "target_position", - "mode": "add", - "name": "object/goal_pos", - "params": { - "target_pose_key": "goal_pose" +{ + "id": "PushCubeRL", + "max_episodes": 5, + "env": { + "num_envs": 128, + "sim_steps_per_control": 4, + "events": { + "randomize_cube": { + "func": "randomize_rigid_object_pose", + "mode": "reset", + "params": { + "entity_cfg": { + "uid": "cube" + }, + "position_range": [ + [-0.2, -0.2, 0.0], + [0.2, 0.2, 0.0] + ], + "relative_position": true + } + }, + "randomize_goal": { + "func": "randomize_target_pose", + "mode": "reset", + "params": { + "position_range": [ + [-0.3, -0.3, 0.05], + [0.3, 0.3, 0.05] + ], + "relative_position": false, + "store_key": "goal_pose" + } + } + }, + "observations": { + "robot_qpos": { + "func": "normalize_robot_joint_data", + "mode": "modify", + "name": "robot/qpos", + "params": { + "joint_ids": [0, 1, 2, 3, 4, 5] + } + }, + "robot_ee_pos": { + "func": "get_robot_eef_pose", + "mode": "add", + "name": "robot/ee_pos", + "params": { + "part_name": "arm" + } + }, + "cube_pos": { + "func": "get_rigid_object_pose", + "mode": "add", + "name": "object/cube_pos", + "params": { + "entity_cfg": { + "uid": "cube" + } + } + }, + "goal_pos": { + "func": "target_position", + "mode": "add", + "name": "object/goal_pos", + "params": { + "target_pose_key": "goal_pose" + } + } + }, + "rewards": { + "reaching_reward": { + "func": "reaching_behind_object_reward", + "mode": "add", + "name": "reaching", 
+ "weight": 0.1, + "params": { + "object_cfg": { + "uid": "cube" + }, + "target_pose_key": "goal_pose", + "behind_offset": 0.015, + "height_offset": 0.015, + "distance_scale": 5.0, + "part_name": "arm" + } + }, + "place_reward": { + "func": "incremental_distance_to_target", + "mode": "add", + "name": "place", + "weight": 1.0, + "params": { + "source_entity_cfg": { + "uid": "cube" + }, + "target_pose_key": "goal_pose", + "tanh_scale": 10.0, + "positive_weight": 2.0, + "negative_weight": 0.5, + "use_xy_only": true + } + }, + "action_penalty": { + "func": "action_smoothness_penalty", + "mode": "add", + "name": "action_penalty", + "weight": 0.01, + "params": {} + }, + "success_bonus": { + "func": "success_reward", + "mode": "add", + "name": "success", + "weight": 10.0, + "params": {} + } + }, + "extensions": { + "obs_mode": "state", + "episode_length": 100, + "joint_limits": 0.5, + "action_scale": 0.1, + "success_threshold": 0.1 } - } }, - "rewards": { - "reaching_reward": { - "func": "reaching_behind_object_reward", - "mode": "add", - "name": "reaching", - "weight": 0.1, - "params": { - "object_cfg": {"uid": "cube"}, - "target_pose_key": "goal_pose", - "behind_offset": 0.015, - "height_offset": 0.015, - "distance_scale": 5.0, - "part_name": "arm" - } - }, - "place_reward": { - "func": "incremental_distance_to_target", - "mode": "add", - "name": "place", - "weight": 1.0, - "params": { - "source_entity_cfg": {"uid": "cube"}, - "target_pose_key": "goal_pose", - "tanh_scale": 10.0, - "positive_weight": 2.0, - "negative_weight": 0.5, - "use_xy_only": true - } - }, - "action_penalty": { - "func": "action_smoothness_penalty", - "mode": "add", - "name": "action_penalty", - "weight": 0.01, - "params": {} - }, - "success_bonus": { - "func": "success_reward", - "mode": "add", - "name": "success", - "weight": 10.0, - "params": { - "reward_value": 1.0 + "robot": { + "uid": "Manipulator", + "urdf_cfg": { + "components": [ + { + "component_type": "arm", + "urdf_path": "UniversalRobots/UR10/UR10.urdf" + }, + { + "component_type": "hand", + "urdf_path": "DH_PGI_140_80/DH_PGI_140_80.urdf" + } + ] + }, + "init_pos": [0.0, 0.0, 0.0], + "init_rot": [0.0, 0.0, 0.0], + "init_qpos": [0.0, -1.57, 1.57, -1.57, -1.57, 0.0, 0.04, 0.04], + "drive_pros": { + "drive_type": "force", + "stiffness": 100000.0, + "damping": 1000.0, + "max_velocity": 2.0, + "max_effort": 500.0 + }, + "solver_cfg": { + "arm": { + "class_type": "PytorchSolver", + "end_link_name": "ee_link", + "root_link_name": "base_link", + "tcp": [ + [1.0, 0.0, 0.0, 0.0], + [0.0, 1.0, 0.0, 0.0], + [0.0, 0.0, 1.0, 0.16], + [0.0, 0.0, 0.0, 1.0] + ] + } + }, + "control_parts": { + "arm": ["JOINT[1-6]"] } - } }, - "extensions": { - "obs_mode": "state", - "episode_length": 100, - "joint_limits": 0.5, - "action_scale": 0.1, - "success_threshold": 0.1 - } - }, - "robot": { - "uid": "Manipulator", - "urdf_cfg": { - "components": [ + "sensor": [], + "light": {}, + "background": [], + "rigid_object": [ { - "component_type": "arm", - "urdf_path": "UniversalRobots/UR10/UR10.urdf", - "transform": [ - [1.0, 0.0, 0.0, 0.0], - [0.0, 1.0, 0.0, 0.0], - [0.0, 0.0, 1.0, 0.0], - [0.0, 0.0, 0.0, 1.0] - ] - }, - { - "component_type": "hand", - "urdf_path": "DH_PGI_140_80/DH_PGI_140_80.urdf", - "transform": [ - [1.0, 0.0, 0.0, 0.0], - [0.0, 1.0, 0.0, 0.0], - [0.0, 0.0, 1.0, 0.0], - [0.0, 0.0, 0.0, 1.0] - ] + "uid": "cube", + "shape": { + "shape_type": "Cube", + "size": [0.1, 0.1, 0.1] + }, + "body_type": "dynamic", + "init_pos": [-0.6, -0.4, 0.05], + "attrs": { + "mass": 10.0, + 
"static_friction": 3.0, + "dynamic_friction": 2.0, + "linear_damping": 2.0, + "angular_damping": 2.0, + "contact_offset": 0.003, + "rest_offset": 0.001, + "restitution": 0.1, + "max_depenetration_velocity": 10.0, + "max_linear_velocity": 1.0, + "max_angular_velocity": 1.0 + } } - ] - }, - "init_pos": [0.0, 0.0, 0.0], - "init_rot": [0.0, 0.0, 0.0], - "init_qpos": [0.0, -1.57, 1.57, -1.57, -1.57, 0.0, 0.04, 0.04], - "drive_pros": { - "drive_type": "force", - "stiffness": 100000.0, - "damping": 1000.0, - "max_velocity": 2.0, - "max_effort": 500.0 - }, - "solver_cfg": { - "arm": { - "class_type": "PytorchSolver", - "end_link_name": "ee_link", - "root_link_name": "base_link", - "tcp": [ - [1.0, 0.0, 0.0, 0.0], - [0.0, 1.0, 0.0, 0.0], - [0.0, 0.0, 1.0, 0.16], - [0.0, 0.0, 0.0, 1.0] - ] - } - }, - "control_parts": { - "arm": ["JOINT[1-6]"] - } - }, - "sensor": [], - "light": { - }, - "background": [], - "rigid_object": [ - { - "uid": "cube", - "shape": { - "shape_type": "Cube", - "size": [0.1, 0.1, 0.1] - }, - "body_type": "dynamic", - "init_pos": [-0.6, -0.4, 0.05], - "attrs": { - "mass": 10.0, - "static_friction": 3.0, - "dynamic_friction": 2.0, - "linear_damping": 2.0, - "angular_damping": 2.0, - "contact_offset": 0.003, - "rest_offset": 0.001, - "restitution": 0.1, - "max_depenetration_velocity": 10.0, - "max_linear_velocity": 1.0, - "max_angular_velocity": 1.0 - } - } - ], - "rigid_object_group": [], - "articulation": [] + ], + "rigid_object_group": [], + "articulation": [] } diff --git a/configs/agents/rl/push_cube/train_config.json b/configs/agents/rl/push_cube/train_config.json index 7e94155..3c3bda0 100644 --- a/configs/agents/rl/push_cube/train_config.json +++ b/configs/agents/rl/push_cube/train_config.json @@ -1,64 +1,67 @@ -{ - "trainer": { - "exp_name": "push_cube_ppo", - "gym_config": "configs/agents/rl/push_cube/gym_config.json", - "seed": 42, - "device": "cuda:0", - "headless": true, - "iterations": 1000, - "rollout_steps": 1024, - "eval_freq": 200, - "save_freq": 200, - "use_wandb": false, - "wandb_project_name": "embodychain-push_cube", - "events": { - "eval": { - "record_camera": { - "func": "record_camera_data_async", - "mode": "interval", - "interval_step": 1, - "params": { - "name": "main_cam", - "resolution": [640, 480], - "eye": [-1.4, 1.4, 2.0], - "target": [0, 0, 0], - "up": [0, 0, 1], - "intrinsics": [600, 600, 320, 240], - "save_path": "./outputs/videos/eval" - } +{ + "trainer": { + "exp_name": "push_cube_ppo", + "gym_config": "configs/agents/rl/push_cube/gym_config.json", + "seed": 42, + "device": "cuda:0", + "headless": false, + "enable_rt": false, + "gpu_id": 0, + "num_envs": 8, + "iterations": 1000, + "rollout_steps": 1024, + "eval_freq": 200, + "save_freq": 200, + "use_wandb": true, + "wandb_project_name": "embodychain-push_cube", + "events": { + "eval": { + "record_camera": { + "func": "record_camera_data_async", + "mode": "interval", + "interval_step": 1, + "params": { + "name": "main_cam", + "resolution": [640, 480], + "eye": [-1.4, 1.4, 2.0], + "target": [0, 0, 0], + "up": [0, 0, 1], + "intrinsics": [600, 600, 320, 240], + "save_path": "./outputs/videos/eval" + } + } + } } - } - } - }, - "policy": { - "name": "actor_critic", - "actor": { - "type": "mlp", - "network_cfg": { - "hidden_sizes": [256, 256], - "activation": "relu" - } }, - "critic": { - "type": "mlp", - "network_cfg": { - "hidden_sizes": [256, 256], - "activation": "relu" - } - } - }, - "algorithm": { - "name": "ppo", - "cfg": { - "learning_rate": 0.0001, - "n_epochs": 10, - "batch_size": 8192, - 
"gamma": 0.99, - "gae_lambda": 0.95, - "clip_coef": 0.2, - "ent_coef": 0.01, - "vf_coef": 0.5, - "max_grad_norm": 0.5 + "policy": { + "name": "actor_critic", + "actor": { + "type": "mlp", + "network_cfg": { + "hidden_sizes": [256, 256], + "activation": "relu" + } + }, + "critic": { + "type": "mlp", + "network_cfg": { + "hidden_sizes": [256, 256], + "activation": "relu" + } + } + }, + "algorithm": { + "name": "ppo", + "cfg": { + "learning_rate": 0.0001, + "n_epochs": 10, + "batch_size": 8192, + "gamma": 0.99, + "gae_lambda": 0.95, + "clip_coef": 0.2, + "ent_coef": 0.01, + "vf_coef": 0.5, + "max_grad_norm": 0.5 + } } - } -} \ No newline at end of file +} diff --git a/embodichain/agents/rl/train.py b/embodichain/agents/rl/train.py index 4d95ff4..ca3c299 100644 --- a/embodichain/agents/rl/train.py +++ b/embodichain/agents/rl/train.py @@ -60,6 +60,9 @@ def main(): eval_freq = int(trainer_cfg.get("eval_freq", 10000)) save_freq = int(trainer_cfg.get("save_freq", 50000)) headless = bool(trainer_cfg.get("headless", True)) + enable_rt = bool(trainer_cfg.get("enable_rt", False)) + gpu_id = int(trainer_cfg.get("gpu_id", 0)) + num_envs = trainer_cfg.get("num_envs", None) wandb_project_name = trainer_cfg.get("wandb_project_name", "embodychain-generic") # Device @@ -122,6 +125,10 @@ def main(): gym_config_data = load_json(str(gym_config_path)) gym_env_cfg = config_to_cfg(gym_config_data) + # Override num_envs from train config if provided + if num_envs is not None: + gym_env_cfg.num_envs = num_envs + # Ensure sim configuration mirrors runtime overrides if gym_env_cfg.sim_cfg is None: gym_env_cfg.sim_cfg = SimulationManagerCfg() @@ -135,9 +142,11 @@ def main(): else: gym_env_cfg.sim_cfg.sim_device = torch.device("cpu") gym_env_cfg.sim_cfg.headless = headless + gym_env_cfg.sim_cfg.enable_rt = enable_rt + gym_env_cfg.sim_cfg.gpu_id = gpu_id logger.log_info( - f"Loaded gym_config from {gym_config_path} (env_id={gym_config_data['id']}, headless={gym_env_cfg.sim_cfg.headless}, sim_device={gym_env_cfg.sim_cfg.sim_device})" + f"Loaded gym_config from {gym_config_path} (env_id={gym_config_data['id']}, num_envs={gym_env_cfg.num_envs}, headless={gym_env_cfg.sim_cfg.headless}, enable_rt={gym_env_cfg.sim_cfg.enable_rt}, sim_device={gym_env_cfg.sim_cfg.sim_device})" ) env = build_env(gym_config_data["id"], base_env_cfg=gym_env_cfg) diff --git a/embodichain/agents/rl/utils/trainer.py b/embodichain/agents/rl/utils/trainer.py index 6d38800..34dc191 100644 --- a/embodichain/agents/rl/utils/trainer.py +++ b/embodichain/agents/rl/utils/trainer.py @@ -239,8 +239,12 @@ def _eval_once(self, num_episodes: int = 5): ep_ret = torch.zeros(num_envs_eval, dtype=torch.float32, device=self.device) while not done_any.any(): actions, _, _ = self.policy.get_action(obs, deterministic=True) - result = self.eval_env.step(actions) - obs, reward, terminated, truncated, info = result + obs, reward, terminated, truncated, info = self.eval_env.step(actions) + + # Flatten dict observation from step + if isinstance(obs, dict): + obs = flatten_dict_observation(obs) + done = terminated | truncated reward = reward.float() done_any = done diff --git a/embodichain/lab/gym/envs/embodied_env.py b/embodichain/lab/gym/envs/embodied_env.py index c5e04fa..b87dba2 100644 --- a/embodichain/lab/gym/envs/embodied_env.py +++ b/embodichain/lab/gym/envs/embodied_env.py @@ -352,9 +352,7 @@ def _extend_reward( rewards, reward_info = self.reward_manager.compute( obs=obs, action=action, info=info ) - # Add individual reward terms to info for logging - for term_name, 
term_value in reward_info.items(): - info[f"reward/{term_name}"] = term_value + info["rewards"] = reward_info return rewards def _prepare_scene(self, **kwargs) -> None: diff --git a/embodichain/lab/gym/envs/managers/observations.py b/embodichain/lab/gym/envs/managers/observations.py index 306d993..7e500af 100644 --- a/embodichain/lab/gym/envs/managers/observations.py +++ b/embodichain/lab/gym/envs/managers/observations.py @@ -252,7 +252,7 @@ def compute_semantic_mask( return torch.stack(masks, dim=-1) -def get_robot_ee_pose( +def get_robot_eef_pose( env: "EmbodiedEnv", obs: EnvObs, part_name: str | None = None, @@ -263,7 +263,6 @@ def get_robot_ee_pose( Args: env: The environment instance. obs: The observation dictionary. - robot_uid: The uid of the robot. If None, uses env.robot. part_name: The name of the control part. If None, uses default part. position_only: If True, returns only position (3D). If False, returns full pose (4x4 matrix). diff --git a/embodichain/lab/gym/envs/managers/randomization/spatial.py b/embodichain/lab/gym/envs/managers/randomization/spatial.py index 6c3f92f..c3e32d3 100644 --- a/embodichain/lab/gym/envs/managers/randomization/spatial.py +++ b/embodichain/lab/gym/envs/managers/randomization/spatial.py @@ -339,7 +339,7 @@ def randomize_target_pose( ) # Store in env state (to be exposed via get_info) - state_attr = f"_{store_key}s" + state_attr = f"_{store_key}" if not hasattr(env, state_attr): setattr( env, diff --git a/embodichain/lab/gym/envs/managers/reward_manager.py b/embodichain/lab/gym/envs/managers/reward_manager.py index 8f282df..c034a90 100644 --- a/embodichain/lab/gym/envs/managers/reward_manager.py +++ b/embodichain/lab/gym/envs/managers/reward_manager.py @@ -76,7 +76,7 @@ def __str__(self) -> str: table.align["Name"] = "l" for index, name in enumerate(self._mode_functor_names[mode]): functor_cfg = self._mode_functor_cfgs[mode][index] - weight = functor_cfg.params.get("weight", 1.0) + weight = getattr(functor_cfg, "weight", 1.0) table.add_row([index, name, f"{weight:.3f}"]) # convert table to string diff --git a/embodichain/lab/gym/envs/managers/rewards.py b/embodichain/lab/gym/envs/managers/rewards.py index d8892ef..8b61ec6 100644 --- a/embodichain/lab/gym/envs/managers/rewards.py +++ b/embodichain/lab/gym/envs/managers/rewards.py @@ -27,39 +27,6 @@ from embodichain.lab.gym.envs import EmbodiedEnv -def reward_from_obs( - env: EmbodiedEnv, - obs: dict, - action: torch.Tensor, - info: dict, - obs_key: str = "robot/qpos", - target_value: float = 0.0, - scale: float = 1.0, -) -> torch.Tensor: - """Reward based on observation values.""" - # Parse nested keys (e.g., "robot/qpos") - keys = obs_key.split("/") - value = obs - for key in keys: - if isinstance(value, dict) and key in value: - value = value[key] - else: - return torch.zeros(env.num_envs, device=env.device) - - # Compute distance to target - if isinstance(value, torch.Tensor): - if value.dim() > 1: - # Multiple values, compute norm - distance = torch.norm(value - target_value, dim=-1) - else: - distance = torch.abs(value - target_value) - reward = -scale * distance - else: - reward = torch.zeros(env.num_envs, device=env.device) - - return reward - - def distance_between_objects( env: EmbodiedEnv, obs: dict, @@ -70,32 +37,43 @@ def distance_between_objects( exponential: bool = False, sigma: float = 1.0, ) -> torch.Tensor: - """Reward based on distance between two entities.""" + """Reward based on distance between two rigid objects. 
+ + Encourages the source object to get closer to the target object. Can use either + linear negative distance or exponential Gaussian-shaped reward. + + Args: + source_entity_cfg: Configuration for the source object (e.g., {"uid": "cube"}) + target_entity_cfg: Configuration for the target object (e.g., {"uid": "goal_sphere"}) + exponential: If True, use exponential reward exp(-d²/2σ²), else use -distance + sigma: Standard deviation for exponential reward (controls reward spread) + + Returns: + Reward tensor of shape (num_envs,). Higher when objects are closer. + - Linear mode: ranges from -inf to 0 (0 when objects touch) + - Exponential mode: ranges from 0 to 1 (1 when objects touch) + + Example: + ```json + { + "func": "distance_between_objects", + "weight": 0.5, + "params": { + "source_entity_cfg": {"uid": "cube"}, + "target_entity_cfg": {"uid": "target"}, + "exponential": true, + "sigma": 0.2 + } + } + ``` + """ # get source entity position - source_obj = env.sim[source_entity_cfg.uid] - if hasattr(source_obj, "get_body_pose"): - source_pos = source_obj.get_body_pose(body_ids=source_entity_cfg.body_ids)[ - :, :3, 3 - ] - elif hasattr(source_obj, "get_local_pose"): - source_pos = source_obj.get_local_pose(to_matrix=True)[:, :3, 3] - else: - raise ValueError( - f"Entity '{source_entity_cfg.uid}' does not support position query." - ) + source_obj = env.sim.get_rigid_object(source_entity_cfg.uid) + source_pos = source_obj.get_local_pose(to_matrix=True)[:, :3, 3] # get target entity position - target_obj = env.sim[target_entity_cfg.uid] - if hasattr(target_obj, "get_body_pose"): - target_pos = target_obj.get_body_pose(body_ids=target_entity_cfg.body_ids)[ - :, :3, 3 - ] - elif hasattr(target_obj, "get_local_pose"): - target_pos = target_obj.get_local_pose(to_matrix=True)[:, :3, 3] - else: - raise ValueError( - f"Entity '{target_entity_cfg.uid}' does not support position query." - ) + target_obj = env.sim.get_rigid_object(target_entity_cfg.uid) + target_pos = target_obj.get_local_pose(to_matrix=True)[:, :3, 3] # compute distance distance = torch.norm(source_pos - target_pos, dim=-1) @@ -117,13 +95,46 @@ def joint_velocity_penalty( action: torch.Tensor, info: dict, robot_uid: str = "robot", - joint_ids: slice | list[int] = slice(None), + joint_ids: slice | list[int] | None = None, + part_name: str | None = None, ) -> torch.Tensor: - """Penalize large joint velocities.""" - robot = env.sim[robot_uid] + """Penalize high joint velocities to encourage smooth motion. + + Computes the L2 norm of joint velocities and returns negative value as penalty. + Useful for preventing jerky or unstable robot movements. + + Args: + robot_uid: Robot entity UID in simulation (default: "robot") + joint_ids: Specific joint indices to penalize. Takes priority over part_name. + Example: [0, 1, 2] or slice(0, 6) + part_name: Control part name (e.g., "arm"). Used only if joint_ids is None. + Will penalize all joints in the specified part. + + Returns: + Penalty tensor of shape (num_envs,). Always negative or zero. + Magnitude increases with joint velocity (larger velocity = more negative). 
+ + Example: + ```json + { + "func": "joint_velocity_penalty", + "weight": 0.001, + "params": { + "robot_uid": "robot", + "part_name": "arm" + } + } + ``` + """ + robot = env.sim.get_robot(robot_uid) # get joint velocities - qvel = robot.body_data.qvel[:, joint_ids] + if joint_ids is not None: + qvel = robot.get_qvel()[:, joint_ids] + elif part_name is not None: + qvel = robot.get_qvel(name=part_name) + else: + qvel = robot.get_qvel() # compute L2 norm of joint velocities velocity_norm = torch.norm(qvel, dim=-1) @@ -138,17 +149,42 @@ def action_smoothness_penalty( action: torch.Tensor, info: dict, ) -> torch.Tensor: - """Penalize large changes in action between steps.""" + """Penalize large action changes between consecutive timesteps. + + Encourages smooth control commands by penalizing sudden changes in actions. + Stores previous action in env._reward_states for comparison. + + Returns: + Penalty tensor of shape (num_envs,). Zero on first call (no previous action), + negative on subsequent calls (larger change = more negative). + + Note: + This function maintains state across calls using env._reward_states['prev_actions']. + State is automatically reset when the environment resets. + + Example: + ```json + { + "func": "action_smoothness_penalty", + "weight": 0.01, + "params": {} + } + ``` + """ + # Use dictionary-based state management + if not hasattr(env, "_reward_states"): + env._reward_states = {} + # compute difference between current and previous action - if hasattr(env, "_prev_actions"): - action_diff = action - env._prev_actions + if "prev_actions" in env._reward_states: + action_diff = action - env._reward_states["prev_actions"] penalty = -torch.norm(action_diff, dim=-1) else: # no previous action, no penalty penalty = torch.zeros(env.num_envs, device=env.device) # store current action for next step - env._prev_actions = action.clone() + env._reward_states["prev_actions"] = action.clone() return penalty @@ -162,12 +198,40 @@ def joint_limit_penalty( joint_ids: slice | list[int] = slice(None), margin: float = 0.1, ) -> torch.Tensor: - """Penalize joints approaching their limits.""" - robot = env.sim[robot_uid] + """Penalize robot joints that are close to their position limits. + + Prevents joints from reaching their physical limits, which can cause instability + or singularities. Penalty increases as joints approach limits within the margin. + + Args: + robot_uid: Robot entity UID in simulation (default: "robot") + joint_ids: Joint indices to monitor (default: all joints) + margin: Normalized distance threshold (0 to 1). Penalty applied when joint + is within this fraction of its range from either limit. + Example: 0.1 means penalty when within 10% of limits. + + Returns: + Penalty tensor of shape (num_envs,). Always negative or zero. + Sum of penalties across all monitored joints. 
+ + Example: + ```json + { + "func": "joint_limit_penalty", + "weight": 0.01, + "params": { + "robot_uid": "robot", + "joint_ids": [0, 1, 2, 3, 4, 5], + "margin": 0.1 + } + } + ``` + """ + robot = env.sim.get_robot(robot_uid) # get joint positions and limits - qpos = robot.body_data.qpos[:, joint_ids] - qpos_limits = robot.body_data.qpos_limits[:, joint_ids, :] + qpos = robot.get_qpos()[:, joint_ids] + qpos_limits = robot.get_qpos_limits()[:, joint_ids, :] # compute normalized position in range [0, 1] qpos_normalized = (qpos - qpos_limits[:, :, 0]) / ( @@ -191,33 +255,6 @@ def joint_limit_penalty( return penalty.sum(dim=-1) -def collision_penalty( - env: EmbodiedEnv, - obs: dict, - action: torch.Tensor, - info: dict, - robot_uid: str = "robot", - force_threshold: float = 1.0, -) -> torch.Tensor: - """Penalize collisions based on contact forces.""" - robot = env.sim[robot_uid] - - # get joint forces (torques) - qf = robot.body_data.qf - - # check if any joint force exceeds threshold - collision_detected = (torch.abs(qf) > force_threshold).any(dim=-1) - - # return penalty for collisions - penalty = torch.where( - collision_detected, - torch.full((env.num_envs,), -1.0, device=env.device), - torch.zeros(env.num_envs, device=env.device), - ) - - return penalty - - def orientation_alignment_reward( env: EmbodiedEnv, obs: dict, @@ -226,32 +263,40 @@ def orientation_alignment_reward( source_entity_cfg: SceneEntityCfg = None, target_entity_cfg: SceneEntityCfg = None, ) -> torch.Tensor: - """Reward alignment of orientations between two entities.""" + """Reward rotational alignment between two rigid objects. + + Encourages the source object's orientation to match the target object's orientation. + Uses rotation matrix trace to measure alignment. + + Args: + source_entity_cfg: Configuration for the source object (e.g., {"uid": "cube"}) + target_entity_cfg: Configuration for the target object (e.g., {"uid": "reference"}) + + Returns: + Reward tensor of shape (num_envs,). Ranges from -1 to 1. + - 1.0: Perfect alignment (same orientation) + - 0.0: 90° rotation difference + - -1.0: 180° rotation difference (opposite orientation) + + Example: + ```json + { + "func": "orientation_alignment_reward", + "weight": 0.5, + "params": { + "source_entity_cfg": {"uid": "object"}, + "target_entity_cfg": {"uid": "goal_object"} + } + } + ``` + """ # get source entity rotation matrix - source_obj = env.sim[source_entity_cfg.uid] - if hasattr(source_obj, "get_body_pose"): - source_rot = source_obj.get_body_pose(body_ids=source_entity_cfg.body_ids)[ - :, :3, :3 - ] - elif hasattr(source_obj, "get_local_pose"): - source_rot = source_obj.get_local_pose(to_matrix=True)[:, :3, :3] - else: - raise ValueError( - f"Entity '{source_entity_cfg.uid}' does not support orientation query." - ) + source_obj = env.sim.get_rigid_object(source_entity_cfg.uid) + source_rot = source_obj.get_local_pose(to_matrix=True)[:, :3, :3] # get target entity rotation matrix - target_obj = env.sim[target_entity_cfg.uid] - if hasattr(target_obj, "get_body_pose"): - target_rot = target_obj.get_body_pose(body_ids=target_entity_cfg.body_ids)[ - :, :3, :3 - ] - elif hasattr(target_obj, "get_local_pose"): - target_rot = target_obj.get_local_pose(to_matrix=True)[:, :3, :3] - else: - raise ValueError( - f"Entity '{target_entity_cfg.uid}' does not support orientation query." 
- ) + target_obj = env.sim.get_rigid_object(target_entity_cfg.uid) + target_rot = target_obj.get_local_pose(to_matrix=True)[:, :3, :3] # compute rotation difference rot_diff = torch.bmm(source_rot, target_rot.transpose(-1, -2)) @@ -270,9 +315,30 @@ def success_reward( obs: dict, action: torch.Tensor, info: dict, - reward_value: float = 1.0, -) -> torch.Tensor: - """Sparse reward for task success.""" + ) -> torch.Tensor: + """Sparse bonus reward when task succeeds. + + Provides a fixed reward when the task success condition is met. + Reads success status from info['success'] which should be set by the environment. + + Returns: + Reward tensor of shape (num_envs,). + - 1.0 when successful + - 0.0 when not successful or if 'success' key missing + + Note: + The environment's get_info() must populate info['success'] with a boolean + tensor indicating success status for each environment. + + Example: + ```json + { + "func": "success_reward", + "weight": 10.0, + "params": {} + } + ``` + """ # Check if success info is available in info dict if "success" in info: success = info["success"] @@ -285,14 +351,12 @@ def success_reward( return torch.zeros(env.num_envs, device=env.device) # return reward - reward = torch.where( + return torch.where( success, - torch.full((env.num_envs,), reward_value, device=env.device), + torch.ones(env.num_envs, device=env.device), torch.zeros(env.num_envs, device=env.device), ) - return reward - def reaching_behind_object_reward( env: EmbodiedEnv, @@ -306,7 +370,41 @@ def reaching_behind_object_reward( distance_scale: float = 5.0, part_name: str = None, ) -> torch.Tensor: - """Reward for reaching behind an object along object-to-goal direction.""" + """Reward for positioning end-effector behind object for pushing. + + Encourages the robot's end-effector to reach a position behind the object along + the object-to-goal direction. Useful for push manipulation tasks. + + Args: + object_cfg: Configuration for the object to push (e.g., {"uid": "cube"}) + target_pose_key: Key in info dict for goal pose (default: "goal_pose") + Can be (num_envs, 3) position or (num_envs, 4, 4) transform + behind_offset: Distance behind object to reach (in meters, default: 0.015) + height_offset: Additional height above object (in meters, default: 0.015) + distance_scale: Scaling factor for tanh function (higher = steeper, default: 5.0) + part_name: Robot part name for FK computation (e.g., "arm") + + Returns: + Reward tensor of shape (num_envs,). Ranges from 0 to 1. + - 1.0: End-effector at ideal pushing position + - 0.0: End-effector far from ideal position + + Example: + ```json + { + "func": "reaching_behind_object_reward", + "weight": 0.1, + "params": { + "object_cfg": {"uid": "cube"}, + "target_pose_key": "goal_pose", + "behind_offset": 0.015, + "height_offset": 0.015, + "distance_scale": 5.0, + "part_name": "arm" + } + } + ``` + """ # get end effector position from robot FK robot = env.robot joint_ids = robot.get_joint_ids(part_name) @@ -362,19 +460,41 @@ def distance_to_target( sigma: float = 1.0, use_xy_only: bool = False, ) -> torch.Tensor: - """Reward based on distance to a virtual target pose from info.""" + """Reward based on absolute distance to a virtual target pose. + + Encourages an object to get closer to a target pose specified in the info dict. + Unlike incremental_distance_to_target, this provides direct distance-based reward. 
+ + Args: + source_entity_cfg: Configuration for the object (e.g., {"uid": "cube"}) + target_pose_key: Key in info dict for target pose (default: "target_pose") + Can be (num_envs, 3) position or (num_envs, 4, 4) transform + exponential: If True, use exponential reward exp(-d²/2σ²), else use -distance + sigma: Standard deviation for exponential reward (default: 1.0) + use_xy_only: If True, ignore z-axis and only consider horizontal distance + + Returns: + Reward tensor of shape (num_envs,). + - Linear mode: -distance (negative, approaches 0 when close) + - Exponential mode: exp(-d²/2σ²) (0 to 1, approaches 1 when close) + + Example: + ```json + { + "func": "distance_to_target", + "weight": 0.5, + "params": { + "source_entity_cfg": {"uid": "cube"}, + "target_pose_key": "goal_pose", + "exponential": false, + "use_xy_only": true + } + } + ``` + """ # get source entity position - source_obj = env.sim[source_entity_cfg.uid] - if hasattr(source_obj, "get_body_pose"): - source_pos = source_obj.get_body_pose(body_ids=source_entity_cfg.body_ids)[ - :, :3, 3 - ] - elif hasattr(source_obj, "get_local_pose"): - source_pos = source_obj.get_local_pose(to_matrix=True)[:, :3, 3] - else: - raise ValueError( - f"Entity '{source_entity_cfg.uid}' does not support position query." - ) + source_obj = env.sim.get_rigid_object(source_entity_cfg.uid) + source_pos = source_obj.get_local_pose(to_matrix=True)[:, :3, 3] # get target position from info if target_pose_key not in info: @@ -418,7 +538,47 @@ def incremental_distance_to_target( negative_weight: float = 1.0, use_xy_only: bool = False, ) -> torch.Tensor: - """Incremental reward for progress toward a virtual target pose from info.""" + """Incremental reward for progress toward a virtual target pose. + + Rewards the robot for getting closer to the target compared to previous timestep. + Stores previous distance in env._reward_states for comparison. Uses tanh shaping + to normalize rewards and supports asymmetric weighting for approach vs. retreat. + + Args: + source_entity_cfg: Configuration for the object (e.g., {"uid": "cube"}) + target_pose_key: Key in info dict for target pose (default: "target_pose") + Can be (num_envs, 3) position or (num_envs, 4, 4) transform + tanh_scale: Scaling for tanh normalization (higher = more sensitive, default: 10.0) + positive_weight: Multiplier for reward when getting closer (default: 1.0) + negative_weight: Multiplier for penalty when moving away (default: 1.0) + use_xy_only: If True, ignore z-axis and only consider horizontal distance + + Returns: + Reward tensor of shape (num_envs,). Zero on first call, then: + - Positive when getting closer (scaled by positive_weight) + - Negative when moving away (scaled by negative_weight) + - Magnitude bounded by tanh function + + Note: + This function maintains state using env._reward_states[f"prev_dist_{uid}_{key}"]. + State is automatically reset when the environment resets. 
+ + Example: + ```json + { + "func": "incremental_distance_to_target", + "weight": 1.0, + "params": { + "source_entity_cfg": {"uid": "cube"}, + "target_pose_key": "goal_pose", + "tanh_scale": 10.0, + "positive_weight": 2.0, + "negative_weight": 0.5, + "use_xy_only": true + } + } + ``` + """ # get source entity position source_obj = env.sim.get_rigid_object(source_entity_cfg.uid) source_pos = source_obj.get_local_pose(to_matrix=True)[:, :3, 3] @@ -443,13 +603,17 @@ def incremental_distance_to_target( current_dist = torch.norm(source_pos - target_pos, dim=-1) # initialize previous distance on first call - prev_dist_key = f"_prev_dist_{source_entity_cfg.uid}_{target_pose_key}" - if not hasattr(env, prev_dist_key): - setattr(env, prev_dist_key, current_dist.clone()) + # Use dictionary-based state management for better organization + if not hasattr(env, "_reward_states"): + env._reward_states = {} + + state_key = f"prev_dist_{source_entity_cfg.uid}_{target_pose_key}" + if state_key not in env._reward_states: + env._reward_states[state_key] = current_dist.clone() return torch.zeros(env.num_envs, device=env.device) # compute distance delta (positive = getting closer) - prev_dist = getattr(env, prev_dist_key) + prev_dist = env._reward_states[state_key] distance_delta = prev_dist - current_dist # apply tanh shaping @@ -463,6 +627,6 @@ def incremental_distance_to_target( ) # update previous distance - setattr(env, prev_dist_key, current_dist.clone()) + env._reward_states[state_key] = current_dist.clone() return reward diff --git a/embodichain/lab/gym/envs/tasks/rl/push_cube.py b/embodichain/lab/gym/envs/tasks/rl/push_cube.py index f26a1fc..072574f 100644 --- a/embodichain/lab/gym/envs/tasks/rl/push_cube.py +++ b/embodichain/lab/gym/envs/tasks/rl/push_cube.py @@ -40,6 +40,11 @@ def __init__(self, cfg=None, **kwargs): super().__init__(cfg, **kwargs) + @property + def goal_pose(self) -> torch.Tensor: + """Get current goal poses (4x4 matrices) for all environments.""" + return self._goal_pose + def _draw_goal_marker(self): """Draw axis marker at goal position for visualization.""" goal_sphere = self.sim.get_rigid_object("goal_sphere") @@ -91,15 +96,17 @@ def get_info(self, **kwargs) -> Dict[str, Any]: cube = self.sim.get_rigid_object("cube") cube_pos = cube.body_data.pose[:, :3] - # Get virtual goal pose from env state (set by randomize_target_pose event) - if hasattr(self, "_goal_poses"): - goal_pos = self._goal_poses[:, :3, 3] + # Get goal position from event-managed goal pose + if self.goal_pose is not None: + goal_pos = self.goal_pose[:, :3, 3] + xy_distance = torch.norm(cube_pos[:, :2] - goal_pos[:, :2], dim=1) + is_success = xy_distance < self.success_threshold else: - # Fallback: no virtual goal set - goal_pos = torch.zeros_like(cube_pos) - - xy_distance = torch.norm(cube_pos[:, :2] - goal_pos[:, :2], dim=1) - is_success = xy_distance < self.success_threshold + # Goal not yet set by randomize_target_pose event (e.g., before first reset) + xy_distance = torch.zeros(self.cfg.num_envs, device=self.device) + is_success = torch.zeros( + self.cfg.num_envs, device=self.device, dtype=torch.bool + ) info = { "success": is_success, @@ -107,7 +114,7 @@ def get_info(self, **kwargs) -> Dict[str, Any]: self.cfg.num_envs, device=self.device, dtype=torch.bool ), "elapsed_steps": self._elapsed_steps, - "goal_pose": self._goal_poses if hasattr(self, "_goal_poses") else None, + "goal_pose": self.goal_pose, } info["metrics"] = { "distance_to_goal": xy_distance, diff --git a/embodichain/lab/gym/utils/gym_utils.py 
b/embodichain/lab/gym/utils/gym_utils.py index 825a83a..cc6781a 100644 --- a/embodichain/lab/gym/utils/gym_utils.py +++ b/embodichain/lab/gym/utils/gym_utils.py @@ -346,6 +346,7 @@ def config_to_cfg(config: dict) -> "EmbodiedEnvCfg": SceneEntityCfg, EventCfg, ObservationCfg, + RewardCfg, DatasetFunctorCfg, ) from embodichain.utils import configclass @@ -455,8 +456,6 @@ class ComponentCfg: raise_if_not_found=True, ) - from embodichain.lab.gym.envs.managers import DatasetFunctorCfg - dataset = DatasetFunctorCfg( func=dataset_func, mode=dataset_params_modified["mode"], @@ -562,8 +561,6 @@ class ComponentCfg: raise_if_not_found=True, ) - from embodichain.lab.gym.envs.managers import RewardCfg - reward = RewardCfg( func=reward_func, mode=reward_params_modified["mode"], From bea6d5e1b5580aab2bba7e78df5386eac723f2a6 Mon Sep 17 00:00:00 2001 From: yuanhaonan Date: Mon, 19 Jan 2026 16:17:30 +0800 Subject: [PATCH 5/9] delete RLenvCfg --- embodichain/lab/gym/envs/rl_env_cfg.py | 33 -------------------------- 1 file changed, 33 deletions(-) delete mode 100644 embodichain/lab/gym/envs/rl_env_cfg.py diff --git a/embodichain/lab/gym/envs/rl_env_cfg.py b/embodichain/lab/gym/envs/rl_env_cfg.py deleted file mode 100644 index 3444874..0000000 --- a/embodichain/lab/gym/envs/rl_env_cfg.py +++ /dev/null @@ -1,33 +0,0 @@ -# ---------------------------------------------------------------------------- -# Copyright (c) 2021-2025 DexForce Technology Co., Ltd. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ---------------------------------------------------------------------------- - -from typing import Any, Dict - -from embodichain.lab.gym.envs.embodied_env import EmbodiedEnvCfg -from embodichain.utils import configclass - - -@configclass -class RLEnvCfg(EmbodiedEnvCfg): - """Extended configuration for RL environments built from gym-style specs.""" - - env_id: str = "" - extensions: Dict[str, Any] = {} - - @classmethod - def from_dict(cls, d): - """Create an instance from a dictionary.""" - return cls(**d) From 69241fbaf5cb62dcbb4a366ec8ebe1db6b614de3 Mon Sep 17 00:00:00 2001 From: yuanhaonan Date: Mon, 19 Jan 2026 18:15:57 +0800 Subject: [PATCH 6/9] update review changes --- configs/agents/rl/push_cube/gym_config.json | 4 - embodichain/lab/gym/envs/managers/cfg.py | 7 -- .../lab/gym/envs/managers/reward_manager.py | 15 +--- embodichain/lab/gym/envs/managers/rewards.py | 76 +++++++++---------- embodichain/lab/gym/utils/gym_utils.py | 1 - 5 files changed, 40 insertions(+), 63 deletions(-) diff --git a/configs/agents/rl/push_cube/gym_config.json b/configs/agents/rl/push_cube/gym_config.json index cd5113b..54283a2 100644 --- a/configs/agents/rl/push_cube/gym_config.json +++ b/configs/agents/rl/push_cube/gym_config.json @@ -72,7 +72,6 @@ "reaching_reward": { "func": "reaching_behind_object_reward", "mode": "add", - "name": "reaching", "weight": 0.1, "params": { "object_cfg": { @@ -88,7 +87,6 @@ "place_reward": { "func": "incremental_distance_to_target", "mode": "add", - "name": "place", "weight": 1.0, "params": { "source_entity_cfg": { @@ -104,14 +102,12 @@ "action_penalty": { "func": "action_smoothness_penalty", "mode": "add", - "name": "action_penalty", "weight": 0.01, "params": {} }, "success_bonus": { "func": "success_reward", "mode": "add", - "name": "success", "weight": 10.0, "params": {} } diff --git a/embodichain/lab/gym/envs/managers/cfg.py b/embodichain/lab/gym/envs/managers/cfg.py index 3161fb8..6aeb280 100644 --- a/embodichain/lab/gym/envs/managers/cfg.py +++ b/embodichain/lab/gym/envs/managers/cfg.py @@ -326,13 +326,6 @@ class RewardCfg(FunctorCfg): - `replace`: The reward replaces the total reward (useful for single reward functions). """ - name: str = MISSING - """The name of the reward term. - - This is used for logging and debugging purposes. The name should be descriptive of what - the reward term represents, e.g., "distance_to_goal", "gripper_close", "collision_penalty". - """ - weight: float = 1.0 """The weight multiplier for this reward term. diff --git a/embodichain/lab/gym/envs/managers/reward_manager.py b/embodichain/lab/gym/envs/managers/reward_manager.py index c034a90..b3e46de 100644 --- a/embodichain/lab/gym/envs/managers/reward_manager.py +++ b/embodichain/lab/gym/envs/managers/reward_manager.py @@ -41,6 +41,8 @@ class RewardManager(ManagerBase): The reward manager offers two modes of operation: - `add`: This mode computes a reward term and adds it to the total reward (weighted by the term's weight). - `replace`: This mode replaces the total reward with the computed value (useful for single reward functions). + + Note: The config key is used as the unique identifier and display name for each reward functor. """ _env: EmbodiedEnv @@ -82,13 +84,8 @@ def __str__(self) -> str: # convert table to string msg += table.get_string() msg += "\n" - return msg - """ - Properties. - """ - @property def active_functors(self) -> dict[str, list[str]]: """Name of active reward functors. 
@@ -97,10 +94,6 @@ def active_functors(self) -> dict[str, list[str]]: """ return self._mode_functor_names - """ - Operations. - """ - def reset(self, env_ids: Union[Sequence[int], None] = None) -> dict[str, float]: """Reset reward terms that are stateful (implemented as classes). @@ -200,10 +193,6 @@ def get_functor_cfg(self, functor_name: str) -> RewardCfg: return self._mode_functor_cfgs[mode][functors.index(functor_name)] logger.log_error(f"Reward functor '{functor_name}' not found.") - """ - Helper functions. - """ - def _prepare_functors(self): # check if config is dict already if isinstance(self.cfg, dict): diff --git a/embodichain/lab/gym/envs/managers/rewards.py b/embodichain/lab/gym/envs/managers/rewards.py index 8b61ec6..c832313 100644 --- a/embodichain/lab/gym/envs/managers/rewards.py +++ b/embodichain/lab/gym/envs/managers/rewards.py @@ -38,21 +38,21 @@ def distance_between_objects( sigma: float = 1.0, ) -> torch.Tensor: """Reward based on distance between two rigid objects. - + Encourages the source object to get closer to the target object. Can use either linear negative distance or exponential Gaussian-shaped reward. - + Args: source_entity_cfg: Configuration for the source object (e.g., {"uid": "cube"}) target_entity_cfg: Configuration for the target object (e.g., {"uid": "goal_sphere"}) exponential: If True, use exponential reward exp(-d²/2σ²), else use -distance sigma: Standard deviation for exponential reward (controls reward spread) - + Returns: Reward tensor of shape (num_envs,). Higher when objects are closer. - Linear mode: ranges from -inf to 0 (0 when objects touch) - Exponential mode: ranges from 0 to 1 (1 when objects touch) - + Example: ```json { @@ -99,21 +99,21 @@ def joint_velocity_penalty( part_name: str | None = None, ) -> torch.Tensor: """Penalize high joint velocities to encourage smooth motion. - + Computes the L2 norm of joint velocities and returns negative value as penalty. Useful for preventing jerky or unstable robot movements. - + Args: robot_uid: Robot entity UID in simulation (default: "robot") joint_ids: Specific joint indices to penalize. Takes priority over part_name. Example: [0, 1, 2] or slice(0, 6) part_name: Control part name (e.g., "arm"). Used only if joint_ids is None. Will penalize all joints in the specified part. - + Returns: Penalty tensor of shape (num_envs,). Always negative or zero. Magnitude increases with joint velocity (larger velocity = more negative). - + Example: ```json { @@ -150,18 +150,18 @@ def action_smoothness_penalty( info: dict, ) -> torch.Tensor: """Penalize large action changes between consecutive timesteps. - + Encourages smooth control commands by penalizing sudden changes in actions. Stores previous action in env._reward_states for comparison. - + Returns: Penalty tensor of shape (num_envs,). Zero on first call (no previous action), negative on subsequent calls (larger change = more negative). - + Note: This function maintains state across calls using env._reward_states['prev_actions']. State is automatically reset when the environment resets. - + Example: ```json { @@ -199,21 +199,21 @@ def joint_limit_penalty( margin: float = 0.1, ) -> torch.Tensor: """Penalize robot joints that are close to their position limits. - + Prevents joints from reaching their physical limits, which can cause instability or singularities. Penalty increases as joints approach limits within the margin. 
- + Args: robot_uid: Robot entity UID in simulation (default: "robot") joint_ids: Joint indices to monitor (default: all joints) margin: Normalized distance threshold (0 to 1). Penalty applied when joint is within this fraction of its range from either limit. Example: 0.1 means penalty when within 10% of limits. - + Returns: Penalty tensor of shape (num_envs,). Always negative or zero. Sum of penalties across all monitored joints. - + Example: ```json { @@ -264,20 +264,20 @@ def orientation_alignment_reward( target_entity_cfg: SceneEntityCfg = None, ) -> torch.Tensor: """Reward rotational alignment between two rigid objects. - + Encourages the source object's orientation to match the target object's orientation. Uses rotation matrix trace to measure alignment. - + Args: source_entity_cfg: Configuration for the source object (e.g., {"uid": "cube"}) target_entity_cfg: Configuration for the target object (e.g., {"uid": "reference"}) - + Returns: Reward tensor of shape (num_envs,). Ranges from -1 to 1. - 1.0: Perfect alignment (same orientation) - 0.0: 90° rotation difference - -1.0: 180° rotation difference (opposite orientation) - + Example: ```json { @@ -315,21 +315,21 @@ def success_reward( obs: dict, action: torch.Tensor, info: dict, - ) -> torch.Tensor: +) -> torch.Tensor: """Sparse bonus reward when task succeeds. - + Provides a fixed reward when the task success condition is met. Reads success status from info['success'] which should be set by the environment. - + Returns: Reward tensor of shape (num_envs,). - 1.0 when successful - 0.0 when not successful or if 'success' key missing - + Note: The environment's get_info() must populate info['success'] with a boolean tensor indicating success status for each environment. - + Example: ```json { @@ -371,10 +371,10 @@ def reaching_behind_object_reward( part_name: str = None, ) -> torch.Tensor: """Reward for positioning end-effector behind object for pushing. - + Encourages the robot's end-effector to reach a position behind the object along the object-to-goal direction. Useful for push manipulation tasks. - + Args: object_cfg: Configuration for the object to push (e.g., {"uid": "cube"}) target_pose_key: Key in info dict for goal pose (default: "goal_pose") @@ -383,12 +383,12 @@ def reaching_behind_object_reward( height_offset: Additional height above object (in meters, default: 0.015) distance_scale: Scaling factor for tanh function (higher = steeper, default: 5.0) part_name: Robot part name for FK computation (e.g., "arm") - + Returns: Reward tensor of shape (num_envs,). Ranges from 0 to 1. - 1.0: End-effector at ideal pushing position - 0.0: End-effector far from ideal position - + Example: ```json { @@ -461,10 +461,10 @@ def distance_to_target( use_xy_only: bool = False, ) -> torch.Tensor: """Reward based on absolute distance to a virtual target pose. - + Encourages an object to get closer to a target pose specified in the info dict. Unlike incremental_distance_to_target, this provides direct distance-based reward. - + Args: source_entity_cfg: Configuration for the object (e.g., {"uid": "cube"}) target_pose_key: Key in info dict for target pose (default: "target_pose") @@ -472,12 +472,12 @@ def distance_to_target( exponential: If True, use exponential reward exp(-d²/2σ²), else use -distance sigma: Standard deviation for exponential reward (default: 1.0) use_xy_only: If True, ignore z-axis and only consider horizontal distance - + Returns: Reward tensor of shape (num_envs,). 
        - Linear mode: -distance (negative, approaches 0 when close)
        - Exponential mode: exp(-d²/2σ²) (0 to 1, approaches 1 when close)
-
+
     Example:
         ```json
         {
@@ -539,11 +539,11 @@ def incremental_distance_to_target(
     use_xy_only: bool = False,
 ) -> torch.Tensor:
     """Incremental reward for progress toward a virtual target pose.
-
+
     Rewards the robot for getting closer to the target compared to previous timestep.
     Stores previous distance in env._reward_states for comparison. Uses tanh shaping
     to normalize rewards and supports asymmetric weighting for approach vs. retreat.
-
+
     Args:
         source_entity_cfg: Configuration for the object (e.g., {"uid": "cube"})
         target_pose_key: Key in info dict for target pose (default: "target_pose")
@@ -552,17 +552,17 @@ def incremental_distance_to_target(
         positive_weight: Multiplier for reward when getting closer (default: 1.0)
         negative_weight: Multiplier for penalty when moving away (default: 1.0)
         use_xy_only: If True, ignore z-axis and only consider horizontal distance
-
+
     Returns:
         Reward tensor of shape (num_envs,). Zero on first call, then:
         - Positive when getting closer (scaled by positive_weight)
         - Negative when moving away (scaled by negative_weight)
         - Magnitude bounded by tanh function
-
+
     Note:
         This function maintains state using env._reward_states[f"prev_dist_{uid}_{key}"].
         State is automatically reset when the environment resets.
-
+
     Example:
         ```json
         {
diff --git a/embodichain/lab/gym/utils/gym_utils.py b/embodichain/lab/gym/utils/gym_utils.py
index cc6781a..dff96e9 100644
--- a/embodichain/lab/gym/utils/gym_utils.py
+++ b/embodichain/lab/gym/utils/gym_utils.py
@@ -564,7 +564,6 @@ class ComponentCfg:
             reward = RewardCfg(
                 func=reward_func,
                 mode=reward_params_modified["mode"],
-                name=reward_params_modified["name"],
                 params=reward_params_modified["params"],
             )
 

From f05a1eb852f0ea6c0d2ccafedc1545503b5e4fbb Mon Sep 17 00:00:00 2001
From: Chen Jian
Date: Mon, 19 Jan 2026 18:18:25 +0800
Subject: [PATCH 7/9] multiple simulation manager (#74)

Co-authored-by: chenjian
---
 docs/source/overview/sim/sim_manager.md |  4 ++
 embodichain/lab/sim/sim_manager.py      | 86 ++++++++++++++++---------
 2 files changed, 58 insertions(+), 32 deletions(-)

diff --git a/docs/source/overview/sim/sim_manager.md b/docs/source/overview/sim/sim_manager.md
index d2f19be..d49188c 100644
--- a/docs/source/overview/sim/sim_manager.md
+++ b/docs/source/overview/sim/sim_manager.md
@@ -26,6 +26,10 @@ sim_config = SimulationManagerCfg(
 )
 ```
 
+### Multiple Instances
+- `SimulationManager.get_n_instances()` returns the number of `SimulationManager` instances created so far.
+- `SimulationManager.get_instance(instance_id)` returns a specific instance; `instance_id` must be less than `SimulationManager.get_n_instances()`.
+
 ### Configuration Parameters
 
 | Parameter | Type | Default | Description |
diff --git a/embodichain/lab/sim/sim_manager.py b/embodichain/lab/sim/sim_manager.py
index 1f9c662..48343d4 100644
--- a/embodichain/lab/sim/sim_manager.py
+++ b/embodichain/lab/sim/sim_manager.py
@@ -158,14 +158,11 @@ class SimulationManager:
     - physics simulation management, eg. time step, manual update, etc.
     - interactive control via gizmo and window callbacks events.
 
-    This class implements the singleton pattern to ensure only one instance exists at a time.
-
    Args:
        sim_config (SimulationManagerCfg, optional): simulation configuration. Defaults to SimulationManagerCfg(). 
""" - _instance = None - _is_initialized = False + _instances = {} SUPPORTED_SENSOR_TYPES = { "Camera": Camera, @@ -173,26 +170,40 @@ class SimulationManager: "ContactSensor": ContactSensor, } - def __new__(cls, sim_config: SimulationManagerCfg = SimulationManagerCfg()): - """Create or return the singleton instance.""" - if cls._instance is None: - cls._instance = super(SimulationManager, cls).__new__(cls) - return cls._instance + def __new__( + cls, + sim_config: SimulationManagerCfg = SimulationManagerCfg() + ): + """Create or return the instance based on instance_id.""" + n_instance = len(list(cls._instances.keys())) + instance = super(SimulationManager, cls).__new__(cls) + # Store sim_config in the instance for use in __init__ or elsewhere + instance.sim_config = sim_config + cls._instances[str(n_instance + 1)] = instance + return instance def __init__( - self, sim_config: SimulationManagerCfg = SimulationManagerCfg() + self, + sim_config: SimulationManagerCfg = SimulationManagerCfg() ) -> None: + instance_id = SimulationManager.get_n_instances() + 1 # Skip initialization if already initialized - if self._is_initialized: + if hasattr(self, "_is_initialized") and self._is_initialized: logger.log_warning( - "SimulationManager is already initialized. Skipping re-initialization. " - "Use SimulationManager.get_instance() to get the existing instance or " - "SimulationManager.reset() to create a new instance." + f"SimulationManager (id={instance_id}) is already initialized. Skipping re-initialization. " + "Use SimulationManager.get_instance(instance_id) to get the existing instance or " + "SimulationManager.reset(instance_id) to create a new instance." ) return + if sim_config.enable_rt and instance_id > 0: + logger.log_error( + f"Ray Tracing rendering backend is only supported for single instance (instance_id=0). " + ) + # Mark as initialized - SimulationManager._is_initialized = True + self._is_initialized = True + self.instance_id = instance_id # Cache paths self._sim_cache_dir = SIM_CACHE_DIR self._material_cache_dir = MATERIAL_CACHE_DIR @@ -278,41 +289,52 @@ def __init__( self._build_multiple_arenas(sim_config.num_envs) @classmethod - def get_instance(cls) -> SimulationManager: - """Get the singleton instance of SimulationManager. + def get_instance(cls, instance_id: int = 0) -> SimulationManager: + """Get the instance of SimulationManager by id. + + Args: + instance_id (int): The instance id. Defaults to 0. Returns: - SimulationManager: The singleton instance. + SimulationManager: The instance. Raises: RuntimeError: If the instance has not been created yet. """ - if cls._instance is None: + if instance_id not in cls._instances: logger.log_error( - "SimulationManager has not been instantiated yet. " - "Create an instance first using SimulationManager(sim_config)." + f"SimulationManager (id={instance_id}) has not been instantiated yet. " + f"Create an instance first using SimulationManager(sim_config, instance_id={instance_id})." ) - return cls._instance + return cls._instances[instance_id] + + @classmethod + def get_n_instances(cls) -> int: + """Get the number of instantiated SimulationManager instances. + + Returns: + int: The number of instances. + """ + return len(cls._instances) @classmethod - def reset(cls) -> None: - """Reset the singleton instance. + def reset(cls, instance_id: int = 0) -> None: + """Reset the instance. This allows creating a new instance with different configuration. 
""" - if cls._instance is not None: - logger.log_info("Resetting SimulationManager singleton instance.") - cls._instance = None - cls._is_initialized = False + if instance_id in cls._instances: + logger.log_info(f"Resetting SimulationManager instance {instance_id}.") + del cls._instances[instance_id] @classmethod - def is_instantiated(cls) -> bool: - """Check if the singleton instance has been created. + def is_instantiated(cls, instance_id: int = 0) -> bool: + """Check if the instance has been created. Returns: bool: True if the instance exists, False otherwise. """ - return cls._instance is not None + return instance_id in cls._instances @property def num_envs(self) -> int: @@ -1560,4 +1582,4 @@ def destroy(self) -> None: self._env.clean() self._world.quit() - self.reset() + SimulationManager.reset(self.instance_id) \ No newline at end of file From 16004fd33b18510bf065411ad1cb6d1fe48d0b9e Mon Sep 17 00:00:00 2001 From: yuecideng Date: Mon, 19 Jan 2026 19:23:10 +0800 Subject: [PATCH 8/9] wip --- embodichain/lab/sim/sim_manager.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/embodichain/lab/sim/sim_manager.py b/embodichain/lab/sim/sim_manager.py index 48343d4..799013f 100644 --- a/embodichain/lab/sim/sim_manager.py +++ b/embodichain/lab/sim/sim_manager.py @@ -170,10 +170,7 @@ class SimulationManager: "ContactSensor": ContactSensor, } - def __new__( - cls, - sim_config: SimulationManagerCfg = SimulationManagerCfg() - ): + def __new__(cls, sim_config: SimulationManagerCfg = SimulationManagerCfg()): """Create or return the instance based on instance_id.""" n_instance = len(list(cls._instances.keys())) instance = super(SimulationManager, cls).__new__(cls) @@ -183,8 +180,7 @@ def __new__( return instance def __init__( - self, - sim_config: SimulationManagerCfg = SimulationManagerCfg() + self, sim_config: SimulationManagerCfg = SimulationManagerCfg() ) -> None: instance_id = SimulationManager.get_n_instances() + 1 # Skip initialization if already initialized @@ -1582,4 +1578,4 @@ def destroy(self) -> None: self._env.clean() self._world.quit() - SimulationManager.reset(self.instance_id) \ No newline at end of file + SimulationManager.reset(self.instance_id) From ba6ea10aff6a2def561e69471fed4c30adcd356f Mon Sep 17 00:00:00 2001 From: yuecideng Date: Mon, 19 Jan 2026 19:29:04 +0800 Subject: [PATCH 9/9] wip --- configs/agents/rl/push_cube/gym_config.json | 2 +- embodichain/lab/gym/envs/base_env.py | 7 +++---- embodichain/lab/gym/envs/managers/rewards.py | 8 ++++---- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/configs/agents/rl/push_cube/gym_config.json b/configs/agents/rl/push_cube/gym_config.json index 54283a2..766d12c 100644 --- a/configs/agents/rl/push_cube/gym_config.json +++ b/configs/agents/rl/push_cube/gym_config.json @@ -70,7 +70,7 @@ }, "rewards": { "reaching_reward": { - "func": "reaching_behind_object_reward", + "func": "reaching_behind_object", "mode": "add", "weight": 0.1, "params": { diff --git a/embodichain/lab/gym/envs/base_env.py b/embodichain/lab/gym/envs/base_env.py index 2ca18d7..9974360 100644 --- a/embodichain/lab/gym/envs/base_env.py +++ b/embodichain/lab/gym/envs/base_env.py @@ -480,10 +480,6 @@ def get_reward( rewards = torch.zeros(self.num_envs, dtype=torch.float32, device=self.device) - rewards = self._extend_reward( - rewards=rewards, obs=obs, action=action, info=info - ) - return rewards def _step_action(self, action: EnvAction) -> EnvAction: @@ -548,6 +544,9 @@ def step( obs = self.get_obs(**kwargs) info = 
self.get_info(**kwargs) rewards = self.get_reward(obs=obs, action=action, info=info) + rewards = self._extend_reward( + rewards=rewards, obs=obs, action=action, info=info + ) terminateds = torch.logical_or( info.get( diff --git a/embodichain/lab/gym/envs/managers/rewards.py b/embodichain/lab/gym/envs/managers/rewards.py index c832313..5cfbef5 100644 --- a/embodichain/lab/gym/envs/managers/rewards.py +++ b/embodichain/lab/gym/envs/managers/rewards.py @@ -255,7 +255,7 @@ def joint_limit_penalty( return penalty.sum(dim=-1) -def orientation_alignment_reward( +def orientation_alignment( env: EmbodiedEnv, obs: dict, action: torch.Tensor, @@ -281,7 +281,7 @@ def orientation_alignment_reward( Example: ```json { - "func": "orientation_alignment_reward", + "func": "orientation_alignment", "weight": 0.5, "params": { "source_entity_cfg": {"uid": "object"}, @@ -358,7 +358,7 @@ def success_reward( ) -def reaching_behind_object_reward( +def reaching_behind_object( env: EmbodiedEnv, obs: dict, action: torch.Tensor, @@ -392,7 +392,7 @@ def reaching_behind_object_reward( Example: ```json { - "func": "reaching_behind_object_reward", + "func": "reaching_behind_object", "weight": 0.1, "params": { "object_cfg": {"uid": "cube"},