num_rollout_goals:last_env_goal_idx] = \
env_goals[goal_key]
if num_future_goals > 0:
future_obs_idxs = []
for i in indices[-num_future_goals:]:
possible_future_obs_idxs = self._idx_to_future_obs_idx[i]
# This is generally faster than random.choice. Makes you wonder what
# random.choice is doing (see the timing sketch after this listing).
num_options = len(possible_future_obs_idxs)
next_obs_i = int(np.random.randint(0, num_options))
future_obs_idxs.append(possible_future_obs_idxs[next_obs_i])
future_obs_idxs = np.array(future_obs_idxs)
resampled_goals[-num_future_goals:] = self._next_obs[
self.achieved_goal_key
][future_obs_idxs]
for goal_key in self.goal_keys:
new_obs_dict[goal_key][-num_future_goals:] = \
self._next_obs[goal_key][future_obs_idxs]
new_next_obs_dict[goal_key][-num_future_goals:] = \
self._next_obs[goal_key][future_obs_idxs]
new_obs_dict[self.desired_goal_key] = resampled_goals
new_next_obs_dict[self.desired_goal_key] = resampled_goals
new_obs_dict = postprocess_obs_dict(new_obs_dict)
new_next_obs_dict = postprocess_obs_dict(new_next_obs_dict)
# resampled_goals must be postprocessed as well
resampled_goals = new_next_obs_dict[self.desired_goal_key]
new_actions = self._actions[indices]
# For example, the environments in this repo have batch-wise
# implementations of computing rewards:
# https://github.com/vitchyr/multiworld
if hasattr(self.env, "compute_rewards"):
new_rewards = self.env.compute_rewards(
new_actions,
new_next_obs_dict,
)
else:  # Assuming it's a (possibly wrapped) gym GoalEnv
new_rewards = np.ones((batch_size, 1))
for i in range(batch_size):
new_rewards[i] = self.env.compute_reward(
new_next_obs_dict[self.achieved_goal_key][i],
new_next_obs_dict[self.desired_goal_key][i],
None
)
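The reward branch just above is identical in both versions of the code: it prefers a batched env.compute_rewards when the environment provides one (as the multiworld environments linked above do) and otherwise falls back to the per-transition gym GoalEnv-style compute_reward. A hedged sketch of the two hooks the buffer dispatches on; ToyGoalEnv, the goal keys, and the 0.05 threshold are invented for illustration:

import numpy as np

class ToyGoalEnv:
    """Illustrative only: the two reward interfaces the replay buffer looks for."""

    def compute_reward(self, achieved_goal, desired_goal, info):
        # Per-transition hook (gym GoalEnv signature); used by the else-branch.
        return -float(np.linalg.norm(achieved_goal - desired_goal) > 0.05)

    def compute_rewards(self, actions, obs):
        # Batched hook; used when hasattr(env, "compute_rewards") is true.
        # obs is a dict of (batch, goal_dim) arrays; these key names are placeholders.
        dists = np.linalg.norm(obs["achieved_goal"] - obs["desired_goal"], axis=1)
        return -(dists > 0.05).astype(np.float64)

Either way, every sampled transition gets a reward recomputed against its relabeled goal rather than the goal it was collected with.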
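The inline comment about random.choice in the sampling loop above is a performance claim. As a rough, standalone illustration (not part of the buffer code; the array size and iteration count are arbitrary, and it assumes the comparison is against np.random.choice on a NumPy array), drawing one element via np.random.randint plus indexing is usually noticeably faster, since np.random.choice repeats its argument validation and probability setup on every call:

import timeit
import numpy as np

possible_idxs = np.arange(500)  # stand-in for one _idx_to_future_obs_idx entry

# Single draw with np.random.choice
t_choice = timeit.timeit(lambda: np.random.choice(possible_idxs), number=100_000)

# Single draw via np.random.randint plus indexing, as in the loop above
t_randint = timeit.timeit(
    lambda: possible_idxs[int(np.random.randint(0, len(possible_idxs)))],
    number=100_000,
)

print(f"np.random.choice: {t_choice:.2f}s  randint+index: {t_randint:.2f}s")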
After Change
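The change below replaces the per-index Python loop with a single vectorized draw: one uniform sample per transition, scaled by that transition's number of candidate future observations and truncated to an integer index. As a minimal standalone sketch of the same trick on made-up data (the names mirror the buffer's, but the mapping and indices are invented and nothing here touches the real class):

import numpy as np

# Stand-in for self._idx_to_future_obs_idx: each sampled transition maps to the
# indices of observations that occur later in the same episode.
idx_to_future_obs_idx = {
    0: np.array([1, 2, 3]),
    1: np.array([4, 5]),
    2: np.array([6, 7, 8, 9]),
}
future_indices = np.array([0, 1, 2, 2])  # transitions picked for relabeling

# One uniform draw per transition, scaled by that transition's candidate count.
possible_future_obs_lens = np.array(
    [len(idx_to_future_obs_idx[i]) for i in future_indices]
)
next_obs_idxs = (
    np.random.random(len(future_indices)) * possible_future_obs_lens
).astype(int)

future_obs_idxs = np.array([
    idx_to_future_obs_idx[ids][next_obs_idxs[i]]
    for i, ids in enumerate(future_indices)
])
print(future_obs_idxs)  # one valid future-observation index per sampled transition

Scaling uniform samples by the per-row lengths is what lets a single call cover every transition: each row has a different upper bound, so one shared np.random.randint high value would not work.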
if num_future_goals > 0:
# better future obs sample algorithm
future_indices = indices[-num_future_goals:]
possible_future_obs_lens = np.array([len(self._idx_to_future_obs_idx[i]) for i in future_indices])
next_obs_idxs = (np.random.random(num_future_goals) * possible_future_obs_lens).astype(np.int)
future_obs_idxs = np.array([self._idx_to_future_obs_idx[ids][next_obs_idxs[i]] for i, ids in enumerate(future_indices)])
resampled_goals[-num_future_goals:] = self._next_obs[
self.achieved_goal_key
][future_obs_idxs]
for goal_key in self.goal_keys:
new_obs_dict[goal_key][-num_future_goals:] = \
self._next_obs[goal_key][future_obs_idxs]
new_next_obs_dict[goal_key][-num_future_goals:] = \
self._next_obs[goal_key][future_obs_idxs]
new_obs_dict[self.desired_goal_key] = resampled_goals
new_next_obs_dict[self.desired_goal_key] = resampled_goals
new_obs_dict = postprocess_obs_dict(new_obs_dict)
new_next_obs_dict = postprocess_obs_dict(new_next_obs_dict)
# resampled_goals must be postprocessed as well
resampled_goals = new_next_obs_dict[self.desired_goal_key]
new_actions = self._actions[indices]
# For example, the environments in this repo have batch-wise
# implementations of computing rewards:
# https://github.com/vitchyr/multiworld
if hasattr(self.env, "compute_rewards"):
new_rewards = self.env.compute_rewards(
new_actions,
new_next_obs_dict,
)
else:  # Assuming it's a (possibly wrapped) gym GoalEnv
new_rewards = np.ones((batch_size, 1))
for i in range(batch_size):
new_rewards[i] = self.env.compute_reward(
new_next_obs_dict[self.achieved_goal_key][i],
new_next_obs_dict[self.desired_goal_key][i],
None