from typing import List import numpy as np import gym from ding.envs import BaseEnv, BaseEnvTimestep class GameEnv(BaseEnv): def __init__(self, game_type: str = 'prisoner_dilemma') -> None: self.game_type = game_type assert self.game_type in ['zero_sum', 'prisoner_dilemma'] if self.game_type == 'prisoner_dilemma': self.optimal_policy = [0, 1] elif self.game_type == 'zero_sum': self.optimal_policy = [0.375, 0.625] self._observation_space = None self._action_space = None self._reward_space = None def seed(self, seed: int, dynamic_seed: bool = False) -> None: # ignore seed pass def reset(self) -> np.ndarray: return np.array([[0, 1], [1, 0]]).astype(np.float32) # trivial observation def step(self, actions: List[int]) -> BaseEnvTimestep: if self.game_type == 'zero_sum': if actions == [0, 0]: rewards = 3, -3 results = "wins", "losses" elif actions == [0, 1]: rewards = -2, 2 results = "losses", "wins" elif actions == [1, 0]: rewards = -2, 2 results = "losses", "wins" elif actions == [1, 1]: rewards = 1, -1 results = "wins", "losses" else: raise RuntimeError("invalid actions: {}".format(actions)) elif self.game_type == 'prisoner_dilemma': if actions == [0, 0]: rewards = -1, -1 results = "draws", "draws" elif actions == [0, 1]: rewards = -20, 0 results = "losses", "wins" elif actions == [1, 0]: rewards = 0, -20 results = "wins", "losses" elif actions == [1, 1]: rewards = -10, -10 results = 'draws', 'draws' else: raise RuntimeError("invalid actions: {}".format(actions)) observations = np.array([[0, 1], [1, 0]]).astype(np.float32) rewards = np.array(rewards).astype(np.float32) rewards = rewards[..., np.newaxis] dones = True, True infos = { 'result': results[0], 'eval_episode_return': rewards[0] }, { 'result': results[1], 'eval_episode_return': rewards[1] } return BaseEnvTimestep(observations, rewards, True, infos) def close(self) -> None: pass def __repr__(self) -> str: return "DI-engine League Demo GameEnv" @property def observation_space(self) -> gym.spaces.Space: return self._observation_space @property def action_space(self) -> gym.spaces.Space: return self._action_space @property def reward_space(self) -> gym.spaces.Space: return self._reward_space def random_action(self) -> List[int]: return [np.random.randint(0, 2) for _ in range(2)]