File size: 3,024 Bytes
079c32c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
from typing import List
import numpy as np
import gym

from ding.envs import BaseEnv, BaseEnvTimestep


class GameEnv(BaseEnv):
    """
    Two-player, single-step matrix game environment.

    Each episode consists of exactly one ``step``: both players submit an action in
    ``{0, 1}`` simultaneously, receive the payoff listed in the table of the selected
    ``game_type`` and the episode ends. The observation is a constant 2x2 float32 array
    (one row per player) and carries no information about the game state.

    Supported ``game_type`` values are ``'zero_sum'`` and ``'prisoner_dilemma'``.
    """

    # game_type -> {(action of player 0, action of player 1): ((reward 0, reward 1), (result 0, result 1))}
    _PAYOFFS = {
        'zero_sum': {
            (0, 0): ((3, -3), ("wins", "losses")),
            (0, 1): ((-2, 2), ("losses", "wins")),
            (1, 0): ((-2, 2), ("losses", "wins")),
            (1, 1): ((1, -1), ("wins", "losses")),
        },
        'prisoner_dilemma': {
            (0, 0): ((-1, -1), ("draws", "draws")),
            (0, 1): ((-20, 0), ("losses", "wins")),
            (1, 0): ((0, -20), ("wins", "losses")),
            (1, 1): ((-10, -10), ("draws", "draws")),
        },
    }

    def __init__(self, game_type: str = 'prisoner_dilemma') -> None:
        """
        Arguments:
            - game_type (:obj:`str`): Either ``'zero_sum'`` or ``'prisoner_dilemma'``.
        """
        self.game_type = game_type
        assert self.game_type in ['zero_sum', 'prisoner_dilemma']
        # NOTE(review): presumably the reference probabilities of choosing action 0 / action 1
        # that a trained policy is compared against -- confirm with the code that reads it.
        if self.game_type == 'prisoner_dilemma':
            self.optimal_policy = [0, 1]
        elif self.game_type == 'zero_sum':
            self.optimal_policy = [0.375, 0.625]
        self._observation_space = None
        self._action_space = None
        self._reward_space = None

    def seed(self, seed: int, dynamic_seed: bool = False) -> None:
        """Accepted for interface compatibility; the seed is ignored."""
        pass

    @staticmethod
    def _trivial_obs() -> np.ndarray:
        """Return the constant 2x2 float32 observation shared by ``reset`` and ``step``."""
        return np.array([[0, 1], [1, 0]]).astype(np.float32)

    @staticmethod
    def _normalize_actions(actions) -> tuple:
        """
        Convert ``actions`` into a tuple of plain Python scalars usable as a payoff-table key.

        Lists, tuples, NumPy arrays and other objects exposing ``tolist``/``item`` are accepted.
        Anything that cannot be converted yields an empty tuple, which matches no table entry.
        """
        try:
            if hasattr(actions, 'tolist'):
                actions = actions.tolist()
            return tuple(a.item() if hasattr(a, 'item') else a for a in list(actions))
        except (TypeError, ValueError, RuntimeError):
            return ()

    def reset(self) -> np.ndarray:
        """Start a new episode and return the trivial observation."""
        return self._trivial_obs()

    def step(self, actions: List[int]) -> BaseEnvTimestep:
        """
        Play the one-shot game with the joint action of both players.

        Arguments:
            - actions (:obj:`List[int]`): Two actions, each 0 or 1, ordered as (player 0, player 1). \
                Any sequence-like container (list, tuple, NumPy array) is accepted.
        Returns:
            - timestep (:obj:`BaseEnvTimestep`): The trivial observation, a float32 reward array of \
                shape (2, 1), ``done=True`` and a tuple with one info dict per player.
        Raises:
            - RuntimeError: If ``actions`` is not one of the four valid joint actions.
        """
        joint_action = self._normalize_actions(actions)
        try:
            outcome = self._PAYOFFS[self.game_type].get(joint_action)
        except TypeError:
            # Unhashable elements (e.g. nested lists) can never be a valid joint action.
            outcome = None
        if outcome is None:
            raise RuntimeError("invalid actions: {}".format(actions))
        rewards, results = outcome

        observations = self._trivial_obs()
        rewards = np.array(rewards).astype(np.float32)
        # Add a trailing axis so every player's reward is an array of shape (1, ).
        rewards = rewards[..., np.newaxis]
        infos = {
            'result': results[0],
            'eval_episode_return': rewards[0]
        }, {
            'result': results[1],
            'eval_episode_return': rewards[1]
        }
        # The game is single-step, so the episode always terminates here.
        return BaseEnvTimestep(observations, rewards, True, infos)

    def close(self) -> None:
        """Nothing to release."""
        pass

    def __repr__(self) -> str:
        return "DI-engine League Demo GameEnv"

    @property
    def observation_space(self) -> gym.spaces.Space:
        """Observation space; ``None`` unless set elsewhere."""
        return self._observation_space

    @property
    def action_space(self) -> gym.spaces.Space:
        """Action space; ``None`` unless set elsewhere."""
        return self._action_space

    @property
    def reward_space(self) -> gym.spaces.Space:
        """Reward space; ``None`` unless set elsewhere."""
        return self._reward_space

    def random_action(self) -> List[int]:
        """Sample an independent uniform action in ``{0, 1}`` for each of the two players."""
        return [np.random.randint(0, 2) for _ in range(2)]