from typing import Optional, Dict, Union, List import math import torch import torch.nn as nn import torch.nn.functional as F from torch.distributions import Normal, Independent from ding.torch_utils import fc_block, noise_block, NoiseLinearLayer, MLP, PopArt, conv1d_block from ding.rl_utils import beta_function_map from ding.utils import lists_to_dicts, SequenceType class DiscreteHead(nn.Module): """ Overview: The ``DiscreteHead`` is used to generate discrete actions logit or Q-value logit, \ which is often used in q-learning algorithms or actor-critic algorithms for discrete action space. Interfaces: ``__init__``, ``forward``. """ def __init__( self, hidden_size: int, output_size: int, layer_num: int = 1, activation: Optional[nn.Module] = nn.ReLU(), norm_type: Optional[str] = None, dropout: Optional[float] = None, noise: Optional[bool] = False, ) -> None: """ Overview: Init the ``DiscreteHead`` layers according to the provided arguments. Arguments: - hidden_size (:obj:`int`): The ``hidden_size`` of the MLP connected to ``DiscreteHead``. - output_size (:obj:`int`): The number of outputs. - layer_num (:obj:`int`): The number of layers used in the network to compute Q value output. - activation (:obj:`nn.Module`): The type of activation function to use in MLP. \ If ``None``, then default set activation to ``nn.ReLU()``. Default ``None``. - norm_type (:obj:`str`): The type of normalization to use. See ``ding.torch_utils.network.fc_block`` \ for more details. Default ``None``. - dropout (:obj:`float`): The dropout rate, default set to None. - noise (:obj:`bool`): Whether use ``NoiseLinearLayer`` as ``layer_fn`` in Q networks' MLP. \ Default ``False``. """ super(DiscreteHead, self).__init__() layer = NoiseLinearLayer if noise else nn.Linear block = noise_block if noise else fc_block self.Q = nn.Sequential( MLP( hidden_size, hidden_size, hidden_size, layer_num, layer_fn=layer, activation=activation, use_dropout=dropout is not None, dropout_probability=dropout, norm_type=norm_type ), block(hidden_size, output_size) ) def forward(self, x: torch.Tensor) -> Dict: """ Overview: Use encoded embedding tensor to run MLP with ``DiscreteHead`` and return the prediction dictionary. Arguments: - x (:obj:`torch.Tensor`): Tensor containing input embedding. Returns: - outputs (:obj:`Dict`): Dict containing keyword ``logit`` (:obj:`torch.Tensor`). Shapes: - x: :math:`(B, N)`, where ``B = batch_size`` and ``N = hidden_size``. - logit: :math:`(B, M)`, where ``M = output_size``. Examples: >>> head = DiscreteHead(64, 64) >>> inputs = torch.randn(4, 64) >>> outputs = head(inputs) >>> assert isinstance(outputs, dict) and outputs['logit'].shape == torch.Size([4, 64]) """ logit = self.Q(x) return {'logit': logit} class DistributionHead(nn.Module): """ Overview: The ``DistributionHead`` is used to generate distribution for Q-value. This module is used in C51 algorithm. Interfaces: ``__init__``, ``forward``. """ def __init__( self, hidden_size: int, output_size: int, layer_num: int = 1, n_atom: int = 51, v_min: float = -10, v_max: float = 10, activation: Optional[nn.Module] = nn.ReLU(), norm_type: Optional[str] = None, noise: Optional[bool] = False, eps: Optional[float] = 1e-6, ) -> None: """ Overview: Init the ``DistributionHead`` layers according to the provided arguments. Arguments: - hidden_size (:obj:`int`): The ``hidden_size`` of the MLP connected to ``DistributionHead``. - output_size (:obj:`int`): The number of outputs. - layer_num (:obj:`int`): The number of layers used in the network to compute Q value distribution. - n_atom (:obj:`int`): The number of atoms (discrete supports). Default is ``51``. - v_min (:obj:`int`): Min value of atoms. Default is ``-10``. - v_max (:obj:`int`): Max value of atoms. Default is ``10``. - activation (:obj:`nn.Module`): The type of activation function to use in MLP. \ If ``None``, then default set activation to ``nn.ReLU()``. Default ``None``. - norm_type (:obj:`str`): The type of normalization to use. See ``ding.torch_utils.network.fc_block`` \ for more details. Default ``None``. - noise (:obj:`bool`): Whether use ``NoiseLinearLayer`` as ``layer_fn`` in Q networks' MLP. \ Default ``False``. - eps (:obj:`float`): Small constant used for numerical stability. """ super(DistributionHead, self).__init__() layer = NoiseLinearLayer if noise else nn.Linear block = noise_block if noise else fc_block self.Q = nn.Sequential( MLP( hidden_size, hidden_size, hidden_size, layer_num, layer_fn=layer, activation=activation, norm_type=norm_type ), block(hidden_size, output_size * n_atom) ) self.output_size = output_size self.n_atom = n_atom self.v_min = v_min self.v_max = v_max self.eps = eps # for numerical stability def forward(self, x: torch.Tensor) -> Dict: """ Overview: Use encoded embedding tensor to run MLP with ``DistributionHead`` and return the prediction dictionary. Arguments: - x (:obj:`torch.Tensor`): Tensor containing input embedding. Returns: - outputs (:obj:`Dict`): Dict containing keywords ``logit`` (:obj:`torch.Tensor`) and \ ``distribution`` (:obj:`torch.Tensor`). Shapes: - x: :math:`(B, N)`, where ``B = batch_size`` and ``N = hidden_size``. - logit: :math:`(B, M)`, where ``M = output_size``. - distribution: :math:`(B, M, n_atom)`. Examples: >>> head = DistributionHead(64, 64) >>> inputs = torch.randn(4, 64) >>> outputs = head(inputs) >>> assert isinstance(outputs, dict) >>> assert outputs['logit'].shape == torch.Size([4, 64]) >>> # default n_atom is 51 >>> assert outputs['distribution'].shape == torch.Size([4, 64, 51]) """ q = self.Q(x) q = q.view(*q.shape[:-1], self.output_size, self.n_atom) dist = torch.softmax(q, dim=-1) + self.eps q = dist * torch.linspace(self.v_min, self.v_max, self.n_atom).to(x) q = q.sum(-1) return {'logit': q, 'distribution': dist} class BranchingHead(nn.Module): """ Overview: The ``BranchingHead`` is used to generate Q-value with different branches. This module is used in Branch DQN. Interfaces: ``__init__``, ``forward``. """ def __init__( self, hidden_size: int, num_branches: int = 0, action_bins_per_branch: int = 2, layer_num: int = 1, a_layer_num: Optional[int] = None, v_layer_num: Optional[int] = None, norm_type: Optional[str] = None, activation: Optional[nn.Module] = nn.ReLU(), noise: Optional[bool] = False, ) -> None: """ Overview: Init the ``BranchingHead`` layers according to the provided arguments. \ This head achieves a linear increase of the number of network outputs \ with the number of degrees of freedom by allowing a level of independence for each individual action. Therefore, this head is suitable for high dimensional action Spaces. Arguments: - hidden_size (:obj:`int`): The ``hidden_size`` of the MLP connected to ``BranchingHead``. - num_branches (:obj:`int`): The number of branches, which is equivalent to the action dimension. - action_bins_per_branch (:obj:int): The number of action bins in each dimension. - layer_num (:obj:`int`): The number of layers used in the network to compute Advantage and Value output. - a_layer_num (:obj:`int`): The number of layers used in the network to compute Advantage output. - v_layer_num (:obj:`int`): The number of layers used in the network to compute Value output. - output_size (:obj:`int`): The number of outputs. - norm_type (:obj:`str`): The type of normalization to use. See ``ding.torch_utils.network.fc_block`` \ for more details. Default ``None``. - activation (:obj:`nn.Module`): The type of activation function to use in MLP. \ If ``None``, then default set activation to ``nn.ReLU()``. Default ``None``. - noise (:obj:`bool`): Whether use ``NoiseLinearLayer`` as ``layer_fn`` in Q networks' MLP. \ Default ``False``. """ super(BranchingHead, self).__init__() if a_layer_num is None: a_layer_num = layer_num if v_layer_num is None: v_layer_num = layer_num self.num_branches = num_branches self.action_bins_per_branch = action_bins_per_branch layer = NoiseLinearLayer if noise else nn.Linear block = noise_block if noise else fc_block # value network self.V = nn.Sequential( MLP( hidden_size, hidden_size, hidden_size, v_layer_num, layer_fn=layer, activation=activation, norm_type=norm_type ), block(hidden_size, 1) ) # action branching network action_output_dim = action_bins_per_branch self.branches = nn.ModuleList( [ nn.Sequential( MLP( hidden_size, hidden_size, hidden_size, a_layer_num, layer_fn=layer, activation=activation, norm_type=norm_type ), block(hidden_size, action_output_dim) ) for _ in range(self.num_branches) ] ) def forward(self, x: torch.Tensor) -> Dict: """ Overview: Use encoded embedding tensor to run MLP with ``BranchingHead`` and return the prediction dictionary. Arguments: - x (:obj:`torch.Tensor`): Tensor containing input embedding. Returns: - outputs (:obj:`Dict`): Dict containing keyword ``logit`` (:obj:`torch.Tensor`). Shapes: - x: :math:`(B, N)`, where ``B = batch_size`` and ``N = hidden_size``. - logit: :math:`(B, M)`, where ``M = output_size``. Examples: >>> head = BranchingHead(64, 5, 2) >>> inputs = torch.randn(4, 64) >>> outputs = head(inputs) >>> assert isinstance(outputs, dict) and outputs['logit'].shape == torch.Size([4, 5, 2]) """ value_out = self.V(x) value_out = torch.unsqueeze(value_out, 1) action_out = [] for b in self.branches: action_out.append(b(x)) action_scores = torch.stack(action_out, 1) # From the paper, this implementation performs better than both the naive alternative (Q = V + A) \ # and the local maximum reduction method (Q = V + max(A)). action_scores = action_scores - torch.mean(action_scores, 2, keepdim=True) logits = value_out + action_scores return {'logit': logits} class RainbowHead(nn.Module): """ Overview: The ``RainbowHead`` is used to generate distribution of Q-value. This module is used in Rainbow DQN. Interfaces: ``__init__``, ``forward``. """ def __init__( self, hidden_size: int, output_size: int, layer_num: int = 1, n_atom: int = 51, v_min: float = -10, v_max: float = 10, activation: Optional[nn.Module] = nn.ReLU(), norm_type: Optional[str] = None, noise: Optional[bool] = True, eps: Optional[float] = 1e-6, ) -> None: """ Overview: Init the ``RainbowHead`` layers according to the provided arguments. Arguments: - hidden_size (:obj:`int`): The ``hidden_size`` of the MLP connected to ``RainbowHead``. - output_size (:obj:`int`): The number of outputs. - layer_num (:obj:`int`): The number of layers used in the network to compute Q value output. - n_atom (:obj:`int`): The number of atoms (discrete supports). Default is ``51``. - v_min (:obj:`int`): Min value of atoms. Default is ``-10``. - v_max (:obj:`int`): Max value of atoms. Default is ``10``. - activation (:obj:`nn.Module`): The type of activation function to use in MLP. \ If ``None``, then default set activation to ``nn.ReLU()``. Default ``None``. - norm_type (:obj:`str`): The type of normalization to use. See ``ding.torch_utils.network.fc_block`` \ for more details. Default ``None``. - noise (:obj:`bool`): Whether use ``NoiseLinearLayer`` as ``layer_fn`` in Q networks' MLP. \ Default ``False``. - eps (:obj:`float`): Small constant used for numerical stability. """ super(RainbowHead, self).__init__() layer = NoiseLinearLayer if noise else nn.Linear block = noise_block if noise else fc_block self.A = nn.Sequential( MLP( hidden_size, hidden_size, hidden_size, layer_num, layer_fn=layer, activation=activation, norm_type=norm_type ), block(hidden_size, output_size * n_atom) ) self.Q = nn.Sequential( MLP( hidden_size, hidden_size, hidden_size, layer_num, layer_fn=layer, activation=activation, norm_type=norm_type ), block(hidden_size, n_atom) ) self.output_size = output_size self.n_atom = n_atom self.v_min = v_min self.v_max = v_max self.eps = eps def forward(self, x: torch.Tensor) -> Dict: """ Overview: Use encoded embedding tensor to run MLP with ``RainbowHead`` and return the prediction dictionary. Arguments: - x (:obj:`torch.Tensor`): Tensor containing input embedding. Returns: - outputs (:obj:`Dict`): Dict containing keywords ``logit`` (:obj:`torch.Tensor`) and \ ``distribution`` (:obj:`torch.Tensor`). Shapes: - x: :math:`(B, N)`, where ``B = batch_size`` and ``N = hidden_size``. - logit: :math:`(B, M)`, where ``M = output_size``. - distribution: :math:`(B, M, n_atom)`. Examples: >>> head = RainbowHead(64, 64) >>> inputs = torch.randn(4, 64) >>> outputs = head(inputs) >>> assert isinstance(outputs, dict) >>> assert outputs['logit'].shape == torch.Size([4, 64]) >>> # default n_atom is 51 >>> assert outputs['distribution'].shape == torch.Size([4, 64, 51]) """ a = self.A(x) q = self.Q(x) a = a.view(*a.shape[:-1], self.output_size, self.n_atom) q = q.view(*q.shape[:-1], 1, self.n_atom) q = q + a - a.mean(dim=-2, keepdim=True) dist = torch.softmax(q, dim=-1) + self.eps q = dist * torch.linspace(self.v_min, self.v_max, self.n_atom).to(x) q = q.sum(-1) return {'logit': q, 'distribution': dist} class QRDQNHead(nn.Module): """ Overview: The ``QRDQNHead`` (Quantile Regression DQN) is used to output action quantiles. Interfaces: ``__init__``, ``forward``. """ def __init__( self, hidden_size: int, output_size: int, layer_num: int = 1, num_quantiles: int = 32, activation: Optional[nn.Module] = nn.ReLU(), norm_type: Optional[str] = None, noise: Optional[bool] = False, ) -> None: """ Overview: Init the ``QRDQNHead`` layers according to the provided arguments. Arguments: - hidden_size (:obj:`int`): The ``hidden_size`` of the MLP connected to ``QRDQNHead``. - output_size (:obj:`int`): The number of outputs. - layer_num (:obj:`int`): The number of layers used in the network to compute Q value output. - num_quantiles (:obj:`int`): The number of quantiles. Default is ``32``. - activation (:obj:`nn.Module`): The type of activation function to use in MLP. \ If ``None``, then default set activation to ``nn.ReLU()``. Default ``None``. - norm_type (:obj:`str`): The type of normalization to use. See ``ding.torch_utils.network.fc_block`` \ for more details. Default ``None``. - noise (:obj:`bool`): Whether use ``NoiseLinearLayer`` as ``layer_fn`` in Q networks' MLP. \ Default ``False``. """ super(QRDQNHead, self).__init__() layer = NoiseLinearLayer if noise else nn.Linear block = noise_block if noise else fc_block self.Q = nn.Sequential( MLP( hidden_size, hidden_size, hidden_size, layer_num, layer_fn=layer, activation=activation, norm_type=norm_type ), block(hidden_size, output_size * num_quantiles) ) self.num_quantiles = num_quantiles self.output_size = output_size def forward(self, x: torch.Tensor) -> Dict: """ Overview: Use encoded embedding tensor to run MLP with ``QRDQNHead`` and return the prediction dictionary. Arguments: - x (:obj:`torch.Tensor`): Tensor containing input embedding. Returns: - outputs (:obj:`Dict`): Dict containing keywords ``logit`` (:obj:`torch.Tensor`), \ ``q`` (:obj:`torch.Tensor`), and ``tau`` (:obj:`torch.Tensor`). Shapes: - x: :math:`(B, N)`, where ``B = batch_size`` and ``N = hidden_size``. - logit: :math:`(B, M)`, where ``M = output_size``. - q: :math:`(B, M, num_quantiles)`. - tau: :math:`(B, M, 1)`. Examples: >>> head = QRDQNHead(64, 64) >>> inputs = torch.randn(4, 64) >>> outputs = head(inputs) >>> assert isinstance(outputs, dict) >>> assert outputs['logit'].shape == torch.Size([4, 64]) >>> # default num_quantiles is 32 >>> assert outputs['q'].shape == torch.Size([4, 64, 32]) >>> assert outputs['tau'].shape == torch.Size([4, 32, 1]) """ q = self.Q(x) q = q.view(*q.shape[:-1], self.output_size, self.num_quantiles) logit = q.mean(-1) tau = torch.linspace(0, 1, self.num_quantiles + 1) tau = ((tau[:-1] + tau[1:]) / 2).view(1, -1, 1).repeat(q.shape[0], 1, 1).to(q) return {'logit': logit, 'q': q, 'tau': tau} class QuantileHead(nn.Module): """ Overview: The ``QuantileHead`` is used to output action quantiles. This module is used in IQN. Interfaces: ``__init__``, ``forward``, ``quantile_net``. .. note:: The difference between ``QuantileHead`` and ``QRDQNHead`` is that ``QuantileHead`` models the \ state-action quantile function as a mapping from state-actions and samples from some base distribution \ while ``QRDQNHead`` approximates random returns by a uniform mixture of Diracs functions. """ def __init__( self, hidden_size: int, output_size: int, layer_num: int = 1, num_quantiles: int = 32, quantile_embedding_size: int = 128, beta_function_type: Optional[str] = 'uniform', activation: Optional[nn.Module] = nn.ReLU(), norm_type: Optional[str] = None, noise: Optional[bool] = False, ) -> None: """ Overview: Init the ``QuantileHead`` layers according to the provided arguments. Arguments: - hidden_size (:obj:`int`): The ``hidden_size`` of the MLP connected to ``QuantileHead``. - output_size (:obj:`int`): The number of outputs. - layer_num (:obj:`int`): The number of layers used in the network to compute Q value output. - num_quantiles (:obj:`int`): The number of quantiles. - quantile_embedding_size (:obj:`int`): The embedding size of a quantile. - beta_function_type (:obj:`str`): Type of beta function. See ``ding.rl_utils.beta_function.py`` \ for more details. Default is ``uniform``. - activation (:obj:`nn.Module`): The type of activation function to use in MLP. \ If ``None``, then default set activation to ``nn.ReLU()``. Default ``None``. - norm_type (:obj:`str`): The type of normalization to use. See ``ding.torch_utils.network.fc_block`` \ for more details. Default ``None``. - noise (:obj:`bool`): Whether use ``NoiseLinearLayer`` as ``layer_fn`` in Q networks' MLP. \ Default ``False``. """ super(QuantileHead, self).__init__() layer = NoiseLinearLayer if noise else nn.Linear block = noise_block if noise else fc_block self.Q = nn.Sequential( MLP( hidden_size, hidden_size, hidden_size, layer_num, layer_fn=layer, activation=activation, norm_type=norm_type ), block(hidden_size, output_size) ) self.num_quantiles = num_quantiles self.quantile_embedding_size = quantile_embedding_size self.output_size = output_size self.iqn_fc = nn.Linear(self.quantile_embedding_size, hidden_size) self.beta_function = beta_function_map[beta_function_type] def quantile_net(self, quantiles: torch.Tensor) -> torch.Tensor: """ Overview: Deterministic parametric function trained to reparameterize samples from a base distribution. \ By repeated Bellman update iterations of Q-learning, the optimal action-value function is estimated. Arguments: - x (:obj:`torch.Tensor`): The encoded embedding tensor of parametric sample. Returns: - quantile_net (:obj:`torch.Tensor`): Quantile network output tensor after reparameterization. Shapes: - quantile_net :math:`(quantile_embedding_size, M)`, where ``M = output_size``. Examples: >>> head = QuantileHead(64, 64) >>> quantiles = torch.randn(128,1) >>> qn_output = head.quantile_net(quantiles) >>> assert isinstance(qn_output, torch.Tensor) >>> # default quantile_embedding_size: int = 128, >>> assert qn_output.shape == torch.Size([128, 64]) """ quantile_net = quantiles.repeat([1, self.quantile_embedding_size]) quantile_net = torch.cos( torch.arange(1, self.quantile_embedding_size + 1, 1).to(quantiles) * math.pi * quantile_net ) quantile_net = self.iqn_fc(quantile_net) quantile_net = F.relu(quantile_net) return quantile_net def forward(self, x: torch.Tensor, num_quantiles: Optional[int] = None) -> Dict: """ Overview: Use encoded embedding tensor to run MLP with ``QuantileHead`` and return the prediction dictionary. Arguments: - x (:obj:`torch.Tensor`): Tensor containing input embedding. Returns: - outputs (:obj:`Dict`): Dict containing keywords ``logit`` (:obj:`torch.Tensor`), \ ``q`` (:obj:`torch.Tensor`), and ``quantiles`` (:obj:`torch.Tensor`). Shapes: - x: :math:`(B, N)`, where ``B = batch_size`` and ``N = hidden_size``. - logit: :math:`(B, M)`, where ``M = output_size``. - q: :math:`(num_quantiles, B, M)`. - quantiles: :math:`(quantile_embedding_size, 1)`. Examples: >>> head = QuantileHead(64, 64) >>> inputs = torch.randn(4, 64) >>> outputs = head(inputs) >>> assert isinstance(outputs, dict) >>> assert outputs['logit'].shape == torch.Size([4, 64]) >>> # default num_quantiles is 32 >>> assert outputs['q'].shape == torch.Size([32, 4, 64]) >>> assert outputs['quantiles'].shape == torch.Size([128, 1]) """ if num_quantiles is None: num_quantiles = self.num_quantiles batch_size = x.shape[0] q_quantiles = torch.FloatTensor(num_quantiles * batch_size, 1).uniform_(0, 1).to(x) logit_quantiles = torch.FloatTensor(num_quantiles * batch_size, 1).uniform_(0, 1).to(x) logit_quantiles = self.beta_function(logit_quantiles) q_quantile_net = self.quantile_net(q_quantiles) logit_quantile_net = self.quantile_net(logit_quantiles) x = x.repeat(num_quantiles, 1) q_x = x * q_quantile_net # 4*32,64 logit_x = x * logit_quantile_net q = self.Q(q_x).reshape(num_quantiles, batch_size, -1) logit = self.Q(logit_x).reshape(num_quantiles, batch_size, -1).mean(0) return {'logit': logit, 'q': q, 'quantiles': q_quantiles} class FQFHead(nn.Module): """ Overview: The ``FQFHead`` is used to output action quantiles. This module is used in FQF. Interfaces: ``__init__``, ``forward``, ``quantile_net``. .. note:: The implementation of FQFHead is based on the paper https://arxiv.org/abs/1911.02140. The difference between FQFHead and QuantileHead is that, in FQF, \ N adjustable quantile values for N adjustable quantile fractions are estimated to approximate \ the quantile function. The distribution of the return is approximated by a weighted mixture of N \ Diracs functions. While in IQN, the state-action quantile function is modeled as a mapping from \ state-actions and samples from some base distribution. """ def __init__( self, hidden_size: int, output_size: int, layer_num: int = 1, num_quantiles: int = 32, quantile_embedding_size: int = 128, activation: Optional[nn.Module] = nn.ReLU(), norm_type: Optional[str] = None, noise: Optional[bool] = False, ) -> None: """ Overview: Init the ``FQFHead`` layers according to the provided arguments. Arguments: - hidden_size (:obj:`int`): The ``hidden_size`` of the MLP connected to ``FQFHead``. - output_size (:obj:`int`): The number of outputs. - layer_num (:obj:`int`): The number of layers used in the network to compute Q value output. - num_quantiles (:obj:`int`): The number of quantiles. - quantile_embedding_size (:obj:`int`): The embedding size of a quantile. - activation (:obj:`nn.Module`): The type of activation function to use in MLP. \ If ``None``, then default set activation to ``nn.ReLU()``. Default ``None``. - norm_type (:obj:`str`): The type of normalization to use. See ``ding.torch_utils.network.fc_block`` \ for more details. Default ``None``. - noise (:obj:`bool`): Whether use ``NoiseLinearLayer`` as ``layer_fn`` in Q networks' MLP. \ Default ``False``. """ super(FQFHead, self).__init__() layer = NoiseLinearLayer if noise else nn.Linear block = noise_block if noise else fc_block self.Q = nn.Sequential( MLP( hidden_size, hidden_size, hidden_size, layer_num, layer_fn=layer, activation=activation, norm_type=norm_type ), block(hidden_size, output_size) ) self.num_quantiles = num_quantiles self.quantile_embedding_size = quantile_embedding_size self.output_size = output_size self.fqf_fc = nn.Sequential(nn.Linear(self.quantile_embedding_size, hidden_size), nn.ReLU()) self.register_buffer( 'sigma_pi', torch.arange(1, self.quantile_embedding_size + 1, 1).view(1, 1, self.quantile_embedding_size) * math.pi ) # initialize weights_xavier of quantiles_proposal network # NOTE(rjy): quantiles_proposal network mean fraction proposal network quantiles_proposal_fc = nn.Linear(hidden_size, num_quantiles) torch.nn.init.xavier_uniform_(quantiles_proposal_fc.weight, gain=0.01) torch.nn.init.constant_(quantiles_proposal_fc.bias, 0) self.quantiles_proposal = nn.Sequential(quantiles_proposal_fc, nn.LogSoftmax(dim=1)) def quantile_net(self, quantiles: torch.Tensor) -> torch.Tensor: """ Overview: Deterministic parametric function trained to reparameterize samples from the quantiles_proposal network. \ By repeated Bellman update iterations of Q-learning, the optimal action-value function is estimated. Arguments: - x (:obj:`torch.Tensor`): The encoded embedding tensor of parametric sample. Returns: - quantile_net (:obj:`torch.Tensor`): Quantile network output tensor after reparameterization. Examples: >>> head = FQFHead(64, 64) >>> quantiles = torch.randn(4,32) >>> qn_output = head.quantile_net(quantiles) >>> assert isinstance(qn_output, torch.Tensor) >>> # default quantile_embedding_size: int = 128, >>> assert qn_output.shape == torch.Size([4, 32, 64]) """ batch_size, num_quantiles = quantiles.shape[:2] quantile_net = torch.cos(self.sigma_pi.to(quantiles) * quantiles.view(batch_size, num_quantiles, 1)) quantile_net = self.fqf_fc(quantile_net) # (batch_size, num_quantiles, hidden_size) return quantile_net def forward(self, x: torch.Tensor, num_quantiles: Optional[int] = None) -> Dict: """ Overview: Use encoded embedding tensor to run MLP with ``FQFHead`` and return the prediction dictionary. Arguments: - x (:obj:`torch.Tensor`): Tensor containing input embedding. Returns: - outputs (:obj:`Dict`): Dict containing keywords ``logit`` (:obj:`torch.Tensor`), \ ``q`` (:obj:`torch.Tensor`), ``quantiles`` (:obj:`torch.Tensor`), \ ``quantiles_hats`` (:obj:`torch.Tensor`), \ ``q_tau_i`` (:obj:`torch.Tensor`), ``entropies`` (:obj:`torch.Tensor`). Shapes: - x: :math:`(B, N)`, where ``B = batch_size`` and ``N = hidden_size``. - logit: :math:`(B, M)`, where ``M = output_size``. - q: :math:`(B, num_quantiles, M)`. - quantiles: :math:`(B, num_quantiles + 1)`. - quantiles_hats: :math:`(B, num_quantiles)`. - q_tau_i: :math:`(B, num_quantiles - 1, M)`. - entropies: :math:`(B, 1)`. Examples: >>> head = FQFHead(64, 64) >>> inputs = torch.randn(4, 64) >>> outputs = head(inputs) >>> assert isinstance(outputs, dict) >>> assert outputs['logit'].shape == torch.Size([4, 64]) >>> # default num_quantiles is 32 >>> assert outputs['q'].shape == torch.Size([4, 32, 64]) >>> assert outputs['quantiles'].shape == torch.Size([4, 33]) >>> assert outputs['quantiles_hats'].shape == torch.Size([4, 32]) >>> assert outputs['q_tau_i'].shape == torch.Size([4, 31, 64]) >>> assert outputs['quantiles'].shape == torch.Size([4, 1]) """ if num_quantiles is None: num_quantiles = self.num_quantiles batch_size = x.shape[0] log_q_quantiles = self.quantiles_proposal( x.detach() ) # (batch_size, num_quantiles), not to update encoder when learning w1_loss(fraction loss) q_quantiles = log_q_quantiles.exp() # NOTE(rjy): e^log_q = q # Calculate entropies of value distributions. entropies = -(log_q_quantiles * q_quantiles).sum(dim=-1, keepdim=True) # (batch_size, 1) assert entropies.shape == (batch_size, 1) # accumalative softmax # NOTE(rjy): because quantiles are still expressed in the form of their respective proportions, # e.g. [0.33, 0.33, 0.33] => [0.33, 0.66, 0.99] q_quantiles = torch.cumsum(q_quantiles, dim=1) # quantile_hats: find the optimal condition for τ to minimize W1(Z, τ) tau_0 = torch.zeros((batch_size, 1)).to(x) q_quantiles = torch.cat((tau_0, q_quantiles), dim=1) # [batch_size, num_quantiles+1] # NOTE(rjy): theta_i = F^(-1)_Z((tau_i+tau_i+1)/2), τ^ = (tau_i+tau_i+1)/2, q_quantiles_hats is τ^ q_quantiles_hats = (q_quantiles[:, 1:] + q_quantiles[:, :-1]).detach() / 2. # (batch_size, num_quantiles) # NOTE(rjy): reparameterize q_quantiles_hats q_quantile_net = self.quantile_net(q_quantiles_hats) # [batch_size, num_quantiles, hidden_size(64)] # x.view[batch_size, 1, hidden_size(64)] q_x = (x.view(batch_size, 1, -1) * q_quantile_net) # [batch_size, num_quantiles, hidden_size(64)] q = self.Q(q_x) # [batch_size, num_quantiles, action_dim(64)] logit = q.mean(1) with torch.no_grad(): q_tau_i_net = self.quantile_net( q_quantiles[:, 1:-1].detach() ) # [batch_size, num_quantiles-1, hidden_size(64)] q_tau_i_x = (x.view(batch_size, 1, -1) * q_tau_i_net) # [batch_size, (num_quantiles-1), hidden_size(64)] q_tau_i = self.Q(q_tau_i_x) # [batch_size, num_quantiles-1, action_dim] return { 'logit': logit, 'q': q, 'quantiles': q_quantiles, 'quantiles_hats': q_quantiles_hats, 'q_tau_i': q_tau_i, 'entropies': entropies } class DuelingHead(nn.Module): """ Overview: The ``DuelingHead`` is used to output discrete actions logit. This module is used in Dueling DQN. Interfaces: ``__init__``, ``forward``. """ def __init__( self, hidden_size: int, output_size: int, layer_num: int = 1, a_layer_num: Optional[int] = None, v_layer_num: Optional[int] = None, activation: Optional[nn.Module] = nn.ReLU(), norm_type: Optional[str] = None, dropout: Optional[float] = None, noise: Optional[bool] = False, ) -> None: """ Overview: Init the ``DuelingHead`` layers according to the provided arguments. Arguments: - hidden_size (:obj:`int`): The ``hidden_size`` of the MLP connected to ``DuelingHead``. - output_size (:obj:`int`): The number of outputs. - a_layer_num (:obj:`int`): The number of layers used in the network to compute action output. - v_layer_num (:obj:`int`): The number of layers used in the network to compute value output. - activation (:obj:`nn.Module`): The type of activation function to use in MLP. \ If ``None``, then default set activation to ``nn.ReLU()``. Default ``None``. - norm_type (:obj:`str`): The type of normalization to use. See ``ding.torch_utils.network.fc_block`` \ for more details. Default ``None``. - dropout (:obj:`float`): The dropout rate of dropout layer. Default ``None``. - noise (:obj:`bool`): Whether use ``NoiseLinearLayer`` as ``layer_fn`` in Q networks' MLP. \ Default ``False``. """ super(DuelingHead, self).__init__() if a_layer_num is None: a_layer_num = layer_num if v_layer_num is None: v_layer_num = layer_num layer = NoiseLinearLayer if noise else nn.Linear block = noise_block if noise else fc_block self.A = nn.Sequential( MLP( hidden_size, hidden_size, hidden_size, a_layer_num, layer_fn=layer, activation=activation, use_dropout=dropout is not None, dropout_probability=dropout, norm_type=norm_type ), block(hidden_size, output_size) ) self.V = nn.Sequential( MLP( hidden_size, hidden_size, hidden_size, v_layer_num, layer_fn=layer, activation=activation, use_dropout=dropout is not None, dropout_probability=dropout, norm_type=norm_type ), block(hidden_size, 1) ) def forward(self, x: torch.Tensor) -> Dict: """ Overview: Use encoded embedding tensor to run MLP with ``DuelingHead`` and return the prediction dictionary. Arguments: - x (:obj:`torch.Tensor`): Tensor containing input embedding. Returns: - outputs (:obj:`Dict`): Dict containing keyword ``logit`` (:obj:`torch.Tensor`). Shapes: - x: :math:`(B, N)`, where ``B = batch_size`` and ``N = hidden_size``. - logit: :math:`(B, M)`, where ``M = output_size``. Examples: >>> head = DuelingHead(64, 64) >>> inputs = torch.randn(4, 64) >>> outputs = head(inputs) >>> assert isinstance(outputs, dict) >>> assert outputs['logit'].shape == torch.Size([4, 64]) """ a = self.A(x) v = self.V(x) q_value = a - a.mean(dim=-1, keepdim=True) + v return {'logit': q_value} class StochasticDuelingHead(nn.Module): """ Overview: The ``Stochastic Dueling Network`` is proposed in paper ACER (arxiv 1611.01224). \ That is to say, dueling network architecture in continuous action space. Interfaces: ``__init__``, ``forward``. """ def __init__( self, hidden_size: int, action_shape: int, layer_num: int = 1, a_layer_num: Optional[int] = None, v_layer_num: Optional[int] = None, activation: Optional[nn.Module] = nn.ReLU(), norm_type: Optional[str] = None, noise: Optional[bool] = False, last_tanh: Optional[bool] = True, ) -> None: """ Overview: Init the ``Stochastic DuelingHead`` layers according to the provided arguments. Arguments: - hidden_size (:obj:`int`): The ``hidden_size`` of the MLP connected to ``StochasticDuelingHead``. - action_shape (:obj:`int`): The number of continuous action shape, usually integer value. - layer_num (:obj:`int`): The number of default layers used in the network to compute action and value \ output. - a_layer_num (:obj:`int`): The number of layers used in the network to compute action output. Default is \ ``layer_num``. - v_layer_num (:obj:`int`): The number of layers used in the network to compute value output. Default is \ ``layer_num``. - activation (:obj:`nn.Module`): The type of activation function to use in MLP. \ If ``None``, then default set activation to ``nn.ReLU()``. Default ``None``. - norm_type (:obj:`str`): The type of normalization to use. See ``ding.torch_utils.network.fc_block`` \ for more details. Default ``None``. - noise (:obj:`bool`): Whether use ``NoiseLinearLayer`` as ``layer_fn`` in Q networks' MLP. \ Default ``False``. - last_tanh (:obj:`bool`): If ``True`` Apply ``tanh`` to actions. Default ``True``. """ super(StochasticDuelingHead, self).__init__() if a_layer_num is None: a_layer_num = layer_num if v_layer_num is None: v_layer_num = layer_num layer = NoiseLinearLayer if noise else nn.Linear block = noise_block if noise else fc_block self.A = nn.Sequential( MLP( hidden_size + action_shape, hidden_size, hidden_size, a_layer_num, layer_fn=layer, activation=activation, norm_type=norm_type ), block(hidden_size, 1) ) self.V = nn.Sequential( MLP( hidden_size, hidden_size, hidden_size, v_layer_num, layer_fn=layer, activation=activation, norm_type=norm_type ), block(hidden_size, 1) ) if last_tanh: self.tanh = nn.Tanh() else: self.tanh = None def forward( self, s: torch.Tensor, a: torch.Tensor, mu: torch.Tensor, sigma: torch.Tensor, sample_size: int = 10, ) -> Dict[str, torch.Tensor]: """ Overview: Use encoded embedding tensor to run MLP with ``StochasticDuelingHead`` and return the prediction dictionary. Arguments: - s (:obj:`torch.Tensor`): Tensor containing input embedding. - a (:obj:`torch.Tensor`): The original continuous behaviour action. - mu (:obj:`torch.Tensor`): The ``mu`` gaussian reparameterization output of actor head at current \ timestep. - sigma (:obj:`torch.Tensor`): The ``sigma`` gaussian reparameterization output of actor head at \ current timestep. - sample_size (:obj:`int`): The number of samples for continuous action when computing the Q value. Returns: - outputs (:obj:`Dict`): Dict containing keywords \ ``q_value`` (:obj:`torch.Tensor`) and ``v_value`` (:obj:`torch.Tensor`). Shapes: - s: :math:`(B, N)`, where ``B = batch_size`` and ``N = hidden_size``. - a: :math:`(B, A)`, where ``A = action_size``. - mu: :math:`(B, A)`. - sigma: :math:`(B, A)`. - q_value: :math:`(B, 1)`. - v_value: :math:`(B, 1)`. Examples: >>> head = StochasticDuelingHead(64, 64) >>> inputs = torch.randn(4, 64) >>> a = torch.randn(4, 64) >>> mu = torch.randn(4, 64) >>> sigma = torch.ones(4, 64) >>> outputs = head(inputs, a, mu, sigma) >>> assert isinstance(outputs, dict) >>> assert outputs['q_value'].shape == torch.Size([4, 1]) >>> assert outputs['v_value'].shape == torch.Size([4, 1]) """ batch_size = s.shape[0] # batch_size or batch_size * T hidden_size = s.shape[1] action_size = a.shape[1] state_cat_action = torch.cat((s, a), dim=1) # size (B, action_size + state_size) a_value = self.A(state_cat_action) # size (B, 1) v_value = self.V(s) # size (B, 1) # size (B, sample_size, hidden_size) expand_s = (torch.unsqueeze(s, 1)).expand((batch_size, sample_size, hidden_size)) # in case for gradient back propagation dist = Independent(Normal(mu, sigma), 1) action_sample = dist.rsample(sample_shape=(sample_size, )) if self.tanh: action_sample = self.tanh(action_sample) # (sample_size, B, action_size)->(B, sample_size, action_size) action_sample = action_sample.permute(1, 0, 2) # size (B, sample_size, action_size + hidden_size) state_cat_action_sample = torch.cat((expand_s, action_sample), dim=-1) a_val_sample = self.A(state_cat_action_sample) # size (B, sample_size, 1) q_value = v_value + a_value - a_val_sample.mean(dim=1) # size (B, 1) return {'q_value': q_value, 'v_value': v_value} class RegressionHead(nn.Module): """ Overview: The ``RegressionHead`` is used to regress continuous variables. This module is used for generating Q-value (DDPG critic) of continuous actions, \ or state value (A2C/PPO), or directly predicting continuous action (DDPG actor). Interfaces: ``__init__``, ``forward``. """ def __init__( self, input_size: int, output_size: int, layer_num: int = 2, final_tanh: Optional[bool] = False, activation: Optional[nn.Module] = nn.ReLU(), norm_type: Optional[str] = None, hidden_size: int = None, ) -> None: """ Overview: Init the ``RegressionHead`` layers according to the provided arguments. Arguments: - hidden_size (:obj:`int`): The ``hidden_size`` of the MLP connected to ``RegressionHead``. - output_size (:obj:`int`): The number of outputs. - layer_num (:obj:`int`): The number of layers used in the network to compute Q value output. - final_tanh (:obj:`bool`): If ``True`` apply ``tanh`` to output. Default ``False``. - activation (:obj:`nn.Module`): The type of activation function to use in MLP. \ If ``None``, then default set activation to ``nn.ReLU()``. Default ``None``. - norm_type (:obj:`str`): The type of normalization to use. See ``ding.torch_utils.network.fc_block`` \ for more details. Default ``None``. """ super(RegressionHead, self).__init__() if hidden_size is None: hidden_size = input_size self.main = MLP(input_size, hidden_size, hidden_size, layer_num, activation=activation, norm_type=norm_type) self.last = nn.Linear(hidden_size, output_size) # for convenience of special initialization self.final_tanh = final_tanh if self.final_tanh: self.tanh = nn.Tanh() def forward(self, x: torch.Tensor) -> Dict: """ Overview: Use encoded embedding tensor to run MLP with ``RegressionHead`` and return the prediction dictionary. Arguments: - x (:obj:`torch.Tensor`): Tensor containing input embedding. Returns: - outputs (:obj:`Dict`): Dict containing keyword ``pred`` (:obj:`torch.Tensor`). Shapes: - x: :math:`(B, N)`, where ``B = batch_size`` and ``N = hidden_size``. - pred: :math:`(B, M)`, where ``M = output_size``. Examples: >>> head = RegressionHead(64, 64) >>> inputs = torch.randn(4, 64) >>> outputs = head(inputs) >>> assert isinstance(outputs, dict) >>> assert outputs['pred'].shape == torch.Size([4, 64]) """ x = self.main(x) x = self.last(x) if self.final_tanh: x = self.tanh(x) if x.shape[-1] == 1 and len(x.shape) > 1: x = x.squeeze(-1) return {'pred': x} class ReparameterizationHead(nn.Module): """ Overview: The ``ReparameterizationHead`` is used to generate Gaussian distribution of continuous variable, \ which is parameterized by ``mu`` and ``sigma``. This module is often used in stochastic policies, such as PPO and SAC. Interfaces: ``__init__``, ``forward``. """ # The "happo" type here is to align with the sigma initialization method of the network in the original HAPPO \ # paper. The code here needs to be optimized later. default_sigma_type = ['fixed', 'independent', 'conditioned', 'happo'] default_bound_type = ['tanh', None] def __init__( self, input_size: int, output_size: int, layer_num: int = 2, sigma_type: Optional[str] = None, fixed_sigma_value: Optional[float] = 1.0, activation: Optional[nn.Module] = nn.ReLU(), norm_type: Optional[str] = None, bound_type: Optional[str] = None, hidden_size: int = None ) -> None: """ Overview: Init the ``ReparameterizationHead`` layers according to the provided arguments. Arguments: - hidden_size (:obj:`int`): The ``hidden_size`` of the MLP connected to ``ReparameterizationHead``. - output_size (:obj:`int`): The number of outputs. - layer_num (:obj:`int`): The number of layers used in the network to compute Q value output. - sigma_type (:obj:`str`): Sigma type used. Choose among \ ``['fixed', 'independent', 'conditioned']``. Default is ``None``. - fixed_sigma_value (:obj:`float`): When choosing ``fixed`` type, the tensor ``output['sigma']`` \ is filled with this input value. Default is ``None``. - activation (:obj:`nn.Module`): The type of activation function to use in MLP. \ If ``None``, then default set activation to ``nn.ReLU()``. Default ``None``. - norm_type (:obj:`str`): The type of normalization to use. See ``ding.torch_utils.network.fc_block`` \ for more details. Default ``None``. - bound_type (:obj:`str`): Bound type to apply to output ``mu``. Choose among ``['tanh', None]``. \ Default is ``None``. """ super(ReparameterizationHead, self).__init__() if hidden_size is None: hidden_size = input_size self.sigma_type = sigma_type assert sigma_type in self.default_sigma_type, "Please indicate sigma_type as one of {}".format( self.default_sigma_type ) self.bound_type = bound_type assert bound_type in self.default_bound_type, "Please indicate bound_type as one of {}".format( self.default_bound_type ) self.main = MLP(input_size, hidden_size, hidden_size, layer_num, activation=activation, norm_type=norm_type) self.mu = nn.Linear(hidden_size, output_size) if self.sigma_type == 'fixed': self.sigma = torch.full((1, output_size), fixed_sigma_value) elif self.sigma_type == 'independent': # independent parameter self.log_sigma_param = nn.Parameter(torch.zeros(1, output_size)) elif self.sigma_type == 'conditioned': self.log_sigma_layer = nn.Linear(hidden_size, output_size) elif self.sigma_type == 'happo': self.sigma_x_coef = 1. self.sigma_y_coef = 0.5 # This parameter (x_coef, y_coef) refers to the HAPPO paper http://arxiv.org/abs/2109.11251. self.log_sigma_param = nn.Parameter(torch.ones(1, output_size) * self.sigma_x_coef) def forward(self, x: torch.Tensor) -> Dict: """ Overview: Use encoded embedding tensor to run MLP with ``ReparameterizationHead`` and return the prediction \ dictionary. Arguments: - x (:obj:`torch.Tensor`): Tensor containing input embedding. Returns: - outputs (:obj:`Dict`): Dict containing keywords ``mu`` (:obj:`torch.Tensor`) and ``sigma`` \ (:obj:`torch.Tensor`). Shapes: - x: :math:`(B, N)`, where ``B = batch_size`` and ``N = hidden_size``. - mu: :math:`(B, M)`, where ``M = output_size``. - sigma: :math:`(B, M)`. Examples: >>> head = ReparameterizationHead(64, 64, sigma_type='fixed') >>> inputs = torch.randn(4, 64) >>> outputs = head(inputs) >>> assert isinstance(outputs, dict) >>> assert outputs['mu'].shape == torch.Size([4, 64]) >>> assert outputs['sigma'].shape == torch.Size([4, 64]) """ x = self.main(x) mu = self.mu(x) if self.bound_type == 'tanh': mu = torch.tanh(mu) if self.sigma_type == 'fixed': sigma = self.sigma.to(mu.device) + torch.zeros_like(mu) # addition aims to broadcast shape elif self.sigma_type == 'independent': log_sigma = self.log_sigma_param + torch.zeros_like(mu) # addition aims to broadcast shape sigma = torch.exp(log_sigma) elif self.sigma_type == 'conditioned': log_sigma = self.log_sigma_layer(x) sigma = torch.exp(torch.clamp(log_sigma, -20, 2)) elif self.sigma_type == 'happo': log_sigma = self.log_sigma_param + torch.zeros_like(mu) sigma = torch.sigmoid(log_sigma / self.sigma_x_coef) * self.sigma_y_coef return {'mu': mu, 'sigma': sigma} class PopArtVHead(nn.Module): """ Overview: The ``PopArtVHead`` is used to generate adaptive normalized state value. More information can be found in \ paper Multi-task Deep Reinforcement Learning with PopArt. \ https://arxiv.org/abs/1809.04474 \ This module is used in PPO or IMPALA. Interfaces: ``__init__``, ``forward``. """ def __init__( self, hidden_size: int, output_size: int, layer_num: int = 1, activation: Optional[nn.Module] = nn.ReLU(), norm_type: Optional[str] = None, ) -> None: """ Overview: Init the ``PopArtVHead`` layers according to the provided arguments. Arguments: - hidden_size (:obj:`int`): The ``hidden_size`` of the MLP connected to ``PopArtVHead``. - output_size (:obj:`int`): The number of outputs. - layer_num (:obj:`int`): The number of layers used in the network to compute Q value output. - activation (:obj:`nn.Module`): The type of activation function to use in MLP. \ If ``None``, then default set activation to ``nn.ReLU()``. Default ``None``. - norm_type (:obj:`str`): The type of normalization to use. See ``ding.torch_utils.network.fc_block`` \ for more details. Default ``None``. """ super(PopArtVHead, self).__init__() self.popart = PopArt(hidden_size, output_size) self.Q = nn.Sequential( MLP( hidden_size, hidden_size, hidden_size, layer_num, layer_fn=nn.Linear, activation=activation, norm_type=norm_type ), self.popart ) def forward(self, x: torch.Tensor) -> Dict: """ Overview: Use encoded embedding tensor to run MLP with ``PopArtVHead`` and return the normalized prediction and \ the unnormalized prediction dictionary. Arguments: - x (:obj:`torch.Tensor`): Tensor containing input embedding. Returns: - outputs (:obj:`Dict`): Dict containing keyword ``pred`` (:obj:`torch.Tensor`) \ and ``unnormalized_pred`` (:obj:`torch.Tensor`). Shapes: - x: :math:`(B, N)`, where ``B = batch_size`` and ``N = hidden_size``. - logit: :math:`(B, M)`, where ``M = output_size``. Examples: >>> head = PopArtVHead(64, 64) >>> inputs = torch.randn(4, 64) >>> outputs = head(inputs) >>> assert isinstance(outputs, dict) and outputs['pred'].shape == torch.Size([4, 64]) and \ outputs['unnormalized_pred'].shape == torch.Size([4, 64]) """ x = self.Q(x) return x class AttentionPolicyHead(nn.Module): """ Overview: Cross-attention-type discrete action policy head, which is often used in variable discrete action space. Interfaces: ``__init__``, ``forward``. """ def __init__(self) -> None: super(AttentionPolicyHead, self).__init__() def forward(self, key: torch.Tensor, query: torch.Tensor) -> torch.Tensor: """ Overview: Use attention-like mechanism to combine key and query tensor to output discrete action logit. Arguments: - key (:obj:`torch.Tensor`): Tensor containing key embedding. - query (:obj:`torch.Tensor`): Tensor containing query embedding. Returns: - logit (:obj:`torch.Tensor`): Tensor containing output discrete action logit. Shapes: - key: :math:`(B, N, K)`, where ``B = batch_size``, ``N = possible discrete action choices`` and \ ``K = hidden_size``. - query: :math:`(B, K)`. - logit: :math:`(B, N)`. Examples: >>> head = AttentionPolicyHead() >>> key = torch.randn(4, 5, 64) >>> query = torch.randn(4, 64) >>> logit = head(key, query) >>> assert logit.shape == torch.Size([4, 5]) .. note:: In this head, we assume that the ``key`` and ``query`` tensor are both normalized. """ if len(query.shape) == 2 and len(key.shape) == 3: query = query.unsqueeze(1) logit = (key * query).sum(-1) return logit class MultiHead(nn.Module): """ Overview: The ``MultiHead`` is used to generate multiple similar results. For example, we can combine ``Distribution`` and ``MultiHead`` to generate multi-discrete action space logit. Interfaces: ``__init__``, ``forward``. """ def __init__(self, head_cls: type, hidden_size: int, output_size_list: SequenceType, **head_kwargs) -> None: """ Overview: Init the ``MultiHead`` layers according to the provided arguments. Arguments: - head_cls (:obj:`type`): The class of head, choose among [``DuelingHead``, ``DistributionHead``, \ ''QuatileHead'', ...]. - hidden_size (:obj:`int`): The ``hidden_size`` of the MLP connected to the ``Head``. - output_size_list (:obj:`int`): Sequence of ``output_size`` for multi discrete action, e.g. ``[2, 3, 5]``. - head_kwargs: (:obj:`dict`): Dict containing class-specific arguments. """ super(MultiHead, self).__init__() self.pred = nn.ModuleList() for size in output_size_list: self.pred.append(head_cls(hidden_size, size, **head_kwargs)) def forward(self, x: torch.Tensor) -> Dict: """ Overview: Use encoded embedding tensor to run MLP with ``MultiHead`` and return the prediction dictionary. Arguments: - x (:obj:`torch.Tensor`): Tensor containing input embedding. Returns: - outputs (:obj:`Dict`): Dict containing keywords ``logit`` (:obj:`torch.Tensor`) \ corresponding to the logit of each ``output`` each accessed at ``['logit'][i]``. Shapes: - x: :math:`(B, N)`, where ``B = batch_size`` and ``N = hidden_size``. - logit: :math:`(B, Mi)`, where ``Mi = output_size`` corresponding to output ``i``. Examples: >>> head = MultiHead(DuelingHead, 64, [2, 3, 5], v_layer_num=2) >>> inputs = torch.randn(4, 64) >>> outputs = head(inputs) >>> assert isinstance(outputs, dict) >>> # output_size_list is [2, 3, 5] as set >>> # Therefore each dim of logit is as follows >>> outputs['logit'][0].shape >>> torch.Size([4, 2]) >>> outputs['logit'][1].shape >>> torch.Size([4, 3]) >>> outputs['logit'][2].shape >>> torch.Size([4, 5]) """ return lists_to_dicts([m(x) for m in self.pred]) class EnsembleHead(nn.Module): """ Overview: The ``EnsembleHead`` is used to generate Q-value for Q-ensemble in model-based RL algorithms. Interfaces: ``__init__``, ``forward``. """ def __init__( self, input_size: int, output_size: int, hidden_size: int, layer_num: int, ensemble_num: int, activation: Optional[nn.Module] = nn.ReLU(), norm_type: Optional[str] = None ) -> None: super(EnsembleHead, self).__init__() d = input_size layers = [] for _ in range(layer_num): layers.append( conv1d_block( d * ensemble_num, hidden_size * ensemble_num, kernel_size=1, stride=1, groups=ensemble_num, activation=activation, norm_type=norm_type ) ) d = hidden_size # Adding activation for last layer will lead to train fail layers.append( conv1d_block( hidden_size * ensemble_num, output_size * ensemble_num, kernel_size=1, stride=1, groups=ensemble_num, activation=None, norm_type=None ) ) self.pred = nn.Sequential(*layers) def forward(self, x: torch.Tensor) -> Dict: """ Overview: Use encoded embedding tensor to run MLP with ``EnsembleHead`` and return the prediction dictionary. Arguments: - x (:obj:`torch.Tensor`): Tensor containing input embedding. Returns: - outputs (:obj:`Dict`): Dict containing keyword ``pred`` (:obj:`torch.Tensor`). Shapes: - x: :math:`(B, N * ensemble_num, 1)`, where ``B = batch_size`` and ``N = hidden_size``. - pred: :math:`(B, M * ensemble_num, 1)`, where ``M = output_size``. Examples: >>> head = EnsembleHead(64 * 10, 64 * 10) >>> inputs = torch.randn(4, 64 * 10, 1) ` >>> outputs = head(inputs) >>> assert isinstance(outputs, dict) >>> assert outputs['pred'].shape == torch.Size([10, 64 * 10]) """ x = self.pred(x).squeeze(-1) return {'pred': x} def independent_normal_dist(logits: Union[List, Dict]) -> torch.distributions.Distribution: """ Overview: Convert different types logit to independent normal distribution. Arguments: - logits (:obj:`Union[List, Dict]`): The logits to be converted. Returns: - dist (:obj:`torch.distributions.Distribution`): The converted normal distribution. Examples: >>> logits = [torch.randn(4, 5), torch.ones(4, 5)] >>> dist = independent_normal_dist(logits) >>> assert isinstance(dist, torch.distributions.Independent) >>> assert isinstance(dist.base_dist, torch.distributions.Normal) >>> assert dist.base_dist.loc.shape == torch.Size([4, 5]) >>> assert dist.base_dist.scale.shape == torch.Size([4, 5]) Raises: - TypeError: If the type of logits is not ``list`` or ``dict``. """ if isinstance(logits, (list, tuple)): return Independent(Normal(*logits), 1) elif isinstance(logits, dict): return Independent(Normal(logits['mu'], logits['sigma']), 1) else: raise TypeError("invalid logits type: {}".format(type(logits))) head_cls_map = { # discrete 'discrete': DiscreteHead, 'dueling': DuelingHead, 'sdn': StochasticDuelingHead, 'distribution': DistributionHead, 'rainbow': RainbowHead, 'qrdqn': QRDQNHead, 'quantile': QuantileHead, 'fqf': FQFHead, 'branch': BranchingHead, 'attention_policy': AttentionPolicyHead, # continuous 'regression': RegressionHead, 'reparameterization': ReparameterizationHead, 'popart': PopArtVHead, 'sdn': StochasticDuelingHead, # multi 'multi': MultiHead, 'ensemble': EnsembleHead, }