from typing import Union, Optional, Tuple

import torch

from ding.config import compile_config, read_config
from ding.envs import get_vec_env_setting
from ding.policy import create_policy
from ding.utils import set_pkg_seed
from ding.torch_utils import to_tensor, to_ndarray


def eval(
        input_cfg: Union[str, Tuple[dict, dict]],
        seed: int = 0,
        model: Optional[torch.nn.Module] = None,
        state_dict: Optional[dict] = None,
        replay_path: Optional[str] = './video',
) -> float:
    r"""
    Overview:
        The evaluation entry for NGU policy.
    Arguments:
        - input_cfg (:obj:`Union[str, Tuple[dict, dict]]`): Config in dict type. \
            ``str`` type means config file path. \
            ``Tuple[dict, dict]`` type means [user_config, create_cfg].
        - seed (:obj:`int`): Random seed.
        - model (:obj:`Optional[torch.nn.Module]`): Instance of torch.nn.Module.
        - state_dict (:obj:`Optional[dict]`): The state_dict of policy or model.
        - replay_path (:obj:`Optional[str]`): Directory in which to save the evaluation replay video.
    Returns:
        - episode_return (:obj:`float`): The cumulative reward of the evaluated episode.
    """
    if isinstance(input_cfg, str):
        cfg, create_cfg = read_config(input_cfg)
    else:
        cfg, create_cfg = input_cfg
    create_cfg.policy.type += '_command'
    cfg = compile_config(cfg, auto=True, create_cfg=create_cfg)

    # Create a single evaluator environment and fix its seed.
    env_fn, _, evaluator_env_cfg = get_vec_env_setting(cfg.env)
    env = env_fn(evaluator_env_cfg[0])
    env.seed(seed, dynamic_seed=False)
    set_pkg_seed(seed, use_cuda=cfg.policy.cuda)

    # Build the eval-mode policy and load the checkpoint.
    policy = create_policy(cfg.policy, model=model, enable_field=['eval']).eval_mode
    if state_dict is None:
        state_dict = torch.load(cfg.learner.load_path, map_location='cpu')
    policy.load_state_dict(state_dict)

    env.enable_save_replay(replay_path=replay_path)
    obs = env.reset()
    obs = {0: obs}
    episode_return = 0.

    # NGU conditions the policy on a beta (exploration-coefficient) index,
    # the previous action and the previous extrinsic reward.
    beta_index = {i: 0 for i in range(1)}
    beta_index = to_tensor(beta_index, dtype=torch.int64)
    prev_action = {i: torch.tensor(-1) for i in range(1)}
    prev_reward_e = {i: to_tensor(0, dtype=torch.float32) for i in range(1)}

    while True:
        # TODO(pu): r_i, reward embedding
        policy_output = policy.forward(beta_index, obs, prev_action, prev_reward_e)
        actions = {i: a['action'] for i, a in policy_output.items()}
        actions = to_ndarray(actions)
        action = policy_output[0]['action']
        action = to_ndarray(action)
        timestep = env.step(action)
        # print(action)
        # print(timestep.reward)

        timesteps = {0: timestep}
        timesteps = to_tensor(timesteps, dtype=torch.float32)

        # Update the recurrent inputs for the next step.
        prev_reward_e = {env_id: timestep.reward for env_id, timestep in timesteps.items()}
        prev_reward_e = to_ndarray(prev_reward_e)
        prev_action = actions

        timestep = timesteps[0]
        # print(timestep.info)
        episode_return += timestep.reward
        obs = timestep.obs
        obs = {0: obs}
        if timestep.done:
            print(timestep.info)
            break

    print('Eval is over! The performance of your RL policy is {}'.format(episode_return))
    return episode_return


if __name__ == "__main__":
    # Users should add their own model path here. Model path should lead to a checkpoint file.
    # Absolute path is recommended.
    # In DI-engine, it is ``exp_name/ckpt/ckpt_best.pth.tar``.
    model_path = './debug_minigrid_doorkey_ngu_ul298_er01_n32_rbs3e4_fixepseval/ckpt/ckpt_best.pth.tar'
    # model_path = 'model_path_placeholder'
    cfg = '../config/minigrid_ngu_config.py'
    state_dict = torch.load(model_path, map_location='cpu')
    # Evaluate the same checkpoint over 10 different seeds.
    for i in range(10):
        eval(cfg, seed=i, state_dict=state_dict, replay_path='./video')