Spaces:

zjowowen
/

gomoku

Sleeping

File size: 23,724 Bytes

079c32c

# Code Reference: https://github.com/OptMLGroup/DeepBeerInventory-RL.
import numpy as np
from random import randint
from .BGAgent import Agent
from matplotlib import rc
rc('text', usetex=True)
from .plotting import plotting, savePlot
import matplotlib.pyplot as plt
import os
import time
from time import gmtime, strftime


class clBeerGame(object):

    def __init__(self, config):
        self.config = config
        self.curGame = 0  # The number associated with the current game (counter of the game)
        self.curTime = 0
        self.totIterPlayed = 0  # total iterations of the game, played so far in this and previous games
        self.players = self.createAgent()  # create the agents
        self.T = 0
        self.demand = []
        self.ifOptimalSolExist = self.config.ifOptimalSolExist
        self.getOptimalSol()
        self.totRew = 0  # it is reward of all players obtained for the current player.
        self.resultTest = []
        self.runnerMidlResults = []  # stores the results to use in runner comparisons
        self.runnerFinlResults = []  # stores the results to use in runner comparisons
        self.middleTestResult = [
        ]  # stores the whole middle results of bs, Strm, and random to avoid doing same tests multiple of times.
        self.runNumber = 0  # the runNumber which is used when use runner
        self.strNum = 0  # the runNumber which is used when use runner

    # createAgent : Create agent objects (agentNum,IL,OO,c_h,c_p,type,config)
    def createAgent(self):
        agentTypes = self.config.agentTypes
        return [
            Agent(
                i, self.config.ILInit[i], self.config.AOInit, self.config.ASInit[i], self.config.c_h[i],
                self.config.c_p[i], self.config.eta[i], agentTypes[i], self.config
            ) for i in range(self.config.NoAgent)
        ]

    # planHorizon : Find a random planning horizon
    def planHorizon(self):
        # TLow: minimum number for the planning horizon # TUp: maximum number for the planning horizon
        # output: The planning horizon which is chosen randomly.
        return randint(self.config.TLow, self.config.TUp)

    # this function resets the game for start of the new game
    def resetGame(self, demand: np.ndarray):
        self.demand = demand
        self.curTime = 0
        self.curGame += 1
        self.totIterPlayed += self.T
        self.T = self.planHorizon()
        # reset the required information of player for each episode
        for k in range(0, self.config.NoAgent):
            self.players[k].resetPlayer(self.T)

        # update OO when there are initial IL,AO,AS
        self.update_OO()

    # correction on cost at time T according to the cost of the other players
    def getTotRew(self):
        totRew = 0
        for i in range(self.config.NoAgent):
            # sum all rewards for the agents and make correction
            totRew += self.players[i].cumReward

        for i in range(self.config.NoAgent):
            self.players[i].curReward += self.players[i].eta * (totRew - self.players[i].cumReward)  # /(self.T)

    # make correction to the rewards in the experience replay for all iterations of current game
    def distTotReward(self, role: int):
        totRew = 0
        optRew = 0.1  # why?
        for i in range(self.config.NoAgent):
            # sum all rewards for the agents and make correction
            totRew += self.players[i].cumReward
        totRew += optRew

        return totRew, self.players[role].cumReward

    def getAction(self, k: int, action: np.ndarray, playType="train"):
        if playType == "train":
            if self.players[k].compType == "srdqn":
                self.players[k].action = np.zeros(self.config.actionListLen)
                self.players[k].action[action] = 1
            elif self.players[k].compType == "Strm":
                self.players[k].action = np.zeros(self.config.actionListLenOpt)
                self.players[k].action[np.argmin(np.abs(np.array(self.config.actionListOpt)\
                    - max(0, round(self.players[k].AO[self.curTime] + \
                    self.players[k].alpha_b*(self.players[k].IL - self.players[k].a_b) + \
                    self.players[k].betta_b*(self.players[k].OO - self.players[k].b_b)))))] = 1
            elif self.players[k].compType == "rnd":
                self.players[k].action = np.zeros(self.config.actionListLen)
                a = np.random.randint(self.config.actionListLen)
                self.players[k].action[a] = 1
            elif self.players[k].compType == "bs":
                self.players[k].action = np.zeros(self.config.actionListLenOpt)
                if self.config.demandDistribution == 2:
                    if self.curTime and self.config.use_initial_BS <= 4:
                        self.players[k].action[np.argmin(np.abs(np.array(self.config.actionListOpt) - \
                        max(0, (self.players[k].int_bslBaseStock - (self.players[k].IL + self.players[k].OO - self.players[k].AO[self.curTime])))))] = 1
                    else:
                        self.players[k].action[np.argmin(np.abs(np.array(self.config.actionListOpt) - \
                        max(0, (self.players[k].bsBaseStock - (self.players[k].IL + self.players[k].OO - self.players[k].AO[self.curTime])))))] = 1
                else:
                    self.players[k].action[np.argmin(np.abs(np.array(self.config.actionListOpt) - \
                    max(0, (self.players[k].bsBaseStock - (self.players[k].IL + self.players[k].OO - self.players[k].AO[self.curTime])))))] = 1
        elif playType == "test":
            if self.players[k].compTypeTest == "srdqn":
                self.players[k].action = np.zeros(self.config.actionListLen)
                self.players[k].action = self.players[k].brain.getDNNAction(self.playType)
            elif self.players[k].compTypeTest == "Strm":
                self.players[k].action = np.zeros(self.config.actionListLenOpt)

                self.players[k].action[np.argmin(np.abs(np.array(self.config.actionListOpt)-\
                    max(0,round(self.players[k].AO[self.curTime] +\
                     self.players[k].alpha_b*(self.players[k].IL - self.players[k].a_b) +\
                     self.players[k].betta_b*(self.players[k].OO - self.players[k].b_b)))))] = 1
            elif self.players[k].compTypeTest == "rnd":
                self.players[k].action = np.zeros(self.config.actionListLen)
                a = np.random.randint(self.config.actionListLen)
                self.players[k].action[a] = 1
            elif self.players[k].compTypeTest == "bs":
                self.players[k].action = np.zeros(self.config.actionListLenOpt)

                if self.config.demandDistribution == 2:
                    if self.curTime and self.config.use_initial_BS <= 4:
                        self.players[k].action [np.argmin(np.abs(np.array(self.config.actionListOpt)-\
                          max(0,(self.players[k].int_bslBaseStock - (self.players[k].IL + self.players[k].OO - self.players[k].AO[self.curTime]))) ))] = 1
                    else:
                        self.players[k].action [np.argmin(np.abs(np.array(self.config.actionListOpt)-\
                          max(0,(self.players[k].bsBaseStock - (self.players[k].IL + self.players[k].OO - self.players[k].AO[self.curTime]))) ))] = 1
                else:
                    self.players[k].action [np.argmin(np.abs(np.array(self.config.actionListOpt)-\
                       max(0,(self.players[k].bsBaseStock - (self.players[k].IL + self.players[k].OO - self.players[k].AO[self.curTime]))) ))] = 1
        else:
            # not a valid player is defined.
            raise Exception('The player type is not defined or it is not a valid type.!')

    def next(self):
        # get a random leadtime
        leadTimeIn = randint(
            self.config.leadRecItemLow[self.config.NoAgent - 1], self.config.leadRecItemUp[self.config.NoAgent - 1]
        )
        # handle the most upstream recieved shipment
        self.players[self.config.NoAgent - 1].AS[self.curTime +
                                                 leadTimeIn] += self.players[self.config.NoAgent -
                                                                             1].actionValue(self.curTime)

        for k in range(self.config.NoAgent - 1, -1, -1):  # [3,2,1,0]

            # get current IL and Backorder
            current_IL = max(0, self.players[k].IL)
            current_backorder = max(0, -self.players[k].IL)

            # TODO: We have get the AS and AO from the UI and update our AS and AO, so that code update the corresponding variables

            # increase IL and decrease OO based on the action, for the next period
            self.players[k].recieveItems(self.curTime)

            # observe the reward
            possible_shipment = min(
                current_IL + self.players[k].AS[self.curTime], current_backorder + self.players[k].AO[self.curTime]
            )

            # plan arrivals of the items to the downstream agent
            if self.players[k].agentNum > 0:
                leadTimeIn = randint(self.config.leadRecItemLow[k - 1], self.config.leadRecItemUp[k - 1])
                self.players[k - 1].AS[self.curTime + leadTimeIn] += possible_shipment

            # update IL
            self.players[k].IL -= self.players[k].AO[self.curTime]
            # observe the reward
            self.players[k].getReward()
            self.players[k].hist[-1][-2] = self.players[k].curReward
            self.players[k].hist2[-1][-2] = self.players[k].curReward

            # update next observation
            self.players[k].nextObservation = self.players[k].getCurState(self.curTime + 1)

        if self.config.ifUseTotalReward:
            # correction on cost at time T
            if self.curTime == self.T:
                self.getTotRew()

        self.curTime += 1

    def handelAction(self, action: np.ndarray, playType="train"):
        # get random lead time
        leadTime = randint(self.config.leadRecOrderLow[0], self.config.leadRecOrderUp[0])
        # set AO
        self.players[0].AO[self.curTime] += self.demand[self.curTime]
        for k in range(0, self.config.NoAgent):
            self.getAction(k, action, playType)

            self.players[k].srdqnBaseStock += [self.players[k].actionValue( \
             self.curTime) + self.players[k].IL + self.players[k].OO]

            # update hist for the plots
            self.players[k].hist += [[self.curTime, self.players[k].IL, self.players[k].OO,\
               self.players[k].actionValue(self.curTime), self.players[k].curReward, self.players[k].srdqnBaseStock[-1]]]

            if self.players[k].compType == "srdqn":
                self.players[k].hist2 += [[self.curTime, self.players[k].IL, self.players[k].OO, self.players[k].AO[self.curTime], self.players[k].AS[self.curTime], \
                  self.players[k].actionValue(self.curTime), self.players[k].curReward, \
                  self.config.actionList[np.argmax(self.players[k].action)]]]

            else:
                self.players[k].hist2 += [[self.curTime, self.players[k].IL, self.players[k].OO, self.players[k].AO[self.curTime], self.players[k].AS[self.curTime], \
                  self.players[k].actionValue(self.curTime), self.players[k].curReward, 0]]

            # updates OO and AO at time t+1
            self.players[k].OO += self.players[k].actionValue(self.curTime)  # open order level update
            leadTime = randint(self.config.leadRecOrderLow[k], self.config.leadRecOrderUp[k])
            if self.players[k].agentNum < self.config.NoAgent - 1:
                self.players[k + 1].AO[self.curTime + leadTime] += self.players[k].actionValue(
                    self.curTime
                )  # open order level update

    # check the Shang and Song (2003) condition, and if it works, obtains the base stock policy values for each agent
    def getOptimalSol(self):
        # if self.config.NoAgent !=1:
        if self.config.NoAgent != 1 and 1 == 2:
            # check the Shang and Song (2003) condition.
            for k in range(self.config.NoAgent - 1):
                if not (self.players[k].c_h == self.players[k + 1].c_h and self.players[k + 1].c_p == 0):
                    self.ifOptimalSolExist = False

            # if the Shang and Song (2003) condition satisfied, it runs the algorithm
            if self.ifOptimalSolExist == True:
                calculations = np.zeros((7, self.config.NoAgent))
                for k in range(self.config.NoAgent):
                    # DL_high
                    calculations[0][k] = ((self.config.leadRecItemLow + self.config.leadRecItemUp + 2) / 2 \
                           + (self.config.leadRecOrderLow + self.config.leadRecOrderUp + 2) / 2) * \
                         (self.config.demandUp - self.config.demandLow - 1)
                    if k > 0:
                        calculations[0][k] += calculations[0][k - 1]
                    # probability_high
                    nominator_ch = 0
                    low_denominator_ch = 0
                    for j in range(k, self.config.NoAgent):
                        if j < self.config.NoAgent - 1:
                            nominator_ch += self.players[j + 1].c_h
                        low_denominator_ch += self.players[j].c_h
                    if k == 0:
                        high_denominator_ch = low_denominator_ch
                    calculations[2][k] = (self.players[0].c_p +
                                          nominator_ch) / (self.players[0].c_p + low_denominator_ch + 0.0)
                    # probability_low
                    calculations[3][k] = (self.players[0].c_p +
                                          nominator_ch) / (self.players[0].c_p + high_denominator_ch + 0.0)
                # S_high
                calculations[4] = np.round(np.multiply(calculations[0], calculations[2]))
                # S_low
                calculations[5] = np.round(np.multiply(calculations[0], calculations[3]))
                # S_avg
                calculations[6] = np.round(np.mean(calculations[4:6], axis=0))
                # S', set the base stock values into each agent.
                for k in range(self.config.NoAgent):
                    if k == 0:
                        self.players[k].bsBaseStock = calculations[6][k]

                    else:
                        self.players[k].bsBaseStock = calculations[6][k] - calculations[6][k - 1]
                        if self.players[k].bsBaseStock < 0:
                            self.players[k].bsBaseStock = 0
        elif self.config.NoAgent == 1:
            if self.config.demandDistribution == 0:
                self.players[0].bsBaseStock = np.ceil(
                    self.config.c_h[0] / (self.config.c_h[0] + self.config.c_p[0] + 0.0)
                ) * ((self.config.demandUp - self.config.demandLow - 1) / 2) * self.config.leadRecItemUp
        elif 1 == 1:
            f = self.config.f
            f_init = self.config.f_init
            for k in range(self.config.NoAgent):
                self.players[k].bsBaseStock = f[k]
                self.players[k].int_bslBaseStock = f_init[k]

    def update_OO(self):
        for k in range(0, self.config.NoAgent):
            if k < self.config.NoAgent - 1:
                self.players[k].OO = sum(self.players[k + 1].AO) + sum(self.players[k].AS)
            else:
                self.players[k].OO = sum(self.players[k].AS)

    def doTestMid(self, demandTs):
        self.resultTest = []
        m = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
        self.doTest(m, demandTs)
        print("---------------------------------------------------------------------------------------")
        resultSummary = np.array(self.resultTest).mean(axis=0).tolist()

        result_srdqn = ', '.join(map("{:.2f}".format, resultSummary[0]))
        result_rand = ', '.join(map("{:.2f}".format, resultSummary[1]))
        result_strm = ', '.join(map("{:.2f}".format, resultSummary[2]))
        if self.ifOptimalSolExist:
            result_bs = ', '.join(map("{:.2f}".format, resultSummary[3]))
            print(
                'SUMMARY; {0:s}; ITER= {1:d}; OURPOLICY= [{2:s}]; SUM = {3:2.4f}; Rand= [{4:s}]; SUM = {5:2.4f}; STRM= [{6:s}]; SUM = {7:2.4f}; BS= [{8:s}]; SUM = {9:2.4f}'
                .format(
                    strftime("%Y-%m-%d %H:%M:%S", gmtime()), self.curGame, result_srdqn, sum(resultSummary[0]),
                    result_rand, sum(resultSummary[1]), result_strm, sum(resultSummary[2]), result_bs,
                    sum(resultSummary[3])
                )
            )

        else:
            print(
                'SUMMARY; {0:s}; ITER= {1:d}; OURPOLICY= [{2:s}]; SUM = {3:2.4f}; Rand= [{4:s}]; SUM = {5:2.4f}; STRM= [{6:s}]; SUM = {7:2.4f}'
                .format(
                    strftime("%Y-%m-%d %H:%M:%S", gmtime()), self.curGame, result_srdqn, sum(resultSummary[0]),
                    result_rand, sum(resultSummary[1]), result_strm, sum(resultSummary[2])
                )
            )

        print("=======================================================================================")

    def doTest(self, m, demand):
        import matplotlib.pyplot as plt
        if self.config.ifSaveFigure:
            plt.figure(self.curGame, figsize=(12, 8), dpi=80, facecolor='w', edgecolor='k')

            # self.demand = demand
            # use dnn to get output.
            Rsltdnn, plt = self.tester(self.config.agentTypes, plt, 'b', 'OurPolicy', m)
            baseStockdata = self.players[0].srdqnBaseStock
            # # use random to get output.
            RsltRnd, plt = self.tester(["rnd", "rnd", "rnd", "rnd"], plt, 'y', 'RAND', m)

            # use formual to get output.
            RsltStrm, plt = self.tester(["Strm", "Strm", "Strm", "Strm"], plt, 'g', 'Strm', m)

            # use optimal strategy to get output, if it works.
            if self.ifOptimalSolExist:
                if self.config.agentTypes == ["srdqn", "Strm", "Strm", "Strm"]:
                    Rsltbs, plt = self.tester(["bs", "Strm", "Strm", "Strm"], plt, 'r', 'Strm-BS', m)
                elif self.config.agentTypes == ["Strm", "srdqn", "Strm", "Strm"]:
                    Rsltbs, plt = self.tester(["Strm", "bs", "Strm", "Strm"], plt, 'r', 'Strm-BS', m)
                elif self.config.agentTypes == ["Strm", "Strm", "srdqn", "Strm"]:
                    Rsltbs, plt = self.tester(["Strm", "Strm", "bs", "Strm"], plt, 'r', 'Strm-BS', m)
                elif self.config.agentTypes == ["Strm", "Strm", "Strm", "srdqn"]:
                    Rsltbs, plt = self.tester(["Strm", "Strm", "Strm", "bs"], plt, 'r', 'Strm-BS', m)
                elif self.config.agentTypes == ["srdqn", "rnd", "rnd", "rnd"]:
                    Rsltbs, plt = self.tester(["bs", "rnd", "rnd", "rnd"], plt, 'r', 'RND-BS', m)
                elif self.config.agentTypes == ["rnd", "srdqn", "rnd", "rnd"]:
                    Rsltbs, plt = self.tester(["rnd", "bs", "rnd", "rnd"], plt, 'r', 'RND-BS', m)
                elif self.config.agentTypes == ["rnd", "rnd", "srdqn", "rnd"]:
                    Rsltbs, plt = self.tester(["rnd", "rnd", "bs", "rnd"], plt, 'r', 'RND-BS', m)
                elif self.config.agentTypes == ["rnd", "rnd", "rnd", "srdqn"]:
                    Rsltbs, plt = self.tester(["rnd", "rnd", "rnd", "bs"], plt, 'r', 'RND-BS', m)
                else:
                    Rsltbs, plt = self.tester(["bs", "bs", "bs", "bs"], plt, 'r', 'BS', m)
            # hold the results of the optimal solution
                self.middleTestResult += [[RsltRnd, RsltStrm, Rsltbs]]
            else:
                self.middleTestResult += [[RsltRnd, RsltStrm]]

        else:
            # return the obtained results into their lists
            RsltRnd = self.middleTestResult[m][0]
            RsltStrm = self.middleTestResult[m][1]
            if self.ifOptimalSolExist:
                Rsltbs = self.middleTestResult[m][2]

        # save the figure
        if self.config.ifSaveFigure:
            savePlot(self.players, self.curGame, Rsltdnn, RsltStrm, Rsltbs, RsltRnd, self.config, m)
            plt.close()

        result_srdqn = ', '.join(map("{:.2f}".format, Rsltdnn))
        result_rand = ', '.join(map("{:.2f}".format, RsltRnd))
        result_strm = ', '.join(map("{:.2f}".format, RsltStrm))
        if self.ifOptimalSolExist:
            result_bs = ', '.join(map("{:.2f}".format, Rsltbs))
            print(
                'output; {0:s}; Iter= {1:s}; SRDQN= [{2:s}]; sum = {3:2.4f}; Rand= [{4:s}]; sum = {5:2.4f}; Strm= [{6:s}]; sum = {7:2.4f}; BS= [{8:s}]; sum = {9:2.4f}'
                .format(
                    strftime("%Y-%m-%d %H:%M:%S", gmtime()), str(str(self.curGame) + "-" + str(m)), result_srdqn,
                    sum(Rsltdnn), result_rand, sum(RsltRnd), result_strm, sum(RsltStrm), result_bs, sum(Rsltbs)
                )
            )
            self.resultTest += [[Rsltdnn, RsltRnd, RsltStrm, Rsltbs]]

        else:
            print(
                'output; {0:s}; Iter= {1:s}; SRDQN= [{2:s}]; sum = {3:2.4f}; Rand= [{4:s}]; sum = {5:2.4f}; Strm= [{6:s}]; sum = {7:2.4f}'
                .format(
                    strftime("%Y-%m-%d %H:%M:%S", gmtime()), str(str(self.curGame) + "-" + str(m)), result_srdqn,
                    sum(Rsltdnn), result_rand, sum(RsltRnd), result_strm, sum(RsltStrm)
                )
            )

            self.resultTest += [[Rsltdnn, RsltRnd, RsltStrm]]

        return sum(Rsltdnn)

    def tester(self, testType, plt, colori, labeli, m):

        # set computation type for test
        for k in range(0, self.config.NoAgent):
            # self.players[k].compTypeTest = testType[k]
            self.players[k].compType = testType[k]
        # run the episode to get the results.
        if labeli != 'OurPolicy':
            result = self.playGame(self.demand)
        else:
            result = [-1 * self.players[i].cumReward for i in range(0, self.config.NoAgent)]
        # add the results into the figure
        if self.config.ifSaveFigure:
            plt = plotting(plt, [np.array(self.players[i].hist) for i in range(0, self.config.NoAgent)], colori, labeli)
        if self.config.ifsaveHistInterval and ((self.curGame == 0) or (self.curGame == 1) or (self.curGame == 2) or (self.curGame == 3) or ((self.curGame - 1) % self.config.saveHistInterval == 0)\
         or ((self.curGame) % self.config.saveHistInterval == 0) or ((self.curGame) % self.config.saveHistInterval == 1) \
         or ((self.curGame) % self.config.saveHistInterval == 2)) :
            for k in range(0, self.config.NoAgent):
                name = labeli + "-" + str(self.curGame) + "-" + "player" + "-" + str(k) + "-" + str(m)
                np.save(os.path.join(self.config.model_dir, name), np.array(self.players[k].hist2))

        # save the figure of base stocks
        # if self.config.ifSaveFigure and (self.curGame in range(self.config.saveFigInt[0],self.config.saveFigInt[1])):
        # 	for k in range(self.config.NoAgent):
        # 		if self.players[k].compTypeTest == 'dnn':
        # 			plotBaseStock(self.players[k].srdqnBaseStock, 'b', 'base stock of agent '+ str(self.players[k].agentNum), self.curGame, self.config, m)

        return result, plt

    def playGame(self, demand):
        self.resetGame(demand)

        # run the game
        while self.curTime < self.T:
            self.handelAction(np.array(0))  # action won't be used.
            self.next()
        return [-1 * self.players[i].cumReward for i in range(0, self.config.NoAgent)]