# -*- coding: utf-8 -*-
"""
Created on Thu Mar 21 10:34:46 2024

@author: takan
"""
import MeCab
import torch
import copy
import time
import matplotlib.pyplot as plt
import re
import math
import numpy as np
from gensim.models import Word2Vec
import pickle
import threading
import sentencepiece as spm


class DenseBlock(torch.nn.Module):
    """Two-layer feed-forward block with an ELU in between."""
    def __init__(self, dim, mul=1):
        super().__init__()
        self.I = torch.nn.Linear(dim, dim * mul)
        self.O = torch.nn.Linear(dim * mul, dim)

    def forward(self, x):
        x = self.I(x)
        x = torch.nn.functional.elu(x)
        x = self.O(x)
        return x


class AttentionBlock(torch.nn.Module):
    """Element-wise attention: softmax(q * k) gates v (no dot product over a sequence axis)."""
    def __init__(self, dim, mul=1):
        super().__init__()
        self.Q = torch.nn.Linear(dim, dim * mul)
        self.K = torch.nn.Linear(dim, dim * mul)
        self.V = torch.nn.Linear(dim, dim * mul)
        self.O = torch.nn.Linear(dim * mul, dim)

    def forward(self, q, k, v):
        q = self.Q(q)
        k = self.K(k)
        v = self.V(v)
        x = torch.nn.functional.softmax(q * k, dim=-1) * v
        x = self.O(x)
        return x


"""
class AttentionBlock(torch.nn.Module):
    def __init__(self, dim, mul=1):
        super().__init__()
        self.attn = torch.nn.MultiheadAttention(dim, 16, batch_first=True)

    def forward(self, q, k, v):
        x = self.attn(q, k, v)[0]
        return x
"""


class SanokaLayer(torch.nn.Module):
    """Recurrent block: keeps a running state self.x that the current input attends to."""
    def __init__(self, dim, mul=1):
        super().__init__()
        self.x = None
        self.A = AttentionBlock(dim, mul)
        self.B = DenseBlock(dim, mul)

    def reset(self, x=None):
        self.x = x

    def forward(self, u):
        if self.x is not None:
            uu = torch.nn.functional.normalize(u)
            xx = torch.nn.functional.normalize(self.x)
            x = self.A(uu, xx, xx)
            y = self.B(torch.nn.functional.normalize(x)) + u
            self.x = x + self.x
            return y
        else:
            # First timestep of a sequence: the input attends to itself and seeds the state.
            uu = torch.nn.functional.normalize(u)
            x = self.A(uu, uu, uu)
            y = self.B(torch.nn.functional.normalize(x)) + u
            self.x = x
            return y


class SanokaModel(torch.nn.Module):
    """Stack of six SanokaLayers; the Top stage also projects 128-dim word2vec vectors to dim."""
    def __init__(self, dim, mul=1, Top=True):
        super().__init__()
        self.Top = Top
        if Top:
            self.I = torch.nn.Linear(128, dim)
        self.A = SanokaLayer(dim, mul)
        self.B = SanokaLayer(dim, mul)
        self.C = SanokaLayer(dim, mul)
        self.D = SanokaLayer(dim, mul)
        self.E = SanokaLayer(dim, mul)
        self.F = SanokaLayer(dim, mul)

    def reset(self):
        self.A.reset()
        self.B.reset()
        self.C.reset()
        self.D.reset()
        self.E.reset()
        self.F.reset()

    def forward(self, x):
        if self.Top:
            x = self.I(x)
        x = self.A(x)
        x = self.B(x)
        x = self.C(x)
        x = self.D(x)
        x = self.E(x)
        x = self.F(x)
        return x


class OutputLayer(torch.nn.Module):
    """Linear projection from the hidden dimension to the vocabulary logits."""
    def __init__(self, hiddendim, worddim=59000, heads=4):
        super().__init__()
        self.H = torch.nn.Linear(hiddendim, worddim)

    def forward(self, inpute):
        x = inpute
        x = self.H(x)
        return x


def GOILOAD():
    """Load the vocabulary table written by W2VMake and return (word -> id, id -> word)."""
    fuf = open("table.txt", "r", encoding="UTF-8")
    goi = fuf.read().split("\n")
    fuf.close()
    chardim = len(goi[1:])
    charid = {goi[i + 1].split()[0]: i for i in range(chardim - 1)}
    return charid, [goi[ia + 1].split()[0] for ia in range(chardim - 1)]


datas = []
trues = []
lens = []
dones = 0


def Convert(buns, table, maxlen=256):
    """Tokenize each line, look up word2vec vectors and target ids, and pickle the result."""
    buns = buns.split("\n")
    sp = spm.SentencePieceProcessor()
    sp.Load("tokenizer.model")
    w2v = Word2Vec.load("word2vec.model")
    data = []
    true = []
    lena = []
    for datac in range(len(buns)):
        #print(datac)
        #print(buns[datac])
        error = False
        try:
            buna = sp.EncodeAsPieces(buns[datac])[:maxlen]
            a = torch.from_numpy(w2v.wv[buna])
            b = torch.tensor([table[buna[ii]] for ii in range(len(buna))])
            ll = len(buna)
            c = ll
        except:
            print("ERROR")
        else:
            data.append(a)
            true.append(b)
            lena.append(c)
        print(datac)
    f = open("Train_Data.bin", "wb")
    pickle.dump((data, true, lena), f)
    f.close()
    return


def SPMake():
    """Train the SentencePiece tokenizer used by every later stage."""
    spm.SentencePieceTrainer.Train("--input=train_data.txt --model_prefix=tokenizer "
                                   "--vocab_size=20000 --train_extremely_large_corpus=True")
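
# Illustrative sketch (a hypothetical helper, not part of the original script and never
# called): the Sanoka layers are stateful, so a sequence is fed one timestep at a time and
# reset() clears the carried state before a new sequence. Shapes follow the 128-dim
# word2vec inputs assumed throughout this file.
def _state_demo(dim=512, steps=10):
    model = SanokaModel(dim, 2, True)   # Top=True: project 128-dim token vectors to dim
    model.reset()                       # clear recurrent state before a new sequence
    seq = torch.randn(1, steps, 128)    # (batch, time, embedding)
    for t in range(steps):
        h = model(seq[:, t])            # one timestep; each layer updates its self.x
    return h                            # final hidden state, shape (1, dim)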
def W2VMake(filepath="train_data.txt", mincount=50, worker=60):
    """Tokenize the corpus with SentencePiece and train the 128-dim word2vec embeddings."""
    sp = spm.SentencePieceProcessor()
    sp.Load("tokenizer.model")
    f = open(filepath, mode="r", encoding="UTF-8")
    texts = f.read().split("\n")
    f.close()
    dat = []
    print(len(texts))
    for a in range(len(texts)):
        dat.append(sp.EncodeAsPieces(texts[a]))
        print(a)
    model = Word2Vec(sentences=dat, vector_size=128, window=100, min_count=mincount, workers=worker)
    model.save("word2vec.model")
    model.wv.save_word2vec_format('table.txt')


def DataMake(filepath="train_data.txt", maxlen=129):
    """Build Train_Data.bin from the raw corpus using the vocabulary table."""
    table, i2w = GOILOAD()
    print(len(table))
    time.sleep(1)
    f = open(filepath, mode="r", encoding="UTF-8")
    txt = f.read()
    f.close()
    Convert(txt, table)
    return None


def PreTrain(Load=False, dim=512, outputdim=40000, lr=1e-04, epoch=10, epochload=1000,
             usedata=480000, onestep=100, uselen=64):
    global datas
    global trues
    global lens
    torch.manual_seed(1293431)
    #torch.manual_seed(576765)
    device1 = torch.device("cuda:0")
    device2 = torch.device("cuda:1")
    device3 = torch.device("cuda:2")
    device4 = torch.device("cuda:3")
    device5 = torch.device("cuda:4")
    device6 = torch.device("cuda:5")
    device7 = torch.device("cuda:6")
    lossf = torch.nn.CrossEntropyLoss()
    # The six backbone stages are pipelined across one GPU each, with the output head on a seventh.
    model1 = SanokaModel(dim, 2, True).to(torch.bfloat16).to(device1)
    model2 = SanokaModel(dim, 2, False).to(torch.bfloat16).to(device2)
    model3 = SanokaModel(dim, 2, False).to(torch.bfloat16).to(device3)
    model4 = SanokaModel(dim, 2, False).to(torch.bfloat16).to(device4)
    model5 = SanokaModel(dim, 2, False).to(torch.bfloat16).to(device5)
    model6 = SanokaModel(dim, 2, False).to(torch.bfloat16).to(device6)
    output = OutputLayer(dim, outputdim).to(torch.bfloat16).to(device7)
    if Load:
        model1.load_state_dict(torch.load("LLM1.pth", map_location=device1))
        model2.load_state_dict(torch.load("LLM2.pth", map_location=device2))
        model3.load_state_dict(torch.load("LLM3.pth", map_location=device3))
        model4.load_state_dict(torch.load("LLM4.pth", map_location=device4))
        model5.load_state_dict(torch.load("LLM5.pth", map_location=device5))
        model6.load_state_dict(torch.load("LLM6.pth", map_location=device6))
        output.load_state_dict(torch.load("output.pth", map_location=device7))
    model1Optim = torch.optim.Adam(model1.parameters(), lr=lr)
    model2Optim = torch.optim.Adam(model2.parameters(), lr=lr)
    model3Optim = torch.optim.Adam(model3.parameters(), lr=lr)
    model4Optim = torch.optim.Adam(model4.parameters(), lr=lr)
    model5Optim = torch.optim.Adam(model5.parameters(), lr=lr)
    model6Optim = torch.optim.Adam(model6.parameters(), lr=lr)
    outputO = torch.optim.Adam(output.parameters(), lr=lr)
    f = open("Train_Data.bin", "rb")
    datas, trues, lens = pickle.load(f)
    f.close()
    train_x = torch.zeros((epochload, uselen, 128)).to(torch.bfloat16).to(device1)
    train_y = torch.full((epochload, uselen), outputdim - 1, dtype=torch.long).to(device7)
    table, i2w = GOILOAD()
    base = 0
    epoch = int(np.floor((len(datas) / epochload) * epoch))
    print("data size", len(datas))
    for epochs in range(epoch):
        train_x = train_x.detach()
        train_y = train_y.detach()
        if base < len(datas) - epochload * 2:
            base += epochload
        else:
            base = 0
        if base > usedata:
            base = 0
        # Stage the next epochload sequences into the fixed-size input/target buffers.
        for b in range(epochload):
            a = b + base
            leng = lens[a]
            if leng > uselen:
                leng = uselen
            train_x[b, :datas[a].shape[0]] = datas[a].to(torch.bfloat16).to(device1)[:uselen]
            train_y[b, :trues[a].shape[0]] = trues[a].to(device7).to(torch.long)[:uselen]
        epls = 0.00
        timem = time.time()
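        # The model is recurrent: each chunk of onestep sequences is reset, then fed one
        # timestep at a time; the next-token cross-entropy is summed over the window and
        # backpropagated once per chunk (one optimizer step per window, not per token).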
        for steps in range(epochload // onestep):
            model1.reset()
            model2.reset()
            model3.reset()
            model4.reset()
            model5.reset()
            model6.reset()
            oa = ""
            model1Optim.zero_grad()
            model2Optim.zero_grad()
            model3Optim.zero_grad()
            model4Optim.zero_grad()
            model5Optim.zero_grad()
            model6Optim.zero_grad()
            outputO.zero_grad()
            loss = 0.00
            for b in range(uselen - 1):
                out = model1(train_x[steps * onestep:steps * onestep + onestep, b])
                out = model2(out.to(device2))
                out = model3(out.to(device3))
                out = model4(out.to(device4))
                out = model5(out.to(device5))
                out = model6(out.to(device6))
                out = output(out.to(device7))
                loss += lossf(out, train_y[steps * onestep:steps * onestep + onestep, b + 1])
                epls += loss.item()  # accumulate as a float so no autograd graph is retained
                sfo = torch.nn.functional.softmax(out[0], dim=-1)
                wid = torch.argmax(sfo, dim=-1).item()
                try:
                    wd = i2w[wid]
                except:
                    oa = oa + "ERROR"
                else:
                    oa = oa + wd
            loss.backward()
            #print(b)
            model1Optim.step()
            model2Optim.step()
            model3Optim.step()
            model4Optim.step()
            model5Optim.step()
            model6Optim.step()
            outputO.step()
            print("sample output> ", oa[:32].replace("?", ""))
        print("epoch", epochs, "Train_epoch_sum_loss", epls, "time", time.time() - timem)
        if epochs % 10 == 9:
            torch.save(model1.state_dict(), "LLM1.pth")
            torch.save(model2.state_dict(), "LLM2.pth")
            torch.save(model3.state_dict(), "LLM3.pth")
            torch.save(model4.state_dict(), "LLM4.pth")
            torch.save(model5.state_dict(), "LLM5.pth")
            torch.save(model6.state_dict(), "LLM6.pth")
            torch.save(output.state_dict(), "output.pth")


def Fineturning(Load=False, dim=512, outputdim=40000, lr=1e-04, epoch=10000, epochload=1000,
                onestep=200, uselen=32):
    global datas
    global trues
    global lens
    torch.manual_seed(1293431)
    #torch.manual_seed(576765)
    device1 = torch.device("cuda:0")
    device2 = torch.device("cuda:1")
    device3 = torch.device("cuda:2")
    device4 = torch.device("cuda:3")
    device5 = torch.device("cuda:4")
    device6 = torch.device("cuda:5")
    device7 = torch.device("cuda:6")
    lossf = torch.nn.CrossEntropyLoss()
    model1 = SanokaModel(dim, 2, True).to(torch.bfloat16).to(device1)
    model2 = SanokaModel(dim, 2, False).to(torch.bfloat16).to(device2)
    model3 = SanokaModel(dim, 2, False).to(torch.bfloat16).to(device3)
    model4 = SanokaModel(dim, 2, False).to(torch.bfloat16).to(device4)
    model5 = SanokaModel(dim, 2, False).to(torch.bfloat16).to(device5)
    model6 = SanokaModel(dim, 2, False).to(torch.bfloat16).to(device6)
    output = OutputLayer(dim, outputdim).to(torch.bfloat16).to(device7)
    model1.load_state_dict(torch.load("LLM1.pth", map_location=device1))
    model2.load_state_dict(torch.load("LLM2.pth", map_location=device2))
    model3.load_state_dict(torch.load("LLM3.pth", map_location=device3))
    model4.load_state_dict(torch.load("LLM4.pth", map_location=device4))
    model5.load_state_dict(torch.load("LLM5.pth", map_location=device5))
    model6.load_state_dict(torch.load("LLM6.pth", map_location=device6))
    output.load_state_dict(torch.load("output.pth", map_location=device7))
    model1Optim = torch.optim.Adam(model1.parameters(), lr=lr)
    model2Optim = torch.optim.Adam(model2.parameters(), lr=lr)
    model3Optim = torch.optim.Adam(model3.parameters(), lr=lr)
    model4Optim = torch.optim.Adam(model4.parameters(), lr=lr)
    model5Optim = torch.optim.Adam(model5.parameters(), lr=lr)
    model6Optim = torch.optim.Adam(model6.parameters(), lr=lr / 500)
    outputO = torch.optim.Adam(output.parameters(), lr=lr)
    f = open("Train_Data.bin", "rb")
    datas, trues, lens = pickle.load(f)
    f.close()
    train_x = torch.zeros((epochload, uselen, 128)).to(torch.bfloat16).to(device1)
    train_y = torch.full((epochload, uselen), outputdim - 1, dtype=torch.long).to(device7)
    table, i2w = GOILOAD()
    base = 0
    epoch = int(np.floor((len(datas) / epochload) * epoch))
    #print(epoch)
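    # Fine-tuning loop: the six backbone stages run under torch.no_grad() and stay frozen;
    # only the output head is updated (model6Optim is created at lr/500 but its step() call
    # is left commented out below).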
    for epochs in range(epoch):
        train_x = train_x.detach()
        train_y = train_y.detach()
        if base < len(datas) - epochload * 2:
            base += epochload
        else:
            base = 0
        for b in range(epochload):
            a = b + base
            #print(a)
            leng = lens[a]
            if leng > uselen:
                leng = uselen
            train_x[b, :datas[a].shape[0]] = datas[a].to(torch.bfloat16).to(device1)[:uselen]
            train_y[b, :trues[a].shape[0]] = trues[a].to(device7).to(torch.long)[:uselen]
        epls = 0.00
        timem = time.time()
        for steps in range(epochload // onestep):
            model1.reset()
            model2.reset()
            model3.reset()
            model4.reset()
            model5.reset()
            model6.reset()
            oa = ""
            loss = 0.00
            model1Optim.zero_grad()
            model2Optim.zero_grad()
            model3Optim.zero_grad()
            model4Optim.zero_grad()
            model5Optim.zero_grad()
            model6Optim.zero_grad()
            outputO.zero_grad()
            for b in range(uselen - 1):
                with torch.no_grad():
                    # Frozen backbone: no gradients flow into the six Sanoka stages.
                    out = model1(train_x[steps * onestep:steps * onestep + onestep, b])
                    out = model2(out.to(device2))
                    out = model3(out.to(device3))
                    out = model4(out.to(device4))
                    out = model5(out.to(device5))
                    out = model6(out.to(device6))
                out = output(out.to(device7))
                loss += lossf(out, train_y[steps * onestep:steps * onestep + onestep, b + 1])
                epls += loss.item()
                sfo = torch.nn.functional.softmax(out[0], dim=-1)
                wid = torch.argmax(sfo, dim=-1).item()
                try:
                    wd = i2w[wid]
                except:
                    oa = oa + "ERROR"
                else:
                    oa = oa + wd
            loss.backward()
            #model6Optim.step()
            outputO.step()
            print("sample output> ", oa[:32].replace("?", ""))
        print("epoch", epochs, "Train_epoch_sum_loss", epls, "time", time.time() - timem)
        if epochs % 10 == 9:
            #torch.save(model6.state_dict(), "LLM6F.pth")
            torch.save(output.state_dict(), "fineturning.pth")


def Predict(dim=512, outputdim=40000, maxlen=32):
    """Interactive greedy generation with the pre-trained backbone and fine-tuned output head."""
    torch.manual_seed(1293431)
    table, i2w = GOILOAD()
    sp = spm.SentencePieceProcessor()
    sp.Load("tokenizer.model")
    w2v = Word2Vec.load("word2vec.model")
    device1 = torch.device("cuda:0")
    device2 = torch.device("cuda:1")
    device3 = torch.device("cuda:2")
    device4 = torch.device("cuda:3")
    device5 = torch.device("cuda:4")
    device6 = torch.device("cuda:5")
    device7 = torch.device("cuda:6")
    lossf = torch.nn.CrossEntropyLoss()
    model1 = SanokaModel(dim, 2, True).to(torch.bfloat16).to(device1)
    model2 = SanokaModel(dim, 2, False).to(torch.bfloat16).to(device2)
    model3 = SanokaModel(dim, 2, False).to(torch.bfloat16).to(device3)
    model4 = SanokaModel(dim, 2, False).to(torch.bfloat16).to(device4)
    model5 = SanokaModel(dim, 2, False).to(torch.bfloat16).to(device5)
    model6 = SanokaModel(dim, 2, False).to(torch.bfloat16).to(device6)
    output = OutputLayer(dim, outputdim).to(torch.bfloat16).to(device7)
    model1.load_state_dict(torch.load("LLM1.pth", map_location=device1))
    model2.load_state_dict(torch.load("LLM2.pth", map_location=device2))
    model3.load_state_dict(torch.load("LLM3.pth", map_location=device3))
    model4.load_state_dict(torch.load("LLM4.pth", map_location=device4))
    model5.load_state_dict(torch.load("LLM5.pth", map_location=device5))
    model6.load_state_dict(torch.load("LLM6.pth", map_location=device6))
    output.load_state_dict(torch.load("fineturning.pth", map_location=device7))
    while True:
        dd = input("Q> ")  # + ","
        data = []
        buna = sp.EncodeAsPieces(dd)
        print(buna)
        for a in range(len(buna)):
            try:
                data.append(torch.from_numpy(w2v.wv[buna[a]]).view(1, 1, 128).to(device1))
            except KeyError:
                print("Not Found")
        dat = torch.cat(data, dim=1).to(device1)
        oa = ""
        with torch.no_grad():
            model1.reset()
            model2.reset()
            model3.reset()
            model4.reset()
            model5.reset()
            model6.reset()
            oa = ""
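            # Two phases: first feed the prompt tokens to build up the recurrent state,
            # then generate greedily, feeding back the embedding of each predicted token.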
            for a in range(dat.shape[1] - 1):
                out = model1(dat[:, a].to(torch.bfloat16))
                out = model2(out.to(device2))
                out = model3(out.to(device3))
                out = model4(out.to(device4))
                out = model5(out.to(device5))
                out = model6(out.to(device6))
                out = output(out.to(device7))
            for b in range(maxlen - dat.shape[1]):
                out = model1(dat[:, -1].to(torch.bfloat16))
                out = model2(out.to(device2))
                out = model3(out.to(device3))
                out = model4(out.to(device4))
                out = model5(out.to(device5))
                out = model6(out.to(device6))
                out = output(out.to(device7))
                sfo = torch.nn.functional.softmax(out, dim=-1)
                wid = torch.argmax(sfo, dim=-1).item()
                if wid != outputdim - 1:
                    try:
                        wd = i2w[wid]
                    except:
                        oa = oa + "ERROR"
                    else:
                        oa = oa + wd
                        dat = torch.cat([dat, torch.from_numpy(w2v.wv[wd]).to(device1).view(1, 1, 128)], dim=1)
        print("A> ", oa.replace("?", ""))


def ValidationLoss(dim=512, outputdim=40000, maxlen=32):
    """Compute a next-token cross-entropy on one interactively supplied sentence."""
    torch.manual_seed(1293431)
    table, i2w = GOILOAD()
    tagger = MeCab.Tagger("-Owakati")
    w2v = Word2Vec.load("word2vec.model")
    device1 = torch.device("cuda:0")
    device2 = torch.device("cuda:1")
    device3 = torch.device("cuda:2")
    device4 = torch.device("cuda:3")
    device5 = torch.device("cuda:4")
    device6 = torch.device("cuda:5")
    device7 = torch.device("cuda:6")
    lossf = torch.nn.CrossEntropyLoss()
    model1 = SanokaModel(dim, 2, True).to(torch.bfloat16).to(device1)
    model2 = SanokaModel(dim, 2, False).to(torch.bfloat16).to(device2)
    model3 = SanokaModel(dim, 2, False).to(torch.bfloat16).to(device3)
    model4 = SanokaModel(dim, 2, False).to(torch.bfloat16).to(device4)
    model5 = SanokaModel(dim, 2, False).to(torch.bfloat16).to(device5)
    model6 = SanokaModel(dim, 2, False).to(torch.bfloat16).to(device6)
    output = OutputLayer(dim, outputdim).to(torch.bfloat16).to(device7)
    model1.load_state_dict(torch.load("LLM1.pth", map_location=device1))
    model2.load_state_dict(torch.load("LLM2.pth", map_location=device2))
    model3.load_state_dict(torch.load("LLM3.pth", map_location=device3))
    model4.load_state_dict(torch.load("LLM4.pth", map_location=device4))
    model5.load_state_dict(torch.load("LLM5.pth", map_location=device5))
    model6.load_state_dict(torch.load("LLM6.pth", map_location=device6))
    output.load_state_dict(torch.load("output.pth", map_location=device7))
    dd = input("TestData> ")
    lossf = torch.nn.CrossEntropyLoss()
    data = []
    buna = tagger.parse(dd).split()
    trued = torch.tensor([table[dfg] for dfg in buna]).to(torch.long).unsqueeze(dim=0)
    print(buna)
    print(trued)
    for a in range(len(buna)):
        try:
            data.append(torch.from_numpy(w2v.wv[buna[a]]).view(1, 1, 128).to(device1))
        except KeyError:
            print("Not Found")
    dat = torch.cat(data, dim=1).to(device1)
    oa = ""
    loss = 0.00
    with torch.no_grad():
        model1.reset()
        model2.reset()
        model3.reset()
        model4.reset()
        model5.reset()
        model6.reset()
        oa = ""
        for a in range(dat.shape[1] - 1):
            out = model1(dat[:, a].to(torch.bfloat16))  # cast to match the bfloat16 weights
            out = model2(out.to(device2))
            out = model3(out.to(device3))
            out = model4(out.to(device4))
            out = model5(out.to(device5))
            out = model6(out.to(device6))
            out = output(out.to(device7))
            sfo = torch.nn.functional.softmax(out, dim=-1)
            wid = torch.argmax(sfo, dim=-1).item()
            try:
                wd = i2w[wid]
            except:
                oa = oa + "ERROR"
            else:
                oa = oa + wd
            loss += lossf(out, trued[:, a + 1].to(device7))  # target moved to the logits' device
    print("validationloss", loss.item() / dat.shape[1], "preview", oa)


if __name__ == "__main__":
    pass
    #DataMake()
    #Fineturning(Load=False, dim=2048, outputdim=21000, lr=1e-03, onestep=300, uselen=128)
    #Predict(dim=2048, outputdim=21000, maxlen=128)
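
# Assumed end-to-end order of the pipeline stages (an illustrative sketch, never called;
# the order is inferred from the file dependencies: tokenizer.model -> word2vec.model and
# table.txt -> Train_Data.bin -> LLM*.pth and output.pth -> fineturning.pth):
def _pipeline_sketch():
    SPMake()              # train the SentencePiece tokenizer -> tokenizer.model
    W2VMake()             # build the 128-dim word2vec embeddings -> word2vec.model, table.txt
    DataMake()            # tokenize and embed the corpus -> Train_Data.bin
    PreTrain(Load=False)  # train the six backbone stages and the output head -> LLM*.pth, output.pth
    Fineturning()         # tune only the output head -> fineturning.pth
    Predict()             # interactive greedy generation with the fine-tuned head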