# -*- coding: utf-8 -*-
"""
Created on Thu Mar 21 10:34:46 2024

@author: takan
"""
import MeCab
import torch
import copy
import time
import matplotlib.pyplot as plt
import re
import math
import numpy as np
from gensim.models import Word2Vec
import pickle
import threading
import sentencepiece as spm


class DenseBlock(torch.nn.Module):
    """Two-layer feed-forward block with an ELU in between."""
    def __init__(self, dim, mul=1):
        super().__init__()
        self.I = torch.nn.Linear(dim, dim * mul)
        self.O = torch.nn.Linear(dim * mul, dim)

    def forward(self, x):
        x = self.I(x)
        x = torch.nn.functional.elu(x)
        x = self.O(x)
        return x


class AttentionBlock(torch.nn.Module):
    """Element-wise attention: softmax(q * k) gates v (no dot product over a sequence axis)."""
    def __init__(self, dim, mul=1):
        super().__init__()
        self.Q = torch.nn.Linear(dim, dim * mul)
        self.K = torch.nn.Linear(dim, dim * mul)
        self.V = torch.nn.Linear(dim, dim * mul)
        self.O = torch.nn.Linear(dim * mul, dim)

    def forward(self, q, k, v):
        q = self.Q(q)
        k = self.K(k)
        v = self.V(v)
        x = torch.nn.functional.softmax(q * k, dim=-1) * v
        x = self.O(x)
        return x


"""
class AttentionBlock(torch.nn.Module):
    def __init__(self, dim, mul=1):
        super().__init__()
        self.attn = torch.nn.MultiheadAttention(dim, 16, batch_first=True)

    def forward(self, q, k, v):
        x = self.attn(q, k, v)[0]
        return x
"""


class SanokaLayer(torch.nn.Module):
    """Recurrent block: keeps a running state self.x that the current input attends to."""
    def __init__(self, dim, mul=1):
        super().__init__()
        self.x = None
        self.A = AttentionBlock(dim, mul)
        self.B = DenseBlock(dim, mul)

    def reset(self, x=None):
        self.x = x

    def forward(self, u):
        if self.x is not None:
            uu = torch.nn.functional.normalize(u)
            xx = torch.nn.functional.normalize(self.x)
            x = self.A(uu, xx, xx)
            y = self.B(torch.nn.functional.normalize(x)) + u
            self.x = x + self.x
            return y
        else:
            # First timestep of a sequence: the input attends to itself and seeds the state.
            uu = torch.nn.functional.normalize(u)
            x = self.A(uu, uu, uu)
            y = self.B(torch.nn.functional.normalize(x)) + u
            self.x = x
            return y


class SanokaModel(torch.nn.Module):
    """Stack of six SanokaLayers; the Top stage also projects 128-dim word2vec vectors to dim."""
    def __init__(self, dim, mul=1, Top=True):
        super().__init__()
        self.Top = Top
        if Top:
            self.I = torch.nn.Linear(128, dim)
        self.A = SanokaLayer(dim, mul)
        self.B = SanokaLayer(dim, mul)
        self.C = SanokaLayer(dim, mul)
        self.D = SanokaLayer(dim, mul)
        self.E = SanokaLayer(dim, mul)
        self.F = SanokaLayer(dim, mul)

    def reset(self):
        self.A.reset()
        self.B.reset()
        self.C.reset()
        self.D.reset()
        self.E.reset()
        self.F.reset()

    def forward(self, x):
        if self.Top:
            x = self.I(x)
        x = self.A(x)
        x = self.B(x)
        x = self.C(x)
        x = self.D(x)
        x = self.E(x)
        x = self.F(x)
        return x


class OutputLayer(torch.nn.Module):
    """Linear projection from the hidden dimension to the vocabulary logits."""
    def __init__(self, hiddendim, worddim=59000, heads=4):
        super().__init__()
        self.H = torch.nn.Linear(hiddendim, worddim)

    def forward(self, inpute):
        x = inpute
        x = self.H(x)
        return x


def GOILOAD():
    """Load the vocabulary table written by W2VMake and return (word -> id, id -> word)."""
    fuf = open("table.txt", "r", encoding="UTF-8")
    goi = fuf.read().split("\n")
    fuf.close()
    chardim = len(goi[1:])
    charid = {goi[i + 1].split()[0]: i for i in range(chardim - 1)}
    return charid, [goi[ia + 1].split()[0] for ia in range(chardim - 1)]


datas = []
trues = []
lens = []
dones = 0


def Convert(buns, table, maxlen=256):
    """Tokenize each line, look up word2vec vectors and target ids, and pickle the result."""
    buns = buns.split("\n")
    sp = spm.SentencePieceProcessor()
    sp.Load("tokenizer.model")
    w2v = Word2Vec.load("word2vec.model")
    data = []
    true = []
    lena = []
    for datac in range(len(buns)):
        #print(datac)
        #print(buns[datac])
        error = False
        try:
            buna = sp.EncodeAsPieces(buns[datac])[:maxlen]
            a = torch.from_numpy(w2v.wv[buna])
            b = torch.tensor([table[buna[ii]] for ii in range(len(buna))])
            ll = len(buna)
            c = ll
        except:
            print("ERROR")
        else:
            data.append(a)
            true.append(b)
            lena.append(c)
        print(datac)
    f = open("Train_Data.bin", "wb")
    pickle.dump((data, true, lena), f)
    f.close()
    return


def SPMake():
    """Train the SentencePiece tokenizer used by every later stage."""
    spm.SentencePieceTrainer.Train("--input=train_data.txt --model_prefix=tokenizer "
                                   "--vocab_size=20000 --train_extremely_large_corpus=True")
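
# Illustrative sketch (a hypothetical helper, not part of the original script and never
# called): the Sanoka layers are stateful, so a sequence is fed one timestep at a time and
# reset() clears the carried state before a new sequence. Shapes follow the 128-dim
# word2vec inputs assumed throughout this file.
def _state_demo(dim=512, steps=10):
    model = SanokaModel(dim, 2, True)   # Top=True: project 128-dim token vectors to dim
    model.reset()                       # clear recurrent state before a new sequence
    seq = torch.randn(1, steps, 128)    # (batch, time, embedding)
    for t in range(steps):
        h = model(seq[:, t])            # one timestep; each layer updates its self.x
    return h                            # final hidden state, shape (1, dim)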
def W2VMake(filepath="train_data.txt", mincount=50, worker=60):
    """Tokenize the corpus with SentencePiece and train the 128-dim word2vec embeddings."""
    sp = spm.SentencePieceProcessor()
    sp.Load("tokenizer.model")
    f = open(filepath, mode="r", encoding="UTF-8")
    texts = f.read().split("\n")
    f.close()
    dat = []
    print(len(texts))
    for a in range(len(texts)):
        dat.append(sp.EncodeAsPieces(texts[a]))
        print(a)
    model = Word2Vec(sentences=dat, vector_size=128, window=100, min_count=mincount, workers=worker)
    model.save("word2vec.model")
    model.wv.save_word2vec_format('table.txt')


def DataMake(filepath="train_data.txt", maxlen=129):
    """Build Train_Data.bin from the raw corpus using the vocabulary table."""
    table, i2w = GOILOAD()
    print(len(table))
    time.sleep(1)
    f = open(filepath, mode="r", encoding="UTF-8")
    txt = f.read()
    f.close()
    Convert(txt, table)
    return None


def PreTrain(Load=False, dim=512, outputdim=40000, lr=1e-04, epoch=10, epochload=1000,
             usedata=480000, onestep=100, uselen=64):
    global datas
    global trues
    global lens
    torch.manual_seed(1293431)
    #torch.manual_seed(576765)
    device1 = torch.device("cuda:0")
    device2 = torch.device("cuda:1")
    device3 = torch.device("cuda:2")
    device4 = torch.device("cuda:3")
    device5 = torch.device("cuda:4")
    device6 = torch.device("cuda:5")
    device7 = torch.device("cuda:6")
    lossf = torch.nn.CrossEntropyLoss()
    # The six backbone stages are pipelined across one GPU each, with the output head on a seventh.
    model1 = SanokaModel(dim, 2, True).to(torch.bfloat16).to(device1)
    model2 = SanokaModel(dim, 2, False).to(torch.bfloat16).to(device2)
    model3 = SanokaModel(dim, 2, False).to(torch.bfloat16).to(device3)
    model4 = SanokaModel(dim, 2, False).to(torch.bfloat16).to(device4)
    model5 = SanokaModel(dim, 2, False).to(torch.bfloat16).to(device5)
    model6 = SanokaModel(dim, 2, False).to(torch.bfloat16).to(device6)
    output = OutputLayer(dim, outputdim).to(torch.bfloat16).to(device7)
    if Load:
        model1.load_state_dict(torch.load("LLM1.pth", map_location=device1))
        model2.load_state_dict(torch.load("LLM2.pth", map_location=device2))
        model3.load_state_dict(torch.load("LLM3.pth", map_location=device3))
        model4.load_state_dict(torch.load("LLM4.pth", map_location=device4))
        model5.load_state_dict(torch.load("LLM5.pth", map_location=device5))
        model6.load_state_dict(torch.load("LLM6.pth", map_location=device6))
        output.load_state_dict(torch.load("output.pth", map_location=device7))
    model1Optim = torch.optim.Adam(model1.parameters(), lr=lr)
    model2Optim = torch.optim.Adam(model2.parameters(), lr=lr)
    model3Optim = torch.optim.Adam(model3.parameters(), lr=lr)
    model4Optim = torch.optim.Adam(model4.parameters(), lr=lr)
    model5Optim = torch.optim.Adam(model5.parameters(), lr=lr)
    model6Optim = torch.optim.Adam(model6.parameters(), lr=lr)
    outputO = torch.optim.Adam(output.parameters(), lr=lr)
    f = open("Train_Data.bin", "rb")
    datas, trues, lens = pickle.load(f)
    f.close()
    train_x = torch.zeros((epochload, uselen, 128)).to(torch.bfloat16).to(device1)
    train_y = torch.full((epochload, uselen), outputdim - 1, dtype=torch.long).to(device7)
    table, i2w = GOILOAD()
    base = 0
    epoch = int(np.floor((len(datas) / epochload) * epoch))
    print("data size", len(datas))
    for epochs in range(epoch):
        train_x = train_x.detach()
        train_y = train_y.detach()
        if base < len(datas) - epochload * 2:
            base += epochload
        else:
            base = 0
        if base > usedata:
            base = 0
        # Stage the next epochload sequences into the fixed-size input/target buffers.
        for b in range(epochload):
            a = b + base
            leng = lens[a]
            if leng > uselen:
                leng = uselen
            train_x[b, :datas[a].shape[0]] = datas[a].to(torch.bfloat16).to(device1)[:uselen]
            train_y[b, :trues[a].shape[0]] = trues[a].to(device7).to(torch.long)[:uselen]
        epls = 0.00
        timem = time.time()
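        # The model is recurrent: each chunk of onestep sequences is reset, then fed one
        # timestep at a time; the next-token cross-entropy is summed over the window and
        # backpropagated once per chunk (one optimizer step per window, not per token).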
        for steps in range(epochload // onestep):
            model1.reset()
            model2.reset()
            model3.reset()
            model4.reset()
            model5.reset()
            model6.reset()
            oa = ""
            model1Optim.zero_grad()
            model2Optim.zero_grad()
            model3Optim.zero_grad()
            model4Optim.zero_grad()
            model5Optim.zero_grad()
            model6Optim.zero_grad()
            outputO.zero_grad()
            loss = 0.00
            for b in range(uselen - 1):
                out = model1(train_x[steps * onestep:steps * onestep + onestep, b])
                out = model2(out.to(device2))
                out = model3(out.to(device3))
                out = model4(out.to(device4))
                out = model5(out.to(device5))
                out = model6(out.to(device6))
                out = output(out.to(device7))
                loss += lossf(out, train_y[steps * onestep:steps * onestep + onestep, b + 1])
                epls += loss.item()  # accumulate as a float so no autograd graph is retained
                sfo = torch.nn.functional.softmax(out[0], dim=-1)
                wid = torch.argmax(sfo, dim=-1).item()
                try:
                    wd = i2w[wid]
                except:
                    oa = oa + "ERROR"
                else:
                    oa = oa + wd
            loss.backward()
            #print(b)
            model1Optim.step()
            model2Optim.step()
            model3Optim.step()
            model4Optim.step()
            model5Optim.step()
            model6Optim.step()
            outputO.step()
            print("sample output> ", oa[:32].replace("?", ""))
        print("epoch", epochs, "Train_epoch_sum_loss", epls, "time", time.time() - timem)
        if epochs % 10 == 9:
            torch.save(model1.state_dict(), "LLM1.pth")
            torch.save(model2.state_dict(), "LLM2.pth")
            torch.save(model3.state_dict(), "LLM3.pth")
            torch.save(model4.state_dict(), "LLM4.pth")
            torch.save(model5.state_dict(), "LLM5.pth")
            torch.save(model6.state_dict(), "LLM6.pth")
            torch.save(output.state_dict(), "output.pth")


def Fineturning(Load=False, dim=512, outputdim=40000, lr=1e-04, epoch=10000, epochload=1000,
                onestep=200, uselen=32):
    global datas
    global trues
    global lens
    torch.manual_seed(1293431)
    #torch.manual_seed(576765)
    device1 = torch.device("cuda:0")
    device2 = torch.device("cuda:1")
    device3 = torch.device("cuda:2")
    device4 = torch.device("cuda:3")
    device5 = torch.device("cuda:4")
    device6 = torch.device("cuda:5")
    device7 = torch.device("cuda:6")
    lossf = torch.nn.CrossEntropyLoss()
    model1 = SanokaModel(dim, 2, True).to(torch.bfloat16).to(device1)
    model2 = SanokaModel(dim, 2, False).to(torch.bfloat16).to(device2)
    model3 = SanokaModel(dim, 2, False).to(torch.bfloat16).to(device3)
    model4 = SanokaModel(dim, 2, False).to(torch.bfloat16).to(device4)
    model5 = SanokaModel(dim, 2, False).to(torch.bfloat16).to(device5)
    model6 = SanokaModel(dim, 2, False).to(torch.bfloat16).to(device6)
    output = OutputLayer(dim, outputdim).to(torch.bfloat16).to(device7)
    model1.load_state_dict(torch.load("LLM1.pth", map_location=device1))
    model2.load_state_dict(torch.load("LLM2.pth", map_location=device2))
    model3.load_state_dict(torch.load("LLM3.pth", map_location=device3))
    model4.load_state_dict(torch.load("LLM4.pth", map_location=device4))
    model5.load_state_dict(torch.load("LLM5.pth", map_location=device5))
    model6.load_state_dict(torch.load("LLM6.pth", map_location=device6))
    output.load_state_dict(torch.load("output.pth", map_location=device7))
    model1Optim = torch.optim.Adam(model1.parameters(), lr=lr)
    model2Optim = torch.optim.Adam(model2.parameters(), lr=lr)
    model3Optim = torch.optim.Adam(model3.parameters(), lr=lr)
    model4Optim = torch.optim.Adam(model4.parameters(), lr=lr)
    model5Optim = torch.optim.Adam(model5.parameters(), lr=lr)
    model6Optim = torch.optim.Adam(model6.parameters(), lr=lr / 500)
    outputO = torch.optim.Adam(output.parameters(), lr=lr)
    f = open("Train_Data.bin", "rb")
    datas, trues, lens = pickle.load(f)
    f.close()
    train_x = torch.zeros((epochload, uselen, 128)).to(torch.bfloat16).to(device1)
    train_y = torch.full((epochload, uselen), outputdim - 1, dtype=torch.long).to(device7)
    table, i2w = GOILOAD()
    base = 0
    epoch = int(np.floor((len(datas) / epochload) * epoch))
    #print(epoch)
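    # Fine-tuning loop: the six backbone stages run under torch.no_grad() and stay frozen;
    # only the output head is updated (model6Optim is created at lr/500 but its step() call
    # is left commented out below).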
    for epochs in range(epoch):
        train_x = train_x.detach()
        train_y = train_y.detach()
        if base < len(datas) - epochload * 2:
            base += epochload
        else:
            base = 0
        for b in range(epochload):
            a = b + base
            #print(a)
            leng = lens[a]
            if leng > uselen:
                leng = uselen
            train_x[b, :datas[a].shape[0]] = datas[a].to(torch.bfloat16).to(device1)[:uselen]
            train_y[b, :trues[a].shape[0]] = trues[a].to(device7).to(torch.long)[:uselen]
        epls = 0.00
        timem = time.time()
        for steps in range(epochload // onestep):
            model1.reset()
            model2.reset()
            model3.reset()
            model4.reset()
            model5.reset()
            model6.reset()
            oa = ""
            loss = 0.00
            model1Optim.zero_grad()
            model2Optim.zero_grad()
            model3Optim.zero_grad()
            model4Optim.zero_grad()
            model5Optim.zero_grad()
            model6Optim.zero_grad()
            outputO.zero_grad()
            for b in range(uselen - 1):
                with torch.no_grad():
                    # Frozen backbone: no gradients flow into the six Sanoka stages.
                    out = model1(train_x[steps * onestep:steps * onestep + onestep, b])
                    out = model2(out.to(device2))
                    out = model3(out.to(device3))
                    out = model4(out.to(device4))
                    out = model5(out.to(device5))
                    out = model6(out.to(device6))
                out = output(out.to(device7))
                loss += lossf(out, train_y[steps * onestep:steps * onestep + onestep, b + 1])
                epls += loss.item()
                sfo = torch.nn.functional.softmax(out[0], dim=-1)
                wid = torch.argmax(sfo, dim=-1).item()
                try:
                    wd = i2w[wid]
                except:
                    oa = oa + "ERROR"
                else:
                    oa = oa + wd
            loss.backward()
            #model6Optim.step()
            outputO.step()
            print("sample output> ", oa[:32].replace("?", ""))
        print("epoch", epochs, "Train_epoch_sum_loss", epls, "time", time.time() - timem)
        if epochs % 10 == 9:
            #torch.save(model6.state_dict(), "LLM6F.pth")
            torch.save(output.state_dict(), "fineturning.pth")


def Predict(dim=512, outputdim=40000, maxlen=32):
    """Interactive greedy generation with the pre-trained backbone and fine-tuned output head."""
    torch.manual_seed(1293431)
    table, i2w = GOILOAD()
    sp = spm.SentencePieceProcessor()
    sp.Load("tokenizer.model")
    w2v = Word2Vec.load("word2vec.model")
    device1 = torch.device("cuda:0")
    device2 = torch.device("cuda:1")
    device3 = torch.device("cuda:2")
    device4 = torch.device("cuda:3")
    device5 = torch.device("cuda:4")
    device6 = torch.device("cuda:5")
    device7 = torch.device("cuda:6")
    lossf = torch.nn.CrossEntropyLoss()
    model1 = SanokaModel(dim, 2, True).to(torch.bfloat16).to(device1)
    model2 = SanokaModel(dim, 2, False).to(torch.bfloat16).to(device2)
    model3 = SanokaModel(dim, 2, False).to(torch.bfloat16).to(device3)
    model4 = SanokaModel(dim, 2, False).to(torch.bfloat16).to(device4)
    model5 = SanokaModel(dim, 2, False).to(torch.bfloat16).to(device5)
    model6 = SanokaModel(dim, 2, False).to(torch.bfloat16).to(device6)
    output = OutputLayer(dim, outputdim).to(torch.bfloat16).to(device7)
    model1.load_state_dict(torch.load("LLM1.pth", map_location=device1))
    model2.load_state_dict(torch.load("LLM2.pth", map_location=device2))
    model3.load_state_dict(torch.load("LLM3.pth", map_location=device3))
    model4.load_state_dict(torch.load("LLM4.pth", map_location=device4))
    model5.load_state_dict(torch.load("LLM5.pth", map_location=device5))
    model6.load_state_dict(torch.load("LLM6.pth", map_location=device6))
    output.load_state_dict(torch.load("fineturning.pth", map_location=device7))
    while True:
        dd = input("Q> ")  # + ","
        data = []
        buna = sp.EncodeAsPieces(dd)
        print(buna)
        for a in range(len(buna)):
            try:
                data.append(torch.from_numpy(w2v.wv[buna[a]]).view(1, 1, 128).to(device1))
            except KeyError:
                print("Not Found")
        dat = torch.cat(data, dim=1).to(device1)
        oa = ""
        with torch.no_grad():
            model1.reset()
            model2.reset()
            model3.reset()
            model4.reset()
            model5.reset()
            model6.reset()
            oa = ""
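            # Two phases: first feed the prompt tokens to build up the recurrent state,
            # then generate greedily, feeding back the embedding of each predicted token.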
            for a in range(dat.shape[1] - 1):
                out = model1(dat[:, a].to(torch.bfloat16))
                out = model2(out.to(device2))
                out = model3(out.to(device3))
                out = model4(out.to(device4))
                out = model5(out.to(device5))
                out = model6(out.to(device6))
                out = output(out.to(device7))
            for b in range(maxlen - dat.shape[1]):
                out = model1(dat[:, -1].to(torch.bfloat16))
                out = model2(out.to(device2))
                out = model3(out.to(device3))
                out = model4(out.to(device4))
                out = model5(out.to(device5))
                out = model6(out.to(device6))
                out = output(out.to(device7))
                sfo = torch.nn.functional.softmax(out, dim=-1)
                wid = torch.argmax(sfo, dim=-1).item()
                if wid != outputdim - 1:
                    try:
                        wd = i2w[wid]
                    except:
                        oa = oa + "ERROR"
                    else:
                        oa = oa + wd
                        dat = torch.cat([dat, torch.from_numpy(w2v.wv[wd]).to(device1).view(1, 1, 128)], dim=1)
        print("A> ", oa.replace("?", ""))


def ValidationLoss(dim=512, outputdim=40000, maxlen=32):
    """Compute a next-token cross-entropy on one interactively supplied sentence."""
    torch.manual_seed(1293431)
    table, i2w = GOILOAD()
    tagger = MeCab.Tagger("-Owakati")
    w2v = Word2Vec.load("word2vec.model")
    device1 = torch.device("cuda:0")
    device2 = torch.device("cuda:1")
    device3 = torch.device("cuda:2")
    device4 = torch.device("cuda:3")
    device5 = torch.device("cuda:4")
    device6 = torch.device("cuda:5")
    device7 = torch.device("cuda:6")
    lossf = torch.nn.CrossEntropyLoss()
    model1 = SanokaModel(dim, 2, True).to(torch.bfloat16).to(device1)
    model2 = SanokaModel(dim, 2, False).to(torch.bfloat16).to(device2)
    model3 = SanokaModel(dim, 2, False).to(torch.bfloat16).to(device3)
    model4 = SanokaModel(dim, 2, False).to(torch.bfloat16).to(device4)
    model5 = SanokaModel(dim, 2, False).to(torch.bfloat16).to(device5)
    model6 = SanokaModel(dim, 2, False).to(torch.bfloat16).to(device6)
    output = OutputLayer(dim, outputdim).to(torch.bfloat16).to(device7)
    model1.load_state_dict(torch.load("LLM1.pth", map_location=device1))
    model2.load_state_dict(torch.load("LLM2.pth", map_location=device2))
    model3.load_state_dict(torch.load("LLM3.pth", map_location=device3))
    model4.load_state_dict(torch.load("LLM4.pth", map_location=device4))
    model5.load_state_dict(torch.load("LLM5.pth", map_location=device5))
    model6.load_state_dict(torch.load("LLM6.pth", map_location=device6))
    output.load_state_dict(torch.load("output.pth", map_location=device7))
    dd = input("TestData> ")
    lossf = torch.nn.CrossEntropyLoss()
    data = []
    buna = tagger.parse(dd).split()
    trued = torch.tensor([table[dfg] for dfg in buna]).to(torch.long).unsqueeze(dim=0)
    print(buna)
    print(trued)
    for a in range(len(buna)):
        try:
            data.append(torch.from_numpy(w2v.wv[buna[a]]).view(1, 1, 128).to(device1))
        except KeyError:
            print("Not Found")
    dat = torch.cat(data, dim=1).to(device1)
    oa = ""
    loss = 0.00
    with torch.no_grad():
        model1.reset()
        model2.reset()
        model3.reset()
        model4.reset()
        model5.reset()
        model6.reset()
        oa = ""
        for a in range(dat.shape[1] - 1):
            out = model1(dat[:, a].to(torch.bfloat16))  # cast to match the bfloat16 weights
            out = model2(out.to(device2))
            out = model3(out.to(device3))
            out = model4(out.to(device4))
            out = model5(out.to(device5))
            out = model6(out.to(device6))
            out = output(out.to(device7))
            sfo = torch.nn.functional.softmax(out, dim=-1)
            wid = torch.argmax(sfo, dim=-1).item()
            try:
                wd = i2w[wid]
            except:
                oa = oa + "ERROR"
            else:
                oa = oa + wd
            loss += lossf(out, trued[:, a + 1].to(device7))  # target moved to the logits' device
    print("validationloss", loss.item() / dat.shape[1], "preview", oa)


if __name__ == "__main__":
    pass
    #DataMake()
    #Fineturning(Load=False, dim=2048, outputdim=21000, lr=1e-03, onestep=300, uselen=128)
    #Predict(dim=2048, outputdim=21000, maxlen=128)
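
# Assumed end-to-end order of the pipeline stages (an illustrative sketch, never called;
# the order is inferred from the file dependencies: tokenizer.model -> word2vec.model and
# table.txt -> Train_Data.bin -> LLM*.pth and output.pth -> fineturning.pth):
def _pipeline_sketch():
    SPMake()              # train the SentencePiece tokenizer -> tokenizer.model
    W2VMake()             # build the 128-dim word2vec embeddings -> word2vec.model, table.txt
    DataMake()            # tokenize and embed the corpus -> Train_Data.bin
    PreTrain(Load=False)  # train the six backbone stages and the output head -> LLM*.pth, output.pth
    Fineturning()         # tune only the output head -> fineturning.pth
    Predict()             # interactive greedy generation with the fine-tuned head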