# -*- coding: utf-8 -*-
"""
Created on Thu Mar 21 10:34:46 2024
@author: takan
"""
import MeCab
import torch
import time
import numpy as np
from gensim.models import Word2Vec
import pickle
import sentencepiece as spm
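# Overall pipeline, as laid out by the functions below: SPMake trains a
# SentencePiece tokenizer, W2VMake trains 128-dim Word2Vec embeddings,
# DataMake/Convert serialize the corpus to Train_Data.bin, PreTrain trains
# a six-stage recurrent model pipeline-parallel across cuda:0..cuda:6,
# Fineturning retrains only the output head, and Predict/ValidationLoss
# run inference against the saved checkpoints.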
class DenseBlock(torch.nn.Module):
    def __init__(self, dim, mul=1):
        super().__init__()
        self.I = torch.nn.Linear(dim, dim * mul)
        self.O = torch.nn.Linear(dim * mul, dim)

    def forward(self, x):
        x = self.I(x)
        x = torch.nn.functional.elu(x)
        x = self.O(x)
        return x
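# Minimal shape sketch for DenseBlock (hypothetical sizes, not values the
# training code pins down):
#   block = DenseBlock(512, mul=2)
#   y = block(torch.randn(8, 512))  # (8, 512) -> 1024-wide ELU hidden -> (8, 512)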
class AttentionBlock(torch.nn.Module):
    def __init__(self, dim, mul=1):
        super().__init__()
        self.Q = torch.nn.Linear(dim, dim * mul)
        self.K = torch.nn.Linear(dim, dim * mul)
        self.V = torch.nn.Linear(dim, dim * mul)
        self.O = torch.nn.Linear(dim * mul, dim)

    def forward(self, q, k, v):
        q = self.Q(q)
        k = self.K(k)
        v = self.V(v)
        x = torch.nn.functional.softmax(q * k, dim=-1) * v
        x = self.O(x)
        return x
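# Note: this is element-wise gating, not scaled dot-product attention:
# y = O(softmax(Q(q) * K(k), dim=-1) * V(v)), where * is the Hadamard
# product and the softmax runs over the feature dimension. A standard
# torch.nn.MultiheadAttention variant is kept commented out below.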
"""
class AttentionBlock(torch.nn.Module):
def __init__(self, dim, mul=1):
super().__init__()
self.attn = torch.nn.MultiheadAttention(dim, 16, batch_first=True)
def forward(self, q,k,v):
x = self.attn(q, k, v)[0]
return x
"""
class SanokaLayer(torch.nn.Module):
    def __init__(self, dim, mul=1):
        super().__init__()
        self.x = None
        self.A = AttentionBlock(dim, mul)
        self.B = DenseBlock(dim, mul)

    def reset(self, x=None):
        self.x = x

    def forward(self, u):
        if self.x is not None:
            uu = torch.nn.functional.normalize(u)
            xx = torch.nn.functional.normalize(self.x)
            x = self.A(uu, xx, xx)
            y = self.B(torch.nn.functional.normalize(x)) + u
            self.x = x + self.x
            return y
        else:
            uu = torch.nn.functional.normalize(u)
            x = self.A(uu, uu, uu)
            y = self.B(torch.nn.functional.normalize(x)) + u
            self.x = x
            return y
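# SanokaLayer is stateful: self.x accumulates the attention output across
# forward() calls, acting as a running memory over time steps, so reset()
# must be called before each new sequence. The first call self-attends over
# the input alone; later calls attend the input against the accumulated
# state, with a residual connection back to the input.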
class SanokaModel(torch.nn.Module):
    def __init__(self, dim, mul=1, Top=True):
        super().__init__()
        self.Top = Top
        if Top:
            self.I = torch.nn.Linear(128, dim)
        self.A = SanokaLayer(dim, mul)
        self.B = SanokaLayer(dim, mul)
        self.C = SanokaLayer(dim, mul)
        self.D = SanokaLayer(dim, mul)
        self.E = SanokaLayer(dim, mul)
        self.F = SanokaLayer(dim, mul)

    def reset(self):
        self.A.reset()
        self.B.reset()
        self.C.reset()
        self.D.reset()
        self.E.reset()
        self.F.reset()

    def forward(self, x):
        if self.Top:
            x = self.I(x)
        x = self.A(x)
        x = self.B(x)
        x = self.C(x)
        x = self.D(x)
        x = self.E(x)
        x = self.F(x)
        return x
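# SanokaModel stacks six SanokaLayers; Top=True adds a 128 -> dim input
# projection, where 128 is the Word2Vec vector size produced by W2VMake.
# The training and inference code below chains six SanokaModel instances,
# one per GPU, so the full network is 36 layers deep.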
class OutputLayer(torch.nn.Module):
    def __init__(self, hiddendim, worddim=59000, heads=4):
        super().__init__()
        self.H = torch.nn.Linear(hiddendim, worddim)

    def forward(self, inpute):
        x = inpute
        x = self.H(x)
        return x
def GOILOAD():
    fuf = open("table.txt", "r", encoding="UTF-8")
    goi = fuf.read().split("\n")
    fuf.close()
    chardim = len(goi[1:])
    charid = {goi[i + 1].split()[0]: i for i in range(chardim - 1)}
    return charid, [goi[ia + 1].split()[0] for ia in range(chardim - 1)]
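# table.txt is the word2vec text format written by W2VMake via
# model.wv.save_word2vec_format: a "count dim" header line followed by
# "word v1 v2 ..." lines. GOILOAD keeps only the words, returning a
# word -> row-index dict and the index -> word list.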
datas = []
trues = []
lens = []
dones = 0
def Convert(buns, table, maxlen=256):
    buns = buns.split("\n")
    sp = spm.SentencePieceProcessor()
    sp.Load("tokenizer.model")
    w2v = Word2Vec.load("word2vec.model")
    data = []
    true = []
    lena = []
    for datac in range(len(buns)):
        #print(datac)
        #print(buns[datac])
        try:
            buna = sp.EncodeAsPieces(buns[datac])[:maxlen]
            a = torch.from_numpy(w2v.wv[buna])
            b = torch.tensor([table[buna[ii]] for ii in range(len(buna))])
            ll = len(buna)
            c = ll
        except Exception:
            print("ERROR")
        else:
            data.append(a)
            true.append(b)
            lena.append(c)
        print(datac)
    f = open("Train_Data.bin", "wb")
    pickle.dump((data, true, lena), f)
    f.close()
    return
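# Train_Data.bin is a pickled (data, true, lena) tuple: per-sentence
# Word2Vec embedding tensors of shape (tokens, 128), the matching token-id
# tensors used as cross-entropy targets, and the sentence lengths.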
def SPMake():
    spm.SentencePieceTrainer.Train("--input=train_data.txt --model_prefix=tokenizer --vocab_size=20000 --train_extremely_large_corpus=True")
def W2VMake(filepath="train_data.txt", mincount=50, worker=60):
    sp = spm.SentencePieceProcessor()
    sp.Load("tokenizer.model")
    f = open(filepath, mode="r", encoding="UTF-8")
    texts = f.read().split("\n")
    f.close()
    dat = []
    print(len(texts))
    for a in range(len(texts)):
        dat.append(sp.EncodeAsPieces(texts[a]))
        print(a)
    model = Word2Vec(sentences=dat, vector_size=128, window=100, min_count=mincount, workers=worker)
    model.save("word2vec.model")
    model.wv.save_word2vec_format('table.txt')
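# vector_size=128 here must match the 128-wide input projection in
# SanokaModel (Top=True) and the (..., 128) training tensors built in
# PreTrain and Fineturning.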
def DataMake(filepath="train_data.txt", maxlen=129):
    table, i2w = GOILOAD()
    print(len(table))
    time.sleep(1)
    f = open(filepath, mode="r", encoding="UTF-8")
    txt = f.read()
    f.close()
    Convert(txt, table, maxlen)
    return None
def PreTrain(Load=False, dim=512, outputdim=40000, lr=1e-04, epoch=10, epochload=1000, usedata=480000, onestep=100, uselen=64):
    global datas
    global trues
    global lens
    torch.manual_seed(1293431)
    #torch.manual_seed(576765)
    device1 = torch.device("cuda:0")
    device2 = torch.device("cuda:1")
    device3 = torch.device("cuda:2")
    device4 = torch.device("cuda:3")
    device5 = torch.device("cuda:4")
    device6 = torch.device("cuda:5")
    device7 = torch.device("cuda:6")
    lossf = torch.nn.CrossEntropyLoss()
    model1 = SanokaModel(dim, 2, True).to(torch.bfloat16).to(device1)
    model2 = SanokaModel(dim, 2, False).to(torch.bfloat16).to(device2)
    model3 = SanokaModel(dim, 2, False).to(torch.bfloat16).to(device3)
    model4 = SanokaModel(dim, 2, False).to(torch.bfloat16).to(device4)
    model5 = SanokaModel(dim, 2, False).to(torch.bfloat16).to(device5)
    model6 = SanokaModel(dim, 2, False).to(torch.bfloat16).to(device6)
    output = OutputLayer(dim, outputdim).to(torch.bfloat16).to(device7)
    if Load:
        model1.load_state_dict(torch.load("LLM1.pth", map_location=device1))
        model2.load_state_dict(torch.load("LLM2.pth", map_location=device2))
        model3.load_state_dict(torch.load("LLM3.pth", map_location=device3))
        model4.load_state_dict(torch.load("LLM4.pth", map_location=device4))
        model5.load_state_dict(torch.load("LLM5.pth", map_location=device5))
        model6.load_state_dict(torch.load("LLM6.pth", map_location=device6))
        output.load_state_dict(torch.load("output.pth", map_location=device7))
    model1Optim = torch.optim.Adam(model1.parameters(), lr=lr)
    model2Optim = torch.optim.Adam(model2.parameters(), lr=lr)
    model3Optim = torch.optim.Adam(model3.parameters(), lr=lr)
    model4Optim = torch.optim.Adam(model4.parameters(), lr=lr)
    model5Optim = torch.optim.Adam(model5.parameters(), lr=lr)
    model6Optim = torch.optim.Adam(model6.parameters(), lr=lr)
    outputO = torch.optim.Adam(output.parameters(), lr=lr)
    f = open("Train_Data.bin", "rb")
    datas, trues, lens = pickle.load(f)
    f.close()
    train_x = torch.zeros((epochload, uselen, 128)).to(torch.bfloat16).to(device1)
    train_y = torch.full((epochload, uselen), outputdim - 1, dtype=torch.long).to(device7)
    table, i2w = GOILOAD()
    base = 0
    epoch = int(np.floor((len(datas) / epochload) * epoch))
    print("data size", len(datas))
    for epochs in range(epoch):
        train_x = train_x.detach()
        train_y = train_y.detach()
        if base < len(datas) - epochload * 2:
            base += epochload
        else:
            base = 0
        if base > usedata:
            base = 0
        for b in range(epochload):
            a = b + base
            leng = lens[a]
            if leng > uselen:
                leng = uselen
            train_x[b, :datas[a].shape[0]] = datas[a].to(torch.bfloat16).to(device1)[:uselen]
            train_y[b, :trues[a].shape[0]] = trues[a].to(device7).to(torch.long)[:uselen]
        epls = 0.00
        timem = time.time()
        for steps in range(epochload // onestep):
            model1.reset()
            model2.reset()
            model3.reset()
            model4.reset()
            model5.reset()
            model6.reset()
            oa = ""
            model1Optim.zero_grad()
            model2Optim.zero_grad()
            model3Optim.zero_grad()
            model4Optim.zero_grad()
            model5Optim.zero_grad()
            model6Optim.zero_grad()
            outputO.zero_grad()
            loss = 0.00
            for b in range(uselen - 1):
                out = model1(train_x[steps * onestep:steps * onestep + onestep, b])
                out = model2(out.to(device2))
                out = model3(out.to(device3))
                out = model4(out.to(device4))
                out = model5(out.to(device5))
                out = model6(out.to(device6))
                out = output(out.to(device7))
                loss += lossf(out, train_y[steps * onestep:steps * onestep + onestep, b + 1])
                epls += loss.item()  # .item() so the logging sum does not retain the graph
                sfo = torch.nn.functional.softmax(out[0], dim=-1)
                wid = torch.argmax(sfo, dim=-1).item()
                try:
                    wd = i2w[wid]
                except IndexError:
                    oa = oa + "ERROR"
                else:
                    oa = oa + wd
            loss.backward()
            #print(b)
            model1Optim.step()
            model2Optim.step()
            model3Optim.step()
            model4Optim.step()
            model5Optim.step()
            model6Optim.step()
            outputO.step()
        print("sample output> ", oa[:32].replace("?", ""))
        print("epoch", epochs, "Train_epoch_sum_loss", epls, "time", time.time() - timem)
        if epochs % 10 == 9:
            torch.save(model1.state_dict(), "LLM1.pth")
            torch.save(model2.state_dict(), "LLM2.pth")
            torch.save(model3.state_dict(), "LLM3.pth")
            torch.save(model4.state_dict(), "LLM4.pth")
            torch.save(model5.state_dict(), "LLM5.pth")
            torch.save(model6.state_dict(), "LLM6.pth")
            torch.save(output.state_dict(), "output.pth")
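# A plausible invocation, mirroring the (commented-out) __main__ settings at
# the bottom of the file; these hyperparameters are assumptions, not values
# the script itself pins down:
#   PreTrain(Load=False, dim=2048, outputdim=21000, lr=1e-04, uselen=128)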
def Fineturning(Load=False, dim=512, outputdim=40000, lr=1e-04, epoch=10000, epochload=1000, onestep=200, uselen=32):
    global datas
    global trues
    global lens
    torch.manual_seed(1293431)
    #torch.manual_seed(576765)
    device1 = torch.device("cuda:0")
    device2 = torch.device("cuda:1")
    device3 = torch.device("cuda:2")
    device4 = torch.device("cuda:3")
    device5 = torch.device("cuda:4")
    device6 = torch.device("cuda:5")
    device7 = torch.device("cuda:6")
    lossf = torch.nn.CrossEntropyLoss()
    model1 = SanokaModel(dim, 2, True).to(torch.bfloat16).to(device1)
    model2 = SanokaModel(dim, 2, False).to(torch.bfloat16).to(device2)
    model3 = SanokaModel(dim, 2, False).to(torch.bfloat16).to(device3)
    model4 = SanokaModel(dim, 2, False).to(torch.bfloat16).to(device4)
    model5 = SanokaModel(dim, 2, False).to(torch.bfloat16).to(device5)
    model6 = SanokaModel(dim, 2, False).to(torch.bfloat16).to(device6)
    output = OutputLayer(dim, outputdim).to(torch.bfloat16).to(device7)
    model1.load_state_dict(torch.load("LLM1.pth", map_location=device1))
    model2.load_state_dict(torch.load("LLM2.pth", map_location=device2))
    model3.load_state_dict(torch.load("LLM3.pth", map_location=device3))
    model4.load_state_dict(torch.load("LLM4.pth", map_location=device4))
    model5.load_state_dict(torch.load("LLM5.pth", map_location=device5))
    model6.load_state_dict(torch.load("LLM6.pth", map_location=device6))
    output.load_state_dict(torch.load("output.pth", map_location=device7))
    model1Optim = torch.optim.Adam(model1.parameters(), lr=lr)
    model2Optim = torch.optim.Adam(model2.parameters(), lr=lr)
    model3Optim = torch.optim.Adam(model3.parameters(), lr=lr)
    model4Optim = torch.optim.Adam(model4.parameters(), lr=lr)
    model5Optim = torch.optim.Adam(model5.parameters(), lr=lr)
    model6Optim = torch.optim.Adam(model6.parameters(), lr=lr/500)
    outputO = torch.optim.Adam(output.parameters(), lr=lr)
    f = open("Train_Data.bin", "rb")
    datas, trues, lens = pickle.load(f)
    f.close()
    train_x = torch.zeros((epochload, uselen, 128)).to(torch.bfloat16).to(device1)
    train_y = torch.full((epochload, uselen), outputdim - 1, dtype=torch.long).to(device7)
    table, i2w = GOILOAD()
    base = 0
    epoch = int(np.floor((len(datas) / epochload) * epoch))
    #print(epoch)
    for epochs in range(epoch):
        train_x = train_x.detach()
        train_y = train_y.detach()
        if base < len(datas) - epochload * 2:
            base += epochload
        else:
            base = 0
        for b in range(epochload):
            a = b + base
            #print(a)
            leng = lens[a]
            if leng > uselen:
                leng = uselen
            train_x[b, :datas[a].shape[0]] = datas[a].to(torch.bfloat16).to(device1)[:uselen]
            train_y[b, :trues[a].shape[0]] = trues[a].to(device7).to(torch.long)[:uselen]
        epls = 0.00
        timem = time.time()
        for steps in range(epochload // onestep):
            model1.reset()
            model2.reset()
            model3.reset()
            model4.reset()
            model5.reset()
            model6.reset()
            oa = ""
            loss = 0.00
            model1Optim.zero_grad()
            model2Optim.zero_grad()
            model3Optim.zero_grad()
            model4Optim.zero_grad()
            model5Optim.zero_grad()
            model6Optim.zero_grad()
            outputO.zero_grad()
            for b in range(uselen - 1):
                # The frozen trunk runs without gradients; only the output
                # head below builds a graph.
                with torch.no_grad():
                    out = model1(train_x[steps * onestep:steps * onestep + onestep, b])
                    out = model2(out.to(device2))
                    out = model3(out.to(device3))
                    out = model4(out.to(device4))
                    out = model5(out.to(device5))
                    out = model6(out.to(device6))
                out = output(out.to(device7))
                loss += lossf(out, train_y[steps * onestep:steps * onestep + onestep, b + 1])
                epls += loss.item()
                sfo = torch.nn.functional.softmax(out[0], dim=-1)
                wid = torch.argmax(sfo, dim=-1).item()
                try:
                    wd = i2w[wid]
                except IndexError:
                    oa = oa + "ERROR"
                else:
                    oa = oa + wd
            loss.backward()
            #model6Optim.step()
            outputO.step()
        print("sample output> ", oa[:32].replace("?", ""))
        print("epoch", epochs, "Train_epoch_sum_loss", epls, "time", time.time() - timem)
        if epochs % 10 == 9:
            #torch.save(model6.state_dict(), "LLM6F.pth")
            torch.save(output.state_dict(), "fineturning.pth")
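# Fineturning freezes the six-model trunk (its forward runs under
# torch.no_grad() and only outputO.step() is ever called; the trunk
# optimizers are created but left unused) and saves the retrained head as
# fineturning.pth, which Predict loads in place of output.pth.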
def Predict(dim=512, outputdim=40000, maxlen=32):
    torch.manual_seed(1293431)
    table, i2w = GOILOAD()
    sp = spm.SentencePieceProcessor()
    sp.Load("tokenizer.model")
    w2v = Word2Vec.load("word2vec.model")
    device1 = torch.device("cuda:0")
    device2 = torch.device("cuda:1")
    device3 = torch.device("cuda:2")
    device4 = torch.device("cuda:3")
    device5 = torch.device("cuda:4")
    device6 = torch.device("cuda:5")
    device7 = torch.device("cuda:6")
    lossf = torch.nn.CrossEntropyLoss()
    model1 = SanokaModel(dim, 2, True).to(torch.bfloat16).to(device1)
    model2 = SanokaModel(dim, 2, False).to(torch.bfloat16).to(device2)
    model3 = SanokaModel(dim, 2, False).to(torch.bfloat16).to(device3)
    model4 = SanokaModel(dim, 2, False).to(torch.bfloat16).to(device4)
    model5 = SanokaModel(dim, 2, False).to(torch.bfloat16).to(device5)
    model6 = SanokaModel(dim, 2, False).to(torch.bfloat16).to(device6)
    output = OutputLayer(dim, outputdim).to(torch.bfloat16).to(device7)
    model1.load_state_dict(torch.load("LLM1.pth", map_location=device1))
    model2.load_state_dict(torch.load("LLM2.pth", map_location=device2))
    model3.load_state_dict(torch.load("LLM3.pth", map_location=device3))
    model4.load_state_dict(torch.load("LLM4.pth", map_location=device4))
    model5.load_state_dict(torch.load("LLM5.pth", map_location=device5))
    model6.load_state_dict(torch.load("LLM6.pth", map_location=device6))
    output.load_state_dict(torch.load("fineturning.pth", map_location=device7))
    while True:
        dd = input("Q> ")  # + ","
        data = []
        buna = sp.EncodeAsPieces(dd)
        print(buna)
        for a in range(len(buna)):
            try:
                data.append(torch.from_numpy(w2v.wv[buna[a]]).view(1, 1, 128).to(device1))
            except KeyError:
                print("Not Found")
        dat = torch.cat(data, dim=1).to(device1)
        oa = ""
        with torch.no_grad():
            model1.reset()
            model2.reset()
            model3.reset()
            model4.reset()
            model5.reset()
            model6.reset()
            oa = ""
            # Feed the prompt through the stack to build up the recurrent state.
            for a in range(dat.shape[1] - 1):
                out = model1(dat[:, a].to(torch.bfloat16))
                out = model2(out.to(device2))
                out = model3(out.to(device3))
                out = model4(out.to(device4))
                out = model5(out.to(device5))
                out = model6(out.to(device6))
                out = output(out.to(device7))
            # Generate token by token, feeding each prediction back in.
            for b in range(maxlen - dat.shape[1]):
                out = model1(dat[:, -1].to(torch.bfloat16))
                out = model2(out.to(device2))
                out = model3(out.to(device3))
                out = model4(out.to(device4))
                out = model5(out.to(device5))
                out = model6(out.to(device6))
                out = output(out.to(device7))
                sfo = torch.nn.functional.softmax(out, dim=-1)
                wid = torch.argmax(sfo, dim=-1).item()
                if wid != outputdim - 1:
                    try:
                        wd = i2w[wid]
                    except IndexError:
                        oa = oa + "ERROR"
                    else:
                        oa = oa + wd
                        dat = torch.cat([dat, torch.from_numpy(w2v.wv[wd]).to(device1).view(1, 1, 128)], dim=1)
        print("A> ", oa.replace("?", ""))
def ValidationLoss(dim=512, outputdim=40000, maxlen=32):
    torch.manual_seed(1293431)
    table, i2w = GOILOAD()
    tagger = MeCab.Tagger("-Owakati")
    w2v = Word2Vec.load("word2vec.model")
    device1 = torch.device("cuda:0")
    device2 = torch.device("cuda:1")
    device3 = torch.device("cuda:2")
    device4 = torch.device("cuda:3")
    device5 = torch.device("cuda:4")
    device6 = torch.device("cuda:5")
    device7 = torch.device("cuda:6")
    lossf = torch.nn.CrossEntropyLoss()
    model1 = SanokaModel(dim, 2, True).to(torch.bfloat16).to(device1)
    model2 = SanokaModel(dim, 2, False).to(torch.bfloat16).to(device2)
    model3 = SanokaModel(dim, 2, False).to(torch.bfloat16).to(device3)
    model4 = SanokaModel(dim, 2, False).to(torch.bfloat16).to(device4)
    model5 = SanokaModel(dim, 2, False).to(torch.bfloat16).to(device5)
    model6 = SanokaModel(dim, 2, False).to(torch.bfloat16).to(device6)
    output = OutputLayer(dim, outputdim).to(torch.bfloat16).to(device7)
    model1.load_state_dict(torch.load("LLM1.pth", map_location=device1))
    model2.load_state_dict(torch.load("LLM2.pth", map_location=device2))
    model3.load_state_dict(torch.load("LLM3.pth", map_location=device3))
    model4.load_state_dict(torch.load("LLM4.pth", map_location=device4))
    model5.load_state_dict(torch.load("LLM5.pth", map_location=device5))
    model6.load_state_dict(torch.load("LLM6.pth", map_location=device6))
    output.load_state_dict(torch.load("output.pth", map_location=device7))
    dd = input("TestData> ")
    data = []
    buna = tagger.parse(dd).split()
    trued = torch.tensor([table[dfg] for dfg in buna]).to(torch.long).unsqueeze(dim=0)
    print(buna)
    print(trued)
    for a in range(len(buna)):
        try:
            data.append(torch.from_numpy(w2v.wv[buna[a]]).view(1, 1, 128).to(device1))
        except KeyError:
            print("Not Found")
    dat = torch.cat(data, dim=1).to(device1)
    oa = ""
    loss = 0.00
    with torch.no_grad():
        model1.reset()
        model2.reset()
        model3.reset()
        model4.reset()
        model5.reset()
        model6.reset()
        oa = ""
        for a in range(dat.shape[1] - 1):
            out = model1(dat[:, a].to(torch.bfloat16))
            out = model2(out.to(device2))
            out = model3(out.to(device3))
            out = model4(out.to(device4))
            out = model5(out.to(device5))
            out = model6(out.to(device6))
            out = output(out.to(device7))
            sfo = torch.nn.functional.softmax(out, dim=-1)
            wid = torch.argmax(sfo, dim=-1).item()
            try:
                wd = i2w[wid]
            except IndexError:
                oa = oa + "ERROR"
            else:
                oa = oa + wd
            # The target must live on device7, where the logits are.
            loss += lossf(out, trued[:, a + 1].to(device7))
    print("validationloss", loss.item() / dat.shape[1], "preview", oa)
if __name__ == "__main__":
#DataMake()
#Fineturning(Load=False,dim=2048, outputdim=21000,lr=1e-03, onestep=300, uselen=128)
#Predict(dim=2048, outputdim=21000, maxlen=128)