"""
Created on Thu Mar 21 10:34:46 2024

@author: takan
"""

import MeCab
import torch
import time
import numpy as np
from gensim.models import Word2Vec
import pickle
import sentencepiece as spm


class DenseBlock(torch.nn.Module):
    """Two-layer feed-forward block with an ELU nonlinearity in between."""

    def __init__(self, dim, mul=1):
        super().__init__()
        self.I = torch.nn.Linear(dim, dim * mul)
        self.O = torch.nn.Linear(dim * mul, dim)

    def forward(self, x):
        x = self.I(x)
        x = torch.nn.functional.elu(x)
        x = self.O(x)
        return x


class AttentionBlock(torch.nn.Module):
    """Element-wise attention: softmax(Q * K) gates V (not scaled dot-product attention)."""

    def __init__(self, dim, mul=1):
        super().__init__()
        self.Q = torch.nn.Linear(dim, dim * mul)
        self.K = torch.nn.Linear(dim, dim * mul)
        self.V = torch.nn.Linear(dim, dim * mul)
        self.O = torch.nn.Linear(dim * mul, dim)

    def forward(self, q, k, v):
        q = self.Q(q)
        k = self.K(k)
        v = self.V(v)
        x = torch.nn.functional.softmax(q * k, dim=-1) * v
        x = self.O(x)
        return x

""" |
|
class AttentionBlock(torch.nn.Module): |
|
def __init__(self, dim, mul=1): |
|
super().__init__() |
|
self.attn = torch.nn.MultiheadAttention(dim, 16, batch_first=True) |
|
def forward(self, q,k,v): |
|
x = self.attn(q, k, v)[0] |
|
return x |
|
""" |
|
class SanokaLayer(torch.nn.Module):
    """Stateful layer: keeps a running state self.x that each new input attends against."""

    def __init__(self, dim, mul=1):
        super().__init__()
        self.x = None
        self.A = AttentionBlock(dim, mul)
        self.B = DenseBlock(dim, mul)

    def reset(self, x=None):
        self.x = x

    def forward(self, u):
        if self.x is not None:
            uu = torch.nn.functional.normalize(u)
            xx = torch.nn.functional.normalize(self.x)
            x = self.A(uu, xx, xx)
            y = self.B(torch.nn.functional.normalize(x)) + u
            self.x = x + self.x
            return y
        else:
            # First step of a sequence: no state yet, so attend over the input itself.
            uu = torch.nn.functional.normalize(u)
            x = self.A(uu, uu, uu)
            y = self.B(torch.nn.functional.normalize(x)) + u
            self.x = x
            return y

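# Calling convention used throughout this file (a minimal sketch, not executed here):
#   layer = SanokaLayer(dim)
#   layer.reset()                # clear the recurrent state before each sequence
#   for t in range(seq_len):
#       y = layer(x[:, t])       # one call per time step; state accumulates in layer.x
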
class SanokaModel(torch.nn.Module):
    """Stack of six SanokaLayer blocks; when Top=True an input projection (128 -> dim) is applied first."""

    def __init__(self, dim, mul=1, Top=True):
        super().__init__()
        self.Top = Top
        if Top:
            self.I = torch.nn.Linear(128, dim)
        self.A = SanokaLayer(dim, mul)
        self.B = SanokaLayer(dim, mul)
        self.C = SanokaLayer(dim, mul)
        self.D = SanokaLayer(dim, mul)
        self.E = SanokaLayer(dim, mul)
        self.F = SanokaLayer(dim, mul)

    def reset(self):
        self.A.reset()
        self.B.reset()
        self.C.reset()
        self.D.reset()
        self.E.reset()
        self.F.reset()

    def forward(self, x):
        if self.Top:
            x = self.I(x)
        x = self.A(x)
        x = self.B(x)
        x = self.C(x)
        x = self.D(x)
        x = self.E(x)
        x = self.F(x)
        return x


class OutputLayer(torch.nn.Module):
    """Final projection from the hidden dimension to vocabulary logits (heads is unused)."""

    def __init__(self, hiddendim, worddim=59000, heads=4):
        super().__init__()
        self.H = torch.nn.Linear(hiddendim, worddim)

    def forward(self, inpute):
        x = inpute
        x = self.H(x)
        return x


def GOILOAD():
    """Load the vocabulary exported by W2VMake to table.txt (word2vec text format, header on the first line).

    Returns a {word: id} dict and the corresponding id -> word list.
    """
    fuf = open("table.txt", "r", encoding="UTF-8")
    goi = fuf.read().split("\n")
    fuf.close()
    chardim = len(goi[1:])
    charid = {goi[i + 1].split()[0]: i for i in range(chardim - 1)}
    return charid, [goi[ia + 1].split()[0] for ia in range(chardim - 1)]


# Module-level dataset buffers; PreTrain / Fineturning fill datas / trues / lens from Train_Data.bin.
datas = []
trues = []
lens = []
dones = 0

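# On-disk artifacts used by the pipeline below:
#   train_data.txt   - raw corpus, one example per line (input to SPMake / W2VMake / DataMake)
#   tokenizer.model  - SentencePiece model written by SPMake
#   word2vec.model, table.txt - embeddings and exported vocabulary written by W2VMake
#   Train_Data.bin   - pickled (inputs, targets, lengths) written by Convert
#   LLM1.pth ... LLM6.pth, output.pth, fineturning.pth - checkpoints written during training
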
def Convert(buns, table, maxlen=256):
    """Tokenize each line with SentencePiece, pair Word2Vec vectors (inputs) with vocabulary ids (targets), and pickle the result to Train_Data.bin."""
    buns = buns.split("\n")
    sp = spm.SentencePieceProcessor()
    sp.Load("tokenizer.model")
    w2v = Word2Vec.load("word2vec.model")
    data = []
    true = []
    lena = []
    for datac in range(len(buns)):
        try:
            buna = sp.EncodeAsPieces(buns[datac])[:maxlen]
            a = torch.from_numpy(w2v.wv[buna])
            b = torch.tensor([table[buna[ii]] for ii in range(len(buna))])
            c = len(buna)
        except Exception:
            # Skip lines with tokens missing from the Word2Vec vocabulary or table.
            print("ERROR")
        else:
            data.append(a)
            true.append(b)
            lena.append(c)
        print(datac)
    f = open("Train_Data.bin", "wb")
    pickle.dump((data, true, lena), f)
    f.close()
    return


def SPMake():
    """Train the SentencePiece tokenizer (tokenizer.model) on train_data.txt."""
    spm.SentencePieceTrainer.Train(
        "--input=train_data.txt --model_prefix=tokenizer --vocab_size=20000 --train_extremely_large_corpus=True"
    )


def W2VMake(filepath="train_data.txt", mincount=50, worker=60):
    """Train Word2Vec embeddings over the SentencePiece-tokenized corpus and export the vocabulary to table.txt."""
    sp = spm.SentencePieceProcessor()
    sp.Load("tokenizer.model")
    f = open(filepath, mode="r", encoding="UTF-8")
    texts = f.read().split("\n")
    f.close()
    dat = []
    print(len(texts))
    for a in range(len(texts)):
        dat.append(sp.EncodeAsPieces(texts[a]))
        print(a)

    model = Word2Vec(sentences=dat, vector_size=128, window=100, min_count=mincount, workers=worker)
    model.save("word2vec.model")
    model.wv.save_word2vec_format('table.txt')


def DataMake(filepath="train_data.txt", maxlen=129):
    """Build the pickled training set from the raw text file (maxlen is currently unused; Convert applies its own cap)."""
    table, i2w = GOILOAD()
    print(len(table))
    time.sleep(1)
    f = open(filepath, mode="r", encoding="UTF-8")
    txt = f.read()
    f.close()
    Convert(txt, table)
    return None


def PreTrain(Load=False, dim=512, outputdim=40000, lr=1e-04, epoch=10, epochload=1000, usedata=480000, onestep=100, uselen=64):
    """Pre-train the six-model pipeline (one SanokaModel per GPU) plus the output layer with next-token cross-entropy."""
    global datas
    global trues
    global lens
    torch.manual_seed(1293431)

    device1 = torch.device("cuda:0")
    device2 = torch.device("cuda:1")
    device3 = torch.device("cuda:2")
    device4 = torch.device("cuda:3")
    device5 = torch.device("cuda:4")
    device6 = torch.device("cuda:5")
    device7 = torch.device("cuda:6")
    lossf = torch.nn.CrossEntropyLoss()
    model1 = SanokaModel(dim, 2, True).to(torch.bfloat16).to(device1)
    model2 = SanokaModel(dim, 2, False).to(torch.bfloat16).to(device2)
    model3 = SanokaModel(dim, 2, False).to(torch.bfloat16).to(device3)
    model4 = SanokaModel(dim, 2, False).to(torch.bfloat16).to(device4)
    model5 = SanokaModel(dim, 2, False).to(torch.bfloat16).to(device5)
    model6 = SanokaModel(dim, 2, False).to(torch.bfloat16).to(device6)
    output = OutputLayer(dim, outputdim).to(torch.bfloat16).to(device7)

    if Load:
        model1.load_state_dict(torch.load("LLM1.pth", map_location=device1))
        model2.load_state_dict(torch.load("LLM2.pth", map_location=device2))
        model3.load_state_dict(torch.load("LLM3.pth", map_location=device3))
        model4.load_state_dict(torch.load("LLM4.pth", map_location=device4))
        model5.load_state_dict(torch.load("LLM5.pth", map_location=device5))
        model6.load_state_dict(torch.load("LLM6.pth", map_location=device6))
        output.load_state_dict(torch.load("output.pth", map_location=device7))
    model1Optim = torch.optim.Adam(model1.parameters(), lr=lr)
    model2Optim = torch.optim.Adam(model2.parameters(), lr=lr)
    model3Optim = torch.optim.Adam(model3.parameters(), lr=lr)
    model4Optim = torch.optim.Adam(model4.parameters(), lr=lr)
    model5Optim = torch.optim.Adam(model5.parameters(), lr=lr)
    model6Optim = torch.optim.Adam(model6.parameters(), lr=lr)
    outputO = torch.optim.Adam(output.parameters(), lr=lr)
    f = open("Train_Data.bin", "rb")
    datas, trues, lens = pickle.load(f)
    f.close()
    # Padded batch buffers: inputs are 128-dim Word2Vec vectors, targets are padded with id outputdim - 1.
    train_x = torch.zeros((epochload, uselen, 128)).to(torch.bfloat16).to(device1)
    train_y = torch.full((epochload, uselen), outputdim - 1, dtype=torch.long).to(device7)
    table, i2w = GOILOAD()
    base = 0
    epoch = int(np.floor((len(datas) / epochload) * epoch))
    print("データ量", len(datas))  # dataset size
    for epochs in range(epoch):
        train_x = train_x.detach()
        train_y = train_y.detach()
        # Slide the window over the dataset, wrapping at the end (and at usedata).
        if base < len(datas) - epochload * 2:
            base += epochload
        else:
            base = 0
        if base > usedata:
            base = 0
        for b in range(epochload):
            a = b + base
            train_x[b, :datas[a].shape[0]] = datas[a].to(torch.bfloat16).to(device1)[:uselen]
            train_y[b, :trues[a].shape[0]] = trues[a].to(device7).to(torch.long)[:uselen]
        epls = 0.00
        timem = time.time()
        for steps in range(epochload // onestep):
            model1.reset()
            model2.reset()
            model3.reset()
            model4.reset()
            model5.reset()
            model6.reset()
            oa = ""
            model1Optim.zero_grad()
            model2Optim.zero_grad()
            model3Optim.zero_grad()
            model4Optim.zero_grad()
            model5Optim.zero_grad()
            model6Optim.zero_grad()
            outputO.zero_grad()
            loss = 0.00
            for b in range(uselen - 1):
                # One time step: pass the minibatch through the device pipeline and predict token b + 1.
                out = model1(train_x[steps * onestep:steps * onestep + onestep, b])
                out = model2(out.to(device2))
                out = model3(out.to(device3))
                out = model4(out.to(device4))
                out = model5(out.to(device5))
                out = model6(out.to(device6))
                out = output(out.to(device7))
                loss += lossf(out, train_y[steps * onestep:steps * onestep + onestep, b + 1])

                # Greedy-decode the first sample in the batch for a progress preview.
                sfo = torch.nn.functional.softmax(out[0], dim=-1)
                wid = torch.argmax(sfo, dim=-1).item()
                try:
                    wd = i2w[wid]
                except IndexError:
                    oa = oa + "ERROR"
                else:
                    oa = oa + wd

            epls += loss.item()
            loss.backward()

            model1Optim.step()
            model2Optim.step()
            model3Optim.step()
            model4Optim.step()
            model5Optim.step()
            model6Optim.step()
            outputO.step()
        print("出力サンプル> ", oa[:32].replace("?", ""))  # sample output
        print("epoch", epochs, "Train_epoch_sum_loss", epls, "time", time.time() - timem)
        if epochs % 10 == 9:
            torch.save(model1.state_dict(), "LLM1.pth")
            torch.save(model2.state_dict(), "LLM2.pth")
            torch.save(model3.state_dict(), "LLM3.pth")
            torch.save(model4.state_dict(), "LLM4.pth")
            torch.save(model5.state_dict(), "LLM5.pth")
            torch.save(model6.state_dict(), "LLM6.pth")
            torch.save(output.state_dict(), "output.pth")


def Fineturning(Load=False, dim=512, outputdim=40000, lr=1e-04, epoch=10000, epochload=1000, onestep=200, uselen=32):
    """Fine-tune on Train_Data.bin: the six backbone models run under no_grad and only the output layer is updated."""
    global datas
    global trues
    global lens
    torch.manual_seed(1293431)

    device1 = torch.device("cuda:0")
    device2 = torch.device("cuda:1")
    device3 = torch.device("cuda:2")
    device4 = torch.device("cuda:3")
    device5 = torch.device("cuda:4")
    device6 = torch.device("cuda:5")
    device7 = torch.device("cuda:6")
    lossf = torch.nn.CrossEntropyLoss()
    model1 = SanokaModel(dim, 2, True).to(torch.bfloat16).to(device1)
    model2 = SanokaModel(dim, 2, False).to(torch.bfloat16).to(device2)
    model3 = SanokaModel(dim, 2, False).to(torch.bfloat16).to(device3)
    model4 = SanokaModel(dim, 2, False).to(torch.bfloat16).to(device4)
    model5 = SanokaModel(dim, 2, False).to(torch.bfloat16).to(device5)
    model6 = SanokaModel(dim, 2, False).to(torch.bfloat16).to(device6)
    output = OutputLayer(dim, outputdim).to(torch.bfloat16).to(device7)

    model1.load_state_dict(torch.load("LLM1.pth", map_location=device1))
    model2.load_state_dict(torch.load("LLM2.pth", map_location=device2))
    model3.load_state_dict(torch.load("LLM3.pth", map_location=device3))
    model4.load_state_dict(torch.load("LLM4.pth", map_location=device4))
    model5.load_state_dict(torch.load("LLM5.pth", map_location=device5))
    model6.load_state_dict(torch.load("LLM6.pth", map_location=device6))
    output.load_state_dict(torch.load("output.pth", map_location=device7))
    # Note: only outputO is stepped below; the backbone optimizers are created but never stepped.
    model1Optim = torch.optim.Adam(model1.parameters(), lr=lr)
    model2Optim = torch.optim.Adam(model2.parameters(), lr=lr)
    model3Optim = torch.optim.Adam(model3.parameters(), lr=lr)
    model4Optim = torch.optim.Adam(model4.parameters(), lr=lr)
    model5Optim = torch.optim.Adam(model5.parameters(), lr=lr)
    model6Optim = torch.optim.Adam(model6.parameters(), lr=lr/500)
    outputO = torch.optim.Adam(output.parameters(), lr=lr)
    f = open("Train_Data.bin", "rb")
    datas, trues, lens = pickle.load(f)
    f.close()
    train_x = torch.zeros((epochload, uselen, 128)).to(torch.bfloat16).to(device1)
    train_y = torch.full((epochload, uselen), outputdim - 1, dtype=torch.long).to(device7)
    table, i2w = GOILOAD()
    base = 0
    epoch = int(np.floor((len(datas) / epochload) * epoch))

    for epochs in range(epoch):
        train_x = train_x.detach()
        train_y = train_y.detach()
        if base < len(datas) - epochload * 2:
            base += epochload
        else:
            base = 0
        for b in range(epochload):
            a = b + base
            train_x[b, :datas[a].shape[0]] = datas[a].to(torch.bfloat16).to(device1)[:uselen]
            train_y[b, :trues[a].shape[0]] = trues[a].to(device7).to(torch.long)[:uselen]
        epls = 0.00
        timem = time.time()
        for steps in range(epochload // onestep):
            model1.reset()
            model2.reset()
            model3.reset()
            model4.reset()
            model5.reset()
            model6.reset()
            oa = ""
            loss = 0.00
            model1Optim.zero_grad()
            model2Optim.zero_grad()
            model3Optim.zero_grad()
            model4Optim.zero_grad()
            model5Optim.zero_grad()
            model6Optim.zero_grad()
            outputO.zero_grad()
            for b in range(uselen - 1):
                with torch.no_grad():
                    # Backbone runs without gradients; only the output layer is trained.
                    out = model1(train_x[steps * onestep:steps * onestep + onestep, b])
                    out = model2(out.to(device2))
                    out = model3(out.to(device3))
                    out = model4(out.to(device4))
                    out = model5(out.to(device5))
                    out = model6(out.to(device6))
                out = output(out.to(device7))
                loss += lossf(out, train_y[steps * onestep:steps * onestep + onestep, b + 1])

                sfo = torch.nn.functional.softmax(out[0], dim=-1)
                wid = torch.argmax(sfo, dim=-1).item()
                try:
                    wd = i2w[wid]
                except IndexError:
                    oa = oa + "ERROR"
                else:
                    oa = oa + wd
            epls += loss.item()
            loss.backward()

            outputO.step()
        print("出力サンプル> ", oa[:32].replace("?", ""))  # sample output
        print("epoch", epochs, "Train_epoch_sum_loss", epls, "time", time.time() - timem)
        if epochs % 10 == 9:
            torch.save(output.state_dict(), "fineturning.pth")


def Predict(dim=512, outputdim=40000, maxlen=32):
    """Interactive greedy generation: encode the prompt with SentencePiece/Word2Vec, then feed back each predicted token's Word2Vec vector."""
    torch.manual_seed(1293431)

    table, i2w = GOILOAD()
    sp = spm.SentencePieceProcessor()
    sp.Load("tokenizer.model")

    w2v = Word2Vec.load("word2vec.model")

    device1 = torch.device("cuda:0")
    device2 = torch.device("cuda:1")
    device3 = torch.device("cuda:2")
    device4 = torch.device("cuda:3")
    device5 = torch.device("cuda:4")
    device6 = torch.device("cuda:5")
    device7 = torch.device("cuda:6")
    model1 = SanokaModel(dim, 2, True).to(torch.bfloat16).to(device1)
    model2 = SanokaModel(dim, 2, False).to(torch.bfloat16).to(device2)
    model3 = SanokaModel(dim, 2, False).to(torch.bfloat16).to(device3)
    model4 = SanokaModel(dim, 2, False).to(torch.bfloat16).to(device4)
    model5 = SanokaModel(dim, 2, False).to(torch.bfloat16).to(device5)
    model6 = SanokaModel(dim, 2, False).to(torch.bfloat16).to(device6)
    output = OutputLayer(dim, outputdim).to(torch.bfloat16).to(device7)

    model1.load_state_dict(torch.load("LLM1.pth", map_location=device1))
    model2.load_state_dict(torch.load("LLM2.pth", map_location=device2))
    model3.load_state_dict(torch.load("LLM3.pth", map_location=device3))
    model4.load_state_dict(torch.load("LLM4.pth", map_location=device4))
    model5.load_state_dict(torch.load("LLM5.pth", map_location=device5))
    model6.load_state_dict(torch.load("LLM6.pth", map_location=device6))
    output.load_state_dict(torch.load("fineturning.pth", map_location=device7))

    while True:
        dd = input("Q> ")

        data = []
        buna = sp.EncodeAsPieces(dd)
        print(buna)
        for a in range(len(buna)):
            try:
                data.append(torch.from_numpy(w2v.wv[buna[a]]).view(1, 1, 128).to(device1))
            except KeyError:
                print("Not Found")
        dat = torch.cat(data, dim=1).to(device1)
        oa = ""
        with torch.no_grad():
            model1.reset()
            model2.reset()
            model3.reset()
            model4.reset()
            model5.reset()
            model6.reset()
            oa = ""
            # Feed the prompt tokens (all but the last) to build up the recurrent state.
            for a in range(dat.shape[1] - 1):
                out = model1(dat[:, a].to(torch.bfloat16))
                out = model2(out.to(device2))
                out = model3(out.to(device3))
                out = model4(out.to(device4))
                out = model5(out.to(device5))
                out = model6(out.to(device6))
                out = output(out.to(device7))
            # Greedy decoding: append the predicted token's Word2Vec vector and continue.
            for b in range(maxlen - dat.shape[1]):
                out = model1(dat[:, -1].to(torch.bfloat16))
                out = model2(out.to(device2))
                out = model3(out.to(device3))
                out = model4(out.to(device4))
                out = model5(out.to(device5))
                out = model6(out.to(device6))
                out = output(out.to(device7))
                sfo = torch.nn.functional.softmax(out, dim=-1)
                wid = torch.argmax(sfo, dim=-1).item()
                if wid != outputdim - 1:
                    try:
                        wd = i2w[wid]
                    except IndexError:
                        oa = oa + "ERROR"
                    else:
                        oa = oa + wd
                        dat = torch.cat([dat, torch.from_numpy(w2v.wv[wd]).to(device1).view(1, 1, 128)], dim=1)
        print("A> ", oa.replace("?", ""))


def ValidationLoss(dim=512, outputdim=40000, maxlen=32):
    """Compute next-token cross-entropy on one MeCab-tokenized input sentence and print a greedy preview."""
    torch.manual_seed(1293431)

    table, i2w = GOILOAD()
    tagger = MeCab.Tagger("-Owakati")
    w2v = Word2Vec.load("word2vec.model")
    device1 = torch.device("cuda:0")
    device2 = torch.device("cuda:1")
    device3 = torch.device("cuda:2")
    device4 = torch.device("cuda:3")
    device5 = torch.device("cuda:4")
    device6 = torch.device("cuda:5")
    device7 = torch.device("cuda:6")
    lossf = torch.nn.CrossEntropyLoss()
    model1 = SanokaModel(dim, 2, True).to(torch.bfloat16).to(device1)
    model2 = SanokaModel(dim, 2, False).to(torch.bfloat16).to(device2)
    model3 = SanokaModel(dim, 2, False).to(torch.bfloat16).to(device3)
    model4 = SanokaModel(dim, 2, False).to(torch.bfloat16).to(device4)
    model5 = SanokaModel(dim, 2, False).to(torch.bfloat16).to(device5)
    model6 = SanokaModel(dim, 2, False).to(torch.bfloat16).to(device6)
    output = OutputLayer(dim, outputdim).to(torch.bfloat16).to(device7)

    model1.load_state_dict(torch.load("LLM1.pth", map_location=device1))
    model2.load_state_dict(torch.load("LLM2.pth", map_location=device2))
    model3.load_state_dict(torch.load("LLM3.pth", map_location=device3))
    model4.load_state_dict(torch.load("LLM4.pth", map_location=device4))
    model5.load_state_dict(torch.load("LLM5.pth", map_location=device5))
    model6.load_state_dict(torch.load("LLM6.pth", map_location=device6))
    output.load_state_dict(torch.load("output.pth", map_location=device7))

    dd = input("TestData> ")
    data = []
    # Note: validation tokenizes with MeCab (wakati) rather than SentencePiece; it assumes
    # every token is in the vocabulary, otherwise trued and dat would fall out of alignment.
    buna = tagger.parse(dd).split()
    trued = torch.tensor([table[dfg] for dfg in buna]).to(torch.long).unsqueeze(dim=0)
    print(buna)
    print(trued)
    for a in range(len(buna)):
        try:
            data.append(torch.from_numpy(w2v.wv[buna[a]]).view(1, 1, 128).to(device1))
        except KeyError:
            print("Not Found")
    dat = torch.cat(data, dim=1).to(device1)
    oa = ""
    loss = 0.00
    with torch.no_grad():
        model1.reset()
        model2.reset()
        model3.reset()
        model4.reset()
        model5.reset()
        model6.reset()
        oa = ""
        for a in range(dat.shape[1] - 1):
            out = model1(dat[:, a].to(torch.bfloat16))
            out = model2(out.to(device2))
            out = model3(out.to(device3))
            out = model4(out.to(device4))
            out = model5(out.to(device5))
            out = model6(out.to(device6))
            out = output(out.to(device7))
            sfo = torch.nn.functional.softmax(out, dim=-1)
            wid = torch.argmax(sfo, dim=-1).item()
            try:
                wd = i2w[wid]
            except IndexError:
                oa = oa + "ERROR"
            else:
                oa = oa + wd
            # Targets must live on the same device as the logits (device7).
            loss += lossf(out, trued[:, a + 1].to(device7))
    print("validationloss", loss.item() / dat.shape[1], "preview", oa)


if __name__ == "__main__":
    # The original script leaves this block empty; which step to run is not specified.
    # A typical order, given the functions above, would be (uncomment as needed):
    #   SPMake()       # train the SentencePiece tokenizer
    #   W2VMake()      # train Word2Vec embeddings and export table.txt
    #   DataMake()     # build Train_Data.bin
    #   PreTrain()     # pre-train the pipeline
    #   Fineturning()  # fine-tune the output layer
    #   Predict()      # interactive generation
    pass