Spaces:
Runtime error
Runtime error
from calendar import c | |
from transformers import AlbertForMaskedLM, AlbertTokenizer, pipeline | |
import numpy as np | |
import gradio as gr | |
## Load the model
# Pretrained ProtAlbert tokenizer (input case is preserved) and masked-LM weights.
tokenizer = AlbertTokenizer.from_pretrained(
    "Rostlab/prot_albert",
    do_lower_case=False,
)
model = AlbertForMaskedLM.from_pretrained("Rostlab/prot_albert")

# Fill-mask pipeline; top_k=21 requests 21 candidates per masked position.
fill_mask = pipeline("fill-mask", model=model, tokenizer=tokenizer, top_k=21)
## Initialization
# One-letter residue codes in profile row order; 'X' is the last entry.
codes = list("VLIMFWYGAPSTCHRKQENDX")

# Column names: three leading id columns, the 20 residue columns (the same
# order as codes, without 'X'), then the trailing statistics columns.
header = (
    ['SeqNo', 'PDB', 'No']
    + codes[:20]
    + ['NOCC', 'NDEL', 'NINS', 'ENTROPY', 'RELENT', 'WEIGHT', 'CHAIN', 'AUTHCHAIN']
)

# Zero-based sequence positions that the predictors skip (none by default).
rem = []
# Maps each one-letter residue code to its row index in a profile matrix:
# 'V' -> 0, 'L' -> 1, ... 'D' -> 19, and 'X' -> 20.
Hash1 = {
    residue: row
    for row, residue in enumerate("VLIMFWYGAPSTCHRKQENDX")
}
def ReadfastaFile(filename):
    """Read a FASTA file and return its residues as one upper-case string.

    Every line containing ">" is treated as a header and skipped. The
    remaining lines are stripped of trailing whitespace and concatenated in
    file order, so a multi-record file yields a single merged sequence.

    Args:
        filename: Path of the FASTA file to read.

    Returns:
        The concatenated, upper-cased sequence; "" if there are no
        sequence lines.
    """
    # The context manager closes the handle even if reading raises.
    with open(filename, "r") as handle:
        chunks = [line.rstrip() for line in handle if ">" not in line]
    # Join once instead of growing a string line by line.
    return "".join(chunks).upper()
def Predict_profile(sequence, header = header,rem=rem,Hash1 = Hash1):
    """Predict a percentage-scaled residue profile by masking each position.

    Each position of ``sequence`` that is not listed in ``rem`` is replaced in
    turn by ``[MASK]``; the space-separated residues are passed to
    ``fill_mask``. For every returned candidate whose token is a key of
    ``Hash1``, ``int(score * 100)`` is stored at (row of token, position).
    Candidates with any other token are printed and otherwise ignored.

    Args:
        sequence: Residues, one character per position.
        header: Unused; kept so existing callers keep working.
        rem: Zero-based positions to skip; their columns are removed from
            the returned array.
        Hash1: Mapping from residue token to row index.

    Returns:
        A float array with ``len(Hash1)`` rows and one column per position
        of ``sequence``, minus the columns listed in ``rem``.
    """
    residues = list(sequence)
    skipped = set(rem)  # constant-time membership test inside the loop
    pred_Profile = np.zeros((len(Hash1), len(residues)))

    for i in range(len(residues)):
        if i in skipped:
            continue
        # Mask only position i; the model input is space-separated residues.
        masked = " ".join(residues[:i] + ['[MASK]'] + residues[i + 1:])
        for candidate in fill_mask(masked):
            # NOTE(review): "β" is kept verbatim from the original code; it may
            # be a mis-encoded SentencePiece "▁" marker -- confirm.
            token = candidate['token_str'].replace("β", "")
            if token in Hash1:
                pred_Profile[Hash1[token]][i] = int(candidate['score'] * 100)
            else:
                # Report tokens that have no profile row.
                print(i, token)
        print(i)  # progress output, one line per processed position

    if len(rem) != 0:
        pred_Profile = np.delete(pred_Profile, rem, 1)
    return pred_Profile
def Predict_profile1(sequence, header = header,rem=rem,Hash1 = Hash1):
    """Predict a raw-score residue profile by masking each position.

    Each position of ``sequence`` that is not listed in ``rem`` is replaced in
    turn by ``[MASK]``; the space-separated residues are passed to
    ``fill_mask``. The score of each returned candidate is added to the row
    given by ``Hash1`` for its token. Candidates whose token is not a key of
    ``Hash1`` are added to the ``'X'`` row, so that row holds the combined
    score of 'X' and every unmapped token.

    Args:
        sequence: Residues, one character per position.
        header: Unused; kept so existing callers keep working.
        rem: Zero-based positions to skip; their columns are removed from
            the returned array.
        Hash1: Mapping from residue token to row index. It must contain
            ``'X'`` if the model can return tokens missing from the mapping.

    Returns:
        A float array with ``len(Hash1)`` rows and one column per position
        of ``sequence``, minus the columns listed in ``rem``.
    """
    residues = list(sequence)
    skipped = set(rem)  # constant-time membership test inside the loop
    pred_Profile = np.zeros((len(Hash1), len(residues)))

    for i in range(len(residues)):
        if i in skipped:
            continue
        # Mask only position i; the model input is space-separated residues.
        masked = " ".join(residues[:i] + ['[MASK]'] + residues[i + 1:])
        for candidate in fill_mask(masked):
            # NOTE(review): "β" is kept verbatim from the original code; it may
            # be a mis-encoded SentencePiece "▁" marker -- confirm.
            token = candidate['token_str'].replace("β", "")
            # Unmapped tokens go to the 'X' row. The row must be an integer
            # index: the array cannot be indexed with the string 'X'.
            row = Hash1[token] if token in Hash1 else Hash1['X']
            # Accumulate so an 'X' candidate and unmapped candidates add up
            # instead of overwriting each other.
            pred_Profile[row][i] += candidate['score']
        print(i)  # progress output, one line per processed position

    if len(rem) != 0:
        pred_Profile = np.delete(pred_Profile, rem, 1)
    return pred_Profile
def print_func(sequence):
    """Return the predicted profile of ``sequence`` as plain text.

    The input is stripped of all whitespace and upper-cased, then passed to
    ``Predict_profile1``. The result has one line per profile row: the
    residue code from ``codes`` followed by that row's scores, all separated
    by single spaces.

    Args:
        sequence: Protein sequence as typed by the user.

    Returns:
        The formatted profile, rows joined with newlines.
    """
    # Whitespace would shift the masked positions; upper-casing matches what
    # ReadfastaFile does with sequences read from disk.
    cleaned = "".join(sequence.split()).upper()
    profile = Predict_profile1(cleaned)

    lines = []
    for code, row in zip(codes, profile):
        # Format each value explicitly: str() of a NumPy row wraps long rows
        # onto extra lines and abbreviates very long ones with "...".
        values = [format(float(value), ".6g") for value in row]
        lines.append(" ".join([str(code)] + values))
    return "\n".join(lines)
# Texts shown on the Gradio page.
title = "Protein sequence profile prediction using ProtAlbert transformer"
description = (
    "Please enter the sequence.\n"
    "* Prediction process can take longer for long sequences.\n"
)

# One text input, one text output; print_func produces the output text.
iface = gr.Interface(
    fn=print_func,
    inputs=["text"],
    outputs="text",
    title=title,
    description=description,
)
iface.launch()