Spaces:
Runtime error
Runtime error
File size: 4,216 Bytes
fc98b4d da19097 fc98b4d da19097 9478003 da19097 a8a36ec fc98b4d 5f3e2ac fc98b4d da19097 9478003 74f1e11 da19097 fc98b4d da19097 f8299ac a7723b1 da19097 fc98b4d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 |
from calendar import c
from transformers import AlbertForMaskedLM, AlbertTokenizer, pipeline
import numpy as np
import gradio as gr
## Load the model
# Tokenizer and masked-LM weights come from the same checkpoint, so the
# identifier is spelled once.
_MODEL_ID = "Rostlab/prot_albert"
tokenizer = AlbertTokenizer.from_pretrained(_MODEL_ID, do_lower_case=False)
model = AlbertForMaskedLM.from_pretrained(_MODEL_ID)
# Fill-mask pipeline used by the prediction functions below; it is asked for
# 21 candidates per mask (the same count as the entries of `codes`).
fill_mask = pipeline(
    "fill-mask",
    model=model,
    tokenizer=tokenizer,
    top_k=21,
)
## Initialization
# Residue letters in profile-row order; 'X' is the last (21st) entry.
codes = list("VLIMFWYGAPSTCHRKQENDX")

# Column names of the profile table: three leading id columns, the 20
# residue letters (indices 3..22, i.e. `codes` without 'X'), then the
# trailing statistics/chain columns.
header = (
    ['SeqNo', 'PDB', 'No']
    + codes[:-1]
    + ['NOCC', 'NDEL', 'NINS', 'ENTROPY', 'RELENT', 'WEIGHT', 'CHAIN', 'AUTHCHAIN']
)

# Default list of 0-based sequence positions to leave out of a prediction.
rem = []

# Residue letter -> row index in the predicted profile matrix.
Hash1 = {letter: row for row, letter in enumerate(codes)}
def ReadfastaFile(filename):
    """Read a FASTA file and return its sequence as one upper-case string.

    Every line that contains a ``>`` character is treated as a header and
    skipped. The remaining lines have trailing whitespace removed and are
    concatenated in file order, so a file with several records yields the
    records' sequences joined together.

    Args:
        filename: Path of the FASTA file to read.

    Returns:
        The concatenated, upper-cased sequence (``""`` when the file has no
        sequence lines).

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # `with` closes the handle even when reading fails part-way through.
    with open(filename, "r") as fasta:
        chunks = [line.rstrip() for line in fasta if ">" not in line]
    # One join instead of repeated `+` keeps this linear in the file size.
    return "".join(chunks).upper()
def Predict_profile(sequence, header=header, rem=rem, Hash1=Hash1):
    """Predict a residue profile for `sequence` as truncated percentages.

    Each position that is not listed in `rem` is replaced by ``[MASK]`` in
    turn, the space-separated sequence is passed to the module-level
    `fill_mask` pipeline, and for every returned candidate whose token
    (with any ``▁`` character removed) is a key of `Hash1`, the value
    ``int(score * 100)`` is stored at ``[Hash1[token], position]``.
    Candidates with any other token are only printed.

    Args:
        sequence: Residue sequence; each character is one position.
        header: Not used by the computation; kept so existing callers that
            pass it keep working.
        rem: 0-based positions to skip. Their columns are removed from the
            returned matrix.
        Hash1: Mapping from residue letter to row index of the profile.

    Returns:
        A float NumPy array with ``len(Hash1)`` rows and one column per
        position of `sequence` that is not listed in `rem`.
    """
    residues = list(sequence)
    # Set lookup instead of scanning the `rem` list at every position.
    skipped = set(rem)
    pred_Profile = np.zeros((len(Hash1), len(residues)))

    for i in range(len(residues)):
        if i in skipped:
            continue
        # Mask exactly position i. Working on the character list (instead of
        # joining and re-splitting on spaces) keeps the index aligned with
        # the profile column even if the input contains a space.
        masked = residues.copy()
        masked[i] = '[MASK]'
        for candidate in fill_mask(" ".join(masked)):
            token = candidate['token_str'].replace("▁", "")
            if token in Hash1:
                pred_Profile[Hash1[token]][i] = int(candidate['score'] * 100)
            else:
                # Not a known residue letter: report it, do not store it.
                print(i, token)
        # Progress output, one line per processed position.
        print(i)

    if len(rem) != 0:
        # Drop the columns of the positions that were skipped.
        pred_Profile = np.delete(pred_Profile, rem, 1)
    return pred_Profile
def Predict_profile1(sequence, header=header, rem=rem, Hash1=Hash1):
    """Predict a residue profile for `sequence` as raw pipeline scores.

    Each position that is not listed in `rem` is replaced by ``[MASK]`` in
    turn, the space-separated sequence is passed to the module-level
    `fill_mask` pipeline, and the score of every returned candidate is
    stored in the profile column of that position:

    * a token (with any ``▁`` character removed) that is a key of `Hash1`
      other than ``'X'`` has its score written to row ``Hash1[token]``;
    * every other token, and the literal ``'X'`` token, is added to the
      ``'X'`` row, so that row holds the summed score of all candidates
      that are not one of the known residue letters.

    Args:
        sequence: Residue sequence; each character is one position.
        header: Not used by the computation; kept so existing callers that
            pass it keep working.
        rem: 0-based positions to skip. Their columns are removed from the
            returned matrix.
        Hash1: Mapping from residue letter to row index of the profile. It
            must contain the key ``'X'``.

    Returns:
        A float NumPy array with ``len(Hash1)`` rows and one column per
        position of `sequence` that is not listed in `rem`.

    Raises:
        KeyError: If `Hash1` has no ``'X'`` entry.
    """
    residues = list(sequence)
    # Set lookup instead of scanning the `rem` list at every position.
    skipped = set(rem)
    # Row index of the catch-all bucket. Indexing the array with the string
    # 'X' (as the code used to) raises IndexError; the row number is needed.
    x_row = Hash1['X']
    pred_Profile = np.zeros((len(Hash1), len(residues)))

    for i in range(len(residues)):
        if i in skipped:
            continue
        # Mask exactly position i. Working on the character list (instead of
        # joining and re-splitting on spaces) keeps the index aligned with
        # the profile column even if the input contains a space.
        masked = residues.copy()
        masked[i] = '[MASK]'
        for candidate in fill_mask(" ".join(masked)):
            token = candidate['token_str'].replace("▁", "")
            score = candidate['score']
            row = Hash1.get(token, x_row)
            if row == x_row:
                # Accumulate, so an explicit 'X' candidate does not wipe
                # out scores already collected from unknown tokens.
                pred_Profile[row][i] += score
            else:
                pred_Profile[row][i] = score
        # Progress output, one line per processed position.
        print(i)

    if len(rem) != 0:
        # Drop the columns of the positions that were skipped.
        pred_Profile = np.delete(pred_Profile, rem, 1)
    return pred_Profile
def print_func(sequence):
    """Return the predicted profile of `sequence` as a plain-text table.

    Calls `Predict_profile1` and renders one output line per profile row:
    the residue letter from `codes`, a space, then the row's values in
    NumPy's default number formatting with the surrounding brackets
    removed. Lines are joined with newlines.

    Args:
        sequence: Residue sequence entered by the user.

    Returns:
        The table as a single string.
    """
    import sys  # local import: only needed for the print limits below

    profile = Predict_profile1(sequence)
    rows = []
    for code, values in zip(codes, profile):
        # str(ndarray) wraps at 75 characters and abbreviates arrays of more
        # than 1000 elements with "...", which breaks or truncates rows for
        # longer sequences. Disable both so each row stays on one full line;
        # rows that already fit are rendered exactly as before.
        rendered = np.array2string(
            values,
            max_line_width=sys.maxsize,
            threshold=sys.maxsize,
        )
        rows.append(str(code) + " " + rendered.replace('[', '').replace(']', ''))
    return "\n".join(rows)
# Texts shown on the web page.
title = "Protein sequence profile prediction using ProtAlbert transformer"
description = (
    "Please enter the sequence.\n"
    "* Prediction process can take longer for long sequences.\n"
)

# One text box in, one text box out; `print_func` turns the entered
# sequence into the profile table.
iface = gr.Interface(
    fn=print_func,
    inputs=["text"],
    outputs="text",
    title=title,
    description=description,
)

# Start the web app as soon as this module is executed.
iface.launch()
|