armheb committed on
Commit
da19097
1 Parent(s): 113b1a2

Add profile app.py

Browse files
Files changed (2) hide show
  1. app.py +104 -0
  2. requirements.txt +2 -0
app.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import AlbertForMaskedLM, AlbertTokenizer, pipeline
2
+ import numpy as np
3
+ import gradio as gr
4
+
5
+
6
# Load the ProtAlbert tokenizer and masked-language-model weights from the
# "Rostlab/prot_albert" checkpoint. The tokenizer is created with
# do_lower_case=False.
tokenizer = AlbertTokenizer.from_pretrained(
    "Rostlab/prot_albert",
    do_lower_case=False,
)
model = AlbertForMaskedLM.from_pretrained("Rostlab/prot_albert")

# Fill-mask pipeline built on the objects above; top_k=21 requests 21
# candidate tokens for each masked position.
fill_mask = pipeline(
    "fill-mask",
    model=model,
    tokenizer=tokenizer,
    top_k=21,
)
12
+
13
# Column names of the profile table. Entries 3..22 are the twenty residue
# letters, in the order used for the rows of the predicted profile.
header = [
    'SeqNo', 'PDB', 'No',
    'V', 'L', 'I', 'M', 'F', 'W', 'Y', 'G', 'A', 'P',
    'S', 'T', 'C', 'H', 'R', 'K', 'Q', 'E', 'N', 'D',
    'NOCC', 'NDEL', 'NINS', 'ENTROPY', 'RELENT', 'WEIGHT',
    'CHAIN', 'AUTHCHAIN',
]

# Sequence positions to leave out of the prediction (none by default).
rem = []

# Residue letter -> row index of the profile matrix: the twenty residue
# columns of `header` in order (0..19), followed by 'X' at index 20.
Hash1 = {
    residue: row
    for row, residue in enumerate(header[3:23] + ['X'])
}
41
+
42
+
43
+
44
def ReadfastaFile(filename):
    """Read a FASTA file and return its sequence as one upper-case string.

    Every line that contains ">" is treated as a header and skipped. The
    remaining lines are stripped of trailing whitespace and concatenated in
    file order, so a file holding several records yields the records'
    sequences joined together.

    Args:
        filename: Path of the FASTA file to read.

    Returns:
        The concatenated sequence, upper-cased; "" when the file has no
        sequence lines.
    """
    # The context manager closes the file even if reading raises.
    with open(filename, "r") as fn:
        chunks = [line.rstrip() for line in fn if ">" not in line]
    return "".join(chunks).upper()
57
+
58
+
59
def Predict_profile(sequence, header=header, rem=rem, Hash1=Hash1):
    """Predict a per-position residue profile by masking each position in turn.

    Each position of `sequence` that is not listed in `rem` is replaced by
    '[MASK]', the space-separated sequence is passed to the module-level
    `fill_mask` pipeline, and every returned candidate whose token is a key
    of `Hash1` has int(score * 100) stored at [Hash1[token], position].
    Candidates whose token is not in `Hash1` are printed and skipped.

    Args:
        sequence: Sequence string, one character per residue.
        header: Unused; kept so existing callers that pass it still work.
        rem: 0-based positions to skip; their columns are deleted from the
            result.
        Hash1: Mapping from residue token to row index of the profile.

    Returns:
        A numpy array with len(Hash1) rows and one column per position of
        `sequence` that is not listed in `rem`.
    """
    pred_Profile = np.zeros((len(Hash1), len(sequence)))
    # Iterate over the input itself; the earlier `range(len(S))` referenced a
    # name that does not exist in this scope and raised NameError.
    for i in range(len(sequence)):
        if i not in rem:
            masked = list(sequence)
            masked[i] = '[MASK]'
            predictions = fill_mask(" ".join(masked))
            for prediction in predictions:
                # Drop the "▁" word-start marker from the token text.
                token = prediction['token_str'].replace("▁", "")
                if token not in Hash1:
                    print(i, token)
                else:
                    pred_Profile[Hash1[token]][i] = int(prediction['score'] * 100)
        print(i)
    # len() rather than truthiness so an array-like `rem` keeps working.
    if len(rem) != 0:
        pred_Profile = np.delete(pred_Profile, rem, 1)
    return pred_Profile
94
+
95
+
96
+
97
+
98
+
99
# Web UI: one text box for the input sequence, with the result of
# Predict_profile shown as text. The title names the model that is loaded
# from the "Rostlab/prot_albert" checkpoint (ProtAlbert).
iface = gr.Interface(
    fn=Predict_profile,
    inputs=["text"],
    outputs="text",
    description="Please enter the sequence",
    title="Protein profile prediction using ProtAlbert",
)
iface.launch()
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ numpy
2
+ transformers