saicharan2804 committed
Commit
50320ea
1 Parent(s): f9285ac

Upload 3 files

Files changed (3)
  1. app.py +67 -0
  2. molgen_metric.py +154 -0
  3. requirements.txt +5 -0
app.py ADDED
@@ -0,0 +1,67 @@
+ import evaluate
+ from evaluate.utils import launch_gradio_widget
+ import gradio as gr
+ # from pathlib import Path
+ # import sys
+ # import os
+
+ # from .logging import get_logger
+ # logger = get_logger(__name__)
+
+ # ###
+ # def launch_gradio_widget(metric):
+ #     """Launches `metric` widget with Gradio."""
+
+ #     try:
+ #         import gradio as gr
+ #     except ImportError as error:
+ #         logger.error("To create a metric widget with Gradio make sure gradio is installed.")
+ #         raise error
+
+ #     local_path = Path(sys.path[0])
+ #     # if there are several input types, use first as default.
+ #     if isinstance(metric.features, list):
+ #         (feature_names, feature_types) = zip(*metric.features[0].items())
+ #     else:
+ #         (feature_names, feature_types) = zip(*metric.features.items())
+ #     gradio_input_types = infer_gradio_input_types(feature_types)
+
+ #     def compute(data):
+ #         return metric.compute(**parse_gradio_data(data, gradio_input_types))
+
+ #     iface = gr.Interface(
+ #         fn=compute,
+ #         inputs=gr.Dataframe(
+ #             headers=feature_names,
+ #             col_count=len(feature_names),
+ #             row_count=1,
+ #             datatype=json_to_string_type(gradio_input_types),
+ #         ),
+ #         outputs=gr.Textbox(label=metric.name),
+ #         description=(
+ #             metric.info.description + "\nIf this is a text-based metric, make sure to wrap your input in double quotes."
+ #             " Alternatively you can use a JSON-formatted list as input."
+ #         ),
+ #         title=f"Metric: {metric.name}",
+ #         article=parse_readme(local_path / "README.md"),
+ #         # TODO: load test cases and use them to populate examples
+ #         # examples=[parse_test_cases(test_cases, feature_names, gradio_input_types)]
+ #     )
+
+ #     iface.launch()
+ # ###
+
+
+ module = evaluate.load("saicharan2804/molgenevalmetric")
+ # launch_gradio_widget(module)
+
+
+ def compute_from_files(generated_file, train_file=None):
+     # Read newline-separated SMILES from the uploaded files and run the metric.
+     # gr.File hands the function a tempfile-like object whose .name attribute is the path.
+     with open(generated_file.name) as f:
+         generated_smiles = [line.strip() for line in f if line.strip()]
+     train_smiles = None
+     if train_file is not None:
+         with open(train_file.name) as f:
+             train_smiles = [line.strip() for line in f if line.strip()]
+     return str(module.compute(generated_smiles=generated_smiles, train_smiles=train_smiles))
+
+
+ iface = gr.Interface(
+     fn=compute_from_files,  # gr.Interface needs a callable; passing the metric module itself fails
+     inputs=[
+         gr.File(label="Generated SMILES"),
+         gr.File(label="Training Data", value=None),
+     ],
+     outputs="text",
+ )
+
+ iface.launch()
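For reference, the app assumes plain-text uploads with one SMILES string per line; the committed code does not document the format, so this is an assumption of the file-reading wrapper above. A minimal hypothetical "Generated SMILES" upload could look like:

CCO
c1ccccc1
CC(=O)Oc1ccccc1C(=O)O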
molgen_metric.py ADDED
@@ -0,0 +1,154 @@
+ # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ import evaluate
+ import datasets
+ # import moses
+ from moses import metrics
+ import pandas as pd
+ from tdc import Evaluator
+ from tdc import Oracle
+
+
+ _DESCRIPTION = """
+ A comprehensive suite of metrics for assessing molecular generation models: how well a model
+ produces novel, chemically valid molecules that are relevant to specific research objectives.
+ """
+
+
+ _KWARGS_DESCRIPTION = """
+ Args:
+     generated_smiles (`list` of `string`): SMILES (Simplified Molecular Input Line Entry System) strings generated by the model, ideally more than 30,000 samples.
+     train_smiles (`list` of `string`): SMILES strings the model was trained on, used as a reference for evaluating the novelty and diversity of the generated molecules.
+
+ Returns:
+     Dictionary containing the computed metrics.
+ """
+
+
+ _CITATION = """
+ @article{DBLP:journals/corr/abs-1811-12823,
+   author    = {Daniil Polykovskiy and
+                Alexander Zhebrak and
+                Benjam{\'{\i}}n S{\'{a}}nchez{-}Lengeling and
+                Sergey Golovanov and
+                Oktai Tatanov and
+                Stanislav Belyaev and
+                Rauf Kurbanov and
+                Aleksey Artamonov and
+                Vladimir Aladinskiy and
+                Mark Veselov and
+                Artur Kadurin and
+                Sergey I. Nikolenko and
+                Al{\'{a}}n Aspuru{-}Guzik and
+                Alex Zhavoronkov},
+   title     = {Molecular Sets {(MOSES):} {A} Benchmarking Platform for Molecular
+                Generation Models},
+   journal   = {CoRR},
+   volume    = {abs/1811.12823},
+   year      = {2018},
+   url       = {http://arxiv.org/abs/1811.12823},
+   eprinttype = {arXiv},
+   eprint    = {1811.12823},
+   timestamp = {Fri, 26 Nov 2021 15:34:30 +0100},
+   biburl    = {https://dblp.org/rec/journals/corr/abs-1811-12823.bib},
+   bibsource = {dblp computer science bibliography, https://dblp.org}
+ }
+ """
+
+
+ @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
+ class MolgenMetric(evaluate.Measurement):
+     """Molecular generation evaluation metrics built on MOSES and PyTDC."""
+
+     def _info(self):
+         return evaluate.MetricInfo(
+             description=_DESCRIPTION,
+             citation=_CITATION,
+             inputs_description=_KWARGS_DESCRIPTION,
+             features=datasets.Features(
+                 {
+                     "generated_smiles": datasets.Sequence(datasets.Value("string")),
+                     "train_smiles": datasets.Sequence(datasets.Value("string")),
+                 }
+                 if self.config_name == "multilabel"
+                 else {
+                     "generated_smiles": datasets.Value("string"),
+                     "train_smiles": datasets.Value("string"),
+                 }
+             ),
+             reference_urls=["https://github.com/molecularsets/moses", "https://tdcommons.ai/functions/oracles/"],
+         )
+
+     def _compute(self, generated_smiles, train_smiles=None):
+         # MOSES covers validity, uniqueness, novelty, FCD, SNN, fragment and scaffold similarity, etc.
+         Results = metrics.get_all_metrics(gen=generated_smiles, train=train_smiles)
+
+         # evaluator = Evaluator(name='Diversity')
+         # Diversity = evaluator(generated_smiles)
+
+         evaluator = Evaluator(name='KL_Divergence')
+         KL_Divergence = evaluator(generated_smiles, train_smiles)
+
+         # evaluator = Evaluator(name='FCD_Distance')
+         # FCD_Distance = evaluator(generated_smiles, train_smiles)
+
+         # evaluator = Evaluator(name='Novelty')
+         # Novelty = evaluator(generated_smiles, train_smiles)
+
+         # evaluator = Evaluator(name='Validity')
+         # Validity = evaluator(generated_smiles)
+
+         Results.update({
+             # "PyTDC_Diversity": Diversity,
+             "KL_Divergence": KL_Divergence,
+             # "PyTDC_FCD_Distance": FCD_Distance,
+             # "PyTDC_Novelty": Novelty,
+             # "PyTDC_Validity": Validity,
+         })
+
+         oracle_list = [
+             'QED', 'SA', 'MPO', 'GSK3B', 'JNK3',
+             'DRD2', 'LogP', 'Rediscovery', 'Similarity',
+             'Median', 'Isomers', 'Valsartan_SMARTS', 'Hop'
+         ]
+
+         # Iterate through each oracle and compute its score
+         for oracle_name in oracle_list:
+             oracle = Oracle(name=oracle_name)
+             score = oracle(generated_smiles)
+             if oracle_name in ['Rediscovery', 'MPO', 'Similarity', 'Median', 'Isomers', 'Hop']:
+                 # These oracles return a dictionary whose values are lists of scores
+                 if isinstance(score, dict):
+                     # Reduce each list of scores to its average
+                     score = {key: sum(values) / len(values) for key, values in score.items()}
+             else:
+                 # The remaining oracles return a list of per-molecule scores
+                 if isinstance(score, list):
+                     # Reduce the list of scores to its average
+                     score = sum(score) / len(score)
+
+             Results.update({oracle_name: score})
+
+         # Drop the scaffold-split ("TestSF") variants before returning
+         keys_to_remove = ["FCD/TestSF", "SNN/TestSF", "Frag/TestSF", "Scaf/TestSF"]
+         for key in keys_to_remove:
+             Results.pop(key, None)
+
+         return {"results": Results}
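For reference, a minimal sketch of calling the metric directly from Python instead of through the Gradio app. The SMILES lists here are hypothetical placeholders; per _KWARGS_DESCRIPTION, a realistic evaluation wants upwards of 30,000 generated samples.

import evaluate

# Load the module from the Hub (same ID as in app.py).
module = evaluate.load("saicharan2804/molgenevalmetric")

# Toy inputs; real runs should use far larger samples.
generated = ["CCO", "c1ccccc1", "CC(=O)Oc1ccccc1C(=O)O"]
train = ["CCN", "c1ccncc1"]

out = module.compute(generated_smiles=generated, train_smiles=train)
print(out["results"])  # dictionary of MOSES, PyTDC, and oracle scores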
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ git+https://github.com/huggingface/evaluate@main
+ git+https://github.com/molecularsets/moses.git
+ rdkit
+ pandas==1.5.3
+ PyTDC
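The dependencies can be installed with pip install -r requirements.txt. Note that evaluate and moses are pulled from GitHub rather than PyPI, and pandas is pinned to 1.5.3, presumably for compatibility with the MOSES/TDC stack.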