{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "from torch_geometric.data import Data\n",
    "from ogb.utils import smiles2graph\n",
    "import os\n",
    "import json\n",
    "from rdkit import RDLogger\n",
    "from rdkit import Chem\n",
    "RDLogger.DisableLog('rdApp.*')\n",
    "from tqdm import tqdm\n",
    "import multiprocessing\n",
    "\n",
    "def write_json(data, filename):\n",
    "    \"\"\"Serialize ``data`` to ``filename`` as indented, human-readable JSON.\n",
    "\n",
    "    Non-ASCII characters are written verbatim (``ensure_ascii=False``), so the\n",
    "    file is opened explicitly as UTF-8 instead of relying on the platform's\n",
    "    default encoding, which may be unable to encode them.\n",
    "\n",
    "    Args:\n",
    "        data: Any JSON-serializable object.\n",
    "        filename: Destination path; an existing file is overwritten.\n",
    "    \"\"\"\n",
    "    with open(filename, 'w', encoding='utf-8') as f:\n",
    "        json.dump(data, f, indent=4, ensure_ascii=False)\n",
    "\n",
    "def read_json(filename):\n",
    "    \"\"\"Load and return the JSON document stored in ``filename``.\n",
    "\n",
    "    The file is decoded as UTF-8, matching what ``write_json`` produces.\n",
    "\n",
    "    Args:\n",
    "        filename: Path of the JSON file to read.\n",
    "\n",
    "    Returns:\n",
    "        The decoded Python object (dict, list, ...).\n",
    "    \"\"\"\n",
    "    with open(filename, 'r', encoding='utf-8') as f:\n",
    "        return json.load(f)\n",
    "\n",
    "def smiles2data(smiles):\n",
    "    \"\"\"Build a torch_geometric ``Data`` graph from a SMILES string.\n",
    "\n",
    "    The string is featurized with OGB's ``smiles2graph``; its ``node_feat``,\n",
    "    ``edge_index`` and ``edge_feat`` entries are converted to tensors and stored\n",
    "    as ``x``, ``edge_index`` and ``edge_attr`` respectively.\n",
    "    \"\"\"\n",
    "    mol_graph = smiles2graph(smiles)\n",
    "    return Data(\n",
    "        x=torch.from_numpy(mol_graph['node_feat']),\n",
    "        edge_index=torch.from_numpy(mol_graph['edge_index']),\n",
    "        edge_attr=torch.from_numpy(mol_graph['edge_feat']),\n",
    "    )\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the SMILES -> graph lookup for the pretraining molecules.\n",
    "root = 'data/pretrain_data'\n",
    "target_file = f'{root}/mol_graph_map.pt'\n",
    "\n",
    "# The saved map doubles as a cache: when it already exists nothing is loaded\n",
    "# or recomputed, so re-running the notebook is cheap.\n",
    "if not os.path.exists(target_file):\n",
    "    # Read the property file only when the graphs actually have to be built.\n",
    "    mol_property_list = read_json(f'{root}/Abstract_property.json')\n",
    "    mol_graph_map = {}\n",
    "    for mol_dict in tqdm(mol_property_list):\n",
    "        smiles = mol_dict['canon_smiles']\n",
    "        mol_graph_map[smiles] = smiles2data(smiles)\n",
    "    torch.save(mol_graph_map, target_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the SMILES -> graph lookup for the downstream action-prediction data.\n",
    "root = 'data/action_data'\n",
    "target_file = f'{root}/mol_graph_map.pt'\n",
    "\n",
    "if not os.path.exists(target_file):\n",
    "    reaction_list = read_json(f'{root}/processed.json')\n",
    "    rxn_keys = ['REACTANT', 'PRODUCT', 'CATALYST', 'SOLVENT']\n",
    "\n",
    "    # Gather every distinct molecule listed under any role of any reaction.\n",
    "    all_mols = {\n",
    "        mol\n",
    "        for rxn in reaction_list\n",
    "        for key in rxn_keys\n",
    "        for mol in rxn[key]\n",
    "    }\n",
    "\n",
    "    # Featurize each unique molecule exactly once.\n",
    "    mol_graph_map = {smiles: smiles2data(smiles) for smiles in all_mols}\n",
    "    torch.save(mol_graph_map, target_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the graph-id -> graph lookup for every synthesis dataset variant.\n",
    "root = 'data/synthesis_data'\n",
    "\n",
    "for folder in [\n",
    "    'USPTO_50K_PtoR',\n",
    "    'USPTO_50K_PtoR_aug20',\n",
    "    'USPTO-MIT_PtoR_aug5',\n",
    "    'USPTO-MIT_RtoP_aug5_mixed',\n",
    "    'USPTO-MIT_RtoP_aug5_separated',\n",
    "    'USPTO_full_pretrain_aug5_masked_token',\n",
    "    ]:\n",
    "    mol_graphid_file = f'{root}/{folder}/mol_graphid_map.json'\n",
    "    target_file = f'{root}/{folder}/idx_graph_map.pt'\n",
    "    map_is_cached = os.path.exists(mol_graphid_file)\n",
    "\n",
    "    # Same caching rule as the other cells: skip a folder whose outputs both\n",
    "    # exist. The graphs are still rebuilt whenever the id map is regenerated,\n",
    "    # so the two files stay in sync.\n",
    "    if map_is_cached and os.path.exists(target_file):\n",
    "        continue\n",
    "\n",
    "    if map_is_cached:\n",
    "        mol_idx_map = read_json(mol_graphid_file)\n",
    "    else:\n",
    "        # Folders with 'pretrain' in their name take their molecules from the\n",
    "        # tgt files, every other folder from the src files.\n",
    "        file = 'tgt' if 'pretrain' in folder else 'src'\n",
    "        mol_set = set()\n",
    "        for mode in ['train', 'val', 'test']:\n",
    "            file_path = f'{root}/{folder}/{mode}/{file}-{mode}.txt'\n",
    "            with open(file_path) as f:\n",
    "                for line in f:\n",
    "                    # Remove all spaces and treat '<separated>' like the '.' separator.\n",
    "                    line = line.strip().replace(' ', '').replace('<separated>', '.')\n",
    "                    mol_set.update(line.split('.'))\n",
    "\n",
    "        # Sort so the assigned graph ids are reproducible: the iteration order\n",
    "        # of a set of strings can differ from one interpreter run to the next.\n",
    "        smi_list = sorted(mol_set)\n",
    "\n",
    "        # Canonicalize in parallel. The context manager shuts the worker\n",
    "        # processes down instead of leaking one pool per folder.\n",
    "        with multiprocessing.Pool(processes=multiprocessing.cpu_count()) as pool:\n",
    "            canon_list = pool.map(func=Chem.CanonSmiles, iterable=smi_list)\n",
    "\n",
    "        # Strings sharing a canonical form share one graph id.\n",
    "        canon_idx_map = {}\n",
    "        mol_idx_map = {}\n",
    "        for smi, canon in zip(smi_list, canon_list):\n",
    "            if canon not in canon_idx_map:\n",
    "                canon_idx_map[canon] = len(canon_idx_map)\n",
    "            mol_idx_map[smi] = canon_idx_map[canon]\n",
    "        write_json(mol_idx_map, mol_graphid_file)\n",
    "\n",
    "    # Build one graph per id, from the first string that maps to that id.\n",
    "    cid_graph_map = {}\n",
    "    for smiles, graph_id in mol_idx_map.items():\n",
    "        if graph_id in cid_graph_map:\n",
    "            continue\n",
    "        cid_graph_map[graph_id] = smiles2data(smiles)\n",
    "    torch.save(cid_graph_map, target_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build the CID -> graph lookup for the ChEBI-20 dataset.\n",
    "root = 'data/ChEBI-20_data'\n",
    "target_file = f'{root}/cid_graph_map.pt'\n",
    "\n",
    "cid_graph_map = {}\n",
    "if not os.path.exists(target_file):\n",
    "    for mode in ['train', 'validation', 'test']:\n",
    "        with open(f'{root}/{mode}.txt') as f:\n",
    "            lines = f.readlines()\n",
    "        # The first line of each split file is skipped (treated as a header).\n",
    "        for line in lines[1:]:\n",
    "            line = line.strip()\n",
    "            if not line:\n",
    "                # Tolerate blank lines instead of failing on the unpacking below.\n",
    "                continue\n",
    "            # Tab-separated record: id, SMILES, then the remainder of the line.\n",
    "            cid, smiles, _ = line.split('\\t', maxsplit=2)\n",
    "            cid_graph_map[cid] = smiles2data(smiles)\n",
    "    torch.save(cid_graph_map, target_file)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "pth20v3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.17"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}