{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Neural Fingerprints\n",
    "\n",
    "We create atom, bond, and edge tensors from molecule SMILES using `chemml.chem.tensorise_molecules` in order to build neural fingerprints with the `chemml.models.NeuralGraphHidden` and `chemml.models.NeuralGraphOutput` layers. These neural fingerprints are then used as features to train a simple feed-forward neural network in TensorFlow that predicts the densities of small organic compounds.\n",
    "\n",
    "Here we import a sample dataset from the ChemML library, which contains the SMILES codes of 500 small organic molecules together with their densities in $kg/m^3$. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "from chemml.datasets import load_organic_density\n",
    "\n",
    "# load_organic_density() is unpacked into three objects; `dragon_subset` is not\n",
    "# used again in the cells of this notebook.\n",
    "molecules, density_table, dragon_subset = load_organic_density()\n",
    "\n",
    "# Keep the densities as a NumPy array so they can be indexed with the split indices later on.\n",
    "target = np.asarray(density_table['density_Kg/m3'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Next, we build `chemml.chem.Molecule` objects from the molecule SMILES, add hydrogens, and run `to_xyz` with the MMFF94s force field. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from chemml.chem import Molecule\n",
    "\n",
    "\n",
    "def smiles_to_molecule(smiles):\n",
    "    \"\"\"Build a `Molecule` from a SMILES string, add hydrogens and run `to_xyz` with MMFF94s.\"\"\"\n",
    "    molecule = Molecule(smiles, 'smiles')\n",
    "    molecule.hydrogens('add')\n",
    "    molecule.to_xyz('MMFF', maxIters=10000, mmffVariant='MMFF94s')\n",
    "    return molecule\n",
    "\n",
    "\n",
    "# One Molecule per SMILES entry, in the same order as the dataset.\n",
    "mol_objs_list = [smiles_to_molecule(smi) for smi in molecules['smiles']]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The `Molecule` objects are now converted into atom, bond, and edge tensors, which can then be used to create neural graph fingerprints using `chemml.models`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Tensorising molecules in batches of 100 ...\n",
      "\u001b[1m500/500\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m7s\u001b[0m 14ms/step \n",
      "Merging batch tensors ...    [DONE]\n"
     ]
    }
   ],
   "source": [
    "from chemml.chem import tensorise_molecules\n",
    "\n",
    "# Options forwarded unchanged to tensorise_molecules.\n",
    "tensorise_options = dict(max_degree=5, max_atoms=None, n_jobs=-1, batch_size=100, verbose=True)\n",
    "\n",
    "# Convert the Molecule objects into atom, bond and edge tensors.\n",
    "xatoms, xbonds, xedges = tensorise_molecules(molecules=mol_objs_list, **tensorise_options)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Splitting and preprocessing the data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import ShuffleSplit\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "\n",
    "# A single shuffled split with 20% of the molecules held out for testing;\n",
    "# the fixed random_state keeps the split reproducible.\n",
    "splitter = ShuffleSplit(n_splits=1, test_size=.20, random_state=42)\n",
    "train, test = next(splitter.split(mol_objs_list))\n",
    "\n",
    "# Apply the same index arrays to every tensor and to the target.\n",
    "xatoms_train, xatoms_test = xatoms[train], xatoms[test]\n",
    "xbonds_train, xbonds_test = xbonds[train], xbonds[test]\n",
    "xedges_train, xedges_test = xedges[train], xedges[test]\n",
    "target_train, target_test = target[train], target[test]\n",
    "\n",
    "# Only the training targets are standardised; the test targets stay in their original\n",
    "# units and the predictions are mapped back with y_scale before evaluation.\n",
    "y_scale = StandardScaler()\n",
    "target_train = y_scale.fit_transform(target_train.reshape(-1,1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Training data:\n",
      "\n",
      "Atoms:  (400, 57, 62)\n",
      "Bonds:  (400, 57, 5, 6)\n",
      "Edges:  (400, 57, 5)\n",
      "Target:  (400, 1)\n",
      "\n",
      "Testing data:\n",
      "\n",
      "Atoms:  (100, 57, 62)\n",
      "Bonds:  (100, 57, 5, 6)\n",
      "Edges:  (100, 57, 5)\n",
      "Target:  (100,)\n"
     ]
    }
   ],
   "source": [
    "shape_labels = ('Atoms: ', 'Bonds: ', 'Edges: ', 'Target: ')\n",
    "datasets = (\n",
    "    ('Training data:\\n', (xatoms_train, xbonds_train, xedges_train, target_train)),\n",
    "    ('\\nTesting data:\\n', (xatoms_test, xbonds_test, xedges_test, target_test)),\n",
    ")\n",
    "\n",
    "# Print the shape of every array, grouped by split.\n",
    "for heading, arrays in datasets:\n",
    "    print(heading)\n",
    "    for label, array in zip(shape_labels, arrays):\n",
    "        print(label, array.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Building the Neural Fingerprints\n",
    "\n",
    "The atom, bond, and edge tensors are used here to build a neural fingerprint of length 200 using graph convolution layers of width 8 (i.e., each atom is represented by 8 features after every convolution step). "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "ename": "AttributeError",
     "evalue": "property 'trainable_weights' of 'NeuralGraphHidden' object has no setter",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mAttributeError\u001b[0m                            Traceback (most recent call last)",
      "Cell \u001b[1;32mIn[6], line 21\u001b[0m\n\u001b[0;32m     18\u001b[0m edges \u001b[38;5;241m=\u001b[39m Input(name\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124medge_inputs\u001b[39m\u001b[38;5;124m'\u001b[39m, shape\u001b[38;5;241m=\u001b[39m(max_atoms, max_degree), dtype\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mint32\u001b[39m\u001b[38;5;124m'\u001b[39m,batch_size\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m)\n\u001b[0;32m     20\u001b[0m \u001b[38;5;66;03m# Defining the convolved atom feature layers                                    \u001b[39;00m\n\u001b[1;32m---> 21\u001b[0m atoms1 \u001b[38;5;241m=\u001b[39m \u001b[43mNeuralGraphHidden\u001b[49m\u001b[43m(\u001b[49m\u001b[43mconv_width\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mactivation\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mrelu\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43muse_bias\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m(\u001b[49m\u001b[43m[\u001b[49m\u001b[43matoms0\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbonds\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43medges\u001b[49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m     22\u001b[0m atoms2 \u001b[38;5;241m=\u001b[39m NeuralGraphHidden(conv_width, activation\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mrelu\u001b[39m\u001b[38;5;124m'\u001b[39m, use_bias\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m)([atoms1, bonds, edges])\n\u001b[0;32m     24\u001b[0m \u001b[38;5;66;03m# Defining the outputs of each (convolved) atom feature layer to fingerprint\u001b[39;00m\n",
      "File \u001b[1;32mc:\\Users\\nitin\\anaconda3\\envs\\chemml_dev_env\\Lib\\site-packages\\keras\\src\\utils\\traceback_utils.py:122\u001b[0m, in \u001b[0;36mfilter_traceback.<locals>.error_handler\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m    119\u001b[0m     filtered_tb \u001b[38;5;241m=\u001b[39m _process_traceback_frames(e\u001b[38;5;241m.\u001b[39m__traceback__)\n\u001b[0;32m    120\u001b[0m     \u001b[38;5;66;03m# To get the full stack trace, call:\u001b[39;00m\n\u001b[0;32m    121\u001b[0m     \u001b[38;5;66;03m# `keras.config.disable_traceback_filtering()`\u001b[39;00m\n\u001b[1;32m--> 122\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m e\u001b[38;5;241m.\u001b[39mwith_traceback(filtered_tb) \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m    123\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[0;32m    124\u001b[0m     \u001b[38;5;28;01mdel\u001b[39;00m filtered_tb\n",
      "File \u001b[1;32mc:\\users\\nitin\\documents\\ub\\hachmann_group\\chemml_dev_nitin\\chemml\\chemml\\models\\graphconvlayers.py:214\u001b[0m, in \u001b[0;36mNeuralGraphHidden.build\u001b[1;34m(self, inputs_shape)\u001b[0m\n\u001b[0;32m    212\u001b[0m \u001b[38;5;66;03m# Store inner_3D_layer and it's weights\u001b[39;00m\n\u001b[0;32m    213\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39minner_3D_layers\u001b[38;5;241m.\u001b[39mappend(inner_3D_layer)\n\u001b[1;32m--> 214\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtrainable_weights\u001b[49m \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m inner_3D_layer\u001b[38;5;241m.\u001b[39mtrainable_weights\n",
      "\u001b[1;31mAttributeError\u001b[0m: property 'trainable_weights' of 'NeuralGraphHidden' object has no setter"
     ]
    }
   ],
   "source": [
    "from chemml.models import NeuralGraphHidden, NeuralGraphOutput\n",
    "from tensorflow.keras.layers import Input, add\n",
    "import tensorflow as tf\n",
    "tf.random.set_seed(42)\n",
    "\n",
    "conv_width = 8\n",
    "fp_length = 200\n",
    "\n",
    "# Tensor dimensions are read off the training arrays.\n",
    "num_molecules, max_atoms = xatoms_train.shape[:2]\n",
    "num_atom_features = xatoms_train.shape[-1]\n",
    "max_degree = xbonds_train.shape[2]\n",
    "num_bond_features = xbonds_train.shape[-1]\n",
    "\n",
    "# Input layers for the atom, bond and edge information\n",
    "atoms0 = Input(name='atom_inputs',\n",
    "               shape=(max_atoms, num_atom_features),\n",
    "               batch_size=None)\n",
    "bonds = Input(name='bond_inputs',\n",
    "              shape=(max_atoms, max_degree, num_bond_features),\n",
    "              batch_size=None)\n",
    "edges = Input(name='edge_inputs',\n",
    "              shape=(max_atoms, max_degree),\n",
    "              dtype='int32',\n",
    "              batch_size=None)\n",
    "\n",
    "# Two stacked graph-convolution layers; every layer receives the same bond and edge inputs\n",
    "graph_context = [bonds, edges]\n",
    "atoms1 = NeuralGraphHidden(conv_width, activation='relu', use_bias=False)([atoms0] + graph_context)\n",
    "atoms2 = NeuralGraphHidden(conv_width, activation='relu', use_bias=False)([atoms1] + graph_context)\n",
    "\n",
    "# One fingerprint output per atom representation (raw, first and second convolution)\n",
    "fp_outputs = [\n",
    "    NeuralGraphOutput(fp_length, activation='softmax')([atom_layer] + graph_context)\n",
    "    for atom_layer in (atoms0, atoms1, atoms2)\n",
    "]\n",
    "\n",
    "# Sum the per-layer outputs to obtain the final fingerprint\n",
    "final_fp = add(fp_outputs)\n",
    "print('Neural Fingerprint Shape: ',final_fp.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Building and training the neural network\n",
    "\n",
    "Here, we build and train a simple feed-forward neural network using `tensorflow.keras` and provide our neural fingerprints as features. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Model: \"model\"\n",
      "__________________________________________________________________________________________________\n",
      "Layer (type)                    Output Shape         Param #     Connected to                     \n",
      "==================================================================================================\n",
      "atom_inputs (InputLayer)        [(None, 57, 62)]     0                                            \n",
      "__________________________________________________________________________________________________\n",
      "bond_inputs (InputLayer)        [(None, 57, 5, 6)]   0                                            \n",
      "__________________________________________________________________________________________________\n",
      "edge_inputs (InputLayer)        [(None, 57, 5)]      0                                            \n",
      "__________________________________________________________________________________________________\n",
      "neural_graph_hidden (NeuralGrap (None, 57, 8)        2720        atom_inputs[0][0]                \n",
      "                                                                 bond_inputs[0][0]                \n",
      "                                                                 edge_inputs[0][0]                \n",
      "__________________________________________________________________________________________________\n",
      "neural_graph_hidden_1 (NeuralGr (None, 57, 8)        560         neural_graph_hidden[0][0]        \n",
      "                                                                 bond_inputs[0][0]                \n",
      "                                                                 edge_inputs[0][0]                \n",
      "__________________________________________________________________________________________________\n",
      "neural_graph_output (NeuralGrap (None, 200)          13800       atom_inputs[0][0]                \n",
      "                                                                 bond_inputs[0][0]                \n",
      "                                                                 edge_inputs[0][0]                \n",
      "__________________________________________________________________________________________________\n",
      "neural_graph_output_1 (NeuralGr (None, 200)          3000        neural_graph_hidden[0][0]        \n",
      "                                                                 bond_inputs[0][0]                \n",
      "                                                                 edge_inputs[0][0]                \n",
      "__________________________________________________________________________________________________\n",
      "neural_graph_output_2 (NeuralGr (None, 200)          3000        neural_graph_hidden_1[0][0]      \n",
      "                                                                 bond_inputs[0][0]                \n",
      "                                                                 edge_inputs[0][0]                \n",
      "__________________________________________________________________________________________________\n",
      "add (Add)                       (None, 200)          0           neural_graph_output[0][0]        \n",
      "                                                                 neural_graph_output_1[0][0]      \n",
      "                                                                 neural_graph_output_2[0][0]      \n",
      "__________________________________________________________________________________________________\n",
      "dense_layer0 (Dense)            (None, 128)          25728       add[0][0]                        \n",
      "__________________________________________________________________________________________________\n",
      "dense_layer1 (Dense)            (None, 64)           8256        dense_layer0[0][0]               \n",
      "__________________________________________________________________________________________________\n",
      "main_prediction (Dense)         (None, 1)            65          dense_layer1[0][0]               \n",
      "==================================================================================================\n",
      "Total params: 57,129\n",
      "Trainable params: 57,129\n",
      "Non-trainable params: 0\n",
      "__________________________________________________________________________________________________\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2021-11-09 18:23:04.812435: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<keras.callbacks.History at 0x7feda4198be0>"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from tensorflow.keras import Model\n",
    "from tensorflow.keras.layers import Dense\n",
    "\n",
    "# Build and compile the regression model: two L2-regularised hidden layers on top of the\n",
    "# neural fingerprint, followed by a single linear output unit.\n",
    "dense_layer0 = Dense(128,activation='relu',name='dense_layer0',\n",
    "                     kernel_regularizer=tf.keras.regularizers.l2(0.01))(final_fp)\n",
    "dense_layer1 = Dense(64,activation='relu',name='dense_layer1',\n",
    "                     kernel_regularizer=tf.keras.regularizers.l2(0.01))(dense_layer0)\n",
    "\n",
    "main_prediction = Dense(1, activation='linear', name='main_prediction')(dense_layer1)\n",
    "model = Model(inputs=[atoms0, bonds, edges], outputs=[main_prediction])\n",
    "\n",
    "# The loss is the mean absolute error on the standardised training targets.\n",
    "model.compile(optimizer='adam', loss='mae')\n",
    "\n",
    "# Show summary\n",
    "model.summary()\n",
    "\n",
    "# Train for 50 epochs, holding out 10% of the training data for validation.\n",
    "model.fit([xatoms_train, xbonds_train, xedges_train], target_train, epochs=50,\n",
    "          steps_per_epoch=None, batch_size=None,verbose=False,validation_split=0.1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Predicting the density of the molecules in our test data and evaluating our model based on it. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Mean Absolute Error = 16.518 kg/m^3\n",
      "R squared = 0.935\n"
     ]
    }
   ],
   "source": [
    "from chemml.utils import regression_metrics\n",
    "\n",
    "# The network was trained on standardised targets, so its predictions are mapped back\n",
    "# with y_scale before they are compared with the unscaled test targets.\n",
    "y_pred_scaled = model.predict([xatoms_test,xbonds_test,xedges_test])\n",
    "y_pred = y_scale.inverse_transform(y_pred_scaled)\n",
    "\n",
    "metrics_df = regression_metrics(target_test, list(y_pred.reshape(-1,)))\n",
    "mae, r_2 = (metrics_df[column].values[0] for column in ('MAE', 'r_squared'))\n",
    "\n",
    "print(\"Mean Absolute Error = {} kg/m^3\".format(mae.round(3)))\n",
    "print(\"R squared = {}\".format(r_2.round(3)))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "chemml_dev_env",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.5"
  },
  "widgets": {
   "application/vnd.jupyter.widget-state+json": {
    "state": {},
    "version_major": 2,
    "version_minor": 0
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}