{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Neural Fingerprints\n", "\n", "We create atom, bond, and edge tensors from molecule SMILES using `chemml.chem.tensorise_molecules` in order to build neural fingerprints using `chemml.models.NeuralGraphHidden` and `chemml.models.NeuralGraphOutput` modules. These neural fingerprints are then used as features to train a simple feed forward neural network to predict densities of small organic compounds using tensorflow. \n", "\n", "Here we import a sample dataset from the ChemML library which has the SMILES codes for 500 small organic molecules with their densities in $kg/m^3$. " ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "2021-11-09 18:22:23.552207: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory\n", "2021-11-09 18:22:23.552291: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.\n", "/mnt/c/Aatish/UB/Mr. Hachmann/master_chemml_wrapper_v2/chemml/chemml/datasets/base.py:87: FutureWarning: In a future version of pandas all arguments of DataFrame.drop except for the argument 'labels' will be keyword-only\n", " features = df.drop(['smiles', 'density_Kg/m3'],1)\n" ] } ], "source": [ "import numpy as np\n", "from chemml.datasets import load_organic_density\n", "molecules, target, dragon_subset = load_organic_density()\n", "target = np.asarray(target['density_Kg/m3'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Building `chemml.chem.Molecule` objects from molecule SMILES. 
" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "from chemml.chem import Molecule\n", "mol_objs_list = []\n", "for smi in molecules['smiles']:\n", " mol = Molecule(smi, 'smiles')\n", " mol.hydrogens('add')\n", " mol.to_xyz('MMFF', maxIters=10000, mmffVariant='MMFF94s')\n", " mol_objs_list.append(mol)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Molecule tensors can be used to create neural graph fingerprints using `chemml.models`" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Tensorising molecules in batches of 100 ...\n", "500/500 [==================================================] - 1s 1ms/step\n", "Merging batch tensors ... [DONE]\n" ] } ], "source": [ "from chemml.chem import tensorise_molecules\n", "xatoms, xbonds, xedges = tensorise_molecules(molecules=mol_objs_list, max_degree=5, \n", " max_atoms=None, n_jobs=-1, batch_size=100, verbose=True)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Splitting and preprocessing the data" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "from sklearn.model_selection import ShuffleSplit\n", "from sklearn.preprocessing import StandardScaler\n", "y_scale = StandardScaler()\n", "rs = ShuffleSplit(n_splits=1, test_size=.20, random_state=42)\n", "\n", "for train, test in rs.split(mol_objs_list):\n", " xatoms_train = xatoms[train]\n", " xatoms_test = xatoms[test]\n", " xbonds_train = xbonds[train]\n", " xbonds_test = xbonds[test]\n", " xedges_train = xedges[train]\n", " xedges_test = xedges[test]\n", " target_train = target[train]\n", " target_test = target[test]\n", " target_train = y_scale.fit_transform(target_train.reshape(-1,1))" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Training data:\n", "\n", "Atoms: (400, 57, 62)\n", 
"Bonds: (400, 57, 5, 6)\n", "Edges: (400, 57, 5)\n", "Target: (400, 1)\n", "\n", "Testing data:\n", "\n", "Atoms: (100, 57, 62)\n", "Bonds: (100, 57, 5, 6)\n", "Edges: (100, 57, 5)\n", "Target: (100,)\n" ] } ], "source": [ "print('Training data:\\n')\n", "print('Atoms: ',xatoms_train.shape)\n", "print('Bonds: ',xbonds_train.shape)\n", "print('Edges: ',xedges_train.shape)\n", "print('Target: ',target_train.shape)\n", "\n", "print('\\nTesting data:\\n')\n", "print('Atoms: ',xatoms_test.shape)\n", "print('Bonds: ',xbonds_test.shape)\n", "print('Edges: ',xedges_test.shape)\n", "print('Target: ',target_test.shape)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Building the Neural Fingerprints\n", "\n", "The atom, bond, and edge tensors are used here to build neural fingerprints of length 200 with a convolution width of 8 (i.e., the size of the atomic neighborhood which will be considered in the convolution process). " ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "2021-11-09 18:23:03.337511: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory\n", "2021-11-09 18:23:03.337613: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)\n", "2021-11-09 18:23:03.337645: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (Aatish-HP): /proc/driver/nvidia/version does not exist\n", "2021-11-09 18:23:03.337984: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX2 FMA\n", "To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.\n" ] }, { "name": "stdout", "output_type": 
"stream", "text": [ "Neural Fingerprint Shape: (None, 200)\n" ] } ], "source": [ "from chemml.models import NeuralGraphHidden, NeuralGraphOutput\n", "from tensorflow.keras.layers import Input, add\n", "import tensorflow as tf\n", "tf.random.set_seed(42)\n", "\n", "conv_width = 8\n", "fp_length = 200\n", "\n", "num_molecules = xatoms_train.shape[0]\n", "max_atoms = xatoms_train.shape[1]\n", "max_degree = xbonds_train.shape[2]\n", "num_atom_features = xatoms_train.shape[-1]\n", "num_bond_features = xbonds_train.shape[-1]\n", "\n", "# Creating input layers for atoms ,bonds and edge information\n", "atoms0 = Input(name='atom_inputs', shape=(max_atoms, num_atom_features),batch_size=None)\n", "bonds = Input(name='bond_inputs', shape=(max_atoms, max_degree, num_bond_features),batch_size=None)\n", "edges = Input(name='edge_inputs', shape=(max_atoms, max_degree), dtype='int32',batch_size=None)\n", "\n", "# Defining the convolved atom feature layers \n", "atoms1 = NeuralGraphHidden(conv_width, activation='relu', use_bias=False)([atoms0, bonds, edges])\n", "atoms2 = NeuralGraphHidden(conv_width, activation='relu', use_bias=False)([atoms1, bonds, edges])\n", "\n", "# Defining the outputs of each (convolved) atom feature layer to fingerprint\n", "fp_out0 = NeuralGraphOutput(fp_length, activation='softmax')([atoms0,bonds,edges])\n", "fp_out1 = NeuralGraphOutput(fp_length, activation='softmax')([atoms1,bonds,edges])\n", "fp_out2 = NeuralGraphOutput(fp_length, activation='softmax')([atoms2,bonds,edges])\n", "\n", "# Sum outputs to obtain fingerprint \n", "final_fp = add([fp_out0, fp_out1, fp_out2])\n", "print('Neural Fingerprint Shape: ',final_fp.shape)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Building and training the neural network\n", "\n", "Here, we build and train a simple feed forward neural network using `tensorflow.keras` and provide our neural fingerprints as features. 
" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Model: \"model\"\n", "__________________________________________________________________________________________________\n", "Layer (type) Output Shape Param # Connected to \n", "==================================================================================================\n", "atom_inputs (InputLayer) [(None, 57, 62)] 0 \n", "__________________________________________________________________________________________________\n", "bond_inputs (InputLayer) [(None, 57, 5, 6)] 0 \n", "__________________________________________________________________________________________________\n", "edge_inputs (InputLayer) [(None, 57, 5)] 0 \n", "__________________________________________________________________________________________________\n", "neural_graph_hidden (NeuralGrap (None, 57, 8) 2720 atom_inputs[0][0] \n", " bond_inputs[0][0] \n", " edge_inputs[0][0] \n", "__________________________________________________________________________________________________\n", "neural_graph_hidden_1 (NeuralGr (None, 57, 8) 560 neural_graph_hidden[0][0] \n", " bond_inputs[0][0] \n", " edge_inputs[0][0] \n", "__________________________________________________________________________________________________\n", "neural_graph_output (NeuralGrap (None, 200) 13800 atom_inputs[0][0] \n", " bond_inputs[0][0] \n", " edge_inputs[0][0] \n", "__________________________________________________________________________________________________\n", "neural_graph_output_1 (NeuralGr (None, 200) 3000 neural_graph_hidden[0][0] \n", " bond_inputs[0][0] \n", " edge_inputs[0][0] \n", "__________________________________________________________________________________________________\n", "neural_graph_output_2 (NeuralGr (None, 200) 3000 neural_graph_hidden_1[0][0] \n", " bond_inputs[0][0] \n", " edge_inputs[0][0] \n", 
"__________________________________________________________________________________________________\n", "add (Add) (None, 200) 0 neural_graph_output[0][0] \n", " neural_graph_output_1[0][0] \n", " neural_graph_output_2[0][0] \n", "__________________________________________________________________________________________________\n", "dense_layer0 (Dense) (None, 128) 25728 add[0][0] \n", "__________________________________________________________________________________________________\n", "dense_layer1 (Dense) (None, 64) 8256 dense_layer0[0][0] \n", "__________________________________________________________________________________________________\n", "main_prediction (Dense) (None, 1) 65 dense_layer1[0][0] \n", "==================================================================================================\n", "Total params: 57,129\n", "Trainable params: 57,129\n", "Non-trainable params: 0\n", "__________________________________________________________________________________________________\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "2021-11-09 18:23:04.812435: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)\n" ] }, { "data": { "text/plain": [ "" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from tensorflow.keras import Model\n", "from tensorflow.keras.layers import Dense\n", "\n", "# Build and compile model for regression.\n", "# Two L2-regularised ReLU layers (128 and 64 units) are stacked on the summed fingerprint `final_fp`.\n", "dense_layer0 = Dense(128,activation='relu',name='dense_layer0',\n", " kernel_regularizer=tf.keras.regularizers.l2(0.01))(final_fp)\n", "dense_layer1 = Dense(64,activation='relu',name='dense_layer1',\n", " kernel_regularizer=tf.keras.regularizers.l2(0.01))(dense_layer0)\n", "\n", "# Single linear unit for the regression target, fed by dense_layer1.\n", "main_prediction = Dense(1, activation='linear', 
name='main_prediction')(dense_layer1)\n", "model = Model(inputs=[atoms0, bonds, edges], outputs=[main_prediction])\n", "model.compile(optimizer='adam', loss='mae')\n", "\n", "# Show summary\n", "model.summary()\n", "\n", "model.fit([xatoms_train, xbonds_train, xedges_train], target_train, epochs=50,\n", " steps_per_epoch=None, batch_size=None,verbose=False,validation_split=0.1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Predicting the density of the molecules in our test data and evaluating our model based on it. " ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Mean Absolute Error = 16.518 kg/m^3\n", "R squared = 0.935\n" ] } ], "source": [ "from chemml.utils import regression_metrics\n", "\n", "y_pred = model.predict([xatoms_test,xbonds_test,xedges_test])\n", "y_pred = y_scale.inverse_transform(y_pred)\n", "metrics_df = regression_metrics(target_test, list(y_pred.reshape(-1,)))\n", "mae = metrics_df['MAE'].values[0]\n", "r_2 = metrics_df['r_squared'].values[0]\n", "\n", "print(\"Mean Absolute Error = {} kg/m^3\".format(mae.round(3)))\n", "print(\"R squared = {}\".format(r_2.round(3)))" ] } ], "metadata": { "kernelspec": { "display_name": "Python [conda env:chemml_env] *", "language": "python", "name": "conda-env-chemml_env-py" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.12" }, "widgets": { "application/vnd.jupyter.widget-state+json": { "state": {}, "version_major": 2, "version_minor": 0 } } }, "nbformat": 4, "nbformat_minor": 2 }