Prepare: Handling Missing Values
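ChemML implements four strategies for handling missing values: ignoring (removing) the affected rows, replacing missing entries with zeros, interpolating them, or ignoring (removing) the affected columns.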
[1]:
import pandas as pd
import numpy as np
from chemml.preprocessing import MissingValues
[2]:
# Sample frame for the examples: col1 and col2 mix numbers with the string
# placeholders 'nan' / 'missing', np.nan and np.inf; col3 is fully numeric.
df = pd.DataFrame(
    {
        'col1': [1, 2, 3, 'nan', 'missing', 4, 5, np.nan],
        'col2': [1, 'nan', np.inf, 2, 3, 4, 5, 6],
        'col3': [1, 2, 3, 4, 5, 6, 7, 8],
    }
)
df
[2]:
| | col1 | col2 | col3 |
---|---|---|---|
0 | 1 | 1 | 1 |
1 | 2 | nan | 2 |
2 | 3 | inf | 3 |
3 | nan | 2 | 4 |
4 | missing | 3 | 5 |
5 | 4 | 4 | 6 |
6 | 5 | 5 | 7 |
7 | NaN | 6 | 8 |
Strategy 1: Ignoring Rows
[3]:
# 'ignore_row' strategy with string and infinite entries counted as null.
# In the output shown below, only rows 0, 5 and 6 remain.
df2 = MissingValues(
    df,
    strategy='ignore_row',
    string_as_null=True,
    inf_as_null=True,
    missing_values=None,
)
df2
[3]:
| | col1 | col2 | col3 |
---|---|---|---|
0 | 1.0 | 1.0 | 1 |
5 | 4.0 | 4.0 | 6 |
6 | 5.0 | 5.0 | 7 |
[4]:
# Rebuild the raw sample frame before trying the next strategy.
# NOTE(review): presumably needed so the next call starts from unmodified
# data -- confirm whether MissingValues alters its input frame.
df = pd.DataFrame(
    {
        'col1': [1, 2, 3, 'nan', 'missing', 4, 5, np.nan],
        'col2': [1, 'nan', np.inf, 2, 3, 4, 5, 6],
        'col3': [1, 2, 3, 4, 5, 6, 7, 8],
    }
)
df
[4]:
| | col1 | col2 | col3 |
---|---|---|---|
0 | 1 | 1 | 1 |
1 | 2 | nan | 2 |
2 | 3 | inf | 3 |
3 | nan | 2 | 4 |
4 | missing | 3 | 5 |
5 | 4 | 4 | 6 |
6 | 5 | 5 | 7 |
7 | NaN | 6 | 8 |
Strategy 2: Replacing With Zeros
[5]:
# 'zero' strategy with string and infinite entries counted as null.
# In the output shown below, every such entry (and the NaN) appears as 0.0
# and all rows are kept.
df2 = MissingValues(
    df,
    strategy='zero',
    string_as_null=True,
    inf_as_null=True,
    missing_values=None,
)
df2
[5]:
| | col1 | col2 | col3 |
---|---|---|---|
0 | 1.0 | 1.0 | 1 |
1 | 2.0 | 0.0 | 2 |
2 | 3.0 | 0.0 | 3 |
3 | 0.0 | 2.0 | 4 |
4 | 0.0 | 3.0 | 5 |
5 | 4.0 | 4.0 | 6 |
6 | 5.0 | 5.0 | 7 |
7 | 0.0 | 6.0 | 8 |
[6]:
# Fresh sample frame for the interpolation example; only col1 and col2 are
# defined here (no col3).
df = pd.DataFrame(
    {
        'col1': [1, 2, 3, 'nan', 'missing', 4, 5, np.nan],
        'col2': [1, 'nan', np.inf, 2, 3, 4, 5, 6],
    }
)
df
[6]:
| | col1 | col2 |
---|---|---|
0 | 1 | 1 |
1 | 2 | nan |
2 | 3 | inf |
3 | nan | 2 |
4 | missing | 3 |
5 | 4 | 4 |
6 | 5 | 5 |
7 | NaN | 6 |
Strategy 3: Interpolate
[7]:
# 'interpolate' strategy with string and infinite entries counted as null.
# In the output shown below, interior gaps are filled with evenly spaced
# values between their neighbours, and the trailing missing entry in col1
# takes the last valid value (5.0).
df2 = MissingValues(
    df,
    strategy='interpolate',
    string_as_null=True,
    inf_as_null=True,
    missing_values=None,
)
df2
[7]:
| | col1 | col2 |
---|---|---|
0 | 1.000000 | 1.000000 |
1 | 2.000000 | 1.333333 |
2 | 3.000000 | 1.666667 |
3 | 3.333333 | 2.000000 |
4 | 3.666667 | 3.000000 |
5 | 4.000000 | 4.000000 |
6 | 5.000000 | 5.000000 |
7 | 5.000000 | 6.000000 |
[8]:
# Rebuild the three-column sample frame for the column-dropping example.
df = pd.DataFrame(
    {
        'col1': [1, 2, 3, 'nan', 'missing', 4, 5, np.nan],
        'col2': [1, 'nan', np.inf, 2, 3, 4, 5, 6],
        'col3': [1, 2, 3, 4, 5, 6, 7, 8],
    }
)
df
[8]:
| | col1 | col2 | col3 |
---|---|---|---|
0 | 1 | 1 | 1 |
1 | 2 | nan | 2 |
2 | 3 | inf | 3 |
3 | nan | 2 | 4 |
4 | missing | 3 | 5 |
5 | 4 | 4 | 6 |
6 | 5 | 5 | 7 |
7 | NaN | 6 | 8 |
Strategy 4: Ignore Columns
[9]:
# 'ignore_column' strategy with string and infinite entries counted as null.
# In the output shown below, only col3 remains.
df2 = MissingValues(
    df,
    strategy='ignore_column',
    string_as_null=True,
    inf_as_null=True,
    missing_values=None,
)
df2
[9]:
| | col3 |
---|---|
0 | 1 |
1 | 2 |
2 | 3 |
3 | 4 |
4 | 5 |
5 | 6 |
6 | 7 |
7 | 8 |
[ ]: