Prepare: Handling Missing Values

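ChemML implements four strategies for handling missing values: ignoring (dropping) the affected rows, replacing the missing entries with zeros, interpolating them, or ignoring (dropping) the affected columns.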

[1]:
import pandas as pd
import numpy as np
from chemml.preprocessing import MissingValues
[2]:
# Toy frame mixing real numbers with several kinds of "missing" markers:
# the strings 'nan' / 'missing', np.inf and a genuine np.nan.
df = pd.DataFrame({
    'col1': [1, 2, 3, 'nan', 'missing', 4, 5, np.nan],
    'col2': [1, 'nan', np.inf, 2, 3, 4, 5, 6],
    'col3': [1, 2, 3, 4, 5, 6, 7, 8],  # fully populated control column
})
df
[2]:
col1 col2 col3
0 1 1 1
1 2 nan 2
2 3 inf 3
3 nan 2 4
4 missing 3 5
5 4 4 6
6 5 5 7
7 NaN 6 8

Strategy 1: Ignoring Rows

[3]:
# Row-wise removal: with both *_as_null flags on, the displayed result keeps
# only the rows free of string placeholders, inf and NaN.
df2 = MissingValues(
    df,
    strategy='ignore_row',
    string_as_null=True,
    inf_as_null=True,
    missing_values=None,
)
df2
[3]:
col1 col2 col3
0 1.0 1.0 1
5 4.0 4.0 6
6 5.0 5.0 7
[4]:
# Rebuild the toy frame from scratch so this strategy starts from the same
# raw input (string placeholders, np.inf and np.nan included).
df = pd.DataFrame({
    'col1': [1, 2, 3, 'nan', 'missing', 4, 5, np.nan],
    'col2': [1, 'nan', np.inf, 2, 3, 4, 5, 6],
    'col3': [1, 2, 3, 4, 5, 6, 7, 8],  # fully populated control column
})
df
[4]:
col1 col2 col3
0 1 1 1
1 2 nan 2
2 3 inf 3
3 nan 2 4
4 missing 3 5
5 4 4 6
6 5 5 7
7 NaN 6 8

Strategy 2: Replacing With Zeros

[5]:
# Zero fill: the displayed result shows every string placeholder, inf and NaN
# entry replaced by 0.0 while all rows and columns are kept.
df2 = MissingValues(
    df,
    strategy='zero',
    string_as_null=True,
    inf_as_null=True,
    missing_values=None,
)
df2
[5]:
col1 col2 col3
0 1.0 1.0 1
1 2.0 0.0 2
2 3.0 0.0 3
3 0.0 2.0 4
4 0.0 3.0 5
5 4.0 4.0 6
6 5.0 5.0 7
7 0.0 6.0 8
[6]:
# Two-column version of the toy frame for the interpolation example; both
# columns contain string placeholders, and np.inf / np.nan appear as well.
df = pd.DataFrame({
    'col1': [1, 2, 3, 'nan', 'missing', 4, 5, np.nan],
    'col2': [1, 'nan', np.inf, 2, 3, 4, 5, 6],
})
df
[6]:
col1 col2
0 1 1
1 2 nan
2 3 inf
3 nan 2
4 missing 3
5 4 4
6 5 5
7 NaN 6

Strategy 3: Interpolate

[7]:
# Interpolation: in the displayed result the gaps are filled with evenly
# spaced values between their neighbours, and the trailing NaN in col1 takes
# the last valid value.
df2 = MissingValues(
    df,
    strategy='interpolate',
    string_as_null=True,
    inf_as_null=True,
    missing_values=None,
)
df2
[7]:
col1 col2
0 1.000000 1.000000
1 2.000000 1.333333
2 3.000000 1.666667
3 3.333333 2.000000
4 3.666667 3.000000
5 4.000000 4.000000
6 5.000000 5.000000
7 5.000000 6.000000
[8]:
# Rebuild the three-column toy frame; only col3 is free of missing markers,
# which makes it the interesting case for column-wise removal.
df = pd.DataFrame({
    'col1': [1, 2, 3, 'nan', 'missing', 4, 5, np.nan],
    'col2': [1, 'nan', np.inf, 2, 3, 4, 5, 6],
    'col3': [1, 2, 3, 4, 5, 6, 7, 8],  # fully populated control column
})
df
[8]:
col1 col2 col3
0 1 1 1
1 2 nan 2
2 3 inf 3
3 nan 2 4
4 missing 3 5
5 4 4 6
6 5 5 7
7 NaN 6 8

Strategy 4: Ignoring Columns

[9]:
# Column-wise removal: the displayed result keeps only col3, the one column
# with no string placeholder, inf or NaN entry.
df2 = MissingValues(
    df,
    strategy='ignore_column',
    string_as_null=True,
    inf_as_null=True,
    missing_values=None,
)
df2
[9]:
col3
0 1
1 2
2 3
3 4
4 5
5 6
6 7
7 8
[ ]: