Prepare: Remove Constant Columns and Outliers

[1]:
import pandas as pd
import numpy as np
[2]:
# Toy frame used by every example below:
#   col1 - five distinct integers
#   col2 - the same value (1.0) in every row
#   col3 - small integers plus one much larger value (101) in row 2
df = pd.DataFrame(
    {
        'col1': [1, 2, 3, 4, 5],
        'col2': np.ones(5),
        'col3': [1, 2, 101, 3, 4],
    }
)
df
[2]:
col1 col2 col3
0 1 1.0 1
1 2 1.0 2
2 3 1.0 101
3 4 1.0 3
4 5 1.0 4

Removing Constant Columns

[3]:
# Both preprocessing helpers are imported here; Outliers is used by the
# cells further down.
from chemml.preprocessing import (
    ConstantColumns,
    Outliers,
)

# The result is bound to a new name, df1; the original df is passed to
# Outliers again in the cells below.
# In the output shown below, col2 (1.0 in every row) is gone while col1 and
# col3 are unchanged.
df1 = ConstantColumns(df)
df1
[3]:
col1 col3
0 1 1
1 2 2
2 3 101
3 4 3
4 5 4

Removing outliers based on mean

[4]:
# Outlier filtering with the 'mean' strategy.
# m is presumably the cut-off expressed in standard deviations -- confirm
# against the chemml documentation.
# NOTE: in the output shown below all five rows are kept, including row 2
# where col3 == 101. Presumably the single extreme value inflates the mean
# and the spread enough to stay inside the m=2.0 band -- verify against the
# Outliers implementation.
df_clean = Outliers(
    df,
    m=2.0,
    strategy='mean',
)
df_clean
[4]:
col1 col2 col3
0 1 1.0 1
1 2 1.0 2
2 3 1.0 101
3 4 1.0 3
4 5 1.0 4

Removing outliers based on median

[5]:
# Outlier filtering with the 'median' strategy, same threshold m=2.0.
# m is presumably the cut-off expressed in standard deviations -- confirm
# against the chemml documentation.
# In the output shown below, row 2 (col3 == 101) is dropped and the other
# four rows keep their original index labels (0, 1, 3, 4).
# This rebinds df_clean, replacing the value assigned by the 'mean' example.
df_clean = Outliers(
    df,
    m=2.0,
    strategy='median',
)
df_clean
[5]:
col1 col2 col3
0 1 1.0 1
1 2 1.0 2
3 4 1.0 3
4 5 1.0 4