Prepare: Remove Constant Columns and Outliers
[1]:
import pandas as pd
import numpy as np
[2]:
# Toy frame for the preprocessing demo:
#   col1 - varying integers
#   col2 - a constant column (all ones, float)
#   col3 - integers with one extreme value (101)
df = pd.DataFrame(
    {
        'col1': [1, 2, 3, 4, 5],
        'col2': np.ones(5),
        'col3': [1, 2, 101, 3, 4],
    }
)
df
[2]:
col1 | col2 | col3 | |
---|---|---|---|
0 | 1 | 1.0 | 1 |
1 | 2 | 1.0 | 2 |
2 | 3 | 1.0 | 101 |
3 | 4 | 1.0 | 3 |
4 | 5 | 1.0 | 4 |
Removing Constant Columns
[3]:
# Both preprocessing helpers used in this notebook come from chemml.preprocessing.
from chemml.preprocessing import ConstantColumns, Outliers
# Apply ConstantColumns to the demo frame; in the displayed result col2
# (every value 1.0) is gone while col1 and col3 are kept.
df1 = ConstantColumns(df)
df1
[3]:
col1 | col3 | |
---|---|---|
0 | 1 | 1 |
1 | 2 | 2 |
2 | 3 | 101 |
3 | 4 | 3 |
4 | 5 | 4 |
Removing outliers based on mean
[4]:
# Run Outliers on the original frame (df, not the constant-free df1) with the
# mean-based strategy. In the displayed result all five rows are still present,
# including the row where col3 == 101.
# NOTE(review): m presumably scales the allowed deviation from the column
# mean - confirm against the chemml documentation.
df_clean = Outliers(df, m=2.0,strategy='mean')
df_clean
[4]:
col1 | col2 | col3 | |
---|---|---|---|
0 | 1 | 1.0 | 1 |
1 | 2 | 1.0 | 2 |
2 | 3 | 1.0 | 101 |
3 | 4 | 1.0 | 3 |
4 | 5 | 1.0 | 4 |
Removing outliers based on median
[5]:
# Same call on the original frame, but with the median-based strategy.
# In the displayed result the row at index 2 (col3 == 101) is no longer present.
# This rebinds df_clean, replacing the value assigned by the mean-based call.
df_clean = Outliers(df, m=2.0,strategy='median')
df_clean
[5]:
col1 | col2 | col3 | |
---|---|---|---|
0 | 1 | 1.0 | 1 |
1 | 2 | 1.0 | 2 |
3 | 4 | 1.0 | 3 |
4 | 5 | 1.0 | 4 |