Prepare: Remove Constant Columns and Outliers
[1]:
import numpy as np
import pandas as pd

from chemml.preprocessing import ConstantColumns, Outliers
[2]:
# Toy frame for the demo: col1 varies, col2 is constant (all ones),
# and col3 holds one value (101) that sits far away from the others.
df = pd.DataFrame(
    {
        'col1': [1, 2, 3, 4, 5],
        'col2': np.ones(5),
        'col3': [1, 2, 101, 3, 4],
    }
)
df
[2]:
| | col1 | col2 | col3 |
|---|---|---|---|
| 0 | 1 | 1.0 | 1 |
| 1 | 2 | 1.0 | 2 |
| 2 | 3 | 1.0 | 101 |
| 3 | 4 | 1.0 | 3 |
| 4 | 5 | 1.0 | 4 |
Removing Constant Columns
[3]:
# ConstantColumns (imported in the notebook's import cell) returns a frame
# without the columns whose values never change: in the displayed result
# col2 (all ones) is gone while col1 and col3 remain.
# `df` still shows col2 in the later cells, so the input frame does not
# appear to be modified by this call.
df1 = ConstantColumns(df)
df1
[3]:
| | col1 | col3 |
|---|---|---|
| 0 | 1 | 1 |
| 1 | 2 | 2 |
| 2 | 3 | 101 |
| 3 | 4 | 3 |
| 4 | 5 | 4 |
Removing outliers based on mean
[4]:
# Filter rows of `df` using the column mean as the reference point.
# `m` is presumably the allowed distance from that reference, expressed as a
# multiple of the column's standard deviation -- confirm against the chemml docs.
# Note: with m=2.0 the displayed result still contains the row where
# col3 == 101, i.e. no row is removed for this data. Presumably the extreme
# value itself inflates the mean and the spread -- TODO confirm.
df_clean = Outliers(
    df,
    m=2.0,
    strategy='mean',
)
df_clean
[4]:
| | col1 | col2 | col3 |
|---|---|---|---|
| 0 | 1 | 1.0 | 1 |
| 1 | 2 | 1.0 | 2 |
| 2 | 3 | 1.0 | 101 |
| 3 | 4 | 1.0 | 3 |
| 4 | 5 | 1.0 | 4 |
Removing outliers based on median
[5]:
# Same filter as the mean-based call, but measured from the column median.
# `m` is presumably the allowed distance from the median, expressed as a
# multiple of the column's standard deviation -- confirm against the chemml docs.
# In the displayed result the row at index 2 (col3 == 101) has been dropped
# and the original index labels 0, 1, 3, 4 are kept.
df_clean = Outliers(
    df,
    m=2.0,
    strategy='median',
)
df_clean
[5]:
| | col1 | col2 | col3 |
|---|---|---|---|
| 0 | 1 | 1.0 | 1 |
| 1 | 2 | 1.0 | 2 |
| 3 | 4 | 1.0 | 3 |
| 4 | 5 | 1.0 | 4 |