Added data preprocessing section to readme
This commit is contained in:
Binary file not shown.
|
After Width: | Height: | Size: 3.0 KiB |
Binary file not shown.
|
After Width: | Height: | Size: 18 KiB |
Binary file not shown.
|
After Width: | Height: | Size: 3.3 KiB |
@@ -21,8 +21,6 @@ def main():
|
||||
# exploratory data analysis
|
||||
# draw_plots(df_clean)
|
||||
|
||||
le = get_label_encoder(df_clean)
|
||||
|
||||
# separate features and target
|
||||
X, y = separate_features_and_target(df_clean)
|
||||
|
||||
@@ -41,6 +39,7 @@ def main():
|
||||
y_pred = predict_target(model, X_test_normalized)
|
||||
|
||||
# evaluation
|
||||
le = get_label_encoder(df_clean)
|
||||
draw_feature_importance(model, X)
|
||||
draw_confusion_matrix(y_test, y_pred, le)
|
||||
draw_classification_report(y_test, y_pred, le)
|
||||
@@ -148,19 +147,29 @@ def clean_data(df):
|
||||
return df_clean
|
||||
|
||||
def remove_outliers(df):
|
||||
numeric_cols = df.select_dtypes(include=['number']).columns
|
||||
|
||||
df_clean = df.copy()
|
||||
|
||||
numeric_cols = df_clean.select_dtypes(include=[np.number]).columns
|
||||
if len(numeric_cols) == 0:
|
||||
print("No numeric columns detected.")
|
||||
return df_clean
|
||||
|
||||
mask = np.ones(len(df_clean), dtype=bool)
|
||||
|
||||
for col in numeric_cols:
|
||||
Q1 = df[col].quantile(0.25)
|
||||
Q3 = df[col].quantile(0.75)
|
||||
col_data = pd.to_numeric(df_clean[col], errors='coerce')
|
||||
|
||||
Q1 = col_data.quantile(0.25)
|
||||
Q3 = col_data.quantile(0.75)
|
||||
IQR = Q3 - Q1
|
||||
|
||||
lower_bound = Q1 - 1.5 * IQR
|
||||
upper_bound = Q3 + 1.5 * IQR
|
||||
|
||||
df_clean = df_clean[(df_clean[col] >= lower_bound) & (df_clean[col] <= upper_bound)]
|
||||
|
||||
mask &= col_data.between(lower_bound, upper_bound)
|
||||
|
||||
df_clean = df_clean[mask]
|
||||
|
||||
print(f"Removed {len(df) - len(df_clean)} outliers across {len(numeric_cols)} numeric columns.")
|
||||
|
||||
return df_clean
|
||||
|
||||
|
||||
@@ -24,7 +24,7 @@ The target variable is the **stress level**, indicated as *low*, *moderate* or *
|
||||
- Students who study more are more likely to have a higher GPA and more stress.
|
||||
- Physical activity is negatively correlated with the other activities, including studying, and therefore with stress.
|
||||
- Students who sleep more were less likely to be very stressed.
|
||||
- Some outliers were observed and will be need to be removed before training for more accurrate results.
|
||||
- Some outliers were observed and will need to be removed before training for more accurate results.
|
||||
|
||||
**Figures:**
|
||||

|
||||
@@ -35,4 +35,14 @@ The target variable is the **stress level**, indicated as *low*, *moderate* or *
|
||||

|
||||

|
||||

|
||||

|
||||

|
||||
|
||||
---
|
||||
|
||||
## Data Preprocessing
|
||||
|
||||
No missing values or duplicate rows were found in the dataset. Outliers in numeric features were identified using the interquartile range (IQR) method and removed before training. This helps reduce the impact of extreme values and can improve model performance.
|
||||
|
||||

|
||||

|
||||

|
||||
Reference in New Issue
Block a user