diff --git a/images/duplicate_entries.png b/images/duplicate_entries.png new file mode 100644 index 0000000..bc39f54 Binary files /dev/null and b/images/duplicate_entries.png differ diff --git a/images/missing_values.png b/images/missing_values.png new file mode 100644 index 0000000..505228f Binary files /dev/null and b/images/missing_values.png differ diff --git a/images/removed_outliers.png b/images/removed_outliers.png new file mode 100644 index 0000000..4db6385 Binary files /dev/null and b/images/removed_outliers.png differ diff --git a/main.py b/main.py index 3373f1b..77087ed 100644 --- a/main.py +++ b/main.py @@ -21,8 +21,6 @@ def main(): # exploratory data analysis # draw_plots(df_clean) - le = get_label_encoder(df_clean) - # separate features and target X, y = separate_features_and_target(df_clean) @@ -41,6 +39,7 @@ def main(): y_pred = predict_target(model, X_test_normalized) # evaluation + le = get_label_encoder(df_clean) draw_feature_importance(model, X) draw_confusion_matrix(y_test, y_pred, le) draw_classification_report(y_test, y_pred, le) @@ -148,19 +147,29 @@ def clean_data(df): return df_clean def remove_outliers(df): - numeric_cols = df.select_dtypes(include=['number']).columns - df_clean = df.copy() + numeric_cols = df_clean.select_dtypes(include=[np.number]).columns + if len(numeric_cols) == 0: + print("No numeric columns detected.") + return df_clean + + mask = np.ones(len(df_clean), dtype=bool) + for col in numeric_cols: - Q1 = df[col].quantile(0.25) - Q3 = df[col].quantile(0.75) + col_data = pd.to_numeric(df_clean[col], errors='coerce') + + Q1 = col_data.quantile(0.25) + Q3 = col_data.quantile(0.75) IQR = Q3 - Q1 - lower_bound = Q1 - 1.5 * IQR upper_bound = Q3 + 1.5 * IQR - - df_clean = df_clean[(df_clean[col] >= lower_bound) & (df_clean[col] <= upper_bound)] + + mask &= col_data.between(lower_bound, upper_bound) + + df_clean = df_clean[mask] + + print(f"Removed {len(df) - len(df_clean)} outliers across {len(numeric_cols)} numeric columns.") 
return df_clean diff --git a/readme.md b/readme.md index f5abb2f..0eabd9a 100644 --- a/readme.md +++ b/readme.md @@ -24,7 +24,7 @@ The target variable is the **stress level**, indicated as *low*, *moderate* or * - Students who study more are more likely to have a higher GPA and more stress. - Physical activity has a negative correlation with other activities, one being study and therefore stress. - Students who sleep more were less likely to be very stressed. -- Some outliers were observed and will be need to be removed before training for more accurrate results. +- Some outliers were observed and will need to be removed before training for more accurate results. **Figures:** ![Feature Distributions Historgram](images/feature_distributions_histogram.png) @@ -35,4 +35,14 @@ The target variable is the **stress level**, indicated as *low*, *moderate* or * ![Sleep Boxplot](images/boxplots_extracurricular_hours_per_day.png) ![Sleep Boxplot](images/boxplots_physical_hours_per_day.png) ![Sleep Boxplot](images/boxplots_social_hours_per_day.png) -![Sleep Boxplot](images/boxplots_gpa.png) \ No newline at end of file +![Sleep Boxplot](images/boxplots_gpa.png) + +--- + +## Data Preprocessing + +No missing values or duplicate rows were found in the dataset. Outliers in numeric features were identified using the interquartile range (IQR) method and removed before training. This helps reduce the impact of extreme values and can improve model performance. + +![Missing Values](images/missing_values.png) +![Duplicate Entries](images/duplicate_entries.png) +![Removed Outliers](images/removed_outliers.png) \ No newline at end of file