Added data preprocessing section to readme

2025-10-20 17:43:41 -04:00
parent cf82ddd11d
commit 5e375d1e6d
5 changed files with 30 additions and 11 deletions
@@ -21,8 +21,6 @@ def main():
    # exploratory data analysis
    # draw_plots(df_clean)
    
-    le = get_label_encoder(df_clean)
-    
    # separate features and target
    X, y = separate_features_and_target(df_clean)
    
@@ -41,6 +39,7 @@ def main():
    y_pred = predict_target(model, X_test_normalized)

    # evaluation
+    le = get_label_encoder(df_clean)
    draw_feature_importance(model, X)
    draw_confusion_matrix(y_test, y_pred, le)
    draw_classification_report(y_test, y_pred, le)
@@ -148,19 +147,29 @@ def clean_data(df):
    return df_clean

 def remove_outliers(df):
-    numeric_cols = df.select_dtypes(include=['number']).columns
-    
    df_clean = df.copy()
    
+    numeric_cols = df_clean.select_dtypes(include=[np.number]).columns
+    if len(numeric_cols) == 0:
+        print("No numeric columns detected.")
+        return df_clean
+    
+    mask = np.ones(len(df_clean), dtype=bool)
+    
    for col in numeric_cols:
-        Q1 = df[col].quantile(0.25)
-        Q3 = df[col].quantile(0.75)
+        col_data = pd.to_numeric(df_clean[col], errors='coerce')
+        
+        Q1 = col_data.quantile(0.25)
+        Q3 = col_data.quantile(0.75)
        IQR = Q3 - Q1
-
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
-
-        df_clean = df_clean[(df_clean[col] >= lower_bound) & (df_clean[col] <= upper_bound)]
+        
+        mask &= col_data.between(lower_bound, upper_bound)
+    
+    df_clean = df_clean[mask]
+    
+    print(f"Removed {len(df) - len(df_clean)} outliers across {len(numeric_cols)} numeric columns.")
    
    return df_clean

@@ -24,7 +24,7 @@ The target variable is the **stress level**, indicated as *low*, *moderate* or *
 - Students who study more are more likely to have a higher GPA and more stress.
 - Physical activity has a negative correlation with other activities, one being study and therefore stress.
 - Students who sleep more were less likely to be very stressed.
- Some outliers were observed and will be need to be removed before training for more accurrate results.
+- Some outliers were observed and will need to be removed before training for more accurate results.

 **Figures:**
 ![Feature Distributions Histogram](images/feature_distributions_histogram.png)
@@ -35,4 +35,14 @@ The target variable is the **stress level**, indicated as *low*, *moderate* or *
 ![Extracurricular Boxplot](images/boxplots_extracurricular_hours_per_day.png)
 ![Physical Activity Boxplot](images/boxplots_physical_hours_per_day.png)
 ![Social Boxplot](images/boxplots_social_hours_per_day.png)
-![GPA Boxplot](images/boxplots_gpa.png)
+![GPA Boxplot](images/boxplots_gpa.png)
+
+---
+
+## Data Preprocessing
+
+No missing values or duplicate rows were found in the dataset. Outliers in numeric features were identified using the interquartile range (IQR) method and removed before training. This helps reduce the impact of extreme values and can improve model performance.
+
+![Missing Values](images/missing_values.png)
+![Duplicate Entries](images/duplicate_entries.png)
+![Removed Outliers](images/removed_outliers.png)