Added data preprocessing section to readme

This commit is contained in:
Drew Giffin
2025-10-20 17:43:41 -04:00
parent cf82ddd11d
commit 5e375d1e6d
5 changed files with 30 additions and 11 deletions
Binary file not shown.

After

Width:  |  Height:  |  Size: 3.0 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 18 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 3.3 KiB

+18 -9
View File
@@ -21,8 +21,6 @@ def main():
# exploratory data analysis
# draw_plots(df_clean)
le = get_label_encoder(df_clean)
# separate features and target
X, y = separate_features_and_target(df_clean)
@@ -41,6 +39,7 @@ def main():
y_pred = predict_target(model, X_test_normalized)
# evaluation
le = get_label_encoder(df_clean)
draw_feature_importance(model, X)
draw_confusion_matrix(y_test, y_pred, le)
draw_classification_report(y_test, y_pred, le)
@@ -148,19 +147,29 @@ def clean_data(df):
return df_clean
def remove_outliers(df):
numeric_cols = df.select_dtypes(include=['number']).columns
df_clean = df.copy()
numeric_cols = df_clean.select_dtypes(include=[np.number]).columns
if len(numeric_cols) == 0:
print("No numeric columns detected.")
return df_clean
mask = np.ones(len(df_clean), dtype=bool)
for col in numeric_cols:
Q1 = df[col].quantile(0.25)
Q3 = df[col].quantile(0.75)
col_data = pd.to_numeric(df_clean[col], errors='coerce')
Q1 = col_data.quantile(0.25)
Q3 = col_data.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
df_clean = df_clean[(df_clean[col] >= lower_bound) & (df_clean[col] <= upper_bound)]
mask &= col_data.between(lower_bound, upper_bound)
df_clean = df_clean[mask]
print(f"Removed {len(df) - len(df_clean)} outliers across {len(numeric_cols)} numeric columns.")
return df_clean
+12 -2
View File
@@ -24,7 +24,7 @@ The target variable is the **stress level**, indicated as *low*, *moderate* or *
- Students who study more are more likely to have a higher GPA and more stress.
- Physical activity has a negative correlation with other activities, one being study and therefore stress.
- Students who sleep more were less likely to be very stressed.
- Some outliers were observed and will be need to be removed before training for more accurrate results.
- Some outliers were observed and will need to be removed before training for more accurate results.
**Figures:**
![Feature Distributions Histogram](images/feature_distributions_histogram.png)
@@ -35,4 +35,14 @@ The target variable is the **stress level**, indicated as *low*, *moderate* or *
![Extracurricular Hours Boxplot](images/boxplots_extracurricular_hours_per_day.png)
![Physical Activity Hours Boxplot](images/boxplots_physical_hours_per_day.png)
![Social Hours Boxplot](images/boxplots_social_hours_per_day.png)
![GPA Boxplot](images/boxplots_gpa.png)
![GPA Boxplot](images/boxplots_gpa.png)
---
## Data Preprocessing
No missing values or duplicate rows were found in the dataset. Outliers in numeric features were identified using the interquartile range (IQR) method and removed before training. This helps reduce the impact of extreme values and can improve model performance.
![Missing Values](images/missing_values.png)
![Duplicate Entries](images/duplicate_entries.png)
![Removed Outliers](images/removed_outliers.png)