Feature importance is now drawn

2025-10-20 17:01:41 -04:00
parent 93c9da88d2
commit cf82ddd11d
3 changed files with 26 additions and 24 deletions
@@ -41,10 +41,8 @@ def main():
    y_pred = predict_target(model, X_test_normalized)

    # evaluation
-    evaluate_model(model, X, y_pred, y_test, le)
-    
+    draw_feature_importance(model, X)
    draw_confusion_matrix(y_test, y_pred, le)
-    
    draw_classification_report(y_test, y_pred, le)

 def get_label_encoder(df):
@@ -91,22 +89,26 @@ def separate_features_and_target(df):
    y = df['Stress_Level'].cat.codes
    return X, y

-def evaluate_model(model, X, y_pred, y_test, le):
-    feature_names = X.columns
-    
-    # Evaluate
-    print("Accuracy:", accuracy_score(y_test, y_pred))
-    print("\nClassification Report:")
-    print(classification_report(y_test, y_pred, target_names=le.classes_))
-    
-    print("\nConfusion Matrix:")
-    print(confusion_matrix(y_test, y_pred))
-    
+def draw_feature_importance(model, X):
    feature_importance = pd.DataFrame({
-        'Feature': feature_names,
-        'Coefficient': model.coef_[0]
+        'Feature': X.columns,
+        'Coefficient': -model.coef_[0]
    })
-    print(feature_importance.sort_values(by='Coefficient', ascending=False))
+
+    feature_importance['abs_coef'] = feature_importance['Coefficient'].abs()
+    feature_importance = feature_importance.sort_values(by='abs_coef', ascending=False) 
+    feature_importance = feature_importance.iloc[::-1]
+
+    colors = ['green' if c > 0 else 'red' for c in feature_importance['Coefficient']]
+
+    plt.figure(figsize=(8,6))
+    plt.barh(feature_importance['Feature'], feature_importance['Coefficient'], color=colors)
+    plt.xlabel("Coefficient (Impact on Stress Level)")
+    plt.ylabel("Feature")
+    plt.title("Feature Importance")
+    plt.axvline(0, color='black', linewidth=0.8)
+    plt.tight_layout()
+    plt.show()

 def train_logistic_regression(X_train, y_train):
    model = LogisticRegression(
@@ -134,13 +136,13 @@ def inspect_data(df):
    print("\n")

 def clean_data(df):
-    print("Missing values:")
-    print(df.isnull().sum())
-    print("\n")
+    # print("Missing values:")
+    # print(df.isnull().sum())
+    # print("\n")
    
-    print("Duplicate rows in dataset:")
-    print(df.duplicated().sum())
-    print("\n")
+    # print("Duplicate rows in dataset:")
+    # print(df.duplicated().sum())
+    # print("\n")
    
    df_clean = df.dropna(inplace=False)
    return df_clean
@@ -24,7 +24,7 @@ The target variable is the **stress level**, indicated as *low*, *moderate* or *
 - Students who study more are more likely to have a higher GPA and more stress.
 - Physical activity has a negative correlation with other activities, one being study and therefore stress.
 - Students who sleep more were less likely to be very stressed.
- No extreme outliers were observed.
+- Some outliers were observed and will need to be removed before training for more accurate results.

 **Figures:**
 ![Feature Distributions Histogram](images/feature_distributions_histogram.png)