diff --git a/images/feature_importance.png b/images/feature_importance.png new file mode 100644 index 0000000..649ad34 Binary files /dev/null and b/images/feature_importance.png differ diff --git a/main.py b/main.py index 5feda4f..3373f1b 100644 --- a/main.py +++ b/main.py @@ -41,10 +41,8 @@ def main(): y_pred = predict_target(model, X_test_normalized) # evaluation - evaluate_model(model, X, y_pred, y_test, le) - + draw_feature_importance(model, X) draw_confusion_matrix(y_test, y_pred, le) - draw_classification_report(y_test, y_pred, le) def get_label_encoder(df): @@ -91,22 +89,26 @@ def separate_features_and_target(df): y = df['Stress_Level'].cat.codes return X, y -def evaluate_model(model, X, y_pred, y_test, le): - feature_names = X.columns - - # Evaluate - print("Accuracy:", accuracy_score(y_test, y_pred)) - print("\nClassification Report:") - print(classification_report(y_test, y_pred, target_names=le.classes_)) - - print("\nConfusion Matrix:") - print(confusion_matrix(y_test, y_pred)) - +def draw_feature_importance(model, X): feature_importance = pd.DataFrame({ - 'Feature': feature_names, - 'Coefficient': model.coef_[0] + 'Feature': X.columns, + 'Coefficient': -model.coef_[0] }) - print(feature_importance.sort_values(by='Coefficient', ascending=False)) + + feature_importance['abs_coef'] = feature_importance['Coefficient'].abs() + feature_importance = feature_importance.sort_values(by='abs_coef', ascending=False) + feature_importance = feature_importance.iloc[::-1] + + colors = ['green' if c > 0 else 'red' for c in feature_importance['Coefficient']] + + plt.figure(figsize=(8,6)) + plt.barh(feature_importance['Feature'], feature_importance['Coefficient'], color=colors) + plt.xlabel("Coefficient (Impact on Stress Level)") + plt.ylabel("Feature") + plt.title("Feature Importance") + plt.axvline(0, color='black', linewidth=0.8) + plt.tight_layout() + plt.show() def train_logistic_regression(X_train, y_train): model = LogisticRegression( @@ -134,13 +136,13 @@ def 
inspect_data(df): print("\n") def clean_data(df): - print("Missing values:") - print(df.isnull().sum()) - print("\n") + # print("Missing values:") + # print(df.isnull().sum()) + # print("\n") - print("Duplicate rows in dataset:") - print(df.duplicated().sum()) - print("\n") + # print("Duplicate rows in dataset:") + # print(df.duplicated().sum()) + # print("\n") df_clean = df.dropna(inplace=False) return df_clean diff --git a/readme.md b/readme.md index ab11b93..f5abb2f 100644 --- a/readme.md +++ b/readme.md @@ -24,7 +24,7 @@ The target variable is the **stress level**, indicated as *low*, *moderate* or * - Students who study more are more likely to have a higher GPA and more stress. - Physical activity has a negative correlation with other activities, one being study and therefore stress. - Students who sleep more were less likely to be very stressed. -- No extreme outliers were observed. +- Some outliers were observed and will need to be removed before training for more accurate results. **Figures:** ![Feature Distributions Historgram](images/feature_distributions_histogram.png)