Merge branch 'feature/report' into development

2025-10-22 11:07:40 -04:00
parent cb2ff18b89 3cf2d55a80
commit ebbd3b2da6
9 changed files with 252 additions and 46 deletions
@@ -19,56 +19,133 @@ def main():
    df_clean = preprocess_data(df)
    
    # exploratory data analysis
-    # draw_graphs(df_clean)
+    # draw_plots(df_clean)
    
    # separate features and target
-    X = df_clean.drop('Stress_Level', axis=1)
-    y_raw = df_clean['Stress_Level']
-        
-    # encode target
-    le = LabelEncoder()
-    y = le.fit_transform(y_raw)
+    X, y = separate_features_and_target(df_clean)
    
    # split into train and test data
-    X_train, X_test, y_train, y_test = train_test_split(
-        X, y, test_size=0.2, stratify=y, random_state=0
-    )   
+    accuracy_scores = []
+    # run training many times using different splits to get an average accuracy score 
+    for i in range(1000):
+        X_train, X_test, y_train, y_test = train_test_split(
+            X, y, test_size=0.2, stratify=y, random_state=i
+        )
    
-    # feature engineering
-    X_train_normalized, X_test_normalized = normalize_features(X_train, X_test)
-    
-    feature_names = X.columns
-    model = train_logistic_regression(X_train_normalized, X_test_normalized, y_train, y_test, le, feature_names)
-    
-    
-    y_pred = model.predict(X_test)
-    
-    # Evaluate
-    print("Accuracy:", accuracy_score(y_test, y_pred))
-    print("\nClassification Report:")
-    print(classification_report(y_test, y_pred, target_names=le.classes_))
-    
-    print("\nConfusion Matrix:")
-    print(confusion_matrix(y_test, y_pred))
-    
-    feature_importance = pd.DataFrame({
-        'Feature': feature_names,
-        'Coefficient': model.coef_[0]
-    })
-    print(feature_importance.sort_values(by='Coefficient', ascending=False))
+        # pre training processing
+        X_train_normalized, X_test_normalized = normalize_features(X_train, X_test)
+        
+        # training
+        model = train_logistic_regression(X_train_normalized, y_train)
+        
+        # prediction
+        y_pred = predict_target(model, X_test_normalized)

-def train_logistic_regression(X_train, X_test, y_train, y_test, le, feature_names):
-    model = LogisticRegression(
-        solver='lbfgs',
-        max_iter=10000
+        # evaluation
+        le = get_label_encoder(df_clean)
+        # draw_feature_importance(model, X)
+        # draw_confusion_matrix(y_test, y_pred, le)
+        # draw_classification_report(y_test, y_pred, le)
+        accuracy = get_accuracy(y_test, y_pred)
+        accuracy_scores.append(accuracy)
+    print(f"Average Accuracy: {np.mean(accuracy_scores):.4f}")
+    print(f"Samples: {len(accuracy_scores)}")
+
+def get_accuracy(y_test, y_pred):
+    accuracy = accuracy_score(y_test, y_pred)
+    return accuracy
+
+def get_label_encoder(df):
+    le = LabelEncoder()
+    le.classes_ = np.array(df['Stress_Level'].cat.categories)
+    return le
+
+def draw_classification_report(y_test, y_pred, le):
+    report = classification_report(
+        y_test, y_pred, output_dict=True, target_names=le.classes_
    )
+    df_report = pd.DataFrame(report).transpose()
+    
+    metrics_df = df_report.loc[le.classes_, ["precision", "recall", "f1-score"]]
+    
+    ax = metrics_df.plot(
+        kind="bar",
+        figsize=(8, 5),
+        rot=0,
+        color=["#4C72B0", "#55A868", "#C44E52"]
+    )
+    
+    plt.title("Classification Report Metrics")
+    plt.ylabel("Score")
+    plt.ylim(0, 1)
+    plt.legend(loc="lower right")
+    
+    for p in ax.patches:
+        height = p.get_height()
+        ax.annotate(
+            f"{height:.2f}",
+            (p.get_x() + p.get_width() / 2, height),
+            ha='center',
+            va='bottom',
+            fontsize=9
+        )
+    
+    plt.tight_layout()
+    plt.show()
+    
+def draw_confusion_matrix(y_test, y_pred, le):
+    y_test_decoded = le.inverse_transform(y_test)
+    y_pred_decoded = le.inverse_transform(y_pred)
+
+    cm = confusion_matrix(y_test_decoded, y_pred_decoded, labels=le.classes_)
+
+    # Plot
+    plt.figure(figsize=(6,5))
+    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=le.classes_,
+                yticklabels=le.classes_)
+    plt.xlabel("Predicted")
+    plt.ylabel("Actual")
+    plt.title("Confusion Matrix")
+    plt.tight_layout()
+    plt.show()
+
+def predict_target(model, X_test):
+    y_pred = model.predict(X_test)
+    return y_pred
+
+def separate_features_and_target(df): 
+    X = df.drop('Stress_Level', axis=1)
+    y = df['Stress_Level'].cat.codes
+    return X, y
+
+def draw_feature_importance(model, X):
+    feature_importance = pd.DataFrame({
+        'Feature': X.columns,
+        'Coefficient': -model.coef_[0]
+    })
+
+    feature_importance['abs_coef'] = feature_importance['Coefficient'].abs()
+    feature_importance = feature_importance.sort_values(by='abs_coef', ascending=False) 
+    feature_importance = feature_importance.iloc[::-1]
+
+    colors = ['green' if c > 0 else 'red' for c in feature_importance['Coefficient']]
+
+    plt.figure(figsize=(8,6))
+    plt.barh(feature_importance['Feature'], feature_importance['Coefficient'], color=colors)
+    plt.xlabel("Coefficient (Impact on Stress Level)")
+    plt.ylabel("Feature")
+    plt.title("Feature Importance")
+    plt.axvline(0, color='black', linewidth=0.8)
+    plt.tight_layout()
+    plt.show()
+
+def train_logistic_regression(X_train, y_train):
+    model = LogisticRegression()
    model.fit(X_train, y_train)
-    return model  
+    return model
 
 def load_data():
    df = pd.read_csv(data_path, encoding="ascii", delimiter=",")
-    #removing uneeded feature
-    df.drop("Student_ID", axis=1, inplace=True)
    return df

 def inspect_data(df):
@@ -89,8 +166,39 @@ def clean_data(df):
    # print(df.isnull().sum())
    # print("\n")
    
-    df.dropna(inplace=False)
-    return df
+    # print("Duplicate rows in dataset:")
+    # print(df.duplicated().sum())
+    # print("\n")
+    
+    df_clean = df.dropna(inplace=False)
+    return df_clean
+
+def remove_outliers(df):
+    df_clean = df.copy()
+    
+    numeric_cols = df_clean.select_dtypes(include=[np.number]).columns
+    if len(numeric_cols) == 0:
+        # print("No numeric columns detected.")
+        return df_clean
+    
+    mask = np.ones(len(df_clean), dtype=bool)
+    
+    for col in numeric_cols:
+        col_data = pd.to_numeric(df_clean[col], errors='coerce')
+        
+        Q1 = col_data.quantile(0.25)
+        Q3 = col_data.quantile(0.75)
+        IQR = Q3 - Q1
+        lower_bound = Q1 - 1.5 * IQR
+        upper_bound = Q3 + 1.5 * IQR
+        
+        mask &= col_data.between(lower_bound, upper_bound)
+    
+    df_clean = df_clean[mask]
+    
+    # print(f"Removed {len(df) - len(df_clean)} outliers across {len(numeric_cols)} numeric columns.")
+    
+    return df_clean

 def order_data_stress_level(df):
    df["Stress_Level"] = pd.Categorical(
@@ -121,20 +229,26 @@ def display_feature_boxplots(df):
        plt.title(f"{col} by Stress Level")
        plt.show()

-def draw_graphs(df):
+def draw_plots(df):
    display_feature_distributions_histogram(df)
    display_scatter_plot_matrix(df)
    display_correlation_heatmap(df)
    display_feature_boxplots(df)

 def preprocess_data(df):
+    # remove unneeded features
+    df.drop("Student_ID", axis=1, inplace=True)
+    df.drop("GPA", axis=1, inplace=True)
+    df.drop("Extracurricular_Hours_Per_Day", axis=1, inplace=True)
+    df.drop("Social_Hours_Per_Day", axis=1, inplace=True)
    df_clean = clean_data(df)
    order_data_stress_level(df_clean)
+    df_clean = remove_outliers(df_clean)
    return df_clean

 def normalize_features(X_train, X_test):
    scaler = MinMaxScaler()
-    X_train_scaled = scaler.fit_transform(X_train)
+    X_train_scaled = scaler.fit_transform(X_train)  # fit only on training data
    X_test_scaled = scaler.transform(X_test)    
    return X_train_scaled, X_test_scaled

@@ -1,4 +1,4 @@
-# Student Habits and Stress Prediction Analysis
+# Student Stress Level Classifier

 A machine learning project that examines the habits of students with the goal of gaining insight into how their daily routines may affect their stress levels. Habits such as studying, extracurricular involvement, sleep, socialization, and physical activity, as well as performance indicators like GPA, are analyzed to understand their correlation with stress.

@@ -24,7 +24,7 @@ The target variable is the **stress level**, indicated as *low*, *moderate* or *
 - Students who study more are more likely to have a higher GPA and more stress.
 - Physical activity has a negative correlation with other activities, one being study and therefore stress.
 - Students who sleep more were less likely to be very stressed.
- No extreme outliers were observed.
+- Some outliers were observed and will need to be removed before training for more accurate results.

 **Figures:**
 ![Feature Distributions Historgram](images/feature_distributions_histogram.png)
@@ -35,4 +35,96 @@ The target variable is the **stress level**, indicated as *low*, *moderate* or *
 ![Sleep Boxplot](images/boxplots_extracurricular_hours_per_day.png)
 ![Sleep Boxplot](images/boxplots_physical_hours_per_day.png)
 ![Sleep Boxplot](images/boxplots_social_hours_per_day.png)
-![Sleep Boxplot](images/boxplots_gpa.png)
+![Sleep Boxplot](images/boxplots_gpa.png)
+![Feature Importance](images/feature_importance.png)
+
+---
+
+## Data Preprocessing
+
+No missing values or duplicate rows were found in the dataset. Outliers in numeric features were identified using the interquartile range (IQR) method and removed before training. This helps reduce the impact of extreme values and can improve model performance.
+
+![Missing Values](images/missing_values.png)
+![Duplicate Entries](images/duplicate_entries.png)
+![Removed Outliers](images/removed_outliers.png)
+
+---
+
+## Feature Engineering
+
+To improve model performance and reduce redundancy, I performed feature engineering before training:
+- **GPA** was removed because it was highly correlated with **study time**, reducing redundant information and potential multicollinearity.
+- Features such as **extracurricular activity time** and **social time** were removed due to low predictive importance, minimizing noise and helping the model focus on the most relevant factors.
+
+---
+
+## Modeling
+
+This model was built using **logistic regression**, which works well in this situation because it models the probability of each class based on the input features, making it effective for categorical outcomes. After experimenting with different hyperparameter settings, including various solvers and iteration limits, I found that removing them entirely did not noticeably change the model's performance, indicating that the default configuration worked well enough for this purpose.
+
+---
+
+## Results
+
+To obtain a more reliable estimate, the model was trained and evaluated on 1,000 different stratified train/test splits, each generated with a different random seed. Averaged across all 1,000 splits, the model achieved an accuracy of **82.6%**, indicating strong performance.
+
+As shown in the classification report and confusion matrix, each class yielded slightly different results. The **high stress** students were the most identifiable, with **precision, recall, and F1-scores all above 86%**. The **moderate stress** group was the most challenging to predict, though it still produced reasonable scores. This is somewhat expected, as students in the moderate category often displayed activity patterns that blended characteristics of both the high and low stress groups, making them more prone to being misclassified into one of the extremes.
+
+Also, because stress is a subjective measurement, some inconsistency in labeling is expected. Students who appear to belong to one stress category based on their activities might self-report differently due to personal coping mechanisms or varying perceptions of what "stress" means to them. This subjectivity likely contributes to occasional misclassifications, even when the model's performance is otherwise strong.
+
+![Classification Report](images/classification_report.png)
+![Confusion Matrix](images/confusion_matrix.png)
+
+---
+
+## Conclusion
+
+Overall, the current state of the model is fairly reliable in predicting high-stress students. The main goal was to identify these students and provide insight into lifestyle adjustments that could help reduce stress without compromising academic performance. For students with GPAs near 4.0, this balance appears more difficult to achieve. Their elevated stress levels are often linked to the amount of time spent studying — the same factor driving their strong performance. For these students, ensuring adequate sleep (at least six hours per night) may be the most effective way to manage stress.
+
+Physical activity showed a small connection to lower stress levels, but this relationship was not consistent across all students. In fact, increased time spent exercising may slightly reduce GPA, suggesting that excessive physical activity could detract from study time.
+
+The process that produced these results was intentionally straightforward: a classification problem addressed using logistic regression to predict stress levels. Future improvements could include experimenting with more complex models such as random forests or gradient boosting. Additionally, although the reported accuracy was averaged over 1,000 random train/test splits, the per-class figures (classification report and confusion matrix) reflect only a single split. Incorporating k-fold cross-validation would likely provide more stable and trustworthy performance estimates for those metrics by reducing the variance introduced by a single random split.
+
+---
+
+## How to Run
+
+1. Clone the repository
+
+    `git clone https://github.com/drewgiffin/student-stress-level-classifier`  
+    `cd student-stress-level-classifier`
+
+2. Create a virtual environment (recommended), and activate it
+
+    `python -m venv venv`
+
+- Windows
+
+    `venv\Scripts\activate`
+
+- macOS / Linux
+
+    `source venv/bin/activate`
+
+3. Install dependencies
+
+    `pip install -r requirements.txt`
+
+4. Run the program
+
+    `python main.py`
+
+### Notes:
+Right now, the program only outputs the average accuracy after running the 1,000 random train/test splits.
+If you want to visualize the results yourself:
+1. Remove the model training loop that stores the accuracy scores.
+
+2. Uncomment the `draw_*` function calls in `main()`.
+
+This allows you to view the analysis graphs instead of just the aggregated accuracy results.
+
+---
+
+## References
+
+- [Study Habits and Activities of Students Dataset - Kaggle](https://www.kaggle.com/datasets/afnansaifafnan/study-habits-and-activities-of-students)