From b5f6069cea6baf22c0caa6070d8d73b713db45c8 Mon Sep 17 00:00:00 2001 From: Drew Giffin Date: Tue, 21 Oct 2025 16:49:11 -0400 Subject: [PATCH] Added modeling section --- main.py | 11 ++++------- readme.md | 9 +++++++-- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/main.py b/main.py index 48017e2..344faea 100644 --- a/main.py +++ b/main.py @@ -41,8 +41,8 @@ def main(): # evaluation le = get_label_encoder(df_clean) # draw_feature_importance(model, X) - draw_confusion_matrix(y_test, y_pred, le) - draw_classification_report(y_test, y_pred, le) + # draw_confusion_matrix(y_test, y_pred, le) + # draw_classification_report(y_test, y_pred, le) evaluate_accuracy(y_test, y_pred) def evaluate_accuracy(y_test, y_pred): @@ -134,12 +134,9 @@ def draw_feature_importance(model, X): plt.show() def train_logistic_regression(X_train, y_train): - model = LogisticRegression( - solver='lbfgs', - max_iter=10000 - ) + model = LogisticRegression() model.fit(X_train, y_train) - return model + return model def load_data(): df = pd.read_csv(data_path, encoding="ascii", delimiter=",") diff --git a/readme.md b/readme.md index 4a50697..e28c2a6 100644 --- a/readme.md +++ b/readme.md @@ -29,12 +29,14 @@ The target variable is the **stress level**, indicated as *low*, *moderate* or * **Figures:** ![Feature Distributions Historgram](images/feature_distributions_histogram.png) ![Scatter Plot Matrix](images/scatter_plot_matrix.png) +![Correlation Heatmap](images/correlation_heatmap.png) ![Study Boxplot](images/boxplots_study_hours_per_day.png) ![Sleep Boxplot](images/boxplots_sleep_hours_per_day.png) ![Sleep Boxplot](images/boxplots_extracurricular_hours_per_day.png) ![Sleep Boxplot](images/boxplots_physical_hours_per_day.png) ![Sleep Boxplot](images/boxplots_social_hours_per_day.png) ![Sleep Boxplot](images/boxplots_gpa.png) +![Feature Importance](images/feature_importance.png) --- @@ -54,5 +56,8 @@ To improve model performance and reduce redundancy, I performed feature engineer - 
**GPA** was removed because it was highly correlated with **study time**, reducing redundant information and potential multicollinearity. - Features such as **extracurricular activity time** and **social time** were removed due to low predictive importance, minimizing noise and helping the model focus on the most relevant factors. -![Correlation Heatmap](images/correlation_heatmap.png) -![Feature Importance](images/feature_importance.png) \ No newline at end of file +--- + +## Modeling + +This model was made using **logistic regression**, which works well in this situation because it models the probability of each class based on the input features, making it effective for categorical outcomes. After experimenting with different hyperparameter settings, including various solvers and iteration limits, I found that removing these explicit settings entirely did not noticeably change the model's performance, indicating that the default configuration worked well enough for this purpose. \ No newline at end of file