The average accuracy is now calculated over 1000 training runs, each using a different train/test split

This commit is contained in:
Drew Giffin
2025-10-22 08:57:57 -04:00
parent b5f6069cea
commit 5c15f6204b
3 changed files with 36 additions and 22 deletions
Binary file not shown.

After

Width:  |  Height:  |  Size: 1.4 KiB

+26 -20
View File
@@ -25,29 +25,35 @@ def main():
X, y = separate_features_and_target(df_clean) X, y = separate_features_and_target(df_clean)
# split into train and test data # split into train and test data
X_train, X_test, y_train, y_test = train_test_split( accuracy_scores = []
X, y, test_size=0.2, stratify=y, random_state=0 # run training many times using different splits to get an average accuracy score
) for i in range(1000):
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, stratify=y, random_state=i
)
# pre training processing # pre training processing
X_train_normalized, X_test_normalized = normalize_features(X_train, X_test) X_train_normalized, X_test_normalized = normalize_features(X_train, X_test)
# training # training
model = train_logistic_regression(X_train_normalized, y_train) model = train_logistic_regression(X_train_normalized, y_train)
# prediction # prediction
y_pred = predict_target(model, X_test_normalized) y_pred = predict_target(model, X_test_normalized)
# evaluation # evaluation
le = get_label_encoder(df_clean) le = get_label_encoder(df_clean)
# draw_feature_importance(model, X) # draw_feature_importance(model, X)
# draw_confusion_matrix(y_test, y_pred, le) # draw_confusion_matrix(y_test, y_pred, le)
# draw_classification_report(y_test, y_pred, le) # draw_classification_report(y_test, y_pred, le)
evaluate_accuracy(y_test, y_pred) accuracy = get_accuracy(y_test, y_pred)
accuracy_scores.append(accuracy)
print(f"Average Accuracy: {np.mean(accuracy_scores):.4f}")
print(f"Samples: {len(accuracy_scores)}")
def evaluate_accuracy(y_test, y_pred): def get_accuracy(y_test, y_pred):
acc = accuracy_score(y_test, y_pred) accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {acc:.4f}") return accuracy
def get_label_encoder(df): def get_label_encoder(df):
le = LabelEncoder() le = LabelEncoder()
+10 -2
View File
@@ -46,7 +46,7 @@ No missing values or duplicate rows were found in the dataset. Outliers in numer
![Missing Values](images/missing_values.png) ![Missing Values](images/missing_values.png)
![Duplicate Entries](images/duplicate_entries.png) ![Duplicate Entries](images/duplicate_entries.png)
![Duplicate Entries](images/removed_outliers.png) ![Removed Outliers](images/removed_outliers.png)
--- ---
@@ -60,4 +60,12 @@ To improve model performance and reduce redundancy, I performed feature engineer
## Modeling ## Modeling
This model was built using **logistic regression**, which works well in this situation because it models the probability of each class based on the input features, making it effective for categorical outcomes. After experimenting with different hyperparameter settings, including various solvers and iteration limits, I found that removing them entirely did not noticeably change the model's performance, indicating that the default configuration worked well enough for this purpose. This model was built using **logistic regression**, which works well in this situation because it models the probability of each class based on the input features, making it effective for categorical outcomes. After experimenting with different hyperparameter settings, including various solvers and iteration limits, I found that removing them entirely did not noticeably change the model's performance, indicating that the default configuration worked well enough for this purpose.
---
## Results
![Accuracy](images/accuracy.png)
![Classification Report](images/classification_report.png)
![Confusion Matrix](images/confusion_matrix.png)