The average accuracy is calculated over 1000 training runs

2025-10-22 08:57:57 -04:00
parent b5f6069cea
commit 5c15f6204b
3 changed files with 36 additions and 22 deletions
@@ -25,29 +25,35 @@ def main():
    X, y = separate_features_and_target(df_clean)
    # split into train and test data
-    X_train, X_test, y_train, y_test = train_test_split(
+    accuracy_scores = []
-        X, y, test_size=0.2, stratify=y, random_state=0
+    # run training many times using different splits to get an average accuracy score 
-    )
+    for i in range(1000):
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, stratify=y, random_state=i
        )
-    # pre training processing
+        # pre training processing
-    X_train_normalized, X_test_normalized = normalize_features(X_train, X_test)
+        X_train_normalized, X_test_normalized = normalize_features(X_train, X_test)
-    
+        
-    # training
+        # training
-    model = train_logistic_regression(X_train_normalized, y_train)
+        model = train_logistic_regression(X_train_normalized, y_train)
-    
+        
-    # prediction
+        # prediction
-    y_pred = predict_target(model, X_test_normalized)
+        y_pred = predict_target(model, X_test_normalized)
-    # evaluation
+        # evaluation
-    le = get_label_encoder(df_clean)
+        le = get_label_encoder(df_clean)
-    # draw_feature_importance(model, X)
+        # draw_feature_importance(model, X)
-    # draw_confusion_matrix(y_test, y_pred, le)
+        # draw_confusion_matrix(y_test, y_pred, le)
-    # draw_classification_report(y_test, y_pred, le)
+        # draw_classification_report(y_test, y_pred, le)
-    evaluate_accuracy(y_test, y_pred)
+        accuracy = get_accuracy(y_test, y_pred)
        accuracy_scores.append(accuracy)
    print(f"Average Accuracy: {np.mean(accuracy_scores):.4f}")
    print(f"Samples: {len(accuracy_scores)}")
-def evaluate_accuracy(y_test, y_pred):
+def get_accuracy(y_test, y_pred):
-    acc = accuracy_score(y_test, y_pred)
+    accuracy = accuracy_score(y_test, y_pred)
-    print(f"Model Accuracy: {acc:.4f}")
+    return accuracy
 def get_label_encoder(df):
    le = LabelEncoder()
@@ -46,7 +46,7 @@ No missing values or duplicate rows were found in the dataset. Outliers in numer
 ![Missing Values](images/missing_values.png)
 ![Duplicate Entries](images/duplicate_entries.png)
-![Duplicate Entries](images/removed_outliers.png)
+![Removed Outliers](images/removed_outliers.png)
 ---
@@ -60,4 +60,12 @@ To improve model performance and reduce redundancy, I performed feature engineer
 ## Modeling
-This model was made using **logistic regression**, it works well in this situation because it models the probability of each class based on the input features, making it effective for categorical outcomes. After experimenting with different hyperparameter settings, including various solvers and iteration limits, I found that removing them entirely did not noticeably change the model's performance, indicating that the default configuration worked well enough for this purpose.
+This model was made using **logistic regression**, which works well in this situation because it models the probability of each class based on the input features, making it effective for categorical outcomes. After experimenting with different hyperparameter settings, including various solvers and iteration limits, I found that removing these custom settings entirely did not noticeably change the model's performance, indicating that the default configuration worked well enough for this purpose.
 ---
 ## Results
 ![Accuracy](images/accuracy.png)
 ![Classification Report](images/classification_report.png)
 ![Confusion Matrix](images/confusion_matrix.png)