From 5c15f6204b7571547de7a5b39290bc77470ef868 Mon Sep 17 00:00:00 2001
From: Drew Giffin <drewgiffin97@gmail.com>
Date: Wed, 22 Oct 2025 08:57:57 -0400
Subject: [PATCH] The average accuracy is calculated after 1000 trainings

---
 images/accuracy.png | Bin 0 -> 1390 bytes
 main.py             |  46 +++++++++++++++++++++++++-------------------
 readme.md           |  12 ++++++++++--
 3 files changed, 36 insertions(+), 22 deletions(-)
 create mode 100644 images/accuracy.png
diff --git a/images/accuracy.png b/images/accuracy.png
new file mode 100644
index 0000000000000000000000000000000000000000..ef0b6b52c3e4f4ecd6bc03e1f872df156c5eb212
GIT binary patch
literal 1390
zcmV-!1(EuRP)<h;3K|Lk000e1NJLTq006K6001Kh1^@s6G(~{300001b5ch_0Itp)
z=>Px#1ZP1_K>z@;j|==^1poj532;bRa{vGi!vFvd!vV){sAK>D1p`S$K~#8N?V8(_
z;~)%%w~T2l<+u>H;3`tU^yEcC0wE-9lbt>BL=WFNlL&PEA-DPS_4Ty}!CeUf2e&2!
z9Ne0*fFJAr{Qlm*pX>gx?04t4*EhvuU!SQb&=+7^j5QX^zVRMp+ZVQRpNVae`+#<D
zYbTpCqoIHw>vMmuODgEOuMd08o%!wc&9H1qEJd_gJkRyftm(<3m%^)lz$W^;Y)Pcz
zF^RhubpqFJ`<mu7A>f)+9yAp}PRr@)p7T#I6a{|3Uw_)vz9wOPx&dy2A8(9PAA>bj
zzm46rRe!t=>!f_<Yfo754(9++-_G^~#nc{V%=hpW3gQ_1%(yuxt{wU!d)BO@xT73=
z0b?=L3}{So&?W%rLDj_bV0idw6AhJ#J=S#6N&}32>{FfCQM@5;ee_rjq~O|)1w2!`
zAFrQ7+l#uUeQto+2gq9;6oXtJtsC8{P2TW#sx8&ehtrBRC1Bb75{R1E+MwZVgLNta
z(G)CFCL%?os4-;0fE@5RUU|;IuWxh+j%`B$U#@)=Z&br1+&dCr7Sjd4MezRmrVkva
z)L`@}fpt1Fbp+fG100t|%I3r0Iw1|&8y^~LuNYz}64z0K4&rq&3Zwe;fsu@C#{#}y
z`zT%l_t6|cJVFRl6ak8ZC1VU|ghzgKz1WW<-^O#-1#C4GC{F6Iz_}@G1AmBlP|~J{
zFY`DX&t*`bCB(`#+c^P`Y9GY|svZ3M>rR4wQMV`G%5t5?+b3p&HFY;L)UKX0W{P_+
z3D`<nB?(URCwLhwH`Kccj)qtUzybq5MnRQvJePy|a<lE2zAfic>Z4oetbonh)8kPt
z2-cEjVGcO^KHneeXYr+K0ysYzkDikU2rnI4lWWl873)NP`Z0j}&H5Yx+s)R+c+H_E
zfQ{fi{t&rH-a}uVkZ&wX_yejRKc2^#`r3Tj^{$PF3#5iGss=BwQSH55`^j^#eP0zY
z(Dt+QYY7<O1IrXU<)A5cHNTQ;KUYM>bw9+t4cIVs1L1S_iT*r5AfMZd_ZKjp>{h_}
zMjE`Veh~QTbm@luUA(`56a8C=5O3*;ekuPjZXw_x@CX41w<ZJ}+?o(@aBJR1z|D7^
zAN;90xW+)h{TI+L|5SMb6!KdLnCkC7<$GVezhVn!aRi*3?fr7}le~q1>APp^AF12#
zU9-r!PqVgQ9$UbNzjUeFgog}+-&V4v{Y(AXG4BDN@@DV9&$R`7;b!Z6QoPQmM_Nzy
zU@lU?MqhZ%tgXG>gR2a0VDBF=+<(BP-^1apM*gB5!~_C*mUYr+dn>W;Ucl`-5zkrQ
za#g^UPGnG@eoTjE))v1ezQ*uF91aj0J$|_>A`*iF(d9j@fSdQxda#LJkg#73TI|a?
z{)`Y@0ppFz0;KvTZPo=5oM$ND0ymz8RR$rL6|fRtCtxcUAcU1~`^@N|UBwhI#t}JV
zess%D*nZ;ReG=ycoZw;dOR^M75#5Y{r9cw4fX@!P&+QdD0b?%e)}$I{ZL;>@Iwu88
z?_W_^StLb>I2^KF60j|{PQXp<y3QuAZGGlA=k~XB0_I%Gb28qnjeQ3**a8M2!q6o`
z065rU6aRpHLcr2z9bj)mN=$FBv9A;ARRKG>DDm<0tq5mr!7QiWLcC5MVDlCt2wo%v
w9Nd}^aByq>qkx0p-xC53ZcPX{xG`V*UlUGXjAf!^MgRZ+07*qoM6N<$f@|rKP5=M^

literal 0
HcmV?d00001

diff --git a/main.py b/main.py
index 344faea..5cef138 100644
--- a/main.py
+++ b/main.py
@@ -25,29 +25,35 @@ def main():
     X, y = separate_features_and_target(df_clean)
     
     # split into train and test data
-    X_train, X_test, y_train, y_test = train_test_split(
-        X, y, test_size=0.2, stratify=y, random_state=0
-    )
+    accuracy_scores = []
+    # repeat training on 1000 different stratified splits (seeded by i) and average the accuracy scores
+    for i in range(1000):
+        X_train, X_test, y_train, y_test = train_test_split(
+            X, y, test_size=0.2, stratify=y, random_state=i
+        )
     
-    # pre training processing
-    X_train_normalized, X_test_normalized = normalize_features(X_train, X_test)
-    
-    # training
-    model = train_logistic_regression(X_train_normalized, y_train)
-    
-    # prediction
-    y_pred = predict_target(model, X_test_normalized)
+        # pre training processing
+        X_train_normalized, X_test_normalized = normalize_features(X_train, X_test)
+        
+        # training
+        model = train_logistic_regression(X_train_normalized, y_train)
+        
+        # prediction
+        y_pred = predict_target(model, X_test_normalized)
 
-    # evaluation
-    le = get_label_encoder(df_clean)
-    # draw_feature_importance(model, X)
-    # draw_confusion_matrix(y_test, y_pred, le)
-    # draw_classification_report(y_test, y_pred, le)
-    evaluate_accuracy(y_test, y_pred)
+        # evaluation
+        le = get_label_encoder(df_clean)
+        # draw_feature_importance(model, X)
+        # draw_confusion_matrix(y_test, y_pred, le)
+        # draw_classification_report(y_test, y_pred, le)
+        accuracy = get_accuracy(y_test, y_pred)
+        accuracy_scores.append(accuracy)
+    print(f"Average Accuracy: {np.mean(accuracy_scores):.4f}")
+    print(f"Samples: {len(accuracy_scores)}")
 
-def evaluate_accuracy(y_test, y_pred):
-    acc = accuracy_score(y_test, y_pred)
-    print(f"Model Accuracy: {acc:.4f}")
+def get_accuracy(y_test, y_pred):
+    """Return the accuracy_score of the predictions y_pred against the labels y_test."""
+    return accuracy_score(y_test, y_pred)
 
 def get_label_encoder(df):
     le = LabelEncoder()
diff --git a/readme.md b/readme.md
index e28c2a6..180c48a 100644
--- a/readme.md
+++ b/readme.md
@@ -46,7 +46,7 @@ No missing values or duplicate rows were found in the dataset. Outliers in numer
 
 ![Missing Values](images/missing_values.png)
 ![Duplicate Entries](images/duplicate_entries.png)
-![Duplicate Entries](images/removed_outliers.png)
+![Removed Outliers](images/removed_outliers.png)
 
 ---
 
@@ -60,4 +60,12 @@ To improve model performance and reduce redundancy, I performed feature engineer
 
 ## Modeling
 
-This model was made using **logistic regression**, it works well in this situation because it models the probability of each class based on the input features, making it effective for categorical outcomes. After experimenting with different hyperparameter settings, including various solvers and iteration limits, I found that removing them entirely did not noticeably change the model's performance, indicating that the default configuration worked well enough for this purpose.
\ No newline at end of file
+This model was built using **logistic regression**, which works well in this situation because it models the probability of each class based on the input features, making it effective for categorical outcomes. After experimenting with different hyperparameter settings, including various solvers and iteration limits, I found that removing the custom settings entirely did not noticeably change the model's performance, indicating that the default configuration worked well enough for this purpose.
+
+---
+
+## Results
+
+![Accuracy](images/accuracy.png)
+![Classification Report](images/classification_report.png)
+![Confusion Matrix](images/confusion_matrix.png)
\ No newline at end of file