From 5c15f6204b7571547de7a5b39290bc77470ef868 Mon Sep 17 00:00:00 2001 From: Drew Giffin Date: Wed, 22 Oct 2025 08:57:57 -0400 Subject: [PATCH] The average accuracy is calculated after 1000 trainings --- images/accuracy.png | Bin 0 -> 1390 bytes main.py | 46 +++++++++++++++++++++++++------------------- readme.md | 12 ++++++++++-- 3 files changed, 36 insertions(+), 22 deletions(-) create mode 100644 images/accuracy.png diff --git a/images/accuracy.png b/images/accuracy.png new file mode 100644 index 0000000000000000000000000000000000000000..ef0b6b52c3e4f4ecd6bc03e1f872df156c5eb212 GIT binary patch literal 1390 zcmV-!1(EuRP)Px#1ZP1_K>z@;j|==^1poj532;bRa{vGi!vFvd!vV){sAK>D1p`S$K~#8N?V8(_ z;~)%%w~T2l<+u>H;3`tU^yEcC0wE-9lbt>BL=WFNlL&PEA-DPS_4Ty}!CeUf2e&2! z9Ne0*fFJAr{Qlm*pX>gx?04t4*EhvuU!SQb&=+7^j5QX^zVRMp+ZVQRpNVae`+#vMmuODgEOuMd08o%!wc&9H1qEJd_gJkRyftm(<3m%^)lz$W^;Y)Pcz zF^RhubpqFJ`f)+9yAp}PRr@)p7T#I6a{|3Uw_)vz9wOPx&dy2A8(9PAA>bj zzm46rRe!t=>!f_{FfCQM@5;ee_rjq~O|)1w2!` zAFrQ7+l#uUeQto+2gq9;6oXtJtsC8{P2TW#sx8&ehtrBRC1Bb75{R1E+MwZVgLNta z(G)CFCL%?os4-;0fE@5RUU|;IuWxh+j%`B$U#@)=Z&br1+&dCr7Sjd4MezRmrVkva z)L`@}fpt1Fbp+fG100t|%I3r0Iw1|&8y^~LuNYz}64z0K4&rq&3Zwe;fsu@C#{#}y z`zT%l_t6|cJVFRl6ak8ZC1VU|ghzgKz1WW<-^O#-1#C4GC{F6Iz_}@G1AmBlP|~J{ zFY`DX&t*`bCB(`#+c^P`Y9GY|svZ3M>rR4wQMV`G%5t5?+b3p&HFY;L)UKX0W{P_+ z3D`Z4oetbonh)8kPt z2-cEjVGcO^KHneeXYr+K0ysYzkDikU2rnI4lWWl873)NP`Z0j}&H5Yx+s)R+c+H_E zfQ{fi{t&rH-a}uVkZ&wX_yejRKc2^#`r3Tj^{$PF3#5iGss=BwQSH55`^j^#eP0zY z(Dt+QYY7bw9+t4cIVs1L1S_iT*r5AfMZd_ZKjp>{h_} zMjE`Veh~QTbm@luUA(`56a8C=5O3*;ekuPjZXw_x@CX41wAPp^AF12# zU9-r!PqVgQ9$UbNzjUeFgog}+-&V4v{Y(AXG4BDN@@DV9&$R`7;b!Z6QoPQmM_Nzy zU@lU?MqhZ%tgXG>gR2a0VDBF=+<(BP-^1apM*gB5!~_C*mUYr+dn>W;Ucl`-5zkrQ za#g^UPGnG@eoTjE))v1ezQ*uF91aj0J$|_>A`*iF(d9j@fSdQxda#LJkg#73TI|a? z{)`Y@0ppFz0;KvTZPo=5oM$ND0ymz8RR$rL6|fRtCtxcUAcU1~`^@N|UBwhI#t}JV zess%D*nZ;ReG=ycoZw;dOR^M75#5Y{r9cw4fX@!P&+QdD0b?%e)}$I{ZL;>@Iwu88 z?_W_^StLb>I2^KF60j|{PQXpDDm<0tq5mr!7QiWLcC5MVDlCt2wo%v w9Nd}^aByq>qkx0p-xC53ZcPX{xG`V*UlUGXjAf!^MgRZ+07*qoM6N<$f@|rKP5=M^ literal 0 HcmV?d00001 diff --git a/main.py b/main.py index 344faea..5cef138 100644 --- a/main.py +++ b/main.py @@ -25,29 +25,35 @@ def main(): X, y = separate_features_and_target(df_clean) # split into train and test data - X_train, X_test, y_train, y_test = train_test_split( - X, y, test_size=0.2, stratify=y, random_state=0 - ) + accuracy_scores = [] + # run training many times using different splits to get an average accuracy score + for i in range(1000): + X_train, X_test, y_train, y_test = train_test_split( + X, y, test_size=0.2, stratify=y, random_state=i + ) - # pre training processing - X_train_normalized, X_test_normalized = normalize_features(X_train, X_test) - - # training - model = train_logistic_regression(X_train_normalized, y_train) - - # prediction - y_pred = predict_target(model, X_test_normalized) + # pre training processing + X_train_normalized, X_test_normalized = normalize_features(X_train, X_test) + + # training + model = train_logistic_regression(X_train_normalized, y_train) + + # prediction + y_pred = predict_target(model, X_test_normalized) - # evaluation - le = get_label_encoder(df_clean) - # draw_feature_importance(model, X) - # draw_confusion_matrix(y_test, y_pred, le) - # draw_classification_report(y_test, y_pred, le) - evaluate_accuracy(y_test, y_pred) + # evaluation + le = get_label_encoder(df_clean) + # draw_feature_importance(model, X) + # draw_confusion_matrix(y_test, y_pred, le) + # draw_classification_report(y_test, y_pred, le) + accuracy = get_accuracy(y_test, y_pred) + accuracy_scores.append(accuracy) + print(f"Average Accuracy: {np.mean(accuracy_scores):.4f}") + print(f"Samples: {len(accuracy_scores)}") -def evaluate_accuracy(y_test, y_pred): - acc = accuracy_score(y_test, y_pred) - print(f"Model Accuracy: {acc:.4f}") +def get_accuracy(y_test, y_pred): + accuracy = accuracy_score(y_test, y_pred) + return accuracy def get_label_encoder(df): le = LabelEncoder() diff --git a/readme.md b/readme.md index e28c2a6..180c48a 100644 --- a/readme.md +++ b/readme.md @@ -46,7 +46,7 @@ No missing values or duplicate rows were found in the dataset. Outliers in numer ![Missing Values](images/missing_values.png) ![Duplicate Entries](images/duplicate_entries.png) -![Duplicate Entries](images/removed_outliers.png) +![Removed Outliers](images/removed_outliers.png) --- @@ -60,4 +60,12 @@ To improve model performance and reduce redundancy, I performed feature engineer ## Modeling -This model was made using **logistic regression**, it works well in this situation because it models the probability of each class based on the input features, making it effective for categorical outcomes. After experimenting with different hyperparameter settings, including various solvers and iteration limits, I found that removing them entirely did not noticeably change the model's performance, indicating that the default configuration worked well enough for this purpose. \ No newline at end of file +This model was made using **logistic regression**, it works well in this situation because it models the probability of each class based on the input features, making it effective for categorical outcomes. After experimenting with different hyperparameter settings, including various solvers and iteration limits, I found that removing them entirely did not noticeably change the model's performance, indicating that the default configuration worked well enough for this purpose. + +--- + +## Results + +![Accuracy](images/accuracy.png) +![Classification Report](images/classification_report.png) +![Confusion Matrix](images/confusion_matrix.png) \ No newline at end of file