From 5e375d1e6d01d283c245877ab74d5076c65732ad Mon Sep 17 00:00:00 2001 From: Drew Giffin Date: Mon, 20 Oct 2025 17:43:41 -0400 Subject: [PATCH] Added data preproccessing section to readme --- images/duplicate_entries.png | Bin 0 -> 3061 bytes images/missing_values.png | Bin 0 -> 18359 bytes images/removed_outliers.png | Bin 0 -> 3336 bytes main.py | 27 ++++++++++++++++++--------- readme.md | 14 ++++++++++++-- 5 files changed, 30 insertions(+), 11 deletions(-) create mode 100644 images/duplicate_entries.png create mode 100644 images/missing_values.png create mode 100644 images/removed_outliers.png diff --git a/images/duplicate_entries.png b/images/duplicate_entries.png new file mode 100644 index 0000000000000000000000000000000000000000..bc39f54bdf43fcd0ee9df4e31255503465aa1034 GIT binary patch literal 3061 zcmb`Jc{JPG7RP_udr>vC_1dZ!ZmUWuFg(|RRj^?<^KQvcyF!u#~Jos>+J8|XRou)=hZ_a9WD-j4gdhS^mOl= z0sxCYb8pJd%A7x{|5?jiSo}?OG=Yi%fi z#{#HD{jbiMsv|c;@^<@?K)bxV>JE%@tM8ewURzrm&z2i~*S^{I`go~0*U)WvgT`3U zzyYasZg78fagbktwA#$AYi5y=KJTQmZ8=WO#6mg)+U8@E5r^_tse*S7EU z9xJ6xsIIsu9BQ4*+;2p5u5-b+Y(b1Ny8Mkk$tw$QyuaF|31G<5Pj!5!T1pBp?-KY! 
zx(|wa)|#rp)oKTfsFes;;+kw2iariK-8#7&Pa||W)fSrNnRS5J&A!t2DW~UP{Cs!; z*EpHT#Z%?J*oZr;T=EGxVKnRtUZ9-=esTt%hl}PdempJOR?)HK)YFBQiapk*8i;1OqzaMq!T1O{(pRC4bdHtkcwfiF3}8jN{obzBB=&7um^)jk zSocI_*`&8fS7YuLS{K8^7;M-A$pPEZAbCfzMz>R0(OE7oYn)}VW{cV7vlj$B#n2G602N98ne*KzCiMN zf&P04qA}FN|Nc8NK#VTr6!rN2yY>3(?9!af~d|Y`qG_#U0WMla`U=%lpgo z1|2;*ia$0p{Ai&`4p5Dc%U07l#B=-Ky6=v=)4CV#OmNk{^w7|H*ZYU$l1Q09O(^$Z z^cW4_MmUx8ZqSggR#C3Pd-E9XKYAq-M~{NAn5w{uAF`ILpM5n=v$BFW^xOg#=Yqb5 zxdaN99L2nIfE}YJrVuI02WSyd=(PZ%8wNzuuQkcq)OA`PEe0D*35%M?Oig>~KnVT( zB+r`n9M2x0ls>6_^pR3grhiCw4o*KOII3dr#GS!fK_Sec%WG~7Z5vJsfCCciFrBzb z-J7if9>hlOklGOzt6TBo{#^BphhL_-HplJ-#!VE>PBaw_%c+;98C9iru;t4>;ttTN z#Ogo!mr%Kupc+=ND{Z)J!M=KTsn_Cm?2pgMEe5YMo^SqKprhSlqCUrsnw8k+uC%X< zT_jt{+uir+?g+ZGNuj?{7z`x17AmT^sxmti4&!?(m{&nKnLt*ia#Iq7A%Qw?K*|!* zx3>g4l1g90$pV*JQ~i_t4zPXY=w)ttVNY!GmSW~A&nv!+swUJSu3x6!{>x3mhXw^< z&nJorm@GHpb{%3}q_m-UHBZ!^&XH^vU8wWW=Cm*NGkY&#agZ>BluTz;%p_Y1qjgBM+AvVu%Ez{!WZhyK6 zhgg9kO5Ic|0#z{Jz6Oo+VaBL3&Adw_buwQp#<*r5EZ?GOr?OmK6pbV#v3blPBRZ(iPe@POm^1K6NPv!uVMDdwoYG)f=~-yqy(y1o#4hTmzf|qf@^mu3 zes+>H9AtMC-JHeP(?+#bC+FDJFPWB%9N`-nwBM@Z8aI4GaX(w(1=^b-N<=w=<{H&0pr$~ejN|;x+^R|P$L2es45SQzLk}%9^eCCV0j|3(il&pot z6KMJ2{dJ-ctJeIx{d---o8hh~@DQ}|7x9_{Q!bi#68?sUpUFE=>wcZVHM#PA*8#WZ^Wfb2~A2~q}=>AyK^*BVfsI?TN*`g}fBp}Q5)DL@q zy;B_=IQ1sI&D~a49=i@R^?QvlO`W6vHXo)UcJ!^zZQDs8tBx`!lO-G-A+~^Ao@udp zcD{DVgXj@v`A^}fjYH1r*hZBwFD_U8c=<{WUv^4V1a?9U{3+e2Ris?@vYeM-V8gsT z?D#Lm?-GSYE3}L}o*G3*MyoJ*^`}kaC|53Qno7#8o58n43{Uc#gM9r))P7uCvHWZV z%W7SYJ%-)z@REYjXhUpvG-T*VMfQP!`RBa}eX(k8fp^xhMCY78qVmp(w*E zDvsbfay0FFLZJ9ZrNi3cV_0>0mS>Sj{n9^O#qU7ej$*QI?Lyy6zq|UhW$M>VsM|-C za{2L$)4@x=o#_gFxh_3wWyX)3cbC~I!5$Ntf<2&ZeMF3`P}!k7a#$s~RV|4u4L zUtF*m1a z84db2#(ek--&bq)agBS)jY;hOygWqJj-kah%#Y@ub4{_$eP#)VHtyv9>#kyo3dE+^ zI@&-`?18-MFedRjhQXTy!*M5x#XP zl{nXS_Zn=$044m~G)iXGvYA3an!6ZzIETTYi~yeAu|T{RW1QU`v)^iKNez-7B(C{5 zO%M3N2LvmQ61X4{#W=abl2)lP=xn`wp^@0$?Dm?H(Y3g3J#W;1;1K_ z&CH!VX8{1~XfqUfo@TJ-e4IC|GTe59a=bbN01JX~@ifvcB~2_z^1* 
z^^8w|v|fV8b{(~F(-Vfj@2UYnOaFpQD%O=>v<-5?Dt2nG!8p&@KBO}94R&=zk%g;T z(aD{(3@P7}!=jQDjlJ^#Fv7`65fPn79f%jdO9jh|h^qdEx@%UlE zO(}0;{3^462-bL-Y?=~hM@_ZbDfP1HBjXP9bOFR5^M;!_xha)enl9>cU>j?4vqmvm zUm6c7)LK<$rd+uQKdCJ&%kwOiYw}7a(g^owWcKuFE1lMC0Fb$Gfz&scy+ykEBoOCb zW?;NeWlCXY@OieI;t%{K`X7SjngT1lSb(UNhInSUaDo5SO_u+w^Ed>y2gYh3fyY=7 Rlhgq~Puu8zg{EWlzX7Zt1-k$M literal 0 HcmV?d00001 diff --git a/images/missing_values.png b/images/missing_values.png new file mode 100644 index 0000000000000000000000000000000000000000..505228f76995ebe6d6ffeb583da6a8d4f3213993 GIT binary patch literal 18359 zcmbuncT`i+x;+|21Vozjj-d1=ASEjuKoekrcNSUk@uTdT9BS zj_O9p!!5Vx=2sU92nC3o3;Jb#wj8jqi4j@Ke(PmFoCpqBAd^LI9$GnWS!W{Nx1-R=YsN_=o4dwi}N$9XNehdYHadS zF4AiBL3VudfUggpNqCZ_R5MXG!q;hLmyi%xG$B(W0 z@f-8FpWH?Vkp5{-qeo`yK9K0}HdZ!VodR1oYj{$hItbtDGW0)CkF$JOuRnBA z99%v9wn=aMxj>A^wEdd;2PX17LeSAoTcSgoyB&P_9MW(wS1sK*ZWakDp#9(p8~DHm zrS%CB&Vhybczje%7d2R$wo=BsybUhSlO>5cif{chTh0QQn^5T-=FE_1Z*Vr6Mrk_9 z8=2wNa>>dmjXAI4bVrt35yKPEb4SXj`WO02-;5~l?DkK&Wr7zftIe>Hhs0s&vbxmL zUpvxJ+Q|~gULJPvug>YuTdGrcFI>xq7;>?gwBVQxmX$p+0dr> z^?Ej;VA1~W{hib0R!rf@T@6~sD-MO#?+$*#CsZP*f9Y*H+CQ5&n`B%okJIK`J*5=$ zK))3q{FqfOd=2GQnxM1NxCQ;@=2VjzD!1i5S5a|j&}&SgJ4;PjL5Wr41#`#-*?I_c z7#(7v%e8p>H#H8gg95XWrU>J9Ugb%>TlYBIGsB0wHC+Yo2@fI+I9GMU1pgiX2W%zhpXGSIGb*)>FSk@AJQ-^hnBF4;v zh43GDV}2@ebW3wsF7a+FuSs)+j5XkXXpCK)X|E2pKBc=mh2*_?kL&^uDS$`N*Obon z?FcD^n*suWFUtYa`PQN*6^bH+>xP8g5;M=65s2cwivj+;8CAt9wX|>pA_>Zuu1#_h zH#PZk9v#YwzEf~`)$s7ON}C%uB;+CM zF#)l*qCx=?JX@fz_Mkwm4l@K-Op_#8sB;dKHI@K(kT%6-aK`PFB4%oK2fb(b&~E5m zD=iyd&0Z5pG$YXyO0ScYQRjlaE|o%s;`(AcVt9jcT$j0*-IJ+>`^sAQxo_5-2t}rk zZ{LMuxr~7Ymm|Of`Et6DLol~yhVRo7TZj?n_83&@bbl!P=EKcl&Zy-T1MX7V8KYoN z5fk)sBge*w=ZMX;_He%ybbq}pH$U~nA!GPsjo#r#Ag9*UoXupkxAf4==h>E3V_#l_ z+=3GBsOj-uFmfQ&uUok_vqASTr5o(!W={Y%48#ZRd}vZYbh5LVJ_5%IhO(IE*w#O} z$Hi0FoAG|-59l5rqJ|HIT<7m+3Cv>;t{r%O#XA&6&*iywWnMKUp{69q!mD(HUclHt_}v4CJJJBj=nCQC^i!m1jAGIT4)F>f44n1= z4|%41lu?xCQdZbA)@$xnwCt?mm<>D%ASS}Ybfaw{s@+b5-wO8Bw0o&Y4w`GgBPdMz zZR-ui 
z0x#xLX+S%eQqotCcr-80%J=w~^~2q15JZRJ^dsUNHm^RFS=M~n^;~og@4k&Dhxmj- z$)rOz{2tgF+gSoc`P})L`{K)Cl z>+0=~5e5nq)}@i(Br;mbLK%_HeF!MZ$))VDAZO89^1@Qafgqx=aOmPBMcZQY^eRa1 zmVH!zU2k09@=SSZ;SimNG|?e;f+4vG*&J6`N-P>AlGCitchmC;?V118+sI5Y0lv6c z%aKXJFRmJO7qrM`uV!Qsb05Tp4e9} zGk|dHzH^-9DN>dT4LG54bi26+>zHF2c*?9KREQYXZ;i765@vx_lAmX`iU&1zJP2j( zmN7b(^xH?WRay(t*JI~JJycXAeQdmW?7sf8Q_)R0J9lYltf+c99{Ocv=G;sLA?>E2 zUV1#cVpI-}R}xEgd{vC6#sUwaP@oYQcuc{!Y(+y4)>8|I?41(58TG}T{?JIQ{GWwIki*A zSX;l;GcJr-=3-A#FLH1-P2$?5!rItZ84OM0H>21m1`(EJ55$)_AM*}3sK*S#76WQJ=00XQY}(oWv+XfE*e=DBllYY zFYa2~)w}383J;FZpINTVW}>iGfn99z_$@UPfd$*%d?H<<@$ z>jQzR(k{>HDC-+)!yb*vwBomdD!_zVLu&i2LFZf$tyMOq@#f zIfk8l(q_5`^%mBm;XW*P$B*;4#{b%>^29^5tUNro>;uwy^5dzYq5^#EtBwR%*7-Jt z215GTvt^>NZjA5yTP+zna&XxP_`laYPNGxRDfE%ftFJdtiNmx<1hnMdZ%h2S{7J{h zNxJUq_^EZ<2j7XsGYf9OA6)PJg?>lE%8QP+6priJa0`Z+-9`JOJ(K-a5mJ2HstO$j z9gcf^t8t+-F+-jq~W2>u;U5sT&7!_OpXU*-NJ`!eFE6?drn@0UtaQ}@%| z>(7zs$aU*{w5qs1*k*>P9Ud`!^ux@UqarHJuh}(A;Q1r1K)wT{z%<*1V($xz1TAt% z?cVzX-%#PW(#%R4w@LZ$z-uY+JB8`|d-`31Skcx+=q+qnKu(Aq&CLG6!?)0s*l5p= z!c@j_ix=0tKhx(W!Sq|T4T5dwWo}zjZ_sfYgu)zVqS*`WJKD+ywgkEt;FCqx>VG6x@ZC2NRj89wY_+fw8a2o`|(6*t(7-Y9{jMc2-5}?HG?Zfgm4$6}PzM1-?D~3!O(GI_S+-WR^y}9oX-2ZuDlp)@no&_qp z%R#^I*Lg9y+UGKJlpAhDq&e{8CpD=gk;Fney>l}py+J1cCv!l1%e6?tXISTWsmHEx zC8joHbkj!KG&KF~xTYWP1lu0hdhjschIE+mkk-l!=8Ay-cIQOP%F3^edfvWemOl&2 z&|r|zn%)dcsvxmcw+l*uZnuY4=iHNI7bTJDmH526_d2w8s%jYDyrC)~YnYO9P)=1^ z&2LZjXP^pyRK5G7v;{qLmS@Hc&+_W6Qa|t7uErzWT&W?0-U0Muv~wdOL|wEkRb{{= z_ZI8Q&sGdq1Y{UOL^oY2v1$9WmHVYH3)&leDxe!RUN^cnqJp;S2m|tMF%bz9&z7P92|k8GU`)8pb>miIPz< zXi`(OPpc~wFi!cwvh1||8%}Rt7F$7@qMTy8VP-Wo)ZWz0$ET*JlcnsAG2USq3Btd` zJfrYr+ZsMPh6?y15pu>j%7~*C8*RT8blXQ_E~RF2uD*B8p^@OA&&bm!5ZahS1cx|w zRQ;B8tcb8t-{!_l^yV;nD^r|w)y#(&F@y24&)t?zsF`d9tf+fL{U8H=7k($EtZ z_Rz+>LevM7ZM-Mb;DwVI)>^mh-j~vbE^ACn)sJ)(Bd}gO!zLre&JjeJvg2=m1wWG= zot6>e3}xgeCp?_9HbFSny%=(3NwNF~&Qb07U`^!|zVelTefnd4%x~9{bZW|d`dloW z2<{MyKXViRIWv3h@P&t(`AMPcxaKGL!uy_dVeb>RPbR;*Ce=)>4Hgc<|A9YBj(l}* 
z0RZH6vvqTU+F1Hf;V6Eyl!WDLorcvl{1++(T#0s?X>ti23!NU{Z5k>k<#bVXR_**4 z+kK`E-SrQ+mIHl`mHjD-h`=+`{3~8(8)aT>X!D3&Z!0h9aBr#CEjM~0Tnpv)tiz=G zz8gQ5(@xht1~nk4-?B)J6&5hN&^G;>GqXIBfvXG0b{>Lr50MHOx=pa%7R z`Zw||y_!`Ky*734u}hp)JX~bYD1^#B-Eqm!Fs&?EdG5Nl$+RC&>aHl=?{NhgLRZuD zy0gX;ok7t&@g>*CUmBO4>l;xd(H3#AF@==3$gLgq%&FMByot?Oxw_+eeo;y6j<6bkG7f6A;gJF8niKZ}}G{J_fz{arg`V}g)~cqcMkHLPUVzzMK+ zjrILl+qj-@Tk#` z{%mTjYgcqJgJnVD1W9B-_L*~2-B)b{?)z*1UgA*&I)tVY9~|9WpDQRzQl#XyjQW}g z4lrkS2XNi8-S6N!s|RSS8#%rYxv^*1owUPR2`Pt#?$LLrTYN1FG)tDN%*?LQUz;hU zH3Dt^39@2em(G4C#m+ohw#tFV)jA&g{#MYA1B+k6w%z&-nu+ z`a{{Kh+L@jqVtM5cl0giDl%nAL-uP-_wuxQzgQm4(uX<8{ITPc^^Z}eKB?#G**G29 zpH*M6DK5{wB67=sDaEozfyR=-hU>}&1p;^2dq)d$fc>uw=?b-t(a}|b-?)^*2lCAj z`CR6g$HMM1PE)lkdCB<22?7O@UBN|x`0KJs&!m(J%q%QV)FWo&)=s0LhgXfBgzyzd zWq$JfHVtQm%MMAqV-MwqU-1n2wu^oqoHHOW&95amvHKnE?bBZpIvqF7($ar&GL>#| z$<-Y3MriospT$=)9&BkO+g$aY@mH39ipMn5(^%s;D6Fbc!*}hoEb)D|C5S7ImY6-z zc~Gn`ju3u>=9}@wMU=^sE{-#o*T{yG7JQ0gyiJIQs?Qc?m(kK?Y*tOswh(y{MU6$F z6jF%1+`RL%UsRp|UwD`QQZGoDp_i?xqWOcNGpYMqy{CiCO$2>OM< zzg-%;)sVkULyJ2RZRNRsW73PI)||b^-OOTPIxJo2vr%ggwJ7JDc@0It9Eql`qn*|V zwc5e-mz#R(Etu2i{bc)CE*}F9oW&!tcDL2c(cCYz9O&(m5JAelN5$m%D9CK0@th&q8G!PuHMVqA>+ed)YQ{zxlS+ANw@fDj?6w!K*KBqmzezQQoYp^3_Q- z0*jAKdCC#!dZ1!xz07^2naGv}EKmUbImaNh{R|*l#)D478!tJcwCO7GG_F>*lB@FH5N z;LEo+6YZ?ma(AU%ELCqt-5z@xUZQ+ahHe7&Ev9ImC?}7i4Ne;Q)4L#uIR!|-B>Udj z`Ah(hOq0G3vg1_%7@k5$@ z43w1Nxa5f!$K&xQyCkIRO9Ztw`>!H7rf*_HYQ9+CjeA4#da>*%la&HnzRTRrwEAlb z4DCzr6ZI?jTK5E~T# z8r&6DfG7ckksCg>6>VrQ{T;VU^<`VVQ5Lq40=@%%c}&r+;tIc0aKfk@G?(=f9c+HS?TH3NlGl(Y{nQv7)GvuXpn$HqG4~M(fBHJ z*INzPUr)y^35v!HYj5m`ce8ALI5||jf!vDZL_e3_z@BA}?pKQBoyjCZ9c_lqp?^Re zD3Hv~DX*`@Q70aTHEzlrCB2126$T^&770NL2= z!rw55cuP~gombeMK6y|f&jN?ucOl?jaSN{^wPARNYMo6ZK%RNfRY>&JEtn9{$T$)3 zwdh&9Y54DhS4c~pyNrfC%1~Quc?*X<|AcZEBT>6DAPcYvNSZi$CHsC4)GaCd)iph22n^$-RTd`oGc zSo+_RF2h!cVD!iQ6%Rwxr3{aakCW=2Gnk=$$I)POoi=qK3}757_ilujKqZ%zg>9(mltjz5mg zDCXacF86x$CQR7l+N&%maql9(%IA_tcYGqNDn^{b4#UQ4?e8=kyO`M(U|OIRrwD4R 
ziJ;*YYpA{SB?&`?Nk0CbO~L5Q^+~Hy!@&BVT-Cq8*o$7r+f`RUzTzBdU9#_=Z$?^& zr}P;iq|Vty%lb6a-?7>SL)G9972suHk8_D&ou-Wq@hdW^@YF~qrEkL{ zICq?(o7xu%vP02ED`<$OB;Q;j>gGSvH_sL2mhJ7^8F>%XN5`5NitViejKJ? zz`>Rxwr;2#^q6UT;M4Pup(po#!a|>-Ma$bAZEwI>JS$x85ydjq{(A~*6Fn7V%=}~T z1}1r^#_r}KZjGQ_X8i{cX|$2d*^Lm_nJB)wEmmqs%)srBip1*;7ng(Uck!WUAj8^U zpZ{bLv}d`InQSnM?tI3=xaps%zVk%*Uo=)H{@A~vu|!=R35L9iSS;_N!(+SuNLDj@ ziyX%j!DH1AG>m(AVyoB~K!JCA&D4rBa&(2-YW(04W|eC)dI)?G?);VTO(649k{?1J zTXCR`=2<#$yqJT9x-Y*TAqR}SRBq(dFg6eT<*`1<>0u^Fm&d<8%O`~InsDSVf3$$eTa^u&?%+s9PbM<<4fi*|N2xmQ!f;=+s( z50=ArritNc5ls>!tb>h_d(nt6fj-7PK-i4!sXww4LQlQ4FTL$lQ5KI<(x7Yz@f27|&vY~`sJI3B`kity?2q9l~9k{Pu!^7a7%s9)vceLJi7mzGD=PBJTMq(H#? z%>{nFJmfnuO?NM+pL>}8WV}C zp^VM+hm6PX{aBa#?Rxe*jf?OQ^O~*V&`W3Fx2o%j@xoGOWC&EcHU!us>>rfk*HD#? zejYC&hfL77Y>l~MX9!2chhTfnV7Q}DqLMT`?xPqeYe?Vw@r53J*i_?s=GZd>mO{Jj1EMg|?20SZ!u(2KQ`ER~``#GH3?|qXb8@IS&-;%$W1{CPEYNl}xIhfUJ z5f8nWi4W`+$r8LDe~-OVipfdQalc2Ti|UE9f*MpDhHpLUbcs~_dAWldCTIv3TBstZMkurWf`sGAvMIp#$i*EN6B z2OZkjRlU_@fbjSORJv%<_50Hg;)l1IY_@X!@?m=deO=!<`mcbr!iBUm7oV+P9h@8I zvj66|jG&Dmzm8*)0JUDB7P?oRN(Qo^cz|$ZL(ivtp^aa{t)M_4dE(H<>qruRp^av7 zlzcTi6X+72I68%IWel zG4bAv?Z@20pEFea|*5uJnI^OUue22Xt8HyFKKfMxus+c zn)YXs-MbEH34o^MduoVd&hvHwO8U)h_NdjLA4Z$wm}hPzRqSqw&FlzjH@m>_Dk7F) zSv+XBx~pf?(G^Zz2Dj7zxsJBxw7`q+seW>Co{Y^l!(93JJ5taNUa1UtCXsXOFLTvl zglIi8#EOs6k=L@gx%FFaFVgM+5};pXNk+NQz>woLs=u&jdUKO;aJs{iUYGk)yjxWc z((Y)vCjgp{DJ9LkVs&Bq5AJG{6U}H#H65M!Evzm)zOQRK9!}44(fj}*PkbBQPP*vL z`gCf7elGoYVdlv*<>{~9YAf6+%+1|U(1Qj+9?X*a4HnHh#uQ_KOqW+t5 zcaaYC6Xl92pNu>!^4bi+oJQ;xGBdGL^u=rM6oC6dzT_k2savrC*wOwku=6%hi1sgl zX$g?iO1XBp!erE7D|hA!Mt!VmUsHC~cQ<{v?-J^MDqf)N`w?n9UQ6B(&pgrr(QTe7 zQ#6g5nASF8jym4nZOmexjjDC?-J0{Ws%u={YGzbyXbdiP4k=uw=$wm=B$_haJ?F>N zRcv@6O_k;PN(wh1%@=(YHRB^YQXgLm6XY#z^B9}U-GXHQJU9dom59oBcBB!o*m_ZN zedWpddYY)qY`BMDjwUR&X_SkkQ1~Ah2S#B%;4{80s%PT7kN@yf z%_yI{fi}Js34032t!F4)sPR)yJ1JgMX7fR!(GRF}D6(qP z!+)#N_0*ZNDm5npR;c~pm{d+)tKU^qjQz zCRvznCZBeGBvFR{mK!}ScHnlNy1SD*MKYu`7%~k>W8iW4@4DQwXY>Dob8_)P{#?kY 
z0;gD5q}Y(1l9zk_m+v)yG`jKL72ZpYZj(XSvI*O~)9#okjhoGw_RC@`d|2cE48h9b zk18v-`@dlu&bI3I1OOxb=#*{DwF6YlC2FMXMEMc`alT(doSlNd5NDS+E)QCGL?lPrQwKSI$ zQ9D0C5nXiw9gT8N^A){{)NHmGK9FVhuF1ze!_cT6*17X&Qc}u8lmgf|fgpGuETW8M zShjDq>H{1#P36FAlaU0=V|&s#JRNKKR zKy;6qLSj&pr__;z7U`lM2e&ZiQz{~DpYNqPK>DeixyZxx@pW?ZfBkge=Op|PooLgV zX|lRw?lN|GrF?Ga0wuD>coPB9iZ>1RRhZY{n4l;=okj9r8hYaWi!*Q-`p2nD!& z7)ydB+ZSh17}?K7%eS=_U=R1Sw20*bJJspGqOO^ilsE-3YsU zY54b=Tx>0TQG8D-Rf+kB$OP%c^cR|Ix|NzK`W2$n`|IcyA3*Kf;@~C;JaPKmjWbWu zE^LO)KGrZ^!oZWeShz4W%AvUd$#<^Rg;$JD8CQn79DOn$%^Mw|l33s}C#b(~r?!3A z(B;(4TP`~%qH55-$2IbDPS!0AcWDMX)$eYTI*r8xfDW|CagwogxbK-Z4LrTJp9Yd- z9$U=i5e?7$pLC7voSD%a58B%wyv;}!yzA4|R%E*I55fin3d4R{+&0>7Uf7VK8#S80 zgFX@dRQfVHqDLJs?uQ0XLv&p7#0jyI@3E2q^PKHHBCOCnQX;%C#d&OGyzeP#o!#b* zcF7qw{ubeHW!V*flr;aw%rDz?lJL2zOY9@)(M&cnc&kX}sx)9E$|1EShI7G#w`A0m zGvPZgZ?0B$YakpK_ZqxUXq<{|L$KQErYw>P1SM!eyZ;sCwlC=! zK$K$y?eIO_y~PoA3|RCo{XiDB6$gZ=(p&_f*DQd^wJmVpEPVrZo`VP4xxt2Z2?C%^ z$2G(M0Bwe@VEl>MpZ5McWFs2%|DBWh7h%IudV%XV)(VcBHyt7XjD9=zP1=agpOiSy zK5qh_`k0qW-_}-Q>RKKp42m9}ncdptHuqyGba%F68xIK<3czWHJ%P|s+(*B7Kr$$oR&NO54d)uuLe`}-^! 
zQ*eD=AyqS>iBT(ts7LB8YYj$-lm*kFZY)er>)LH}`DOn~L~wl?)2DQuQQ(z;_v4>N z*S1j-b3XIJZakO|NK>6L(U+G_CC8gbEG^v%X+yNwq2>K9Ps-&19ABiTad+g4Io~qr zRt+&WbSnYv$w7T|UC4g&pthj&9VPeGECS6-O^@|b)2qh?KO5iQsv_8C0oEj~>_>OX z5J}_$Id)~o+lmjp=!HnS2xw_!LY_IW{hiyAwzQ-odoo9^g{TY}|2yY)wC5=z#-R9* zm8n%3KrPopJ`x{BHjTzq=WKltE@uDyAluPsSAaa-%?qL3?V~F3)O#%}!R_X{gu{NBdPU);#=>fdN!-n!Mb5t!3QRV^;(O|y zF54lYd*bpYR8gA_vpQpYGj-jaEwz<;IEnihWWq*-d9sF*+SGb#R>V{(zyc&T3yecF7fj zN=Do`TyN_eXIc@nc^Y8cT_Hn`1@uQVrL+6{cLSovjSFI#W}SUZHW_n8;oks&mS9`I zj;0>AfQ%NA@_V5#)xr_|<`UBY5rv5{jaIx4BQLJVSNkU0vT-U+iUWg zLFJ^3G1qW4Adx5`Y~+5~Sv!`@j+M)&P%yU1Dn92AvJOXK*8cKHw5_A6N(`PEjbFTF zhrX&-*z);Qo?XA#UmEsc6Cug@v@ZBmI-lNhvfVny50c2X!e*A$NH(Ud&L0oi!@BJ+ zOt8_LNgjezbz^z#^`_}PCCBQ2)@@~uR(;DIsD*4z(MUzU&;5LRz$?kF`O!jp)(ZS3 zM?;CrAMB^ZRgEZ$ox7Yq>48SW1L6jIvqx{K?(3^OZ*CY15*S4K5qv50)@nRA3FT6* zW!$E+pDl^pTUEES6b;Zs&^O#B@1|>lw`?>Wv1EGI75)oJgg z0ZWRfbJQ(t_Zt~9S;0r)sD3J}%%<7gi(47T+XqyeTj`rS?$=qsKeJ|rzW|F2P+lxw z^tNB3vVkuhp*^AuyzVr|SC<2Bt$(mjk7$%PKA^Uv*c|tggLr%wE?^oR?Pa@fFRaMbVKOKv?o}fc z=h&FlTM}xlnF*}88+vYw=Lg);D4A&0k6_`Ph+;=_mwZ;or)1oCXa zT+YeQK1h!RS!+u1j~t+Xmbi`71U0g@-Kjj*J^73G`rA9eX}sk;!UsMbzs^Am+!} zN3o!RZA8+F(bg1TKZk{o#Dv@V+`lI5a4`STv~Rc-{5l>=oZmdH3$1ivCk+i?x4D!Z z3E|Jz)O_SboyYm6_fFjsAOCaW zX$+(2pjb#VxYq0Ep;`p7?oGL{TQ;YEEau%RHx;H@+A?6ohaKB|VOPGlj@Fdpk*syu zb_uTfDSNUug5LoXo{bzFqt4m%@BS=JRHhUsvIuy! 
zA{)KIt(`aDLgV)?*n4()+KtW{s3Ubn$Era7-au&cFdKKBP}7BKf$;L)3ON289= zBd2p8^s@vvq&7%8Rqt^((75~(}6UNgEZVIKuGSvU7 z;G8HONcx8|`Cr&~x>&Hn0TmS}M4$18-Ph<_W;SdRI)`C#$TrtN?AN*jzL0P%s=XM; z_+CTRYh^0cchuu@JgqA5_p+zY$IaU_D_8?k>zPWwKm*Km?l#yw`P8brg1Q);>?kSU#eva3+#-g%C#)Fk9aZ@$+W1r){b&YCEI>y@!KK5Mr|TqxWtXDdjh})X zFj##vv~Rd#iwWn4F*|nU>l!G0F3)AMQ2fzH3M{m+qPTW>iCAva^_y~DpSwQh7MlV>=$)A;=tNqIqZ`Xf~%>8bMfx+@v$-0q{tRfM3xMyAW<=4k4A zQqrWWSRc(@@yCL#J0Q=2K8vp+UltV44$(7GZJf=hcTYH!zj(p%i*qY=gBck1HSH6` zf*VS}Y>6UsiqK7p>|7rZX5-Qvh3jwjBL{tM|7BN*pj{k2T27&J89+50@HIf)Sj)=m zDk@VG}*Q3J@`53w(k8j`o|bG?12ezxATh7E;+e zO~ZUBtp{XT_(=+sFYssTmI04DC=k9gP^0O8mD64O%rsW>o>n(d5Z7d-`q<2~BP{@{ zx6Gpyq-9*MSC9#f^7_?HTw1vLsxj%(RuD*AIjj5bGM1QGyKg9^Hi{Tu)`x<2TuFdl}DhQ znwY)PF|KqD1eaG|NhoM9{aSz$?8E1UCe|r-kUj?rRa+9eXn~{9m>QwaWDx7|dfWdRlcHbfe!tAaBHLC(;gx0r|{aEiL!o$z! zhY7>|rbsmJ+&`@kMAfu*vT8B~wx`@pWDO@rJ^;c8x<8 zhLoE6{3nAmZh{^Hm*j&#fyjOykdg8&Hb^DuJj7=gZu*?DxYw00uj4DV$? zaRc5p?aK-kDY^>A7j}gw*`!HMHC)QBm#V!GpxXB|?JHUNB|bF1%jJ<#MQwI1xl1f~ z?o@*5Z9|?bQUOu6F;53?we;M5jcTelrytdp#_FI!nhF$_W9U3K48yMUMps6G9qco zgZ}cT(9>K9Uo!W>iB8)-tl{!OX}$7Km*N}Etw4Q;skj^Pu4VG4I)(i-@!$Ng+-OQ- zo(ds1=Xf&aeQn>1!!v}3A2Bah0WoYZFW4h!Us^>4Spfspm@{6RR@09*eQ1`)YA%rW+nqr|q>?b($gqyTyN>?XTk0#vK**U8`N4$loUW1HJs%IIdGHz+lh z=-0-B>qXxRdlGKp%%~R84UkK-kw3wU_2lV38Ib@zj;lWwf0g3_H6C}x@*fnSZop5U zIrB5k1$#e684fI@v*Vv2^1R-Bh7J{nWEuN7p**)$O*p>7b!iS|tZI zF*(1ZR1_g#ctj@YN*c;KbKzxT{Y3dIsUgSH)Ra;Sz0LDvFqaIN?(o0n|qF!9)>6%C+I6-p*hnOr&Dv{$+V3zM>QC*HyB~xk%5T+-qqcMB0+>)_WneHFxF-Jczf5+`vjF^3iKukY%d+; zZRuY6v>44y__!HJnI0}XE@O6l0pOV6Ip-c!x9t5&JZ~70m;8_nfUIaUJ1o9-{BvT-3T=^1@O#gm7?K{9dqGa~eUGn4QP}i1;4tsO0 zQpa$gYM-K@hc8b8Zli(HfNf2M%{R3_jV;By_n@MuTghru`3ae!o?PX><@okAC3)DlyXbK{fO62}im!X7z&`k>A9me7e)|8#ci;&^}UGm74jxu#8sbHe_KeKMZ{ zA3HcY^6;^zh(mPNb(klrNmu13jO7L_NhfbW7Pj&6V|pqF%va8wMa{rG<@@9F1F^g) zlL@{{Ph|c)KiYDZ;gCH)0mba>j4YLsKrCZGn*_MVxvrI{_H-~ziJ>pe_WXau-L$(snaPf%6>d)C(prkd0>MccqOqcQ}Y8vu`zbe5e=reWC)fC^mTCg5{nlCS@9 z=h0_ys*LMBv@;b`>C}It4>$s>BNU#_?q|Jo?U3b*^cy{fz2HD|c(7$@*|j_uK0EHI 
zKjbY~ZDP)l?pCc7wrF>Oph9%_ zeWW5+h$XB5Z4hf2X`3a8u1&49veKUBWJBLrE3z0;x?Z|ZSiNl4RK|_+7Md^nEY_56sO(B1 z1-3;Q^vZvdM@|nGhm3vfybT17Lf2amsSy#n^YPmbO39W~klNXU=It*hJ@LlAcjO^p z($kA}$L*+zUWqqy()vS<`urNww2pmt*9+5S@1 z^Y2R=cGEZwm;K~rvHoX2(q(qE=a7oo^hLDh#VjN@l@%jqi{zzV4aeV#4z*Ssw%Xz1 zTNYT4QW+uBKpPd5NeQd0JV226kXxsDA)WVq!O}iOLtpbyHhVgSe{N3j7pPmT0kyL! zlPdg6N=Fvi6eWlTn{?>ng4u%ivjnvRjC(mv ze_7z7Q377Y*S6#uREMY_;o~^VeVRK`N!}}ADyv$Cy!w}wVu3vUx3Dmd+`hilQ*DLU zY|nuk(-v$L3*A<-mLf}In7oq*n1O)Gu*1Ks!SPB&kmlv&K&>f5)DRXWGXllh-~r&* zK_Ul4l>Z1}N6o(F&?peEvfgrHX!qOyI{p7YDe$j+@n@F*`%FXEvU&k^QQ)$m%jX5U P06kXHR4jjJ5&ZuE29NPR literal 0 HcmV?d00001 diff --git a/images/removed_outliers.png b/images/removed_outliers.png new file mode 100644 index 0000000000000000000000000000000000000000..4db63854e53ecea0bf67c3d59627b9168d481edb GIT binary patch literal 3336 zcmb7HX*d*Y7oJIytRYz%DoVT|WRhYejD4($FZ(iS42D6L33+YNNcO>uY%>YNWJy^Q zVr*rZYOF)ZATkWuK7HT&^ZWDtc%Ji|>pIVM?)$mV^<3v9T9`xmPDq>p004YOhI&>2 zz!8hX^ZDb)4)n;s`9^xTY#ur-=f8N%&=&4&p=wCAbGqCC#oltYxWs!R<$Xcf6;c^a!PNFt z>4~5fdFRg;%_OWonwo^LTgY2l?}$j8m#SL6?9gfpnXC*vOalNgZUO$}1LzzDiu|#F zlSls`1pxE}b^a3}_`d=gUG@?3nMwAmX8=<_IqUf19s?F(474x=0%6PtpCE9_g1tvR z4$TCgaQ^waVIn9e{-c0@4Pe)9%MKY%f*p0t1`ZpI9my|vIL4Eje=S;dN>PT(;`MiP z*-&`}a%5$^=j!j4=)Lb3dlpZ6u2QWPeCu;d1=&!U7!7!}DD7RwzcO@*ZSWlO=K1b) zytYGA63tFC$;Y~e7-Zv9OXBTSH*6JD`$Z~Hj` zBdzZ{T3%a}gH$dYWRspGeF^R{o|8mXu5+Us-r|kZx9xrA`h(O${V--Ztbyztu+}ek z_C0j?nQyATl`*(V+JUr~&IQe=Wk>oXJS=ndqWOz;8#hl!cQldcE39|txDp%!o3&{pCGoj*oy{>Zy=d zX+fYem~eIYMq$=lf8!Z+v?9mzW^avvoj2unnx~YUHUzwv|9De;)%cdv$E47(pXL${%f`mI~uld2}dkJ+OO7Xu^jn`_pK~ zBGD%{Z8ic1ieAJ%dpxih)jD@KgQiDwO=$>vX|4c@eGtICG`aDXJ*ruLyOJ}qC2qY= z6mB`}sRe~TNSO;)rqm@DKLXRSxLd$<^(Pa62dN2^N~TgJvo-XHjeQzlK*b8#Q?{C2 z+vU)cba#i1NJNP`yJ6&Su&-`3p((stSNa=`*}>eWC5oDm<~Fnga$7oj-y)RDrq0#L{ErTLJ7lu0W& zTA6Rke&jNbW`j=eX}NZ1m?z{iJ=0W=vU?9H)DMb-6!ktm#_?Q~UUl0HV;>@kU2oF`pn`@dK@ji`$PYAa|A3V2lTrE&Zt^aV4s)A1#-X(!r)W~(}P&LBp-CtBLf*?q%$ZD1U~ 
z8iD$1un-ESfOoCKKI_~TqT6P3=)z~|D=ZvmtyHNZIx)4)Cam?-r^F=|XAB2P&=!U9 z?d{Q{sXns=I%^(g>V{c{#M6IUg9~|~ z0j8WjgH?PftW6Nos+x0eumvIBs z@sjeLpatIe4O&W+pNg`Yy1RSUjV)#KCoTBTkKTUR=D<1y9u|CUtdfdqwxPgX&EGp1 z#uT7T;oUKErlszQ!QeM?fTRYZxOz7BSzD*Bv%4naTt;X5x+Vj;pCXIyW0b=qcYm-( zqj*r@fpahFyHMGfZaU1kqstueN}EC#dm}UU0w@9-wK)_rFw9lMb295t$U+Q|CHq^l zOK{jSXg4!quS`yED(z?xT3J;~;!y^4;dVxw3;j-)%=be!^i7Cys#Jbkj#n-Y@QN9} zAO=cjb2s_KfT6Y4`2P~w;=nr%y`cR=#6aNTe~6Gw9?q4F8{qjX-rITZsYZ5Qe6P;F z3Ytde{WxxykdIBUOu$>Bb4@4u0##)=An!}+<=&GB=?}zUMmZED`@C@XN@3vJGZ*CA zGt@`UIx-e{+vRtf52eujo`7i@XtYn#bY&^<=dOXe!Xkx?a3OT}O(EvZ`XAN~I_7C@ z0D7kX?54MUi(d<-T6BKpIX@q^&yt$G|6+^zW4TMH2zIln^mYY2Tk-BC)va==RdkK3 zF=0W?S19xba;s5qL?Pm6pVoEIN#fld(}DhnPev}L)vM`wIJBYo z%ybXV?^1P8+;lp|Ij)E+KOnjJ&W&Dzb*)S|dtbrN1*Ecffc=Rn&9ZeG4E0^)DTuxp z<62iS^>I0k;tO|Pp-rSn_v>3vzRHPX`}<%WI#~i8&IeLE-U*-N{=Dm`+L)-sKM;q1 zc@w^Ly#c1rl41lSMHiQUjR@yqtHLcZvi4_9)M~eDv{w{qeb~^~v)?bI{I#%L4a5-y zy|=*3MjxVJm(xI*)_l0-RL{2seq>qQxqj5&|Db*{tsE8i)$K3yoNn*%@za(y&UroB zRG8ZiRkB0tx^^s6fB`qYuPSz*E_J5isS-klIze9b$(^j7SI^559x)G9?w^E1RQ98Y zG&E@=*^@^^JmSt_AxLFro~l%z$gJF zs(sQ^EUZB6uPD_z8on-+6j24T#S^XsUQwrgJ0_$xD97D7e54 z+C#d(SQYsyFJ=O!pP%2&q$q2r6xR~l9@)#F2qsTkH{W8M#5EBp`Q_JdC>An_yBZVe z7onreR-R_-$#Vz#rIE8kYU!d~sNrm|f^mq98SL0|mmH`PHl}Bl6}5k)3Tx_0=#a9` zrGbtYZJq7kSFMwp|7jfAE&gz&!XB%V4Ihr&!6vT@6*DEFzQWxO(^y6c5{Kk<=u%>e z?+Yt8W(oJ1Z|{dm>`Z^Hdt64lN7@)fM$5%`vs~^zBUcUhr8L!8L!#VxZM21SFqrF~ z2nYI3?`)ffLMbxC!N;ckd}jL^R%uK{HsVAuo*i1 z^1GxxN^vmGi;fv;zE2#4+Eu5ch6e&CLJSn#+fp~?A_6_mZWQ-+*kjaBF1n}We5)FH zMV^aAZOlkqCML&`xLBuKDNQYtD-NUGN?U2C&kP$x*tI}LO0iuTOH7)k=44(jAqv$q z3h7S8;|)f}?2qD_QM2Iua>VceYPN)=iER<7#}iauwDS~*tOaZKI!8`CI`*aR??T$` zg3;TrIh%R4$y{eZAYWpOH@Co_Mt^FO^Z#l6|L1JLFxizpD&oU= lower_bound) & (df_clean[col] <= upper_bound)] + + mask &= col_data.between(lower_bound, upper_bound) + + df_clean = df_clean[mask] + + print(f"Removed {len(df) - len(df_clean)} outliers across {len(numeric_cols)} numeric columns.") return df_clean diff --git a/readme.md b/readme.md index f5abb2f..0eabd9a 
100644 --- a/readme.md +++ b/readme.md @@ -24,7 +24,7 @@ The target variable is the **stress level**, indicated as *low*, *moderate* or * - Students who study more are more likely to have a higher GPA and more stress. - Physical activity has a negative correlation with other activities, one being study and therefore stress. - Students who sleep more were less likely to be very stressed. -- Some outliers were observed and will be need to be removed before training for more accurrate results. +- Some outliers were observed and will need to be removed before training for more accurate results. **Figures:** ![Feature Distributions Historgram](images/feature_distributions_histogram.png) @@ -35,4 +35,14 @@ The target variable is the **stress level**, indicated as *low*, *moderate* or * ![Sleep Boxplot](images/boxplots_extracurricular_hours_per_day.png) ![Sleep Boxplot](images/boxplots_physical_hours_per_day.png) ![Sleep Boxplot](images/boxplots_social_hours_per_day.png) -![Sleep Boxplot](images/boxplots_gpa.png) \ No newline at end of file +![Sleep Boxplot](images/boxplots_gpa.png) + +--- + +## Data Preprocessing + +No missing values or duplicate rows were found in the dataset. Outliers in numeric features were identified using the interquartile range (IQR) method and removed before training. This helps reduce the impact of extreme values and can improve model performance. + +![Missing Values](images/missing_values.png) +![Duplicate Entries](images/duplicate_entries.png) +![Removed Outliers](images/removed_outliers.png) \ No newline at end of file