presentation.tex (13435B)
1 \documentclass[10pt]{beamer} 2 \usepackage{booktabs} 3 4 \usepackage{animate} 5 \usepackage{float} 6 \usecolortheme[snowy]{owl} 7 \usepackage{listings} 8 \usepackage{graphicx} % Load graphicx package for image handling 9 \usepackage{natbib} 10 11 \usepackage{algorithm} 12 \usepackage{algpseudocode} 13 14 \title{CART-ELC: Oblique Decision Tree \\ Induction via Exhaustive Search} 15 \author{Andrew D. Laack} 16 \institute{University of Wisconsin-Superior} 17 \date{May 13, 2025} 18 19 \usepackage{xcolor} % Ensure xcolor is loaded 20 21 \usepackage{float} 22 \usepackage{pgfplots} 23 \pgfplotsset{compat=1.17} % or another compatible version for arXiv 24 25 \usepackage{ifthen} 26 \usepackage{multirow} 27 \usepackage{booktabs} 28 29 \newboolean{isFinal} 30 \setboolean{isFinal}{false} 31 32 \setbeamercolor{section in toc}{fg=white} % Section titles in white 33 \setbeamercolor{subsection in toc}{fg=lightgray} 34 35 \renewcommand{\thealgorithm}{\arabic{algorithm}:} 36 37 \renewcommand{\alglinenumber}[1]{\tiny\color{white}\@#1.} 38 39 \usepackage{url} 40 \usepackage{hyperref} % Hides ugly link borders 41 42 43 \begin{document} 44 \section{} 45 46 47 \begin{frame} 48 \titlepage 49 50 \footnotesize 51 Source Available: 52 53 \texttt{https://github.com/andrewlaack/cart-elc} 54 55 \end{frame} 56 57 \begin{frame} 58 \frametitle{Problem} 59 How can we determine if someone has diabetes based on their BMI? 60 \end{frame} 61 62 \begin{frame} 63 \frametitle{Visualization} 64 \begin{figure}[h] 65 \centering 66 \includegraphics[scale=.4]{images/dia1.pdf} 67 \caption{Diabetes Dataset \citep{diabetes} Graph} 68 \end{figure} 69 \end{frame} 70 71 \begin{frame} 72 \frametitle{Solution} 73 \begin{figure}[h] 74 \centering 75 \includegraphics[scale=.4]{images/dia2.pdf} 76 \caption{Diabetes Dataset Graph w/ Boundary} 77 \end{figure} 78 \end{frame} 79 80 \begin{frame} 81 \frametitle{Tree} 82 \begin{figure}[h] 83 \centering 84 \includegraphics[scale=.3]{images/tree1.pdf} 85 \caption{Tree Representation} 86 \end{figure} 87 \end{frame} 88 89 90 \begin{frame} 91 \frametitle{But...} 92 62.2\% Accuracy 93 \begin{figure}[h] 94 \centering 95 \includegraphics[scale=.4]{images/dia3.pdf} 96 \caption{Full Diabetes Dataset Graph w/ Boundary} 97 \end{figure} 98 \end{frame} 99 100 101 102 103 % not density, mass 104 105 \begin{frame} 106 \frametitle{Extra Feature} 107 \begin{figure}[h] 108 \centering 109 \includegraphics[scale=.4]{images/dia4.pdf} 110 \caption{Diabetes Dataset w/ Glucose (a bit complicated)} 111 \end{figure} 112 \end{frame} 113 114 \begin{frame} 115 \frametitle{Pseudocode} 116 117 \begin{algorithm}[H] 118 \tiny 119 \caption{CART Algorithm \citep{breiman}} 120 \begin{algorithmic}[1] 121 \Function{fit}{samples, labels, featureCount} 122 \If{homogeneous(labels)} 123 \State \Return Node(majorityClass(labels)) 124 \EndIf 125 \State bestSplit, bestSplittingScore $\gets$ None, worstSplittingScore() 126 \For{sample in samples} 127 \For{feature in range(0,featureCount)} 128 \State currentSplit $\gets$ (feature, sample[feature]) 129 \State currentSplittingScore $\gets$ evaluateSplit(currentSplit, samples) 130 \If{ isBetterThan(currentSplittingScore, bestSplittingScore)} 131 \State bestSplittingScore, bestSplit $\gets$ currentSplittingScore, currentSplit 132 \EndIf 133 \EndFor 134 \EndFor 135 \State left, right $\gets$ splitDataByBestSplit(samples, labels, bestSplit) 136 \If{left is empty or right is empty} 137 \State \Return Node(majorityClass(labels)) 138 \EndIf 139 \State leftSubtree $\gets$ fit(left.samples, left.labels, featureCount) 140 \State rightSubtree $\gets$ fit(right.samples, right.labels, featureCount) 141 \State tree $\gets$ Node(bestSplit) 142 \State tree.left, tree.right $\gets$ leftSubtree, rightSubtree 143 \State \Return tree 144 \EndFunction 145 \end{algorithmic} 146 \end{algorithm} 147 148 \end{frame} 149 150 \begin{frame} 151 \frametitle{Quantification of Goodness (Splitting Criterion)} 152 153 \begin{equation*} 154 G = p_L \left(1 - \sum_{j\in C} p_{Lj}^2 \right) + p_R \left(1 - \sum_{j \in C} p_{Rj}^2 \right) 155 \label{gini_equ} 156 \end{equation*} 157 158 \end{frame} 159 160 161 % DEPTH: 1 - 0.7356770833333334 162 % DEPTH: 2 - 0.7721354166666666 163 % DEPTH: 3 - 0.7734375 164 % DEPTH: 4 - 0.78515625 165 166 \begin{frame} 167 \frametitle{Boundary (Depth = 1)} 168 \begin{figure}[h] 169 \centering 170 \includegraphics[scale=.35]{images/images/diabetes_tree_bmi_glucose_depth_1.png} 171 \caption{73.6\% Accuracy} 172 \end{figure} 173 \end{frame} 174 175 \begin{frame} 176 \frametitle{Boundary (Depth = 2)} 177 \begin{figure}[h] 178 \centering 179 \includegraphics[scale=.35]{images/images/diabetes_tree_bmi_glucose_depth_2.png} 180 \caption{77.2\% Accuracy} 181 \end{figure} 182 \end{frame} 183 184 185 \begin{frame} 186 \frametitle{Boundary (Depth = 3)} 187 \begin{figure}[h] 188 \centering 189 \includegraphics[scale=.35]{images/images/diabetes_tree_bmi_glucose_depth_3.png} 190 \caption{77.3\% Accuracy} 191 \end{figure} 192 \end{frame} 193 194 \begin{frame} 195 \frametitle{Boundary (Depth = 4)} 196 \begin{figure}[h] 197 \centering 198 \includegraphics[scale=.35]{images/images/diabetes_tree_bmi_glucose_depth_4.png} 199 \caption{78.5\% Accuracy} 200 \end{figure} 201 \end{frame} 202 203 \begin{frame} 204 \frametitle{Tree (Depth = 4)} 205 \begin{figure}[h] 206 \centering 207 \includegraphics[scale=.225]{images/tree2.pdf} 208 \caption{Tree Representation} 209 \end{figure} 210 \end{frame} 211 212 \begin{frame} 213 \frametitle{New Problem} 214 \begin{figure}[h] 215 \centering 216 \includegraphics[scale=.25]{correlation/original_graph.png} 217 \caption{Cancer Diagnosis (Synthetic)} 218 \end{figure} 219 \end{frame} 220 221 \begin{frame} 222 \frametitle{Boundary (Depth = 1)} 223 \begin{figure}[h] 224 \centering 225 \includegraphics[scale=.25]{correlation/images/decision_tree_depth_1.png} 226 \caption{Cancer Diagnosis (Synthetic)} 227 \end{figure} 228 \end{frame} 229 230 \begin{frame} 231 \frametitle{Boundary (Depth = 5)} 232 \begin{figure}[h] 233 \centering 234 \includegraphics[scale=.25]{correlation/images/decision_tree_depth_5.png} 235 \caption{Cancer Diagnosis (Synthetic)} 236 \end{figure} 237 \end{frame} 238 239 240 \begin{frame} 241 \frametitle{Boundary (Depth = 10)} 242 \begin{figure}[h] 243 \centering 244 \includegraphics[scale=.25]{correlation/images/decision_tree_depth_10.png} 245 \caption{Cancer Diagnosis (Synthetic)} 246 \end{figure} 247 \end{frame} 248 249 \begin{frame} 250 \frametitle{Boundary (Depth = 20)} 251 \begin{figure}[h] 252 \centering 253 \includegraphics[scale=.25]{correlation/images/decision_tree_depth_20.png} 254 \caption{Cancer Diagnosis (Synthetic)} 255 \end{figure} 256 \end{frame} 257 258 \begin{frame} 259 \frametitle{Tree (Depth = 20)} 260 \begin{figure}[h] 261 \centering 262 \includegraphics[scale=.04]{correlation/tree3.pdf} 263 \caption{Tree Visualization} 264 \end{figure} 265 \end{frame} 266 267 \begin{frame} 268 \frametitle{Solution} 269 \begin{figure}[h] 270 \centering 271 \includegraphics[scale=.25]{correlation/oblique.png} 272 \caption{Oblique Split} 273 \end{figure} 274 \end{frame} 275 276 \begin{frame} 277 \frametitle{How?} 278 \begin{enumerate} 279 \item CART-LC \citep{breiman} 280 \item OC1 \citep{oc1} 281 \item HHCART \citep{hhcart} 282 \item CART-ELC \citep{laack} 283 \end{enumerate} 284 \end{frame} 285 286 \begin{frame} 287 \frametitle{Pseudocode} 288 289 \begin{algorithm}[H] 290 \tiny 291 \caption{CART-ELC} 292 \begin{algorithmic}[1] 293 \Function{fit}{samples, labels, $m$, $r$} 294 \If{homogeneous(labels)} 295 \State \Return Node(majorityClass(labels)) 296 \EndIf 297 \State bestSplit, bestSplittingScore $\gets$ None, worstSplittingScore() 298 \For{selectedSamples in combinations(samples, $r$)} 299 \For{selectedFeatures in combinations($m$, $r$)} 300 \State vectorsToPassThrough $\gets$ featureSubset(selectedSamples, selectedFeatures) 301 \State currentSplit $\gets$ findHyperplanePassingThrough(vectorsToPassThrough) 302 \State currentSplittingScore $\gets$ evaluateSplit(currentSplit, samples) 303 \If{ isBetterThan(currentSplittingScore, bestSplittingScore)} 304 \State bestSplittingScore, bestSplit $\gets$ currentSplittingScore, currentSplit 305 \EndIf 306 \EndFor 307 \EndFor 308 \State left, right $\gets$ splitDataByBestSplit(samples, labels, bestSplit) 309 \If{left is empty or right is empty} 310 \State \Return Node(majorityClass(labels)) 311 \EndIf 312 \State leftSubtree $\gets$ fit(left.samples, left.labels, $m$, $r$) 313 \State rightSubtree $\gets$ fit(right.samples, right.labels, $m$, $r$) 314 \State tree $\gets$ Node(bestSplit) 315 \State tree.left, tree.right $\gets$ leftSubtree, rightSubtree 316 \State \Return tree 317 \EndFunction 318 \end{algorithmic} 319 \end{algorithm} 320 \end{frame} 321 322 \begin{frame} 323 \frametitle{Asymptotic Time Complexity per Split (CART-ELC)} 324 \[ 325 \Theta\left( \binom{n}{r} \cdot \binom{m}{r} \cdot r(r^2 + n) \right) 326 \] 327 328 \end{frame} 329 330 \begin{frame} 331 \frametitle{Operations for a Single Split (CART-ELC)} 332 333 \begin{table}[h] 334 \centering 335 \footnotesize 336 \caption{Operations for single split assuming multiplicative factor of one, no additive constants, and $r = m$.} 337 \begin{tabular}{lcccccc} 338 \addlinespace 339 \toprule 340 \multirow{2}{*}{r} & \multicolumn{6}{c}{n} \\ 341 \cmidrule(lr){2-7} 342 & 100 & 500 & 1000 & 5000 & 10000 & 20000 \\ 343 \midrule 344 1 & 1.01e+04 & 2.50e+05 & 1.00e+06 & 2.50e+07 & 1.00e+08 & 4.00e+08 \\ 345 2 & 1.03e+06 & 1.26e+08 & 1.00e+09 & 1.25e+11 & 1.00e+12 & 8.00e+12 \\ 346 3 & 5.29e+07 & 3.16e+10 & 5.03e+11 & 3.13e+14 & 5.00e+15 & 8.00e+16 \\ 347 4 & 1.82e+09 & 5.31e+12 & 1.68e+14 & 5.22e+17 & 1.67e+19 & 5.34e+20 \\ 348 5 & 4.71e+10 & 6.70e+14 & 4.23e+16 & 6.53e+20 & 4.17e+22 & 2.67e+24 \\ 349 6 & 9.73e+11 & 6.77e+16 & 8.50e+18 & 6.54e+23 & 8.35e+25 & 1.07e+28 \\ 350 7 & 1.67e+13 & 5.71e+18 & 1.43e+21 & 5.46e+26 & 1.39e+29 & 3.56e+31 \\ 351 8 & 2.44e+14 & 4.13e+20 & 2.05e+23 & 3.90e+29 & 1.99e+32 & 1.02e+35 \\ 352 9 & 3.10e+15 & 2.62e+22 & 2.59e+25 & 2.44e+32 & 2.49e+35 & 2.55e+38 \\ 353 10 & 3.46e+16 & 1.47e+24 & 2.90e+27 & 1.36e+35 & 2.77e+38 & 5.66e+41 \\ 354 \bottomrule 355 \end{tabular} 356 \label{fig:operations} 357 \end{table} 358 \end{frame} 359 360 \begin{frame} 361 \frametitle{Empirical Comparison} 362 \tiny 363 364 \begin{table}[h] 365 \centering 366 \caption{Accuracy and tree size comparison across decision tree induction algorithms.} 367 \begin{tabular}{lcccccc} 368 \addlinespace 369 \toprule 370 \multirow{2}{*}{Algorithm} & \multicolumn{6}{c}{Accuracy} \\ 371 \cmidrule(lr){2-7} 372 & S/G Bright & S/G Dim & Cancer & Iris & Housing & Diabetes \\ 373 \midrule 374 CART-ELC & \textbf{98.9 ± 0.2} & \textbf{95.2 ± 0.5} & 96.3 ± 0.4 & 95.1 ± 0.8 & 83.5 ± 0.7 & \textbf{74.5 ± 1.3} \\ 375 376 HHCART(A) & 98.3 ± 0.5 & 93.7 ± 0.8 & \textbf{96.9 ± 0.3} & \textbf{95.5 ± 1.4} & \textbf{83.9 ± 0.8} & 73.2 ± 1.2 \\ 377 HHCART(D) & 98.1 ± 0.4 & 93.7 ± 0.9 & \textbf{96.9 ± 0.3} & 94.3 ± 1.5 & 82.2 ± 1.4 & 73.2 ± 1.2 \\ 378 OC1 & \textbf{98.9 ± 0.2} & 95.0 ± 0.3 & 96.2 ± 0.3 & 94.7 ± 3.1 & 82.4 ± 0.8 & 74.4 ± 1.0 \\ 379 OC1-AP & 98.1 ± 0.2 & 94.0 ± 0.2 & 94.5 ± 0.5 & 92.7 ± 2.4 & 81.8 ± 1.0 & 73.8 ± 1.0 \\ 380 CART-LC & 98.8 ± 0.2 & 92.8 ± 0.5 & 95.3 ± 0.6 & 93.5 ± 2.9 & 81.4 ± 1.2 & 73.7 ± 1.2 \\ 381 CART-AP & 98.5 ± 0.5 & 94.2 ± 0.7 & 95.0 ± 1.6 & 93.8 ± 3.7 & 82.1 ± 3.5 & 73.9 ± 3.4 \\ 382 C4.5 & 98.5 ± 0.5 & 93.3 ± 0.8 & 95.3 ± 2.0 & 95.1 ± 3.2 & 83.2 ± 3.1 & 71.4 ± 3.3 \\ 383 \midrule 384 \multirow{2}{*}{Algorithm} & \multicolumn{6}{c}{Tree Size} \\ 385 \cmidrule(lr){2-7} 386 & S/G Bright & S/G Dim & Cancer & Iris & Housing & Diabetes \\ 387 \midrule 388 CART-ELC & \textbf{3.7 ± 0.2} & \textbf{9.8 ± 4.2} & \textbf{2.0 ± 0.0} & 4.8 ± 0.1 & \textbf{4.0 ± 0.0} & \textbf{4.0 ± 0.0} \\ 389 HHCART(A) & 6.1 ± 0.3 & 14.6 ± 4.8 & \textbf{2.0 ± 0.0} & \textbf{3.1 ± 0.1} & 7.8 ± 0.2 & \textbf{4.0 ± 0.0} \\ 390 HHCART(D) & 6.3 ± 0.4 & 14.9 ± 5.0 & \textbf{2.0 ± 0.0} & 4.7 ± 0.1 & 23.3 ± 0.8 & \textbf{4.0 ± 0.0} \\ 391 OC1 & 4.3 ± 1.0 & 13.0 ± 8.7 & 2.8 ± 0.9 & \textbf{3.1 ± 0.2} & 6.9 ± 3.2 & 5.4 ± 3.8 \\ 392 OC1-AP & 6.9 ± 2.4 & 29.3 ± 8.8 & 6.4 ± 1.7 & 3.2 ± 0.3 & 8.6 ± 4.5 & 11.4 ± 7.5 \\ 393 CART-LC & 3.9 ± 1.3 & 24.2 ± 8.7 & 3.5 ± 0.9 & 3.2 ± 0.3 & 5.8 ± 3.2 & 8.0 ± 5.2 \\ 394 CART-AP & 13.9 ± 5.7 & 30.4 ± 10.0 & 11.5 ± 7.2 & 4.3 ± 1.6 & 15.1 ± 10 & 11.5 ± 9.1 \\ 395 C4.5 & 14.3 ± 2.2 & 77.9 ± 7.4 & 9.8 ± 2.2 & 4.6 ± 0.8 & 28.2 ± 3.3 & 56.3 ± 7.9 \\ 396 \bottomrule 397 \end{tabular} 398 \label{fig:results} 399 \end{table} 400 \end{frame} 401 402 \begin{frame} 403 \frametitle{Cohen's d} 404 \tiny 405 406 \begin{table}[h] 407 \centering 408 \caption{Cohen's d effect size for accuracies between various models and CART-ELC. Comparisons with $p < 0.05$ are bolded.} 409 \begin{tabular}{lccccccc} 410 \addlinespace 411 \toprule 412 \multirow{1}{*}{Algorithm} & \multirow{1}{*}{S/G Bright} & \multirow{1}{*}{S/G Dim} & \multirow{1}{*}{Cancer} & \multirow{1}{*}{Iris} & \multirow{1}{*}{Housing} & \multirow{1}{*}{Diabetes} \\ 413 \midrule 414 415 HHCART(A) & \textbf{1.576} & \textbf{2.249} & \textbf{-1.697} & -0.351 & -0.532 & \textbf{1.039} \\ 416 HHCART(D) & \textbf{2.530} & \textbf{2.060} & \textbf{-1.697} & 0.666 & \textbf{1.175} & \textbf{1.039} \\ 417 OC1 & 0.000 & 0.485 & 0.283 & 0.177 & \textbf{1.463} & 0.086 \\ 418 CART-LC & 0.500 & \textbf{4.800} & \textbf{1.961} & 0.752 & \textbf{2.138} & 0.639 \\ 419 420 OC1-AP & \textbf{4.000} & \textbf{3.151} & \textbf{3.976} & \textbf{1.342} & \textbf{1.970} & 0.604 \\ 421 CART-AP & \textbf{1.050} & \textbf{1.644} & \textbf{1.115} & 0.486 & 0.555 & 0.233 \\ 422 C4.5 & \textbf{1.050} & \textbf{2.848} & 0.693 & 0.000 & 0.133 & \textbf{1.236} \\ 423 \bottomrule 424 \end{tabular} 425 \label{fig:cohens_d} 426 \end{table} 427 \end{frame} 428 429 \begin{frame} 430 \frametitle{Future Directions} 431 432 \begin{enumerate} 433 \item Smaller Subsets of Candidates 434 \begin{enumerate} 435 \item Active Sampling 436 \item Feature Selection 437 \end{enumerate} 438 439 \item Random Forests \citep{rndfrst} 440 \item Stochastic Gradient Boosting \citep{sgboost} 441 \end{enumerate} 442 443 \end{frame} 444 445 \begin{frame} 446 \frametitle{References} 447 \bibliographystyle{plain} 448 \bibliography{references} 449 \end{frame} 450 451 \end{document}