main.tex (52312B)
1 \documentclass[10pt]{article} % For LaTeX2e 2 \usepackage{changepage} % Put this in your preamble 3 \usepackage{float} 4 \usepackage[preprint]{tmlr} 5 \pdfoutput=1 6 \usepackage{caption} 7 8 \usepackage{pgfplots} 9 \pgfplotsset{compat=1.17} % or another compatible version for arXiv 10 11 \usepackage{graphicx} 12 \usepackage{ifthen} 13 14 \usepackage{multirow} 15 \usepackage{booktabs} 16 17 \newboolean{isFinal} 18 \setboolean{isFinal}{false} 19 20 \usepackage{listings} 21 22 \usepackage{enumitem} 23 \usepackage{algorithm} 24 \usepackage{algpseudocode} 25 26 % If accepted, instead use the following line for the camera-ready submission: 27 %\usepackage[accepted]{tmlr} 28 % To de-anonymize and remove mentions to TMLR (for example for posting to preprint servers), instead use the following: 29 %\usepackage[preprint]{tmlr} 30 31 % Optional math commands from https://github.com/goodfeli/dlbook_notation. 32 \input{math_commands.tex} 33 34 \usepackage{url} 35 36 \usepackage{hyperref} 37 38 \lstdefinestyle{mystyle}{ 39 basicstyle=\ttfamily\footnotesize, % Set the font and size 40 numbers=left, % Add line numbers 41 numberstyle=\tiny, % Style of line numbers 42 stepnumber=1, % Step between line numbers 43 numbersep=5pt, % Distance of numbers from code 44 backgroundcolor=\color{gray!10}, % Light gray background 45 keywordstyle=\color{blue}\bfseries, % Style for keywords 46 commentstyle=\color{green!50!black}, % Style for comments 47 stringstyle=\color{red}, % Style for strings 48 breaklines=true, % Auto line breaking 49 tabsize=4, % Tab width 50 frame=single % Draw a frame around the code 51 } 52 53 \usepackage{natbib} 54 55 \title{CART-ELC: Oblique Decision Tree Induction via Exhaustive Search} 56 57 % Authors must not appear in the submitted version. They should be hidden 58 % as long as the tmlr package is used without the [accepted] or [preprint] options. 59 % Non-anonymous submissions will be rejected without review. 60 61 \author{\name Andrew D. Laack \email andrew@laack.co\\ 62 University of Wisconsin-Superior 63 } 64 65 % The \author macro works with any number of authors. Use \AND 66 % to separate the names and addresses of multiple authors. 67 68 \newcommand{\fix}{\marginpar{FIX}} 69 \newcommand{\new}{\marginpar{NEW}} 70 71 \def\month{MM} % Insert correct month for camera-ready version 72 \def\year{YYYY} % Insert correct year for camera-ready version 73 \def\openreview{\url{https://openreview.net/forum?id=XXXX}} % Insert correct link to OpenReview for camera-ready version 74 75 76 \renewcommand{\thealgorithm}{\arabic{algorithm}:} 77 78 \begin{document} 79 80 \ifthenelse{\boolean{isFinal}}{ 81 82 {\large \textbf{CART-ELC: Oblique Decision Tree Induction via Exhaustive Search}} \\ 83 { {Andrew D. Laack}} 84 85 \section{Prerequisites} 86 87 Terms a broad computer science audience may not be familiar with are defined in this section. 88 89 \begin{enumerate} 90 \item \textbf{Machine Learning:} 91 A field that focuses on algorithms that learn from patterns in data. 92 \item \textbf{Sample:} 93 A single data point, often the result of an experiment. Mathematically, samples are vectors. 94 \item \textbf{Feature:} 95 A coordinate of a sample, or an attribute of all samples. 96 \item \textbf{Label:} 97 Label and classification are used interchangeably to describe the value we are trying to predict about a sample. 98 \item \textbf{Model:} 99 A model is a trained implementation of a machine learning algorithm that has learned patterns from data. 100 \item \textbf{Decision Tree:} 101 A machine learning algorithm that uses a binary tree for classification (discrete labels) or regression (continuous labels) where tree traversals are done based on feature values for the sample being evaluated. Additionally, decision trees generally use axis-aligned splitting boundaries. Pseudocode for a decision tree algorithm is found in \autoref{decision_tree}. 102 \item \textbf{Splitting Criterion:} 103 A function to quantify how good a candidate split is for a decision tree (see \autoref{splitting_criteria}). 104 \item \textbf{Hyperplane:} 105 A mathematical object that has $n-1$ dimensions in $n$ dimensional space. 106 \item \textbf{Oblique Decision Tree:} 107 A decision tree that uses hyperplanes as splitting boundaries that are not constrained to axis-alignment. 108 \item \textbf{Accuracy:} 109 The relative frequency of correct predictions made by a model on a given dataset. 110 \item \textbf{P-value:} 111 The probability of obtaining a result at least as strong as the one observed, assuming the null hypothesis is true. 112 \item \textbf{Cohen's d:} 113 A measure of effect size quantifying the difference in standard deviations between two means. 114 \end{enumerate} 115 116 \newpage 117 118 119 \subsection{Decision Tree Pseudocode}\label{decision_tree} 120 121 \begin{algorithm} 122 \caption{CART Algorithm} 123 \begin{algorithmic}[1] 124 \Function{fit}{samples, labels, featureCount} 125 \If{homogeneous(labels)} 126 \State \Return Node(majorityClass(labels)) 127 \EndIf 128 \State bestSplit, bestSplittingScore $\gets$ None, worstSplittingScore() 129 \For{sample in samples} 130 \For{feature in range(0,featureCount)} 131 \State currentSplit $\gets$ (feature, sample[feature]) 132 \State currentSplittingScore $\gets$ evaluateSplit(currentSplit, samples) 133 \If{ isBetterThan(currentSplittingScore, bestSplittingScore)} 134 \State bestSplittingScore, bestSplit $\gets$ currentSplittingScore, currentSplit 135 \EndIf 136 \EndFor 137 \EndFor 138 \State left, right $\gets$ splitDataByBestSplit(samples, labels, bestSplit) 139 \If{left is empty or right is empty} 140 \State \Return Node(majorityClass(labels)) 141 \EndIf 142 \State leftSubtree $\gets$ fit(left.samples, left.labels, featureCount) 143 \State rightSubtree $\gets$ fit(right.samples, right.labels, featureCount) 144 \State tree $\gets$ Node(bestSplit) 145 \State tree.left, tree.right $\gets$ leftSubtree, rightSubtree 146 \State \Return tree 147 \EndFunction 148 \end{algorithmic} 149 \end{algorithm} 150 151 \vfill 152 153 \begin{center} 154 \textbf{\large This concludes the Prerequisites section.} 155 \end{center} 156 157 \newpage 158 }{} 159 160 161 \maketitle 162 163 %\vspace{-.75cm} 164 %\vspace{.5cm} 165 166 167 \begin{abstract} 168 169 Oblique decision trees have attracted attention due to their potential for improved classification performance over traditional axis-aligned decision trees. However, methods that rely on exhaustive search to find oblique splits face computational challenges. As a result, they have not been widely explored. We introduce a novel algorithm, Classification and Regression Tree - Exhaustive Linear Combinations (CART-ELC), for inducing oblique decision trees that performs an exhaustive search on a restricted set of hyperplanes. We then investigate the algorithm's computational complexity and its predictive capabilities. Our results demonstrate that CART-ELC consistently achieves competitive performance on small datasets, often yielding statistically significant improvements in classification accuracy relative to existing decision tree induction algorithms, while frequently producing shallower, simpler, and thus more interpretable trees. 170 171 172 173 \end{abstract} 174 175 % 176 %\makeatletter 177 %\@ifpackagewith{tmlr}{preprint}{ 178 % 179 %\noindent\textbf{Keywords:} Decision trees, Oblique decision trees, Supervised learning, Classification algorithms 180 %}{}% 181 %\makeatother 182 183 184 185 186 \section{Introduction}\label{introduction} 187 188 Decision tree induction algorithms are among the most widely used algorithms for training interpretable machine learning models. This interpretability is one of the reasons they are often preferred over other models with better predictive capabilities \citep{survey, boosting}. While there has been much research on axis-aligned decision trees \citep{recent_overview, breiman1984cart, survey}, research on oblique decision trees has been less extensive \citep{HHCart, MurthyKS94, breiman1984cart}. 189 190 Where traditional decision trees use axis-aligned splits to partition the feature space, oblique decision trees use linear combinations of features that are not restricted to axis-alignment. This allows oblique decision trees to be stronger learners with shallower trees while still being fairly interpretable \citep{MurthyKS94}. However, this increased flexibility results in a larger search space as the number of hyperplanes that uniquely partition a dataset when considering the relative positions of samples with respect to the hyperplane is at most $2^m \cdot \binom{n}{m}$, where $m$ is the dimensionality of the feature space and $n$ is the number of samples \citep{MurthyKS94}. 191 192 \subsection{Related Works}\label{related_works} 193 194 \subsubsection{CART} 195 196 The Classification and Regression Tree (CART) algorithm \citep{breiman1984cart} induces axis-aligned decision trees by recursively partitioning the dataset. At each step, it evaluates a set of candidate axis-aligned splits, selects the best one, and partitions the dataset using this split. These steps are then performed recursively for each partition until a stopping criterion is met. 197 198 \subsubsection{CART-LC} 199 200 The Classification and Regression Tree - Linear Combinations (CART-LC) algorithm \citep{breiman1984cart} is a deterministic hill climbing algorithm for inducing oblique decision trees. The algorithm starts by median-centering the features and dividing them by their interquartile range. It then identifies the best axis-aligned split and iteratively adjusts the coefficients of the corresponding hyperplane to improve the split. Once improvements fall below a threshold, the algorithm converts the refined split from the transformed feature space to the original feature space, and uses this split to partition the dataset. This process is then applied recursively for each partition until a stopping criterion is met. 201 202 \subsubsection{OC1} 203 204 The Oblique Classifier 1 (OC1) algorithm \citep{MurthyKS94} is a stochastic hill climbing algorithm for inducing oblique decision trees. It begins by initializing a random hyperplane and deterministically perturbs the hyperplane's coefficients to improve its splitting score. If no improvements can be made, a new hyperplane is created by adding random values to the coefficients in an attempt to escape the local optimum. The new hyperplane is then evaluated against the previous one. If the new score is better, the process repeats, starting with this new hyperplane as the initial hyperplane. If it is not better, another random hyperplane is initialized, and the entire procedure starts again. This process of applying perturbations and then attempting to escape local optima is applied $z$ times, where $z$ is a hyperparameter of the algorithm. After $z$ iterations, the split with the best splitting score is then used to partition the dataset. The algorithm then recursively applies the above procedure to each partition until a stopping criterion is met. 205 206 \subsubsection{HHCART} 207 208 The Householder Classification and Regression Tree (HHCART) algorithm \citep{HHCart} induces oblique decision trees by leveraging class-specific orientation. HHCART(D), a variant of the algorithm, computes the dominant eigenvector of the covariance matrix for each class. Each of these dominant eigenvectors is then individually used to perform a Householder transformation to align the corresponding class orientation with a coordinate axis. In the resulting feature space, the algorithm searches for the best axis-aligned split, which is then mapped back to the original feature space as an oblique split. The best oblique split found after evaluating each dominant eigenvector is used to partition the dataset. This process is then applied to each of the resulting partitions until a stopping criterion is met. \citet{HHCart} defined two variants of HHCART: HHCART(D), which only uses the dominant eigenvector for each class, and HHCART(A), which uses all eigenvectors for each class. 209 210 \subsection{Limitations}\label{Limitations} 211 212 A limitation of the CART algorithm is that it only considers axis-aligned splits. These splits can be suboptimal when the data is linearly separable, as described by \citet{breiman1984cart}. Algorithms like CART-LC and OC1 are not constrained to axis-aligned splits, but they often converge to local optima due to their hill climbing nature. While OC1 attempts to escape local optima by adding random values to the coefficients of the splitting hyperplane after its improvements have stagnated, this strategy does not guarantee escaping local optima. HHCART is not limited in either of these regards, but it does only test splits that are axis-aligned in specific transformed feature spaces. This subset of oblique splits can be overly limited in cases where it is computationally feasible to evaluate more oblique splits. 213 214 Additionally, while CART-LC, OC1, and HHCART can generate shallower trees relative to CART \citep{MurthyKS94}, the resulting trees often use splits that are linear combinations of many features. This results in oblique trees that can be difficult to interpret, and more costly to perform predictions with. 215 216 \section{CART-ELC}\label{algorithm} 217 218 \subsection{Description}\label{Description} 219 220 We introduce CART-ELC to address the limitations described in \autoref{Limitations}. CART-ELC has one hyperparameter, $r$, defining both the minimum number of samples each candidate hyperplane must pass through and the maximum number of non-zero coefficients the hyperplane's normal vector can have. We define $m$ as the dimensionality of the feature space. With these definitions, we now discuss the two cases for the algorithm. 221 222 \textbf{Case 1: $\mathbf{r = m}$} 223 224 \begin{adjustwidth}{1em}{1em} 225 226 In the general case for $r=m$, CART-ELC performs an exhaustive search of all unique hyperplanes that pass through at least $r$ samples. This is done by evaluating each hyperplane according to a splitting criterion, tracking the best splitting hyperplane, and after all hyperplanes have been evaluated, the algorithm partitions the dataset using the best split. This process is then recursively applied to both partitions, stopping when a stopping criterion is met. The general case for $r=m$ is when each selection of $r$ samples uniquely identifies a hyperplane. In such cases, this process guarantees the greedily optimal hyperplane passing through at least $r$ samples will be used to partition the dataset at each splitting node. This guarantee does not apply in degenerate cases where samples don't uniquely identify a hyperplane, but when floating-point vectors are sampled from a continuous distribution, it is unlikely a selection of $r$ samples is affinely dependent. In such cases, the normal vector for a hyperplane passing through the $r$ samples is chosen according to the deterministic tie-breaking behavior of the underlying linear algebra solver's implementation. % This may not be the best splitting hyperplane passing through the $r$ samples. %, but enumerating all uniquely partitioning hyperplanes passing through the $r$ samples is computationally expensive. 227 228 % In the general case for $r=m$, CART-ELC performs an exhaustive search of all unique hyperplanes that pass through at least $r$ samples. The algorithm evaluates each candidate according to a splitting criterion, as shown on line 10 of \autoref{Pseudocode}. As the hyperplanes are evaluated, we track the best candidate, and after all hyperplanes have been evaluated, the algorithm partitions the dataset using the best split. This process is then recursively applied to both partitions, stopping when a stopping criterion is met. The general case for $r=m$ is when each selection of $r$ samples uniquely identifies a hyperplane. The calculation for this hyperplane is done on line 8 of \autoref{Pseudocode}. In such cases, this process guarantees the greedily optimal hyperplane passing through at least $r$ samples will be used to partition the dataset at each splitting node. This guarantee does not apply in degenerate cases where samples don't uniquely identify a hyperplane, but when floating-point vectors are sampled from a continuous distribution, it is unlikely a selection of $r$ samples is affinely dependent. In such cases, the normal vector for a hyperplane passing through the $r$ samples is chosen according to the deterministic tie-breaking behavior of the underlying linear algebra solver's implementation. This may not be the best splitting hyperplane passing through the $r$ samples, but enumerating all uniquely partitioning hyperplanes passing through the $r$ samples is computationally expensive. 229 230 231 \end{adjustwidth} 232 233 \textbf{Case 2: $\mathbf{r < m}$} 234 235 \begin{adjustwidth}{1em}{1em} 236 237 CART-ELC limits the search space to hyperplanes with at most $r$ non-zero coefficients and one bias term. In the general case, the algorithm considers each unique hyperplane defined by selecting a combination of $r$ features passing through at least $r$ samples where the remaining $m-r$ features have coefficients of zero. Like the $r=m$ case, each candidate is then evaluated using a splitting criterion with the best split chosen to partition the dataset. This process is then recursively applied to both partitions until a stopping criterion is met. Additionally, like the $r=m$ case, the deterministic tie-breaking behavior of the underlying linear algebra solver's implementation is used to select a normal vector for a hyperplane that passes through the $r$ samples when the selection of samples and features is affinely dependent. 238 239 \end{adjustwidth} 240 241 \textbf{Hyperplane Orientation} 242 243 \begin{adjustwidth}{1em}{1em} 244 245 Every hyperplane has two unit normal vectors in $\mathbb{R}^n$ for $n \in \mathbb{N}$. Since we assign samples to the left partition when samples are on or above the hyperplane, the selection of a normal vector can change how samples that lie on the hyperplane contribute to the splitting score. To eliminate this nondeterminism, one may enforce a canonical sign rule. %, such as requiring the first nonzero entry of the normal vector to be positive 246 Given this, optimality guarantees for hyperplane splitting only hold under the deterministic orientation of the hyperplanes. 247 248 \end{adjustwidth} 249 250 \begin{figure}[h] 251 \centering 252 \centering 253 \begin{tikzpicture}[scale=1] 254 % Draw axes 255 \draw[->] (-1,0) -- (4,0) node[right] {$x$}; 256 \draw[->] (0,-1) -- (0,4) node[above] {$y$}; 257 258 % Plot points 259 \fill[black] (1,1) circle (2pt) node[above] {$x_1$}; 260 \fill[black] (4,4) circle (2pt) node[below] {$x_2$}; 261 262 % Draw hyperplanes 263 \draw[thick, dashed] (0,1.25) -- (4,3.25) node[right] {$H_1$}; 264 \draw[thick, dashed] (0,2.5) -- (4,4.5) node[right] {$H_2$}; 265 \end{tikzpicture} 266 \caption{Hyperplanes resulting in unique partitionings of two samples.} 267 \label{fig:hyper} 268 \end{figure} 269 270 In the general case for cases 1 and 2, the algorithm guarantees finding the best splitting hyperplane with at most $r$ non-zero coefficients and a bias term that passes through at least $r$ samples at each splitting node, but in neither case can it guarantee this is the greedily optimal oblique split. An illustration of this is shown in \autoref{fig:hyper}. Based on the samples in \autoref{fig:hyper}, if we define $r = m$, only one hyperplane will be evaluated: the hyperplane that passes through both samples and places them in the same partition. This hyperplane will result in the same partitioning as the hyperplane $H_2$. However, a hyperplane that results in the same partitioning as hyperplane $H_1$ is better if $x_1$ and $x_2$ have different labels. Since CART-ELC does not evaluate a hyperplane that partitions the samples in this way, the algorithm can not guarantee greedily optimal oblique splits. 271 272 Additionally, there may be multiple selections of features and samples that describe the same hyperplane. While computing the splitting score for each hyperplane is not necessary, the algorithm will evaluate each. When floating-point vectors are sampled from a continuous distribution, it is infrequent different selections of samples and features will result in the same hyperplane. 273 274 The algorithm also defines two stopping criteria. The first stopping criterion is class homogeneity: if a partition contains samples that all belong to the same class, the algorithm will stop because subsequent splits would not change the model's predictions. 275 276 The second criterion is when a partition is empty. If no special handling is implemented, the algorithm will attempt to partition samples indefinitely in cases where one partition is empty and the other is not homogeneous. As such, the algorithm must verify both partitions are not empty after each partitioning. 277 278 Finally, the choice to not specify a splitting criterion, additional stopping criteria, specific hyperplane tie-breaking behavior, or pruning was made to ensure future research can use this algorithm without limiting its usefulness to specific domains. 279 280 \subsection{Pseudocode}\label{Pseudocode} 281 282 \begin{algorithm}[H] 283 \caption{CART-ELC} 284 \begin{algorithmic}[1] 285 \Function{fit}{samples, labels, $m$, $r$} 286 \If{homogeneous(labels)} 287 \State \Return Node(majorityClass(labels)) 288 \EndIf 289 \State bestSplit, bestSplittingScore $\gets$ None, worstSplittingScore() 290 \For{selectedSamples in combinations(samples, $r$)} 291 \For{selectedFeatures in combinations($m$, $r$)} 292 \State vectorsToPassThrough $\gets$ featureSubset(selectedSamples, selectedFeatures) 293 \State currentSplit $\gets$ findHyperplanePassingThrough(vectorsToPassThrough) 294 \State currentSplittingScore $\gets$ evaluateSplit(currentSplit, samples) 295 \If{ isBetterThan(currentSplittingScore, bestSplittingScore)} 296 \State bestSplittingScore, bestSplit $\gets$ currentSplittingScore, currentSplit 297 \EndIf 298 \EndFor 299 \EndFor 300 \State left, right $\gets$ splitDataByBestSplit(samples, labels, bestSplit) 301 \If{left is empty or right is empty} 302 \State \Return Node(majorityClass(labels)) 303 \EndIf 304 \State leftSubtree $\gets$ fit(left.samples, left.labels, $m$, $r$) 305 \State rightSubtree $\gets$ fit(right.samples, right.labels, $m$, $r$) 306 \State tree $\gets$ Node(bestSplit) 307 \State tree.left, tree.right $\gets$ leftSubtree, rightSubtree 308 \State \Return tree 309 \EndFunction 310 \end{algorithmic} 311 \end{algorithm} 312 313 \subsection{Splitting Criteria}\label{splitting_criteria} 314 315 To evaluate the time complexity of our algorithm and perform an empirical analysis, we must define some splitting criteria. The three we implemented and will define are the twoing criterion \citep{breiman1984cart}, the Gini criterion \citep{breiman1984cart}, and information gain \citep{quinlan1986}. The default splitting criterion for our implementation of CART-ELC is the Gini criterion. This decision is based on our empirical analysis in \autoref{empirical_criteria_comparison}, which showed the twoing and Gini criteria often result in trees that yield the same classification accuracy, but the Gini criterion is computationally cheaper. Despite this, our empirical evaluation in \autoref{empirical_comparison} uses the twoing criterion to maintain consistency with prior research. 316 317 \subsubsection{Twoing Criterion}\label{twoing} 318 319 The twoing criterion is a splitting criterion that evaluates the quality of a split by considering the number of samples on each side of the splitting hyperplane and the difference in the distribution of classifications between the two partitions. Trees using the twoing criterion tend to be well-balanced because the splitting score is maximized when the number of samples in each partition is equal. 320 321 The formula for the twoing criterion defines the following variables: 322 323 \begin{itemize} 324 \setlength{\itemsep}{0.1em} 325 \item \( p_L \): The probability a sample is in the left partition. 326 \item \( p_R \): The probability a sample is in the right partition. 327 \item \( C \): The set of all classes in the union of the left and right partitions. 328 \item \( p_{Lj} \): The probability a random sample from the left partition has a classification of \( j \). 329 \item \( p_{Rj} \): The probability a random sample from the right partition has a classification of \( j \). 330 \end{itemize} 331 332 \begin{equation*} 333 T = \frac{p_L p_R}{4} \left( \sum_{j \in C} \left| p_{Lj} - p_{Rj} \right| \right)^2 334 \label{twoing_equ} 335 \end{equation*} 336 337 338 \subsubsection{Gini Criterion}\label{gini} 339 340 The Gini criterion is the most popular splitting criterion for decision trees. Trees induced using the Gini criterion minimize class impurity, resulting in trees that may be less balanced than the twoing criterion. The splitting score is calculated by computing the Gini impurity for both partitions, weighting them based on the number of samples in each partition, and summing the results. 341 342 The formula for the Gini criterion defines the following variables: 343 344 \begin{itemize} 345 \setlength{\itemsep}{0.1em} 346 \item \( p_L \): The probability a sample is in the left partition. 347 \item \( p_R \): The probability a sample is in the right partition. 348 \item \( C \): The set of all classes in the union of the left and right partitions. 349 \item \( p_{Lj} \): The probability a random sample from the left partition has a classification of \( j \). 350 \item \( p_{Rj} \): The probability a random sample from the right partition has a classification of \( j \). 351 \end{itemize} 352 353 \begin{equation*} 354 G = p_L \left(1 - \sum_{j\in C} p_{Lj}^2 \right) + p_R \left(1 - \sum_{j \in C} p_{Rj}^2 \right) 355 \label{gini_equ} 356 \end{equation*} 357 358 \subsubsection{Information Gain}\label{IG} 359 360 Information gain is a criterion based on reducing class impurity. Unlike the Gini criterion, information gain calculates impurity using entropy. 361 362 The formula for information gain defines the following variables where $H(S)$ is the entropy of the set $S$ \citep{it}: 363 364 \begin{itemize} 365 \setlength{\itemsep}{0.1em} 366 \item \(S \): The set of all samples before the split. 367 \item \(S_L \): The set of all samples in the left partition. 368 \item \(S_R \): The set of all samples in the right partition. 369 \end{itemize} 370 371 \begin{equation*} 372 IG = H(S) - \left( \frac{|S_{L}|}{|S|} H(S_{L}) + \frac{|S_{R}|}{|S|} H(S_{R}) \right) 373 \end{equation*} 374 375 \subsection{Time Complexity}\label{complexity} 376 377 \subsubsection{Asymptotic Time Complexity Analysis}\label{analysis} 378 379 To evaluate the algorithm's asymptotic time complexity, we concentrate on the cost of finding and executing a split at a single decision node. Although a complete decision tree often contains multiple splitting nodes, the computational cost is most significant for the first split. Once the dataset is partitioned, subsequent splits operate on smaller subsets and thus require less computation. This is shown in \autoref{fig:operations} by considering the computational differences between the $n=1000$ and $n=500$ columns. 380 381 The homogeneity calculation, which requires at most \( n \) operations per label, is dominated by the complexity of the loop over each hyperplane. As such, it does not need to be considered for the final asymptotic time complexity. Next, we consider the evaluation of hyperplanes. Assume the feature matrix is \( n \times m \), and 382 383 \begin{equation*} 384 0 < r \leq n, m. 385 \end{equation*} 386 387 Hyperplane construction requires selecting \( r \) samples from \( n \), which can be done in \( \binom{n}{r} \) ways, and for each selection of samples, there are \( \binom{m}{r} \) ways to choose \( r \) features. Therefore, the total number of hyperplanes to evaluate is 388 389 \[ 390 \binom{n}{r} \cdot\binom{m}{r}. 391 \] 392 393 To derive the equation of the hyperplane passing through the \(r\) samples, an \( r \times r \) matrix is constructed based on the $r$ selected features and samples. Subsequently, its column-wise means are computed; these means are then subtracted from each row of the matrix, and the covariance matrix is calculated. Eigen decomposition is then performed on the resulting matrix to find the hyperplane's normal vector, and the hyperplane's bias term is determined by taking the dot product between the normal vector and the column-wise mean vector. These calculations are sufficient to derive the hyperplane's equation in normal form. The calculation of the covariance matrix and the eigen decomposition dominate the complexity of finding the equation for the hyperplane. Each is in the complexity class of \( \Theta(r^3) \), assuming the use of a naïve matrix multiplication approach \citep{matrix}. 394 395 We then evaluate the split using our splitting criterion. Assume our splitting criterion is the Gini criterion. To compute our splitting score, we evaluate 396 397 \begin{equation*} 398 G = p_L \left(1 - \sum_{j\in C} p_{Lj}^2 \right) + p_R \left(1 - \sum_{j \in C} p_{Rj}^2 \right). 399 \label{gini_usage} 400 \end{equation*} 401 402 The complexity of calculating the splitting score is dominated by determining which side of the hyperplane each sample lies on. This requires calculating the dot product between the sample and the hyperplane's normal vector, adding the bias term, and checking if the result is positive, negative, or zero. As such, the evaluation of one hyperplane is in the complexity class of \( \Theta(rn) \), and since \( |C| \leq n \), the number of classes does not impact the asymptotic time complexity of evaluating each hyperplane. 403 404 To split the dataset on the best split, we recompute the side of the hyperplane each sample lies on and add each sample to the corresponding list for samples that are either above or on the hyperplane or strictly below it. While this recomputation can be avoided by caching the results, it does not impact the overall asymptotic time complexity of the algorithm. The evaluation of all samples will take \( \Theta(rn) \) time, and since we need to move each sample to the appropriate list, and each sample has \( m \) features, this will take \( \Theta(nm) \) time. Thus, the asymptotic time complexity of splitting the dataset is in the complexity class of 405 406 \begin{equation*} 407 \Theta(rn + nm). 408 \end{equation*} 409 410 Since splitting on the best split is only done once, its complexity is dominated by the complexity of evaluating all hyperplanes. As such, this final split does not impact the overall asymptotic time complexity. 411 412 Since there are \( \binom{n}{r} \cdot \binom{m}{r} \) splits to evaluate, each split takes \( \Theta(r^3) \) operations to compute the hyperplane, and \( \Theta(rn) \) time to evaluate, we find the asymptotic time complexity of finding and executing a split at a single decision node is in the complexity class of 413 414 \[ 415 \Theta\left( \binom{n}{r} \cdot \binom{m}{r} \cdot r(r^2 + n) \right). 416 \] 417 418 \subsubsection{Operations for a Single Split}\label{ops} 419 420 \begin{table}[h] 421 \centering 422 \caption{Operations required for CART-ELC to identify and execute a split at a single decision node, assuming a multiplicative factor of one for the asymptotic time complexity, no additive constants, and $r = m$.} 423 \small 424 \begin{tabular}{lcccccc} 425 \addlinespace 426 \toprule 427 \multirow{2}{*}{r} & \multicolumn{6}{c}{n} \\ 428 \cmidrule(lr){2-7} 429 & 100 & 500 & 1000 & 5000 & 10000 & 20000 \\ 430 \midrule 431 1 & 1.01e+04 & 2.50e+05 & 1.00e+06 & 2.50e+07 & 1.00e+08 & 4.00e+08 \\ 432 2 & 1.03e+06 & 1.26e+08 & 1.00e+09 & 1.25e+11 & 1.00e+12 & 8.00e+12 \\ 433 3 & 5.29e+07 & 3.16e+10 & 5.03e+11 & 3.13e+14 & 5.00e+15 & 8.00e+16 \\ 434 4 & 1.82e+09 & 5.31e+12 & 1.68e+14 & 5.22e+17 & 1.67e+19 & 5.34e+20 \\ 435 5 & 4.71e+10 & 6.70e+14 & 4.23e+16 & 6.53e+20 & 4.17e+22 & 2.67e+24 \\ 436 6 & 9.73e+11 & 6.77e+16 & 8.50e+18 & 6.54e+23 & 8.35e+25 & 1.07e+28 \\ 437 7 & 1.67e+13 & 5.71e+18 & 1.43e+21 & 5.46e+26 & 1.39e+29 & 3.56e+31 \\ 438 8 & 2.44e+14 & 4.13e+20 & 2.05e+23 & 3.90e+29 & 1.99e+32 & 1.02e+35 \\ 439 9 & 3.10e+15 & 2.62e+22 & 2.59e+25 & 2.44e+32 & 2.49e+35 & 2.55e+38 \\ 440 10 & 3.46e+16 & 1.47e+24 & 2.90e+27 & 1.36e+35 & 2.77e+38 & 5.66e+41 \\ 441 \bottomrule 442 \end{tabular} 443 \label{fig:operations} 444 \end{table} 445 446 While the assumptions underlying \autoref{fig:operations} are idealized, it offers insights into the computational cost for CART-ELC to identify and split on a single decision node. These values indicate using CART-ELC on small datasets ($n \leq 1000$) when $r$ values are small ($r \leq 3$) remains manageable, but as $r$ and $n$ values increase, the computational costs quickly grow reaching $3.13 \cdot 10^{14}$ operations at $r = 3$, $n = 5000$ for a single split. 447 448 While performing our evaluations shown in \autoref{fig:results}, we found $r$ values of three or higher to be too costly in most cases. This was partially because the results shown in \autoref{fig:operations} assume $r=m$, but in \autoref{fig:results} this was often not true, and thus there was an additional multiplicative factor of $\binom{m}{r}$. 449 450 \section{Experiments and Results} 451 452 In this section, we present our data collection methodology and provide a detailed description of the datasets used. We then compare CART-ELC with other decision tree induction algorithms, evaluate different splitting criteria, and discuss our findings. 453 454 \subsection{Methodology}\label{methodology} 455 456 The results shown in \autoref{fig:results} for OC1 and OC1-AP \citep{MurthyKS94}, CART-LC and CART \citep{breiman1984cart}, and C4.5 \citep{c4.5}, where C4.5 and CART were collected using the IND 2.1 package \citep{buntine}, were reported by \citet{MurthyKS94}. Alongside these, we present results from our implementations of HHCART \citep{HHCart} and CART-ELC.% 457 \makeatletter 458 \@ifpackagewith{tmlr}{accepted}{\footnote{Our implementations of HHCART and CART-ELC are available at \url{https://github.com/andrewlaack/cart-elc}.}}{}% 459 \@ifpackagewith{tmlr}{preprint}{\footnote{Our implementations of HHCART and CART-ELC are available at \url{https://github.com/andrewlaack/cart-elc}.}}{}% 460 \makeatother 461 \space To ensure a fair comparison, we followed experimental procedures consistent with those of \citet{MurthyKS94}, differing only in our hyperparameter tuning strategy for CART-ELC and HHCART. The change to hyperparameter tuning was necessary due to our inclusion of a max depth hyperparameter for HHCART and CART-ELC, as well as the $r$ hyperparameter for CART-ELC. 462 463 Similar to the experiments performed by \citet{MurthyKS94}, HHCART and CART-ELC used the twoing criterion as the splitting criterion, defined in \autoref{splitting_criteria}. Additionally, HHCART and CART-ELC did not apply any post-pruning or pre-pruning strategies. 464 465 For each experiment with CART-ELC we defined two variables, $d$ and $j$, the largest max depth and $r$ value to test respectively. We set $d = 5$ because increasing the max depth beyond five often led to identical tree structures or overfitting (see \autoref{hyper_tuning}). We also set $j = 2$ for all datasets except for the iris dataset where we set $j = 4$. These choices for $j$ values were made due to computational constraints imposed by our algorithm's complexity and dataset sizes, outlined in \autoref{complexity}. 466 467 Shown below are the steps performed for each experiment. 468 469 \begin{enumerate} 470 \item Complete ten iterations of the following: 471 \begin{enumerate} 472 \item Randomly partition the dataset into five partitions of roughly equal size. 473 \item Perform a grid search over all integers $1 \leq i \leq j$ and $1 \leq k \leq d$ where $k$ is defined as the current max depth hyperparameter, and $i$ is defined as the current $r$ hyperparameter. For each iteration: 474 \begin{enumerate} 475 \item Build a tree using all partitions except for one as the training data. 476 \item Evaluate the accuracy of the tree on the unseen partition. 477 \item Repeat steps (i) and (ii) for each of the five partitions. 478 \item Calculate the mean accuracy and tree size across the five iterations where tree size is defined as the number of leaf nodes in the tree. 479 \end{enumerate} 480 \end{enumerate} 481 \item Calculate the mean and standard deviation of the tree sizes and accuracies based on the final results of each cross-validation (CV) \citep{cv} iteration for each hyperparameter pair. 482 \item Select the hyperparameter pair that offers the best balance between mean accuracy and tree size. The mean and standard deviation of the accuracy and tree size with the selected hyperparameter pair are the results of the experiment. 483 \end{enumerate} 484 485 For our evaluation of both HHCART algorithms, we performed the same steps outlined above, but HHCART does not have an $r$ hyperparameter, and thus our hyperparameter search was only performed for max depth values. 486 487 \subsection{Datasets}\label{datasets} 488 489 \subsubsection{Star/Galaxy Discrimination (Bright)}\label{sg_bright} 490 491 This dataset was derived from bright images collected by \citet{odewahn} of stars and galaxies. The dataset is comprised of 2,462 samples, 14 features, and two classifications. The two classifications are star and galaxy. The 14 features were measurements defined by the original researchers that they deemed to be relevant for star/galaxy discrimination. 492 493 \subsubsection{Star/Galaxy Discrimination (Dim)}\label{sg_dim} 494 495 This dataset was derived from dim images collected by \citet{odewahn} of stars and galaxies. The dataset is comprised of 4,192 samples, 14 features, and two classifications. The two classifications are star and galaxy. The 14 features were measurements defined by the original researchers that they deemed to be relevant for star/galaxy discrimination. 496 497 \subsubsection{Breast Cancer Diagnosis}\label{Breast Cancer Diagnosis} 498 499 This dataset \citep{breast_cancer} is comprised of 699 samples with nine features and two classifications. The two classifications are malignant and benign. Similar to the evaluations performed by \citet{MurthyKS94}, we removed the 16 samples from this dataset that were missing the "Bare Nuclei" feature. Of the remaining 683 samples, 444 were labeled as benign and 239 were labeled as malignant. 500 501 \subsubsection{Iris Classification}\label{iris} 502 503 This dataset \citep{iris} is comprised of 150 samples with four features and three classifications. Each of the three classifications specifies a different species of iris and contains 50 samples. 504 505 \subsubsection{Boston Housing Cost}\label{housing} 506 507 This dataset \citep{boston} is comprised of 506 samples with 12 continuous features, one binary feature, and two classifications. While the original dataset contained a continuous label, we discretized the dataset by assigning the class of a given sample to be one if the continuous label's value was strictly less than 21,000 and two if it was greater than or equal to 21,000, as was done by \citet{MurthyKS94}. After this preprocessing, the dataset contained 260 samples with a classification of two and 246 samples with a classification of one. The dataset was also missing some features for select samples which we left as NaN for our evaluation of CART-ELC and performed mean imputation for HHCART as HHCART is unable to handle missing features. 508 509 \subsubsection{Diabetes Diagnosis}\label{diabetes} 510 511 This dataset \citep{diabetes} is comprised of 768 samples with eight features and two classifications. These two classifications represent testing positive and negative for diabetes. Of the 768 samples, 500 tested negative for diabetes, and 268 tested positive. 512 513 514 \newpage 515 \subsection{Empirical Algorithmic Comparison}\label{empirical_comparison} 516 517 \begin{table}[h] 518 \centering 519 \small 520 \caption{Accuracy and tree size comparison across decision tree induction algorithms.} 521 \begin{tabular}{lcccccc} 522 \addlinespace 523 \toprule 524 \multirow{2}{*}{Algorithm} & \multicolumn{6}{c}{Accuracy} \\ 525 \cmidrule(lr){2-7} 526 & S/G Bright & S/G Dim & Cancer & Iris & Housing & Diabetes \\ 527 \midrule 528 CART-ELC & \textbf{98.9 ± 0.2} & \textbf{95.2 ± 0.5} & 96.3 ± 0.4 & 95.1 ± 0.8 & 83.5 ± 0.7 & \textbf{74.5 ± 1.3} \\ 529 530 HHCART(A) & 98.3 ± 0.5 & 93.7 ± 0.8 & \textbf{96.9 ± 0.3} & \textbf{95.5 ± 1.4} & \textbf{83.9 ± 0.8} & 73.2 ± 1.2 \\ 531 HHCART(D) & 98.1 ± 0.4 & 93.7 ± 0.9 & \textbf{96.9 ± 0.3} & 94.3 ± 1.5 & 82.2 ± 1.4 & 73.2 ± 1.2 \\ 532 OC1 & \textbf{98.9 ± 0.2} & 95.0 ± 0.3 & 96.2 ± 0.3 & 94.7 ± 3.1 & 82.4 ± 0.8 & 74.4 ± 1.0 \\ 533 OC1-AP & 98.1 ± 0.2 & 94.0 ± 0.2 & 94.5 ± 0.5 & 92.7 ± 2.4 & 81.8 ± 1.0 & 73.8 ± 1.0 \\ 534 CART-LC & 98.8 ± 0.2 & 92.8 ± 0.5 & 95.3 ± 0.6 & 93.5 ± 2.9 & 81.4 ± 1.2 & 73.7 ± 1.2 \\ 535 CART & 98.5 ± 0.5 & 94.2 ± 0.7 & 95.0 ± 1.6 & 93.8 ± 3.7 & 82.1 ± 3.5 & 73.9 ± 3.4 \\ 536 C4.5 & 98.5 ± 0.5 & 93.3 ± 0.8 & 95.3 ± 2.0 & 95.1 ± 3.2 & 83.2 ± 3.1 & 71.4 ± 3.3 \\ 537 \midrule 538 \multirow{2}{*}{Algorithm} & \multicolumn{6}{c}{Tree Size} \\ 539 \cmidrule(lr){2-7} 540 & S/G Bright & S/G Dim & Cancer & Iris & Housing & Diabetes \\ 541 \midrule 542 CART-ELC & \textbf{3.7 ± 0.2} & \textbf{9.8 ± 4.2} & \textbf{2.0 ± 0.0} & 4.8 ± 0.1 & \textbf{4.0 ± 0.0} & \textbf{4.0 ± 0.0} \\ 543 HHCART(A) & 6.1 ± 0.3 & 14.6 ± 4.8 & \textbf{2.0 ± 0.0} & \textbf{3.1 ± 0.1} & 7.8 ± 0.2 & \textbf{4.0 ± 0.0} \\ 544 HHCART(D) & 6.3 ± 0.4 & 14.9 ± 5.0 & \textbf{2.0 ± 0.0} & 4.7 ± 0.1 & 23.3 ± 0.8 & \textbf{4.0 ± 0.0} \\ 545 OC1 & 4.3 ± 1.0 & 13.0 ± 8.7 & 2.8 ± 0.9 & \textbf{3.1 ± 0.2} & 6.9 ± 3.2 & 5.4 ± 3.8 \\ 546 OC1-AP & 6.9 ± 2.4 & 29.3 ± 8.8 & 6.4 ± 1.7 & 3.2 ± 0.3 & 8.6 ± 4.5 & 11.4 ± 7.5 \\ 547 CART-LC & 3.9 ± 1.3 & 24.2 ± 8.7 & 3.5 ± 0.9 & 3.2 ± 0.3 & 5.8 ± 3.2 & 8.0 ± 5.2 \\ 548 CART & 13.9 ± 5.7 & 30.4 ± 10.0 & 11.5 ± 7.2 & 4.3 ± 1.6 & 15.1 ± 10 & 11.5 ± 9.1 \\ 549 C4.5 & 14.3 ± 2.2 & 77.9 ± 7.4 & 9.8 ± 2.2 & 4.6 ± 0.8 & 28.2 ± 3.3 & 56.3 ± 7.9 \\ 550 \bottomrule 551 \end{tabular} 552 \label{fig:results} 553 \end{table} 554 555 556 The results presented in \autoref{fig:results} demonstrate that CART-ELC, HHCART(A), HHCART(D), and OC1 obtained consistently strong accuracies across the datasets evaluated when compared with other decision tree induction algorithms. Additionally, of these top performers, CART-ELC had the smallest trees across most datasets, falling behind on the iris dataset where a higher max depth value was selected due to minor variances in accuracies across max depth and $r$ values. Finally, we note the standard deviations in accuracy for CART-ELC, on average, were lower than those of OC1 and the variants of HHCART. 557 558 559 \begin{table}[h] 560 \centering 561 \caption{P-values for accuracy comparisons between various models and CART-ELC. P-values strictly less than 0.05 are bolded. P-values reported as 0.000 are not zero, but are smaller than the reporting precision.} 562 \small 563 \begin{tabular}{lccccccc} 564 \addlinespace 565 \toprule 566 \multirow{1}{*}{Algorithm} & \multirow{1}{*}{S/G Bright} & \multirow{1}{*}{S/G Dim} & \multirow{1}{*}{Cancer} & \multirow{1}{*}{Iris} & \multirow{1}{*}{Housing} & \multirow{1}{*}{Diabetes} \\ 567 \midrule 568 HHCART(A) & \textbf{0.004} & \textbf{0.000} & \textbf{0.001} & 0.446 & 0.250 & \textbf{0.032} \\ 569 HHCART(D) & \textbf{0.000} & \textbf{0.000} & \textbf{0.001} & 0.159 & \textbf{0.021} & \textbf{0.032} \\ 570 OC1 & 1.000 & 0.296 & 0.536 & 0.701 & \textbf{0.004} & 0.849 \\ 571 OC1-AP & \textbf{0.000} & \textbf{0.000} & \textbf{0.000} & \textbf{0.012} & \textbf{0.000} & 0.195 \\ 572 CART-LC & 0.278 & \textbf{0.000} & \textbf{0.000} & 0.122 & \textbf{0.000} & 0.170 \\ 573 CART & \textbf{0.037} & \textbf{0.002} & \textbf{0.032} & 0.303 & 0.244 & 0.612 \\ 574 C4.5 & \textbf{0.037} & \textbf{0.000} & 0.153 & 1.000 & 0.771 & \textbf{0.017} \\ 575 \bottomrule 576 \end{tabular} 577 \label{fig:p_values} 578 \end{table} 579 580 The results in \autoref{fig:p_values}, obtained by performing Welch's t-tests, compare the accuracies of each algorithm against CART-ELC. These results support the finding that CART-ELC, HHCART, and OC1 are the best-performing decision tree induction algorithms on the datasets evaluated. Additionally, CART-ELC had statistically significant performance improvements on both star/galaxy discrimination datasets and the diabetes dataset relative to both HHCART variants and only had statistically significant performance improvements when compared to OC1 on the housing dataset. The final result of interest is that both variants of HHCART performed better than CART-ELC on the cancer dataset by a statistically significant amount. 581 582 \begin{table}[h] 583 \centering 584 \caption{Cohen's d effect size for accuracies between various models and CART-ELC. Comparisons with $p < 0.05$ are bolded.} 585 \small 586 \begin{tabular}{lccccccc} 587 \addlinespace 588 \toprule 589 \multirow{1}{*}{Algorithm} & \multirow{1}{*}{S/G Bright} & \multirow{1}{*}{S/G Dim} & \multirow{1}{*}{Cancer} & \multirow{1}{*}{Iris} & \multirow{1}{*}{Housing} & \multirow{1}{*}{Diabetes} \\ 590 \midrule 591 592 HHCART(A) & \textbf{1.576} & \textbf{2.249} & \textbf{-1.697} & -0.351 & -0.532 & \textbf{1.039} \\ 593 HHCART(D) & \textbf{2.530} & \textbf{2.060} & \textbf{-1.697} & 0.666 & \textbf{1.175} & \textbf{1.039} \\ 594 OC1 & 0.000 & 0.485 & 0.283 & 0.177 & \textbf{1.463} & 0.086 \\ 595 OC1-AP & \textbf{4.000} & \textbf{3.151} & \textbf{3.976} & \textbf{1.342} & \textbf{1.970} & 0.604 \\ 596 CART-LC & 0.500 & \textbf{4.800} & \textbf{1.961} & 0.752 & \textbf{2.138} & 0.639 \\ 597 CART & \textbf{1.050} & \textbf{1.644} & \textbf{1.115} & 0.486 & 0.555 & 0.233 \\ 598 C4.5 & \textbf{1.050} & \textbf{2.848} & 0.693 & 0.000 & 0.133 & \textbf{1.236} \\ 599 \bottomrule 600 \end{tabular} 601 \label{fig:cohens_d} 602 \end{table} 603 604 605 The results in \autoref{fig:cohens_d} show the effect sizes of the difference between accuracies with respect to CART-ELC. Negative values indicate the algorithm outperformed CART-ELC on the dataset, and positive values indicate CART-ELC outperformed the algorithm on the dataset. These results show the magnitude of CART-ELC's outperformance of HHCART on the star/galaxy datasets is greater than the magnitude of HHCART's outperformance of CART-ELC on the cancer dataset. 606 607 HHCART's strong performance relative to CART-ELC on the cancer dataset is likely due to the dataset being both linearly separable and having a sizable gap between the two classes. HHCART typically uses splits that pass through one sample in the transformed feature space, and thus it can find splits with a larger separation on both sides with respect to the classes on each side. This contrasts CART-ELC where each hyperplane generally passes through multiple samples and will thus generalize worse on some datasets because the separation on both sides of the hyperplane will be smaller. 608 609 Despite this, a limitation of HHCART that makes CART-ELC perform better on certain datasets is its inability to handle outliers well. Outliers nominally impact CART-ELC as the splitting criterion is minimally impacted by them, but HHCART uses a change of basis derived from the eigen decomposition of the covariance matrix for each class, and these resulting eigenvectors can be greatly impacted by outliers. This results in HHCART searching for axis-aligned splits in suboptimal feature spaces. Similarly, when there are clusters of samples with the same classification throughout the feature space, this can also pose issues for HHCART as the covariance matrices will be impacted by the disparate groups more than CART-ELC will be. 610 611 \subsection{Empirical Criteria Comparison}\label{empirical_criteria_comparison} 612 613 \begin{table}[h] 614 \centering 615 \caption{Accuracy and tree size comparison across splitting criteria.} 616 \small 617 \begin{tabular}{lccccccc} 618 \addlinespace 619 \toprule 620 \multirow{2}{*}{Splitting Criterion} & \multicolumn{6}{c}{Accuracy} \\ 621 \cmidrule(lr){2-7} 622 \multirow{1}{*}{} & \multirow{1}{*}{S/G Bright} & \multirow{1}{*}{S/G Dim} & \multirow{1}{*}{Cancer} & \multirow{1}{*}{Iris} & \multirow{1}{*}{Housing} & \multirow{1}{*}{Diabetes} \\ 623 \midrule 624 \hyperref[twoing]{Twoing Criterion} & \textbf{98.9 ± 0.2} & \textbf{95.2 ± 0.5} & \textbf{96.3 ± 0.4} & 95.1 ± 0.8 & \textbf{83.5 ± 0.7} & \textbf{74.5 ± 1.3} \\ 625 626 \hyperref[gini]{Gini Criterion} & \textbf{98.9 ± 0.3} & 95.1 ± 0.5 & \textbf{96.3 ± 0.4} & 95.1 ± 0.9 & \textbf{83.5 ± 0.7} & \textbf{74.5 ± 1.3} \\ 627 628 \hyperref[IG]{Information Gain}& \textbf{98.9 ± 0.2} & 94.9 ± 0.5 & 96.0 ± 0.4 & \textbf{95.3 ± 0.8} & 81.8 ± 1.0 & 74.0 ± 1.1 \\ 629 630 \midrule 631 \multirow{2}{*}{Splitting Criterion} & \multicolumn{6}{c}{Tree Size} \\ 632 \cmidrule(lr){2-7} 633 & S/G Bright & S/G Dim & Cancer & Iris & Housing & Diabetes \\ 634 \midrule 635 636 637 \hyperref[twoing] {Twoing Criterion} & 3.7 ± 0.2 & \textbf{9.8 ± 4.2} & \textbf{2.0 ± 0.0} & 4.8 ± 0.1 & 4.0 ± 0.0 & \textbf{4.0 ± 0.0} \\ 638 \hyperref[gini]{Gini Criterion} & \textbf{3.6 ± 0.1} & 12.4 ± 8.3 & \textbf{2.0 ± 0.0} & \textbf{3.0 ± 0.1} & 4.0 ± 0.0 & \textbf{4.0 ± 0.0} \\ 639 \hyperref[IG]{Information Gain} & 3.8 ± 0.2 & 9.9 ± 4.5 & \textbf{2.0 ± 0.0} & 4.8 ± 0.1 & \textbf{2.0 ± 0.0} & \textbf{4.0 ± 0.0} \\ 640 641 \bottomrule 642 \end{tabular} 643 \label{fig:crits} 644 \end{table} 645 646 Our experimental results across splitting criteria, shown in \autoref{fig:crits}, indicate the twoing criterion and Gini criterion are both strong splitting criteria, and information gain only falls behind nominally. Additionally, on the dataset we evaluated that contained more than two classifications, information gain outperformed the other two criteria. 647 648 \section{Conclusions and Future Work}\label{conclusions_and_future_work} 649 650 CART-ELC achieves competitive accuracies across a variety of small datasets when compared with other decision tree algorithms despite only searching a subset of all unique hyperplanes. Additionally, while other oblique decision trees often use splits that are linear combinations of many features, CART-ELC's $r$ hyperparameter limits the number of features in each linear combination, which can result in simpler decision boundaries. This, along with CART-ELC's ability to generate shallower trees, often results in trees that are both more interpretable and efficient when making predictions. 651 652 Despite this, the computational complexity of CART-ELC remains prohibitively high for large datasets. As such, future research should focus on strategies to mitigate these computational costs, such as developing methods to select a smaller subset of candidate hyperplanes or using CART-ELC with ensemble methods. Adapting CART-ELC for use in random forests \citep{rndfrst} and stochastic gradient boosting \citep{sgboost} may be a promising direction, as these approaches could reduce computational costs by training on subsets of samples or by introducing stochastic elements into the training process. Finally, while our evaluation included six datasets, further experimentation on a wider variety of datasets and more extensive CV may be beneficial for assessing the algorithm's overall efficacy. 653 654 \bibliography{references} 655 \bibliographystyle{tmlr} 656 657 \appendix 658 659 \newpage 660 661 \section{Hyperparameter Tuning Results for CART-ELC}\label{hyper_tuning} 662 663 Figures~\ref{graph_twoing}--\ref{graph_ig} show the accuracies and tree sizes for CART-ELC across each splitting criterion, dataset, $r$ value, and max depth. These results show higher $r$ values often result in higher accuracies at lower max depths but begin overfitting at lower max depths. We also see the accuracies and tree sizes across splitting criteria are quite consistent, often varying in accuracy but rarely in trends. Finally, on the S/G Dim dataset, accuracies appear to be plateauing around a max depth of five for $r=2$ while on the other datasets, max depths of five either result in models overfitting or accuracies stagnating for $r \geq 2$. 664 665 \begin{figure}[h] 666 \includegraphics[width=\linewidth]{twoing.pdf} 667 \caption{Max depth and $r$ value results for CART-ELC when using the twoing criterion as the splitting criterion.} 668 \label{graph_twoing} 669 \end{figure} 670 671 \begin{figure}[h] 672 \includegraphics[width=\linewidth]{gini.pdf} 673 \caption{Max depth and $r$ value results for CART-ELC when using the Gini criterion as the splitting criterion.} 674 \label{graph_gini} 675 \end{figure} 676 677 \begin{figure}[h] 678 \includegraphics[width=\linewidth]{ig.pdf} 679 \caption{Max depth and $r$ value results for CART-ELC when using information gain as the splitting criterion.} 680 \label{graph_ig} 681 \end{figure} 682 683 \end{document}