cart-elc

Source code for CART-ELC
git clone git://git.laack.co/cart-elc.git
Log | Files | Refs | README | LICENSE

main.tex (52312B)


      1 \documentclass[10pt]{article} % For LaTeX2e
      2 \usepackage{changepage} % Put this in your preamble
      3 \usepackage{float}
      4 \usepackage[preprint]{tmlr}
      5 \pdfoutput=1
      6 \usepackage{caption}
      7 
      8 \usepackage{pgfplots}
      9 \pgfplotsset{compat=1.17} % or another compatible version for arXiv
     10 
     11 \usepackage{graphicx}
     12 \usepackage{ifthen}
     13 
     14 \usepackage{multirow}
     15 \usepackage{booktabs}
     16 
     17 \newboolean{isFinal}
     18 \setboolean{isFinal}{false}
     19 
     20 \usepackage{listings}
     21 
     22 \usepackage{enumitem}
     23 \usepackage{algorithm}
     24 \usepackage{algpseudocode}
     25 
     26 % If accepted, instead use the following line for the camera-ready submission:
     27 %\usepackage[accepted]{tmlr}
     28 % To de-anonymize and remove mentions to TMLR (for example for posting to preprint servers), instead use the following:
     29 %\usepackage[preprint]{tmlr}
     30 
     31 % Optional math commands from https://github.com/goodfeli/dlbook_notation.
     32 \input{math_commands.tex}
     33 
     34 \usepackage{url}
     35 
     36 \usepackage{hyperref}
     37 
     38 \lstdefinestyle{mystyle}{
     39     basicstyle=\ttfamily\footnotesize,   % Set the font and size
     40     numbers=left,                        % Add line numbers
     41     numberstyle=\tiny,                   % Style of line numbers
     42     stepnumber=1,                         % Step between line numbers
     43     numbersep=5pt,                        % Distance of numbers from code
     44     backgroundcolor=\color{gray!10},      % Light gray background
     45     keywordstyle=\color{blue}\bfseries,   % Style for keywords
     46     commentstyle=\color{green!50!black},  % Style for comments
     47     stringstyle=\color{red},              % Style for strings
     48     breaklines=true,                      % Auto line breaking
     49     tabsize=4,                            % Tab width
     50     frame=single                          % Draw a frame around the code
     51 }
     52 
     53 \usepackage{natbib}
     54 
     55 \title{CART-ELC: Oblique Decision Tree Induction via Exhaustive Search}
     56 
     57 % Authors must not appear in the submitted version. They should be hidden
     58 % as long as the tmlr package is used without the [accepted] or [preprint] options.
     59 % Non-anonymous submissions will be rejected without review.
     60 
     61 \author{\name Andrew D. Laack \email andrew@laack.co\\
     62       University of Wisconsin-Superior
     63 }
     64 
     65 % The \author macro works with any number of authors. Use \AND 
     66 % to separate the names and addresses of multiple authors.
     67 
     68 \newcommand{\fix}{\marginpar{FIX}}
     69 \newcommand{\new}{\marginpar{NEW}}
     70 
     71 \def\month{MM}  % Insert correct month for camera-ready version
     72 \def\year{YYYY} % Insert correct year for camera-ready version
     73 \def\openreview{\url{https://openreview.net/forum?id=XXXX}} % Insert correct link to OpenReview for camera-ready version
     74 
     75 
     76 \renewcommand{\thealgorithm}{\arabic{algorithm}:}
     77 
     78 \begin{document}
     79 
     80 \ifthenelse{\boolean{isFinal}}{
     81 
     82 	{\large \textbf{CART-ELC: Oblique Decision Tree Induction via Exhaustive Search}} \\
     83 	{ {Andrew D. Laack}}
     84 
     85     \section{Prerequisites}
     86 
     87 	Terms a broad computer science audience may not be familiar with are defined in this section.
     88 
     89     \begin{enumerate}
     90         \item \textbf{Machine Learning:} 
     91 			A field that focuses on algorithms that learn from patterns in data.
     92         \item \textbf{Sample:} 
     93 			A single data point, often the result of an experiment. Mathematically, samples are vectors. 
     94         \item \textbf{Feature:}  
     95 			A coordinate of a sample, or an attribute of all samples.
     96 		\item \textbf{Label:}  
     97 			Label and classification are used interchangeably to describe the value we are trying to predict about a sample.
     98         \item \textbf{Model:}  
     99 			A model is a trained implementation of a machine learning algorithm that has learned patterns from data.
    100         \item \textbf{Decision Tree:}  
    101 			A machine learning algorithm that uses a binary tree for classification (discrete labels) or regression (continuous labels) where tree traversals are done based on feature values for the sample being evaluated. Additionally, decision trees generally use axis-aligned splitting boundaries. Pseudocode for a decision tree algorithm is found in \autoref{decision_tree}.
    102 		\item \textbf{Splitting Criterion:}  
    103 			A function to quantify how good a candidate split is for a decision tree (see \autoref{splitting_criteria}).
    104         \item \textbf{Hyperplane:}  
    105 			A mathematical object that has $n-1$ dimensions in $n$ dimensional space.
    106         \item \textbf{Oblique Decision Tree:}  
    107 			A decision tree that uses hyperplanes as splitting boundaries that are not constrained to axis-alignment.
    108         \item \textbf{Accuracy:}  
    109 			The relative frequency of correct predictions made by a model on a given dataset.
    110         \item \textbf{P-value:}  
    111 			The probability of obtaining a result at least as strong as the one observed, assuming the null hypothesis is true.
    112         \item \textbf{Cohen's d:}  
    113 			A measure of effect size quantifying the difference in standard deviations between two means.
    114     \end{enumerate}
    115 
    116 	\newpage
    117 
    118 
    119 \subsection{Decision Tree Pseudocode}\label{decision_tree}
    120 
    121 \begin{algorithm}
    122 	\caption{CART Algorithm}
    123 \begin{algorithmic}[1]
    124 \Function{fit}{samples, labels, featureCount}
    125     \If{homogeneous(labels)}
    126         \State \Return Node(majorityClass(labels)) 
    127     \EndIf
    128 	\State bestSplit, bestSplittingScore $\gets$ None, worstSplittingScore()
    129     \For{sample in samples}
    130 		\For{feature in range(0,featureCount)}
    131 			\State currentSplit $\gets$ (feature, sample[feature])
    132             \State currentSplittingScore $\gets$ evaluateSplit(currentSplit, samples)
    133 			\If{ isBetterThan(currentSplittingScore, bestSplittingScore)}
    134                 \State bestSplittingScore, bestSplit  $\gets$ currentSplittingScore, currentSplit
    135             \EndIf
    136         \EndFor
    137     \EndFor
    138     \State left, right $\gets$ splitDataByBestSplit(samples, labels, bestSplit)
    139 	\If{left is empty or right is empty}
    140 		\State \Return Node(majorityClass(labels)) 
    141 	\EndIf
    142     \State leftSubtree $\gets$ fit(left.samples, left.labels, featureCount)
    143     \State rightSubtree $\gets$ fit(right.samples, right.labels, featureCount)
    144     \State tree $\gets$ Node(bestSplit)
    145     \State tree.left, tree.right $\gets$ leftSubtree, rightSubtree
    146     \State \Return tree
    147 \EndFunction
    148 \end{algorithmic}
    149 \end{algorithm}
    150 
    151 \vfill
    152 
    153     \begin{center}
    154 		\textbf{\large This concludes the Prerequisites section.}
    155     \end{center}
    156 
    157 \newpage
    158 }{} 
    159 
    160 
    161 \maketitle
    162 
    163 %\vspace{-.75cm}
    164 %\vspace{.5cm}
    165 
    166 
    167 \begin{abstract}
    168 
    169 Oblique decision trees have attracted attention due to their potential for improved classification performance over traditional axis-aligned decision trees. However, methods that rely on exhaustive search to find oblique splits face computational challenges. As a result, they have not been widely explored. We introduce a novel algorithm, Classification and Regression Tree - Exhaustive Linear Combinations (CART-ELC), for inducing oblique decision trees that performs an exhaustive search on a restricted set of hyperplanes. We then investigate the algorithm's  computational complexity and its predictive capabilities. Our results demonstrate that CART-ELC consistently achieves competitive performance on small datasets, often yielding statistically significant improvements in classification accuracy relative to existing decision tree induction algorithms, while frequently producing shallower, simpler, and thus more interpretable trees.
    170 
    171 
    172 
    173 \end{abstract}
    174 
    175 %
    176 %\makeatletter
    177 %\@ifpackagewith{tmlr}{preprint}{
    178 %
    179 %\noindent\textbf{Keywords:} Decision trees, Oblique decision trees, Supervised learning, Classification algorithms
    180 %}{}%
    181 %\makeatother
    182 
    183 
    184 
    185 
    186 \section{Introduction}\label{introduction}
    187 
    188 Decision tree induction algorithms are among the most widely used algorithms for training interpretable machine learning models. This interpretability is one of the reasons they are often preferred over other models with better predictive capabilities \citep{survey, boosting}. While there has been much research on axis-aligned decision trees \citep{recent_overview, breiman1984cart, survey}, research on oblique decision trees has been less extensive \citep{HHCart, MurthyKS94, breiman1984cart}.
    189 
    190 Where traditional decision trees use axis-aligned splits to partition the feature space, oblique decision trees use linear combinations of features that are not restricted to axis-alignment. This allows oblique decision trees to be stronger learners with shallower trees while still being fairly interpretable \citep{MurthyKS94}. However, this increased flexibility results in a larger search space as the number of hyperplanes that uniquely partition a dataset when considering the relative positions of samples with respect to the hyperplane is at most $2^m \cdot \binom{n}{m}$,  where $m$ is the dimensionality of the feature space and $n$ is the number of samples \citep{MurthyKS94}.
    191 
    192 \subsection{Related Works}\label{related_works}
    193 
    194   \subsubsection{CART}
    195 
    196 The Classification and Regression Tree (CART) algorithm \citep{breiman1984cart} induces axis-aligned decision trees by recursively partitioning the dataset. At each step, it evaluates a set of candidate axis-aligned splits, selects the best one, and partitions the dataset using this split. These steps are then performed recursively for each partition until a stopping criterion is met.
    197 
    198   \subsubsection{CART-LC}
    199 
    200 The Classification and Regression Tree - Linear Combinations (CART-LC) algorithm \citep{breiman1984cart} is a deterministic hill climbing algorithm for inducing oblique decision trees. The algorithm starts by median-centering the features and dividing them by their interquartile range. It then identifies the best axis-aligned split and iteratively adjusts the coefficients of the corresponding hyperplane to improve the split. Once improvements fall below a threshold, the algorithm converts the refined split from the transformed feature space to the original feature space, and uses this split to partition the dataset. This process is then applied recursively for each partition until a stopping criterion is met.
    201 
    202   \subsubsection{OC1}
    203 
    204 The Oblique Classifier 1 (OC1) algorithm \citep{MurthyKS94} is a stochastic hill climbing algorithm for inducing oblique decision trees. It begins by initializing a random hyperplane and deterministically perturbs the hyperplane's coefficients to improve its splitting score. If no improvements can be made, a new hyperplane is created by adding random values to the coefficients in an attempt to escape the local optimum. The new hyperplane is then evaluated against the previous one. If the new score is better, the process repeats, starting with this new hyperplane as the initial hyperplane. If it is not better, another random hyperplane is initialized, and the entire procedure starts again. This process of applying perturbations and then attempting to escape local optima is applied $z$ times, where $z$ is a hyperparameter of the algorithm. After $z$ iterations, the split with the best splitting score is then used to partition the dataset. The algorithm then recursively applies the above procedure to each partition until a stopping criterion is met.
    205 
    206 \subsubsection{HHCART}
    207 
    208 The Householder Classification and Regression Tree (HHCART) algorithm \citep{HHCart} induces oblique decision trees by leveraging class-specific orientation. HHCART(D), a variant of the algorithm, computes the dominant eigenvector of the covariance matrix for each class. Each of these dominant eigenvectors is then individually used to perform a Householder transformation to align the corresponding class orientation with a coordinate axis. In the resulting feature space, the algorithm searches for the best axis-aligned split, which is then mapped back to the original feature space as an oblique split. The best oblique split found after evaluating each dominant eigenvector is used to partition the dataset. This process is then applied to each of the resulting partitions until a stopping criterion is met. \citet{HHCart} defined two variants of HHCART: HHCART(D), which only uses the dominant eigenvector for each class, and HHCART(A), which uses all eigenvectors for each class.
    209 
    210 \subsection{Limitations}\label{Limitations}
    211 
    212 A limitation of the CART algorithm is that it only considers axis-aligned splits. These splits can be suboptimal when the data is linearly separable, as described by \citet{breiman1984cart}. Algorithms like CART-LC and OC1 are not constrained to axis-aligned splits, but they often converge to local optima due to their hill climbing nature. While OC1 attempts to escape local optima by adding random values to the coefficients of the splitting hyperplane after its improvements have stagnated, this strategy does not guarantee escaping local optima. HHCART is not limited in either of these regards, but it does only test splits that are axis-aligned in specific transformed feature spaces. This subset of oblique splits can be overly limited in cases where it is computationally feasible to evaluate more oblique splits.
    213 
    214 Additionally, while CART-LC, OC1, and HHCART can generate shallower trees relative to CART \citep{MurthyKS94}, the resulting trees often use splits that are linear combinations of many features. This results in oblique trees that can be difficult to interpret, and more costly to perform predictions with.
    215 
    216 \section{CART-ELC}\label{algorithm}
    217 
    218 \subsection{Description}\label{Description}
    219 
    220 We introduce CART-ELC to address the limitations described in \autoref{Limitations}. CART-ELC has one hyperparameter, $r$, defining both the minimum number of samples each candidate hyperplane must pass through and the maximum number of non-zero coefficients the hyperplane's normal vector can have. We define $m$ as the dimensionality of the feature space. With these definitions, we now discuss the two cases for the algorithm.
    221 
    222 \textbf{Case 1: $\mathbf{r = m}$}
    223 
    224 \begin{adjustwidth}{1em}{1em}
    225 
    226 In the general case for $r=m$, CART-ELC performs an exhaustive search of all unique hyperplanes that pass through at least $r$ samples. This is done by evaluating each hyperplane according to a splitting criterion, tracking the best splitting hyperplane, and after all hyperplanes have been evaluated, the algorithm partitions the dataset using the best split. This process is then recursively applied to both partitions, stopping when a stopping criterion is met. The general case for $r=m$ is when each selection of $r$ samples uniquely identifies a hyperplane. In such cases, this process guarantees the greedily optimal hyperplane passing through at least $r$ samples will be used to partition the dataset at each splitting node. This guarantee does not apply in degenerate cases where samples don't uniquely identify a hyperplane, but when floating-point vectors are sampled from a continuous distribution, it is unlikely a selection of $r$ samples is affinely dependent. In such cases, the normal vector for a hyperplane passing through the $r$ samples is chosen according to the deterministic tie-breaking behavior of the underlying linear algebra solver's implementation. % This may not be the best splitting hyperplane passing through the $r$ samples. %, but enumerating all uniquely partitioning hyperplanes passing through the $r$ samples is computationally expensive.
    227 
    228 % In the general case for $r=m$, CART-ELC performs an exhaustive search of all unique hyperplanes that pass through at least $r$ samples. The algorithm evaluates each candidate according to a splitting criterion, as shown on line 10 of \autoref{Pseudocode}. As the hyperplanes are evaluated, we track the best candidate, and after all hyperplanes have been evaluated, the algorithm partitions the dataset using the best split. This process is then recursively applied to both partitions, stopping when a stopping criterion is met. The general case for $r=m$ is when each selection of $r$ samples uniquely identifies a hyperplane. The calculation for this hyperplane is done on line 8 of \autoref{Pseudocode}. In such cases, this process guarantees the greedily optimal hyperplane passing through at least $r$ samples will be used to partition the dataset at each splitting node. This guarantee does not apply in degenerate cases where samples don't uniquely identify a hyperplane, but when floating-point vectors are sampled from a continuous distribution, it is unlikely a selection of $r$ samples is affinely dependent. In such cases, the normal vector for a hyperplane passing through the $r$ samples is chosen according to the deterministic tie-breaking behavior of the underlying linear algebra solver's implementation. This may not be the best splitting hyperplane passing through the $r$ samples, but enumerating all uniquely partitioning hyperplanes passing through the $r$ samples is computationally expensive.
    229 
    230 
    231 \end{adjustwidth}
    232 
    233 \textbf{Case 2: $\mathbf{r < m}$}
    234 
    235 \begin{adjustwidth}{1em}{1em}
    236 
    237 	CART-ELC limits the search space to hyperplanes with at most $r$ non-zero coefficients and one bias term. In the general case, the algorithm considers each unique hyperplane defined by selecting a combination of $r$ features passing through at least $r$ samples where the remaining $m-r$ features have coefficients of zero. Like the $r=m$ case, each candidate is then evaluated using a splitting criterion with the best split chosen to partition the dataset. This process is then recursively applied to both partitions until a stopping criterion is met. Additionally, like the $r=m$ case, the deterministic tie-breaking behavior of the underlying linear algebra solver's implementation is used to select a normal vector for a hyperplane that passes through the $r$ samples when the selection of samples and features is affinely dependent.
    238 
    239 \end{adjustwidth}
    240 
    241 \textbf{Hyperplane Orientation}
    242 
    243 \begin{adjustwidth}{1em}{1em}
    244 
    245 	Every hyperplane has two unit normal vectors in $\mathbb{R}^n$ for $n \in \mathbb{N}$.  Since we assign samples to the left partition when samples are on or above the hyperplane, the selection of a normal vector can change how samples that lie on the hyperplane contribute to the splitting score. To eliminate this nondeterminism, one may enforce a canonical sign rule. %, such as requiring the first nonzero entry of the normal vector to be positive
    246 Given this, optimality guarantees for hyperplane splitting only hold under the deterministic orientation of the hyperplanes.
    247 
    248 \end{adjustwidth}
    249 
    250 \begin{figure}[h]
    251     \centering
    252         \centering
    253         \begin{tikzpicture}[scale=1]
    254             % Draw axes
    255             \draw[->] (-1,0) -- (4,0) node[right] {$x$};
    256             \draw[->] (0,-1) -- (0,4) node[above] {$y$};
    257 
    258             % Plot points
    259             \fill[black] (1,1) circle (2pt) node[above] {$x_1$};
    260             \fill[black] (4,4) circle (2pt) node[below] {$x_2$};
    261 
    262             % Draw hyperplanes
    263             \draw[thick, dashed] (0,1.25) -- (4,3.25) node[right] {$H_1$};
    264             \draw[thick, dashed] (0,2.5) -- (4,4.5) node[right] {$H_2$};
    265         \end{tikzpicture}
    266         \caption{Hyperplanes resulting in unique partitionings of two samples.}
    267 		\label{fig:hyper}
    268 \end{figure}
    269 
    270 In the general case for cases 1 and 2, the algorithm guarantees finding the best splitting hyperplane with at most $r$ non-zero coefficients and a bias term that passes through at least $r$ samples at each splitting node, but in neither case can it guarantee this is the greedily optimal oblique split. An illustration of this is shown in \autoref{fig:hyper}. Based on the samples in \autoref{fig:hyper}, if we define $r = m$, only one hyperplane will be evaluated: the hyperplane that passes through both samples and places them in the same partition. This hyperplane will result in the same partitioning as the hyperplane $H_2$. However, a hyperplane that results in the same partitioning as hyperplane $H_1$ is better if $x_1$ and $x_2$ have different labels. Since CART-ELC does not evaluate a hyperplane that partitions the samples in this way, the algorithm can not guarantee greedily optimal oblique splits. 
    271 
    272 Additionally, there may be multiple selections of features and samples that describe the same hyperplane. While computing the splitting score for each hyperplane is not necessary, the algorithm will evaluate each. When floating-point vectors are sampled from a continuous distribution, it is infrequent different selections of samples and features will result in the same hyperplane.
    273 
    274 The algorithm also defines two stopping criteria. The first stopping criterion is class homogeneity: if a partition contains samples that all belong to the same class, the algorithm will stop because subsequent splits would not change the model's predictions.
    275 
    276 The second criterion is when a partition is empty. If no special handling is implemented, the algorithm will attempt to partition samples indefinitely in cases where one partition is empty and the other is not homogeneous. As such, the algorithm must verify both partitions are not empty after each partitioning.
    277 
    278 Finally, the choice to not specify a splitting criterion, additional stopping criteria, specific hyperplane tie-breaking behavior, or pruning was made to ensure future research can use this algorithm without limiting its usefulness to specific domains.
    279 
    280 \subsection{Pseudocode}\label{Pseudocode}
    281 
    282 \begin{algorithm}[H]
    283 	\caption{CART-ELC}
    284 \begin{algorithmic}[1]
    285 \Function{fit}{samples, labels, $m$, $r$}
    286     \If{homogeneous(labels)}
    287         \State \Return Node(majorityClass(labels)) 
    288     \EndIf
    289 	\State bestSplit, bestSplittingScore $\gets$ None, worstSplittingScore()
    290     \For{selectedSamples in combinations(samples, $r$)}
    291         \For{selectedFeatures in combinations($m$, $r$)}
    292             \State vectorsToPassThrough $\gets$ featureSubset(selectedSamples, selectedFeatures)
    293             \State currentSplit $\gets$ findHyperplanePassingThrough(vectorsToPassThrough)
    294             \State currentSplittingScore $\gets$ evaluateSplit(currentSplit, samples)
    295 			\If{ isBetterThan(currentSplittingScore, bestSplittingScore)}
    296                 \State bestSplittingScore, bestSplit  $\gets$ currentSplittingScore, currentSplit
    297             \EndIf
    298         \EndFor
    299     \EndFor
    300     \State left, right $\gets$ splitDataByBestSplit(samples, labels, bestSplit)
    301 	\If{left is empty or right is empty}
    302         \State \Return Node(majorityClass(labels)) 
    303 	\EndIf
    304     \State leftSubtree $\gets$ fit(left.samples, left.labels, $m$, $r$)
    305 	\State rightSubtree $\gets$ fit(right.samples, right.labels, $m$, $r$)
    306     \State tree $\gets$ Node(bestSplit)
    307     \State tree.left, tree.right $\gets$ leftSubtree, rightSubtree
    308     \State \Return tree
    309 \EndFunction
    310 \end{algorithmic}
    311 \end{algorithm}
    312 
    313 \subsection{Splitting Criteria}\label{splitting_criteria}
    314 
    315 To evaluate the time complexity of our algorithm and perform an empirical analysis, we must define some splitting criteria. The three we implemented and will define are the twoing criterion \citep{breiman1984cart}, the Gini criterion \citep{breiman1984cart}, and information gain \citep{quinlan1986}. The default splitting criterion for our implementation of CART-ELC is the Gini criterion. This decision is based on our empirical analysis in \autoref{empirical_criteria_comparison}, which showed the twoing and Gini criteria often result in trees that yield the same classification accuracy, but the Gini criterion is computationally cheaper. Despite this, our empirical evaluation in \autoref{empirical_comparison} uses the twoing criterion to maintain consistency with prior research.
    316 
    317 \subsubsection{Twoing Criterion}\label{twoing}
    318 
    319 The twoing criterion is a splitting criterion that evaluates the quality of a split by considering the number of samples on each side of the splitting hyperplane and the difference in the distribution of classifications between the two partitions. Trees using the twoing criterion tend to be well-balanced because the splitting score is maximized when the number of samples in each partition is equal.
    320 
    321 The formula for the twoing criterion defines the following variables:
    322 
    323 \begin{itemize}
    324     \setlength{\itemsep}{0.1em} 
    325     \item \( p_L \): The probability a sample is in the left partition.
    326     \item \( p_R \): The probability a sample is in the right partition.
    327     \item \( C \): The set of all classes in the union of the left and right partitions.
    328     \item \( p_{Lj} \): The probability a random sample from the left partition has a classification of \( j \).
    329     \item \( p_{Rj} \): The probability a random sample from the right partition has a classification of \( j \).
    330 \end{itemize}
    331 
    332 \begin{equation*}
    333 T = \frac{p_L p_R}{4} \left( \sum_{j \in C} \left| p_{Lj} - p_{Rj} \right| \right)^2
    334 	\label{twoing_equ}
    335 \end{equation*}
    336 
    337 
    338 \subsubsection{Gini Criterion}\label{gini}
    339 
    340 The Gini criterion is the most popular splitting criterion for decision trees. Trees induced using the Gini criterion minimize class impurity, resulting in trees that may be less balanced than the twoing criterion. The splitting score is calculated by computing the Gini impurity for both partitions, weighting them based on the number of samples in each partition, and summing the results.
    341 
    342 The formula for the Gini criterion defines the following variables:
    343 
    344 \begin{itemize}
    345     \setlength{\itemsep}{0.1em} 
    346     \item \( p_L \): The probability a sample is in the left partition.
    347     \item \( p_R \): The probability a sample is in the right partition.
    348     \item \( C \): The set of all classes in the union of the left and right partitions.
    349     \item \( p_{Lj} \): The probability a random sample from the left partition has a classification of \( j \).
    350     \item \( p_{Rj} \): The probability a random sample from the right partition has a classification of \( j \).
    351 \end{itemize}
    352 
    353 \begin{equation*}
    354 	G = p_L \left(1 - \sum_{j\in C} p_{Lj}^2 \right) + p_R \left(1 - \sum_{j \in C} p_{Rj}^2 \right)
    355 	\label{gini_equ}
    356 \end{equation*}
    357 
    358 \subsubsection{Information Gain}\label{IG}
    359 
    360 Information gain is a criterion based on reducing class impurity. Unlike the Gini criterion, information gain calculates impurity using entropy.
    361 
    362 The formula for information gain defines the following variables where $H(S)$ is the entropy of the set $S$ \citep{it}:
    363 
    364 \begin{itemize}
    365     \setlength{\itemsep}{0.1em} 
    366     \item \(S \): The set of all samples before the split.
    367     \item \(S_L \): The set of all samples in the left partition.
    368     \item \(S_R \): The set of all samples in the right partition.
    369 \end{itemize}
    370 
    371 \begin{equation*}
    372 	IG = H(S) - \left( \frac{|S_{L}|}{|S|} H(S_{L}) + \frac{|S_{R}|}{|S|} H(S_{R}) \right)
    373 \end{equation*}
    374 
    375 \subsection{Time Complexity}\label{complexity}
    376 
    377 \subsubsection{Asymptotic Time Complexity Analysis}\label{analysis}
    378 
    379 To evaluate the algorithm's asymptotic time complexity, we concentrate on the cost of finding and executing a split at a single decision node. Although a complete decision tree often contains multiple splitting nodes, the computational cost is most significant for the first split. Once the dataset is partitioned, subsequent splits operate on smaller subsets and thus require less computation. This is shown in \autoref{fig:operations} by considering the computational differences between the $n=1000$ and $n=500$ columns.
    380 
    381 The homogeneity calculation, which requires at most \( n \) operations per label, is dominated by the complexity of the loop over each hyperplane. As such, it does not need to be considered for the final asymptotic time complexity. Next, we consider the evaluation of hyperplanes. Assume the feature matrix is \( n \times m \), and 
    382 
    383 \begin{equation*}
    384 	0 < r \leq n, m. 
    385 \end{equation*}
    386 
    387 Hyperplane construction requires selecting \( r \) samples from \( n \), which can be done in \( \binom{n}{r} \) ways, and for each selection of samples, there are \( \binom{m}{r} \) ways to choose \( r \) features. Therefore, the total number of hyperplanes to evaluate is
    388 
    389 \[
    390 	\binom{n}{r} \cdot\binom{m}{r}.
    391 \]
    392 
    393 To derive the equation of the hyperplane passing through the \(r\) samples, an \( r \times r \) matrix is constructed based on the $r$ selected features and samples. Subsequently, its column-wise means are computed; these means are then subtracted from each row of the matrix, and the covariance matrix is calculated. Eigen decomposition is then performed on the resulting matrix to find the hyperplane's normal vector, and the hyperplane's bias term is determined by taking the dot product between the normal vector and the column-wise mean vector. These calculations are sufficient to derive the hyperplane's equation in normal form. The calculation of the covariance matrix and the eigen decomposition dominate the complexity of finding the equation for the hyperplane. Each is in the complexity class of \( \Theta(r^3) \), assuming the use of a naïve matrix multiplication approach \citep{matrix}.
    394 
    395 We then evaluate the split using our splitting criterion. Assume our splitting criterion is the Gini criterion. To compute our splitting score, we evaluate
    396 
    397 \begin{equation*}
    398 	G = p_L \left(1 - \sum_{j\in C} p_{Lj}^2 \right) + p_R \left(1 - \sum_{j \in C} p_{Rj}^2 \right).
    399 	\label{gini_usage}
    400 \end{equation*}
    401 
    402 The complexity of calculating the splitting score is dominated by determining which side of the hyperplane each sample lies on. This requires calculating the dot product between the sample and the hyperplane's normal vector, adding the bias term, and checking if the result is positive, negative, or zero. As such, the evaluation of one hyperplane is in the complexity class of \( \Theta(rn) \), and since \( |C| \leq n \), the number of classes does not impact the asymptotic time complexity of evaluating each hyperplane.
    403 
    404 To split the dataset on the best split, we recompute the side of the hyperplane each sample lies on and add each sample to the corresponding list for samples that are either above or on the hyperplane or strictly below it. While this recomputation can be avoided by caching the results, it does not impact the overall asymptotic time complexity of the algorithm. The evaluation of all samples will take \( \Theta(rn) \) time, and since we need to move each sample to the appropriate list, and each sample has \( m \) features, this will take \( \Theta(nm) \) time. Thus, the asymptotic time complexity of splitting the dataset is in the complexity class of
    405 
    406 \begin{equation*}
    407 	\Theta(rn + nm).
    408 \end{equation*}
    409 
    410 Since splitting on the best split is only done once, its complexity is dominated by the complexity of evaluating all hyperplanes. As such, this final split does not impact the overall asymptotic time complexity.
    411 
    412 Since there are \( \binom{n}{r} \cdot \binom{m}{r} \) splits to evaluate, each split takes \( \Theta(r^3) \) operations to compute the hyperplane, and \( \Theta(rn) \) time to evaluate, we find the asymptotic time complexity of finding and executing a split at a single decision node is in the complexity class of
    413 
    414 \[
    415 	\Theta\left( \binom{n}{r} \cdot \binom{m}{r} \cdot r(r^2 + n) \right).
    416 \]
    417 
    418 \subsubsection{Operations for a Single Split}\label{ops}
    419 
    420 \begin{table}[h]
    421 	\centering
    422     \caption{Operations required for CART-ELC to identify and execute a split at a single decision node, assuming a multiplicative factor of one for the asymptotic time complexity, no additive constants, and $r = m$.}
    423 	\small
    424 	\begin{tabular}{lcccccc} 
    425 		\addlinespace
    426 		\toprule
    427 		\multirow{2}{*}{r} & \multicolumn{6}{c}{n}  \\ 
    428 		\cmidrule(lr){2-7}  
    429 		& 100 & 500 & 1000 & 5000 & 10000 & 20000 \\ 
    430 		\midrule
    431 			1 & 1.01e+04 & 2.50e+05 & 1.00e+06 & 2.50e+07 & 1.00e+08 & 4.00e+08 \\
    432 			2 & 1.03e+06 & 1.26e+08 & 1.00e+09 & 1.25e+11 & 1.00e+12 & 8.00e+12 \\
    433 			3 & 5.29e+07 & 3.16e+10 & 5.03e+11 & 3.13e+14 & 5.00e+15 & 8.00e+16 \\
    434 			4 & 1.82e+09 & 5.31e+12 & 1.68e+14 & 5.22e+17 & 1.67e+19 & 5.34e+20 \\
    435 			5 & 4.71e+10 & 6.70e+14 & 4.23e+16 & 6.53e+20 & 4.17e+22 & 2.67e+24 \\
    436 			6 & 9.73e+11 & 6.77e+16 & 8.50e+18 & 6.54e+23 & 8.35e+25 & 1.07e+28 \\
    437 			7 & 1.67e+13 & 5.71e+18 & 1.43e+21 & 5.46e+26 & 1.39e+29 & 3.56e+31 \\
    438 			8 & 2.44e+14 & 4.13e+20 & 2.05e+23 & 3.90e+29 & 1.99e+32 & 1.02e+35 \\
    439 			9 & 3.10e+15 & 2.62e+22 & 2.59e+25 & 2.44e+32 & 2.49e+35 & 2.55e+38 \\
    440 			10 & 3.46e+16 & 1.47e+24 & 2.90e+27 & 1.36e+35 & 2.77e+38 & 5.66e+41 \\
    441 		\bottomrule
    442 	\end{tabular}
    443 	\label{fig:operations}
    444 \end{table}
    445 
    446 While the assumptions underlying \autoref{fig:operations} are idealized, it offers insights into the computational cost for CART-ELC to identify and split on a single decision node. These values indicate using CART-ELC on small datasets ($n \leq 1000$) when $r$ values are small ($r \leq 3$) remains manageable, but as $r$ and $n$ values increase, the computational costs quickly grow reaching $3.13 \cdot 10^{14}$ operations at $r = 3$, $n = 5000$ for a single split. 
    447 
    448 While performing our evaluations shown in \autoref{fig:results}, we found $r$ values of three or higher to be too costly in most cases. This was partially because the results shown in \autoref{fig:operations} assume $r=m$, but in \autoref{fig:results} this was often not true, and thus there was an additional multiplicative factor of $\binom{m}{r}$.
    449 
    450 \section{Experiments and Results}
    451 
    452 In this section, we present our data collection methodology and provide a detailed description of the datasets used. We then compare CART-ELC with other decision tree induction algorithms, evaluate different splitting criteria, and discuss our findings.
    453 
    454 \subsection{Methodology}\label{methodology}
    455 
    456 The results shown in \autoref{fig:results} for OC1 and OC1-AP \citep{MurthyKS94}, CART-LC and CART \citep{breiman1984cart}, and C4.5 \citep{c4.5}, where C4.5 and CART were collected using the IND 2.1 package \citep{buntine}, were reported by \citet{MurthyKS94}. Alongside these, we present results from our implementations of HHCART \citep{HHCart} and CART-ELC.%
    457 \makeatletter
    458 \@ifpackagewith{tmlr}{accepted}{\footnote{Our implementations of HHCART and CART-ELC are available at \url{https://github.com/andrewlaack/cart-elc}.}}{}%
    459 \@ifpackagewith{tmlr}{preprint}{\footnote{Our implementations of HHCART and CART-ELC are available at \url{https://github.com/andrewlaack/cart-elc}.}}{}%
    460 \makeatother
    461 \space To ensure a fair comparison, we followed experimental procedures consistent with those of \citet{MurthyKS94}, differing only in our hyperparameter tuning strategy for CART-ELC and HHCART. The change to hyperparameter tuning was necessary due to our inclusion of a max depth hyperparameter for HHCART and CART-ELC, as well as the $r$ hyperparameter for CART-ELC.
    462 
    463 Similar to the experiments performed by \citet{MurthyKS94}, HHCART and CART-ELC used the twoing criterion as the splitting criterion, defined in \autoref{splitting_criteria}. Additionally, HHCART and CART-ELC did not apply any post-pruning or pre-pruning strategies.
    464  
    465 For each experiment with CART-ELC we defined two variables, $d$ and $j$, the largest max depth and $r$ value to test respectively. We set $d = 5$ because increasing the max depth beyond five often led to identical tree structures or overfitting (see \autoref{hyper_tuning}). We also set $j = 2$ for all datasets except for the iris dataset where we set $j = 4$. These choices for $j$ values were made due to computational constraints imposed by our algorithm's complexity and dataset sizes, outlined in \autoref{complexity}. 
    466 
    467 Shown below are the steps performed for each experiment.
    468 
    469 \begin{enumerate}
    470 	\item Complete ten iterations of the following:
    471     \begin{enumerate}
    472         \item Randomly partition the dataset into five partitions of roughly equal size.
    473 		\item Perform a grid search over all integers $1 \leq i \leq j$ and $1 \leq k \leq d$ where $k$ is defined as the current max depth hyperparameter, and $i$ is defined as the current $r$ hyperparameter. For each iteration:
    474 		\begin{enumerate}
    475 			\item Build a tree using all partitions except for one as the training data.
    476 			\item Evaluate the accuracy of the tree on the unseen partition.
    477 			\item Repeat steps (i) and (ii) for each of the five partitions.
    478 			\item Calculate the mean accuracy and tree size across the five iterations where tree size is defined as the number of leaf nodes in the tree.
    479             \end{enumerate}
    480     \end{enumerate}
    481 \item Calculate the mean and standard deviation of the tree sizes and accuracies based on the final results of each cross-validation (CV) \citep{cv} iteration for each hyperparameter pair.
    482     \item Select the hyperparameter pair that offers the best balance between mean accuracy and tree size. The mean and standard deviation of the accuracy and tree size with the selected hyperparameter pair are the results of the experiment.
    483 \end{enumerate}
    484 
    485 For our evaluation of both HHCART algorithms, we performed the same steps outlined above, but HHCART does not have an $r$ hyperparameter, and thus our hyperparameter search was only performed for max depth values.
    486 
    487 \subsection{Datasets}\label{datasets}
    488 
    489 \subsubsection{Star/Galaxy Discrimination (Bright)}\label{sg_bright}
    490 
    491 This dataset was derived from bright images collected by \citet{odewahn} of stars and galaxies. The dataset is comprised of 2,462 samples, 14 features, and two classifications. The two classifications are star and galaxy. The 14 features were measurements defined by the original researchers that they deemed to be relevant for star/galaxy discrimination.
    492 
    493 \subsubsection{Star/Galaxy Discrimination (Dim)}\label{sg_dim}
    494 
    495 This dataset was derived from dim images collected by \citet{odewahn} of stars and galaxies. The dataset is comprised of 4,192 samples, 14 features, and two classifications. The two classifications are star and galaxy. The 14 features were measurements defined by the original researchers that they deemed to be relevant for star/galaxy discrimination.
    496 
    497 \subsubsection{Breast Cancer Diagnosis}\label{Breast Cancer Diagnosis}
    498 
    499 This dataset \citep{breast_cancer} is comprised of 699 samples with nine features and two classifications. The two classifications are malignant and benign. Similar to the evaluations performed by \citet{MurthyKS94}, we removed the 16 samples from this dataset that were missing the "Bare Nuclei" feature. Of the remaining 683 samples, 444 were labeled as benign and 239 were labeled as malignant.
    500 
    501 \subsubsection{Iris Classification}\label{iris}
    502 
    503 This dataset \citep{iris} is comprised of 150 samples with four features and three classifications. Each of the three classifications specifies a different species of iris and contains 50 samples.
    504 
    505 \subsubsection{Boston Housing Cost}\label{housing}
    506 
    507 This dataset \citep{boston} is comprised of 506 samples with 12 continuous features, one binary feature, and two classifications. While the original dataset contained a continuous label, we discretized the dataset by assigning the class of a given sample to be one if the continuous label's value was strictly less than 21,000 and two if it was greater than or equal to 21,000, as was done by \citet{MurthyKS94}. After this preprocessing, the dataset contained 260 samples with a classification of two and 246 samples with a classification of one. The dataset was also missing some features for select samples which we left as NaN for our evaluation of CART-ELC and performed mean imputation for HHCART as HHCART is unable to handle missing features.
    508 
    509 \subsubsection{Diabetes Diagnosis}\label{diabetes}
    510 
    511 This dataset \citep{diabetes} is comprised of 768 samples with eight features and two classifications. These two classifications represent testing positive and negative for diabetes. Of the 768 samples, 500 tested negative for diabetes, and 268 tested positive.
    512 
    513 
    514 \newpage
    515 \subsection{Empirical Algorithmic Comparison}\label{empirical_comparison}
    516 
    517 \begin{table}[h]
    518 	\centering
    519 	\small
    520     \caption{Accuracy and tree size comparison across decision tree induction algorithms.}
    521 	\begin{tabular}{lcccccc} 
    522 		\addlinespace
    523 		\toprule
    524 		\multirow{2}{*}{Algorithm} & \multicolumn{6}{c}{Accuracy}  \\ 
    525 		\cmidrule(lr){2-7}  
    526 		& S/G Bright & S/G Dim & Cancer & Iris & Housing & Diabetes \\ 
    527 		\midrule
    528 		CART-ELC  & \textbf{98.9 ± 0.2} & \textbf{95.2 ± 0.5} & 96.3 ± 0.4 & 95.1 ± 0.8 & 83.5 ± 0.7 & \textbf{74.5 ± 1.3}  \\
    529 
    530 		HHCART(A) & 98.3 ± 0.5 & 93.7 ± 0.8 & \textbf{96.9 ± 0.3} &  \textbf{95.5 ± 1.4} &  \textbf{83.9 ± 0.8} & 73.2 ± 1.2  \\
    531 		HHCART(D) & 98.1 ± 0.4 & 93.7 ± 0.9 & \textbf{96.9 ± 0.3} &  94.3 ± 1.5 &  82.2 ± 1.4 & 73.2 ± 1.2  \\
    532 		OC1       & \textbf{98.9 ± 0.2} & 95.0 ± 0.3 & 96.2 ± 0.3 & 94.7 ± 3.1 & 82.4 ± 0.8 & 74.4 ± 1.0  \\
    533 		OC1-AP    & 98.1 ± 0.2 & 94.0 ± 0.2 & 94.5 ± 0.5 & 92.7 ± 2.4 & 81.8 ± 1.0 & 73.8 ± 1.0  \\
    534 		CART-LC   & 98.8 ± 0.2 & 92.8 ± 0.5 & 95.3 ± 0.6 & 93.5 ± 2.9 & 81.4 ± 1.2 & 73.7 ± 1.2  \\
    535 		CART   & 98.5 ± 0.5 & 94.2 ± 0.7 & 95.0 ± 1.6 & 93.8 ± 3.7 & 82.1 ± 3.5 & 73.9 ± 3.4  \\
    536 		C4.5      & 98.5 ± 0.5 & 93.3 ± 0.8 & 95.3 ± 2.0 & 95.1 ± 3.2 & 83.2 ± 3.1 & 71.4 ± 3.3  \\ 
    537 		\midrule
    538 		\multirow{2}{*}{Algorithm} & \multicolumn{6}{c}{Tree Size}  \\ 
    539 		\cmidrule(lr){2-7}  
    540 		& S/G Bright & S/G Dim & Cancer & Iris & Housing & Diabetes \\ 
    541 		\midrule
    542 		CART-ELC  & \textbf{3.7 ± 0.2}  & \textbf{9.8 ± 4.2}  & \textbf{2.0 ± 0.0}  & 4.8 ± 0.1  & \textbf{4.0 ± 0.0}  & \textbf{4.0 ± 0.0}  \\
    543 		HHCART(A) & 6.1 ± 0.3  & 14.6 ± 4.8  & \textbf{2.0 ± 0.0}  & \textbf{3.1 ± 0.1}  & 7.8 ± 0.2  & \textbf{4.0 ± 0.0}  \\
    544 		HHCART(D) & 6.3 ± 0.4  & 14.9 ± 5.0  & \textbf{2.0 ± 0.0}  & 4.7 ± 0.1  & 23.3 ± 0.8  & \textbf{4.0 ± 0.0}  \\
    545 		OC1       & 4.3 ± 1.0  & 13.0 ± 8.7  & 2.8 ± 0.9  & \textbf{3.1 ± 0.2}  & 6.9 ± 3.2  & 5.4 ± 3.8  \\
    546 		OC1-AP    & 6.9 ± 2.4  & 29.3 ± 8.8  & 6.4 ± 1.7  & 3.2 ± 0.3  & 8.6 ± 4.5  & 11.4 ± 7.5  \\
    547 		CART-LC   & 3.9 ± 1.3  & 24.2 ± 8.7  & 3.5 ± 0.9  & 3.2 ± 0.3  & 5.8 ± 3.2  & 8.0 ± 5.2  \\
    548 		CART   & 13.9 ± 5.7  & 30.4 ± 10.0  & 11.5 ± 7.2  & 4.3 ± 1.6  & 15.1 ± 10  & 11.5 ± 9.1  \\
    549 		C4.5      & 14.3 ± 2.2  & 77.9 ± 7.4  & 9.8 ± 2.2  & 4.6 ± 0.8  & 28.2 ± 3.3  & 56.3 ± 7.9  \\ 
    550 		\bottomrule
    551 	\end{tabular}
    552 	\label{fig:results}
    553 \end{table}
    554 
    555 
    556 The results presented in \autoref{fig:results} demonstrate that CART-ELC, HHCART(A), HHCART(D), and OC1 obtained consistently strong accuracies across the datasets evaluated when compared with other decision tree induction algorithms. Additionally, of these top performers, CART-ELC had the smallest trees across most datasets, falling behind on the iris dataset where a higher max depth value was selected due to minor variances in accuracies across max depth and $r$ values. Finally, we note the standard deviations in accuracy for CART-ELC, on average, were lower than those of OC1 and the variants of HHCART.
    557 
    558 
    559 \begin{table}[h]
    560 	\centering
    561 \caption{P-values for accuracy comparisons between various models and CART-ELC. P-values strictly less than 0.05 are bolded. P-values reported as 0.000 are not zero, but are smaller than the reporting precision.}
    562 	\small
    563 	\begin{tabular}{lccccccc} 
    564 		\addlinespace
    565 		\toprule
    566 		\multirow{1}{*}{Algorithm} & \multirow{1}{*}{S/G Bright} & \multirow{1}{*}{S/G Dim} & \multirow{1}{*}{Cancer} & \multirow{1}{*}{Iris} & \multirow{1}{*}{Housing} & \multirow{1}{*}{Diabetes} \\ 
    567 		\midrule
    568 HHCART(A) & \textbf{0.004} & \textbf{0.000} & \textbf{0.001} & 0.446 & 0.250 & \textbf{0.032} \\
    569 HHCART(D) & \textbf{0.000} & \textbf{0.000} & \textbf{0.001} & 0.159 & \textbf{0.021} & \textbf{0.032} \\
    570 OC1 & 1.000 & 0.296 & 0.536 & 0.701 & \textbf{0.004} & 0.849 \\
    571 OC1-AP & \textbf{0.000} & \textbf{0.000} & \textbf{0.000} & \textbf{0.012} & \textbf{0.000} & 0.195 \\
    572 CART-LC & 0.278 & \textbf{0.000} & \textbf{0.000} & 0.122 & \textbf{0.000} & 0.170 \\
    573 CART & \textbf{0.037} & \textbf{0.002} & \textbf{0.032} & 0.303 & 0.244 & 0.612 \\
    574 C4.5 & \textbf{0.037} & \textbf{0.000} & 0.153 & 1.000 & 0.771 & \textbf{0.017} \\
    575     \bottomrule
    576 \end{tabular}
    577 \label{fig:p_values}
    578 \end{table}
    579 
    580 The results in \autoref{fig:p_values}, obtained by performing Welch's t-tests, compare the accuracies of each algorithm against CART-ELC. These results support the finding that CART-ELC, HHCART, and OC1 are the best-performing decision tree induction algorithms on the datasets evaluated. Additionally, CART-ELC had statistically significant performance improvements on both star/galaxy discrimination datasets and the diabetes dataset relative to both HHCART variants and only had statistically significant performance improvements when compared to OC1 on the housing dataset. The final result of interest is that both variants of HHCART performed better than CART-ELC on the cancer dataset by a statistically significant amount.
    581 
    582 \begin{table}[h]
    583 	\centering
    584 	\caption{Cohen's d effect size for accuracies between various models and CART-ELC. Comparisons with $p < 0.05$ are bolded.}
    585 		\small
    586 	\begin{tabular}{lccccccc} 
    587 		\addlinespace
    588 		\toprule
    589 		\multirow{1}{*}{Algorithm} & \multirow{1}{*}{S/G Bright} & \multirow{1}{*}{S/G Dim} & \multirow{1}{*}{Cancer} & \multirow{1}{*}{Iris} & \multirow{1}{*}{Housing} & \multirow{1}{*}{Diabetes} \\ 
    590 		\midrule
    591 
    592 		HHCART(A) & \textbf{1.576} & \textbf{2.249} & \textbf{-1.697} & -0.351 & -0.532 & \textbf{1.039} \\
    593 		HHCART(D) & \textbf{2.530} & \textbf{2.060} & \textbf{-1.697} & 0.666 & \textbf{1.175} & \textbf{1.039} \\
    594 	OC1 & 0.000 & 0.485 & 0.283 & 0.177 & \textbf{1.463} & 0.086 \\
    595 	OC1-AP & \textbf{4.000} & \textbf{3.151} & \textbf{3.976} & \textbf{1.342} & \textbf{1.970} & 0.604 \\
    596 	CART-LC & 0.500 & \textbf{4.800} & \textbf{1.961} & 0.752 & \textbf{2.138} & 0.639 \\
    597 	CART & \textbf{1.050} & \textbf{1.644} & \textbf{1.115} & 0.486 & 0.555 & 0.233 \\
    598 	C4.5 & \textbf{1.050} & \textbf{2.848} & 0.693 & 0.000 & 0.133 & \textbf{1.236} \\
    599     \bottomrule
    600 \end{tabular}
    601 \label{fig:cohens_d}
    602 \end{table}
    603 
    604 
    605 The results in \autoref{fig:cohens_d} show the effect sizes of the difference between accuracies with respect to CART-ELC. Negative values indicate the algorithm outperformed CART-ELC on the dataset, and positive values indicate CART-ELC outperformed the algorithm on the dataset. These results show the magnitude of CART-ELC's outperformance of HHCART on the star/galaxy datasets is greater than the magnitude of HHCART's outperformance of CART-ELC on the cancer dataset. 
    606 
    607 HHCART's strong performance relative to CART-ELC on the cancer dataset is likely due to the dataset being both linearly separable and having a sizable gap between the two classes. HHCART typically uses splits that pass through one sample in the transformed feature space, and thus it can find splits with a larger separation on both sides with respect to the classes on each side. This contrasts CART-ELC where each hyperplane generally passes through multiple samples and will thus generalize worse on some datasets because the separation on both sides of the hyperplane will be smaller.
    608 
    609 Despite this, a limitation of HHCART that makes CART-ELC perform better on certain datasets is its inability to handle outliers well. Outliers nominally impact CART-ELC as the splitting criterion is minimally impacted by them, but HHCART uses a change of basis derived from the eigen decomposition of the covariance matrix for each class, and these resulting eigenvectors can be greatly impacted by outliers. This results in HHCART searching for axis-aligned splits in suboptimal feature spaces. Similarly, when there are clusters of samples with the same classification throughout the feature space, this can also pose issues for HHCART as the covariance matrices will be impacted by the disparate groups more than CART-ELC will be.
    610 
    611 \subsection{Empirical Criteria Comparison}\label{empirical_criteria_comparison}
    612 
    613 \begin{table}[h]
    614 	\centering
    615     \caption{Accuracy and tree size comparison across splitting criteria.}
    616 	\small
    617 	\begin{tabular}{lccccccc} 
    618 		\addlinespace
    619 		\toprule
    620 		\multirow{2}{*}{Splitting Criterion} & \multicolumn{6}{c}{Accuracy}  \\ 
    621 		\cmidrule(lr){2-7}  
    622 		\multirow{1}{*}{} & \multirow{1}{*}{S/G Bright} & \multirow{1}{*}{S/G Dim} & \multirow{1}{*}{Cancer} & \multirow{1}{*}{Iris} & \multirow{1}{*}{Housing} & \multirow{1}{*}{Diabetes} \\ 
    623 		\midrule
    624 		\hyperref[twoing]{Twoing Criterion} & \textbf{98.9 ± 0.2} & \textbf{95.2 ± 0.5} & \textbf{96.3 ± 0.4} & 95.1 ± 0.8 & \textbf{83.5 ± 0.7} & \textbf{74.5 ± 1.3} \\
    625 
    626 		\hyperref[gini]{Gini Criterion} & \textbf{98.9 ± 0.3} & 95.1 ± 0.5 & \textbf{96.3 ± 0.4} & 95.1 ± 0.9 & \textbf{83.5 ± 0.7} & \textbf{74.5 ± 1.3} \\
    627 
    628 		\hyperref[IG]{Information Gain}& \textbf{98.9 ± 0.2} & 94.9 ± 0.5 & 96.0 ± 0.4 & \textbf{95.3 ± 0.8} & 81.8 ± 1.0 & 74.0 ± 1.1 \\
    629 
    630 		\midrule
    631 		\multirow{2}{*}{Splitting Criterion} & \multicolumn{6}{c}{Tree Size}  \\ 
    632 		\cmidrule(lr){2-7}  
    633 		& S/G Bright & S/G Dim & Cancer & Iris & Housing & Diabetes \\ 
    634 		\midrule
    635 
    636 
    637 		\hyperref[twoing] {Twoing Criterion} & 3.7 ± 0.2 & \textbf{9.8 ± 4.2} & \textbf{2.0 ± 0.0} & 4.8 ± 0.1 & 4.0 ± 0.0 & \textbf{4.0 ± 0.0} \\ 
    638 			\hyperref[gini]{Gini Criterion}	& \textbf{3.6 ± 0.1} & 12.4 ± 8.3 & \textbf{2.0 ± 0.0} & \textbf{3.0 ± 0.1} & 4.0 ± 0.0 & \textbf{4.0 ± 0.0} \\
    639 		\hyperref[IG]{Information Gain} & 3.8 ± 0.2 & 9.9 ± 4.5 & \textbf{2.0 ± 0.0} & 4.8 ± 0.1 & \textbf{2.0 ± 0.0} & \textbf{4.0 ± 0.0} \\ 
    640 
    641     \bottomrule
    642 	\end{tabular}
    643 	\label{fig:crits}
    644 \end{table}
    645 
    646 Our experimental results across splitting criteria, shown in \autoref{fig:crits}, indicate the twoing criterion and Gini criterion are both strong splitting criteria, and information gain only falls behind nominally. Additionally, on the dataset we evaluated that contained more than two classifications, information gain outperformed the other two criteria.
    647 
    648 \section{Conclusions and Future Work}\label{conclusions_and_future_work}
    649 
    650 CART-ELC achieves competitive accuracies across a variety of small datasets when compared with other decision tree algorithms despite only searching a subset of all unique hyperplanes. Additionally, while other oblique decision trees often use splits that are linear combinations of many features, CART-ELC's $r$ hyperparameter limits the number of features in each linear combination, which can result in simpler decision boundaries. This, along with CART-ELC's ability to generate shallower trees, often results in trees that are both more interpretable and efficient when making predictions.
    651 
    652 Despite this, the computational complexity of CART-ELC remains prohibitively high for large datasets. As such, future research should focus on strategies to mitigate these computational costs, such as developing methods to select a smaller subset of candidate hyperplanes or using CART-ELC with ensemble methods. Adapting CART-ELC for use in random forests \citep{rndfrst} and stochastic gradient boosting \citep{sgboost} may be a promising direction, as these approaches could reduce computational costs by training on subsets of samples or by introducing stochastic elements into the training process. Finally, while our evaluation included six datasets, further experimentation on a wider variety of datasets and more extensive CV may be beneficial for assessing the algorithm's overall efficacy.
    653 
    654 \bibliography{references}
    655 \bibliographystyle{tmlr}
    656 
    657 \appendix
    658 
    659 \newpage
    660 
    661 \section{Hyperparameter Tuning Results for CART-ELC}\label{hyper_tuning}
    662 
    663 Figures~\ref{graph_twoing}--\ref{graph_ig} show the accuracies and tree sizes for CART-ELC across each splitting criterion, dataset, $r$ value, and max depth. These results show higher $r$ values often result in higher accuracies at lower max depths but begin overfitting at lower max depths. We also see the accuracies and tree sizes across splitting criteria are quite consistent, often varying in accuracy but rarely in trends. Finally, on the S/G Dim dataset, accuracies appear to be plateauing around a max depth of five for $r=2$ while on the other datasets, max depths of five either result in models overfitting or accuracies stagnating for $r \geq 2$.
    664 
    665 \begin{figure}[h]
    666 \includegraphics[width=\linewidth]{twoing.pdf}
    667 \caption{Max depth and $r$ value results for CART-ELC when using the twoing criterion as the splitting criterion.}
    668 \label{graph_twoing}
    669 \end{figure}
    670 
    671 \begin{figure}[h]
    672 	\includegraphics[width=\linewidth]{gini.pdf}
    673 	\caption{Max depth and $r$ value results for CART-ELC when using the Gini criterion as the splitting criterion.}
    674 \label{graph_gini}
    675 \end{figure}
    676 
    677 \begin{figure}[h]
    678 \includegraphics[width=\linewidth]{ig.pdf}
    679 	\caption{Max depth and $r$ value results for CART-ELC when using information gain as the splitting criterion.}
    680 \label{graph_ig}
    681 \end{figure}
    682 
    683 \end{document}