commit 458417a8bb0330a7df7826bf39af0438a94e55a9
parent aae325a5f2eb3ac44adbcb7e74199b5185b57514
Author: AndrewLockVI <andrew@laack.co>
Date: Thu, 8 May 2025 21:39:34 -0500
arXiv v1
Diffstat:
5 files changed, 3028 insertions(+), 0 deletions(-)
diff --git a/papers/arxiv/v1/main.tex b/papers/arxiv/v1/main.tex
@@ -0,0 +1,683 @@
+\documentclass[10pt]{article} % For LaTeX2e
+\usepackage{changepage} % Put this in your preamble
+\usepackage{float}
+\usepackage[preprint]{tmlr}
+\pdfoutput=1
+\usepackage{caption}
+
+\usepackage{pgfplots}
+\pgfplotsset{compat=1.17} % or another compatible version for arXiv
+
+\usepackage{graphicx}
+\usepackage{ifthen}
+
+\usepackage{multirow}
+\usepackage{booktabs}
+
+\newboolean{isFinal}
+\setboolean{isFinal}{false}
+
+\usepackage{listings}
+
+\usepackage{enumitem}
+\usepackage{algorithm}
+\usepackage{algpseudocode}
+
+% If accepted, instead use the following line for the camera-ready submission:
+%\usepackage[accepted]{tmlr}
+% To de-anonymize and remove mentions to TMLR (for example for posting to preprint servers), instead use the following:
+%\usepackage[preprint]{tmlr}
+
+% Optional math commands from https://github.com/goodfeli/dlbook_notation.
+\input{math_commands.tex}
+
+\usepackage{url}
+
+\usepackage{hyperref}
+
+\lstdefinestyle{mystyle}{
+ basicstyle=\ttfamily\footnotesize, % Set the font and size
+ numbers=left, % Add line numbers
+ numberstyle=\tiny, % Style of line numbers
+ stepnumber=1, % Step between line numbers
+ numbersep=5pt, % Distance of numbers from code
+ backgroundcolor=\color{gray!10}, % Light gray background
+ keywordstyle=\color{blue}\bfseries, % Style for keywords
+ commentstyle=\color{green!50!black}, % Style for comments
+ stringstyle=\color{red}, % Style for strings
+ breaklines=true, % Auto line breaking
+ tabsize=4, % Tab width
+ frame=single % Draw a frame around the code
+}
+
+\usepackage{natbib}
+
+\title{CART-ELC: Oblique Decision Tree Induction via Exhaustive Search}
+
+% Authors must not appear in the submitted version. They should be hidden
+% as long as the tmlr package is used without the [accepted] or [preprint] options.
+% Non-anonymous submissions will be rejected without review.
+
+\author{\name Andrew D. Laack \email andrew@laack.co\\
+ University of Wisconsin-Superior
+}
+
+% The \author macro works with any number of authors. Use \AND
+% to separate the names and addresses of multiple authors.
+
+\newcommand{\fix}{\marginpar{FIX}}
+\newcommand{\new}{\marginpar{NEW}}
+
+\def\month{MM} % Insert correct month for camera-ready version
+\def\year{YYYY} % Insert correct year for camera-ready version
+\def\openreview{\url{https://openreview.net/forum?id=XXXX}} % Insert correct link to OpenReview for camera-ready version
+
+
+\renewcommand{\thealgorithm}{\arabic{algorithm}:}
+
+\begin{document}
+
+\ifthenelse{\boolean{isFinal}}{
+
+ {\large \textbf{CART-ELC: Oblique Decision Tree Induction via Exhaustive Search}} \\
+ { {Andrew D. Laack}}
+
+ \section{Prerequisites}
+
+ Terms a broad computer science audience may not be familiar with are defined in this section.
+
+ \begin{enumerate}
+ \item \textbf{Machine Learning:}
+ A field that focuses on algorithms that learn from patterns in data.
+ \item \textbf{Sample:}
+ A single data point, often the result of an experiment. Mathematically, samples are vectors.
+ \item \textbf{Feature:}
+ A coordinate of a sample, or an attribute of all samples.
+ \item \textbf{Label:}
+ Label and classification are used interchangeably to describe the value we are trying to predict about a sample.
+ \item \textbf{Model:}
+ A model is a trained implementation of a machine learning algorithm that has learned patterns from data.
+ \item \textbf{Decision Tree:}
+ A machine learning algorithm that uses a binary tree for classification (discrete labels) or regression (continuous labels) where tree traversals are done based on feature values for the sample being evaluated. Additionally, decision trees generally use axis-aligned splitting boundaries. Pseudocode for a decision tree algorithm is found in \autoref{decision_tree}.
+ \item \textbf{Splitting Criterion:}
+ A function to quantify how good a candidate split is for a decision tree (see \autoref{splitting_criteria}).
+ \item \textbf{Hyperplane:}
+ A mathematical object that has $n-1$ dimensions in $n$ dimensional space.
+ \item \textbf{Oblique Decision Tree:}
+ A decision tree that uses hyperplanes as splitting boundaries that are not constrained to axis-alignment.
+ \item \textbf{Accuracy:}
+ The relative frequency of correct predictions made by a model on a given dataset.
+ \item \textbf{P-value:}
+ The probability of obtaining a result at least as strong as the one observed, assuming the null hypothesis is true.
+ \item \textbf{Cohen's d:}
+ A measure of effect size quantifying the difference in standard deviations between two means.
+ \end{enumerate}
+
+ \newpage
+
+
+\subsection{Decision Tree Pseudocode}\label{decision_tree}
+
+\begin{algorithm}
+ \caption{CART Algorithm}
+\begin{algorithmic}[1]
+\Function{fit}{samples, labels, featureCount}
+ \If{homogeneous(labels)}
+ \State \Return Node(majorityClass(labels))
+ \EndIf
+ \State bestSplit, bestSplittingScore $\gets$ None, worstSplittingScore()
+ \For{sample in samples}
+ \For{feature in range(0,featureCount)}
+ \State currentSplit $\gets$ (feature, sample[feature])
+ \State currentSplittingScore $\gets$ evaluateSplit(currentSplit, samples)
+ \If{ isBetterThan(currentSplittingScore, bestSplittingScore)}
+ \State bestSplittingScore, bestSplit $\gets$ currentSplittingScore, currentSplit
+ \EndIf
+ \EndFor
+ \EndFor
+ \State left, right $\gets$ splitDataByBestSplit(samples, labels, bestSplit)
+ \If{left is empty or right is empty}
+ \State \Return Node(majorityClass(labels))
+ \EndIf
+ \State leftSubtree $\gets$ fit(left.samples, left.labels, featureCount)
+ \State rightSubtree $\gets$ fit(right.samples, right.labels, featureCount)
+ \State tree $\gets$ Node(bestSplit)
+ \State tree.left, tree.right $\gets$ leftSubtree, rightSubtree
+ \State \Return tree
+\EndFunction
+\end{algorithmic}
+\end{algorithm}
+
+\vfill
+
+ \begin{center}
+ \textbf{\large This concludes the Prerequisites section.}
+ \end{center}
+
+\newpage
+}{}
+
+
+\maketitle
+
+%\vspace{-.75cm}
+%\vspace{.5cm}
+
+
+\begin{abstract}
+
+Oblique decision trees have attracted attention due to their potential for improved classification performance over traditional axis-aligned decision trees. However, methods that rely on exhaustive search to find oblique splits face computational challenges. As a result, they have not been widely explored. We introduce a novel algorithm, Classification and Regression Tree - Exhaustive Linear Combinations (CART-ELC), for inducing oblique decision trees that performs an exhaustive search on a restricted set of hyperplanes. We then investigate the algorithm's computational complexity and its predictive capabilities. Our results demonstrate that CART-ELC consistently achieves competitive performance on small datasets, often yielding statistically significant improvements in classification accuracy relative to existing decision tree induction algorithms, while frequently producing shallower, simpler, and thus more interpretable trees.
+
+
+
+\end{abstract}
+
+%
+%\makeatletter
+%\@ifpackagewith{tmlr}{preprint}{
+%
+%\noindent\textbf{Keywords:} Decision trees, Oblique decision trees, Supervised learning, Classification algorithms
+%}{}%
+%\makeatother
+
+
+
+
+\section{Introduction}\label{introduction}
+
+Decision tree induction algorithms are among the most widely used algorithms for training interpretable machine learning models. This interpretability is one of the reasons they are often preferred over other models with better predictive capabilities \citep{survey, boosting}. While there has been much research on axis-aligned decision trees \citep{recent_overview, breiman1984cart, survey}, research on oblique decision trees has been less extensive \citep{HHCart, MurthyKS94, breiman1984cart}.
+
+Where traditional decision trees use axis-aligned splits to partition the feature space, oblique decision trees use linear combinations of features that are not restricted to axis-alignment. This allows oblique decision trees to be stronger learners with shallower trees while still being fairly interpretable \citep{MurthyKS94}. However, this increased flexibility results in a larger search space as the number of hyperplanes that uniquely partition a dataset when considering the relative positions of samples with respect to the hyperplane is at most $2^m \cdot \binom{n}{m}$, where $m$ is the dimensionality of the feature space and $n$ is the number of samples \citep{MurthyKS94}.
+
+\subsection{Related Works}\label{related_works}
+
+ \subsubsection{CART}
+
+The Classification and Regression Tree (CART) algorithm \citep{breiman1984cart} induces axis-aligned decision trees by recursively partitioning the dataset. At each step, it evaluates a set of candidate axis-aligned splits, selects the best one, and partitions the dataset using this split. These steps are then performed recursively for each partition until a stopping criterion is met.
+
+ \subsubsection{CART-LC}
+
+The Classification and Regression Tree - Linear Combinations (CART-LC) algorithm \citep{breiman1984cart} is a deterministic hill climbing algorithm for inducing oblique decision trees. The algorithm starts by median-centering the features and dividing them by their interquartile range. It then identifies the best axis-aligned split and iteratively adjusts the coefficients of the corresponding hyperplane to improve the split. Once improvements fall below a threshold, the algorithm converts the refined split from the transformed feature space to the original feature space, and uses this split to partition the dataset. This process is then applied recursively for each partition until a stopping criterion is met.
+
+ \subsubsection{OC1}
+
+The Oblique Classifier 1 (OC1) algorithm \citep{MurthyKS94} is a stochastic hill climbing algorithm for inducing oblique decision trees. It begins by initializing a random hyperplane and deterministically perturbs the hyperplane's coefficients to improve its splitting score. If no improvements can be made, a new hyperplane is created by adding random values to the coefficients in an attempt to escape the local optimum. The new hyperplane is then evaluated against the previous one. If the new score is better, the process repeats, starting with this new hyperplane as the initial hyperplane. If it is not better, another random hyperplane is initialized, and the entire procedure starts again. This process of applying perturbations and then attempting to escape local optima is applied $z$ times, where $z$ is a hyperparameter of the algorithm. After $z$ iterations, the split with the best splitting score is then used to partition the dataset. The algorithm then recursively applies the above procedure to each partition until a stopping criterion is met.
+
+\subsubsection{HHCART}
+
+The Householder Classification and Regression Tree (HHCART) algorithm \citep{HHCart} induces oblique decision trees by leveraging class-specific orientation. HHCART(D), a variant of the algorithm, computes the dominant eigenvector of the covariance matrix for each class. Each of these dominant eigenvectors is then individually used to perform a Householder transformation to align the corresponding class orientation with a coordinate axis. In the resulting feature space, the algorithm searches for the best axis-aligned split, which is then mapped back to the original feature space as an oblique split. The best oblique split found after evaluating each dominant eigenvector is used to partition the dataset. This process is then applied to each of the resulting partitions until a stopping criterion is met. \citet{HHCart} defined two variants of HHCART: HHCART(D), which only uses the dominant eigenvector for each class, and HHCART(A), which uses all eigenvectors for each class.
+
+\subsection{Limitations}\label{Limitations}
+
+A limitation of the CART algorithm is that it only considers axis-aligned splits. These splits can be suboptimal when the data is linearly separable, as described by \citet{breiman1984cart}. Algorithms like CART-LC and OC1 are not constrained to axis-aligned splits, but they often converge to local optima due to their hill climbing nature. While OC1 attempts to escape local optima by adding random values to the coefficients of the splitting hyperplane after its improvements have stagnated, this strategy does not guarantee escaping local optima. HHCART is not limited in either of these regards, but it does only test splits that are axis-aligned in specific transformed feature spaces. This subset of oblique splits can be overly limited in cases where it is computationally feasible to evaluate more oblique splits.
+
+Additionally, while CART-LC, OC1, and HHCART can generate shallower trees relative to CART \citep{MurthyKS94}, the resulting trees often use splits that are linear combinations of many features. This results in oblique trees that can be difficult to interpret, and more costly to perform predictions with.
+
+\section{CART-ELC}\label{algorithm}
+
+\subsection{Description}\label{Description}
+
+We introduce CART-ELC to address the limitations described in \autoref{Limitations}. CART-ELC has one hyperparameter, $r$, defining both the minimum number of samples each candidate hyperplane must pass through and the maximum number of non-zero coefficients the hyperplane's normal vector can have. We define $m$ as the dimensionality of the feature space. With these definitions, we now discuss the two cases for the algorithm.
+
+\textbf{Case 1: $\mathbf{r = m}$}
+
+\begin{adjustwidth}{1em}{1em}
+
+In the general case for $r=m$, CART-ELC performs an exhaustive search of all unique hyperplanes that pass through at least $r$ samples. This is done by evaluating each hyperplane according to a splitting criterion, tracking the best splitting hyperplane, and after all hyperplanes have been evaluated, the algorithm partitions the dataset using the best split. This process is then recursively applied to both partitions, stopping when a stopping criterion is met. The general case for $r=m$ is when each selection of $r$ samples uniquely identifies a hyperplane. In such cases, this process guarantees the greedily optimal hyperplane passing through at least $r$ samples will be used to partition the dataset at each splitting node. This guarantee does not apply in degenerate cases where samples don't uniquely identify a hyperplane, but when floating-point vectors are sampled from a continuous distribution, it is unlikely a selection of $r$ samples is affinely dependent. In such cases, the normal vector for a hyperplane passing through the $r$ samples is chosen according to the deterministic tie-breaking behavior of the underlying linear algebra solver's implementation. % This may not be the best splitting hyperplane passing through the $r$ samples. %, but enumerating all uniquely partitioning hyperplanes passing through the $r$ samples is computationally expensive.
+
+% In the general case for $r=m$, CART-ELC performs an exhaustive search of all unique hyperplanes that pass through at least $r$ samples. The algorithm evaluates each candidate according to a splitting criterion, as shown on line 10 of \autoref{Pseudocode}. As the hyperplanes are evaluated, we track the best candidate, and after all hyperplanes have been evaluated, the algorithm partitions the dataset using the best split. This process is then recursively applied to both partitions, stopping when a stopping criterion is met. The general case for $r=m$ is when each selection of $r$ samples uniquely identifies a hyperplane. The calculation for this hyperplane is done on line 8 of \autoref{Pseudocode}. In such cases, this process guarantees the greedily optimal hyperplane passing through at least $r$ samples will be used to partition the dataset at each splitting node. This guarantee does not apply in degenerate cases where samples don't uniquely identify a hyperplane, but when floating-point vectors are sampled from a continuous distribution, it is unlikely a selection of $r$ samples is affinely dependent. In such cases, the normal vector for a hyperplane passing through the $r$ samples is chosen according to the deterministic tie-breaking behavior of the underlying linear algebra solver's implementation. This may not be the best splitting hyperplane passing through the $r$ samples, but enumerating all uniquely partitioning hyperplanes passing through the $r$ samples is computationally expensive.
+
+
+\end{adjustwidth}
+
+\textbf{Case 2: $\mathbf{r < m}$}
+
+\begin{adjustwidth}{1em}{1em}
+
+ CART-ELC limits the search space to hyperplanes with at most $r$ non-zero coefficients and one bias term. In the general case, the algorithm considers each unique hyperplane defined by selecting a combination of $r$ features passing through at least $r$ samples where the remaining $m-r$ features have coefficients of zero. Like the $r=m$ case, each candidate is then evaluated using a splitting criterion with the best split chosen to partition the dataset. This process is then recursively applied to both partitions until a stopping criterion is met. Additionally, like the $r=m$ case, the deterministic tie-breaking behavior of the underlying linear algebra solver's implementation is used to select a normal vector for a hyperplane that passes through the $r$ samples when the selection of samples and features is affinely dependent.
+
+\end{adjustwidth}
+
+\textbf{Hyperplane Orientation}
+
+\begin{adjustwidth}{1em}{1em}
+
+ Every hyperplane has two unit normal vectors in $\mathbb{R}^n$ for $n \in \mathbb{N}$. Since we assign samples to the left partition when samples are on or above the hyperplane, the selection of a normal vector can change how samples that lie on the hyperplane contribute to the splitting score. To eliminate this nondeterminism, one may enforce a canonical sign rule. %, such as requiring the first nonzero entry of the normal vector to be positive
+Given this, optimality guarantees for hyperplane splitting only hold under the deterministic orientation of the hyperplanes.
+
+\end{adjustwidth}
+
+\begin{figure}[h]
+ \centering
+ \centering
+ \begin{tikzpicture}[scale=1]
+ % Draw axes
+ \draw[->] (-1,0) -- (4,0) node[right] {$x$};
+ \draw[->] (0,-1) -- (0,4) node[above] {$y$};
+
+ % Plot points
+ \fill[black] (1,1) circle (2pt) node[above] {$x_1$};
+ \fill[black] (4,4) circle (2pt) node[below] {$x_2$};
+
+ % Draw hyperplanes
+ \draw[thick, dashed] (0,1.25) -- (4,3.25) node[right] {$H_1$};
+ \draw[thick, dashed] (0,2.5) -- (4,4.5) node[right] {$H_2$};
+ \end{tikzpicture}
+ \caption{Hyperplanes resulting in unique partitionings of two samples.}
+ \label{fig:hyper}
+\end{figure}
+
+In the general case for cases 1 and 2, the algorithm guarantees finding the best splitting hyperplane with at most $r$ non-zero coefficients and a bias term that passes through at least $r$ samples at each splitting node, but in neither case can it guarantee this is the greedily optimal oblique split. An illustration of this is shown in \autoref{fig:hyper}. Based on the samples in \autoref{fig:hyper}, if we define $r = m$, only one hyperplane will be evaluated: the hyperplane that passes through both samples and places them in the same partition. This hyperplane will result in the same partitioning as the hyperplane $H_2$. However, a hyperplane that results in the same partitioning as hyperplane $H_1$ is better if $x_1$ and $x_2$ have different labels. Since CART-ELC does not evaluate a hyperplane that partitions the samples in this way, the algorithm can not guarantee greedily optimal oblique splits.
+
+Additionally, there may be multiple selections of features and samples that describe the same hyperplane. While computing the splitting score for each hyperplane is not necessary, the algorithm will evaluate each. When floating-point vectors are sampled from a continuous distribution, it is infrequent different selections of samples and features will result in the same hyperplane.
+
+The algorithm also defines two stopping criteria. The first stopping criterion is class homogeneity: if a partition contains samples that all belong to the same class, the algorithm will stop because subsequent splits would not change the model's predictions.
+
+The second criterion is when a partition is empty. If no special handling is implemented, the algorithm will attempt to partition samples indefinitely in cases where one partition is empty and the other is not homogeneous. As such, the algorithm must verify both partitions are not empty after each partitioning.
+
+Finally, the choice to not specify a splitting criterion, additional stopping criteria, specific hyperplane tie-breaking behavior, or pruning was made to ensure future research can use this algorithm without limiting its usefulness to specific domains.
+
+\subsection{Pseudocode}\label{Pseudocode}
+
+\begin{algorithm}[H]
+ \caption{CART-ELC}
+\begin{algorithmic}[1]
+\Function{fit}{samples, labels, $m$, $r$}
+ \If{homogeneous(labels)}
+ \State \Return Node(majorityClass(labels))
+ \EndIf
+ \State bestSplit, bestSplittingScore $\gets$ None, worstSplittingScore()
+ \For{selectedSamples in combinations(samples, $r$)}
+ \For{selectedFeatures in combinations($m$, $r$)}
+ \State vectorsToPassThrough $\gets$ featureSubset(selectedSamples, selectedFeatures)
+ \State currentSplit $\gets$ findHyperplanePassingThrough(vectorsToPassThrough)
+ \State currentSplittingScore $\gets$ evaluateSplit(currentSplit, samples)
+ \If{ isBetterThan(currentSplittingScore, bestSplittingScore)}
+ \State bestSplittingScore, bestSplit $\gets$ currentSplittingScore, currentSplit
+ \EndIf
+ \EndFor
+ \EndFor
+ \State left, right $\gets$ splitDataByBestSplit(samples, labels, bestSplit)
+ \If{left is empty or right is empty}
+ \State \Return Node(majorityClass(labels))
+ \EndIf
+ \State leftSubtree $\gets$ fit(left.samples, left.labels, $m$, $r$)
+ \State rightSubtree $\gets$ fit(right.samples, right.labels, $m$, $r$)
+ \State tree $\gets$ Node(bestSplit)
+ \State tree.left, tree.right $\gets$ leftSubtree, rightSubtree
+ \State \Return tree
+\EndFunction
+\end{algorithmic}
+\end{algorithm}
+
+\subsection{Splitting Criteria}\label{splitting_criteria}
+
+To evaluate the time complexity of our algorithm and perform an empirical analysis, we must define some splitting criteria. The three we implemented and will define are the twoing criterion \citep{breiman1984cart}, the Gini criterion \citep{breiman1984cart}, and information gain \citep{quinlan1986}. The default splitting criterion for our implementation of CART-ELC is the Gini criterion. This decision is based on our empirical analysis in \autoref{empirical_criteria_comparison}, which showed the twoing and Gini criteria often result in trees that yield the same classification accuracy, but the Gini criterion is computationally cheaper. Despite this, our empirical evaluation in \autoref{empirical_comparison} uses the twoing criterion to maintain consistency with prior research.
+
+\subsubsection{Twoing Criterion}\label{twoing}
+
+The twoing criterion is a splitting criterion that evaluates the quality of a split by considering the number of samples on each side of the splitting hyperplane and the difference in the distribution of classifications between the two partitions. Trees using the twoing criterion tend to be well-balanced because the splitting score is maximized when the number of samples in each partition is equal.
+
+The formula for the twoing criterion defines the following variables:
+
+\begin{itemize}
+ \setlength{\itemsep}{0.1em}
+ \item \( p_L \): The probability a sample is in the left partition.
+ \item \( p_R \): The probability a sample is in the right partition.
+ \item \( C \): The set of all classes in the union of the left and right partitions.
+ \item \( p_{Lj} \): The probability a random sample from the left partition has a classification of \( j \).
+ \item \( p_{Rj} \): The probability a random sample from the right partition has a classification of \( j \).
+\end{itemize}
+
+\begin{equation*}
+T = \frac{p_L p_R}{4} \left( \sum_{j \in C} \left| p_{Lj} - p_{Rj} \right| \right)^2
+ \label{twoing_equ}
+\end{equation*}
+
+
+\subsubsection{Gini Criterion}\label{gini}
+
+The Gini criterion is the most popular splitting criterion for decision trees. Trees induced using the Gini criterion minimize class impurity, resulting in trees that may be less balanced than the twoing criterion. The splitting score is calculated by computing the Gini impurity for both partitions, weighting them based on the number of samples in each partition, and summing the results.
+
+The formula for the Gini criterion defines the following variables:
+
+\begin{itemize}
+ \setlength{\itemsep}{0.1em}
+ \item \( p_L \): The probability a sample is in the left partition.
+ \item \( p_R \): The probability a sample is in the right partition.
+ \item \( C \): The set of all classes in the union of the left and right partitions.
+ \item \( p_{Lj} \): The probability a random sample from the left partition has a classification of \( j \).
+ \item \( p_{Rj} \): The probability a random sample from the right partition has a classification of \( j \).
+\end{itemize}
+
+\begin{equation*}
+ G = p_L \left(1 - \sum_{j\in C} p_{Lj}^2 \right) + p_R \left(1 - \sum_{j \in C} p_{Rj}^2 \right)
+ \label{gini_equ}
+\end{equation*}
+
+\subsubsection{Information Gain}\label{IG}
+
+Information gain is a criterion based on reducing class impurity. Unlike the Gini criterion, information gain calculates impurity using entropy.
+
+The formula for information gain defines the following variables where $H(S)$ is the entropy of the set $S$ \citep{it}:
+
+\begin{itemize}
+ \setlength{\itemsep}{0.1em}
+ \item \(S \): The set of all samples before the split.
+ \item \(S_L \): The set of all samples in the left partition.
+ \item \(S_R \): The set of all samples in the right partition.
+\end{itemize}
+
+\begin{equation*}
+ IG = H(S) - \left( \frac{|S_{L}|}{|S|} H(S_{L}) + \frac{|S_{R}|}{|S|} H(S_{R}) \right)
+\end{equation*}
+
+\subsection{Time Complexity}\label{complexity}
+
+\subsubsection{Asymptotic Time Complexity Analysis}\label{analysis}
+
+To evaluate the algorithm's asymptotic time complexity, we concentrate on the cost of finding and executing a split at a single decision node. Although a complete decision tree often contains multiple splitting nodes, the computational cost is most significant for the first split. Once the dataset is partitioned, subsequent splits operate on smaller subsets and thus require less computation. This is shown in \autoref{fig:operations} by considering the computational differences between the $n=1000$ and $n=500$ columns.
+
+The homogeneity calculation, which requires at most \( n \) operations per label, is dominated by the complexity of the loop over each hyperplane. As such, it does not need to be considered for the final asymptotic time complexity. Next, we consider the evaluation of hyperplanes. Assume the feature matrix is \( n \times m \), and
+
+\begin{equation*}
+ 0 < r \leq n, m.
+\end{equation*}
+
+Hyperplane construction requires selecting \( r \) samples from \( n \), which can be done in \( \binom{n}{r} \) ways, and for each selection of samples, there are \( \binom{m}{r} \) ways to choose \( r \) features. Therefore, the total number of hyperplanes to evaluate is
+
+\[
+ \binom{n}{r} \cdot\binom{m}{r}.
+\]
+
+To derive the equation of the hyperplane passing through the \(r\) samples, an \( r \times r \) matrix is constructed based on the $r$ selected features and samples. Subsequently, its column-wise means are computed; these means are then subtracted from each row of the matrix, and the covariance matrix is calculated. Eigen decomposition is then performed on the resulting matrix to find the hyperplane's normal vector, and the hyperplane's bias term is determined by taking the dot product between the normal vector and the column-wise mean vector. These calculations are sufficient to derive the hyperplane's equation in normal form. The calculation of the covariance matrix and the eigen decomposition dominate the complexity of finding the equation for the hyperplane. Each is in the complexity class of \( \Theta(r^3) \), assuming the use of a naïve matrix multiplication approach \citep{matrix}.
+
+We then evaluate the split using our splitting criterion. Assume our splitting criterion is the Gini criterion. To compute our splitting score, we evaluate
+
+\begin{equation*}
+ G = p_L \left(1 - \sum_{j\in C} p_{Lj}^2 \right) + p_R \left(1 - \sum_{j \in C} p_{Rj}^2 \right).
+ \label{gini_usage}
+\end{equation*}
+
+The complexity of calculating the splitting score is dominated by determining which side of the hyperplane each sample lies on. This requires calculating the dot product between the sample and the hyperplane's normal vector, adding the bias term, and checking if the result is positive, negative, or zero. As such, the evaluation of one hyperplane is in the complexity class of \( \Theta(rn) \), and since \( |C| \leq n \), the number of classes does not impact the asymptotic time complexity of evaluating each hyperplane.
+
+To split the dataset on the best split, we recompute the side of the hyperplane each sample lies on and add each sample to the corresponding list for samples that are either above or on the hyperplane or strictly below it. While this recomputation can be avoided by caching the results, it does not impact the overall asymptotic time complexity of the algorithm. The evaluation of all samples will take \( \Theta(rn) \) time, and since we need to move each sample to the appropriate list, and each sample has \( m \) features, this will take \( \Theta(nm) \) time. Thus, the asymptotic time complexity of splitting the dataset is in the complexity class of
+
+\begin{equation*}
+ \Theta(rn + nm).
+\end{equation*}
+
+Since splitting on the best split is only done once, its complexity is dominated by the complexity of evaluating all hyperplanes. As such, this final split does not impact the overall asymptotic time complexity.
+
+Since there are \( \binom{n}{r} \cdot \binom{m}{r} \) splits to evaluate, each split takes \( \Theta(r^3) \) operations to compute the hyperplane, and \( \Theta(rn) \) time to evaluate, we find the asymptotic time complexity of finding and executing a split at a single decision node is in the complexity class of
+
+\[
+ \Theta\left( \binom{n}{r} \cdot \binom{m}{r} \cdot r(r^2 + n) \right).
+\]
+
+\subsubsection{Operations for a Single Split}\label{ops}
+
+\begin{table}[h]
+ \centering
+ \caption{Operations required for CART-ELC to identify and execute a split at a single decision node, assuming a multiplicative factor of one for the asymptotic time complexity, no additive constants, and $r = m$.}
+ \small
+ \begin{tabular}{lcccccc}
+ \addlinespace
+ \toprule
+ \multirow{2}{*}{r} & \multicolumn{6}{c}{n} \\
+ \cmidrule(lr){2-7}
+ & 100 & 500 & 1000 & 5000 & 10000 & 20000 \\
+ \midrule
+ 1 & 1.01e+04 & 2.50e+05 & 1.00e+06 & 2.50e+07 & 1.00e+08 & 4.00e+08 \\
+ 2 & 1.03e+06 & 1.26e+08 & 1.00e+09 & 1.25e+11 & 1.00e+12 & 8.00e+12 \\
+ 3 & 5.29e+07 & 3.16e+10 & 5.03e+11 & 3.13e+14 & 5.00e+15 & 8.00e+16 \\
+ 4 & 1.82e+09 & 5.31e+12 & 1.68e+14 & 5.22e+17 & 1.67e+19 & 5.34e+20 \\
+ 5 & 4.71e+10 & 6.70e+14 & 4.23e+16 & 6.53e+20 & 4.17e+22 & 2.67e+24 \\
+ 6 & 9.73e+11 & 6.77e+16 & 8.50e+18 & 6.54e+23 & 8.35e+25 & 1.07e+28 \\
+ 7 & 1.67e+13 & 5.71e+18 & 1.43e+21 & 5.46e+26 & 1.39e+29 & 3.56e+31 \\
+ 8 & 2.44e+14 & 4.13e+20 & 2.05e+23 & 3.90e+29 & 1.99e+32 & 1.02e+35 \\
+ 9 & 3.10e+15 & 2.62e+22 & 2.59e+25 & 2.44e+32 & 2.49e+35 & 2.55e+38 \\
+ 10 & 3.46e+16 & 1.47e+24 & 2.90e+27 & 1.36e+35 & 2.77e+38 & 5.66e+41 \\
+ \bottomrule
+ \end{tabular}
+ \label{fig:operations}
+\end{table}
+
+While the assumptions underlying \autoref{fig:operations} are idealized, it offers insights into the computational cost for CART-ELC to identify and split on a single decision node. These values indicate using CART-ELC on small datasets ($n \leq 1000$) when $r$ values are small ($r \leq 3$) remains manageable, but as $r$ and $n$ values increase, the computational costs quickly grow reaching $3.13 \cdot 10^{14}$ operations at $r = 3$, $n = 5000$ for a single split.
+
+While performing our evaluations shown in \autoref{fig:results}, we found $r$ values of three or higher to be too costly in most cases. This was partially because the results shown in \autoref{fig:operations} assume $r=m$, but in \autoref{fig:results} this was often not true, and thus there was an additional multiplicative factor of $\binom{m}{r}$.
+
+\section{Experiments and Results}
+
+In this section, we present our data collection methodology and provide a detailed description of the datasets used. We then compare CART-ELC with other decision tree induction algorithms, evaluate different splitting criteria, and discuss our findings.
+
+\subsection{Methodology}\label{methodology}
+
+The results shown in \autoref{fig:results} for OC1 and OC1-AP \citep{MurthyKS94}, CART-LC and CART \citep{breiman1984cart}, and C4.5 \citep{c4.5}, where C4.5 and CART were collected using the IND 2.1 package \citep{buntine}, were reported by \citet{MurthyKS94}. Alongside these, we present results from our implementations of HHCART \citep{HHCart} and CART-ELC.%
+\makeatletter
+\@ifpackagewith{tmlr}{accepted}{\footnote{Our implementations of HHCART and CART-ELC are available at \url{https://github.com/andrewlaack/cart-elc}.}}{}%
+\@ifpackagewith{tmlr}{preprint}{\footnote{Our implementations of HHCART and CART-ELC are available at \url{https://github.com/andrewlaack/cart-elc}.}}{}%
+\makeatother
+\space To ensure a fair comparison, we followed experimental procedures consistent with those of \citet{MurthyKS94}, differing only in our hyperparameter tuning strategy for CART-ELC and HHCART. The change to hyperparameter tuning was necessary due to our inclusion of a max depth hyperparameter for HHCART and CART-ELC, as well as the $r$ hyperparameter for CART-ELC.
+
+Similar to the experiments performed by \citet{MurthyKS94}, HHCART and CART-ELC used the twoing criterion as the splitting criterion, defined in \autoref{splitting_criteria}. Additionally, HHCART and CART-ELC did not apply any post-pruning or pre-pruning strategies.
+
+For each experiment with CART-ELC we defined two variables, $d$ and $j$, the largest max depth and $r$ value to test respectively. We set $d = 5$ because increasing the max depth beyond five often led to identical tree structures or overfitting (see \autoref{hyper_tuning}). We also set $j = 2$ for all datasets except for the iris dataset where we set $j = 4$. These choices for $j$ values were made due to computational constraints imposed by our algorithm's complexity and dataset sizes, outlined in \autoref{complexity}.
+
+Shown below are the steps performed for each experiment.
+
+\begin{enumerate}
+ \item Complete ten iterations of the following:
+ \begin{enumerate}
+ \item Randomly partition the dataset into five partitions of roughly equal size.
+ \item Perform a grid search over all integers $1 \leq i \leq j$ and $1 \leq k \leq d$ where $k$ is defined as the current max depth hyperparameter, and $i$ is defined as the current $r$ hyperparameter. For each iteration:
+ \begin{enumerate}
+ \item Build a tree using all partitions except for one as the training data.
+ \item Evaluate the accuracy of the tree on the unseen partition.
+ \item Repeat steps (i) and (ii) for each of the five partitions.
+ \item Calculate the mean accuracy and tree size across the five iterations where tree size is defined as the number of leaf nodes in the tree.
+ \end{enumerate}
+ \end{enumerate}
+\item Calculate the mean and standard deviation of the tree sizes and accuracies based on the final results of each cross-validation (CV) \citep{cv} iteration for each hyperparameter pair.
+ \item Select the hyperparameter pair that offers the best balance between mean accuracy and tree size. The mean and standard deviation of the accuracy and tree size with the selected hyperparameter pair are the results of the experiment.
+\end{enumerate}
+
+For our evaluation of both HHCART algorithms, we performed the same steps outlined above, but HHCART does not have an $r$ hyperparameter, and thus our hyperparameter search was only performed for max depth values.
+
+\subsection{Datasets}\label{datasets}
+
+\subsubsection{Star/Galaxy Discrimination (Bright)}\label{sg_bright}
+
+This dataset was derived from bright images collected by \citet{odewahn} of stars and galaxies. The dataset is comprised of 2,462 samples, 14 features, and two classifications. The two classifications are star and galaxy. The 14 features were measurements defined by the original researchers that they deemed to be relevant for star/galaxy discrimination.
+
+\subsubsection{Star/Galaxy Discrimination (Dim)}\label{sg_dim}
+
+This dataset was derived from dim images collected by \citet{odewahn} of stars and galaxies. The dataset is comprised of 4,192 samples, 14 features, and two classifications. The two classifications are star and galaxy. The 14 features were measurements defined by the original researchers that they deemed to be relevant for star/galaxy discrimination.
+
+\subsubsection{Breast Cancer Diagnosis}\label{Breast Cancer Diagnosis}
+
+This dataset \citep{breast_cancer} is comprised of 699 samples with nine features and two classifications. The two classifications are malignant and benign. Similar to the evaluations performed by \citet{MurthyKS94}, we removed the 16 samples from this dataset that were missing the "Bare Nuclei" feature. Of the remaining 683 samples, 444 were labeled as benign and 239 were labeled as malignant.
+
+\subsubsection{Iris Classification}\label{iris}
+
+This dataset \citep{iris} is comprised of 150 samples with four features and three classifications. Each of the three classifications specifies a different species of iris and contains 50 samples.
+
+\subsubsection{Boston Housing Cost}\label{housing}
+
+This dataset \citep{boston} is comprised of 506 samples with 12 continuous features, one binary feature, and two classifications. While the original dataset contained a continuous label, we discretized the dataset by assigning the class of a given sample to be one if the continuous label's value was strictly less than 21,000 and two if it was greater than or equal to 21,000, as was done by \citet{MurthyKS94}. After this preprocessing, the dataset contained 260 samples with a classification of two and 246 samples with a classification of one. The dataset was also missing some features for select samples which we left as NaN for our evaluation of CART-ELC and performed mean imputation for HHCART as HHCART is unable to handle missing features.
+
+\subsubsection{Diabetes Diagnosis}\label{diabetes}
+
+This dataset \citep{diabetes} is comprised of 768 samples with eight features and two classifications. These two classifications represent testing positive and negative for diabetes. Of the 768 samples, 500 tested negative for diabetes, and 268 tested positive.
+
+
+\newpage
+\subsection{Empirical Algorithmic Comparison}\label{empirical_comparison}
+
+\begin{table}[h]
+ \centering
+ \small
+ \caption{Accuracy and tree size comparison across decision tree induction algorithms.}
+ \begin{tabular}{lcccccc}
+ \addlinespace
+ \toprule
+ \multirow{2}{*}{Algorithm} & \multicolumn{6}{c}{Accuracy} \\
+ \cmidrule(lr){2-7}
+ & S/G Bright & S/G Dim & Cancer & Iris & Housing & Diabetes \\
+ \midrule
+ CART-ELC & \textbf{98.9 ± 0.2} & \textbf{95.2 ± 0.5} & 96.3 ± 0.4 & 95.1 ± 0.8 & 83.5 ± 0.7 & \textbf{74.5 ± 1.3} \\
+
+ HHCART(A) & 98.3 ± 0.5 & 93.7 ± 0.8 & \textbf{96.9 ± 0.3} & \textbf{95.5 ± 1.4} & \textbf{83.9 ± 0.8} & 73.2 ± 1.2 \\
+ HHCART(D) & 98.1 ± 0.4 & 93.7 ± 0.9 & \textbf{96.9 ± 0.3} & 94.3 ± 1.5 & 82.2 ± 1.4 & 73.2 ± 1.2 \\
+ OC1 & \textbf{98.9 ± 0.2} & 95.0 ± 0.3 & 96.2 ± 0.3 & 94.7 ± 3.1 & 82.4 ± 0.8 & 74.4 ± 1.0 \\
+ OC1-AP & 98.1 ± 0.2 & 94.0 ± 0.2 & 94.5 ± 0.5 & 92.7 ± 2.4 & 81.8 ± 1.0 & 73.8 ± 1.0 \\
+ CART-LC & 98.8 ± 0.2 & 92.8 ± 0.5 & 95.3 ± 0.6 & 93.5 ± 2.9 & 81.4 ± 1.2 & 73.7 ± 1.2 \\
+ CART & 98.5 ± 0.5 & 94.2 ± 0.7 & 95.0 ± 1.6 & 93.8 ± 3.7 & 82.1 ± 3.5 & 73.9 ± 3.4 \\
+ C4.5 & 98.5 ± 0.5 & 93.3 ± 0.8 & 95.3 ± 2.0 & 95.1 ± 3.2 & 83.2 ± 3.1 & 71.4 ± 3.3 \\
+ \midrule
+ \multirow{2}{*}{Algorithm} & \multicolumn{6}{c}{Tree Size} \\
+ \cmidrule(lr){2-7}
+ & S/G Bright & S/G Dim & Cancer & Iris & Housing & Diabetes \\
+ \midrule
+ CART-ELC & \textbf{3.7 ± 0.2} & \textbf{9.8 ± 4.2} & \textbf{2.0 ± 0.0} & 4.8 ± 0.1 & \textbf{4.0 ± 0.0} & \textbf{4.0 ± 0.0} \\
+ HHCART(A) & 6.1 ± 0.3 & 14.6 ± 4.8 & \textbf{2.0 ± 0.0} & \textbf{3.1 ± 0.1} & 7.8 ± 0.2 & \textbf{4.0 ± 0.0} \\
+ HHCART(D) & 6.3 ± 0.4 & 14.9 ± 5.0 & \textbf{2.0 ± 0.0} & 4.7 ± 0.1 & 23.3 ± 0.8 & \textbf{4.0 ± 0.0} \\
+ OC1 & 4.3 ± 1.0 & 13.0 ± 8.7 & 2.8 ± 0.9 & \textbf{3.1 ± 0.2} & 6.9 ± 3.2 & 5.4 ± 3.8 \\
+ OC1-AP & 6.9 ± 2.4 & 29.3 ± 8.8 & 6.4 ± 1.7 & 3.2 ± 0.3 & 8.6 ± 4.5 & 11.4 ± 7.5 \\
+ CART-LC & 3.9 ± 1.3 & 24.2 ± 8.7 & 3.5 ± 0.9 & 3.2 ± 0.3 & 5.8 ± 3.2 & 8.0 ± 5.2 \\
+ CART & 13.9 ± 5.7 & 30.4 ± 10.0 & 11.5 ± 7.2 & 4.3 ± 1.6 & 15.1 ± 10 & 11.5 ± 9.1 \\
+ C4.5 & 14.3 ± 2.2 & 77.9 ± 7.4 & 9.8 ± 2.2 & 4.6 ± 0.8 & 28.2 ± 3.3 & 56.3 ± 7.9 \\
+ \bottomrule
+ \end{tabular}
+ \label{fig:results}
+\end{table}
+
+
+The results presented in \autoref{fig:results} demonstrate that CART-ELC, HHCART(A), HHCART(D), and OC1 obtained consistently strong accuracies across the datasets evaluated when compared with other decision tree induction algorithms. Additionally, of these top performers, CART-ELC had the smallest trees across most datasets, falling behind on the iris dataset where a higher max depth value was selected due to minor variances in accuracies across max depth and $r$ values. Finally, we note the standard deviations in accuracy for CART-ELC, on average, were lower than those of OC1 and the variants of HHCART.
+
+
+\begin{table}[h]
+ \centering
+\caption{P-values for accuracy comparisons between various models and CART-ELC. P-values strictly less than 0.05 are bolded. P-values reported as 0.000 are not zero, but are smaller than the reporting precision.}
+ \small
+ \begin{tabular}{lccccccc}
+ \addlinespace
+ \toprule
+ \multirow{1}{*}{Algorithm} & \multirow{1}{*}{S/G Bright} & \multirow{1}{*}{S/G Dim} & \multirow{1}{*}{Cancer} & \multirow{1}{*}{Iris} & \multirow{1}{*}{Housing} & \multirow{1}{*}{Diabetes} \\
+ \midrule
+HHCART(A) & \textbf{0.004} & \textbf{0.000} & \textbf{0.001} & 0.446 & 0.250 & \textbf{0.032} \\
+HHCART(D) & \textbf{0.000} & \textbf{0.000} & \textbf{0.001} & 0.159 & \textbf{0.021} & \textbf{0.032} \\
+OC1 & 1.000 & 0.296 & 0.536 & 0.701 & \textbf{0.004} & 0.849 \\
+OC1-AP & \textbf{0.000} & \textbf{0.000} & \textbf{0.000} & \textbf{0.012} & \textbf{0.000} & 0.195 \\
+CART-LC & 0.278 & \textbf{0.000} & \textbf{0.000} & 0.122 & \textbf{0.000} & 0.170 \\
+CART & \textbf{0.037} & \textbf{0.002} & \textbf{0.032} & 0.303 & 0.244 & 0.612 \\
+C4.5 & \textbf{0.037} & \textbf{0.000} & 0.153 & 1.000 & 0.771 & \textbf{0.017} \\
+ \bottomrule
+\end{tabular}
+\label{fig:p_values}
+\end{table}
+
+The results in \autoref{fig:p_values}, obtained by performing Welch's t-tests, compare the accuracies of each algorithm against CART-ELC. These results support the finding that CART-ELC, HHCART, and OC1 are the best-performing decision tree induction algorithms on the datasets evaluated. Additionally, CART-ELC had statistically significant performance improvements on both star/galaxy discrimination datasets and the diabetes dataset relative to both HHCART variants and only had statistically significant performance improvements when compared to OC1 on the housing dataset. The final result of interest is that both variants of HHCART performed better than CART-ELC on the cancer dataset by a statistically significant amount.
+
+\begin{table}[h]
+ \centering
+ \caption{Cohen's d effect size for accuracies between various models and CART-ELC. Comparisons with $p < 0.05$ are bolded.}
+ \small
+ \begin{tabular}{lccccccc}
+ \addlinespace
+ \toprule
+ \multirow{1}{*}{Algorithm} & \multirow{1}{*}{S/G Bright} & \multirow{1}{*}{S/G Dim} & \multirow{1}{*}{Cancer} & \multirow{1}{*}{Iris} & \multirow{1}{*}{Housing} & \multirow{1}{*}{Diabetes} \\
+ \midrule
+
+ HHCART(A) & \textbf{1.576} & \textbf{2.249} & \textbf{-1.697} & -0.351 & -0.532 & \textbf{1.039} \\
+ HHCART(D) & \textbf{2.530} & \textbf{2.060} & \textbf{-1.697} & 0.666 & \textbf{1.175} & \textbf{1.039} \\
+ OC1 & 0.000 & 0.485 & 0.283 & 0.177 & \textbf{1.463} & 0.086 \\
+ OC1-AP & \textbf{4.000} & \textbf{3.151} & \textbf{3.976} & \textbf{1.342} & \textbf{1.970} & 0.604 \\
+ CART-LC & 0.500 & \textbf{4.800} & \textbf{1.961} & 0.752 & \textbf{2.138} & 0.639 \\
+ CART & \textbf{1.050} & \textbf{1.644} & \textbf{1.115} & 0.486 & 0.555 & 0.233 \\
+ C4.5 & \textbf{1.050} & \textbf{2.848} & 0.693 & 0.000 & 0.133 & \textbf{1.236} \\
+ \bottomrule
+\end{tabular}
+\label{fig:cohens_d}
+\end{table}
+
+
+The results in \autoref{fig:cohens_d} show the effect sizes of the difference between accuracies with respect to CART-ELC. Negative values indicate the algorithm outperformed CART-ELC on the dataset, and positive values indicate CART-ELC outperformed the algorithm on the dataset. These results show the magnitude of CART-ELC's outperformance of HHCART on the star/galaxy datasets is greater than the magnitude of HHCART's outperformance of CART-ELC on the cancer dataset.
+
+HHCART's strong performance relative to CART-ELC on the cancer dataset is likely due to the dataset being both linearly separable and having a sizable gap between the two classes. HHCART typically uses splits that pass through one sample in the transformed feature space, and thus it can find splits with a larger separation on both sides with respect to the classes on each side. This contrasts CART-ELC where each hyperplane generally passes through multiple samples and will thus generalize worse on some datasets because the separation on both sides of the hyperplane will be smaller.
+
+Despite this, a limitation of HHCART that makes CART-ELC perform better on certain datasets is its inability to handle outliers well. Outliers nominally impact CART-ELC as the splitting criterion is minimally impacted by them, but HHCART uses a change of basis derived from the eigen decomposition of the covariance matrix for each class, and these resulting eigenvectors can be greatly impacted by outliers. This results in HHCART searching for axis-aligned splits in suboptimal feature spaces. Similarly, when there are clusters of samples with the same classification throughout the feature space, this can also pose issues for HHCART as the covariance matrices will be impacted by the disparate groups more than CART-ELC will be.
+
+\subsection{Empirical Criteria Comparison}\label{empirical_criteria_comparison}
+
+\begin{table}[h]
+ \centering
+ \caption{Accuracy and tree size comparison across splitting criteria.}
+ \small
+ \begin{tabular}{lccccccc}
+ \addlinespace
+ \toprule
+ \multirow{2}{*}{Splitting Criterion} & \multicolumn{6}{c}{Accuracy} \\
+ \cmidrule(lr){2-7}
+ \multirow{1}{*}{} & \multirow{1}{*}{S/G Bright} & \multirow{1}{*}{S/G Dim} & \multirow{1}{*}{Cancer} & \multirow{1}{*}{Iris} & \multirow{1}{*}{Housing} & \multirow{1}{*}{Diabetes} \\
+ \midrule
+ \hyperref[twoing]{Twoing Criterion} & \textbf{98.9 ± 0.2} & \textbf{95.2 ± 0.5} & \textbf{96.3 ± 0.4} & 95.1 ± 0.8 & \textbf{83.5 ± 0.7} & \textbf{74.5 ± 1.3} \\
+
+ \hyperref[gini]{Gini Criterion} & \textbf{98.9 ± 0.3} & 95.1 ± 0.5 & \textbf{96.3 ± 0.4} & 95.1 ± 0.9 & \textbf{83.5 ± 0.7} & \textbf{74.5 ± 1.3} \\
+
+ \hyperref[IG]{Information Gain}& \textbf{98.9 ± 0.2} & 94.9 ± 0.5 & 96.0 ± 0.4 & \textbf{95.3 ± 0.8} & 81.8 ± 1.0 & 74.0 ± 1.1 \\
+
+ \midrule
+ \multirow{2}{*}{Splitting Criterion} & \multicolumn{6}{c}{Tree Size} \\
+ \cmidrule(lr){2-7}
+ & S/G Bright & S/G Dim & Cancer & Iris & Housing & Diabetes \\
+ \midrule
+
+
+ \hyperref[twoing] {Twoing Criterion} & 3.7 ± 0.2 & \textbf{9.8 ± 4.2} & \textbf{2.0 ± 0.0} & 4.8 ± 0.1 & 4.0 ± 0.0 & \textbf{4.0 ± 0.0} \\
+ \hyperref[gini]{Gini Criterion} & \textbf{3.6 ± 0.1} & 12.4 ± 8.3 & \textbf{2.0 ± 0.0} & \textbf{3.0 ± 0.1} & 4.0 ± 0.0 & \textbf{4.0 ± 0.0} \\
+ \hyperref[IG]{Information Gain} & 3.8 ± 0.2 & 9.9 ± 4.5 & \textbf{2.0 ± 0.0} & 4.8 ± 0.1 & \textbf{2.0 ± 0.0} & \textbf{4.0 ± 0.0} \\
+
+ \bottomrule
+ \end{tabular}
+ \label{fig:crits}
+\end{table}
+
+Our experimental results across splitting criteria, shown in \autoref{fig:crits}, indicate the twoing criterion and Gini criterion are both strong splitting criteria, and information gain only falls behind nominally. Additionally, on the dataset we evaluated that contained more than two classifications, information gain outperformed the other two criteria.
+
+\section{Conclusions and Future Work}\label{conclusions_and_future_work}
+
+CART-ELC achieves competitive accuracies across a variety of small datasets when compared with other decision tree algorithms despite only searching a subset of all unique hyperplanes. Additionally, while other oblique decision trees often use splits that are linear combinations of many features, CART-ELC's $r$ hyperparameter limits the number of features in each linear combination, which can result in simpler decision boundaries. This, along with CART-ELC's ability to generate shallower trees, often results in trees that are both more interpretable and efficient when making predictions.
+
+Despite this, the computational complexity of CART-ELC remains prohibitively high for large datasets. As such, future research should focus on strategies to mitigate these computational costs, such as developing methods to select a smaller subset of candidate hyperplanes or using CART-ELC with ensemble methods. Adapting CART-ELC for use in random forests \citep{rndfrst} and stochastic gradient boosting \citep{sgboost} may be a promising direction, as these approaches could reduce computational costs by training on subsets of samples or by introducing stochastic elements into the training process. Finally, while our evaluation included six datasets, further experimentation on a wider variety of datasets and more extensive CV may be beneficial for assessing the algorithm's overall efficacy.
+
+\bibliography{references}
+\bibliographystyle{tmlr}
+
+\appendix
+
+\newpage
+
+\section{Hyperparameter Tuning Results for CART-ELC}\label{hyper_tuning}
+
+Figures~\ref{graph_twoing}--\ref{graph_ig} show the accuracies and tree sizes for CART-ELC across each splitting criterion, dataset, $r$ value, and max depth. These results show higher $r$ values often result in higher accuracies at lower max depths but begin overfitting at lower max depths. We also see the accuracies and tree sizes across splitting criteria are quite consistent, often varying in accuracy but rarely in trends. Finally, on the S/G Dim dataset, accuracies appear to be plateauing around a max depth of five for $r=2$ while on the other datasets, max depths of five either result in models overfitting or accuracies stagnating for $r \geq 2$.
+
+\begin{figure}[h]
+\includegraphics[width=\linewidth]{twoing.pdf}
+\caption{Max depth and $r$ value results for CART-ELC when using the twoing criterion as the splitting criterion.}
+\label{graph_twoing}
+\end{figure}
+
+\begin{figure}[h]
+ \includegraphics[width=\linewidth]{gini.pdf}
+ \caption{Max depth and $r$ value results for CART-ELC when using the Gini criterion as the splitting criterion.}
+\label{graph_gini}
+\end{figure}
+
+\begin{figure}[h]
+\includegraphics[width=\linewidth]{ig.pdf}
+ \caption{Max depth and $r$ value results for CART-ELC when using information gain as the splitting criterion.}
+\label{graph_ig}
+\end{figure}
+
+\end{document}
diff --git a/papers/arxiv/v1/math_commands.tex b/papers/arxiv/v1/math_commands.tex
@@ -0,0 +1,508 @@
+%%%%% NEW MATH DEFINITIONS %%%%%
+
+\usepackage{amsmath,amsfonts,bm}
+
+% Mark sections of captions for referring to divisions of figures
+\newcommand{\figleft}{{\em (Left)}}
+\newcommand{\figcenter}{{\em (Center)}}
+\newcommand{\figright}{{\em (Right)}}
+\newcommand{\figtop}{{\em (Top)}}
+\newcommand{\figbottom}{{\em (Bottom)}}
+\newcommand{\captiona}{{\em (a)}}
+\newcommand{\captionb}{{\em (b)}}
+\newcommand{\captionc}{{\em (c)}}
+\newcommand{\captiond}{{\em (d)}}
+
+% Highlight a newly defined term
+\newcommand{\newterm}[1]{{\bf #1}}
+
+
+% Figure reference, lower-case.
+\def\figref#1{figure~\ref{#1}}
+% Figure reference, capital. For start of sentence
+\def\Figref#1{Figure~\ref{#1}}
+\def\twofigref#1#2{figures \ref{#1} and \ref{#2}}
+\def\quadfigref#1#2#3#4{figures \ref{#1}, \ref{#2}, \ref{#3} and \ref{#4}}
+% Section reference, lower-case.
+\def\secref#1{section~\ref{#1}}
+% Section reference, capital.
+\def\Secref#1{Section~\ref{#1}}
+% Reference to two sections.
+\def\twosecrefs#1#2{sections \ref{#1} and \ref{#2}}
+% Reference to three sections.
+\def\secrefs#1#2#3{sections \ref{#1}, \ref{#2} and \ref{#3}}
+% Reference to an equation, lower-case.
+\def\eqref#1{equation~\ref{#1}}
+% Reference to an equation, upper case
+\def\Eqref#1{Equation~\ref{#1}}
+% A raw reference to an equation---avoid using if possible
+\def\plaineqref#1{\ref{#1}}
+% Reference to a chapter, lower-case.
+\def\chapref#1{chapter~\ref{#1}}
+% Reference to an equation, upper case.
+\def\Chapref#1{Chapter~\ref{#1}}
+% Reference to a range of chapters
+\def\rangechapref#1#2{chapters\ref{#1}--\ref{#2}}
+% Reference to an algorithm, lower-case.
+\def\algref#1{algorithm~\ref{#1}}
+% Reference to an algorithm, upper case.
+\def\Algref#1{Algorithm~\ref{#1}}
+\def\twoalgref#1#2{algorithms \ref{#1} and \ref{#2}}
+\def\Twoalgref#1#2{Algorithms \ref{#1} and \ref{#2}}
+% Reference to a part, lower case
+\def\partref#1{part~\ref{#1}}
+% Reference to a part, upper case
+\def\Partref#1{Part~\ref{#1}}
+\def\twopartref#1#2{parts \ref{#1} and \ref{#2}}
+
+\def\ceil#1{\lceil #1 \rceil}
+\def\floor#1{\lfloor #1 \rfloor}
+\def\1{\bm{1}}
+\newcommand{\train}{\mathcal{D}}
+\newcommand{\valid}{\mathcal{D_{\mathrm{valid}}}}
+\newcommand{\test}{\mathcal{D_{\mathrm{test}}}}
+
+\def\eps{{\epsilon}}
+
+
+% Random variables
+\def\reta{{\textnormal{$\eta$}}}
+\def\ra{{\textnormal{a}}}
+\def\rb{{\textnormal{b}}}
+\def\rc{{\textnormal{c}}}
+\def\rd{{\textnormal{d}}}
+\def\re{{\textnormal{e}}}
+\def\rf{{\textnormal{f}}}
+\def\rg{{\textnormal{g}}}
+\def\rh{{\textnormal{h}}}
+\def\ri{{\textnormal{i}}}
+\def\rj{{\textnormal{j}}}
+\def\rk{{\textnormal{k}}}
+\def\rl{{\textnormal{l}}}
+% rm is already a command, just don't name any random variables m
+\def\rn{{\textnormal{n}}}
+\def\ro{{\textnormal{o}}}
+\def\rp{{\textnormal{p}}}
+\def\rq{{\textnormal{q}}}
+\def\rr{{\textnormal{r}}}
+\def\rs{{\textnormal{s}}}
+\def\rt{{\textnormal{t}}}
+\def\ru{{\textnormal{u}}}
+\def\rv{{\textnormal{v}}}
+\def\rw{{\textnormal{w}}}
+\def\rx{{\textnormal{x}}}
+\def\ry{{\textnormal{y}}}
+\def\rz{{\textnormal{z}}}
+
+% Random vectors
+\def\rvepsilon{{\mathbf{\epsilon}}}
+\def\rvtheta{{\mathbf{\theta}}}
+\def\rva{{\mathbf{a}}}
+\def\rvb{{\mathbf{b}}}
+\def\rvc{{\mathbf{c}}}
+\def\rvd{{\mathbf{d}}}
+\def\rve{{\mathbf{e}}}
+\def\rvf{{\mathbf{f}}}
+\def\rvg{{\mathbf{g}}}
+\def\rvh{{\mathbf{h}}}
+\def\rvu{{\mathbf{i}}}
+\def\rvj{{\mathbf{j}}}
+\def\rvk{{\mathbf{k}}}
+\def\rvl{{\mathbf{l}}}
+\def\rvm{{\mathbf{m}}}
+\def\rvn{{\mathbf{n}}}
+\def\rvo{{\mathbf{o}}}
+\def\rvp{{\mathbf{p}}}
+\def\rvq{{\mathbf{q}}}
+\def\rvr{{\mathbf{r}}}
+\def\rvs{{\mathbf{s}}}
+\def\rvt{{\mathbf{t}}}
+\def\rvu{{\mathbf{u}}}
+\def\rvv{{\mathbf{v}}}
+\def\rvw{{\mathbf{w}}}
+\def\rvx{{\mathbf{x}}}
+\def\rvy{{\mathbf{y}}}
+\def\rvz{{\mathbf{z}}}
+
+% Elements of random vectors
+\def\erva{{\textnormal{a}}}
+\def\ervb{{\textnormal{b}}}
+\def\ervc{{\textnormal{c}}}
+\def\ervd{{\textnormal{d}}}
+\def\erve{{\textnormal{e}}}
+\def\ervf{{\textnormal{f}}}
+\def\ervg{{\textnormal{g}}}
+\def\ervh{{\textnormal{h}}}
+\def\ervi{{\textnormal{i}}}
+\def\ervj{{\textnormal{j}}}
+\def\ervk{{\textnormal{k}}}
+\def\ervl{{\textnormal{l}}}
+\def\ervm{{\textnormal{m}}}
+\def\ervn{{\textnormal{n}}}
+\def\ervo{{\textnormal{o}}}
+\def\ervp{{\textnormal{p}}}
+\def\ervq{{\textnormal{q}}}
+\def\ervr{{\textnormal{r}}}
+\def\ervs{{\textnormal{s}}}
+\def\ervt{{\textnormal{t}}}
+\def\ervu{{\textnormal{u}}}
+\def\ervv{{\textnormal{v}}}
+\def\ervw{{\textnormal{w}}}
+\def\ervx{{\textnormal{x}}}
+\def\ervy{{\textnormal{y}}}
+\def\ervz{{\textnormal{z}}}
+
+% Random matrices
+\def\rmA{{\mathbf{A}}}
+\def\rmB{{\mathbf{B}}}
+\def\rmC{{\mathbf{C}}}
+\def\rmD{{\mathbf{D}}}
+\def\rmE{{\mathbf{E}}}
+\def\rmF{{\mathbf{F}}}
+\def\rmG{{\mathbf{G}}}
+\def\rmH{{\mathbf{H}}}
+\def\rmI{{\mathbf{I}}}
+\def\rmJ{{\mathbf{J}}}
+\def\rmK{{\mathbf{K}}}
+\def\rmL{{\mathbf{L}}}
+\def\rmM{{\mathbf{M}}}
+\def\rmN{{\mathbf{N}}}
+\def\rmO{{\mathbf{O}}}
+\def\rmP{{\mathbf{P}}}
+\def\rmQ{{\mathbf{Q}}}
+\def\rmR{{\mathbf{R}}}
+\def\rmS{{\mathbf{S}}}
+\def\rmT{{\mathbf{T}}}
+\def\rmU{{\mathbf{U}}}
+\def\rmV{{\mathbf{V}}}
+\def\rmW{{\mathbf{W}}}
+\def\rmX{{\mathbf{X}}}
+\def\rmY{{\mathbf{Y}}}
+\def\rmZ{{\mathbf{Z}}}
+
+% Elements of random matrices
+\def\ermA{{\textnormal{A}}}
+\def\ermB{{\textnormal{B}}}
+\def\ermC{{\textnormal{C}}}
+\def\ermD{{\textnormal{D}}}
+\def\ermE{{\textnormal{E}}}
+\def\ermF{{\textnormal{F}}}
+\def\ermG{{\textnormal{G}}}
+\def\ermH{{\textnormal{H}}}
+\def\ermI{{\textnormal{I}}}
+\def\ermJ{{\textnormal{J}}}
+\def\ermK{{\textnormal{K}}}
+\def\ermL{{\textnormal{L}}}
+\def\ermM{{\textnormal{M}}}
+\def\ermN{{\textnormal{N}}}
+\def\ermO{{\textnormal{O}}}
+\def\ermP{{\textnormal{P}}}
+\def\ermQ{{\textnormal{Q}}}
+\def\ermR{{\textnormal{R}}}
+\def\ermS{{\textnormal{S}}}
+\def\ermT{{\textnormal{T}}}
+\def\ermU{{\textnormal{U}}}
+\def\ermV{{\textnormal{V}}}
+\def\ermW{{\textnormal{W}}}
+\def\ermX{{\textnormal{X}}}
+\def\ermY{{\textnormal{Y}}}
+\def\ermZ{{\textnormal{Z}}}
+
+% Vectors
+\def\vzero{{\bm{0}}}
+\def\vone{{\bm{1}}}
+\def\vmu{{\bm{\mu}}}
+\def\vtheta{{\bm{\theta}}}
+\def\va{{\bm{a}}}
+\def\vb{{\bm{b}}}
+\def\vc{{\bm{c}}}
+\def\vd{{\bm{d}}}
+\def\ve{{\bm{e}}}
+\def\vf{{\bm{f}}}
+\def\vg{{\bm{g}}}
+\def\vh{{\bm{h}}}
+\def\vi{{\bm{i}}}
+\def\vj{{\bm{j}}}
+\def\vk{{\bm{k}}}
+\def\vl{{\bm{l}}}
+\def\vm{{\bm{m}}}
+\def\vn{{\bm{n}}}
+\def\vo{{\bm{o}}}
+\def\vp{{\bm{p}}}
+\def\vq{{\bm{q}}}
+\def\vr{{\bm{r}}}
+\def\vs{{\bm{s}}}
+\def\vt{{\bm{t}}}
+\def\vu{{\bm{u}}}
+\def\vv{{\bm{v}}}
+\def\vw{{\bm{w}}}
+\def\vx{{\bm{x}}}
+\def\vy{{\bm{y}}}
+\def\vz{{\bm{z}}}
+
+% Elements of vectors
+\def\evalpha{{\alpha}}
+\def\evbeta{{\beta}}
+\def\evepsilon{{\epsilon}}
+\def\evlambda{{\lambda}}
+\def\evomega{{\omega}}
+\def\evmu{{\mu}}
+\def\evpsi{{\psi}}
+\def\evsigma{{\sigma}}
+\def\evtheta{{\theta}}
+\def\eva{{a}}
+\def\evb{{b}}
+\def\evc{{c}}
+\def\evd{{d}}
+\def\eve{{e}}
+\def\evf{{f}}
+\def\evg{{g}}
+\def\evh{{h}}
+\def\evi{{i}}
+\def\evj{{j}}
+\def\evk{{k}}
+\def\evl{{l}}
+\def\evm{{m}}
+\def\evn{{n}}
+\def\evo{{o}}
+\def\evp{{p}}
+\def\evq{{q}}
+\def\evr{{r}}
+\def\evs{{s}}
+\def\evt{{t}}
+\def\evu{{u}}
+\def\evv{{v}}
+\def\evw{{w}}
+\def\evx{{x}}
+\def\evy{{y}}
+\def\evz{{z}}
+
+% Matrix
+\def\mA{{\bm{A}}}
+\def\mB{{\bm{B}}}
+\def\mC{{\bm{C}}}
+\def\mD{{\bm{D}}}
+\def\mE{{\bm{E}}}
+\def\mF{{\bm{F}}}
+\def\mG{{\bm{G}}}
+\def\mH{{\bm{H}}}
+\def\mI{{\bm{I}}}
+\def\mJ{{\bm{J}}}
+\def\mK{{\bm{K}}}
+\def\mL{{\bm{L}}}
+\def\mM{{\bm{M}}}
+\def\mN{{\bm{N}}}
+\def\mO{{\bm{O}}}
+\def\mP{{\bm{P}}}
+\def\mQ{{\bm{Q}}}
+\def\mR{{\bm{R}}}
+\def\mS{{\bm{S}}}
+\def\mT{{\bm{T}}}
+\def\mU{{\bm{U}}}
+\def\mV{{\bm{V}}}
+\def\mW{{\bm{W}}}
+\def\mX{{\bm{X}}}
+\def\mY{{\bm{Y}}}
+\def\mZ{{\bm{Z}}}
+\def\mBeta{{\bm{\beta}}}
+\def\mPhi{{\bm{\Phi}}}
+\def\mLambda{{\bm{\Lambda}}}
+\def\mSigma{{\bm{\Sigma}}}
+
+% Tensor
+\DeclareMathAlphabet{\mathsfit}{\encodingdefault}{\sfdefault}{m}{sl}
+\SetMathAlphabet{\mathsfit}{bold}{\encodingdefault}{\sfdefault}{bx}{n}
+\newcommand{\tens}[1]{\bm{\mathsfit{#1}}}
+\def\tA{{\tens{A}}}
+\def\tB{{\tens{B}}}
+\def\tC{{\tens{C}}}
+\def\tD{{\tens{D}}}
+\def\tE{{\tens{E}}}
+\def\tF{{\tens{F}}}
+\def\tG{{\tens{G}}}
+\def\tH{{\tens{H}}}
+\def\tI{{\tens{I}}}
+\def\tJ{{\tens{J}}}
+\def\tK{{\tens{K}}}
+\def\tL{{\tens{L}}}
+\def\tM{{\tens{M}}}
+\def\tN{{\tens{N}}}
+\def\tO{{\tens{O}}}
+\def\tP{{\tens{P}}}
+\def\tQ{{\tens{Q}}}
+\def\tR{{\tens{R}}}
+\def\tS{{\tens{S}}}
+\def\tT{{\tens{T}}}
+\def\tU{{\tens{U}}}
+\def\tV{{\tens{V}}}
+\def\tW{{\tens{W}}}
+\def\tX{{\tens{X}}}
+\def\tY{{\tens{Y}}}
+\def\tZ{{\tens{Z}}}
+
+
+% Graph
+\def\gA{{\mathcal{A}}}
+\def\gB{{\mathcal{B}}}
+\def\gC{{\mathcal{C}}}
+\def\gD{{\mathcal{D}}}
+\def\gE{{\mathcal{E}}}
+\def\gF{{\mathcal{F}}}
+\def\gG{{\mathcal{G}}}
+\def\gH{{\mathcal{H}}}
+\def\gI{{\mathcal{I}}}
+\def\gJ{{\mathcal{J}}}
+\def\gK{{\mathcal{K}}}
+\def\gL{{\mathcal{L}}}
+\def\gM{{\mathcal{M}}}
+\def\gN{{\mathcal{N}}}
+\def\gO{{\mathcal{O}}}
+\def\gP{{\mathcal{P}}}
+\def\gQ{{\mathcal{Q}}}
+\def\gR{{\mathcal{R}}}
+\def\gS{{\mathcal{S}}}
+\def\gT{{\mathcal{T}}}
+\def\gU{{\mathcal{U}}}
+\def\gV{{\mathcal{V}}}
+\def\gW{{\mathcal{W}}}
+\def\gX{{\mathcal{X}}}
+\def\gY{{\mathcal{Y}}}
+\def\gZ{{\mathcal{Z}}}
+
+% Sets
+\def\sA{{\mathbb{A}}}
+\def\sB{{\mathbb{B}}}
+\def\sC{{\mathbb{C}}}
+\def\sD{{\mathbb{D}}}
+% Don't use a set called E, because this would be the same as our symbol
+% for expectation.
+\def\sF{{\mathbb{F}}}
+\def\sG{{\mathbb{G}}}
+\def\sH{{\mathbb{H}}}
+\def\sI{{\mathbb{I}}}
+\def\sJ{{\mathbb{J}}}
+\def\sK{{\mathbb{K}}}
+\def\sL{{\mathbb{L}}}
+\def\sM{{\mathbb{M}}}
+\def\sN{{\mathbb{N}}}
+\def\sO{{\mathbb{O}}}
+\def\sP{{\mathbb{P}}}
+\def\sQ{{\mathbb{Q}}}
+\def\sR{{\mathbb{R}}}
+\def\sS{{\mathbb{S}}}
+\def\sT{{\mathbb{T}}}
+\def\sU{{\mathbb{U}}}
+\def\sV{{\mathbb{V}}}
+\def\sW{{\mathbb{W}}}
+\def\sX{{\mathbb{X}}}
+\def\sY{{\mathbb{Y}}}
+\def\sZ{{\mathbb{Z}}}
+
+% Entries of a matrix
+\def\emLambda{{\Lambda}}
+\def\emA{{A}}
+\def\emB{{B}}
+\def\emC{{C}}
+\def\emD{{D}}
+\def\emE{{E}}
+\def\emF{{F}}
+\def\emG{{G}}
+\def\emH{{H}}
+\def\emI{{I}}
+\def\emJ{{J}}
+\def\emK{{K}}
+\def\emL{{L}}
+\def\emM{{M}}
+\def\emN{{N}}
+\def\emO{{O}}
+\def\emP{{P}}
+\def\emQ{{Q}}
+\def\emR{{R}}
+\def\emS{{S}}
+\def\emT{{T}}
+\def\emU{{U}}
+\def\emV{{V}}
+\def\emW{{W}}
+\def\emX{{X}}
+\def\emY{{Y}}
+\def\emZ{{Z}}
+\def\emSigma{{\Sigma}}
+
+% entries of a tensor
+% Same font as tensor, without \bm wrapper
+\newcommand{\etens}[1]{\mathsfit{#1}}
+\def\etLambda{{\etens{\Lambda}}}
+\def\etA{{\etens{A}}}
+\def\etB{{\etens{B}}}
+\def\etC{{\etens{C}}}
+\def\etD{{\etens{D}}}
+\def\etE{{\etens{E}}}
+\def\etF{{\etens{F}}}
+\def\etG{{\etens{G}}}
+\def\etH{{\etens{H}}}
+\def\etI{{\etens{I}}}
+\def\etJ{{\etens{J}}}
+\def\etK{{\etens{K}}}
+\def\etL{{\etens{L}}}
+\def\etM{{\etens{M}}}
+\def\etN{{\etens{N}}}
+\def\etO{{\etens{O}}}
+\def\etP{{\etens{P}}}
+\def\etQ{{\etens{Q}}}
+\def\etR{{\etens{R}}}
+\def\etS{{\etens{S}}}
+\def\etT{{\etens{T}}}
+\def\etU{{\etens{U}}}
+\def\etV{{\etens{V}}}
+\def\etW{{\etens{W}}}
+\def\etX{{\etens{X}}}
+\def\etY{{\etens{Y}}}
+\def\etZ{{\etens{Z}}}
+
+% The true underlying data generating distribution
+\newcommand{\pdata}{p_{\rm{data}}}
+% The empirical distribution defined by the training set
+\newcommand{\ptrain}{\hat{p}_{\rm{data}}}
+\newcommand{\Ptrain}{\hat{P}_{\rm{data}}}
+% The model distribution
+\newcommand{\pmodel}{p_{\rm{model}}}
+\newcommand{\Pmodel}{P_{\rm{model}}}
+\newcommand{\ptildemodel}{\tilde{p}_{\rm{model}}}
+% Stochastic autoencoder distributions
+\newcommand{\pencode}{p_{\rm{encoder}}}
+\newcommand{\pdecode}{p_{\rm{decoder}}}
+\newcommand{\precons}{p_{\rm{reconstruct}}}
+
+\newcommand{\laplace}{\mathrm{Laplace}} % Laplace distribution
+
+\newcommand{\E}{\mathbb{E}}
+\newcommand{\Ls}{\mathcal{L}}
+\newcommand{\R}{\mathbb{R}}
+\newcommand{\emp}{\tilde{p}}
+\newcommand{\lr}{\alpha}
+\newcommand{\reg}{\lambda}
+\newcommand{\rect}{\mathrm{rectifier}}
+\newcommand{\softmax}{\mathrm{softmax}}
+\newcommand{\sigmoid}{\sigma}
+\newcommand{\softplus}{\zeta}
+\newcommand{\KL}{D_{\mathrm{KL}}}
+\newcommand{\Var}{\mathrm{Var}}
+\newcommand{\standarderror}{\mathrm{SE}}
+\newcommand{\Cov}{\mathrm{Cov}}
+% Wolfram Mathworld says $L^2$ is for function spaces and $\ell^2$ is for vectors
+% But then they seem to use $L^2$ for vectors throughout the site, and so does
+% wikipedia.
+\newcommand{\normlzero}{L^0}
+\newcommand{\normlone}{L^1}
+\newcommand{\normltwo}{L^2}
+\newcommand{\normlp}{L^p}
+\newcommand{\normmax}{L^\infty}
+
+\newcommand{\parents}{Pa} % See usage in notation.tex. Chosen to match Daphne's book.
+
+\DeclareMathOperator*{\argmax}{arg\,max}
+\DeclareMathOperator*{\argmin}{arg\,min}
+
+\DeclareMathOperator{\sign}{sign}
+\DeclareMathOperator{\Tr}{Tr}
+\let\ab\allowbreak
diff --git a/papers/arxiv/v1/references.bib b/papers/arxiv/v1/references.bib
@@ -0,0 +1,191 @@
+@book{breiman1984cart,
+ author = {Breiman, L. and Friedman, J. and Olshen, R. A. and Stone, C. J.},
+ title = {Classification and Regression Trees},
+ edition = {1st},
+ year = {1984},
+ publisher = {Chapman and Hall/CRC},
+ doi = {10.1201/9781315139470}
+}
+
+@article{MurthyKS94,
+
+ author = {Murthy, S. K. and Kasif, S. and Salzberg, S. L.},
+ title = {A System for Induction of Oblique Decision Trees},
+ journal = {Journal of Artificial Intelligence Research},
+ volume = {2},
+ pages = {1--32},
+ year = {1994},
+ doi = {10.1613/jair.63},
+}
+
+@article{quinlan1986,
+ author = {Quinlan, J. R.},
+ title = {Induction of Decision Trees},
+ journal = {Machine Learning},
+ volume = {1},
+ number = {1},
+ pages = {81--106},
+ year = {1986},
+ doi = {10.1007/BF00116251}
+}
+
+@article{HHCart,
+ author = {Wickramarachchi, D. C. and Robertson, B. L. and Reale, M. and Price, C. J. and Brown, J.},
+ title = {{HHCART: An oblique decision tree}},
+ journal = {Computational Statistics \& Data Analysis},
+ volume = {96},
+ pages = {12--23},
+ year = {2016},
+ issn = {0167-9473},
+ doi = {10.1016/j.csda.2015.11.006},
+}
+
+@article{odewahn,
+ author = {Odewahn, S. C. and Stockwell, E. B. and Pennington, R. L. and Humphreys, R. M. and Zumach, W. A.},
+ title = {Automated Star/Galaxy Discrimination With Neural Networks},
+ journal = {The Astronomical Journal},
+ year = {1992},
+ volume = {103},
+ pages = {318},
+ doi = {10.1086/116063},
+}
+
+@misc{breast_cancer,
+ author = {Wolberg, W.},
+ title = {{Breast Cancer Wisconsin (Original)}},
+ year = {1990},
+ howpublished = {UCI Machine Learning Repository},
+ note = {{doi}: 10.24432/C5HP4Z}
+}
+
+@misc{iris,
+ author = {Fisher, R. A.},
+ title = {Iris},
+ year = {1936},
+ note = {{doi}: 10.24432/C56C76},
+ howpublished = {UCI Machine Learning Repository},
+}
+
+@article{boston,
+title = {Hedonic housing prices and the demand for clean air},
+journal = {Journal of Environmental Economics and Management},
+volume = {5},
+number = {1},
+pages = {81--102},
+year = {1978},
+issn = {0095-0696},
+doi = {10.1016/0095-0696(78)90006-2},
+author = {D. Harrison and D. L. Rubinfeld},
+}
+
+
+@inproceedings{diabetes,
+ author = {Smith, J. W. and Everhart, J. E. and Dickson, W. C. and Knowler, W. C. and Johannes, R. S.},
+ title = {Using the ADAP Learning Algorithm to Forecast the Onset of Diabetes Mellitus},
+ booktitle = {Proceedings of the Annual Symposium on Computer Application in Medical Care},
+ pages = {261--265},
+ year = {1988}
+}
+
+@inproceedings{boosting,
+ author = {Drucker, H. and Cortes, C.},
+ booktitle = {Advances in Neural Information Processing Systems},
+ publisher = {MIT Press},
+ title = {Boosting Decision Trees},
+ volume = {8},
+ year = {1995}
+}
+
+@article{sgboost,
+ title = {Stochastic gradient boosting},
+ journal = {Computational Statistics \& Data Analysis},
+ volume = {38},
+ number = {4},
+ pages = {367--378},
+ year = {2002},
+ issn = {0167-9473},
+ doi = {10.1016/S0167-9473(01)00065-2},
+ author = {J. H. Friedman},
+}
+
+@article{survey,
+ author={Mienye, I. D. and Jere, N.},
+ journal={IEEE Access},
+ title={A Survey of Decision Trees: Concepts, Algorithms, and Applications},
+ year={2024},
+ volume={12},
+ pages={86716--86727},
+ doi={10.1109/ACCESS.2024.3416838}}
+
+@article{recent_overview,
+ title={Decision trees: a recent overview},
+ author={Kotsiantis, S. B.},
+ journal={Artificial Intelligence Review},
+ volume={39},
+ number={4},
+ pages={261--283},
+ year={2011},
+ publisher={Springer},
+ doi={10.1007/s10462-011-9272-4}
+}
+
+@article{it,
+ author={Shannon, C. E.},
+ journal={The Bell System Technical Journal},
+ title={A mathematical theory of communication},
+ year={1948},
+ volume={27},
+ number={3},
+ pages={379--423},
+ doi={10.1002/j.1538-7305.1948.tb01338.x},
+}
+
+
+@book{matrix,
+ title={Matrix Computations},
+ author={Golub, G.H. and Van Loan, C.F.},
+ isbn={9781421407944},
+ lccn={2012943449},
+ series={Johns Hopkins Studies in the Mathematical Sciences},
+ year={2013},
+ publisher={Johns Hopkins University Press}
+}
+
+
+@inproceedings{buntine,
+ title={Tree Classification Software},
+ author={W. L. Buntine},
+ booktitle = {Technology 2002: The Third National Technology Transfer Conference and Exposition, Volume 1},
+ year={1993},
+}
+
+@article{rndfrst,
+ author = {L. Breiman},
+ title = {Random Forests},
+ journal = {Machine Learning},
+ year = {2001},
+ volume = {45},
+ number = {1},
+ pages = {5--32},
+ month = {Oct},
+ issn = {1573-0565},
+ doi = {10.1023/A:1010933404324}
+}
+
+@book{c4.5,
+ title={C4.5: Programs for Machine Learning},
+ author={Quinlan, J. R.},
+ isbn={9781558602380},
+ series={Morgan Kaufmann series in machine learning},
+ year={1993},
+ publisher={Elsevier Science}
+}
+
+@inproceedings{cv,
+ author = {R. Kohavi},
+ title = {A study of cross-validation and bootstrap for accuracy estimation and model selection},
+ booktitle = {Proceedings of the International Joint Conference on Artificial Intelligence (IJCAI)},
+ year = {1995},
+ pages = {1137--1143},
+ address = {Montreal, Canada},
+}
diff --git a/papers/arxiv/v1/tmlr.bst b/papers/arxiv/v1/tmlr.bst
@@ -0,0 +1,1440 @@
+%% File: `tmlr.bst'
+%% A copy of iclm2010.bst, which is a modification of `plainnl.bst' for use with natbib package
+%%
+%% Copyright 2010 Hal Daum\'e III
+%% Modified by J. Fürnkranz
+%% - Changed labels from (X and Y, 2000) to (X & Y, 2000)
+%%
+%% Copyright 1993-2007 Patrick W Daly
+%% Max-Planck-Institut f\"ur Sonnensystemforschung
+%% Max-Planck-Str. 2
+%% D-37191 Katlenburg-Lindau
+%% Germany
+%% E-mail: daly@mps.mpg.de
+%%
+%% This program can be redistributed and/or modified under the terms
+%% of the LaTeX Project Public License Distributed from CTAN
+%% archives in directory macros/latex/base/lppl.txt; either
+%% version 1 of the License, or any later version.
+%%
+ % Version and source file information:
+ % \ProvidesFile{icml2010.mbs}[2007/11/26 1.93 (PWD)]
+ %
+ % BibTeX `plainnat' family
+ % version 0.99b for BibTeX versions 0.99a or later,
+ % for LaTeX versions 2.09 and 2e.
+ %
+ % For use with the `natbib.sty' package; emulates the corresponding
+ % member of the `plain' family, but with author-year citations.
+ %
+ % With version 6.0 of `natbib.sty', it may also be used for numerical
+ % citations, while retaining the commands \citeauthor, \citefullauthor,
+ % and \citeyear to print the corresponding information.
+ %
+ % For version 7.0 of `natbib.sty', the KEY field replaces missing
+ % authors/editors, and the date is left blank in \bibitem.
+ %
+ % Includes field EID for the sequence/citation number of electronic journals
+ % which is used instead of page numbers.
+ %
+ % Includes fields ISBN and ISSN.
+ %
+ % Includes field URL for Internet addresses.
+ %
+ % Includes field DOI for Digital Object Idenfifiers.
+ %
+ % Works best with the url.sty package of Donald Arseneau.
+ %
+ % Works with identical authors and year are further sorted by
+ % citation key, to preserve any natural sequence.
+ %
+ENTRY
+ { address
+ author
+ booktitle
+ chapter
+ doi
+ eid
+ edition
+ editor
+ howpublished
+ institution
+ isbn
+ issn
+ journal
+ key
+ month
+ note
+ number
+ organization
+ pages
+ publisher
+ school
+ series
+ title
+ type
+ url
+ volume
+ year
+ }
+ {}
+ { label extra.label sort.label short.list }
+
+INTEGERS { output.state before.all mid.sentence after.sentence after.block }
+
+FUNCTION {init.state.consts}
+{ #0 'before.all :=
+ #1 'mid.sentence :=
+ #2 'after.sentence :=
+ #3 'after.block :=
+}
+
+STRINGS { s t }
+
+FUNCTION {output.nonnull}
+{ 's :=
+ output.state mid.sentence =
+ { ", " * write$ }
+ { output.state after.block =
+ { add.period$ write$
+ newline$
+ "\newblock " write$
+ }
+ { output.state before.all =
+ 'write$
+ { add.period$ " " * write$ }
+ if$
+ }
+ if$
+ mid.sentence 'output.state :=
+ }
+ if$
+ s
+}
+
+FUNCTION {output}
+{ duplicate$ empty$
+ 'pop$
+ 'output.nonnull
+ if$
+}
+
+FUNCTION {output.check}
+{ 't :=
+ duplicate$ empty$
+ { pop$ "empty " t * " in " * cite$ * warning$ }
+ 'output.nonnull
+ if$
+}
+
+FUNCTION {fin.entry}
+{ add.period$
+ write$
+ newline$
+}
+
+FUNCTION {new.block}
+{ output.state before.all =
+ 'skip$
+ { after.block 'output.state := }
+ if$
+}
+
+FUNCTION {new.sentence}
+{ output.state after.block =
+ 'skip$
+ { output.state before.all =
+ 'skip$
+ { after.sentence 'output.state := }
+ if$
+ }
+ if$
+}
+
+FUNCTION {not}
+{ { #0 }
+ { #1 }
+ if$
+}
+
+FUNCTION {and}
+{ 'skip$
+ { pop$ #0 }
+ if$
+}
+
+FUNCTION {or}
+{ { pop$ #1 }
+ 'skip$
+ if$
+}
+
+FUNCTION {new.block.checka}
+{ empty$
+ 'skip$
+ 'new.block
+ if$
+}
+
+FUNCTION {new.block.checkb}
+{ empty$
+ swap$ empty$
+ and
+ 'skip$
+ 'new.block
+ if$
+}
+
+FUNCTION {new.sentence.checka}
+{ empty$
+ 'skip$
+ 'new.sentence
+ if$
+}
+
+FUNCTION {new.sentence.checkb}
+{ empty$
+ swap$ empty$
+ and
+ 'skip$
+ 'new.sentence
+ if$
+}
+
+FUNCTION {field.or.null}
+{ duplicate$ empty$
+ { pop$ "" }
+ 'skip$
+ if$
+}
+
+FUNCTION {emphasize}
+{ duplicate$ empty$
+ { pop$ "" }
+ { "\emph{" swap$ * "}" * }
+ if$
+}
+
+INTEGERS { nameptr namesleft numnames }
+
+FUNCTION {format.names}
+{ 's :=
+ #1 'nameptr :=
+ s num.names$ 'numnames :=
+ numnames 'namesleft :=
+ { namesleft #0 > }
+ { s nameptr "{ff~}{vv~}{ll}{, jj}" format.name$ 't :=
+ nameptr #1 >
+ { namesleft #1 >
+ { ", " * t * }
+ { numnames #2 >
+ { "," * }
+ 'skip$
+ if$
+ t "others" =
+ { " et~al." * }
+ { " and " * t * }
+ if$
+ }
+ if$
+ }
+ 't
+ if$
+ nameptr #1 + 'nameptr :=
+ namesleft #1 - 'namesleft :=
+ }
+ while$
+}
+
+FUNCTION {format.key}
+{ empty$
+ { key field.or.null }
+ { "" }
+ if$
+}
+
+FUNCTION {format.authors}
+{ author empty$
+ { "" }
+ { author format.names }
+ if$
+}
+
+FUNCTION {format.editors}
+{ editor empty$
+ { "" }
+ { editor format.names
+ editor num.names$ #1 >
+ { " (eds.)" * }
+ { " (ed.)" * }
+ if$
+ }
+ if$
+}
+
+FUNCTION {format.isbn}
+{ isbn empty$
+ { "" }
+ { new.block "ISBN " isbn * }
+ if$
+}
+
+FUNCTION {format.issn}
+{ issn empty$
+ { "" }
+ { new.block "ISSN " issn * }
+ if$
+}
+
+FUNCTION {format.url}
+{ url empty$
+ { "" }
+ { new.block "URL \url{" url * "}" * }
+ if$
+}
+
+FUNCTION {format.doi}
+{ doi empty$
+ { "" }
+ { new.block "\doi{" doi * "}" * }
+ if$
+}
+
+FUNCTION {format.title}
+{ title empty$
+ { "" }
+ { title "t" change.case$ }
+ if$
+}
+
+FUNCTION {format.full.names}
+{'s :=
+ #1 'nameptr :=
+ s num.names$ 'numnames :=
+ numnames 'namesleft :=
+ { namesleft #0 > }
+ { s nameptr
+ "{vv~}{ll}" format.name$ 't :=
+ nameptr #1 >
+ {
+ namesleft #1 >
+ { ", " * t * }
+ {
+ numnames #2 >
+ { "," * }
+ 'skip$
+ if$
+ t "others" =
+ { " et~al." * }
+ { " and " * t * }
+ if$
+ }
+ if$
+ }
+ 't
+ if$
+ nameptr #1 + 'nameptr :=
+ namesleft #1 - 'namesleft :=
+ }
+ while$
+}
+
+FUNCTION {author.editor.full}
+{ author empty$
+ { editor empty$
+ { "" }
+ { editor format.full.names }
+ if$
+ }
+ { author format.full.names }
+ if$
+}
+
+FUNCTION {author.full}
+{ author empty$
+ { "" }
+ { author format.full.names }
+ if$
+}
+
+FUNCTION {editor.full}
+{ editor empty$
+ { "" }
+ { editor format.full.names }
+ if$
+}
+
+FUNCTION {make.full.names}
+{ type$ "book" =
+ type$ "inbook" =
+ or
+ 'author.editor.full
+ { type$ "proceedings" =
+ 'editor.full
+ 'author.full
+ if$
+ }
+ if$
+}
+
+FUNCTION {output.bibitem}
+{ newline$
+ "\bibitem[" write$
+ label write$
+ ")" make.full.names duplicate$ short.list =
+ { pop$ }
+ { * }
+ if$
+ "]{" * write$
+ cite$ write$
+ "}" write$
+ newline$
+ ""
+ before.all 'output.state :=
+}
+
+FUNCTION {n.dashify}
+{ 't :=
+ ""
+ { t empty$ not }
+ { t #1 #1 substring$ "-" =
+ { t #1 #2 substring$ "--" = not
+ { "--" *
+ t #2 global.max$ substring$ 't :=
+ }
+ { { t #1 #1 substring$ "-" = }
+ { "-" *
+ t #2 global.max$ substring$ 't :=
+ }
+ while$
+ }
+ if$
+ }
+ { t #1 #1 substring$ *
+ t #2 global.max$ substring$ 't :=
+ }
+ if$
+ }
+ while$
+}
+
+FUNCTION {format.date}
+{ year duplicate$ empty$
+ { "empty year in " cite$ * warning$
+ pop$ "" }
+ 'skip$
+ if$
+ month empty$
+ 'skip$
+ { month
+ " " * swap$ *
+ }
+ if$
+ extra.label *
+}
+
+FUNCTION {format.btitle}
+{ title emphasize
+}
+
+FUNCTION {tie.or.space.connect}
+{ duplicate$ text.length$ #3 <
+ { "~" }
+ { " " }
+ if$
+ swap$ * *
+}
+
+FUNCTION {either.or.check}
+{ empty$
+ 'pop$
+ { "can't use both " swap$ * " fields in " * cite$ * warning$ }
+ if$
+}
+
+FUNCTION {format.bvolume}
+{ volume empty$
+ { "" }
+ { "volume" volume tie.or.space.connect
+ series empty$
+ 'skip$
+ { " of " * series emphasize * }
+ if$
+ "volume and number" number either.or.check
+ }
+ if$
+}
+
+FUNCTION {format.number.series}
+{ volume empty$
+ { number empty$
+ { series field.or.null }
+ { output.state mid.sentence =
+ { "number" }
+ { "Number" }
+ if$
+ number tie.or.space.connect
+ series empty$
+ { "there's a number but no series in " cite$ * warning$ }
+ { " in " * series * }
+ if$
+ }
+ if$
+ }
+ { "" }
+ if$
+}
+
+FUNCTION {format.edition}
+{ edition empty$
+ { "" }
+ { output.state mid.sentence =
+ { edition "l" change.case$ " edition" * }
+ { edition "t" change.case$ " edition" * }
+ if$
+ }
+ if$
+}
+
+INTEGERS { multiresult }
+
+FUNCTION {multi.page.check}
+{ 't :=
+ #0 'multiresult :=
+ { multiresult not
+ t empty$ not
+ and
+ }
+ { t #1 #1 substring$
+ duplicate$ "-" =
+ swap$ duplicate$ "," =
+ swap$ "+" =
+ or or
+ { #1 'multiresult := }
+ { t #2 global.max$ substring$ 't := }
+ if$
+ }
+ while$
+ multiresult
+}
+
+FUNCTION {format.pages}
+{ pages empty$
+ { "" }
+ { pages multi.page.check
+ { "pp.\ " pages n.dashify tie.or.space.connect }
+ { "pp.\ " pages tie.or.space.connect }
+ if$
+ }
+ if$
+}
+
+FUNCTION {format.eid}
+{ eid empty$
+ { "" }
+ { "art." eid tie.or.space.connect }
+ if$
+}
+
+FUNCTION {format.vol.num.pages}
+{ volume field.or.null
+ number empty$
+ 'skip$
+ { "\penalty0 (" number * ")" * *
+ volume empty$
+ { "there's a number but no volume in " cite$ * warning$ }
+ 'skip$
+ if$
+ }
+ if$
+ pages empty$
+ 'skip$
+ { duplicate$ empty$
+ { pop$ format.pages }
+ { ":\penalty0 " * pages n.dashify * }
+ if$
+ }
+ if$
+}
+
+FUNCTION {format.vol.num.eid}
+{ volume field.or.null
+ number empty$
+ 'skip$
+ { "\penalty0 (" number * ")" * *
+ volume empty$
+ { "there's a number but no volume in " cite$ * warning$ }
+ 'skip$
+ if$
+ }
+ if$
+ eid empty$
+ 'skip$
+ { duplicate$ empty$
+ { pop$ format.eid }
+ { ":\penalty0 " * eid * }
+ if$
+ }
+ if$
+}
+
+FUNCTION {format.chapter.pages}
+{ chapter empty$
+ 'format.pages
+ { type empty$
+ { "chapter" }
+ { type "l" change.case$ }
+ if$
+ chapter tie.or.space.connect
+ pages empty$
+ 'skip$
+ { ", " * format.pages * }
+ if$
+ }
+ if$
+}
+
+FUNCTION {format.in.ed.booktitle}
+{ booktitle empty$
+ { "" }
+ { editor empty$
+ { "In " booktitle emphasize * }
+ { "In " format.editors * ", " * booktitle emphasize * }
+ if$
+ }
+ if$
+}
+
+FUNCTION {empty.misc.check}
+{ author empty$ title empty$ howpublished empty$
+ month empty$ year empty$ note empty$
+ and and and and and
+ key empty$ not and
+ { "all relevant fields are empty in " cite$ * warning$ }
+ 'skip$
+ if$
+}
+
+FUNCTION {format.thesis.type}
+{ type empty$
+ 'skip$
+ { pop$
+ type "t" change.case$
+ }
+ if$
+}
+
+FUNCTION {format.tr.number}
+{ type empty$
+ { "Technical Report" }
+ 'type
+ if$
+ number empty$
+ { "t" change.case$ }
+ { number tie.or.space.connect }
+ if$
+}
+
+FUNCTION {format.article.crossref}
+{ key empty$
+ { journal empty$
+ { "need key or journal for " cite$ * " to crossref " * crossref *
+ warning$
+ ""
+ }
+ { "In \emph{" journal * "}" * }
+ if$
+ }
+ { "In " }
+ if$
+ " \citet{" * crossref * "}" *
+}
+
+FUNCTION {format.book.crossref}
+{ volume empty$
+ { "empty volume in " cite$ * "'s crossref of " * crossref * warning$
+ "In "
+ }
+ { "Volume" volume tie.or.space.connect
+ " of " *
+ }
+ if$
+ editor empty$
+ editor field.or.null author field.or.null =
+ or
+ { key empty$
+ { series empty$
+ { "need editor, key, or series for " cite$ * " to crossref " *
+ crossref * warning$
+ "" *
+ }
+ { "\emph{" * series * "}" * }
+ if$
+ }
+ 'skip$
+ if$
+ }
+ 'skip$
+ if$
+ " \citet{" * crossref * "}" *
+}
+
+FUNCTION {format.incoll.inproc.crossref}
+{ editor empty$
+ editor field.or.null author field.or.null =
+ or
+ { key empty$
+ { booktitle empty$
+ { "need editor, key, or booktitle for " cite$ * " to crossref " *
+ crossref * warning$
+ ""
+ }
+ { "In \emph{" booktitle * "}" * }
+ if$
+ }
+ { "In " }
+ if$
+ }
+ { "In " }
+ if$
+ " \citet{" * crossref * "}" *
+}
+
+FUNCTION {article}
+{ output.bibitem
+ format.authors "author" output.check
+ author format.key output
+ new.block
+ format.title "title" output.check
+ new.block
+ crossref missing$
+ { journal emphasize "journal" output.check
+ eid empty$
+ { format.vol.num.pages output }
+ { format.vol.num.eid output }
+ if$
+ format.date "year" output.check
+ }
+ { format.article.crossref output.nonnull
+ eid empty$
+ { format.pages output }
+ { format.eid output }
+ if$
+ }
+ if$
+ format.issn output
+ format.doi output
+ format.url output
+ new.block
+ note output
+ fin.entry
+}
+
+FUNCTION {book}
+{ output.bibitem
+ author empty$
+ { format.editors "author and editor" output.check
+ editor format.key output
+ }
+ { format.authors output.nonnull
+ crossref missing$
+ { "author and editor" editor either.or.check }
+ 'skip$
+ if$
+ }
+ if$
+ new.block
+ format.btitle "title" output.check
+ crossref missing$
+ { format.bvolume output
+ new.block
+ format.number.series output
+ new.sentence
+ publisher "publisher" output.check
+ address output
+ }
+ { new.block
+ format.book.crossref output.nonnull
+ }
+ if$
+ format.edition output
+ format.date "year" output.check
+ format.isbn output
+ format.doi output
+ format.url output
+ new.block
+ note output
+ fin.entry
+}
+
+FUNCTION {booklet}
+{ output.bibitem
+ format.authors output
+ author format.key output
+ new.block
+ format.title "title" output.check
+ howpublished address new.block.checkb
+ howpublished output
+ address output
+ format.date output
+ format.isbn output
+ format.doi output
+ format.url output
+ new.block
+ note output
+ fin.entry
+}
+
+FUNCTION {inbook}
+{ output.bibitem
+ author empty$
+ { format.editors "author and editor" output.check
+ editor format.key output
+ }
+ { format.authors output.nonnull
+ crossref missing$
+ { "author and editor" editor either.or.check }
+ 'skip$
+ if$
+ }
+ if$
+ new.block
+ format.btitle "title" output.check
+ crossref missing$
+ { format.bvolume output
+ format.chapter.pages "chapter and pages" output.check
+ new.block
+ format.number.series output
+ new.sentence
+ publisher "publisher" output.check
+ address output
+ }
+ { format.chapter.pages "chapter and pages" output.check
+ new.block
+ format.book.crossref output.nonnull
+ }
+ if$
+ format.edition output
+ format.date "year" output.check
+ format.isbn output
+ format.doi output
+ format.url output
+ new.block
+ note output
+ fin.entry
+}
+
+FUNCTION {incollection}
+{ output.bibitem
+ format.authors "author" output.check
+ author format.key output
+ new.block
+ format.title "title" output.check
+ new.block
+ crossref missing$
+ { format.in.ed.booktitle "booktitle" output.check
+ format.bvolume output
+ format.number.series output
+ format.chapter.pages output
+ new.sentence
+ publisher "publisher" output.check
+ address output
+ format.edition output
+ format.date "year" output.check
+ }
+ { format.incoll.inproc.crossref output.nonnull
+ format.chapter.pages output
+ }
+ if$
+ format.isbn output
+ format.doi output
+ format.url output
+ new.block
+ note output
+ fin.entry
+}
+
+FUNCTION {inproceedings}
+{ output.bibitem
+ format.authors "author" output.check
+ author format.key output
+ new.block
+ format.title "title" output.check
+ new.block
+ crossref missing$
+ { format.in.ed.booktitle "booktitle" output.check
+ format.bvolume output
+ format.number.series output
+ format.pages output
+ address empty$
+ { organization publisher new.sentence.checkb
+ organization output
+ publisher output
+ format.date "year" output.check
+ }
+ { address output.nonnull
+ format.date "year" output.check
+ new.sentence
+ organization output
+ publisher output
+ }
+ if$
+ }
+ { format.incoll.inproc.crossref output.nonnull
+ format.pages output
+ }
+ if$
+ format.isbn output
+ format.doi output
+ format.url output
+ new.block
+ note output
+ fin.entry
+}
+
+FUNCTION {conference} { inproceedings }
+
+FUNCTION {manual}
+{ output.bibitem
+ format.authors output
+ author format.key output
+ new.block
+ format.btitle "title" output.check
+ organization address new.block.checkb
+ organization output
+ address output
+ format.edition output
+ format.date output
+ format.url output
+ new.block
+ note output
+ fin.entry
+}
+
+FUNCTION {mastersthesis}
+{ output.bibitem
+ format.authors "author" output.check
+ author format.key output
+ new.block
+ format.title "title" output.check
+ new.block
+ "Master's thesis" format.thesis.type output.nonnull
+ school "school" output.check
+ address output
+ format.date "year" output.check
+ format.url output
+ new.block
+ note output
+ fin.entry
+}
+
+FUNCTION {misc}
+{ output.bibitem
+ format.authors output
+ author format.key output
+ title howpublished new.block.checkb
+ format.title output
+ howpublished new.block.checka
+ howpublished output
+ format.date output
+ format.issn output
+ format.url output
+ new.block
+ note output
+ fin.entry
+ empty.misc.check
+}
+
+FUNCTION {phdthesis}
+{ output.bibitem
+ format.authors "author" output.check
+ author format.key output
+ new.block
+ format.btitle "title" output.check
+ new.block
+ "PhD thesis" format.thesis.type output.nonnull
+ school "school" output.check
+ address output
+ format.date "year" output.check
+ format.url output
+ new.block
+ note output
+ fin.entry
+}
+
+FUNCTION {proceedings}
+{ output.bibitem
+ format.editors output
+ editor format.key output
+ new.block
+ format.btitle "title" output.check
+ format.bvolume output
+ format.number.series output
+ address output
+ format.date "year" output.check
+ new.sentence
+ organization output
+ publisher output
+ format.isbn output
+ format.doi output
+ format.url output
+ new.block
+ note output
+ fin.entry
+}
+
+FUNCTION {techreport}
+{ output.bibitem
+ format.authors "author" output.check
+ author format.key output
+ new.block
+ format.title "title" output.check
+ new.block
+ format.tr.number output.nonnull
+ institution "institution" output.check
+ address output
+ format.date "year" output.check
+ format.url output
+ new.block
+ note output
+ fin.entry
+}
+
+FUNCTION {unpublished}
+{ output.bibitem
+ format.authors "author" output.check
+ author format.key output
+ new.block
+ format.title "title" output.check
+ new.block
+ note "note" output.check
+ format.date output
+ format.url output
+ fin.entry
+}
+
+FUNCTION {default.type} { misc }
+
+
+MACRO {jan} {"January"}
+
+MACRO {feb} {"February"}
+
+MACRO {mar} {"March"}
+
+MACRO {apr} {"April"}
+
+MACRO {may} {"May"}
+
+MACRO {jun} {"June"}
+
+MACRO {jul} {"July"}
+
+MACRO {aug} {"August"}
+
+MACRO {sep} {"September"}
+
+MACRO {oct} {"October"}
+
+MACRO {nov} {"November"}
+
+MACRO {dec} {"December"}
+
+
+
+MACRO {acmcs} {"ACM Computing Surveys"}
+
+MACRO {acta} {"Acta Informatica"}
+
+MACRO {cacm} {"Communications of the ACM"}
+
+MACRO {ibmjrd} {"IBM Journal of Research and Development"}
+
+MACRO {ibmsj} {"IBM Systems Journal"}
+
+MACRO {ieeese} {"IEEE Transactions on Software Engineering"}
+
+MACRO {ieeetc} {"IEEE Transactions on Computers"}
+
+MACRO {ieeetcad}
+ {"IEEE Transactions on Computer-Aided Design of Integrated Circuits"}
+
+MACRO {ipl} {"Information Processing Letters"}
+
+MACRO {jacm} {"Journal of the ACM"}
+
+MACRO {jcss} {"Journal of Computer and System Sciences"}
+
+MACRO {scp} {"Science of Computer Programming"}
+
+MACRO {sicomp} {"SIAM Journal on Computing"}
+
+MACRO {tocs} {"ACM Transactions on Computer Systems"}
+
+MACRO {tods} {"ACM Transactions on Database Systems"}
+
+MACRO {tog} {"ACM Transactions on Graphics"}
+
+MACRO {toms} {"ACM Transactions on Mathematical Software"}
+
+MACRO {toois} {"ACM Transactions on Office Information Systems"}
+
+MACRO {toplas} {"ACM Transactions on Programming Languages and Systems"}
+
+MACRO {tcs} {"Theoretical Computer Science"}
+
+
+READ
+
+FUNCTION {sortify}
+{ purify$
+ "l" change.case$
+}
+
+INTEGERS { len }
+
+FUNCTION {chop.word}
+{ 's :=
+ 'len :=
+ s #1 len substring$ =
+ { s len #1 + global.max$ substring$ }
+ 's
+ if$
+}
+
+FUNCTION {format.lab.names}
+{ 's :=
+ s #1 "{vv~}{ll}" format.name$
+ s num.names$ duplicate$
+ #2 >
+ { pop$ " et~al." * }
+ { #2 <
+ 'skip$
+ { s #2 "{ff }{vv }{ll}{ jj}" format.name$ "others" =
+ { " et~al." * }
+ { " \& " * s #2 "{vv~}{ll}" format.name$ * }
+ if$
+ }
+ if$
+ }
+ if$
+}
+
+FUNCTION {author.key.label}
+{ author empty$
+ { key empty$
+ { cite$ #1 #3 substring$ }
+ 'key
+ if$
+ }
+ { author format.lab.names }
+ if$
+}
+
+FUNCTION {author.editor.key.label}
+{ author empty$
+ { editor empty$
+ { key empty$
+ { cite$ #1 #3 substring$ }
+ 'key
+ if$
+ }
+ { editor format.lab.names }
+ if$
+ }
+ { author format.lab.names }
+ if$
+}
+
+FUNCTION {author.key.organization.label}
+{ author empty$
+ { key empty$
+ { organization empty$
+ { cite$ #1 #3 substring$ }
+ { "The " #4 organization chop.word #3 text.prefix$ }
+ if$
+ }
+ 'key
+ if$
+ }
+ { author format.lab.names }
+ if$
+}
+
+FUNCTION {editor.key.organization.label}
+{ editor empty$
+ { key empty$
+ { organization empty$
+ { cite$ #1 #3 substring$ }
+ { "The " #4 organization chop.word #3 text.prefix$ }
+ if$
+ }
+ 'key
+ if$
+ }
+ { editor format.lab.names }
+ if$
+}
+
+FUNCTION {calc.short.authors}
+{ type$ "book" =
+ type$ "inbook" =
+ or
+ 'author.editor.key.label
+ { type$ "proceedings" =
+ 'editor.key.organization.label
+ { type$ "manual" =
+ 'author.key.organization.label
+ 'author.key.label
+ if$
+ }
+ if$
+ }
+ if$
+ 'short.list :=
+}
+
+FUNCTION {calc.label}
+{ calc.short.authors
+ short.list
+ "("
+ *
+ year duplicate$ empty$
+ short.list key field.or.null = or
+ { pop$ "" }
+ 'skip$
+ if$
+ *
+ 'label :=
+}
+
+FUNCTION {sort.format.names}
+{ 's :=
+ #1 'nameptr :=
+ ""
+ s num.names$ 'numnames :=
+ numnames 'namesleft :=
+ { namesleft #0 > }
+ {
+ s nameptr "{vv{ } }{ll{ }}{ ff{ }}{ jj{ }}" format.name$ 't :=
+ nameptr #1 >
+ {
+ " " *
+ namesleft #1 = t "others" = and
+ { "zzzzz" * }
+ { numnames #2 > nameptr #2 = and
+ { "zz" * year field.or.null * " " * }
+ 'skip$
+ if$
+ t sortify *
+ }
+ if$
+ }
+ { t sortify * }
+ if$
+ nameptr #1 + 'nameptr :=
+ namesleft #1 - 'namesleft :=
+ }
+ while$
+}
+
+FUNCTION {sort.format.title}
+{ 't :=
+ "A " #2
+ "An " #3
+ "The " #4 t chop.word
+ chop.word
+ chop.word
+ sortify
+ #1 global.max$ substring$
+}
+
+FUNCTION {author.sort}
+{ author empty$
+ { key empty$
+ { "to sort, need author or key in " cite$ * warning$
+ ""
+ }
+ { key sortify }
+ if$
+ }
+ { author sort.format.names }
+ if$
+}
+
+FUNCTION {author.editor.sort}
+{ author empty$
+ { editor empty$
+ { key empty$
+ { "to sort, need author, editor, or key in " cite$ * warning$
+ ""
+ }
+ { key sortify }
+ if$
+ }
+ { editor sort.format.names }
+ if$
+ }
+ { author sort.format.names }
+ if$
+}
+
+FUNCTION {author.organization.sort}
+{ author empty$
+ { organization empty$
+ { key empty$
+ { "to sort, need author, organization, or key in " cite$ * warning$
+ ""
+ }
+ { key sortify }
+ if$
+ }
+ { "The " #4 organization chop.word sortify }
+ if$
+ }
+ { author sort.format.names }
+ if$
+}
+
+FUNCTION {editor.organization.sort}
+{ editor empty$
+ { organization empty$
+ { key empty$
+ { "to sort, need editor, organization, or key in " cite$ * warning$
+ ""
+ }
+ { key sortify }
+ if$
+ }
+ { "The " #4 organization chop.word sortify }
+ if$
+ }
+ { editor sort.format.names }
+ if$
+}
+
+
+FUNCTION {presort}
+{ calc.label
+ label sortify
+ " "
+ *
+ type$ "book" =
+ type$ "inbook" =
+ or
+ 'author.editor.sort
+ { type$ "proceedings" =
+ 'editor.organization.sort
+ { type$ "manual" =
+ 'author.organization.sort
+ 'author.sort
+ if$
+ }
+ if$
+ }
+ if$
+ " "
+ *
+ year field.or.null sortify
+ *
+ " "
+ *
+ cite$
+ *
+ #1 entry.max$ substring$
+ 'sort.label :=
+ sort.label *
+ #1 entry.max$ substring$
+ 'sort.key$ :=
+}
+
+ITERATE {presort}
+
+SORT
+
+STRINGS { longest.label last.label next.extra }
+
+INTEGERS { longest.label.width last.extra.num number.label }
+
+FUNCTION {initialize.longest.label}
+{ "" 'longest.label :=
+ #0 int.to.chr$ 'last.label :=
+ "" 'next.extra :=
+ #0 'longest.label.width :=
+ #0 'last.extra.num :=
+ #0 'number.label :=
+}
+
+FUNCTION {forward.pass}
+{ last.label label =
+ { last.extra.num #1 + 'last.extra.num :=
+ last.extra.num int.to.chr$ 'extra.label :=
+ }
+ { "a" chr.to.int$ 'last.extra.num :=
+ "" 'extra.label :=
+ label 'last.label :=
+ }
+ if$
+ number.label #1 + 'number.label :=
+}
+
+FUNCTION {reverse.pass}
+{ next.extra "b" =
+ { "a" 'extra.label := }
+ 'skip$
+ if$
+ extra.label 'next.extra :=
+ extra.label
+ duplicate$ empty$
+ 'skip$
+ { "{\natexlab{" swap$ * "}}" * }
+ if$
+ 'extra.label :=
+ label extra.label * 'label :=
+}
+
+EXECUTE {initialize.longest.label}
+
+ITERATE {forward.pass}
+
+REVERSE {reverse.pass}
+
+FUNCTION {bib.sort.order}
+{ sort.label 'sort.key$ :=
+}
+
+ITERATE {bib.sort.order}
+
+SORT
+
+FUNCTION {begin.bib}
+{ preamble$ empty$
+ 'skip$
+ { preamble$ write$ newline$ }
+ if$
+ "\begin{thebibliography}{" number.label int.to.str$ * "}" *
+ write$ newline$
+ "\providecommand{\natexlab}[1]{#1}"
+ write$ newline$
+ "\providecommand{\url}[1]{\texttt{#1}}"
+ write$ newline$
+ "\expandafter\ifx\csname urlstyle\endcsname\relax"
+ write$ newline$
+ " \providecommand{\doi}[1]{doi: #1}\else"
+ write$ newline$
+ " \providecommand{\doi}{doi: \begingroup \urlstyle{rm}\Url}\fi"
+ write$ newline$
+}
+
+EXECUTE {begin.bib}
+
+EXECUTE {init.state.consts}
+
+ITERATE {call.type$}
+
+FUNCTION {end.bib}
+{ newline$
+ "\end{thebibliography}" write$ newline$
+}
+
+EXECUTE {end.bib}
diff --git a/papers/arxiv/v1/tmlr.sty b/papers/arxiv/v1/tmlr.sty
@@ -0,0 +1,206 @@
+%%%% TMLR Macros (LaTex)
+%%%% Adapted by Hugo Larochelle and Fabian Pedregosa from the
+%%%% ICLR stylefile Macros and borrowing from the JMLR Macros
+%%%% Style File
+
+%%%% Last edited, January 2021 by Chris J. Maddison
+%%%% Change font choice
+
+% Change the overall width of the page. If these parameters are
+% changed, they will require corresponding changes in the
+% maketitle section.
+%
+\usepackage{eso-pic} % used by \AddToShipoutPicture
+\RequirePackage{fancyhdr}
+\RequirePackage{natbib}
+
+\usepackage[T1]{fontenc}
+\usepackage{lmodern}
+
+% modification to natbib citations
+\setcitestyle{authoryear,round,citesep={;},aysep={,},yysep={;}}
+
+\renewcommand{\topfraction}{0.95} % let figure take up nearly whole page
+\renewcommand{\textfraction}{0.05} % let figure take up nearly whole page
+
+
+%%%%%%%% Options
+\newif\if@accepted\@acceptedfalse
+\DeclareOption{accepted}{%
+ \@acceptedtrue
+}
+% declare preprint option, which creates a preprint version ready for
+% upload to, e.g., arXiv
+\newif\if@preprint\@preprintfalse
+\DeclareOption{preprint}{
+ \@preprinttrue
+ \@acceptedtrue
+}
+
+\DeclareOption*{\PackageWarning{tmlr}{Unknown ‘\CurrentOption’}}
+\ProcessOptions\relax
+
+% Specify the dimensions of each page
+
+\setlength{\paperheight}{11in}
+\setlength{\paperwidth}{8.5in}
+
+
+\oddsidemargin 0in % Note \oddsidemargin = \evensidemargin
+\evensidemargin 0in
+\marginparwidth 0.07 true in
+%\marginparwidth 0.75 true in
+%\topmargin 0 true pt % Nominal distance from top of page to top of
+%\topmargin 0.125in
+\topmargin -0.625in
+\addtolength{\headsep}{0.25in}
+\textheight 9.0 true in % Height of text (including footnotes & figures)
+\textwidth 6.5 true in % Width of text line.
+\widowpenalty=10000
+\clubpenalty=10000
+
+
+% \thispagestyle{empty} \pagestyle{empty}
+\flushbottom \sloppy
+
+% We're never going to need a table of contents, so just flush it to
+% save space --- suggested by drstrip@sandia-2
+\def\addcontentsline#1#2#3{}
+
+% Title stuff, taken from deproc.
+\def\maketitle{\par
+\begingroup
+ \def\thefootnote{\fnsymbol{footnote}}
+ \def\@makefnmark{\hbox to 0pt{$^{\@thefnmark}$\hss}} % for perfect author
+ % name centering
+% The footnote-mark was overlapping the footnote-text,
+% added the following to fix this problem (MK)
+ \long\def\@makefntext##1{\parindent 1em\noindent
+ \hbox to1.8em{\hss $\m@th ^{\@thefnmark}$}##1}
+ \@maketitle \@thanks
+\endgroup
+\setcounter{footnote}{0}
+\let\maketitle\relax \let\@maketitle\relax
+\gdef\@thanks{}\gdef\@author{}\gdef\@title{}\let\thanks\relax}
+
+\newlength\aftertitskip \newlength\beforetitskip
+\newlength\interauthorskip \newlength\aftermaketitskip
+
+%% Changeable parameters.
+\setlength\aftertitskip{0.3in plus 0.2in minus 0.2in}
+\setlength\beforetitskip{0.05in plus 0.08in minus 0.08in}
+\setlength\interauthorskip{0.08in plus 0.1in minus 0.1in}
+\setlength\aftermaketitskip{0.3in plus 0.1in minus 0.1in}
+
+\def\@startauthor{\noindent \normalsize\bf}
+\def\@endauthor{}
+
+\def\addr{\small\it}%
+\def\email{\hfill\small\it}%
+\def\name{\normalsize\bf}%
+\def\name{\normalsize\bf}%
+\def\AND{\@endauthor\rm\hss \vskip \interauthorskip \@startauthor}
+
+% The toptitlebar has been raised to top-justify the first page
+
+\usepackage{fancyhdr}
+\pagestyle{fancy}
+\fancyhead{}
+
+\if@accepted
+ \if@preprint
+ \lhead{}
+ \else
+ \lhead{Published in Transactions on Machine Learning Research (\month/\year)}
+ \fi
+\else
+ \lhead{Under review as submission to TMLR}
+\fi
+
+% Title (includes both anonymized and non-anonymized versions)
+\def\@maketitle{\vbox{\hsize\textwidth
+%\linewidth\hsize \vskip 0.1in \toptitlebar \centering
+{\LARGE\bf\sffamily \@title\par}\vskip \aftertitskip
+%\bottomtitlebar % \vskip 0.1in % minus
+\if@accepted
+ \if@preprint
+ \@startauthor \@author \@endauthor
+ \else
+ \@startauthor \@author \\ \\ {\bf Reviewed on OpenReview:} \openreview \@endauthor
+ \fi
+\else
+ \@startauthor Anonymous authors\\Paper under double-blind review \@endauthor
+\fi
+\vskip 0.3in minus 0.1in}}
+
+\renewenvironment{abstract}{\vskip.075in\centerline{\large\bf\sffamily
+Abstract}\vspace{0.5ex}\begin{quote}}{\par\end{quote}\vskip 1ex}
+
+% sections with less space
+\def\section{\@startsection {section}{1}{\z@}{-2.0ex plus
+ -0.5ex minus -.2ex}{1.5ex plus 0.3ex
+minus0.2ex}{\large\bf\raggedright\sffamily}}
+
+\def\subsection{\@startsection{subsection}{2}{\z@}{-1.8ex plus
+-0.5ex minus -.2ex}{0.8ex plus .2ex}{\normalsize\bf\raggedright\sffamily}}
+\def\subsubsection{\@startsection{subsubsection}{3}{\z@}{-1.5ex
+plus -0.5ex minus -.2ex}{0.5ex plus
+.2ex}{\normalsize\bf\raggedright\sffamily}}
+\def\paragraph{\@startsection{paragraph}{4}{\z@}{1.5ex plus
+0.5ex minus .2ex}{-1em}{\normalsize\bf}}
+\def\subparagraph{\@startsection{subparagraph}{5}{\z@}{1.5ex plus
+ 0.5ex minus .2ex}{-1em}{\normalsize\bf}}
+\def\subsubsubsection{\vskip
+5pt{\noindent\normalsize\rm\raggedright\sffamily}}
+
+
+% Footnotes
+\footnotesep 6.65pt %
+\skip\footins 9pt plus 4pt minus 2pt
+\def\footnoterule{\kern-3pt \hrule width 12pc \kern 2.6pt }
+\setcounter{footnote}{0}
+
+% Lists and paragraphs
+\parindent 0pt
+\topsep 4pt plus 1pt minus 2pt
+\partopsep 1pt plus 0.5pt minus 0.5pt
+\itemsep 2pt plus 1pt minus 0.5pt
+\parsep 2pt plus 1pt minus 0.5pt
+\parskip .5pc
+
+
+%\leftmargin2em
+\leftmargin3pc
+\leftmargini\leftmargin \leftmarginii 2em
+\leftmarginiii 1.5em \leftmarginiv 1.0em \leftmarginv .5em
+
+%\labelsep \labelsep 5pt
+
+\def\@listi{\leftmargin\leftmargini}
+\def\@listii{\leftmargin\leftmarginii
+ \labelwidth\leftmarginii\advance\labelwidth-\labelsep
+ \topsep 2pt plus 1pt minus 0.5pt
+ \parsep 1pt plus 0.5pt minus 0.5pt
+ \itemsep \parsep}
+\def\@listiii{\leftmargin\leftmarginiii
+ \labelwidth\leftmarginiii\advance\labelwidth-\labelsep
+ \topsep 1pt plus 0.5pt minus 0.5pt
+ \parsep \z@ \partopsep 0.5pt plus 0pt minus 0.5pt
+ \itemsep \topsep}
+\def\@listiv{\leftmargin\leftmarginiv
+ \labelwidth\leftmarginiv\advance\labelwidth-\labelsep}
+\def\@listv{\leftmargin\leftmarginv
+ \labelwidth\leftmarginv\advance\labelwidth-\labelsep}
+\def\@listvi{\leftmargin\leftmarginvi
+ \labelwidth\leftmarginvi\advance\labelwidth-\labelsep}
+
+\abovedisplayskip 7pt plus2pt minus5pt%
+\belowdisplayskip \abovedisplayskip
+\abovedisplayshortskip 0pt plus3pt%
+\belowdisplayshortskip 4pt plus3pt minus3pt%
+
+
+\def\toptitlebar{\hrule height4pt\vskip .25in\vskip-\parskip}
+
+\def\bottomtitlebar{\vskip .29in\vskip-\parskip\hrule height1pt\vskip
+.09in} %