cart-elc

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs | README | LICENSE

commit af5ed06408451e386fc2c14d47a5732252523704
parent 109df438b78ef299e6937f869f2ed3250b8954d8
Author: AndrewLockVI <andrew@laack.co>
Date:   Fri,  9 May 2025 09:58:18 -0500

Added all uws presentation materials.

Diffstat:
Apresentations/uws/presentation.pdf | 0
Apresentations/uws/presentation.tex | 451+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Apresentations/uws/python-scripts/combinationGraphDataGen.py | 28++++++++++++++++++++++++++++
Apresentations/uws/python-scripts/correlatedDataGraph.py | 100+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Apresentations/uws/python-scripts/dia3.py | 50++++++++++++++++++++++++++++++++++++++++++++++++++
Apresentations/uws/python-scripts/dia4.py | 54++++++++++++++++++++++++++++++++++++++++++++++++++++++
Apresentations/uws/python-scripts/diabetes.py | 62++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Apresentations/uws/python-scripts/genBetter.py | 30++++++++++++++++++++++++++++++
Apresentations/uws/python-scripts/genNew.py | 22++++++++++++++++++++++
Apresentations/uws/python-scripts/generate2D.py | 86+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Apresentations/uws/python-scripts/generateDiabetes.py | 17+++++++++++++++++
Apresentations/uws/python-scripts/main.py | 163+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Apresentations/uws/references.bib | 199+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
13 files changed, 1262 insertions(+), 0 deletions(-)

diff --git a/presentations/uws/presentation.pdf b/presentations/uws/presentation.pdf Binary files differ. diff --git a/presentations/uws/presentation.tex b/presentations/uws/presentation.tex @@ -0,0 +1,451 @@ +\documentclass[10pt]{beamer} +\usepackage{booktabs} + +\usepackage{animate} +\usepackage{float} +\usecolortheme[snowy]{owl} +\usepackage{listings} +\usepackage{graphicx} % Load graphicx package for image handling +\usepackage{natbib} + +\usepackage{algorithm} +\usepackage{algpseudocode} + +\title{CART-ELC: Oblique Decision Tree \\ Induction via Exhaustive Search} +\author{Andrew D. Laack} +\institute{University of Wisconsin-Superior} +\date{May 13, 2025} + +\usepackage{xcolor} % Ensure xcolor is loaded + +\usepackage{float} +\usepackage{pgfplots} +\pgfplotsset{compat=1.17} % or another compatible version for arXiv + +\usepackage{ifthen} +\usepackage{multirow} +\usepackage{booktabs} + +\newboolean{isFinal} +\setboolean{isFinal}{false} + +\setbeamercolor{section in toc}{fg=white} % Section titles in white +\setbeamercolor{subsection in toc}{fg=lightgray} + +\renewcommand{\thealgorithm}{\arabic{algorithm}:} + +\renewcommand{\alglinenumber}[1]{\tiny\color{white}\@#1.} + +\usepackage{url} +\usepackage{hyperref} % Hides ugly link borders + + +\begin{document} +\section{} + + +\begin{frame} +\titlepage + +\footnotesize +Source Available: + +\texttt{https://github.com/andrewlaack/cart-elc} + +\end{frame} + +\begin{frame} + \frametitle{Problem} + How can we determine if someone has diabetes based on their BMI? +\end{frame} + +\begin{frame} + \frametitle{Visualization} + \begin{figure}[h] + \centering + \includegraphics[scale=.4]{images/dia1.pdf} + \caption{Diabetes Dataset \citep{diabetes} Graph} + \end{figure} +\end{frame} + +\begin{frame} + \frametitle{Solution} + \begin{figure}[h] + \centering + \includegraphics[scale=.4]{images/dia2.pdf} + \caption{Diabetes Dataset Graph w/ Boundary} + \end{figure} +\end{frame} + +\begin{frame} + \frametitle{Tree} + \begin{figure}[h] + \centering + \includegraphics[scale=.3]{images/tree1.pdf} + \caption{Tree Representation} + \end{figure} +\end{frame} + + +\begin{frame} + \frametitle{But...} + 62.2\% Accuracy + \begin{figure}[h] + \centering + \includegraphics[scale=.4]{images/dia3.pdf} + \caption{Full Diabetes Dataset Graph w/ Boundary} + \end{figure} +\end{frame} + + + + +% not density, mass + +\begin{frame} + \frametitle{Extra Feature} + \begin{figure}[h] + \centering + \includegraphics[scale=.4]{images/dia4.pdf} + \caption{Diabetes Dataset w/ Glucose (a bit complicated)} + \end{figure} +\end{frame} + +\begin{frame} + \frametitle{Pseudocode} + + \begin{algorithm}[H] + \tiny + \caption{CART Algorithm \citep{breiman}} + \begin{algorithmic}[1] + \Function{fit}{samples, labels, featureCount} + \If{homogeneous(labels)} + \State \Return Node(majorityClass(labels)) + \EndIf + \State bestSplit, bestSplittingScore $\gets$ None, worstSplittingScore() + \For{sample in samples} + \For{feature in range(0,featureCount)} + \State currentSplit $\gets$ (feature, sample[feature]) + \State currentSplittingScore $\gets$ evaluateSplit(currentSplit, samples) + \If{ isBetterThan(currentSplittingScore, bestSplittingScore)} + \State bestSplittingScore, bestSplit $\gets$ currentSplittingScore, currentSplit + \EndIf + \EndFor + \EndFor + \State left, right $\gets$ splitDataByBestSplit(samples, labels, bestSplit) + \If{left is empty or right is empty} + \State \Return Node(majorityClass(labels)) + \EndIf + \State leftSubtree $\gets$ fit(left.samples, left.labels, featureCount) + \State rightSubtree $\gets$ fit(right.samples, right.labels, featureCount) + \State tree $\gets$ Node(bestSplit) + \State tree.left, tree.right $\gets$ leftSubtree, rightSubtree + \State \Return tree + \EndFunction + \end{algorithmic} + \end{algorithm} + +\end{frame} + +\begin{frame} + \frametitle{Quantification of Goodness (Splitting Criterion)} + + \begin{equation*} + G = p_L \left(1 - \sum_{j\in C} p_{Lj}^2 \right) + p_R \left(1 - \sum_{j \in C} p_{Rj}^2 \right) + \label{gini_equ} + \end{equation*} + +\end{frame} + + +% DEPTH: 1 - 0.7356770833333334 +% DEPTH: 2 - 0.7721354166666666 +% DEPTH: 3 - 0.7734375 +% DEPTH: 4 - 0.78515625 + +\begin{frame} + \frametitle{Boundary (Depth = 1)} + \begin{figure}[h] + \centering + \includegraphics[scale=.35]{images/images/diabetes_tree_bmi_glucose_depth_1.png} + \caption{73.6\% Accuracy} + \end{figure} +\end{frame} + +\begin{frame} + \frametitle{Boundary (Depth = 2)} + \begin{figure}[h] + \centering + \includegraphics[scale=.35]{images/images/diabetes_tree_bmi_glucose_depth_2.png} + \caption{77.2\% Accuracy} + \end{figure} +\end{frame} + + +\begin{frame} + \frametitle{Boundary (Depth = 3)} + \begin{figure}[h] + \centering + \includegraphics[scale=.35]{images/images/diabetes_tree_bmi_glucose_depth_3.png} + \caption{77.3\% Accuracy} + \end{figure} +\end{frame} + +\begin{frame} + \frametitle{Boundary (Depth = 4)} + \begin{figure}[h] + \centering + \includegraphics[scale=.35]{images/images/diabetes_tree_bmi_glucose_depth_4.png} + \caption{78.5\% Accuracy} + \end{figure} +\end{frame} + +\begin{frame} + \frametitle{Tree (Depth = 4)} + \begin{figure}[h] + \centering + \includegraphics[scale=.225]{images/tree2.pdf} + \caption{Tree Representation} + \end{figure} +\end{frame} + +\begin{frame} + \frametitle{New Problem} + \begin{figure}[h] + \centering + \includegraphics[scale=.25]{correlation/original_graph.png} + \caption{Cancer Diagnosis (Synthetic)} + \end{figure} +\end{frame} + +\begin{frame} + \frametitle{Boundary (Depth = 1)} + \begin{figure}[h] + \centering + \includegraphics[scale=.25]{correlation/images/decision_tree_depth_1.png} + \caption{Cancer Diagnosis (Synthetic)} + \end{figure} +\end{frame} + +\begin{frame} + \frametitle{Boundary (Depth = 5)} + \begin{figure}[h] + \centering + \includegraphics[scale=.25]{correlation/images/decision_tree_depth_5.png} + \caption{Cancer Diagnosis (Synthetic)} + \end{figure} +\end{frame} + + +\begin{frame} + \frametitle{Boundary (Depth = 10)} + \begin{figure}[h] + \centering + \includegraphics[scale=.25]{correlation/images/decision_tree_depth_10.png} + \caption{Cancer Diagnosis (Synthetic)} + \end{figure} +\end{frame} + +\begin{frame} + \frametitle{Boundary (Depth = 20)} + \begin{figure}[h] + \centering + \includegraphics[scale=.25]{correlation/images/decision_tree_depth_20.png} + \caption{Cancer Diagnosis (Synthetic)} + \end{figure} +\end{frame} + +\begin{frame} + \frametitle{Tree (Depth = 20)} + \begin{figure}[h] + \centering + \includegraphics[scale=.04]{correlation/tree3.pdf} + \caption{Tree Visualization} + \end{figure} +\end{frame} + +\begin{frame} + \frametitle{Solution} + \begin{figure}[h] + \centering + \includegraphics[scale=.25]{correlation/oblique.png} + \caption{Oblique Split} + \end{figure} +\end{frame} + +\begin{frame} + \frametitle{How?} + \begin{enumerate} + \item CART-LC \citep{breiman} + \item OC1 \citep{oc1} + \item HHCART \citep{hhcart} + \item CART-ELC \citep{laack} + \end{enumerate} +\end{frame} + +\begin{frame} + \frametitle{Pseudocode} + + \begin{algorithm}[H] + \tiny + \caption{CART-ELC} + \begin{algorithmic}[1] + \Function{fit}{samples, labels, $m$, $r$} + \If{homogeneous(labels)} + \State \Return Node(majorityClass(labels)) + \EndIf + \State bestSplit, bestSplittingScore $\gets$ None, worstSplittingScore() + \For{selectedSamples in combinations(samples, $r$)} + \For{selectedFeatures in combinations($m$, $r$)} + \State vectorsToPassThrough $\gets$ featureSubset(selectedSamples, selectedFeatures) + \State currentSplit $\gets$ findHyperplanePassingThrough(vectorsToPassThrough) + \State currentSplittingScore $\gets$ evaluateSplit(currentSplit, samples) + \If{ isBetterThan(currentSplittingScore, bestSplittingScore)} + \State bestSplittingScore, bestSplit $\gets$ currentSplittingScore, currentSplit + \EndIf + \EndFor + \EndFor + \State left, right $\gets$ splitDataByBestSplit(samples, labels, bestSplit) + \If{left is empty or right is empty} + \State \Return Node(majorityClass(labels)) + \EndIf + \State leftSubtree $\gets$ fit(left.samples, left.labels, $m$, $r$) + \State rightSubtree $\gets$ fit(right.samples, right.labels, $m$, $r$) + \State tree $\gets$ Node(bestSplit) + \State tree.left, tree.right $\gets$ leftSubtree, rightSubtree + \State \Return tree + \EndFunction + \end{algorithmic} + \end{algorithm} +\end{frame} + +\begin{frame} + \frametitle{Asymptotic Time Complexity per Split (CART-ELC)} +\[ + \Theta\left( \binom{n}{r} \cdot \binom{m}{r} \cdot r(r^2 + n) \right) +\] + +\end{frame} + +\begin{frame} + \frametitle{Operations for a Single Split (CART-ELC)} + +\begin{table}[h] + \centering + \footnotesize + \caption{Operations for single split assuming multiplicative factor of one, no additive constants, and $r = m$.} + \begin{tabular}{lcccccc} + \addlinespace + \toprule + \multirow{2}{*}{r} & \multicolumn{6}{c}{n} \\ + \cmidrule(lr){2-7} + & 100 & 500 & 1000 & 5000 & 10000 & 20000 \\ + \midrule + 1 & 1.01e+04 & 2.50e+05 & 1.00e+06 & 2.50e+07 & 1.00e+08 & 4.00e+08 \\ + 2 & 1.03e+06 & 1.26e+08 & 1.00e+09 & 1.25e+11 & 1.00e+12 & 8.00e+12 \\ + 3 & 5.29e+07 & 3.16e+10 & 5.03e+11 & 3.13e+14 & 5.00e+15 & 8.00e+16 \\ + 4 & 1.82e+09 & 5.31e+12 & 1.68e+14 & 5.22e+17 & 1.67e+19 & 5.34e+20 \\ + 5 & 4.71e+10 & 6.70e+14 & 4.23e+16 & 6.53e+20 & 4.17e+22 & 2.67e+24 \\ + 6 & 9.73e+11 & 6.77e+16 & 8.50e+18 & 6.54e+23 & 8.35e+25 & 1.07e+28 \\ + 7 & 1.67e+13 & 5.71e+18 & 1.43e+21 & 5.46e+26 & 1.39e+29 & 3.56e+31 \\ + 8 & 2.44e+14 & 4.13e+20 & 2.05e+23 & 3.90e+29 & 1.99e+32 & 1.02e+35 \\ + 9 & 3.10e+15 & 2.62e+22 & 2.59e+25 & 2.44e+32 & 2.49e+35 & 2.55e+38 \\ + 10 & 3.46e+16 & 1.47e+24 & 2.90e+27 & 1.36e+35 & 2.77e+38 & 5.66e+41 \\ + \bottomrule + \end{tabular} + \label{fig:operations} +\end{table} +\end{frame} + +\begin{frame} + \frametitle{Empirical Comparison} + \tiny + +\begin{table}[h] + \centering + \caption{Accuracy and tree size comparison across decision tree induction algorithms.} + \begin{tabular}{lcccccc} + \addlinespace + \toprule + \multirow{2}{*}{Algorithm} & \multicolumn{6}{c}{Accuracy} \\ + \cmidrule(lr){2-7} + & S/G Bright & S/G Dim & Cancer & Iris & Housing & Diabetes \\ + \midrule + CART-ELC & \textbf{98.9 ± 0.2} & \textbf{95.2 ± 0.5} & 96.3 ± 0.4 & 95.1 ± 0.8 & 83.5 ± 0.7 & \textbf{74.5 ± 1.3} \\ + + HHCART(A) & 98.3 ± 0.5 & 93.7 ± 0.8 & \textbf{96.9 ± 0.3} & \textbf{95.5 ± 1.4} & \textbf{83.9 ± 0.8} & 73.2 ± 1.2 \\ + HHCART(D) & 98.1 ± 0.4 & 93.7 ± 0.9 & \textbf{96.9 ± 0.3} & 94.3 ± 1.5 & 82.2 ± 1.4 & 73.2 ± 1.2 \\ + OC1 & \textbf{98.9 ± 0.2} & 95.0 ± 0.3 & 96.2 ± 0.3 & 94.7 ± 3.1 & 82.4 ± 0.8 & 74.4 ± 1.0 \\ + OC1-AP & 98.1 ± 0.2 & 94.0 ± 0.2 & 94.5 ± 0.5 & 92.7 ± 2.4 & 81.8 ± 1.0 & 73.8 ± 1.0 \\ + CART-LC & 98.8 ± 0.2 & 92.8 ± 0.5 & 95.3 ± 0.6 & 93.5 ± 2.9 & 81.4 ± 1.2 & 73.7 ± 1.2 \\ + CART-AP & 98.5 ± 0.5 & 94.2 ± 0.7 & 95.0 ± 1.6 & 93.8 ± 3.7 & 82.1 ± 3.5 & 73.9 ± 3.4 \\ + C4.5 & 98.5 ± 0.5 & 93.3 ± 0.8 & 95.3 ± 2.0 & 95.1 ± 3.2 & 83.2 ± 3.1 & 71.4 ± 3.3 \\ + \midrule + \multirow{2}{*}{Algorithm} & \multicolumn{6}{c}{Tree Size} \\ + \cmidrule(lr){2-7} + & S/G Bright & S/G Dim & Cancer & Iris & Housing & Diabetes \\ + \midrule + CART-ELC & \textbf{3.7 ± 0.2} & \textbf{9.8 ± 4.2} & \textbf{2.0 ± 0.0} & 4.8 ± 0.1 & \textbf{4.0 ± 0.0} & \textbf{4.0 ± 0.0} \\ + HHCART(A) & 6.1 ± 0.3 & 14.6 ± 4.8 & \textbf{2.0 ± 0.0} & \textbf{3.1 ± 0.1} & 7.8 ± 0.2 & \textbf{4.0 ± 0.0} \\ + HHCART(D) & 6.3 ± 0.4 & 14.9 ± 5.0 & \textbf{2.0 ± 0.0} & 4.7 ± 0.1 & 23.3 ± 0.8 & \textbf{4.0 ± 0.0} \\ + OC1 & 4.3 ± 1.0 & 13.0 ± 8.7 & 2.8 ± 0.9 & \textbf{3.1 ± 0.2} & 6.9 ± 3.2 & 5.4 ± 3.8 \\ + OC1-AP & 6.9 ± 2.4 & 29.3 ± 8.8 & 6.4 ± 1.7 & 3.2 ± 0.3 & 8.6 ± 4.5 & 11.4 ± 7.5 \\ + CART-LC & 3.9 ± 1.3 & 24.2 ± 8.7 & 3.5 ± 0.9 & 3.2 ± 0.3 & 5.8 ± 3.2 & 8.0 ± 5.2 \\ + CART-AP & 13.9 ± 5.7 & 30.4 ± 10.0 & 11.5 ± 7.2 & 4.3 ± 1.6 & 15.1 ± 10 & 11.5 ± 9.1 \\ + C4.5 & 14.3 ± 2.2 & 77.9 ± 7.4 & 9.8 ± 2.2 & 4.6 ± 0.8 & 28.2 ± 3.3 & 56.3 ± 7.9 \\ + \bottomrule + \end{tabular} + \label{fig:results} +\end{table} +\end{frame} + +\begin{frame} + \frametitle{Cohen's d} + \tiny + +\begin{table}[h] + \centering + \caption{Cohen's d effect size for accuracies between various models and CART-ELC. Comparisons with $p < 0.05$ are bolded.} + \begin{tabular}{lccccccc} + \addlinespace + \toprule + \multirow{1}{*}{Algorithm} & \multirow{1}{*}{S/G Bright} & \multirow{1}{*}{S/G Dim} & \multirow{1}{*}{Cancer} & \multirow{1}{*}{Iris} & \multirow{1}{*}{Housing} & \multirow{1}{*}{Diabetes} \\ + \midrule + + HHCART(A) & \textbf{1.576} & \textbf{2.249} & \textbf{-1.697} & -0.351 & -0.532 & \textbf{1.039} \\ + HHCART(D) & \textbf{2.530} & \textbf{2.060} & \textbf{-1.697} & 0.666 & \textbf{1.175} & \textbf{1.039} \\ + OC1 & 0.000 & 0.485 & 0.283 & 0.177 & \textbf{1.463} & 0.086 \\ + CART-LC & 0.500 & \textbf{4.800} & \textbf{1.961} & 0.752 & \textbf{2.138} & 0.639 \\ + + OC1-AP & \textbf{4.000} & \textbf{3.151} & \textbf{3.976} & \textbf{1.342} & \textbf{1.970} & 0.604 \\ + CART-AP & \textbf{1.050} & \textbf{1.644} & \textbf{1.115} & 0.486 & 0.555 & 0.233 \\ + C4.5 & \textbf{1.050} & \textbf{2.848} & 0.693 & 0.000 & 0.133 & \textbf{1.236} \\ + \bottomrule +\end{tabular} +\label{fig:cohens_d} +\end{table} +\end{frame} + +\begin{frame} + \frametitle{Future Directions} + + \begin{enumerate} + \item Smaller Subsets of Candidates + \begin{enumerate} + \item Active Sampling + \item Feature Selection + \end{enumerate} + + \item Random Forests \citep{rndfrst} + \item Stochastic Gradient Boosting \citep{sgboost} + \end{enumerate} + +\end{frame} + +\begin{frame} + \frametitle{References} +\bibliographystyle{plain} +\bibliography{references} +\end{frame} + +\end{document} diff --git a/presentations/uws/python-scripts/combinationGraphDataGen.py b/presentations/uws/python-scripts/combinationGraphDataGen.py @@ -0,0 +1,28 @@ +import math + +nVals = [100, 200, 300, 400, 500] +rVals = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10] + +results = [] + +for r in rVals: + results.append([]) + for n in nVals: + # With m = r, math.comb(m, r) = math.comb(r, r) = 1. + # Therefore, the complexity is: + # math.comb(n, r) * (r^3 + r * n) + result = math.comb(n, r) * (r**3 + r*n) + results[-1].append(str(result)) + +for res in rVals: + if res == 0: + continue # Skip r = 0 since it gives 0 instructions. + print(str(res) + " & ", end="") + itr = 1 + for r_val in results[res]: + if itr == len(nVals): + print(r_val + " \\\\") + else: + print(r_val + " &", end=" ") + itr += 1 + diff --git a/presentations/uws/python-scripts/correlatedDataGraph.py b/presentations/uws/python-scripts/correlatedDataGraph.py @@ -0,0 +1,100 @@ +import numpy as np +import matplotlib.pyplot as plt +from sklearn.tree import DecisionTreeClassifier, plot_tree +import os + +# Create directory for output images if not exists +os.makedirs("images", exist_ok=True) + +# Dark mode settings using matplotlib only +# plt.rcParams.update({ +# "axes.facecolor": "#000000", +# "axes.edgecolor": "#333333", +# "figure.facecolor": "#000000", +# "savefig.facecolor": "#000000", +# "text.color": "white", +# "axes.labelcolor": "white", +# "xtick.color": "white", +# "ytick.color": "white", +# "grid.color": "gray", +# "axes.grid": True +# }) + +ARRSIZE = 500 + +# Generate data for two classes +X_red = np.random.rand(ARRSIZE) * 10 +X_blue = np.random.rand(ARRSIZE) * 10 + +y_red = X_red / 1.5 + np.random.normal(0, 0.5, ARRSIZE) +y_blue = X_blue / 1.5 + np.random.normal(0, 0.5, ARRSIZE) +y_blue += 1.5 # Shift blue class upward + +# Combine the data +X_combined = np.concatenate([X_red, X_blue]) +y_combined = np.concatenate([y_red, y_blue]) +labelsRed = np.zeros(ARRSIZE) +labelsBlue = np.ones(ARRSIZE) +labels = np.concatenate([labelsRed, labelsBlue]) + +# Prepare the feature matrix for training +data = np.column_stack((X_combined, y_combined)) + +# Create meshgrid for decision boundaries +xx, yy = np.meshgrid(np.linspace(0, 10, 500), np.linspace(0, 10, 500)) +grid = np.c_[xx.ravel(), yy.ravel()] + +# Save the original scatter plot +plt.figure(figsize=(16, 9)) +plt.scatter(X_combined, y_combined, c=labels, cmap='bwr', s=60, edgecolors='#000000') +plt.xlim(0, 10) +plt.ylim(0, 10) +plt.xlabel('Tumor Size') +plt.ylabel('Mass') +plt.tight_layout() +plt.savefig("original_graph.png", dpi=300, bbox_inches='tight') +plt.close() + +# Loop through depths +for depth in range(1, 21): + clf = DecisionTreeClassifier(max_depth=depth, random_state=42) + clf.fit(data, labels) + + Z = clf.predict_proba(grid)[:, 1].reshape(xx.shape) + + plt.figure(figsize=(16, 9)) + + plt.contourf(xx, yy, Z, levels=[0, 0.5, 1], colors=['#0000FF', '#FF0000'], alpha=0.3) + plt.contour(xx, yy, Z, levels=[0.5], colors='#000000', linewidths=2) + + plt.scatter(X_combined, y_combined, c=labels, cmap='bwr', s=60, edgecolors='#000000') + + plt.xlim(0, 10) + plt.ylim(0, 10) + plt.xlabel('Tumor Size') + plt.ylabel('Mass') + plt.tight_layout() + plt.savefig(f"images/decision_tree_depth_{depth}.png", dpi=300, bbox_inches='tight') + plt.close() + + if depth == 20: + fig, ax = plt.subplots(figsize=(90, 90)) + + # Set dark background + + # Plot the tree without default fill + plot_tree( + clf, + ax=ax, + filled=True, + feature_names=["Tumor Size", "Mass"], + class_names=["Red", "Blue"], + impurity=False, + proportion=False, + rounded=True, + fontsize=10 # Use readable size, adjust as needed + ) + + # Save with dark facecolor + plt.savefig("tree3.pdf", dpi=300, bbox_inches='tight', facecolor=fig.get_facecolor()) + plt.close() diff --git a/presentations/uws/python-scripts/dia3.py b/presentations/uws/python-scripts/dia3.py @@ -0,0 +1,50 @@ +import pandas as pd +import numpy as np +import matplotlib.pyplot as plt + +# Dark mode settings using matplotlib only +# plt.rcParams.update({ +# "axes.facecolor": "#000000", +# "axes.edgecolor": "#333333", +# "figure.facecolor": "#000000", +# "savefig.facecolor": "#000000", +# "text.color": "white", +# "axes.labelcolor": "white", +# "xtick.color": "white", +# "ytick.color": "white", +# "grid.color": "gray", +# "axes.grid": True +# }) + +df = pd.read_csv('./diabetes.csv') +X = df['BMI'].to_numpy() +y = df['Outcome'].to_numpy() + +# Create a scatter plot using matplotlib only +plt.figure(figsize=(10, 6)) +plt.scatter(X, y, c=y, cmap='bwr', s=40, edgecolors='#000000') + +plt.xlabel("BMI") +plt.ylabel("Diagnosis") + +plt.axvspan(0, 32.25, color='blue', alpha=0.3) # Light blue to the left +plt.axvspan(32.25, 50, color='red', alpha=0.3) # Light red to the right + +plt.axvline(32.25, color="black", linestyle="-") + + +plt.xlim([0, 50]) +plt.tight_layout() + +# Save the figure instead of displaying it +plt.savefig("dia3.pdf") + +plt.close() # Closes the plot so it doesn't display or use memory + +# Calculate classification accuracy based on threshold +wrong = 0 +for i in range(len(X)): + if (y[i] == 1 and X[i] < 32.25) or (y[i] == 0 and X[i] > 32.25): + wrong += 1 + +print((len(X) - wrong) / len(X)) diff --git a/presentations/uws/python-scripts/dia4.py b/presentations/uws/python-scripts/dia4.py @@ -0,0 +1,54 @@ +import pandas as pd +import matplotlib.pyplot as plt +import numpy as np +from sklearn import tree +from sklearn import metrics + +# Dark mode settings using matplotlib only +# plt.rcParams.update({ +# "axes.facecolor": "#000000", +# "axes.edgecolor": "#333333", +# "figure.facecolor": "#000000", +# "savefig.facecolor": "#000000", +# "text.color": "white", +# "axes.labelcolor": "white", +# "xtick.color": "white", +# "ytick.color": "white", +# "grid.color": "gray", +# "axes.grid": True +# }) + +df = pd.read_csv('./diabetes.csv') +X = df['BMI'] +y = df['Glucose'] +color = df['Outcome'] + +xSub = [] +ySub = [] +X = X.to_numpy() +y = y.to_numpy() + +# Create scatter plot with dark styling +plt.figure(figsize=(10, 6)) +plt.scatter(X, y, c=color, cmap='bwr', s=60, edgecolors="#000000") +plt.xlabel("BMI") +plt.ylabel("Glucose") +plt.xlim([0, 50]) +plt.tight_layout() + +# Save the graph as dia4.pdf +plt.savefig("dia4.pdf") +plt.close() + +# Decision Tree for prediction +targ = [] + +for i in range(0, len(X)): + targ.append([X[i], y[i]]) + +dt = tree.DecisionTreeClassifier(max_depth=5) +dt.fit(targ, color) +preds = dt.predict(targ) + +# Print the accuracy score +print(metrics.accuracy_score(y_pred=preds, y_true=color)) diff --git a/presentations/uws/python-scripts/diabetes.py b/presentations/uws/python-scripts/diabetes.py @@ -0,0 +1,62 @@ +import pandas as pd +import matplotlib.pyplot as plt +import numpy as np +from sklearn.tree import DecisionTreeClassifier, plot_tree + + +df = pd.read_csv('./diabetes.csv') +X = df['BMI'] +y = df['Outcome'] + +xSub = [] +ySub = [] +X = X.to_numpy() +y = y.to_numpy() + +for i in range(60, 90): + if y[i] == 0: + if X[i] > 32: + continue + xSub.append(X[i]) + ySub.append(y[i]) + +xSub = np.array(xSub).reshape(-1, 1) +ySub = np.array(ySub).reshape(-1, 1) + +clf = DecisionTreeClassifier(max_depth=1) +clf.fit(xSub, ySub) + +# Save decision tree with dark background +fig, ax = plt.subplots(figsize=(8, 6)) +plot_tree(clf, filled=True, feature_names=["BMI"], class_names=["No Diabetes", "Diabetes"], ax=ax) +plt.tight_layout() +plt.savefig("tree1.pdf") +plt.close() + +# Save first scatter plot (dia1) +plt.figure(figsize=(10, 6)) +plt.scatter(xSub, ySub, c=ySub, cmap='bwr', s=60, edgecolors='#000000') +plt.xlabel("BMI") +plt.ylabel("Diagnosis") +plt.xlim([0, 50]) +plt.tight_layout() +plt.savefig("dia1.pdf") +plt.close() + +# Save second scatter plot with vertical line at 32.25 (dia2) +plt.figure(figsize=(10, 6)) +plt.scatter(xSub, ySub, c=ySub, cmap='bwr', s=60, edgecolors="#000000") + +# Add shaded regions +plt.axvspan(0, 32.25, color='blue', alpha=0.3) # Light blue to the left +plt.axvspan(32.25, 50, color='red', alpha=0.3) # Light red to the right + +# Vertical decision boundary +plt.axvline(32.25, color="black", linestyle="-") + +plt.xlabel("BMI") +plt.ylabel("Diagnosis") +plt.xlim([0, 50]) +plt.tight_layout() +plt.savefig("dia2.pdf") +plt.close() diff --git a/presentations/uws/python-scripts/genBetter.py b/presentations/uws/python-scripts/genBetter.py @@ -0,0 +1,30 @@ +import matplotlib.pyplot as plt +import seaborn as sns +import random + +x = [] +y = [] +for i in range(0,100): + rnd = (random.random() * 300) + 100 + rndAdd = rnd + (random.random() * 140 - 70) + + x.append(rndAdd) + + if rnd > (286-249)/2 + 249: + y.append(1) + else: + y.append(0) +plt.scatter(x,y, c=y, cmap='bwr') +plt.axvline((286-249)/2 + 249) + +# plt.show() + +wrong = 0 +for i in range(0,len(x)): + if x[i] > (286-249)/2 + 249: + if y[i] == 0: + wrong += 1 + else: + if y[i] == 1: + wrong += 1 +print((len(x) - wrong) / len(x)) diff --git a/presentations/uws/python-scripts/genNew.py b/presentations/uws/python-scripts/genNew.py @@ -0,0 +1,22 @@ +import matplotlib.pyplot as plt +import seaborn as sns +import random + +x = [] +y = [] +target = [] + +for i in range(0,1000): + rnd = (random.random() * 300) + 100 + rndAdd = rnd + (random.random() * 140 - 70) + x.append(rndAdd) + height = (random.random() * 40 + 30) + rndAdd * 0.1 + y.append(height) + + if rnd / height > 4: + target.append(1) + else: + target.append(0) +plt.scatter(y,x, c=target, cmap='bwr') + +plt.show() diff --git a/presentations/uws/python-scripts/generate2D.py b/presentations/uws/python-scripts/generate2D.py @@ -0,0 +1,86 @@ +import pandas as pd +import numpy as np +import matplotlib.pyplot as plt +from sklearn.tree import DecisionTreeClassifier, plot_tree +from sklearn.metrics import accuracy_score +import os + +# Create output directory +os.makedirs("images", exist_ok=True) + +# Dark mode settings using matplotlib only +# plt.rcParams.update({ +# "axes.facecolor": "#000000", +# "axes.edgecolor": "#333333", +# "figure.facecolor": "#000000", +# "savefig.facecolor": "#000000", +# "text.color": "white", +# "axes.labelcolor": "white", +# "xtick.color": "white", +# "ytick.color": "white", +# "grid.color": "gray", +# "axes.grid": True +# }) + +# Load dataset +df = pd.read_csv('./diabetes.csv') + +# Extract features and target +X_bmi = df['BMI'].to_numpy() +X_glucose = df['Glucose'].to_numpy() +y = df['Outcome'].to_numpy() + +# Combine features into a 2D array +data = np.column_stack((X_bmi, X_glucose)) + +# Create meshgrid for decision region plotting +xx, yy = np.meshgrid(np.linspace(0, 70, 500), np.linspace(0, 200, 500)) +grid = np.c_[xx.ravel(), yy.ravel()] + +# Plot the original data +plt.figure(figsize=(12, 6)) +plt.scatter(X_bmi, X_glucose, c=y, cmap='bwr', s=60, edgecolors="#000000") +plt.xlabel("BMI") +plt.ylabel("Glucose") +plt.title("Diabetes Dataset: BMI vs Glucose") +plt.xlim(0, 70) +plt.ylim(0, 200) +plt.tight_layout() +plt.savefig("original_diabetes_plot_bmi_glucose.png", dpi=300, bbox_inches='tight') +plt.close() + +# Train decision trees and plot decision boundaries +for depth in range(1, 5): + clf = DecisionTreeClassifier(max_depth=depth, random_state=42) + clf.fit(data, y) + preds = clf.predict(data) + + # Plot decision tree at max depth + if depth == 4: + fig, ax = plt.subplots(figsize=(14, 14)) + plot_tree(clf, feature_names=["BMI", "Glucose"], class_names=["No Diabetes", "Diabetes"], ax=ax, filled=True) + + # Save decision tree plot with a black background + plt.savefig("tree2.pdf", bbox_inches='tight') + plt.close() + + # Print accuracy + acc = accuracy_score(y_pred=preds, y_true=y) + print(f"DEPTH: {depth} - Accuracy: {acc:.4f}") + + # Plot decision boundaries + Z = clf.predict_proba(grid)[:, 1].reshape(xx.shape) + + plt.figure(figsize=(12, 6)) + plt.contourf(xx, yy, Z, levels=[0, 0.5, 1], colors=['#0000FF', '#FF0000'], alpha=0.3) + plt.contour(xx, yy, Z, levels=[0.5], colors='black', linewidths=2) + + plt.scatter(X_bmi, X_glucose, c=y, cmap='bwr', s=60, edgecolors="#000000") + plt.xlabel("BMI") + plt.ylabel("Glucose") + plt.title(f"Decision Tree (Depth {depth})") + plt.xlim(0, 70) + plt.ylim(0, 200) + plt.tight_layout() + plt.savefig(f"images/diabetes_tree_bmi_glucose_depth_{depth}.png", dpi=300, bbox_inches='tight') + plt.close() diff --git a/presentations/uws/python-scripts/generateDiabetes.py b/presentations/uws/python-scripts/generateDiabetes.py @@ -0,0 +1,17 @@ +import matplotlib.pyplot as plt +import seaborn as sns + +x = [249 , 286 , 380 , 112 , 385 , 163 , 208 , 121 , 166 , 218] +y = [0 ,1 ,1 ,0 ,1 ,0 ,0 , 0 ,0 ,0] + +plt.scatter(x,y, c=y, cmap='bwr') +plt.show() + + +x = [249 , 286 , 380 , 112 , 385 , 163 , 208 , 121 , 166 , 218] +y = [0 ,1 ,1 ,0 ,1 ,0 ,0 , 0 ,0 ,0] + +plt.scatter(x,y, c=y, cmap='bwr') +plt.axvline((286-249)/2 + 249) + +plt.show() diff --git a/presentations/uws/python-scripts/main.py b/presentations/uws/python-scripts/main.py @@ -0,0 +1,163 @@ +from manim import * +import numpy as np +class CreateSimpleExample(Scene): + def construct(self): + colorList = [RED, GREEN, BLUE, YELLOW] + + # Create the grid + grid = Axes( + x_range=[0, 10, .5], + y_range=[0, 10, .5], + x_length=9, + y_length=5.5, + axis_config={ + "numbers_to_include": np.arange(0, 11, 1), + "font_size": 24, + }, + tips=False, + ) + + y_label = grid.get_y_axis_label("y") + x_label = grid.get_x_axis_label("x") + + self.play(Create(grid), Create(y_label), Create(x_label)) + + points = [] # To store points and their target locations + animations = [] # To store the animations for moving points + + for i in range(200): + target_blue = [(np.random.random() * 7.8) - 3.9, (0.5 * np.random.random() * 3.5) + .1, 0] + target_red = [(np.random.random() * 7.8) - 3.9, (0.5 * np.random.random() * -3.8) - .1, 0] + + blue_point = Dot(point=[0, 0, 0], color=BLUE, radius=.05) + red_point = Dot(point=[0, 0, 0], color=RED, radius=.05) + + self.add(blue_point, red_point) + + # Store the points and their animations to move to target positions + animations.append(blue_point.animate.move_to(target_blue)) + animations.append(red_point.animate.move_to(target_red)) + + # Play animations to move points to their target positions + self.play(*animations) + self.wait(duration=1) + + + text = Text("Impurity = 0.5").move_to([0,3,0]) + + + self.play(Create(text)) + + self.wait(duration=1) + + self.wait(1) + + line = Line(start=[-4, 0,0], end=[4,0,0]) + self.play(Create(line)) + + self.wait(2) + + + t1 = Rectangle(width=8, height=0).set_fill(BLUE, opacity=.2) + t2 = Rectangle(width=8).set_fill(BLUE, opacity=.3).move_to([0,1,0]) + + m1 = Rectangle(width=8, height=0).set_fill(RED, opacity=.2) + m2 = Rectangle(width=8).set_fill(RED, opacity=.3).move_to([0,-1,0]) + self.play(Transform(m1,m2), Transform(t1,t2)) + + self.wait(duration=1) + + text3 = Text("Impurity = 0.0").move_to([0,3,0]) + self.play(Transform(text,text3)) + + self.wait(duration=2) + + + +class CreateMoreComplex(Scene): + def construct(self): + colorList = [RED, GREEN, BLUE, YELLOW] + + # Create the grid + grid = Axes( + x_range=[0, 10, .5], + y_range=[0, 10, .5], + x_length=9, + y_length=5.5, + axis_config={ + "numbers_to_include": np.arange(0, 11, 1), + "font_size": 24, + }, + tips=False, + ) + + y_label = grid.get_y_axis_label("y") + x_label = grid.get_x_axis_label("x") + + self.play(Create(grid), Create(y_label), Create(x_label)) + + points = [] # To store points and their target locations + animations = [] # To store the animations for moving points + + for i in range(200): + target_blue = [(np.random.random() * 7.8) - 3.9, (0.5 * np.random.random() * 3.5) + .1, 0] + target_red = [(np.random.random() * (7.4 / 2)) - 3.9, (0.5 * np.random.random() * -3.5) - .1, 0] + target_yellow = [(np.random.random() * (7.4 / 2)), (0.5 * np.random.random() * -3.5) - .1, 0] + + blue_point = Dot(point=[0, 0, 0], color=BLUE, radius=.05) + red_point = Dot(point=[0, 0, 0], color=RED, radius=.05) + yellow_point= Dot(point=[0, 0, 0], color=YELLOW, radius=.05) + + self.add(blue_point, red_point, yellow_point) + + # Store the points and their animations to move to target positions + animations.append(blue_point.animate.move_to(target_blue)) + animations.append(red_point.animate.move_to(target_red)) + animations.append(yellow_point.animate.move_to(target_yellow)) + + # Play animations to move points to their target positions + self.play(*animations) + self.wait(duration=1) + + + text = Text("Impurity = 0.666").move_to([0,3,0]) + + self.play(Create(text)) + + self.wait(duration=1) + + self.wait(1) + + line = Line(start=[-4, 0,0], end=[4,0,0]) + self.play(Create(line)) + + self.wait(2) + + + t1 = Rectangle(width=8).set_fill(BLUE, opacity=.2).move_to([0,1,0]) + + m1 = Rectangle(width=8).set_fill(RED, opacity=.2).move_to([0,-1,0]) + + self.play(GrowFromCenter(t1)) + self.play(GrowFromCenter(m1)) + + self.wait(duration=1) + + text3 = Text("Impurity = 0.5").move_to([0,3,0]) + self.play(Transform(text,text3)) + + + line = Line(start=[0, 0,0], end=[0,-2,0]) + self.play(Create(line)) + + self.play(m1.animate.stretch(.5, dim=0).align_to([0, -1, 0], RIGHT)) + + rb = Rectangle(width=4).set_fill(YELLOW, opacity=.3).align_to([0,-1,0], LEFT) + rb.shift(DOWN) + + self.play(GrowFromCenter(rb)) + + + text4 = Text("Impurity = 0.0").move_to([0,3,0]) + self.play(Transform(text,text4)) + self.wait(duration=2) diff --git a/presentations/uws/references.bib b/presentations/uws/references.bib @@ -0,0 +1,199 @@ +@book{breiman, + author = {Breiman, L. and Friedman, J. and Olshen, R. A. and Stone, C. J.}, + title = {Classification and Regression Trees}, + edition = {1st}, + year = {1984}, + publisher = {Chapman and Hall/CRC}, + doi = {10.1201/9781315139470} +} + +@article{oc1, + + author = {Murthy, S. K. and Kasif, S. and Salzberg, S. L.}, + title = {A System for Induction of Oblique Decision Trees}, + journal = {Journal of Artificial Intelligence Research}, + volume = {2}, + pages = {1--32}, + year = {1994}, + doi = {10.1613/jair.63}, +} + +@article{quinlan1986, + author = {Quinlan, J. R.}, + title = {Induction of Decision Trees}, + journal = {Machine Learning}, + volume = {1}, + number = {1}, + pages = {81--106}, + year = {1986}, + doi = {10.1007/BF00116251} +} + +@article{hhcart, + author = {Wickramarachchi, D. C. and Robertson, B. L. and Reale, M. and Price, C. J. and Brown, J.}, + title = {{HHCART: An oblique decision tree}}, + journal = {Computational Statistics \& Data Analysis}, + volume = {96}, + pages = {12--23}, + year = {2016}, + issn = {0167-9473}, + doi = {10.1016/j.csda.2015.11.006}, +} + +@article{odewahn, + author = {Odewahn, S. C. and Stockwell, E. B. and Pennington, R. L. and Humphreys, R. M. and Zumach, W. A.}, + title = {Automated Star/Galaxy Discrimination With Neural Networks}, + journal = {The Astronomical Journal}, + year = {1992}, + volume = {103}, + pages = {318}, + doi = {10.1086/116063}, +} + +@misc{breast_cancer, + author = {Wolberg, W.}, + title = {{Breast Cancer Wisconsin (Original)}}, + year = {1990}, + howpublished = {UCI Machine Learning Repository}, + note = {{doi}: 10.24432/C5HP4Z} +} + +@misc{iris, + author = {Fisher, R. A.}, + title = {Iris}, + year = {1936}, + note = {{doi}: 10.24432/C56C76}, + howpublished = {UCI Machine Learning Repository}, +} + +@article{boston, +title = {Hedonic housing prices and the demand for clean air}, +journal = {Journal of Environmental Economics and Management}, +volume = {5}, +number = {1}, +pages = {81--102}, +year = {1978}, +issn = {0095-0696}, +doi = {10.1016/0095-0696(78)90006-2}, +author = {D. Harrison and D. L. Rubinfeld}, +} + + +@inproceedings{diabetes, + author = {Smith, J. W. and Everhart, J. E. and Dickson, W. C. and Knowler, W. C. and Johannes, R. S.}, + title = {Using the ADAP Learning Algorithm to Forecast the Onset of Diabetes Mellitus}, + booktitle = {Proceedings of the Annual Symposium on Computer Application in Medical Care}, + pages = {261--265}, + year = {1988} +} + +@inproceedings{boosting, + author = {Drucker, H. and Cortes, C.}, + booktitle = {Advances in Neural Information Processing Systems}, + publisher = {MIT Press}, + title = {Boosting Decision Trees}, + volume = {8}, + year = {1995} +} + +@article{sgboost, + title = {Stochastic gradient boosting}, + journal = {Computational Statistics \& Data Analysis}, + volume = {38}, + number = {4}, + pages = {367--378}, + year = {2002}, + issn = {0167-9473}, + doi = {10.1016/S0167-9473(01)00065-2}, + author = {J. H. Friedman}, +} + +@article{survey, + author={Mienye, I. D. and Jere, N.}, + journal={IEEE Access}, + title={A Survey of Decision Trees: Concepts, Algorithms, and Applications}, + year={2024}, + volume={12}, + pages={86716--86727}, + doi={10.1109/ACCESS.2024.3416838}} + +@article{recent_overview, + title={Decision trees: a recent overview}, + author={Kotsiantis, S. B.}, + journal={Artificial Intelligence Review}, + volume={39}, + number={4}, + pages={261--283}, + year={2011}, + publisher={Springer}, + doi={10.1007/s10462-011-9272-4} +} + +@article{it, + author={Shannon, C. E.}, + journal={The Bell System Technical Journal}, + title={A mathematical theory of communication}, + year={1948}, + volume={27}, + number={3}, + pages={379--423}, + doi={10.1002/j.1538-7305.1948.tb01338.x}, +} + + +@book{matrix, + title={Matrix Computations}, + author={Golub, G.H. and Van Loan, C.F.}, + isbn={9781421407944}, + lccn={2012943449}, + series={Johns Hopkins Studies in the Mathematical Sciences}, + year={2013}, + publisher={Johns Hopkins University Press} +} + + +@inproceedings{buntine, + title={Tree Classification Software}, + author={W. L. Buntine}, + booktitle = {Technology 2002: The Third National Technology Transfer Conference and Exposition, Volume 1}, + year={1993}, +} + +@article{rndfrst, + author = {L. Breiman}, + title = {Random Forests}, + journal = {Machine Learning}, + year = {2001}, + volume = {45}, + number = {1}, + pages = {5--32}, + month = {Oct}, + issn = {1573-0565}, + doi = {10.1023/A:1010933404324} +} + +@book{c4.5, + title={C4.5: Programs for Machine Learning}, + author={Quinlan, J. R.}, + isbn={9781558602380}, + series={Morgan Kaufmann series in machine learning}, + year={1993}, + publisher={Elsevier Science} +} + +@inproceedings{cv, + author = {R. Kohavi}, + title = {A study of cross-validation and bootstrap for accuracy estimation and model selection}, + booktitle = {Proceedings of the International Joint Conference on Artificial Intelligence (IJCAI)}, + year = {1995}, + pages = {1137--1143}, + address = {Montreal, Canada}, +} + +@unpublished{laack, + title = {CART-ELC: Oblique Decision Tree Induction via Exhaustive Search}, + author = {Laack, A. D.}, + year = {2025}, + doi = {10.48550/arXiv.2505.05402}, + note = {Manuscript under review at Transactions on Machine Learning Research (TMLR)}, +}