commit 29450ced42ecb1bd98d0e3116f10f4e085c2fd31 parent fc97d111faf5f8f0a79d83ae3734f5a7d72a4de4 Author: Andrew <andrewlaack1@gmail.com> Date: Mon, 30 Dec 2024 23:20:53 -0600 Refactor Diffstat:
16 files changed, 62 insertions(+), 214 deletions(-)
diff --git a/Notes.md b/Notes.md @@ -1,67 +0,0 @@ -# Notes as I go - -BOTH RESULT IN THE SAME GRAPHS... - - - - -MAKE BENCHMARKING GRAPHS FOR COURSE - -Making gini impurity calculation in C++ = 44x improvement in compute time - -I am also noticing an insane difference in amount of memory being used... c++ is using nothing!!! - - -AFTER REWRITING INTO C++ vs original C++ + PYTHON IMPLEMENTATION - DO BENCHMARKING - -WITH 10,000 elements each 4 features, the c++ implementation computed the first split in: -27.272 seconds without optimization flags -3.867 with optimization flags - -PYTHON (WITH C++) DID THE SAME IN: -5.183 (user) w/ optimization flags - -WITH 20,000 ELEMENTS, 4 FEATURES: -PYTHON - 17.321 (user) -C++ - 15.271s (user) - -WITH 30,000 ELEMENTS, 4 FEATURES: -PYTHON - 37.5128 (user) -C++ 34.335 (user) - - -WITH WORKLOAD OF 6000 SAMPLES AND 784 FEATURES: - -MAX MEMORY C++ - 24628kB -MAX MEMORY Python - 521764kB - -ADD BENCHMARKING WITHOUT LIKELY AND THEN WITH. - -20,000 Samples, 4 Features: - WITHOUT LIKELY: 15.526 seconds (user), 15.632 seconds (user) - WITH LIKELY: 15.524 seconds (user), 15.608s (user) - -random list 1,000 elements, 784 dimensions, depth = 1 - python and c++: 24.258 (user) - just c++ : 10.657 (user) - -same as above but with 3,000 elements: - python and c++: 1m 55s (user) - just c++ : 1m 37s (user) - - ---- - -1,000 samples with 784 dimensions, depth = 1: - c++ single thread (python time): - 10.33 seconds - 10.33 seconds - 10.3526 seconds - c++ with multithreading (python time): - 1.852 seconds - 1.901 seconds - 1.886 seconds - -HOLY SHIT!! This pins all 6 cpu cores. - - diff --git a/README.md b/README.md @@ -1,3 +1,3 @@ -# pseudo-optimal-decision-tree +# CART Decision Tree Classifier -Pseudo optimal decision tree repository- \ No newline at end of file +Sample implementation of CART decision tree classifier written in C++ with hooks into python using pybind11. diff --git a/rewrite/.~lock.output.xlsx# b/cppDecisionTree/.~lock.output.xlsx# diff --git a/rewrite/CMakeLists.txt b/cppDecisionTree/CMakeLists.txt diff --git a/rewrite/Makefile b/cppDecisionTree/Makefile diff --git a/cppDecisionTree/Usage.py b/cppDecisionTree/Usage.py @@ -0,0 +1,60 @@ +import numpy as np +import pandas as pd +from decision_tree import DecisionTreeClassifier +from sklearn.metrics import accuracy_score +from sklearn.datasets import fetch_openml +from sklearn.model_selection import train_test_split +from sklearn.decomposition import PCA +import time +import matplotlib.pyplot as plt + +mnist = fetch_openml("mnist_784", version=1) +X, y = mnist["data"], mnist["target"] +X = X.astype(np.float32) # Ensure data is in float64 format for the classifier +y = y.astype(np.int32) # Ensure target labels are integers + +accLs = [] +xVals = [] + +SEED = 110 + +np.random.seed(SEED) +X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED) +y_train = np.array(y_train) +X_train = np.array(X_train) +X_test = np.array(X_test) + +for i in range(1, 11001): + if (i <= 100 and i % 10 == 0) or ((i) % 500 == 0) or (i == 1) or (i <= 500 and i % 50 == 0): + X_train_pca = np.array(X_train).copy() + X_test_pca = np.array(X_test).copy() + + + + num_samples = i + random_indices = np.random.choice(len(X_train_pca) - 1, num_samples, replace=False) + + + X_train_pca = X_train_pca[random_indices] + y_train_current = y_train[random_indices] + + clf = DecisionTreeClassifier(100) + + start_time = time.time() + clf.fit(X_train_pca, len(X_train_pca), y_train_current, len(X_train_pca[0])) + end_time = time.time() + elapsed_time = end_time - start_time + print(f"{i}: Time taken: {elapsed_time:.4f} seconds") + y_pred = clf.predict(X_test_pca, len(X_test_pca), len(X_test_pca[0])) + accuracy = accuracy_score(y_test, y_pred) + print(f"Accuracy of Decision Tree Classifier on MNIST: {accuracy * 100:.2f}%") + + accLs.append(accuracy) + xVals.append(i) + +df = pd.DataFrame(accLs, xVals) +df.to_csv('output.csv') + + +plt.plot(xVals, accLs) +plt.show() diff --git a/rewrite/cpp/Criterion.cpp b/cppDecisionTree/cpp/Criterion.cpp diff --git a/rewrite/cpp/Criterion.h b/cppDecisionTree/cpp/Criterion.h diff --git a/rewrite/cpp/DecisionTreeClassifier.cpp b/cppDecisionTree/cpp/DecisionTreeClassifier.cpp diff --git a/rewrite/cpp/DecisionTreeClassifier.h b/cppDecisionTree/cpp/DecisionTreeClassifier.h diff --git a/rewrite/cpp/TreeNode.cpp b/cppDecisionTree/cpp/TreeNode.cpp diff --git a/rewrite/cpp/TreeNode.h b/cppDecisionTree/cpp/TreeNode.h diff --git a/rewrite/cpp/bindings.cpp b/cppDecisionTree/cpp/bindings.cpp diff --git a/rewrite/Test.py b/rewrite/Test.py @@ -1,87 +0,0 @@ -import numpy as np -import pandas as pd -from decision_tree import DecisionTreeClassifier -from sklearn.metrics import accuracy_score -from sklearn.datasets import fetch_openml -from sklearn.model_selection import train_test_split -from sklearn.decomposition import PCA -import time -import matplotlib.pyplot as plt -import cv2 # OpenCV is required for Sobel operations - -mnist = fetch_openml("mnist_784", version=1) -X, y = mnist["data"], mnist["target"] -X = X.astype(np.float32) # Ensure data is in float64 format for the classifier -y = y.astype(np.int32) # Ensure target labels are integers - -accLs = [] -xVals = [] -pca = PCA(n_components=784) - -SEED = 110 - -np.random.seed(SEED) -X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED) -y_train = np.array(y_train) -X_train = np.array(X_train) -X_test = np.array(X_test) - -def append_sobel_features(X): - # Assume X is a 3D array with shape (n_samples, height, width) for image data - sobel_features = [] - for sample in X: - # Compute Sobel edges along x and y directions - sobel_x = cv2.Sobel(sample, cv2.CV_64F, 1, 0, ksize=3) - sobel_y = cv2.Sobel(sample, cv2.CV_64F, 0, 1, ksize=3) - sobel_combined = np.sqrt(sobel_x**2 + sobel_y**2) # Combine both directions - sobel_combined = sobel_combined.flatten() # Flatten the 2D Sobel result - - # Append to list - sobel_features.append(sobel_combined) - - # Convert to NumPy array - sobel_features = np.array(sobel_features) - return np.hstack((X.reshape(len(X), -1), sobel_features)) - -# Apply the function to X_train and X_test -X_train = append_sobel_features(X_train) -X_test = append_sobel_features(X_test) - -print(X_train.shape) -print(X_test.shape) - - -for i in range(1000, 5001): - if (i <= 100 and i % 10 == 0) or ((i) % 500 == 0) or (i == 1) or (i <= 500 and i % 50 == 0): - X_train_pca = np.array(X_train).copy() - X_test_pca = np.array(X_test).copy() - - - - num_samples = i - random_indices = np.random.choice(len(X_train_pca) - 1, num_samples, replace=False) - - - X_train_pca = X_train_pca[random_indices] - y_train_current = y_train[random_indices] - - clf = DecisionTreeClassifier(100) - - start_time = time.time() - clf.fit(X_train_pca, len(X_train_pca), y_train_current, len(X_train_pca[0])) - end_time = time.time() - elapsed_time = end_time - start_time - print(f"{i}: Time taken: {elapsed_time:.4f} seconds") - y_pred = clf.predict(X_test_pca, len(X_test_pca), len(X_test_pca[0])) - accuracy = accuracy_score(y_test, y_pred) - print(f"Accuracy of Decision Tree Classifier on MNIST: {accuracy * 100:.2f}%") - - accLs.append(accuracy) - xVals.append(i) - -df = pd.DataFrame(accLs, xVals) -df.to_csv('output.csv') - - -plt.plot(xVals, accLs) -plt.show() diff --git a/rewrite/cpp/ObliqueNode.h b/rewrite/cpp/ObliqueNode.h @@ -1,40 +0,0 @@ -#include "string" - -struct SplitResults{ - float* XLeft; - float* XRight; - int* yLeft; - int* yRight; - int leftSize; - int rightSize; -}; - -class ObliqueNode{ - public: - ObliqueNode(int classification); - ObliqueNode(float splittingVal, int featureIndex); - bool isLeaf(); - void setSplit(float splittingValue, int featureIndex); - float evalSplit(float* X, int* y, int samples, int features, std::string criterion); - ObliqueNode* getLeftChild(); - ObliqueNode* getRightChild(); - void setLeftChild(ObliqueNode* child); - void setRightChild(ObliqueNode* child); - float getSplitVal(); - int getIndexSplit(); - SplitResults splitOnNode(float* X, int* y, int samples, int features); - std::string getDotEdges(); - int getClassification(); - bool lessThan(float* sample, int features); - - private: - bool leaf; - float splitValue; - int index; - ObliqueNode* leftChild = nullptr; - ObliqueNode* rightChild = nullptr; - std::string getDotLabel(); - int classification; -}; - - diff --git a/rewrite/cpp/OptimalObliqueTree.h b/rewrite/cpp/OptimalObliqueTree.h @@ -1,17 +0,0 @@ -#include "ObliqueNode.h" - -class OptimalObliqueTree{ - public: - OptimalObliqueTree(int depth); - void fit(float* X, int samples, int* y, int features); - int* predict(float* X, int samples, int features); - std::string getDot(); - private: - int depth; - int featureCount = -1; - ObliqueNode* splittingTree = nullptr; - ObliqueNode* bestSplit(float* X, int samples, int* y, int features); - ObliqueNode* recurse(float* X, int samples, int* y, int features, int depth); - int primaryClass(int* y, int labelCount); - void deleteTree(ObliqueNode* node); -};