decision-tree-classifier

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs | README | LICENSE

commit 7d9083a0c8161d3ee0671936348eefcc0c02d8d6
parent fbb285ba01aa4db1ae65a4af09e49f307790396c
Author: Andrew <andrewlaack1@gmail.com>
Date:   Mon, 23 Dec 2024 14:30:30 -0600

Loaded in MNIST and configured bind

Diffstat:
MNotes.md | 7++++++-
Mold/Testing.py | 4++--
Mrewrite/CMakeLists.txt | 10++++++++--
Mrewrite/Test.py | 49++++++++++++++++++++++++++++---------------------
Mrewrite/cpp/DecisionTreeClassifier.cpp | 19+++++++++++++++++--
Mrewrite/cpp/DecisionTreeClassifier.h | 3+++
Mrewrite/cpp/TreeNode.cpp | 1-
Mrewrite/cpp/TreeNode.h | 4++--
Mrewrite/cpp/bindings.cpp | 20+++++++++++++++-----
9 files changed, 81 insertions(+), 36 deletions(-)

diff --git a/Notes.md b/Notes.md @@ -41,5 +41,10 @@ ADD BENCHMARKING WITHOUT LIKELY AND THEN WITH. WITHOUT LIKELY: 15.526 seconds (user), 15.632 seconds (user) WITH LIKELY: 15.524 seconds (user), 15.608s (user) +random list 1,000 elements, 784 dimensions, depth = 1 + python and c++: 24.258 (user) + just c++ : 10.657 (user) -BUILD OPTIMAL DECISION TREE +same as above but with 3,000 elements: + python and c++: 1m 55s (user) + just c++ : 1m 37s (user) diff --git a/old/Testing.py b/old/Testing.py @@ -13,10 +13,10 @@ import numpy as np # train_X = train_X.reshape(-1, 784) # test_X = test_X.reshape(-1, 784) -X_train = np.random.randint(1, 10, size=(6000, 784)) +X_train = np.random.randint(1, 10, size=(3000, 784)) # Generate 60,000 labels for y_train (for example, labels between 1 and 10) -y_train = np.random.randint(1, 11, size=6000) +y_train = np.random.randint(1, 11, size=3000) # train_X = [[2,5], [5,2], [3,4], [4,4], [5,5], [10, 10], [2,2], [12,12]] # train_y = [1, 1 , 2, 1, 5, 2,1 ,3] diff --git a/rewrite/CMakeLists.txt b/rewrite/CMakeLists.txt @@ -11,8 +11,13 @@ set(CMAKE_CXX_STANDARD 17) find_package(Python3 REQUIRED COMPONENTS Interpreter Development) find_package(pybind11 REQUIRED) -# Set compiler flags -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -Wall -fPIC") +# Ensure optimization flag is applied for Release builds (you could also manually override Debug builds) +if(CMAKE_BUILD_TYPE STREQUAL "Release") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3 -Wall -fPIC") +else() + # Optionally you can set optimizations differently for Debug builds + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O3 -Wall -fPIC") +endif() # Add source files set(SOURCES @@ -40,3 +45,4 @@ add_custom_target(clean_build COMMAND rm -f ${CMAKE_BINARY_DIR}/*.o COMMENT "Clean build files" ) + diff --git a/rewrite/Test.py b/rewrite/Test.py @@ -1,38 +1,45 @@ -from sklearn.datasets import load_digits import numpy as np from decision_tree import DecisionTreeClassifier -from keras.datasets import mnist from sklearn.metrics import accuracy_score +from sklearn.datasets import fetch_openml +from sklearn.model_selection import train_test_split +from sklearn.decomposition import PCA +import time -(train_X, train_y), (test_X, test_y) = mnist.load_data() -train_X = train_X.reshape(-1, 784) -test_X = test_X.reshape(-1, 784) +mnist = fetch_openml("mnist_784", version=1) +X, y = mnist["data"], mnist["target"] +X = X.astype(np.float32) # Ensure data is in float64 format for the classifier +y = y.astype(np.int32) # Ensure target labels are integers -train_X = train_X[0:2000] -train_y = train_y[0:2000] +X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) +pca = PCA(n_components=75) # Reduce to 50 dimensions +X_train_pca = pca.fit_transform(X_train) +X_test_pca = pca.transform(X_test) -clf = DecisionTreeClassifier(20) +X_train_pca = X_train_pca[0:20000] +y_train = y_train[0:10000] +# Initialize and train the Decision Tree Classifier +clf = DecisionTreeClassifier(25) print("TRAINING") -clf.fit(train_X, len(train_X), train_y, len(train_X[0])) -predictions = clf.predict(test_X, test_X.shape[0], test_X.shape[1]) +start_time = time.time() +clf.fit(X_train_pca, len(X_train_pca), y_train, len(X_train_pca[0])) +end_time = time.time() +elapsed_time = end_time - start_time +print(f"Time taken: {elapsed_time:.4f} seconds") -# Output results -print("Predictions:", predictions) -print(accuracy_score(y_pred=predictions, y_true=test_y)) +# Predict on the test set +print("PREDICTING") +y_pred = clf.predict(X_test_pca, len(X_test_pca), len(X_test_pca[0])) +# Evaluate the classifier's accuracy +accuracy = accuracy_score(y_test, y_pred) +print(f"Accuracy of Decision Tree Classifier on MNIST: {accuracy * 100:.2f}%") - - -dot_representation = clf.getDot() -# print("\nDecision Tree (DOT Format):\n", dot_representation) - -# Optionally save DOT representation to a file -with open("decision_tree.dot", "w") as file: - file.write(dot_representation) +exit() diff --git a/rewrite/cpp/DecisionTreeClassifier.cpp b/rewrite/cpp/DecisionTreeClassifier.cpp @@ -11,7 +11,7 @@ DecisionTreeClassifier::DecisionTreeClassifier(int maxDepth){ void DecisionTreeClassifier::fit(float* X, int samples, int* y, int features){ if (splittingTree != nullptr){ - throw logic_error("Decision trees don't support incremental learning, fit can only be called once."); + deleteTree(splittingTree); } if(features <= 0){ @@ -145,7 +145,7 @@ TreeNode* DecisionTreeClassifier::bestSplit(float* X, int rows, int* y, int colu } -int* DecisionTreeClassifier::predict(float* X, int samples, int features){ +int* DecisionTreeClassifier::predict(float* X, int samples, int features) { if(featureCount == -1){ throw logic_error("Unable to predict prior to calling fit()."); @@ -175,3 +175,18 @@ int* DecisionTreeClassifier::predict(float* X, int samples, int features){ return predictions; } + +DecisionTreeClassifier::~DecisionTreeClassifier(){ + deleteTree(splittingTree); +} + +void DecisionTreeClassifier::deleteTree(TreeNode* node){ + + if(node == nullptr){ + return; + } + + deleteTree(node->getLeftChild()); + deleteTree(node->getRightChild()); + delete node; +} diff --git a/rewrite/cpp/DecisionTreeClassifier.h b/rewrite/cpp/DecisionTreeClassifier.h @@ -1,4 +1,5 @@ #include "TreeNode.h" +#include <vector> class DecisionTreeClassifier{ public: @@ -6,6 +7,7 @@ class DecisionTreeClassifier{ void fit(float* X, int samples, int* y, int features); int* predict(float* X, int samples, int features); std::string getDot(); + ~DecisionTreeClassifier(); private: int depth; int featureCount = -1; @@ -13,4 +15,5 @@ class DecisionTreeClassifier{ TreeNode* bestSplit(float* X, int samples, int* y, int features); TreeNode* recurse(float* X, int samples, int* y, int features, int depth); int primaryClass(int* y, int labelCount); + void deleteTree(TreeNode* node); }; diff --git a/rewrite/cpp/TreeNode.cpp b/rewrite/cpp/TreeNode.cpp @@ -3,7 +3,6 @@ #include "Criterion.h" #include "math.h" #include "iostream" -#include <sstream> //for std::stringstream #include <string> //for std::string TreeNode::TreeNode(int classification){ diff --git a/rewrite/cpp/TreeNode.h b/rewrite/cpp/TreeNode.h @@ -31,8 +31,8 @@ class TreeNode{ bool leaf; float splitValue; int index; - TreeNode* leftChild; - TreeNode* rightChild; + TreeNode* leftChild = nullptr; + TreeNode* rightChild = nullptr; std::string getDotLabel(); int classification; }; diff --git a/rewrite/cpp/bindings.cpp b/rewrite/cpp/bindings.cpp @@ -1,6 +1,8 @@ + #include <pybind11/pybind11.h> #include <pybind11/numpy.h> #include "DecisionTreeClassifier.h" +#include <vector> namespace py = pybind11; @@ -17,14 +19,22 @@ PYBIND11_MODULE(decision_tree, m) { .def("predict", [](DecisionTreeClassifier &self, py::array_t<float> X, int samples, int features) { auto X_buf = X.request(); // Request a buffer from NumPy array float* X_ptr = static_cast<float*>(X_buf.ptr); + + // Get the prediction result as a raw pointer (dynamically allocated array) int* result = self.predict(X_ptr, samples, features); - // Return a NumPy array from the result - return py::array_t<int>(samples, result); + + // Create a NumPy array from the raw pointer + py::array_t<int> result_array(samples, result); + + // Once the NumPy array is created, delete the dynamically allocated array + delete[] result; // Properly deallocate the memory + + return result_array; // Return the NumPy array }) .def("getDot", &DecisionTreeClassifier::getDot) - .def("__repr__", [](const DecisionTreeClassifier &dt) { - return "<DecisionTreeClassifier>"; - }); + .def("__repr__", [](const DecisionTreeClassifier &dt) { + return "<DecisionTreeClassifier>"; + }); }