commit 7d9083a0c8161d3ee0671936348eefcc0c02d8d6
parent fbb285ba01aa4db1ae65a4af09e49f307790396c
Author: Andrew <andrewlaack1@gmail.com>
Date: Mon, 23 Dec 2024 14:30:30 -0600
Loaded in MNIST and configured bind
Diffstat:
9 files changed, 81 insertions(+), 36 deletions(-)
diff --git a/Notes.md b/Notes.md
@@ -41,5 +41,10 @@ ADD BENCHMARKING WITHOUT LIKELY AND THEN WITH.
WITHOUT LIKELY: 15.526 seconds (user), 15.632 seconds (user)
WITH LIKELY: 15.524 seconds (user), 15.608s (user)
+random list 1,000 elements, 784 dimensions, depth = 1
+ python and c++: 24.258 (user)
+ just c++ : 10.657 (user)
-BUILD OPTIMAL DECISION TREE
+same as above but with 3,000 elements:
+ python and c++: 1m 55s (user)
+ just c++ : 1m 37s (user)
diff --git a/old/Testing.py b/old/Testing.py
@@ -13,10 +13,10 @@ import numpy as np
# train_X = train_X.reshape(-1, 784)
# test_X = test_X.reshape(-1, 784)
-X_train = np.random.randint(1, 10, size=(6000, 784))
+X_train = np.random.randint(1, 10, size=(3000, 784))
# Generate 60,000 labels for y_train (for example, labels between 1 and 10)
-y_train = np.random.randint(1, 11, size=6000)
+y_train = np.random.randint(1, 11, size=3000)
# train_X = [[2,5], [5,2], [3,4], [4,4], [5,5], [10, 10], [2,2], [12,12]]
# train_y = [1, 1 , 2, 1, 5, 2,1 ,3]
diff --git a/rewrite/CMakeLists.txt b/rewrite/CMakeLists.txt
@@ -11,8 +11,13 @@ set(CMAKE_CXX_STANDARD 17)
find_package(Python3 REQUIRED COMPONENTS Interpreter Development)
find_package(pybind11 REQUIRED)
-# Set compiler flags
-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3 -Wall -fPIC")
+# Ensure optimization flag is applied for Release builds (you could also manually override Debug builds)
+if(CMAKE_BUILD_TYPE STREQUAL "Release")
+ set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3 -Wall -fPIC")
+else()
+ # Optionally you can set optimizations differently for Debug builds
+ set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O3 -Wall -fPIC")
+endif()
# Add source files
set(SOURCES
@@ -40,3 +45,4 @@ add_custom_target(clean_build
COMMAND rm -f ${CMAKE_BINARY_DIR}/*.o
COMMENT "Clean build files"
)
+
diff --git a/rewrite/Test.py b/rewrite/Test.py
@@ -1,38 +1,45 @@
-from sklearn.datasets import load_digits
import numpy as np
from decision_tree import DecisionTreeClassifier
-from keras.datasets import mnist
from sklearn.metrics import accuracy_score
+from sklearn.datasets import fetch_openml
+from sklearn.model_selection import train_test_split
+from sklearn.decomposition import PCA
+import time
-(train_X, train_y), (test_X, test_y) = mnist.load_data()
-train_X = train_X.reshape(-1, 784)
-test_X = test_X.reshape(-1, 784)
+mnist = fetch_openml("mnist_784", version=1)
+X, y = mnist["data"], mnist["target"]
+X = X.astype(np.float32) # Ensure data is in float64 format for the classifier
+y = y.astype(np.int32) # Ensure target labels are integers
-train_X = train_X[0:2000]
-train_y = train_y[0:2000]
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+pca = PCA(n_components=75) # Reduce to 50 dimensions
+X_train_pca = pca.fit_transform(X_train)
+X_test_pca = pca.transform(X_test)
-clf = DecisionTreeClassifier(20)
+X_train_pca = X_train_pca[0:20000]
+y_train = y_train[0:10000]
+# Initialize and train the Decision Tree Classifier
+clf = DecisionTreeClassifier(25)
print("TRAINING")
-clf.fit(train_X, len(train_X), train_y, len(train_X[0]))
-predictions = clf.predict(test_X, test_X.shape[0], test_X.shape[1])
+start_time = time.time()
+clf.fit(X_train_pca, len(X_train_pca), y_train, len(X_train_pca[0]))
+end_time = time.time()
+elapsed_time = end_time - start_time
+print(f"Time taken: {elapsed_time:.4f} seconds")
-# Output results
-print("Predictions:", predictions)
-print(accuracy_score(y_pred=predictions, y_true=test_y))
+# Predict on the test set
+print("PREDICTING")
+y_pred = clf.predict(X_test_pca, len(X_test_pca), len(X_test_pca[0]))
+# Evaluate the classifier's accuracy
+accuracy = accuracy_score(y_test, y_pred)
+print(f"Accuracy of Decision Tree Classifier on MNIST: {accuracy * 100:.2f}%")
-
-
-dot_representation = clf.getDot()
-# print("\nDecision Tree (DOT Format):\n", dot_representation)
-
-# Optionally save DOT representation to a file
-with open("decision_tree.dot", "w") as file:
- file.write(dot_representation)
+exit()
diff --git a/rewrite/cpp/DecisionTreeClassifier.cpp b/rewrite/cpp/DecisionTreeClassifier.cpp
@@ -11,7 +11,7 @@ DecisionTreeClassifier::DecisionTreeClassifier(int maxDepth){
void DecisionTreeClassifier::fit(float* X, int samples, int* y, int features){
if (splittingTree != nullptr){
- throw logic_error("Decision trees don't support incremental learning, fit can only be called once.");
+ deleteTree(splittingTree);
}
if(features <= 0){
@@ -145,7 +145,7 @@ TreeNode* DecisionTreeClassifier::bestSplit(float* X, int rows, int* y, int colu
}
-int* DecisionTreeClassifier::predict(float* X, int samples, int features){
+int* DecisionTreeClassifier::predict(float* X, int samples, int features) {
if(featureCount == -1){
throw logic_error("Unable to predict prior to calling fit().");
@@ -175,3 +175,18 @@ int* DecisionTreeClassifier::predict(float* X, int samples, int features){
return predictions;
}
+
+DecisionTreeClassifier::~DecisionTreeClassifier(){
+ deleteTree(splittingTree);
+}
+
+void DecisionTreeClassifier::deleteTree(TreeNode* node){
+
+ if(node == nullptr){
+ return;
+ }
+
+ deleteTree(node->getLeftChild());
+ deleteTree(node->getRightChild());
+ delete node;
+}
diff --git a/rewrite/cpp/DecisionTreeClassifier.h b/rewrite/cpp/DecisionTreeClassifier.h
@@ -1,4 +1,5 @@
#include "TreeNode.h"
+#include <vector>
class DecisionTreeClassifier{
public:
@@ -6,6 +7,7 @@ class DecisionTreeClassifier{
void fit(float* X, int samples, int* y, int features);
int* predict(float* X, int samples, int features);
std::string getDot();
+ ~DecisionTreeClassifier();
private:
int depth;
int featureCount = -1;
@@ -13,4 +15,5 @@ class DecisionTreeClassifier{
TreeNode* bestSplit(float* X, int samples, int* y, int features);
TreeNode* recurse(float* X, int samples, int* y, int features, int depth);
int primaryClass(int* y, int labelCount);
+ void deleteTree(TreeNode* node);
};
diff --git a/rewrite/cpp/TreeNode.cpp b/rewrite/cpp/TreeNode.cpp
@@ -3,7 +3,6 @@
#include "Criterion.h"
#include "math.h"
#include "iostream"
-#include <sstream> //for std::stringstream
#include <string> //for std::string
TreeNode::TreeNode(int classification){
diff --git a/rewrite/cpp/TreeNode.h b/rewrite/cpp/TreeNode.h
@@ -31,8 +31,8 @@ class TreeNode{
bool leaf;
float splitValue;
int index;
- TreeNode* leftChild;
- TreeNode* rightChild;
+ TreeNode* leftChild = nullptr;
+ TreeNode* rightChild = nullptr;
std::string getDotLabel();
int classification;
};
diff --git a/rewrite/cpp/bindings.cpp b/rewrite/cpp/bindings.cpp
@@ -1,6 +1,8 @@
+
#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>
#include "DecisionTreeClassifier.h"
+#include <vector>
namespace py = pybind11;
@@ -17,14 +19,22 @@ PYBIND11_MODULE(decision_tree, m) {
.def("predict", [](DecisionTreeClassifier &self, py::array_t<float> X, int samples, int features) {
auto X_buf = X.request(); // Request a buffer from NumPy array
float* X_ptr = static_cast<float*>(X_buf.ptr);
+
+ // Get the prediction result as a raw pointer (dynamically allocated array)
int* result = self.predict(X_ptr, samples, features);
- // Return a NumPy array from the result
- return py::array_t<int>(samples, result);
+
+ // Create a NumPy array from the raw pointer
+ py::array_t<int> result_array(samples, result);
+
+ // Once the NumPy array is created, delete the dynamically allocated array
+ delete[] result; // Properly deallocate the memory
+
+ return result_array; // Return the NumPy array
})
.def("getDot", &DecisionTreeClassifier::getDot)
- .def("__repr__", [](const DecisionTreeClassifier &dt) {
- return "<DecisionTreeClassifier>";
- });
+ .def("__repr__", [](const DecisionTreeClassifier &dt) {
+ return "<DecisionTreeClassifier>";
+ });
}