decision-tree-classifier

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs | README | LICENSE

commit 7103ec3a11e61323b8a261e188584744da94800a
parent a8e1ba5db2e658a6b806deccdde07f9bd3b28a15
Author: Andrew <andrewlaack1@gmail.com>
Date:   Thu, 26 Dec 2024 12:29:08 -0600

Did kernel stuff

Diffstat:
Arewrite/.~lock.output.xlsx# | 2++
Mrewrite/CMakeLists.txt | 1+
Mrewrite/Makefile | 27+++++++++++++++++++++++++++
Mrewrite/Test.py | 53+++++++++++++++++++++++++++++++----------------------
Mrewrite/cpp/DecisionTreeClassifier.cpp | 44++++++++++++++++++++++++++++++++++++++++++--
Mrewrite/cpp/DecisionTreeClassifier.h | 5++++-
Arewrite/cpp/Kernel.cpp | 79+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Arewrite/cpp/Kernel.h | 16++++++++++++++++
Mrewrite/cpp/bindings.cpp | 2+-
Arewrite/cpp/test.cpp | 35+++++++++++++++++++++++++++++++++++
Arewrite/output.csv | 29+++++++++++++++++++++++++++++
11 files changed, 267 insertions(+), 26 deletions(-)

diff --git a/rewrite/.~lock.output.xlsx# b/rewrite/.~lock.output.xlsx# @@ -0,0 +1 @@ +,andrew,Viki,24.12.2024 14:23,file:///home/andrew/.config/libreoffice/4;+ \ No newline at end of file diff --git a/rewrite/CMakeLists.txt b/rewrite/CMakeLists.txt @@ -25,6 +25,7 @@ set(SOURCES cpp/TreeNode.cpp cpp/Criterion.cpp cpp/bindings.cpp + cpp/Kernel.cpp ) # Create the shared library diff --git a/rewrite/Makefile b/rewrite/Makefile @@ -190,6 +190,30 @@ cpp/DecisionTreeClassifier.cpp.s: $(MAKE) $(MAKESILENT) -f CMakeFiles/decision_tree.dir/build.make CMakeFiles/decision_tree.dir/cpp/DecisionTreeClassifier.cpp.s .PHONY : cpp/DecisionTreeClassifier.cpp.s +cpp/Kernel.o: cpp/Kernel.cpp.o +.PHONY : cpp/Kernel.o + +# target to build an object file +cpp/Kernel.cpp.o: + $(MAKE) $(MAKESILENT) -f CMakeFiles/decision_tree.dir/build.make CMakeFiles/decision_tree.dir/cpp/Kernel.cpp.o +.PHONY : cpp/Kernel.cpp.o + +cpp/Kernel.i: cpp/Kernel.cpp.i +.PHONY : cpp/Kernel.i + +# target to preprocess a source file +cpp/Kernel.cpp.i: + $(MAKE) $(MAKESILENT) -f CMakeFiles/decision_tree.dir/build.make CMakeFiles/decision_tree.dir/cpp/Kernel.cpp.i +.PHONY : cpp/Kernel.cpp.i + +cpp/Kernel.s: cpp/Kernel.cpp.s +.PHONY : cpp/Kernel.s + +# target to generate assembly for a file +cpp/Kernel.cpp.s: + $(MAKE) $(MAKESILENT) -f CMakeFiles/decision_tree.dir/build.make CMakeFiles/decision_tree.dir/cpp/Kernel.cpp.s +.PHONY : cpp/Kernel.cpp.s + cpp/TreeNode.o: cpp/TreeNode.cpp.o .PHONY : cpp/TreeNode.o @@ -254,6 +278,9 @@ help: @echo "... cpp/DecisionTreeClassifier.o" @echo "... cpp/DecisionTreeClassifier.i" @echo "... cpp/DecisionTreeClassifier.s" + @echo "... cpp/Kernel.o" + @echo "... cpp/Kernel.i" + @echo "... cpp/Kernel.s" @echo "... cpp/TreeNode.o" @echo "... cpp/TreeNode.i" @echo "... cpp/TreeNode.s" diff --git a/rewrite/Test.py b/rewrite/Test.py @@ -1,4 +1,5 @@ import numpy as np +import pandas as pd from decision_tree import DecisionTreeClassifier from sklearn.metrics import accuracy_score from sklearn.datasets import fetch_openml @@ -14,39 +15,47 @@ X = X.astype(np.float32) # Ensure data is in float64 format for the classifier y = y.astype(np.int32) # Ensure target labels are integers accLs = [] -pca = PCA(n_components=784) # Reduce to 50 dimensions +xVals = [] +pca = PCA(n_components=784) -for i in range(1,1000): - X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) +SEED = 110 - X_train_pca = pca.fit_transform(X_train) - X_test_pca = pca.transform(X_test) +np.random.seed(SEED) +X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED) +y_train = np.array(y_train) +for i in range(1000, 5001): + if (i <= 100 and i % 10 == 0) or ((i) % 500 == 0) or (i == 1) or (i <= 500 and i % 50 == 0): + X_train_pca = np.array(X_train).copy() + X_test_pca = np.array(X_test).copy() - num_samples = i - np.random.seed(42) - random_indices = np.random.choice(len(X_train_pca) - 1, num_samples, replace=False) - y_train = np.array(y_train) - X_train_pca = X_train_pca[random_indices] - y_train = y_train[random_indices] + num_samples = i + random_indices = np.random.choice(len(X_train_pca) - 1, num_samples, replace=False) - clf = DecisionTreeClassifier(100) - start_time = time.time() - clf.fit(X_train_pca, len(X_train_pca), y_train, len(X_train_pca[0])) - end_time = time.time() - elapsed_time = end_time - start_time - print(f"{i}: Time taken: {elapsed_time:.4f} seconds") + X_train_pca = X_train_pca[random_indices] + y_train_current = y_train[random_indices] + clf = DecisionTreeClassifier(100, True) - y_pred = clf.predict(X_test_pca, len(X_test_pca), len(X_test_pca[0])) - accuracy = accuracy_score(y_test, y_pred) - print(f"Accuracy of Decision Tree Classifier on MNIST: {accuracy * 100:.2f}%") + start_time = time.time() + clf.fit(X_train_pca, len(X_train_pca), y_train_current, len(X_train_pca[0])) + end_time = time.time() + elapsed_time = end_time - start_time + print(f"{i}: Time taken: {elapsed_time:.4f} seconds") + y_pred = clf.predict(X_test_pca, len(X_test_pca), len(X_test_pca[0])) + accuracy = accuracy_score(y_test, y_pred) + print(f"Accuracy of Decision Tree Classifier on MNIST: {accuracy * 100:.2f}%") - accLs.append(accuracy) + accLs.append(accuracy) + xVals.append(i) -plt.plot(range(1,1000), accLs) +df = pd.DataFrame(accLs, xVals) +df.to_csv('output.csv') + + +plt.plot(xVals, accLs) plt.show() diff --git a/rewrite/cpp/DecisionTreeClassifier.cpp b/rewrite/cpp/DecisionTreeClassifier.cpp @@ -7,11 +7,17 @@ using namespace std; -DecisionTreeClassifier::DecisionTreeClassifier(int maxDepth){ - depth = maxDepth; +DecisionTreeClassifier::DecisionTreeClassifier(int maxDepth, bool useImageKernelAugmentation){ + this->depth = maxDepth; + this->useImageKernelPreprocessing = useImageKernelAugmentation; } void DecisionTreeClassifier::fit(float* X, int samples, int* y, int features){ + + if(this->kernel != nullptr){ + delete kernel; + } + if (splittingTree != nullptr){ deleteTree(splittingTree); } @@ -24,8 +30,26 @@ void DecisionTreeClassifier::fit(float* X, int samples, int* y, int features){ throw invalid_argument("Invalid argument, there must be 1 or more samples to train on."); } + if(useImageKernelPreprocessing){ + float kernelMatrix[9] = { + -1, 0, 1, + -2, 0, -2, + -1, 0, 1 + }; + this->kernel = new Kernel(3,3, kernelMatrix); + KernelOutput out = this->kernel->augmentWithKernel(X,samples,features); + X = out.features; + features = out.featureCount; + } + + + splittingTree = recurse(X, samples, y, features, depth); featureCount = features; + + if(useImageKernelPreprocessing){ + delete[] X; + } } @@ -174,6 +198,13 @@ int* DecisionTreeClassifier::predict(float* X, int samples, int features) { throw logic_error("Unable to predict prior to calling fit()."); } + + if(useImageKernelPreprocessing){ + KernelOutput out = this->kernel->augmentWithKernel(X,samples,features); + X = out.features; + features = out.featureCount; + } + if(features != this->featureCount){ throw invalid_argument("Incorrect number of features for prediction."); } @@ -196,11 +227,20 @@ int* DecisionTreeClassifier::predict(float* X, int samples, int features) { predictions[i] = current->getClassification(); } + if(useImageKernelPreprocessing){ + delete [] X; + } + return predictions; } DecisionTreeClassifier::~DecisionTreeClassifier(){ deleteTree(splittingTree); + + if(this->kernel != nullptr){ + delete kernel; + } + } void DecisionTreeClassifier::deleteTree(TreeNode* node){ diff --git a/rewrite/cpp/DecisionTreeClassifier.h b/rewrite/cpp/DecisionTreeClassifier.h @@ -1,9 +1,10 @@ #include "TreeNode.h" #include <vector> +#include "Kernel.h" class DecisionTreeClassifier{ public: - DecisionTreeClassifier(int depth); + DecisionTreeClassifier(int depth, bool useImageKernelAugmentation); void fit(float* X, int samples, int* y, int features); int* predict(float* X, int samples, int features); std::string getDot(); @@ -16,4 +17,6 @@ class DecisionTreeClassifier{ TreeNode* recurse(float* X, int samples, int* y, int features, int depth); int primaryClass(int* y, int labelCount); void deleteTree(TreeNode* node); + bool useImageKernelPreprocessing; + Kernel* kernel = nullptr; }; diff --git a/rewrite/cpp/Kernel.cpp b/rewrite/cpp/Kernel.cpp @@ -0,0 +1,79 @@ +#include "Kernel.h" +#include <iostream> +#include <stdexcept> +#include "math.h" + +// verified proper inputs +Kernel::Kernel(int X_dim, int y_dim, float* kernel){ + + if(X_dim <= 2 || y_dim <= 2){ + throw std::invalid_argument("Kernel has a minimum size of 3x3."); + } + + if(X_dim % 2 == 0 || y_dim % 2 == 0){ + throw std::invalid_argument("Kernel must have odd size edges."); + } + + this->X_dim = X_dim; + this->y_dim = y_dim; + this->kernel = kernel; + + float summed = 0; + for(int y = 0 ; y < y_dim; ++y){ + for(int x = 0 ; x < X_dim; ++x){ + summed += kernel[x + (X_dim * y)]; + } + } + + this->sum = summed; +} + +KernelOutput Kernel::augmentWithKernel(float* X, int samples, int features){ + KernelOutput out = KernelOutput(); + out.featureCount = features * 2; + + float* X_return = new float[features*samples*2]; + + for(int i = 0 ; i < samples; ++i){ + for(int x = 0 ; x < features; ++x){ + X_return[i*features + x] = X[i*features + x]; + } + } + + + int currentOffset = samples * features; + for(int y = 0 ; y < features; ++y){ + for(int x = 0 ; x < samples; ++x){ + X_return[currentOffset] = computeIndex(X, x, y, features, samples); + currentOffset += 1; + } + } + + out.features = X_return; + + return out; +} + +float Kernel::computeIndex(float* X, int xPos, int yPos, int features, int samples){ + + float average = 0; + int itr = 0; + for(int i = 0 ; i < y_dim ; ++i){ + for(int x = 0; x < X_dim ; ++x){ + + int currentX = xPos + x; + int currentY = (i + yPos); + if(currentX < 0 || currentY < 0 || currentX >= features || currentY >= samples){ + continue; + } + float currentKern = kernel[x + i*X_dim]; + float currentValue = X[xPos + x + ((i + yPos) * features)]; + + itr += 1; + average += currentValue * currentKern; + } + } + average /= sum; + + return average; +} diff --git a/rewrite/cpp/Kernel.h b/rewrite/cpp/Kernel.h @@ -0,0 +1,16 @@ +struct KernelOutput{ + float* features; + int featureCount; +}; + +class Kernel{ + public: + Kernel(int X_dim, int y_dim, float* kernel); + KernelOutput augmentWithKernel(float* X, int samples, int features); + private: + float* kernel; + int X_dim; + int y_dim; + float sum; + float computeIndex(float* X, int xPos, int yPos, int features, int samples); +}; diff --git a/rewrite/cpp/bindings.cpp b/rewrite/cpp/bindings.cpp @@ -8,7 +8,7 @@ namespace py = pybind11; PYBIND11_MODULE(decision_tree, m) { py::class_<DecisionTreeClassifier>(m, "DecisionTreeClassifier") - .def(py::init<int>()) + .def(py::init<int, bool>()) .def("fit", [](DecisionTreeClassifier &self, py::array_t<float> X, int samples, py::array_t<int> y, int features) { auto X_buf = X.request(); // Request a buffer from NumPy array auto y_buf = y.request(); // Request a buffer from NumPy array diff --git a/rewrite/cpp/test.cpp b/rewrite/cpp/test.cpp @@ -0,0 +1,35 @@ +#include "Kernel.h" +#include "iostream" +int main(){ + + + float X[] = { + 1.0f, 1.0f, 1.0f, + 1.0f, 1.0f, 1.0f, + 1.0f, 1.0f, 1.0f + }; + + float input[] = { + 1.0f, 1.0f, 1.0f, + 1.0f, 5.0f, 1.0f, + 1.0f, 1.0f, 1.0f + }; + + Kernel kern = Kernel(3,3,X); + KernelOutput out = kern.augmentWithKernel(input, 3, 3); + for(int i = 0 ; i < 3; ++i){ + for(int x = 0 ; x < 3 ; ++x){ + std::cout << out.features[i*3 + x] << " "; + } + std::cout << std::endl; + } + + std::cout << std::endl; + + for(int i = 0 ; i < 3; ++i){ + for(int x = 0 ; x < 3 ; ++x){ + std::cout << out.features[(i*3 + x) + 9] << " "; + } + std::cout << std::endl; + } +} diff --git a/rewrite/output.csv b/rewrite/output.csv @@ -0,0 +1,29 @@ +,0 +1,0.0945 +10,0.1327142857142857 +20,0.19542857142857142 +30,0.22857142857142856 +40,0.344 +50,0.29642857142857143 +60,0.3717142857142857 +70,0.4052142857142857 +80,0.3412857142857143 +90,0.44307142857142856 +100,0.4712857142857143 +150,0.4295 +200,0.5545 +250,0.5513571428571429 +300,0.5530714285714285 +350,0.5372142857142858 +400,0.5695 +450,0.6051428571428571 +500,0.5994285714285714 +1000,0.6572857142857143 +1500,0.6706428571428571 +2000,0.6952857142857143 +2500,0.7142142857142857 +3000,0.7461428571428571 +3500,0.7458571428571429 +4000,0.7458571428571429 +4500,0.7596428571428572 +5000,0.7695714285714286