commit 7103ec3a11e61323b8a261e188584744da94800a
parent a8e1ba5db2e658a6b806deccdde07f9bd3b28a15
Author: Andrew <andrewlaack1@gmail.com>
Date: Thu, 26 Dec 2024 12:29:08 -0600
Did kernel stuff
Diffstat:
11 files changed, 267 insertions(+), 26 deletions(-)
diff --git a/rewrite/.~lock.output.xlsx# b/rewrite/.~lock.output.xlsx#
@@ -0,0 +1 @@
+,andrew,Viki,24.12.2024 14:23,file:///home/andrew/.config/libreoffice/4;+
\ No newline at end of file
diff --git a/rewrite/CMakeLists.txt b/rewrite/CMakeLists.txt
@@ -25,6 +25,7 @@ set(SOURCES
cpp/TreeNode.cpp
cpp/Criterion.cpp
cpp/bindings.cpp
+ cpp/Kernel.cpp
)
# Create the shared library
diff --git a/rewrite/Makefile b/rewrite/Makefile
@@ -190,6 +190,30 @@ cpp/DecisionTreeClassifier.cpp.s:
$(MAKE) $(MAKESILENT) -f CMakeFiles/decision_tree.dir/build.make CMakeFiles/decision_tree.dir/cpp/DecisionTreeClassifier.cpp.s
.PHONY : cpp/DecisionTreeClassifier.cpp.s
+cpp/Kernel.o: cpp/Kernel.cpp.o
+.PHONY : cpp/Kernel.o
+
+# target to build an object file
+cpp/Kernel.cpp.o:
+ $(MAKE) $(MAKESILENT) -f CMakeFiles/decision_tree.dir/build.make CMakeFiles/decision_tree.dir/cpp/Kernel.cpp.o
+.PHONY : cpp/Kernel.cpp.o
+
+cpp/Kernel.i: cpp/Kernel.cpp.i
+.PHONY : cpp/Kernel.i
+
+# target to preprocess a source file
+cpp/Kernel.cpp.i:
+ $(MAKE) $(MAKESILENT) -f CMakeFiles/decision_tree.dir/build.make CMakeFiles/decision_tree.dir/cpp/Kernel.cpp.i
+.PHONY : cpp/Kernel.cpp.i
+
+cpp/Kernel.s: cpp/Kernel.cpp.s
+.PHONY : cpp/Kernel.s
+
+# target to generate assembly for a file
+cpp/Kernel.cpp.s:
+ $(MAKE) $(MAKESILENT) -f CMakeFiles/decision_tree.dir/build.make CMakeFiles/decision_tree.dir/cpp/Kernel.cpp.s
+.PHONY : cpp/Kernel.cpp.s
+
cpp/TreeNode.o: cpp/TreeNode.cpp.o
.PHONY : cpp/TreeNode.o
@@ -254,6 +278,9 @@ help:
@echo "... cpp/DecisionTreeClassifier.o"
@echo "... cpp/DecisionTreeClassifier.i"
@echo "... cpp/DecisionTreeClassifier.s"
+ @echo "... cpp/Kernel.o"
+ @echo "... cpp/Kernel.i"
+ @echo "... cpp/Kernel.s"
@echo "... cpp/TreeNode.o"
@echo "... cpp/TreeNode.i"
@echo "... cpp/TreeNode.s"
diff --git a/rewrite/Test.py b/rewrite/Test.py
@@ -1,4 +1,5 @@
import numpy as np
+import pandas as pd
from decision_tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.datasets import fetch_openml
@@ -14,39 +15,47 @@ X = X.astype(np.float32) # Ensure data is in float64 format for the classifier
y = y.astype(np.int32) # Ensure target labels are integers
accLs = []
-pca = PCA(n_components=784) # Reduce to 50 dimensions
+xVals = []
+pca = PCA(n_components=784)
-for i in range(1,1000):
- X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+SEED = 110
- X_train_pca = pca.fit_transform(X_train)
- X_test_pca = pca.transform(X_test)
+np.random.seed(SEED)
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)
+y_train = np.array(y_train)
+for i in range(1000, 5001):
+ if (i <= 100 and i % 10 == 0) or ((i) % 500 == 0) or (i == 1) or (i <= 500 and i % 50 == 0):
+ X_train_pca = np.array(X_train).copy()
+ X_test_pca = np.array(X_test).copy()
- num_samples = i
- np.random.seed(42)
- random_indices = np.random.choice(len(X_train_pca) - 1, num_samples, replace=False)
- y_train = np.array(y_train)
- X_train_pca = X_train_pca[random_indices]
- y_train = y_train[random_indices]
+ num_samples = i
+ random_indices = np.random.choice(len(X_train_pca) - 1, num_samples, replace=False)
- clf = DecisionTreeClassifier(100)
- start_time = time.time()
- clf.fit(X_train_pca, len(X_train_pca), y_train, len(X_train_pca[0]))
- end_time = time.time()
- elapsed_time = end_time - start_time
- print(f"{i}: Time taken: {elapsed_time:.4f} seconds")
+ X_train_pca = X_train_pca[random_indices]
+ y_train_current = y_train[random_indices]
+ clf = DecisionTreeClassifier(100, True)
- y_pred = clf.predict(X_test_pca, len(X_test_pca), len(X_test_pca[0]))
- accuracy = accuracy_score(y_test, y_pred)
- print(f"Accuracy of Decision Tree Classifier on MNIST: {accuracy * 100:.2f}%")
+ start_time = time.time()
+ clf.fit(X_train_pca, len(X_train_pca), y_train_current, len(X_train_pca[0]))
+ end_time = time.time()
+ elapsed_time = end_time - start_time
+ print(f"{i}: Time taken: {elapsed_time:.4f} seconds")
+ y_pred = clf.predict(X_test_pca, len(X_test_pca), len(X_test_pca[0]))
+ accuracy = accuracy_score(y_test, y_pred)
+ print(f"Accuracy of Decision Tree Classifier on MNIST: {accuracy * 100:.2f}%")
- accLs.append(accuracy)
+ accLs.append(accuracy)
+ xVals.append(i)
-plt.plot(range(1,1000), accLs)
+df = pd.DataFrame(accLs, xVals)
+df.to_csv('output.csv')
+
+
+plt.plot(xVals, accLs)
plt.show()
diff --git a/rewrite/cpp/DecisionTreeClassifier.cpp b/rewrite/cpp/DecisionTreeClassifier.cpp
@@ -7,11 +7,17 @@
using namespace std;
-DecisionTreeClassifier::DecisionTreeClassifier(int maxDepth){
- depth = maxDepth;
+DecisionTreeClassifier::DecisionTreeClassifier(int maxDepth, bool useImageKernelAugmentation){
+ this->depth = maxDepth;
+ this->useImageKernelPreprocessing = useImageKernelAugmentation;
}
void DecisionTreeClassifier::fit(float* X, int samples, int* y, int features){
+
+ if(this->kernel != nullptr){
+ delete kernel;
+ }
+
if (splittingTree != nullptr){
deleteTree(splittingTree);
}
@@ -24,8 +30,26 @@ void DecisionTreeClassifier::fit(float* X, int samples, int* y, int features){
throw invalid_argument("Invalid argument, there must be 1 or more samples to train on.");
}
+ if(useImageKernelPreprocessing){
+ float kernelMatrix[9] = {
+ -1, 0, 1,
+ -2, 0, -2,
+ -1, 0, 1
+ };
+ this->kernel = new Kernel(3,3, kernelMatrix);
+ KernelOutput out = this->kernel->augmentWithKernel(X,samples,features);
+ X = out.features;
+ features = out.featureCount;
+ }
+
+
+
splittingTree = recurse(X, samples, y, features, depth);
featureCount = features;
+
+ if(useImageKernelPreprocessing){
+ delete[] X;
+ }
}
@@ -174,6 +198,13 @@ int* DecisionTreeClassifier::predict(float* X, int samples, int features) {
throw logic_error("Unable to predict prior to calling fit().");
}
+
+ if(useImageKernelPreprocessing){
+ KernelOutput out = this->kernel->augmentWithKernel(X,samples,features);
+ X = out.features;
+ features = out.featureCount;
+ }
+
if(features != this->featureCount){
throw invalid_argument("Incorrect number of features for prediction.");
}
@@ -196,11 +227,20 @@ int* DecisionTreeClassifier::predict(float* X, int samples, int features) {
predictions[i] = current->getClassification();
}
+ if(useImageKernelPreprocessing){
+ delete [] X;
+ }
+
return predictions;
}
DecisionTreeClassifier::~DecisionTreeClassifier(){
deleteTree(splittingTree);
+
+ if(this->kernel != nullptr){
+ delete kernel;
+ }
+
}
void DecisionTreeClassifier::deleteTree(TreeNode* node){
diff --git a/rewrite/cpp/DecisionTreeClassifier.h b/rewrite/cpp/DecisionTreeClassifier.h
@@ -1,9 +1,10 @@
#include "TreeNode.h"
#include <vector>
+#include "Kernel.h"
class DecisionTreeClassifier{
public:
- DecisionTreeClassifier(int depth);
+ DecisionTreeClassifier(int depth, bool useImageKernelAugmentation);
void fit(float* X, int samples, int* y, int features);
int* predict(float* X, int samples, int features);
std::string getDot();
@@ -16,4 +17,6 @@ class DecisionTreeClassifier{
TreeNode* recurse(float* X, int samples, int* y, int features, int depth);
int primaryClass(int* y, int labelCount);
void deleteTree(TreeNode* node);
+ bool useImageKernelPreprocessing;
+ Kernel* kernel = nullptr;
};
diff --git a/rewrite/cpp/Kernel.cpp b/rewrite/cpp/Kernel.cpp
@@ -0,0 +1,79 @@
+#include "Kernel.h"
+#include <iostream>
+#include <stdexcept>
+#include "math.h"
+
+// verified proper inputs
+Kernel::Kernel(int X_dim, int y_dim, float* kernel){
+
+ if(X_dim <= 2 || y_dim <= 2){
+ throw std::invalid_argument("Kernel has a minimum size of 3x3.");
+ }
+
+ if(X_dim % 2 == 0 || y_dim % 2 == 0){
+ throw std::invalid_argument("Kernel must have odd size edges.");
+ }
+
+ this->X_dim = X_dim;
+ this->y_dim = y_dim;
+ this->kernel = kernel;
+
+ float summed = 0;
+ for(int y = 0 ; y < y_dim; ++y){
+ for(int x = 0 ; x < X_dim; ++x){
+ summed += kernel[x + (X_dim * y)];
+ }
+ }
+
+ this->sum = summed;
+}
+
+KernelOutput Kernel::augmentWithKernel(float* X, int samples, int features){
+ KernelOutput out = KernelOutput();
+ out.featureCount = features * 2;
+
+ float* X_return = new float[features*samples*2];
+
+ for(int i = 0 ; i < samples; ++i){
+ for(int x = 0 ; x < features; ++x){
+ X_return[i*features + x] = X[i*features + x];
+ }
+ }
+
+
+ int currentOffset = samples * features;
+ for(int y = 0 ; y < features; ++y){
+ for(int x = 0 ; x < samples; ++x){
+ X_return[currentOffset] = computeIndex(X, x, y, features, samples);
+ currentOffset += 1;
+ }
+ }
+
+ out.features = X_return;
+
+ return out;
+}
+
+float Kernel::computeIndex(float* X, int xPos, int yPos, int features, int samples){
+
+ float average = 0;
+ int itr = 0;
+ for(int i = 0 ; i < y_dim ; ++i){
+ for(int x = 0; x < X_dim ; ++x){
+
+ int currentX = xPos + x;
+ int currentY = (i + yPos);
+ if(currentX < 0 || currentY < 0 || currentX >= features || currentY >= samples){
+ continue;
+ }
+ float currentKern = kernel[x + i*X_dim];
+ float currentValue = X[xPos + x + ((i + yPos) * features)];
+
+ itr += 1;
+ average += currentValue * currentKern;
+ }
+ }
+ average /= sum;
+
+ return average;
+}
diff --git a/rewrite/cpp/Kernel.h b/rewrite/cpp/Kernel.h
@@ -0,0 +1,16 @@
+struct KernelOutput{
+ float* features;
+ int featureCount;
+};
+
+class Kernel{
+ public:
+ Kernel(int X_dim, int y_dim, float* kernel);
+ KernelOutput augmentWithKernel(float* X, int samples, int features);
+ private:
+ float* kernel;
+ int X_dim;
+ int y_dim;
+ float sum;
+ float computeIndex(float* X, int xPos, int yPos, int features, int samples);
+};
diff --git a/rewrite/cpp/bindings.cpp b/rewrite/cpp/bindings.cpp
@@ -8,7 +8,7 @@ namespace py = pybind11;
PYBIND11_MODULE(decision_tree, m) {
py::class_<DecisionTreeClassifier>(m, "DecisionTreeClassifier")
- .def(py::init<int>())
+ .def(py::init<int, bool>())
.def("fit", [](DecisionTreeClassifier &self, py::array_t<float> X, int samples, py::array_t<int> y, int features) {
auto X_buf = X.request(); // Request a buffer from NumPy array
auto y_buf = y.request(); // Request a buffer from NumPy array
diff --git a/rewrite/cpp/test.cpp b/rewrite/cpp/test.cpp
@@ -0,0 +1,35 @@
+#include "Kernel.h"
+#include "iostream"
+int main(){
+
+
+ float X[] = {
+ 1.0f, 1.0f, 1.0f,
+ 1.0f, 1.0f, 1.0f,
+ 1.0f, 1.0f, 1.0f
+ };
+
+ float input[] = {
+ 1.0f, 1.0f, 1.0f,
+ 1.0f, 5.0f, 1.0f,
+ 1.0f, 1.0f, 1.0f
+ };
+
+ Kernel kern = Kernel(3,3,X);
+ KernelOutput out = kern.augmentWithKernel(input, 3, 3);
+ for(int i = 0 ; i < 3; ++i){
+ for(int x = 0 ; x < 3 ; ++x){
+ std::cout << out.features[i*3 + x] << " ";
+ }
+ std::cout << std::endl;
+ }
+
+ std::cout << std::endl;
+
+ for(int i = 0 ; i < 3; ++i){
+ for(int x = 0 ; x < 3 ; ++x){
+ std::cout << out.features[(i*3 + x) + 9] << " ";
+ }
+ std::cout << std::endl;
+ }
+}
diff --git a/rewrite/output.csv b/rewrite/output.csv
@@ -0,0 +1,29 @@
+,0
+1,0.0945
+10,0.1327142857142857
+20,0.19542857142857142
+30,0.22857142857142856
+40,0.344
+50,0.29642857142857143
+60,0.3717142857142857
+70,0.4052142857142857
+80,0.3412857142857143
+90,0.44307142857142856
+100,0.4712857142857143
+150,0.4295
+200,0.5545
+250,0.5513571428571429
+300,0.5530714285714285
+350,0.5372142857142858
+400,0.5695
+450,0.6051428571428571
+500,0.5994285714285714
+1000,0.6572857142857143
+1500,0.6706428571428571
+2000,0.6952857142857143
+2500,0.7142142857142857
+3000,0.7461428571428571
+3500,0.7458571428571429
+4000,0.7458571428571429
+4500,0.7596428571428572
+5000,0.7695714285714286