Refactor - decision-tree-classifier - Unnamed repository; edit this file 'description' to name the repository.

commit 29450ced42ecb1bd98d0e3116f10f4e085c2fd31
parent fc97d111faf5f8f0a79d83ae3734f5a7d72a4de4
Author: Andrew <andrewlaack1@gmail.com>
Date:   Mon, 30 Dec 2024 23:20:53 -0600

Refactor

Diffstat:
D Notes.md  | 67 -------------------------------------------------------------------
M README.md  | 5 ++---
R rewrite/.~lock.output.xlsx# -> cppDecisionTree/.~lock.output.xlsx#  | 0 
R rewrite/CMakeLists.txt -> cppDecisionTree/CMakeLists.txt  | 0 
R rewrite/Makefile -> cppDecisionTree/Makefile  | 0 
A cppDecisionTree/Usage.py  | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
R rewrite/cpp/Criterion.cpp -> cppDecisionTree/cpp/Criterion.cpp  | 0 
R rewrite/cpp/Criterion.h -> cppDecisionTree/cpp/Criterion.h  | 0 
R rewrite/cpp/DecisionTreeClassifier.cpp -> cppDecisionTree/cpp/DecisionTreeClassifier.cpp  | 0 
R rewrite/cpp/DecisionTreeClassifier.h -> cppDecisionTree/cpp/DecisionTreeClassifier.h  | 0 
R rewrite/cpp/TreeNode.cpp -> cppDecisionTree/cpp/TreeNode.cpp  | 0 
R rewrite/cpp/TreeNode.h -> cppDecisionTree/cpp/TreeNode.h  | 0 
R rewrite/cpp/bindings.cpp -> cppDecisionTree/cpp/bindings.cpp  | 0 
D rewrite/Test.py  | 87 -------------------------------------------------------------------------------
D rewrite/cpp/ObliqueNode.h  | 40 ----------------------------------------
D rewrite/cpp/OptimalObliqueTree.h  | 17 -----------------

16 files changed, 62 insertions(+), 214 deletions(-)
diff --git a/Notes.md b/Notes.md
@@ -1,67 +0,0 @@
-# Notes as I go
-
-BOTH RESULT IN THE SAME GRAPHS...
-
-
-
-
-MAKE BENCHMARKING GRAPHS FOR COURSE
-
-Making gini impurity calculation in C++ = 44x improvement in compute time
-
-I am also noticing an insane difference in amount of memory being used... c++ is using nothing!!!
-
-
-AFTER REWRITING INTO C++ vs original C++ + PYTHON IMPLEMENTATION - DO BENCHMARKING
-
-WITH 10,000 elements each 4 features, the c++ implementation computed the first split in:
-27.272 seconds without optimization flags
-3.867 with optimization flags
-
-PYTHON (WITH C++) DID THE SAME IN:
-5.183 (user) w/ optimization flags
-
-WITH 20,000 ELEMENTS, 4 FEATURES:
-PYTHON - 17.321 (user)
-C++ - 15.271s (user)
-
-WITH 30,000 ELEMENTS, 4 FEATURES:
-PYTHON - 37.5128 (user)
-C++ 34.335 (user)
-
-
-WITH WORKLOAD OF 6000 SAMPLES AND 784 FEATURES:
-
-MAX MEMORY C++ - 24628kB
-MAX MEMORY Python - 521764kB
-
-ADD BENCHMARKING WITHOUT LIKELY AND THEN WITH.
-
-20,000 Samples, 4 Features:
-    WITHOUT LIKELY: 15.526 seconds (user), 15.632 seconds (user)
-    WITH LIKELY: 15.524 seconds (user), 15.608s (user)
-
-random list 1,000 elements, 784 dimensions, depth = 1
-    python and c++: 24.258 (user)
-    just c++ : 10.657 (user)
-
-same as above but with 3,000 elements:
-    python and c++: 1m 55s (user)
-    just c++ :  1m 37s (user)
-
-
----
-
-1,000 samples with 784 dimensions, depth = 1:
-    c++ single thread (python time):
-        10.33 seconds
-        10.33 seconds
-        10.3526 seconds
-    c++ with multithreading (python time):
-        1.852 seconds
-        1.901 seconds
-        1.886 seconds
-
-HOLY SHIT!! This pins all 6 cpu cores.
-
-
diff --git a/README.md b/README.md
@@ -1,3 +1,3 @@
-# pseudo-optimal-decision-tree
+# CART Decision Tree Classifier
 
-Pseudo optimal decision tree repository-
\ No newline at end of file
+Sample implementation of CART decision tree classifier written in C++ with hooks into python using pybind11.
diff --git a/rewrite/.~lock.output.xlsx# b/cppDecisionTree/.~lock.output.xlsx#
diff --git a/rewrite/CMakeLists.txt b/cppDecisionTree/CMakeLists.txt
diff --git a/rewrite/Makefile b/cppDecisionTree/Makefile
diff --git a/cppDecisionTree/Usage.py b/cppDecisionTree/Usage.py
@@ -0,0 +1,60 @@
+import numpy as np
+import pandas as pd
+from decision_tree import DecisionTreeClassifier
+from sklearn.metrics import accuracy_score
+from sklearn.datasets import fetch_openml
+from sklearn.model_selection import train_test_split
+from sklearn.decomposition import PCA
+import time
+import matplotlib.pyplot as plt
+
+mnist = fetch_openml("mnist_784", version=1)
+X, y = mnist["data"], mnist["target"]
+X = X.astype(np.float32)  # Ensure data is in float64 format for the classifier
+y = y.astype(np.int32)    # Ensure target labels are integers
+
+accLs = []
+xVals = []
+
+SEED = 110
+
+np.random.seed(SEED)
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)
+y_train = np.array(y_train)
+X_train = np.array(X_train)
+X_test = np.array(X_test)
+
+for i in range(1, 11001):
+    if (i <= 100 and i % 10 == 0) or ((i) % 500 == 0) or (i == 1) or (i <= 500 and i % 50 == 0):
+        X_train_pca = np.array(X_train).copy()
+        X_test_pca = np.array(X_test).copy()
+
+
+
+        num_samples = i
+        random_indices = np.random.choice(len(X_train_pca) - 1, num_samples, replace=False)
+
+
+        X_train_pca = X_train_pca[random_indices]
+        y_train_current = y_train[random_indices]
+
+        clf = DecisionTreeClassifier(100)
+
+        start_time = time.time()
+        clf.fit(X_train_pca, len(X_train_pca), y_train_current, len(X_train_pca[0]))
+        end_time = time.time()
+        elapsed_time = end_time - start_time
+        print(f"{i}: Time taken: {elapsed_time:.4f} seconds")
+        y_pred = clf.predict(X_test_pca, len(X_test_pca), len(X_test_pca[0]))
+        accuracy = accuracy_score(y_test, y_pred)
+        print(f"Accuracy of Decision Tree Classifier on MNIST: {accuracy * 100:.2f}%")
+
+        accLs.append(accuracy)
+        xVals.append(i)
+
+df = pd.DataFrame(accLs, xVals)
+df.to_csv('output.csv') 
+
+
+plt.plot(xVals, accLs)
+plt.show()
diff --git a/rewrite/cpp/Criterion.cpp b/cppDecisionTree/cpp/Criterion.cpp
diff --git a/rewrite/cpp/Criterion.h b/cppDecisionTree/cpp/Criterion.h
diff --git a/rewrite/cpp/DecisionTreeClassifier.cpp b/cppDecisionTree/cpp/DecisionTreeClassifier.cpp
diff --git a/rewrite/cpp/DecisionTreeClassifier.h b/cppDecisionTree/cpp/DecisionTreeClassifier.h
diff --git a/rewrite/cpp/TreeNode.cpp b/cppDecisionTree/cpp/TreeNode.cpp
diff --git a/rewrite/cpp/TreeNode.h b/cppDecisionTree/cpp/TreeNode.h
diff --git a/rewrite/cpp/bindings.cpp b/cppDecisionTree/cpp/bindings.cpp
diff --git a/rewrite/Test.py b/rewrite/Test.py
@@ -1,87 +0,0 @@
-import numpy as np
-import pandas as pd
-from decision_tree import DecisionTreeClassifier
-from sklearn.metrics import accuracy_score
-from sklearn.datasets import fetch_openml
-from sklearn.model_selection import train_test_split
-from sklearn.decomposition import PCA
-import time
-import matplotlib.pyplot as plt
-import cv2  # OpenCV is required for Sobel operations
-
-mnist = fetch_openml("mnist_784", version=1)
-X, y = mnist["data"], mnist["target"]
-X = X.astype(np.float32)  # Ensure data is in float64 format for the classifier
-y = y.astype(np.int32)    # Ensure target labels are integers
-
-accLs = []
-xVals = []
-pca = PCA(n_components=784)
-
-SEED = 110
-
-np.random.seed(SEED)
-X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)
-y_train = np.array(y_train)
-X_train = np.array(X_train)
-X_test = np.array(X_test)
-
-def append_sobel_features(X):
-    # Assume X is a 3D array with shape (n_samples, height, width) for image data
-    sobel_features = []
-    for sample in X:
-        # Compute Sobel edges along x and y directions
-        sobel_x = cv2.Sobel(sample, cv2.CV_64F, 1, 0, ksize=3)
-        sobel_y = cv2.Sobel(sample, cv2.CV_64F, 0, 1, ksize=3)
-        sobel_combined = np.sqrt(sobel_x**2 + sobel_y**2)  # Combine both directions
-        sobel_combined = sobel_combined.flatten()  # Flatten the 2D Sobel result
-        
-        # Append to list
-        sobel_features.append(sobel_combined)
-    
-    # Convert to NumPy array
-    sobel_features = np.array(sobel_features)
-    return np.hstack((X.reshape(len(X), -1), sobel_features))
-
-# Apply the function to X_train and X_test
-X_train = append_sobel_features(X_train)
-X_test = append_sobel_features(X_test)
-
-print(X_train.shape)
-print(X_test.shape)
-
-
-for i in range(1000, 5001):
-    if (i <= 100 and i % 10 == 0) or ((i) % 500 == 0) or (i == 1) or (i <= 500 and i % 50 == 0):
-        X_train_pca = np.array(X_train).copy()
-        X_test_pca = np.array(X_test).copy()
-
-
-
-        num_samples = i
-        random_indices = np.random.choice(len(X_train_pca) - 1, num_samples, replace=False)
-
-
-        X_train_pca = X_train_pca[random_indices]
-        y_train_current = y_train[random_indices]
-
-        clf = DecisionTreeClassifier(100)
-
-        start_time = time.time()
-        clf.fit(X_train_pca, len(X_train_pca), y_train_current, len(X_train_pca[0]))
-        end_time = time.time()
-        elapsed_time = end_time - start_time
-        print(f"{i}: Time taken: {elapsed_time:.4f} seconds")
-        y_pred = clf.predict(X_test_pca, len(X_test_pca), len(X_test_pca[0]))
-        accuracy = accuracy_score(y_test, y_pred)
-        print(f"Accuracy of Decision Tree Classifier on MNIST: {accuracy * 100:.2f}%")
-
-        accLs.append(accuracy)
-        xVals.append(i)
-
-df = pd.DataFrame(accLs, xVals)
-df.to_csv('output.csv') 
-
-
-plt.plot(xVals, accLs)
-plt.show()
diff --git a/rewrite/cpp/ObliqueNode.h b/rewrite/cpp/ObliqueNode.h
@@ -1,40 +0,0 @@
-#include "string"
-
-struct SplitResults{
-	float* XLeft;
-	float* XRight;
-	int* yLeft;
-	int* yRight;
-	int leftSize;
-	int rightSize;
-};
-
-class ObliqueNode{
-	public:
-		ObliqueNode(int classification);
-		ObliqueNode(float splittingVal, int featureIndex);
-		bool isLeaf();
-		void setSplit(float splittingValue, int featureIndex);
-		float evalSplit(float* X, int* y, int samples, int features, std::string criterion);
-		ObliqueNode* getLeftChild();
-		ObliqueNode* getRightChild();
-		void setLeftChild(ObliqueNode* child);
-		void setRightChild(ObliqueNode* child);
-		float getSplitVal();
-		int getIndexSplit();
-		SplitResults splitOnNode(float* X, int* y, int samples, int features);
-		std::string getDotEdges();
-		int getClassification();
-		bool lessThan(float* sample, int features);
-
-	private:
-		bool leaf;
-		float splitValue;
-		int index;
-		ObliqueNode* leftChild = nullptr;
-		ObliqueNode* rightChild = nullptr;
-		std::string getDotLabel();
-		int classification;
-};
-
-
diff --git a/rewrite/cpp/OptimalObliqueTree.h b/rewrite/cpp/OptimalObliqueTree.h
@@ -1,17 +0,0 @@
-#include "ObliqueNode.h"
-
-class OptimalObliqueTree{
-	public:
-		OptimalObliqueTree(int depth);
-		void fit(float* X, int samples, int* y, int features);
-		int* predict(float* X, int samples, int features);
-		std::string getDot();
-	private:
-		int depth;
-		int featureCount = -1;
-		ObliqueNode* splittingTree = nullptr;
-		ObliqueNode* bestSplit(float* X, int samples, int* y, int features);
-		ObliqueNode* recurse(float* X, int samples, int* y, int features, int depth);
-		int primaryClass(int* y, int labelCount);
-		void deleteTree(ObliqueNode* node);
-};

	decision-tree-classifier Unnamed repository; edit this file 'description' to name the repository.
	Log \| Files \| Refs \| README \| LICENSE

D	Notes.md	\|	67	-------------------------------------------------------------------
M	README.md	\|	5	++---
R	rewrite/.~lock.output.xlsx# -> cppDecisionTree/.~lock.output.xlsx#	\|	0
R	rewrite/CMakeLists.txt -> cppDecisionTree/CMakeLists.txt	\|	0
R	rewrite/Makefile -> cppDecisionTree/Makefile	\|	0
A	cppDecisionTree/Usage.py	\|	60	++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
R	rewrite/cpp/Criterion.cpp -> cppDecisionTree/cpp/Criterion.cpp	\|	0
R	rewrite/cpp/Criterion.h -> cppDecisionTree/cpp/Criterion.h	\|	0
R	rewrite/cpp/DecisionTreeClassifier.cpp -> cppDecisionTree/cpp/DecisionTreeClassifier.cpp	\|	0
R	rewrite/cpp/DecisionTreeClassifier.h -> cppDecisionTree/cpp/DecisionTreeClassifier.h	\|	0
R	rewrite/cpp/TreeNode.cpp -> cppDecisionTree/cpp/TreeNode.cpp	\|	0
R	rewrite/cpp/TreeNode.h -> cppDecisionTree/cpp/TreeNode.h	\|	0
R	rewrite/cpp/bindings.cpp -> cppDecisionTree/cpp/bindings.cpp	\|	0
D	rewrite/Test.py	\|	87	-------------------------------------------------------------------------------
D	rewrite/cpp/ObliqueNode.h	\|	40	----------------------------------------
D	rewrite/cpp/OptimalObliqueTree.h	\|	17	-----------------