did some comparisons in performance - decision-tree-classifier - Unnamed repository; edit this file 'description' to name the repository.

commit 037c020a5b0397a611b4024036ce68c48f50fbc8
parent b880d7f052b305b1c9fdd9a3ba4547c472641899
Author: Andrew <andrewlaack1@gmail.com>
Date:   Fri, 20 Dec 2024 20:49:45 -0600

did some comparisons in performance

Diffstat:
A Notes.md  | 30 ++++++++++++++++++++++++++++++
M classifier/Testing.py  | 29 ++++++++++++-----------------
M rewrite/DecisionTreeClassifier.cpp  | 13 +++++++++----
M rewrite/DecisionTreeClassifier.h  | 2 +-
M rewrite/Makefile  | 15 ++++++++++-----
M rewrite/Tests.cpp  | 15 +++++++++------

6 files changed, 71 insertions(+), 33 deletions(-)
diff --git a/Notes.md b/Notes.md
@@ -0,0 +1,30 @@
+# Notes as I go
+
+BOTH RESULT IN THE SAME GRAPHS...
+
+
+
+
+MAKE BENCHMARKING GRAPHS FOR COURSE
+
+Making gini impurity calculation in C++ = 44x improvement in compute time
+
+I am also noticing an insane difference in amount of memory being used... c++ is using nothing!!!
+
+
+AFTER REWRITING INTO C++ vs original C++ + PYTHON IMPLEMENTATION - DO BENCHMARKING
+
+WITH 10,000 elements each 4 features, the c++ implementation computed the first split in:
+27.272 seconds without optimization flags
+3.867 with optimization flags
+
+PYTHON (WITH C++) DID THE SAME IN:
+5.183 (user) w/ optimization flags
+
+WITH 20,000 ELEMENTS, 4 FEATURES:
+PYTHON - 17.321 (user)
+C++ - 15.271s (user)
+
+WITH 30,000 ELEMENTS, 4 FEATURES:
+PYTHON - 37.5128 (user)
+C++ 34.335 (user)
diff --git a/classifier/Testing.py b/classifier/Testing.py
@@ -1,32 +1,27 @@
 from Podtc import PseudoOptimalDecisionTreeClassifier
 import numpy as np
-import plotly.express as px
-from sklearn.tree import DecisionTreeClassifier
-from sklearn.datasets import load_digits
-import pandas as pd
-from keras.datasets import mnist
-from sklearn.metrics import accuracy_score
-from setuptools import setup
+##import plotly.express as px
+#from sklearn.tree import DecisionTreeClassifier
+#from sklearn.datasets import load_digits
+#import pandas as pd
+#from keras.datasets import mnist
+#from sklearn.metrics import accuracy_score
+#from setuptools import setup
 
 # (train_X, train_y), (test_X, test_y) = mnist.load_data()
 
 # train_X = train_X.reshape(-1, 784)
 # test_X = test_X.reshape(-1, 784)
 
-y_train  = [10, 10, 10, 1, 2, 3]
-X_train= [
-		[1,1,5,3],
-		[1,2,5,3],
-		[1,7,5,3],
-		[1,3,5,3],
-		[1,7,5,3],
-		[1,1,5,3]
-        ]
+X_train = np.random.randint(1, 10, size=(30000, 4))
+
+# Generate 60,000 labels for y_train (for example, labels between 1 and 10)
+y_train = np.random.randint(1, 11, size=30000)
 
 # train_X = [[2,5], [5,2], [3,4], [4,4], [5,5], [10, 10], [2,2], [12,12]]
 # train_y = [1, 1 , 2, 1, 5, 2,1 ,3]
 
-classifier = PseudoOptimalDecisionTreeClassifier(proportionToTrainOn=1, proportionToValidateSplits=1, proportionOfDimsToTrainOn=1, maxDepth=100000);
+classifier = PseudoOptimalDecisionTreeClassifier(proportionToTrainOn=1, proportionToValidateSplits=1, proportionOfDimsToTrainOn=1, maxDepth=1);
 
 classifier.fit(X_train, y_train)
 
diff --git a/rewrite/DecisionTreeClassifier.cpp b/rewrite/DecisionTreeClassifier.cpp
@@ -9,7 +9,7 @@ DecisionTreeClassifier::DecisionTreeClassifier(int maxDepth){
 }
 
 void DecisionTreeClassifier::fit(float* X, int rows, int* y, int columns){
-	splittingTree = recurse(X, rows, y, columns);
+	splittingTree = recurse(X, rows, y, columns, depth);
 }
 
 
@@ -25,7 +25,12 @@ std::string DecisionTreeClassifier::getDot(){
 
 
 // add depth
-TreeNode* DecisionTreeClassifier::recurse(float* X, int rows, int* y, int columns){
+TreeNode* DecisionTreeClassifier::recurse(float* X, int rows, int* y, int columns, int depthRem){
+
+	if(depthRem == 0){
+		TreeNode* ret = new TreeNode();
+		return ret;
+	}
 
 	// found minimum node
 	if(rows == 1){
@@ -48,9 +53,9 @@ TreeNode* DecisionTreeClassifier::recurse(float* X, int rows, int* y, int column
 	}
 
 	// traverse lt tree
-	TreeNode* left = recurse(split.XLeft, split.leftSize, split.yLeft, columns);
+	TreeNode* left = recurse(split.XLeft, split.leftSize, split.yLeft, columns, depthRem - 1);
 	// traverse gt tree
-	TreeNode* right = recurse(split.XRight, split.rightSize, split.yRight, columns);
+	TreeNode* right = recurse(split.XRight, split.rightSize, split.yRight, columns, depthRem - 1);
 
 	chosen->setLeftChild(left);
 	chosen->setRightChild(right);
diff --git a/rewrite/DecisionTreeClassifier.h b/rewrite/DecisionTreeClassifier.h
@@ -10,5 +10,5 @@ class DecisionTreeClassifier{
 		int depth;
 		TreeNode* splittingTree = nullptr;
 		TreeNode* bestSplit(float* X, int rows, int* y, int columns);
-		TreeNode* recurse(float* X, int rows, int* y, int columns);
+		TreeNode* recurse(float* X, int rows, int* y, int columns, int depth);
 };
diff --git a/rewrite/Makefile b/rewrite/Makefile
@@ -2,11 +2,16 @@ clean:
 	rm *.o
 	rm *.out
 
-node:
-	g++ -c TreeNode.cpp
 
-tests: DecisionTreeClassifier.o TreeNode.o
-	g++ Tests.cpp DecisionTreeClassifier.o TreeNode.o
+CXXFLAGS = -O3 -Wall -std=c++17 # Add -O3 for optimization and other flags
+LDFLAGS = # Add any linker flags here
+
+# Targets
+node:
+	g++ $(CXXFLAGS) -c TreeNode.cpp -o TreeNode.o
 
 decisionTree:
-	g++ -c DecisionTreeClassifier.cpp
+	g++ $(CXXFLAGS) -c DecisionTreeClassifier.cpp -o DecisionTreeClassifier.o
+
+tests: DecisionTreeClassifier.o TreeNode.o
+	g++ $(CXXFLAGS) Tests.cpp DecisionTreeClassifier.o TreeNode.o -o a.out 
diff --git a/rewrite/Tests.cpp b/rewrite/Tests.cpp
@@ -36,16 +36,16 @@ int main(){
 
 	testTreeNode();
 
-    DecisionTreeClassifier clf(10);
+    DecisionTreeClassifier clf(1);
     // Large array of labels for 1000 samples
-    int labels[1000];
-    for (int i = 0; i < 1000; ++i) {
+    int labels[30000];
+    for (int i = 0; i < 30000; ++i) {
         labels[i] = i % 10;  // Example: create labels that cycle through 0 to 9
     }
 
     // Large array of samples, 1000 samples with 4 features each
-    float samples[1000][4];
-    for (int i = 0; i < 1000; ++i) {
+    float samples[30000][4];
+    for (int i = 0; i < 30000; ++i) {
         // Fill the samples with some arbitrary data (example: sequential)
         samples[i][0] = (i % 10) + 1;  // Feature 1 (e.g., 1 to 10)
         samples[i][1] = (i % 5) + 1;   // Feature 2 (e.g., 1 to 5)
@@ -53,8 +53,11 @@ int main(){
         samples[i][3] = (i % 3) + 1;   // Feature 4 (e.g., 1 to 3)
     }
 
+	cout << "FITTING" << endl;
     // Fit the classifier to the data
-    clf.fit(*samples, 1000, labels, 4);
+    clf.fit(*samples, 30000, labels, 4);
+
+	return 0;
 
     ofstream outFile("decision_tree.dot");

	decision-tree-classifier Unnamed repository; edit this file 'description' to name the repository.
	Log \| Files \| Refs \| README \| LICENSE

A	Notes.md	\|	30	++++++++++++++++++++++++++++++
M	classifier/Testing.py	\|	29	++++++++++++-----------------
M	rewrite/DecisionTreeClassifier.cpp	\|	13	+++++++++----
M	rewrite/DecisionTreeClassifier.h	\|	2	+-
M	rewrite/Makefile	\|	15	++++++++++-----
M	rewrite/Tests.cpp	\|	15	+++++++++------