commit 037c020a5b0397a611b4024036ce68c48f50fbc8
parent b880d7f052b305b1c9fdd9a3ba4547c472641899
Author: Andrew <andrewlaack1@gmail.com>
Date: Fri, 20 Dec 2024 20:49:45 -0600
did some comparisons in performance
Diffstat:
6 files changed, 71 insertions(+), 33 deletions(-)
diff --git a/Notes.md b/Notes.md
@@ -0,0 +1,30 @@
+# Notes as I go
+
+BOTH RESULT IN THE SAME GRAPHS...
+
+
+
+
+MAKE BENCHMARKING GRAPHS FOR COURSE
+
+Making gini impurity calculation in C++ = 44x improvement in compute time
+
+I am also noticing an insane difference in amount of memory being used... c++ is using nothing!!!
+
+
+AFTER REWRITING INTO C++ vs original C++ + PYTHON IMPLEMENTATION - DO BENCHMARKING
+
+WITH 10,000 elements each 4 features, the c++ implementation computed the first split in:
+27.272 seconds without optimization flags
+3.867 with optimization flags
+
+PYTHON (WITH C++) DID THE SAME IN:
+5.183 (user) w/ optimization flags
+
+WITH 20,000 ELEMENTS, 4 FEATURES:
+PYTHON - 17.321 (user)
+C++ - 15.271s (user)
+
+WITH 30,000 ELEMENTS, 4 FEATURES:
+PYTHON - 37.5128 (user)
+C++ 34.335 (user)
diff --git a/classifier/Testing.py b/classifier/Testing.py
@@ -1,32 +1,27 @@
from Podtc import PseudoOptimalDecisionTreeClassifier
import numpy as np
-import plotly.express as px
-from sklearn.tree import DecisionTreeClassifier
-from sklearn.datasets import load_digits
-import pandas as pd
-from keras.datasets import mnist
-from sklearn.metrics import accuracy_score
-from setuptools import setup
+##import plotly.express as px
+#from sklearn.tree import DecisionTreeClassifier
+#from sklearn.datasets import load_digits
+#import pandas as pd
+#from keras.datasets import mnist
+#from sklearn.metrics import accuracy_score
+#from setuptools import setup
# (train_X, train_y), (test_X, test_y) = mnist.load_data()
# train_X = train_X.reshape(-1, 784)
# test_X = test_X.reshape(-1, 784)
-y_train = [10, 10, 10, 1, 2, 3]
-X_train= [
- [1,1,5,3],
- [1,2,5,3],
- [1,7,5,3],
- [1,3,5,3],
- [1,7,5,3],
- [1,1,5,3]
- ]
+X_train = np.random.randint(1, 10, size=(30000, 4))
+
+# Generate 60,000 labels for y_train (for example, labels between 1 and 10)
+y_train = np.random.randint(1, 11, size=30000)
# train_X = [[2,5], [5,2], [3,4], [4,4], [5,5], [10, 10], [2,2], [12,12]]
# train_y = [1, 1 , 2, 1, 5, 2,1 ,3]
-classifier = PseudoOptimalDecisionTreeClassifier(proportionToTrainOn=1, proportionToValidateSplits=1, proportionOfDimsToTrainOn=1, maxDepth=100000);
+classifier = PseudoOptimalDecisionTreeClassifier(proportionToTrainOn=1, proportionToValidateSplits=1, proportionOfDimsToTrainOn=1, maxDepth=1);
classifier.fit(X_train, y_train)
diff --git a/rewrite/DecisionTreeClassifier.cpp b/rewrite/DecisionTreeClassifier.cpp
@@ -9,7 +9,7 @@ DecisionTreeClassifier::DecisionTreeClassifier(int maxDepth){
}
void DecisionTreeClassifier::fit(float* X, int rows, int* y, int columns){
- splittingTree = recurse(X, rows, y, columns);
+ splittingTree = recurse(X, rows, y, columns, depth);
}
@@ -25,7 +25,12 @@ std::string DecisionTreeClassifier::getDot(){
// add depth
-TreeNode* DecisionTreeClassifier::recurse(float* X, int rows, int* y, int columns){
+TreeNode* DecisionTreeClassifier::recurse(float* X, int rows, int* y, int columns, int depthRem){
+
+ if(depthRem == 0){
+ TreeNode* ret = new TreeNode();
+ return ret;
+ }
// found minimum node
if(rows == 1){
@@ -48,9 +53,9 @@ TreeNode* DecisionTreeClassifier::recurse(float* X, int rows, int* y, int column
}
// traverse lt tree
- TreeNode* left = recurse(split.XLeft, split.leftSize, split.yLeft, columns);
+ TreeNode* left = recurse(split.XLeft, split.leftSize, split.yLeft, columns, depthRem - 1);
// traverse gt tree
- TreeNode* right = recurse(split.XRight, split.rightSize, split.yRight, columns);
+ TreeNode* right = recurse(split.XRight, split.rightSize, split.yRight, columns, depthRem - 1);
chosen->setLeftChild(left);
chosen->setRightChild(right);
diff --git a/rewrite/DecisionTreeClassifier.h b/rewrite/DecisionTreeClassifier.h
@@ -10,5 +10,5 @@ class DecisionTreeClassifier{
int depth;
TreeNode* splittingTree = nullptr;
TreeNode* bestSplit(float* X, int rows, int* y, int columns);
- TreeNode* recurse(float* X, int rows, int* y, int columns);
+ TreeNode* recurse(float* X, int rows, int* y, int columns, int depth);
};
diff --git a/rewrite/Makefile b/rewrite/Makefile
@@ -2,11 +2,16 @@ clean:
rm *.o
rm *.out
-node:
- g++ -c TreeNode.cpp
-tests: DecisionTreeClassifier.o TreeNode.o
- g++ Tests.cpp DecisionTreeClassifier.o TreeNode.o
+CXXFLAGS = -O3 -Wall -std=c++17 # Add -O3 for optimization and other flags
+LDFLAGS = # Add any linker flags here
+
+# Targets
+node:
+ g++ $(CXXFLAGS) -c TreeNode.cpp -o TreeNode.o
decisionTree:
- g++ -c DecisionTreeClassifier.cpp
+ g++ $(CXXFLAGS) -c DecisionTreeClassifier.cpp -o DecisionTreeClassifier.o
+
+tests: DecisionTreeClassifier.o TreeNode.o
+ g++ $(CXXFLAGS) Tests.cpp DecisionTreeClassifier.o TreeNode.o -o a.out
diff --git a/rewrite/Tests.cpp b/rewrite/Tests.cpp
@@ -36,16 +36,16 @@ int main(){
testTreeNode();
- DecisionTreeClassifier clf(10);
+ DecisionTreeClassifier clf(1);
// Large array of labels for 1000 samples
- int labels[1000];
- for (int i = 0; i < 1000; ++i) {
+ int labels[30000];
+ for (int i = 0; i < 30000; ++i) {
labels[i] = i % 10; // Example: create labels that cycle through 0 to 9
}
// Large array of samples, 1000 samples with 4 features each
- float samples[1000][4];
- for (int i = 0; i < 1000; ++i) {
+ float samples[30000][4];
+ for (int i = 0; i < 30000; ++i) {
// Fill the samples with some arbitrary data (example: sequential)
samples[i][0] = (i % 10) + 1; // Feature 1 (e.g., 1 to 10)
samples[i][1] = (i % 5) + 1; // Feature 2 (e.g., 1 to 5)
@@ -53,8 +53,11 @@ int main(){
samples[i][3] = (i % 3) + 1; // Feature 4 (e.g., 1 to 3)
}
+ cout << "FITTING" << endl;
// Fit the classifier to the data
- clf.fit(*samples, 1000, labels, 4);
+ clf.fit(*samples, 30000, labels, 4);
+
+ return 0;
ofstream outFile("decision_tree.dot");