decision-tree-classifier

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs | README | LICENSE

commit 037c020a5b0397a611b4024036ce68c48f50fbc8
parent b880d7f052b305b1c9fdd9a3ba4547c472641899
Author: Andrew <andrewlaack1@gmail.com>
Date:   Fri, 20 Dec 2024 20:49:45 -0600

did some comparisons in performance

Diffstat:
ANotes.md | 30++++++++++++++++++++++++++++++
Mclassifier/Testing.py | 29++++++++++++-----------------
Mrewrite/DecisionTreeClassifier.cpp | 13+++++++++----
Mrewrite/DecisionTreeClassifier.h | 2+-
Mrewrite/Makefile | 15++++++++++-----
Mrewrite/Tests.cpp | 15+++++++++------
6 files changed, 71 insertions(+), 33 deletions(-)

diff --git a/Notes.md b/Notes.md @@ -0,0 +1,30 @@ +# Notes as I go + +BOTH RESULT IN THE SAME GRAPHS... + + + + +MAKE BENCHMARKING GRAPHS FOR COURSE + +Making gini impurity calculation in C++ = 44x improvement in compute time + +I am also noticing an insane difference in amount of memory being used... c++ is using nothing!!! + + +AFTER REWRITING INTO C++ vs original C++ + PYTHON IMPLEMENTATION - DO BENCHMARKING + +WITH 10,000 elements each 4 features, the c++ implementation computed the first split in: +27.272 seconds without optimization flags +3.867 with optimization flags + +PYTHON (WITH C++) DID THE SAME IN: +5.183 (user) w/ optimization flags + +WITH 20,000 ELEMENTS, 4 FEATURES: +PYTHON - 17.321 (user) +C++ - 15.271s (user) + +WITH 30,000 ELEMENTS, 4 FEATURES: +PYTHON - 37.5128 (user) +C++ 34.335 (user) diff --git a/classifier/Testing.py b/classifier/Testing.py @@ -1,32 +1,27 @@ from Podtc import PseudoOptimalDecisionTreeClassifier import numpy as np -import plotly.express as px -from sklearn.tree import DecisionTreeClassifier -from sklearn.datasets import load_digits -import pandas as pd -from keras.datasets import mnist -from sklearn.metrics import accuracy_score -from setuptools import setup +##import plotly.express as px +#from sklearn.tree import DecisionTreeClassifier +#from sklearn.datasets import load_digits +#import pandas as pd +#from keras.datasets import mnist +#from sklearn.metrics import accuracy_score +#from setuptools import setup # (train_X, train_y), (test_X, test_y) = mnist.load_data() # train_X = train_X.reshape(-1, 784) # test_X = test_X.reshape(-1, 784) -y_train = [10, 10, 10, 1, 2, 3] -X_train= [ - [1,1,5,3], - [1,2,5,3], - [1,7,5,3], - [1,3,5,3], - [1,7,5,3], - [1,1,5,3] - ] +X_train = np.random.randint(1, 10, size=(30000, 4)) + +# Generate 60,000 labels for y_train (for example, labels between 1 and 10) +y_train = np.random.randint(1, 11, size=30000) # train_X = [[2,5], [5,2], [3,4], [4,4], [5,5], [10, 10], [2,2], [12,12]] # train_y = [1, 1 , 2, 1, 5, 2,1 ,3] -classifier = PseudoOptimalDecisionTreeClassifier(proportionToTrainOn=1, proportionToValidateSplits=1, proportionOfDimsToTrainOn=1, maxDepth=100000); +classifier = PseudoOptimalDecisionTreeClassifier(proportionToTrainOn=1, proportionToValidateSplits=1, proportionOfDimsToTrainOn=1, maxDepth=1); classifier.fit(X_train, y_train) diff --git a/rewrite/DecisionTreeClassifier.cpp b/rewrite/DecisionTreeClassifier.cpp @@ -9,7 +9,7 @@ DecisionTreeClassifier::DecisionTreeClassifier(int maxDepth){ } void DecisionTreeClassifier::fit(float* X, int rows, int* y, int columns){ - splittingTree = recurse(X, rows, y, columns); + splittingTree = recurse(X, rows, y, columns, depth); } @@ -25,7 +25,12 @@ std::string DecisionTreeClassifier::getDot(){ // add depth -TreeNode* DecisionTreeClassifier::recurse(float* X, int rows, int* y, int columns){ +TreeNode* DecisionTreeClassifier::recurse(float* X, int rows, int* y, int columns, int depthRem){ + + if(depthRem == 0){ + TreeNode* ret = new TreeNode(); + return ret; + } // found minimum node if(rows == 1){ @@ -48,9 +53,9 @@ TreeNode* DecisionTreeClassifier::recurse(float* X, int rows, int* y, int column } // traverse lt tree - TreeNode* left = recurse(split.XLeft, split.leftSize, split.yLeft, columns); + TreeNode* left = recurse(split.XLeft, split.leftSize, split.yLeft, columns, depthRem - 1); // traverse gt tree - TreeNode* right = recurse(split.XRight, split.rightSize, split.yRight, columns); + TreeNode* right = recurse(split.XRight, split.rightSize, split.yRight, columns, depthRem - 1); chosen->setLeftChild(left); chosen->setRightChild(right); diff --git a/rewrite/DecisionTreeClassifier.h b/rewrite/DecisionTreeClassifier.h @@ -10,5 +10,5 @@ class DecisionTreeClassifier{ int depth; TreeNode* splittingTree = nullptr; TreeNode* bestSplit(float* X, int rows, int* y, int columns); - TreeNode* recurse(float* X, int rows, int* y, int columns); + TreeNode* recurse(float* X, int rows, int* y, int columns, int depth); }; diff --git a/rewrite/Makefile b/rewrite/Makefile @@ -2,11 +2,16 @@ clean: rm *.o rm *.out -node: - g++ -c TreeNode.cpp -tests: DecisionTreeClassifier.o TreeNode.o - g++ Tests.cpp DecisionTreeClassifier.o TreeNode.o +CXXFLAGS = -O3 -Wall -std=c++17 # Add -O3 for optimization and other flags +LDFLAGS = # Add any linker flags here + +# Targets +node: + g++ $(CXXFLAGS) -c TreeNode.cpp -o TreeNode.o decisionTree: - g++ -c DecisionTreeClassifier.cpp + g++ $(CXXFLAGS) -c DecisionTreeClassifier.cpp -o DecisionTreeClassifier.o + +tests: DecisionTreeClassifier.o TreeNode.o + g++ $(CXXFLAGS) Tests.cpp DecisionTreeClassifier.o TreeNode.o -o a.out diff --git a/rewrite/Tests.cpp b/rewrite/Tests.cpp @@ -36,16 +36,16 @@ int main(){ testTreeNode(); - DecisionTreeClassifier clf(10); + DecisionTreeClassifier clf(1); // Large array of labels for 1000 samples - int labels[1000]; - for (int i = 0; i < 1000; ++i) { + int labels[30000]; + for (int i = 0; i < 30000; ++i) { labels[i] = i % 10; // Example: create labels that cycle through 0 to 9 } // Large array of samples, 1000 samples with 4 features each - float samples[1000][4]; - for (int i = 0; i < 1000; ++i) { + float samples[30000][4]; + for (int i = 0; i < 30000; ++i) { // Fill the samples with some arbitrary data (example: sequential) samples[i][0] = (i % 10) + 1; // Feature 1 (e.g., 1 to 10) samples[i][1] = (i % 5) + 1; // Feature 2 (e.g., 1 to 5) @@ -53,8 +53,11 @@ int main(){ samples[i][3] = (i % 3) + 1; // Feature 4 (e.g., 1 to 3) } + cout << "FITTING" << endl; // Fit the classifier to the data - clf.fit(*samples, 1000, labels, 4); + clf.fit(*samples, 30000, labels, 4); + + return 0; ofstream outFile("decision_tree.dot");