decision-tree-classifier

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs | README | LICENSE

commit b880d7f052b305b1c9fdd9a3ba4547c472641899
parent fef61fdd521f2aead99a8c89953e94d3a8d4ce16
Author: Andrew <andrewlaack1@gmail.com>
Date:   Fri, 20 Dec 2024 20:06:28 -0600

Added graph viz and finished work on fit method.. for now. Would like interpolation.

Diffstat:
Mclassifier/Podtc.py | 8++++++++
Mclassifier/Testing.py | 22++++++++++++++++------
Mrewrite/DecisionTreeClassifier.cpp | 57+++++++++++++++++++++++++++++++++++++++++++++++++++------
Mrewrite/DecisionTreeClassifier.h | 4+++-
Mrewrite/Tests.cpp | 46++++++++++++++++++++++++++++++++--------------
Mrewrite/TreeNode.cpp | 32+++++++++++++++++++++++++++++++-
Mrewrite/TreeNode.h | 2++
7 files changed, 143 insertions(+), 28 deletions(-)

diff --git a/classifier/Podtc.py b/classifier/Podtc.py @@ -87,6 +87,14 @@ class PseudoOptimalDecisionTreeClassifier(): ltArr, gtArr = bestSplit.split(arr=together) + if len(ltArr) == len(together): + classification, elements = self.__classification(ltArr) + return LeafNode(classification, len(ltArr), elements) + + if len(gtArr) == len(together): + classification, elements = self.__classification(gtArr) + return LeafNode(classification, len(gtArr), elements) + # might make sense to simply stop # if the length of either array is 0 # because that means splits aren't doing anything.. diff --git a/classifier/Testing.py b/classifier/Testing.py @@ -8,21 +8,31 @@ from keras.datasets import mnist from sklearn.metrics import accuracy_score from setuptools import setup -(train_X, train_y), (test_X, test_y) = mnist.load_data() +# (train_X, train_y), (test_X, test_y) = mnist.load_data() -train_X = train_X.reshape(-1, 784) -test_X = test_X.reshape(-1, 784) +# train_X = train_X.reshape(-1, 784) +# test_X = test_X.reshape(-1, 784) +y_train = [10, 10, 10, 1, 2, 3] +X_train= [ + [1,1,5,3], + [1,2,5,3], + [1,7,5,3], + [1,3,5,3], + [1,7,5,3], + [1,1,5,3] + ] # train_X = [[2,5], [5,2], [3,4], [4,4], [5,5], [10, 10], [2,2], [12,12]] # train_y = [1, 1 , 2, 1, 5, 2,1 ,3] -classifier = PseudoOptimalDecisionTreeClassifier(proportionToTrainOn=.05, proportionToValidateSplits=.1, proportionOfDimsToTrainOn=.01, maxDepth=2); +classifier = PseudoOptimalDecisionTreeClassifier(proportionToTrainOn=1, proportionToValidateSplits=1, proportionOfDimsToTrainOn=1, maxDepth=100000); -classifier.fit(train_X, train_y) +classifier.fit(X_train, y_train) -# classifier.graph() +classifier.graph() +exit() y_pred = classifier.predict(test_X) print("MY ACCURACY:") diff --git a/rewrite/DecisionTreeClassifier.cpp b/rewrite/DecisionTreeClassifier.cpp @@ -9,19 +9,62 @@ DecisionTreeClassifier::DecisionTreeClassifier(int maxDepth){ } void DecisionTreeClassifier::fit(float* X, int rows, int* y, int columns){ - // IMPORTANT: MUST DEALLOCATE CHOSEN AFTER USE... - cout << "COMPUTING BEST" << endl; + splittingTree = recurse(X, rows, y, columns); +} + + +std::string DecisionTreeClassifier::getDot(){ + if (splittingTree == nullptr){ + throw logic_error("Decision tree must be created prior to generating dot output."); + } + std::string edges = splittingTree->getDotEdges(); + std::string dot = "digraph decisionTree {\n" + edges + "}"; + return dot; +} + + + +// add depth +TreeNode* DecisionTreeClassifier::recurse(float* X, int rows, int* y, int columns){ + + // found minimum node + if(rows == 1){ + TreeNode* ret = new TreeNode(); + return ret; + } + + // get best split option TreeNode* chosen = bestSplit(X, rows, y, columns); - cout << "SPLIT VAL: " << chosen->getSplitVal() << endl; - cout << "INDEX: "<< chosen->getIndexSplit() << endl; + SplitResults split = chosen->splitOnNode(X, y, rows, columns); + + // no valid splits, but we still did create some new arrays. + if(split.rightSize == rows || split.leftSize == rows){ + TreeNode* ret = new TreeNode(); + delete split.XLeft; + delete split.XRight; + delete split.yLeft; + delete split.yRight; + return ret; + } - SplitResults res = chosen->splitOnNode(X,y,rows, columns); + // traverse lt tree + TreeNode* left = recurse(split.XLeft, split.leftSize, split.yLeft, columns); + // traverse gt tree + TreeNode* right = recurse(split.XRight, split.rightSize, split.yRight, columns); - // create recursive helper method. + chosen->setLeftChild(left); + chosen->setRightChild(right); + delete split.XLeft; + delete split.XRight; + delete split.yLeft; + delete split.yRight; + return chosen; } + + // 1 1 0 // 3 3 0 // 2 1 1 @@ -58,3 +101,5 @@ TreeNode* DecisionTreeClassifier::bestSplit(float* X, int rows, int* y, int colu return bestNode; } + + diff --git a/rewrite/DecisionTreeClassifier.h b/rewrite/DecisionTreeClassifier.h @@ -5,8 +5,10 @@ class DecisionTreeClassifier{ DecisionTreeClassifier(int depth); void fit(float* X, int rows, int* y, int columns); int* predict(); + std::string getDot(); private: int depth; + TreeNode* splittingTree = nullptr; TreeNode* bestSplit(float* X, int rows, int* y, int columns); - + TreeNode* recurse(float* X, int rows, int* y, int columns); }; diff --git a/rewrite/Tests.cpp b/rewrite/Tests.cpp @@ -1,4 +1,5 @@ #include "DecisionTreeClassifier.h" +#include <fstream> // For file I/O #include "iostream" #include "assert.h" @@ -15,6 +16,7 @@ void testTreeNode(){ {1,1,5,3} }; + TreeNode tn = TreeNode(5.0f ,1); bool isLeaf = tn.isLeaf(); @@ -34,21 +36,37 @@ int main(){ testTreeNode(); - DecisionTreeClassifier clf = DecisionTreeClassifier(10); - int labels[] = {10, 10, 10, 1, 2, 3}; + DecisionTreeClassifier clf(10); + // Large array of labels for 1000 samples + int labels[1000]; + for (int i = 0; i < 1000; ++i) { + labels[i] = i % 10; // Example: create labels that cycle through 0 to 9 + } - float samples[][4] = { - {1,1,5,3}, - {1,2,5,3}, - {1,7,5,3}, - {1,3,5,3}, - {1,7,5,3}, - {1,1.1,5,3} - }; + // Large array of samples, 1000 samples with 4 features each + float samples[1000][4]; + for (int i = 0; i < 1000; ++i) { + // Fill the samples with some arbitrary data (example: sequential) + samples[i][0] = (i % 10) + 1; // Feature 1 (e.g., 1 to 10) + samples[i][1] = (i % 5) + 1; // Feature 2 (e.g., 1 to 5) + samples[i][2] = (i % 7) + 1; // Feature 3 (e.g., 1 to 7) + samples[i][3] = (i % 3) + 1; // Feature 4 (e.g., 1 to 3) + } + + // Fit the classifier to the data + clf.fit(*samples, 1000, labels, 4); + + ofstream outFile("decision_tree.dot"); - //index 1, split val 3 - //ltCount = 3 - //gteqCount = 3 + // Check if the file is open + if (outFile.is_open()) { + // Write the decision tree to the file + outFile << clf.getDot() << endl; + outFile.close(); // Close the file after writing + cout << "Decision tree written to 'decision_tree.dot'" << endl; + } else { + cerr << "Failed to open the file." << endl; + } - clf.fit(*samples, 6, labels, 4); + return 0; } diff --git a/rewrite/TreeNode.cpp b/rewrite/TreeNode.cpp @@ -3,7 +3,8 @@ #include "unordered_map" #include "math.h" #include "iostream" - +#include <sstream> //for std::stringstream +#include <string> //for std::string TreeNode::TreeNode(){ leaf = true; @@ -195,3 +196,32 @@ float TreeNode::giniImpurity(float* X, int* y, int samples, int features){ return gini; } + + + +std::string TreeNode::getDotEdges(){ + + if(isLeaf()){ + return ""; + } + + std::string current = getDotLabel() + "->" + leftChild->getDotLabel() + ";\n"; + current += getDotLabel() + "->" + rightChild->getDotLabel() + ";\n"; + + current += rightChild->getDotEdges(); + current += leftChild->getDotEdges(); + + return current; +} + +std::string TreeNode::getDotLabel(){ + const void * address = static_cast<const void*>(this); + std::stringstream ss; + ss << address; + std::string name = ss.str(); + if (isLeaf()){ + return "\"" + name + " - LEAF" + "\""; + } + + return "\"" + name + "\nINDEX: " + std::to_string(index) + "\nVALUE:" + std::to_string(splitValue) + "\""; +} diff --git a/rewrite/TreeNode.h b/rewrite/TreeNode.h @@ -23,6 +23,7 @@ class TreeNode{ float getSplitVal(); int getIndexSplit(); SplitResults splitOnNode(float* X, int* y, int samples, int features); + std::string getDotEdges(); private: bool leaf; @@ -31,6 +32,7 @@ class TreeNode{ TreeNode* leftChild; TreeNode* rightChild; float giniImpurity(float* X, int* y, int samples, int features); + std::string getDotLabel(); };