commit b880d7f052b305b1c9fdd9a3ba4547c472641899
parent fef61fdd521f2aead99a8c89953e94d3a8d4ce16
Author: Andrew <andrewlaack1@gmail.com>
Date: Fri, 20 Dec 2024 20:06:28 -0600
Added graph viz and finished work on fit method.. for now. Would like interpolation.
Diffstat:
7 files changed, 143 insertions(+), 28 deletions(-)
diff --git a/classifier/Podtc.py b/classifier/Podtc.py
@@ -87,6 +87,14 @@ class PseudoOptimalDecisionTreeClassifier():
ltArr, gtArr = bestSplit.split(arr=together)
+ if len(ltArr) == len(together):
+ classification, elements = self.__classification(ltArr)
+ return LeafNode(classification, len(ltArr), elements)
+
+ if len(gtArr) == len(together):
+ classification, elements = self.__classification(gtArr)
+ return LeafNode(classification, len(gtArr), elements)
+
# might make sense to simply stop
# if the length of either array is 0
# because that means splits aren't doing anything..
diff --git a/classifier/Testing.py b/classifier/Testing.py
@@ -8,21 +8,31 @@ from keras.datasets import mnist
from sklearn.metrics import accuracy_score
from setuptools import setup
-(train_X, train_y), (test_X, test_y) = mnist.load_data()
+# (train_X, train_y), (test_X, test_y) = mnist.load_data()
-train_X = train_X.reshape(-1, 784)
-test_X = test_X.reshape(-1, 784)
+# train_X = train_X.reshape(-1, 784)
+# test_X = test_X.reshape(-1, 784)
+y_train = [10, 10, 10, 1, 2, 3]
+X_train= [
+ [1,1,5,3],
+ [1,2,5,3],
+ [1,7,5,3],
+ [1,3,5,3],
+ [1,7,5,3],
+ [1,1,5,3]
+ ]
# train_X = [[2,5], [5,2], [3,4], [4,4], [5,5], [10, 10], [2,2], [12,12]]
# train_y = [1, 1 , 2, 1, 5, 2,1 ,3]
-classifier = PseudoOptimalDecisionTreeClassifier(proportionToTrainOn=.05, proportionToValidateSplits=.1, proportionOfDimsToTrainOn=.01, maxDepth=2);
+classifier = PseudoOptimalDecisionTreeClassifier(proportionToTrainOn=1, proportionToValidateSplits=1, proportionOfDimsToTrainOn=1, maxDepth=100000);
-classifier.fit(train_X, train_y)
+classifier.fit(X_train, y_train)
-# classifier.graph()
+classifier.graph()
+exit()
y_pred = classifier.predict(test_X)
print("MY ACCURACY:")
diff --git a/rewrite/DecisionTreeClassifier.cpp b/rewrite/DecisionTreeClassifier.cpp
@@ -9,19 +9,62 @@ DecisionTreeClassifier::DecisionTreeClassifier(int maxDepth){
}
void DecisionTreeClassifier::fit(float* X, int rows, int* y, int columns){
- // IMPORTANT: MUST DEALLOCATE CHOSEN AFTER USE...
- cout << "COMPUTING BEST" << endl;
+ splittingTree = recurse(X, rows, y, columns);
+}
+
+
+std::string DecisionTreeClassifier::getDot(){
+ if (splittingTree == nullptr){
+ throw logic_error("Decision tree must be created prior to generating dot output.");
+ }
+ std::string edges = splittingTree->getDotEdges();
+ std::string dot = "digraph decisionTree {\n" + edges + "}";
+ return dot;
+}
+
+
+
+// add depth
+TreeNode* DecisionTreeClassifier::recurse(float* X, int rows, int* y, int columns){
+
+ // found minimum node
+ if(rows == 1){
+ TreeNode* ret = new TreeNode();
+ return ret;
+ }
+
+ // get best split option
TreeNode* chosen = bestSplit(X, rows, y, columns);
- cout << "SPLIT VAL: " << chosen->getSplitVal() << endl;
- cout << "INDEX: "<< chosen->getIndexSplit() << endl;
+ SplitResults split = chosen->splitOnNode(X, y, rows, columns);
+
+ // no valid splits, but we still did create some new arrays.
+ if(split.rightSize == rows || split.leftSize == rows){
+ TreeNode* ret = new TreeNode();
+ delete split.XLeft;
+ delete split.XRight;
+ delete split.yLeft;
+ delete split.yRight;
+ return ret;
+ }
- SplitResults res = chosen->splitOnNode(X,y,rows, columns);
+ // traverse lt tree
+ TreeNode* left = recurse(split.XLeft, split.leftSize, split.yLeft, columns);
+ // traverse gt tree
+ TreeNode* right = recurse(split.XRight, split.rightSize, split.yRight, columns);
- // create recursive helper method.
+ chosen->setLeftChild(left);
+ chosen->setRightChild(right);
+ delete split.XLeft;
+ delete split.XRight;
+ delete split.yLeft;
+ delete split.yRight;
+ return chosen;
}
+
+
// 1 1 0
// 3 3 0
// 2 1 1
@@ -58,3 +101,5 @@ TreeNode* DecisionTreeClassifier::bestSplit(float* X, int rows, int* y, int colu
return bestNode;
}
+
+
diff --git a/rewrite/DecisionTreeClassifier.h b/rewrite/DecisionTreeClassifier.h
@@ -5,8 +5,10 @@ class DecisionTreeClassifier{
DecisionTreeClassifier(int depth);
void fit(float* X, int rows, int* y, int columns);
int* predict();
+ std::string getDot();
private:
int depth;
+ TreeNode* splittingTree = nullptr;
TreeNode* bestSplit(float* X, int rows, int* y, int columns);
-
+ TreeNode* recurse(float* X, int rows, int* y, int columns);
};
diff --git a/rewrite/Tests.cpp b/rewrite/Tests.cpp
@@ -1,4 +1,5 @@
#include "DecisionTreeClassifier.h"
+#include <fstream> // For file I/O
#include "iostream"
#include "assert.h"
@@ -15,6 +16,7 @@ void testTreeNode(){
{1,1,5,3}
};
+
TreeNode tn = TreeNode(5.0f ,1);
bool isLeaf = tn.isLeaf();
@@ -34,21 +36,37 @@ int main(){
testTreeNode();
- DecisionTreeClassifier clf = DecisionTreeClassifier(10);
- int labels[] = {10, 10, 10, 1, 2, 3};
+ DecisionTreeClassifier clf(10);
+ // Large array of labels for 1000 samples
+ int labels[1000];
+ for (int i = 0; i < 1000; ++i) {
+ labels[i] = i % 10; // Example: create labels that cycle through 0 to 9
+ }
- float samples[][4] = {
- {1,1,5,3},
- {1,2,5,3},
- {1,7,5,3},
- {1,3,5,3},
- {1,7,5,3},
- {1,1.1,5,3}
- };
+ // Large array of samples, 1000 samples with 4 features each
+ float samples[1000][4];
+ for (int i = 0; i < 1000; ++i) {
+ // Fill the samples with some arbitrary data (example: sequential)
+ samples[i][0] = (i % 10) + 1; // Feature 1 (e.g., 1 to 10)
+ samples[i][1] = (i % 5) + 1; // Feature 2 (e.g., 1 to 5)
+ samples[i][2] = (i % 7) + 1; // Feature 3 (e.g., 1 to 7)
+ samples[i][3] = (i % 3) + 1; // Feature 4 (e.g., 1 to 3)
+ }
+
+ // Fit the classifier to the data
+ clf.fit(*samples, 1000, labels, 4);
+
+ ofstream outFile("decision_tree.dot");
- //index 1, split val 3
- //ltCount = 3
- //gteqCount = 3
+ // Check if the file is open
+ if (outFile.is_open()) {
+ // Write the decision tree to the file
+ outFile << clf.getDot() << endl;
+ outFile.close(); // Close the file after writing
+ cout << "Decision tree written to 'decision_tree.dot'" << endl;
+ } else {
+ cerr << "Failed to open the file." << endl;
+ }
- clf.fit(*samples, 6, labels, 4);
+ return 0;
}
diff --git a/rewrite/TreeNode.cpp b/rewrite/TreeNode.cpp
@@ -3,7 +3,8 @@
#include "unordered_map"
#include "math.h"
#include "iostream"
-
+#include <sstream> //for std::stringstream
+#include <string> //for std::string
TreeNode::TreeNode(){
leaf = true;
@@ -195,3 +196,32 @@ float TreeNode::giniImpurity(float* X, int* y, int samples, int features){
return gini;
}
+
+
+
+std::string TreeNode::getDotEdges(){
+
+ if(isLeaf()){
+ return "";
+ }
+
+ std::string current = getDotLabel() + "->" + leftChild->getDotLabel() + ";\n";
+ current += getDotLabel() + "->" + rightChild->getDotLabel() + ";\n";
+
+ current += rightChild->getDotEdges();
+ current += leftChild->getDotEdges();
+
+ return current;
+}
+
+std::string TreeNode::getDotLabel(){
+ const void * address = static_cast<const void*>(this);
+ std::stringstream ss;
+ ss << address;
+ std::string name = ss.str();
+ if (isLeaf()){
+ return "\"" + name + " - LEAF" + "\"";
+ }
+
+ return "\"" + name + "\nINDEX: " + std::to_string(index) + "\nVALUE:" + std::to_string(splitValue) + "\"";
+}
diff --git a/rewrite/TreeNode.h b/rewrite/TreeNode.h
@@ -23,6 +23,7 @@ class TreeNode{
float getSplitVal();
int getIndexSplit();
SplitResults splitOnNode(float* X, int* y, int samples, int features);
+ std::string getDotEdges();
private:
bool leaf;
@@ -31,6 +32,7 @@ class TreeNode{
TreeNode* leftChild;
TreeNode* rightChild;
float giniImpurity(float* X, int* y, int samples, int features);
+ std::string getDotLabel();
};