Finished working on tree creation logic and printing - decision-tree-classifier - Unnamed repository; edit this file 'description' to name the repository.

commit 8c9198bcbb729ddcf7ac3909048f3f65f7ae7fba
parent 0668e37fc1e73ed4bda57088244ff0f8dd0b3494
Author: Andrew <andrewlaack1@gmail.com>
Date:   Wed, 18 Dec 2024 00:30:28 -0600

Finished working on tree creation logic and printing

Diffstat:
M classifier/Podtc.py  | 106 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------
M classifier/SplittingNode.py  | 15 ++++++++-------
M classifier/Testing.py  | 16 +++++++++-------

3 files changed, 102 insertions(+), 35 deletions(-)
diff --git a/classifier/Podtc.py b/classifier/Podtc.py
@@ -2,6 +2,7 @@ import numpy as np
 from tqdm import tqdm
 from SplittingNode import SplittingNode
 import math
+import graphviz
 
 class PseudoOptimalDecisionTreeClassifier():
 
@@ -9,11 +10,11 @@ class PseudoOptimalDecisionTreeClassifier():
     # First is first split, last is ... well yeah.
 
     proportionToTrainOn = 0
-    splitList = []
     threshold = 0
     maxDepth = 0
     propValSplits = 0
     propDimsTrain = .5
+    bestSplit = None
 
     def __init__(self, pruneThreshold = .2, maxDepth = 5, proportionToTrainOn=.5, proportionToValidateSplits=.5, proportionOfDimsToTrainOn=.5):
         self.threshold = pruneThreshold 
@@ -34,7 +35,7 @@ class PseudoOptimalDecisionTreeClassifier():
         if(proportionOfDimsToTrainOn > 1 or proportionOfDimsToTrainOn <= 0):
             raise Exception(f"Proportion of dimensions to train on {proportionToValidateSplits}, is not valid. Select a proportion in the range of (0,1]")
 
-        self.propDimsTrain = proportionOfDimsToTrainOn
+        self.propDimsTrain = proportionOfDimsToTrainOn 
         return
 
     def fit(self, X,  y):
@@ -45,35 +46,59 @@ class PseudoOptimalDecisionTreeClassifier():
         # together [:,-1] == y
         together = np.append(X,y_re, axis=1)
 
-        self.splitList.append(self._best_split(together, self.proportionToTrainOn, self.propDimsTrain))
+        dims = self.findDims(together)
 
-        ltArr = np.array([])
-        gtArr = np.array([])
-        ltArr, gtArr = self.splitList[-1].split(X)
+        self.bestSplit = self.recurse(together, self.maxDepth, dims)
 
-        print(self.splitList[0])
         return
 
+    def findDims(self, together):
 
+        dimCount = len(together[0]) - 1
+        dims = np.arange(dimCount)
+        dimsToSample = math.ceil(dimCount * self.propDimsTrain)
+        if(dimsToSample != dimCount):
+            dims = dimsWithMostVar(dimsToSample, together)
 
-    # pass in current root
-    # find best options from then on
+        return dims
 
-    # Find best split
-    def _best_split(self, together, proportionUsed, propOfDims):
+    def recurse(self, together, depth, dims):
+        
+        if(depth == 0):
+            return None
 
-        dimCount = len(together[0]) - 1
-        dims = np.arange(dimCount)
+        bestSplit, ltGini, gtGini = self._best_split(together, self.proportionToTrainOn, dims)
 
-        dimsToSample = math.ceil(dimCount * propOfDims)
+        if bestSplit is None:
+            raise ValueError("bestSplit cannot be None")
 
-        if(dimsToSample != dimCount):
-            dims = dimsWithMostVar(dimsToSample, together)
+        ltArr, gtArr = bestSplit.split(arr=together)
 
 
+        blt = None
+        bgt = None
+
+        if len(ltArr) > 1 and ltGini > 0:
+            blt = self.recurse(ltArr, depth - 1, dims)
+            bestSplit.leftChild = blt
+
+        if len(gtArr) > 1 and gtGini > 0:
+            bgt = self.recurse(gtArr, depth - 1, dims)
+            bestSplit.rightChild= bgt
+
+        return bestSplit
+
+    # pass in current root
+    # find best options from then on
+
+    # Find best split
+    def _best_split(self, together, proportionUsed, dims):
 
         bestGini = float("inf")
         bestNode  = None
+        blg = float("inf")
+        bgg = float("inf")
+
 
         # columns (excluding y)
         for x in tqdm(dims):
@@ -90,11 +115,15 @@ class PseudoOptimalDecisionTreeClassifier():
                 splitOn = ((together[currentSample+1][x] - together[currentSample][x]) / 2) + together[currentSample][x]
                 split = SplittingNode(x, splitOn)
                 current = split.gini(together, self.propValSplits)
-                if current < bestGini:
+                if current[0] < bestGini:
                     bestNode = split
-                    bestGini = current
+                    bestGini = current[0]
+                    blg = current[1]
+                    bgg = current[2]
 
-        return bestNode
+        # Return the best node, the left gini impurity, and right gini impurity.
+        # These impurities allow for us to stop if we have a pure node.
+        return (bestNode, blg, bgg)
 
     def predict(self):
         print("TODO")
@@ -119,17 +148,52 @@ class PseudoOptimalDecisionTreeClassifier():
             raise Exception(f"X must contain more than one sample.")
         return
 
+    def graph(self):    
+        if self.bestSplit == None:
+            raise Exception(f"Unable to create graph of classifier, call fit first.")
+
+        graph = graphviz.Digraph()
+        graph = createGraph(self.bestSplit, graph)
+        graph.render('whatever', format='png', view=True)
+
+def createGraph(node, graph):
+    graph.node(str(node.__hash__()), str(node))
+    traverseForGraph(node, graph)
+    return graph
+
+def traverseForGraph(node : SplittingNode, graph):
+
+    cid = str(node.__hash__())
+
+    if node.leftChild != None:
+        graph.node(str(node.leftChild.__hash__()), str(node.leftChild))
+        graph.edge(cid, str(node.leftChild.__hash__()))
+        traverseForGraph(node.leftChild, graph)
+    else:
+        graph.node(str(node.__hash__() + 1), "Leaf")
+        graph.edge(str(cid), str(1 + node.__hash__()))
+
+    if node.rightChild != None:
+        graph.node(str(node.rightChild.__hash__()), str(node.rightChild))
+        graph.edge(cid, str(node.rightChild.__hash__()))
+        traverseForGraph(node.rightChild, graph)
+    else:
+        graph.node(str(node.__hash__() - 1), "Leaf")
+        graph.edge(str(cid), str( node.__hash__() - 1))
+
+
+
+
+    return graph
 
 # use np.var
 def dimsWithMostVar(dimCount, arr):
-    print("Selecting Split Dims")
     
     assert dimCount < len(arr[0]) - 1
     
     vars = np.var(arr[:, :-1], axis=0)
     retArr = np.argsort(vars)[::-1]
     retArr = retArr[:dimCount]
-    print("Split Dims Selected")
 
     assert dimCount == retArr.shape[0]
     return retArr
diff --git a/classifier/SplittingNode.py b/classifier/SplittingNode.py
@@ -5,16 +5,17 @@ class SplittingNode:
 
     index = 0
     val = 0
-    child = None
+    leftChild = None
+    rightChild = None
 
 
-    def setChild(self, child):
-        self.child = child
-
-    def __init__(self, feature_index, value):
+    def __init__(self, feature_index, value, rightChild = None, leftChild = None):
         self.index = feature_index
         self.val = value
 
+        self.rightChild = rightChild
+        self.leftChild = leftChild
+
     # maybe add input validation???
     # do in place weighted gini calculation
 
@@ -60,7 +61,7 @@ class SplittingNode:
         gt_percent = geqCount / len(combined)
         weighted_gini = (lt_gini * lt_percent) + (gt_gini * gt_percent)
 
-        return weighted_gini
+        return (weighted_gini, lt_gini, gt_gini)
 
 
     def _lessThan(self, sample):
@@ -105,4 +106,4 @@ class SplittingNode:
         return ltArr, gtArr
 
     def __str__(self):
-        return f"Splitting index: {self.index}\nSplitting value: {self.val}"
+        return f"Splitting index: {self.index}\nSplitting value: {round(self.val,2)}"
diff --git a/classifier/Testing.py b/classifier/Testing.py
@@ -2,15 +2,17 @@ from Podtc import PseudoOptimalDecisionTreeClassifier
 import numpy as np
 import plotly.express as px
 
-X = np.random.random((60000, 2))
-y = (X[:,0] + X[:,1]) > 1
+X = np.random.random((20000, 2)) * 1000
+y = (X[:,0] + X[:,1]) > 500
 
-classifier = PseudoOptimalDecisionTreeClassifier(proportionToTrainOn=.05, proportionToValidateSplits=.01, proportionOfDimsToTrainOn=1);
+classifier = PseudoOptimalDecisionTreeClassifier(proportionToTrainOn=.05, proportionToValidateSplits=.2, proportionOfDimsToTrainOn=1, maxDepth=100);
 
 classifier.fit(X,y)
-classifier.predict()
-print(classifier)
+# classifier.predict()
+# print(classifier)
 
+classifier.graph()
 
-scatter = px.scatter(x=X[:,0], y=X[:,1], color=y)
-scatter.show()
+
+#scatter = px.scatter(x=X[:,0], y=X[:,1], color=y)
+#scatter.show()

	decision-tree-classifier Unnamed repository; edit this file 'description' to name the repository.
	Log \| Files \| Refs \| README \| LICENSE

M	classifier/Podtc.py	\|	106	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------
M	classifier/SplittingNode.py	\|	15	++++++++-------
M	classifier/Testing.py	\|	16	+++++++++-------