commit 8c9198bcbb729ddcf7ac3909048f3f65f7ae7fba
parent 0668e37fc1e73ed4bda57088244ff0f8dd0b3494
Author: Andrew <andrewlaack1@gmail.com>
Date: Wed, 18 Dec 2024 00:30:28 -0600
Finished working on tree creation logic and printing
Diffstat:
3 files changed, 102 insertions(+), 35 deletions(-)
diff --git a/classifier/Podtc.py b/classifier/Podtc.py
@@ -2,6 +2,7 @@ import numpy as np
from tqdm import tqdm
from SplittingNode import SplittingNode
import math
+import graphviz
class PseudoOptimalDecisionTreeClassifier():
@@ -9,11 +10,11 @@ class PseudoOptimalDecisionTreeClassifier():
# First is first split, last is ... well yeah.
proportionToTrainOn = 0
- splitList = []
threshold = 0
maxDepth = 0
propValSplits = 0
propDimsTrain = .5
+ bestSplit = None
def __init__(self, pruneThreshold = .2, maxDepth = 5, proportionToTrainOn=.5, proportionToValidateSplits=.5, proportionOfDimsToTrainOn=.5):
self.threshold = pruneThreshold
@@ -34,7 +35,7 @@ class PseudoOptimalDecisionTreeClassifier():
if(proportionOfDimsToTrainOn > 1 or proportionOfDimsToTrainOn <= 0):
raise Exception(f"Proportion of dimensions to train on {proportionToValidateSplits}, is not valid. Select a proportion in the range of (0,1]")
- self.propDimsTrain = proportionOfDimsToTrainOn
+ self.propDimsTrain = proportionOfDimsToTrainOn
return
def fit(self, X, y):
@@ -45,35 +46,59 @@ class PseudoOptimalDecisionTreeClassifier():
# together [:,-1] == y
together = np.append(X,y_re, axis=1)
- self.splitList.append(self._best_split(together, self.proportionToTrainOn, self.propDimsTrain))
+ dims = self.findDims(together)
- ltArr = np.array([])
- gtArr = np.array([])
- ltArr, gtArr = self.splitList[-1].split(X)
+ self.bestSplit = self.recurse(together, self.maxDepth, dims)
- print(self.splitList[0])
return
+ def findDims(self, together):
+ dimCount = len(together[0]) - 1
+ dims = np.arange(dimCount)
+ dimsToSample = math.ceil(dimCount * self.propDimsTrain)
+ if(dimsToSample != dimCount):
+ dims = dimsWithMostVar(dimsToSample, together)
- # pass in current root
- # find best options from then on
+ return dims
- # Find best split
- def _best_split(self, together, proportionUsed, propOfDims):
+ def recurse(self, together, depth, dims):
+
+ if(depth == 0):
+ return None
- dimCount = len(together[0]) - 1
- dims = np.arange(dimCount)
+ bestSplit, ltGini, gtGini = self._best_split(together, self.proportionToTrainOn, dims)
- dimsToSample = math.ceil(dimCount * propOfDims)
+ if bestSplit is None:
+ raise ValueError("bestSplit cannot be None")
- if(dimsToSample != dimCount):
- dims = dimsWithMostVar(dimsToSample, together)
+ ltArr, gtArr = bestSplit.split(arr=together)
+ blt = None
+ bgt = None
+
+ if len(ltArr) > 1 and ltGini > 0:
+ blt = self.recurse(ltArr, depth - 1, dims)
+ bestSplit.leftChild = blt
+
+ if len(gtArr) > 1 and gtGini > 0:
+ bgt = self.recurse(gtArr, depth - 1, dims)
+ bestSplit.rightChild= bgt
+
+ return bestSplit
+
+ # pass in current root
+ # find best options from then on
+
+ # Find best split
+ def _best_split(self, together, proportionUsed, dims):
bestGini = float("inf")
bestNode = None
+ blg = float("inf")
+ bgg = float("inf")
+
# columns (excluding y)
for x in tqdm(dims):
@@ -90,11 +115,15 @@ class PseudoOptimalDecisionTreeClassifier():
splitOn = ((together[currentSample+1][x] - together[currentSample][x]) / 2) + together[currentSample][x]
split = SplittingNode(x, splitOn)
current = split.gini(together, self.propValSplits)
- if current < bestGini:
+ if current[0] < bestGini:
bestNode = split
- bestGini = current
+ bestGini = current[0]
+ blg = current[1]
+ bgg = current[2]
- return bestNode
+ # Return the best node, the left gini impurity, and right gini impurity.
+ # These impurities allow for us to stop if we have a pure node.
+ return (bestNode, blg, bgg)
def predict(self):
print("TODO")
@@ -119,17 +148,52 @@ class PseudoOptimalDecisionTreeClassifier():
raise Exception(f"X must contain more than one sample.")
return
+ def graph(self):
+ if self.bestSplit == None:
+ raise Exception(f"Unable to create graph of classifier, call fit first.")
+
+ graph = graphviz.Digraph()
+ graph = createGraph(self.bestSplit, graph)
+ graph.render('whatever', format='png', view=True)
+
+def createGraph(node, graph):
+ graph.node(str(node.__hash__()), str(node))
+ traverseForGraph(node, graph)
+ return graph
+
+def traverseForGraph(node : SplittingNode, graph):
+
+ cid = str(node.__hash__())
+
+ if node.leftChild != None:
+ graph.node(str(node.leftChild.__hash__()), str(node.leftChild))
+ graph.edge(cid, str(node.leftChild.__hash__()))
+ traverseForGraph(node.leftChild, graph)
+ else:
+ graph.node(str(node.__hash__() + 1), "Leaf")
+ graph.edge(str(cid), str(1 + node.__hash__()))
+
+ if node.rightChild != None:
+ graph.node(str(node.rightChild.__hash__()), str(node.rightChild))
+ graph.edge(cid, str(node.rightChild.__hash__()))
+ traverseForGraph(node.rightChild, graph)
+ else:
+ graph.node(str(node.__hash__() - 1), "Leaf")
+ graph.edge(str(cid), str( node.__hash__() - 1))
+
+
+
+
+ return graph
# use np.var
def dimsWithMostVar(dimCount, arr):
- print("Selecting Split Dims")
assert dimCount < len(arr[0]) - 1
vars = np.var(arr[:, :-1], axis=0)
retArr = np.argsort(vars)[::-1]
retArr = retArr[:dimCount]
- print("Split Dims Selected")
assert dimCount == retArr.shape[0]
return retArr
diff --git a/classifier/SplittingNode.py b/classifier/SplittingNode.py
@@ -5,16 +5,17 @@ class SplittingNode:
index = 0
val = 0
- child = None
+ leftChild = None
+ rightChild = None
- def setChild(self, child):
- self.child = child
-
- def __init__(self, feature_index, value):
+ def __init__(self, feature_index, value, rightChild = None, leftChild = None):
self.index = feature_index
self.val = value
+ self.rightChild = rightChild
+ self.leftChild = leftChild
+
# maybe add input validation???
# do in place weighted gini calculation
@@ -60,7 +61,7 @@ class SplittingNode:
gt_percent = geqCount / len(combined)
weighted_gini = (lt_gini * lt_percent) + (gt_gini * gt_percent)
- return weighted_gini
+ return (weighted_gini, lt_gini, gt_gini)
def _lessThan(self, sample):
@@ -105,4 +106,4 @@ class SplittingNode:
return ltArr, gtArr
def __str__(self):
- return f"Splitting index: {self.index}\nSplitting value: {self.val}"
+ return f"Splitting index: {self.index}\nSplitting value: {round(self.val,2)}"
diff --git a/classifier/Testing.py b/classifier/Testing.py
@@ -2,15 +2,17 @@ from Podtc import PseudoOptimalDecisionTreeClassifier
import numpy as np
import plotly.express as px
-X = np.random.random((60000, 2))
-y = (X[:,0] + X[:,1]) > 1
+X = np.random.random((20000, 2)) * 1000
+y = (X[:,0] + X[:,1]) > 500
-classifier = PseudoOptimalDecisionTreeClassifier(proportionToTrainOn=.05, proportionToValidateSplits=.01, proportionOfDimsToTrainOn=1);
+classifier = PseudoOptimalDecisionTreeClassifier(proportionToTrainOn=.05, proportionToValidateSplits=.2, proportionOfDimsToTrainOn=1, maxDepth=100);
classifier.fit(X,y)
-classifier.predict()
-print(classifier)
+# classifier.predict()
+# print(classifier)
+classifier.graph()
-scatter = px.scatter(x=X[:,0], y=X[:,1], color=y)
-scatter.show()
+
+#scatter = px.scatter(x=X[:,0], y=X[:,1], color=y)
+#scatter.show()