decision-tree-classifier

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs | README | LICENSE

commit 8c9198bcbb729ddcf7ac3909048f3f65f7ae7fba
parent 0668e37fc1e73ed4bda57088244ff0f8dd0b3494
Author: Andrew <andrewlaack1@gmail.com>
Date:   Wed, 18 Dec 2024 00:30:28 -0600

Finished working on tree creation logic and printing

Diffstat:
Mclassifier/Podtc.py | 106+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------
Mclassifier/SplittingNode.py | 15++++++++-------
Mclassifier/Testing.py | 16+++++++++-------
3 files changed, 102 insertions(+), 35 deletions(-)

diff --git a/classifier/Podtc.py b/classifier/Podtc.py @@ -2,6 +2,7 @@ import numpy as np from tqdm import tqdm from SplittingNode import SplittingNode import math +import graphviz class PseudoOptimalDecisionTreeClassifier(): @@ -9,11 +10,11 @@ class PseudoOptimalDecisionTreeClassifier(): # First is first split, last is ... well yeah. proportionToTrainOn = 0 - splitList = [] threshold = 0 maxDepth = 0 propValSplits = 0 propDimsTrain = .5 + bestSplit = None def __init__(self, pruneThreshold = .2, maxDepth = 5, proportionToTrainOn=.5, proportionToValidateSplits=.5, proportionOfDimsToTrainOn=.5): self.threshold = pruneThreshold @@ -34,7 +35,7 @@ class PseudoOptimalDecisionTreeClassifier(): if(proportionOfDimsToTrainOn > 1 or proportionOfDimsToTrainOn <= 0): raise Exception(f"Proportion of dimensions to train on {proportionToValidateSplits}, is not valid. Select a proportion in the range of (0,1]") - self.propDimsTrain = proportionOfDimsToTrainOn + self.propDimsTrain = proportionOfDimsToTrainOn return def fit(self, X, y): @@ -45,35 +46,59 @@ class PseudoOptimalDecisionTreeClassifier(): # together [:,-1] == y together = np.append(X,y_re, axis=1) - self.splitList.append(self._best_split(together, self.proportionToTrainOn, self.propDimsTrain)) + dims = self.findDims(together) - ltArr = np.array([]) - gtArr = np.array([]) - ltArr, gtArr = self.splitList[-1].split(X) + self.bestSplit = self.recurse(together, self.maxDepth, dims) - print(self.splitList[0]) return + def findDims(self, together): + dimCount = len(together[0]) - 1 + dims = np.arange(dimCount) + dimsToSample = math.ceil(dimCount * self.propDimsTrain) + if(dimsToSample != dimCount): + dims = dimsWithMostVar(dimsToSample, together) - # pass in current root - # find best options from then on + return dims - # Find best split - def _best_split(self, together, proportionUsed, propOfDims): + def recurse(self, together, depth, dims): + + if(depth == 0): + return None - dimCount = len(together[0]) - 1 - dims = np.arange(dimCount) + bestSplit, ltGini, gtGini = self._best_split(together, self.proportionToTrainOn, dims) - dimsToSample = math.ceil(dimCount * propOfDims) + if bestSplit is None: + raise ValueError("bestSplit cannot be None") - if(dimsToSample != dimCount): - dims = dimsWithMostVar(dimsToSample, together) + ltArr, gtArr = bestSplit.split(arr=together) + blt = None + bgt = None + + if len(ltArr) > 1 and ltGini > 0: + blt = self.recurse(ltArr, depth - 1, dims) + bestSplit.leftChild = blt + + if len(gtArr) > 1 and gtGini > 0: + bgt = self.recurse(gtArr, depth - 1, dims) + bestSplit.rightChild= bgt + + return bestSplit + + # pass in current root + # find best options from then on + + # Find best split + def _best_split(self, together, proportionUsed, dims): bestGini = float("inf") bestNode = None + blg = float("inf") + bgg = float("inf") + # columns (excluding y) for x in tqdm(dims): @@ -90,11 +115,15 @@ class PseudoOptimalDecisionTreeClassifier(): splitOn = ((together[currentSample+1][x] - together[currentSample][x]) / 2) + together[currentSample][x] split = SplittingNode(x, splitOn) current = split.gini(together, self.propValSplits) - if current < bestGini: + if current[0] < bestGini: bestNode = split - bestGini = current + bestGini = current[0] + blg = current[1] + bgg = current[2] - return bestNode + # Return the best node, the left gini impurity, and right gini impurity. + # These impurities allow for us to stop if we have a pure node. + return (bestNode, blg, bgg) def predict(self): print("TODO") @@ -119,17 +148,52 @@ class PseudoOptimalDecisionTreeClassifier(): raise Exception(f"X must contain more than one sample.") return + def graph(self): + if self.bestSplit == None: + raise Exception(f"Unable to create graph of classifier, call fit first.") + + graph = graphviz.Digraph() + graph = createGraph(self.bestSplit, graph) + graph.render('whatever', format='png', view=True) + +def createGraph(node, graph): + graph.node(str(node.__hash__()), str(node)) + traverseForGraph(node, graph) + return graph + +def traverseForGraph(node : SplittingNode, graph): + + cid = str(node.__hash__()) + + if node.leftChild != None: + graph.node(str(node.leftChild.__hash__()), str(node.leftChild)) + graph.edge(cid, str(node.leftChild.__hash__())) + traverseForGraph(node.leftChild, graph) + else: + graph.node(str(node.__hash__() + 1), "Leaf") + graph.edge(str(cid), str(1 + node.__hash__())) + + if node.rightChild != None: + graph.node(str(node.rightChild.__hash__()), str(node.rightChild)) + graph.edge(cid, str(node.rightChild.__hash__())) + traverseForGraph(node.rightChild, graph) + else: + graph.node(str(node.__hash__() - 1), "Leaf") + graph.edge(str(cid), str( node.__hash__() - 1)) + + + + + return graph # use np.var def dimsWithMostVar(dimCount, arr): - print("Selecting Split Dims") assert dimCount < len(arr[0]) - 1 vars = np.var(arr[:, :-1], axis=0) retArr = np.argsort(vars)[::-1] retArr = retArr[:dimCount] - print("Split Dims Selected") assert dimCount == retArr.shape[0] return retArr diff --git a/classifier/SplittingNode.py b/classifier/SplittingNode.py @@ -5,16 +5,17 @@ class SplittingNode: index = 0 val = 0 - child = None + leftChild = None + rightChild = None - def setChild(self, child): - self.child = child - - def __init__(self, feature_index, value): + def __init__(self, feature_index, value, rightChild = None, leftChild = None): self.index = feature_index self.val = value + self.rightChild = rightChild + self.leftChild = leftChild + # maybe add input validation??? # do in place weighted gini calculation @@ -60,7 +61,7 @@ class SplittingNode: gt_percent = geqCount / len(combined) weighted_gini = (lt_gini * lt_percent) + (gt_gini * gt_percent) - return weighted_gini + return (weighted_gini, lt_gini, gt_gini) def _lessThan(self, sample): @@ -105,4 +106,4 @@ class SplittingNode: return ltArr, gtArr def __str__(self): - return f"Splitting index: {self.index}\nSplitting value: {self.val}" + return f"Splitting index: {self.index}\nSplitting value: {round(self.val,2)}" diff --git a/classifier/Testing.py b/classifier/Testing.py @@ -2,15 +2,17 @@ from Podtc import PseudoOptimalDecisionTreeClassifier import numpy as np import plotly.express as px -X = np.random.random((60000, 2)) -y = (X[:,0] + X[:,1]) > 1 +X = np.random.random((20000, 2)) * 1000 +y = (X[:,0] + X[:,1]) > 500 -classifier = PseudoOptimalDecisionTreeClassifier(proportionToTrainOn=.05, proportionToValidateSplits=.01, proportionOfDimsToTrainOn=1); +classifier = PseudoOptimalDecisionTreeClassifier(proportionToTrainOn=.05, proportionToValidateSplits=.2, proportionOfDimsToTrainOn=1, maxDepth=100); classifier.fit(X,y) -classifier.predict() -print(classifier) +# classifier.predict() +# print(classifier) +classifier.graph() -scatter = px.scatter(x=X[:,0], y=X[:,1], color=y) -scatter.show() + +#scatter = px.scatter(x=X[:,0], y=X[:,1], color=y) +#scatter.show()