decision-tree-classifier

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs | README | LICENSE

commit 505b2e05ba4f43008cda0974f61c30d454821115
parent 8c9198bcbb729ddcf7ac3909048f3f65f7ae7fba
Author: Andrew <andrewlaack1@gmail.com>
Date:   Thu, 19 Dec 2024 02:02:35 -0600

Added prediction

Diffstat:
Aclassifier/LeafNode.py | 9+++++++++
Mclassifier/Podtc.py | 143++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------------
Mclassifier/SplittingNode.py | 9++-------
Mclassifier/Testing.py | 48+++++++++++++++++++++++++++++++++++++++++++++---
4 files changed, 165 insertions(+), 44 deletions(-)

diff --git a/classifier/LeafNode.py b/classifier/LeafNode.py @@ -0,0 +1,9 @@ +class LeafNode: + def __init__(self, classification, count, elements): + self.classification = classification + self.count = count + self.elements = elements + + def __str__(self): + + return(f"Classification: {self.classification}\nCount: {self.count}\nLabel Counts: {self.elements.tolist()}") diff --git a/classifier/Podtc.py b/classifier/Podtc.py @@ -1,6 +1,8 @@ +from warnings import warn import numpy as np from tqdm import tqdm from SplittingNode import SplittingNode +from LeafNode import LeafNode import math import graphviz @@ -9,13 +11,6 @@ class PseudoOptimalDecisionTreeClassifier(): # First is first split, last is ... well yeah. - proportionToTrainOn = 0 - threshold = 0 - maxDepth = 0 - propValSplits = 0 - propDimsTrain = .5 - bestSplit = None - def __init__(self, pruneThreshold = .2, maxDepth = 5, proportionToTrainOn=.5, proportionToValidateSplits=.5, proportionOfDimsToTrainOn=.5): self.threshold = pruneThreshold self.maxDepth = maxDepth @@ -42,6 +37,9 @@ class PseudoOptimalDecisionTreeClassifier(): self.__validateInput(X,y) y_re = y.reshape(-1,1) + + self.sampleSize = X.shape[1] + self.categories = np.unique(y) # together [:,-1] == y together = np.append(X,y_re, axis=1) @@ -62,10 +60,21 @@ class PseudoOptimalDecisionTreeClassifier(): return dims + + + def __classification(self, together): + lastCol = together[:, -1].astype('int') + counts = np.bincount(lastCol, minlength=len(self.categories)) + majority_label = np.argmax(counts) + return majority_label, counts + + + def recurse(self, together, depth, dims): if(depth == 0): - return None + classification, elements = self.__classification(together) + return LeafNode(classification, len(together), elements) bestSplit, ltGini, gtGini = self._best_split(together, self.proportionToTrainOn, dims) @@ -74,17 +83,19 @@ class PseudoOptimalDecisionTreeClassifier(): ltArr, gtArr = bestSplit.split(arr=together) - - blt = None - bgt = None - if len(ltArr) > 1 and ltGini > 0: blt = self.recurse(ltArr, depth - 1, dims) bestSplit.leftChild = blt + else: + classification, elements = self.__classification(ltArr) + bestSplit.leftChild = LeafNode(classification, len(ltArr), elements) if len(gtArr) > 1 and gtGini > 0: bgt = self.recurse(gtArr, depth - 1, dims) bestSplit.rightChild= bgt + else: + classification, elements = self.__classification(gtArr) + bestSplit.rightChild = LeafNode(classification, len(gtArr), elements) return bestSplit @@ -125,24 +136,85 @@ class PseudoOptimalDecisionTreeClassifier(): # These impurities allow for us to stop if we have a pure node. return (bestNode, blg, bgg) - def predict(self): - print("TODO") - return + + # OPTIMIZE THIS... THIS SHOULD BE DONE WITH BATCHES NOT SINGLE INSTANCES + def predict(self, X): + + self.__validatePrediction(X) + + y = np.zeros(shape=(len(X),)) + + leaf = LeafNode(0,0,[]) + + for i in range(0, len(y)): + + done = False + current = self.bestSplit + + while not done: + if isinstance(current,type(leaf)): + y[i] = current.classification + done = True + continue + + if self._lessThan(current, X[i]): #type: ignore + current = current.leftChild #type: ignore + else: + current = current.rightChild #type: ignore + return y + + def _lessThan(self, split : SplittingNode, sample): + if(sample[split.index] < split.val): + return True + return False + def __str__(self): return "TODO" - def __validateInput(self, X,y): + def __validatePrediction(self, X): + # check if bestSplit has been set + try: + self.bestSplit + except: + raise Exception("Tree must be fit prior to prediction") - if X.shape[0] != y.shape[0]: - raise Exception(f"Incongruent array sizes. X has shape {X.shape} and y has shape {y.shape}.") + X = np.asarray(X) + + if len(X.shape) != 2: + raise Exception(f"X shape {X.shape} not supported. Ensure input array is 2d.") + + if np.issubdtype(X.dtype, np.str_): + raise Exception(f"X contains strings which is not allowed.") + if X.shape[1] != self.sampleSize: + raise Exception(f"Prediction sample of size {X.shape[1]}, not compatible with fitted size of {self.sampleSize}") + + + return 0 + + def __validateInput(self, X,y): + + y = np.asarray(y) + X = np.asarray(X) + if len(X.shape) != 2: raise Exception(f"X shape {X.shape} not supported. Ensure input array is 2d.") - if len(y.shape) != 1: - raise Exception(f"y shape {y.shape} not supported. Ensure input array is 1d.") + if np.issubdtype(y.dtype, np.str_): + raise Exception(f"y contains strings which is not allowed.") + + if np.issubdtype(X.dtype, np.str_): + raise Exception(f"X contains strings which is not allowed.") + + if np.issubdtype(y.dtype, np.floating): + if not np.all(np.equal(np.floor(y), y)): # Check if all values are whole numbers + raise Exception("y array contains continuous values, but classification requires discrete values") + + if X.shape[0] != y.shape[0]: + raise Exception(f"Incongruent array sizes. X has shape {X.shape} and y has shape {y.shape}.") + if X.shape[0] <= 1: raise Exception(f"X must contain more than one sample.") @@ -161,26 +233,29 @@ def createGraph(node, graph): traverseForGraph(node, graph) return graph -def traverseForGraph(node : SplittingNode, graph): +def traverseForGraph(node, graph): cid = str(node.__hash__()) - if node.leftChild != None: - graph.node(str(node.leftChild.__hash__()), str(node.leftChild)) - graph.edge(cid, str(node.leftChild.__hash__())) + if node.leftChild == None: + raise Exception("left child should never be none") + + if node.rightChild == None: + raise Exception("right child should never be none") + + graph.node(str(node.leftChild.__hash__()), str(node.leftChild)) + graph.node(str(node.rightChild.__hash__()), str(node.rightChild)) + + graph.edge(cid, str(node.leftChild.__hash__())) + graph.edge(cid, str(node.rightChild.__hash__())) + + + # will be false in case where child is leaf node + if type(node.leftChild) == type(node): traverseForGraph(node.leftChild, graph) - else: - graph.node(str(node.__hash__() + 1), "Leaf") - graph.edge(str(cid), str(1 + node.__hash__())) - if node.rightChild != None: - graph.node(str(node.rightChild.__hash__()), str(node.rightChild)) - graph.edge(cid, str(node.rightChild.__hash__())) + if type(node.rightChild) == type(node): traverseForGraph(node.rightChild, graph) - else: - graph.node(str(node.__hash__() - 1), "Leaf") - graph.edge(str(cid), str( node.__hash__() - 1)) - diff --git a/classifier/SplittingNode.py b/classifier/SplittingNode.py @@ -1,14 +1,9 @@ import numpy as np import math +from numba import njit class SplittingNode: - index = 0 - val = 0 - leftChild = None - rightChild = None - - def __init__(self, feature_index, value, rightChild = None, leftChild = None): self.index = feature_index self.val = value @@ -71,8 +66,8 @@ class SplittingNode: return False # split the data by current node - def split(self, arr): + def split(self, arr): ltCount = 0 gtCount = 0 diff --git a/classifier/Testing.py b/classifier/Testing.py @@ -1,13 +1,55 @@ from Podtc import PseudoOptimalDecisionTreeClassifier import numpy as np import plotly.express as px +from sklearn.tree import DecisionTreeClassifier +from sklearn.datasets import load_digits +import pandas as pd +from keras.datasets import mnist +from sklearn.metrics import accuracy_score -X = np.random.random((20000, 2)) * 1000 -y = (X[:,0] + X[:,1]) > 500 -classifier = PseudoOptimalDecisionTreeClassifier(proportionToTrainOn=.05, proportionToValidateSplits=.2, proportionOfDimsToTrainOn=1, maxDepth=100); +(train_X, train_y), (test_X, test_y) = mnist.load_data() + +train_X = train_X.reshape(-1, 784) +test_X = test_X.reshape(-1, 784) + +classifier = PseudoOptimalDecisionTreeClassifier(proportionToTrainOn=.09, proportionToValidateSplits=.02, proportionOfDimsToTrainOn=.75, maxDepth=15); +classifier.fit(train_X, train_y) +y_pred = classifier.predict(test_X) + +print(accuracy_score(y_true=test_y, y_pred=y_pred)) + + +classifier = DecisionTreeClassifier(max_depth=15) +classifier.fit(train_X, train_y) +y_pred = classifier.predict(test_X) + +print("SECOND ACCURACY:") +print(accuracy_score(y_true=test_y, y_pred=y_pred)) + + +assert False + +X = np.random.random((200, 2)) +y = np.random.random((200)) * 10 +y = y.round() +# y = (X[:,0] + X[:,1]) > .5 + + +#clf = DecisionTreeClassifier() +#clf.fit(X,y) + +classifier = PseudoOptimalDecisionTreeClassifier(proportionToTrainOn=1, proportionToValidateSplits=1, proportionOfDimsToTrainOn=1, maxDepth=2); + classifier.fit(X,y) + + +X_pred = np.random.random((20, 2)).round() + +print(X_pred) +print(classifier.predict(X_pred)) + # classifier.predict() # print(classifier)