commit 505b2e05ba4f43008cda0974f61c30d454821115
parent 8c9198bcbb729ddcf7ac3909048f3f65f7ae7fba
Author: Andrew <andrewlaack1@gmail.com>
Date: Thu, 19 Dec 2024 02:02:35 -0600
Added prediction
Diffstat:
4 files changed, 165 insertions(+), 44 deletions(-)
diff --git a/classifier/LeafNode.py b/classifier/LeafNode.py
@@ -0,0 +1,9 @@
+class LeafNode:
+ def __init__(self, classification, count, elements):
+ self.classification = classification
+ self.count = count
+ self.elements = elements
+
+ def __str__(self):
+
+ return(f"Classification: {self.classification}\nCount: {self.count}\nLabel Counts: {self.elements.tolist()}")
diff --git a/classifier/Podtc.py b/classifier/Podtc.py
@@ -1,6 +1,8 @@
+from warnings import warn
import numpy as np
from tqdm import tqdm
from SplittingNode import SplittingNode
+from LeafNode import LeafNode
import math
import graphviz
@@ -9,13 +11,6 @@ class PseudoOptimalDecisionTreeClassifier():
# First is first split, last is ... well yeah.
- proportionToTrainOn = 0
- threshold = 0
- maxDepth = 0
- propValSplits = 0
- propDimsTrain = .5
- bestSplit = None
-
def __init__(self, pruneThreshold = .2, maxDepth = 5, proportionToTrainOn=.5, proportionToValidateSplits=.5, proportionOfDimsToTrainOn=.5):
self.threshold = pruneThreshold
self.maxDepth = maxDepth
@@ -42,6 +37,9 @@ class PseudoOptimalDecisionTreeClassifier():
self.__validateInput(X,y)
y_re = y.reshape(-1,1)
+
+ self.sampleSize = X.shape[1]
+ self.categories = np.unique(y)
# together [:,-1] == y
together = np.append(X,y_re, axis=1)
@@ -62,10 +60,21 @@ class PseudoOptimalDecisionTreeClassifier():
return dims
+
+
+ def __classification(self, together):
+ lastCol = together[:, -1].astype('int')
+ counts = np.bincount(lastCol, minlength=len(self.categories))
+ majority_label = np.argmax(counts)
+ return majority_label, counts
+
+
+
def recurse(self, together, depth, dims):
if(depth == 0):
- return None
+ classification, elements = self.__classification(together)
+ return LeafNode(classification, len(together), elements)
bestSplit, ltGini, gtGini = self._best_split(together, self.proportionToTrainOn, dims)
@@ -74,17 +83,19 @@ class PseudoOptimalDecisionTreeClassifier():
ltArr, gtArr = bestSplit.split(arr=together)
-
- blt = None
- bgt = None
-
if len(ltArr) > 1 and ltGini > 0:
blt = self.recurse(ltArr, depth - 1, dims)
bestSplit.leftChild = blt
+ else:
+ classification, elements = self.__classification(ltArr)
+ bestSplit.leftChild = LeafNode(classification, len(ltArr), elements)
if len(gtArr) > 1 and gtGini > 0:
bgt = self.recurse(gtArr, depth - 1, dims)
bestSplit.rightChild= bgt
+ else:
+ classification, elements = self.__classification(gtArr)
+ bestSplit.rightChild = LeafNode(classification, len(gtArr), elements)
return bestSplit
@@ -125,24 +136,85 @@ class PseudoOptimalDecisionTreeClassifier():
# These impurities allow for us to stop if we have a pure node.
return (bestNode, blg, bgg)
- def predict(self):
- print("TODO")
- return
+
+ # OPTIMIZE THIS... THIS SHOULD BE DONE WITH BATCHES NOT SINGLE INSTANCES
+ def predict(self, X):
+
+ self.__validatePrediction(X)
+
+ y = np.zeros(shape=(len(X),))
+
+ leaf = LeafNode(0,0,[])
+
+ for i in range(0, len(y)):
+
+ done = False
+ current = self.bestSplit
+
+ while not done:
+ if isinstance(current,type(leaf)):
+ y[i] = current.classification
+ done = True
+ continue
+
+ if self._lessThan(current, X[i]): #type: ignore
+ current = current.leftChild #type: ignore
+ else:
+ current = current.rightChild #type: ignore
+ return y
+
+ def _lessThan(self, split : SplittingNode, sample):
+ if(sample[split.index] < split.val):
+ return True
+ return False
+
def __str__(self):
return "TODO"
- def __validateInput(self, X,y):
+ def __validatePrediction(self, X):
+ # check if bestSplit has been set
+ try:
+ self.bestSplit
+ except:
+ raise Exception("Tree must be fit prior to prediction")
- if X.shape[0] != y.shape[0]:
- raise Exception(f"Incongruent array sizes. X has shape {X.shape} and y has shape {y.shape}.")
+ X = np.asarray(X)
+
+ if len(X.shape) != 2:
+ raise Exception(f"X shape {X.shape} not supported. Ensure input array is 2d.")
+
+ if np.issubdtype(X.dtype, np.str_):
+ raise Exception(f"X contains strings which is not allowed.")
+ if X.shape[1] != self.sampleSize:
+ raise Exception(f"Prediction sample of size {X.shape[1]}, not compatible with fitted size of {self.sampleSize}")
+
+
+ return 0
+
+ def __validateInput(self, X,y):
+
+ y = np.asarray(y)
+ X = np.asarray(X)
+
if len(X.shape) != 2:
raise Exception(f"X shape {X.shape} not supported. Ensure input array is 2d.")
- if len(y.shape) != 1:
- raise Exception(f"y shape {y.shape} not supported. Ensure input array is 1d.")
+ if np.issubdtype(y.dtype, np.str_):
+ raise Exception(f"y contains strings which is not allowed.")
+
+ if np.issubdtype(X.dtype, np.str_):
+ raise Exception(f"X contains strings which is not allowed.")
+
+ if np.issubdtype(y.dtype, np.floating):
+ if not np.all(np.equal(np.floor(y), y)): # Check if all values are whole numbers
+ raise Exception("y array contains continuous values, but classification requires discrete values")
+
+ if X.shape[0] != y.shape[0]:
+ raise Exception(f"Incongruent array sizes. X has shape {X.shape} and y has shape {y.shape}.")
+
if X.shape[0] <= 1:
raise Exception(f"X must contain more than one sample.")
@@ -161,26 +233,29 @@ def createGraph(node, graph):
traverseForGraph(node, graph)
return graph
-def traverseForGraph(node : SplittingNode, graph):
+def traverseForGraph(node, graph):
cid = str(node.__hash__())
- if node.leftChild != None:
- graph.node(str(node.leftChild.__hash__()), str(node.leftChild))
- graph.edge(cid, str(node.leftChild.__hash__()))
+ if node.leftChild == None:
+ raise Exception("left child should never be none")
+
+ if node.rightChild == None:
+ raise Exception("right child should never be none")
+
+ graph.node(str(node.leftChild.__hash__()), str(node.leftChild))
+ graph.node(str(node.rightChild.__hash__()), str(node.rightChild))
+
+ graph.edge(cid, str(node.leftChild.__hash__()))
+ graph.edge(cid, str(node.rightChild.__hash__()))
+
+
+ # will be false in case where child is leaf node
+ if type(node.leftChild) == type(node):
traverseForGraph(node.leftChild, graph)
- else:
- graph.node(str(node.__hash__() + 1), "Leaf")
- graph.edge(str(cid), str(1 + node.__hash__()))
- if node.rightChild != None:
- graph.node(str(node.rightChild.__hash__()), str(node.rightChild))
- graph.edge(cid, str(node.rightChild.__hash__()))
+ if type(node.rightChild) == type(node):
traverseForGraph(node.rightChild, graph)
- else:
- graph.node(str(node.__hash__() - 1), "Leaf")
- graph.edge(str(cid), str( node.__hash__() - 1))
-
diff --git a/classifier/SplittingNode.py b/classifier/SplittingNode.py
@@ -1,14 +1,9 @@
import numpy as np
import math
+from numba import njit
class SplittingNode:
- index = 0
- val = 0
- leftChild = None
- rightChild = None
-
-
def __init__(self, feature_index, value, rightChild = None, leftChild = None):
self.index = feature_index
self.val = value
@@ -71,8 +66,8 @@ class SplittingNode:
return False
# split the data by current node
- def split(self, arr):
+ def split(self, arr):
ltCount = 0
gtCount = 0
diff --git a/classifier/Testing.py b/classifier/Testing.py
@@ -1,13 +1,55 @@
from Podtc import PseudoOptimalDecisionTreeClassifier
import numpy as np
import plotly.express as px
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.datasets import load_digits
+import pandas as pd
+from keras.datasets import mnist
+from sklearn.metrics import accuracy_score
-X = np.random.random((20000, 2)) * 1000
-y = (X[:,0] + X[:,1]) > 500
-classifier = PseudoOptimalDecisionTreeClassifier(proportionToTrainOn=.05, proportionToValidateSplits=.2, proportionOfDimsToTrainOn=1, maxDepth=100);
+(train_X, train_y), (test_X, test_y) = mnist.load_data()
+
+train_X = train_X.reshape(-1, 784)
+test_X = test_X.reshape(-1, 784)
+
+classifier = PseudoOptimalDecisionTreeClassifier(proportionToTrainOn=.09, proportionToValidateSplits=.02, proportionOfDimsToTrainOn=.75, maxDepth=15);
+classifier.fit(train_X, train_y)
+y_pred = classifier.predict(test_X)
+
+print(accuracy_score(y_true=test_y, y_pred=y_pred))
+
+
+classifier = DecisionTreeClassifier(max_depth=15)
+classifier.fit(train_X, train_y)
+y_pred = classifier.predict(test_X)
+
+print("SECOND ACCURACY:")
+print(accuracy_score(y_true=test_y, y_pred=y_pred))
+
+
+assert False
+
+X = np.random.random((200, 2))
+y = np.random.random((200)) * 10
+y = y.round()
+# y = (X[:,0] + X[:,1]) > .5
+
+
+#clf = DecisionTreeClassifier()
+#clf.fit(X,y)
+
+classifier = PseudoOptimalDecisionTreeClassifier(proportionToTrainOn=1, proportionToValidateSplits=1, proportionOfDimsToTrainOn=1, maxDepth=2);
+
classifier.fit(X,y)
+
+
+X_pred = np.random.random((20, 2)).round()
+
+print(X_pred)
+print(classifier.predict(X_pred))
+
# classifier.predict()
# print(classifier)