Added prediction - decision-tree-classifier - Unnamed repository; edit this file 'description' to name the repository.

commit 505b2e05ba4f43008cda0974f61c30d454821115
parent 8c9198bcbb729ddcf7ac3909048f3f65f7ae7fba
Author: Andrew <andrewlaack1@gmail.com>
Date:   Thu, 19 Dec 2024 02:02:35 -0600

Added prediction

Diffstat:
A classifier/LeafNode.py  | 9 +++++++++
M classifier/Podtc.py  | 143 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------------
M classifier/SplittingNode.py  | 9 ++-------
M classifier/Testing.py  | 48 +++++++++++++++++++++++++++++++++++++++++++++---

4 files changed, 165 insertions(+), 44 deletions(-)
diff --git a/classifier/LeafNode.py b/classifier/LeafNode.py
@@ -0,0 +1,9 @@
+class LeafNode:
+    def __init__(self, classification, count, elements):
+        self.classification = classification
+        self.count = count
+        self.elements = elements
+
+    def __str__(self):
+
+        return(f"Classification: {self.classification}\nCount: {self.count}\nLabel Counts: {self.elements.tolist()}")
diff --git a/classifier/Podtc.py b/classifier/Podtc.py
@@ -1,6 +1,8 @@
+from warnings import warn
 import numpy as np
 from tqdm import tqdm
 from SplittingNode import SplittingNode
+from LeafNode import LeafNode
 import math
 import graphviz
 
@@ -9,13 +11,6 @@ class PseudoOptimalDecisionTreeClassifier():
 
     # First is first split, last is ... well yeah.
 
-    proportionToTrainOn = 0
-    threshold = 0
-    maxDepth = 0
-    propValSplits = 0
-    propDimsTrain = .5
-    bestSplit = None
-
     def __init__(self, pruneThreshold = .2, maxDepth = 5, proportionToTrainOn=.5, proportionToValidateSplits=.5, proportionOfDimsToTrainOn=.5):
         self.threshold = pruneThreshold 
         self.maxDepth = maxDepth
@@ -42,6 +37,9 @@ class PseudoOptimalDecisionTreeClassifier():
 
         self.__validateInput(X,y)
         y_re = y.reshape(-1,1)
+
+        self.sampleSize = X.shape[1]
+        self.categories = np.unique(y)
         
         # together [:,-1] == y
         together = np.append(X,y_re, axis=1)
@@ -62,10 +60,21 @@ class PseudoOptimalDecisionTreeClassifier():
 
         return dims
 
+
+
+    def __classification(self, together):
+        lastCol = together[:, -1].astype('int')
+        counts = np.bincount(lastCol, minlength=len(self.categories))
+        majority_label = np.argmax(counts)
+        return majority_label, counts
+
+
+
     def recurse(self, together, depth, dims):
         
         if(depth == 0):
-            return None
+            classification, elements = self.__classification(together)
+            return LeafNode(classification, len(together), elements)
 
         bestSplit, ltGini, gtGini = self._best_split(together, self.proportionToTrainOn, dims)
 
@@ -74,17 +83,19 @@ class PseudoOptimalDecisionTreeClassifier():
 
         ltArr, gtArr = bestSplit.split(arr=together)
 
-
-        blt = None
-        bgt = None
-
         if len(ltArr) > 1 and ltGini > 0:
             blt = self.recurse(ltArr, depth - 1, dims)
             bestSplit.leftChild = blt
+        else:
+            classification, elements = self.__classification(ltArr)
+            bestSplit.leftChild = LeafNode(classification, len(ltArr), elements)
 
         if len(gtArr) > 1 and gtGini > 0:
             bgt = self.recurse(gtArr, depth - 1, dims)
             bestSplit.rightChild= bgt
+        else:
+            classification, elements = self.__classification(gtArr)
+            bestSplit.rightChild = LeafNode(classification, len(gtArr), elements)
 
         return bestSplit
 
@@ -125,24 +136,85 @@ class PseudoOptimalDecisionTreeClassifier():
         # These impurities allow for us to stop if we have a pure node.
         return (bestNode, blg, bgg)
 
-    def predict(self):
-        print("TODO")
-        return
+
+    # OPTIMIZE THIS... THIS SHOULD BE DONE WITH BATCHES NOT SINGLE INSTANCES
+    def predict(self, X):
+
+        self.__validatePrediction(X)
+
+        y = np.zeros(shape=(len(X),))
+
+        leaf = LeafNode(0,0,[])
+
+        for i in range(0, len(y)):
+
+            done = False
+            current = self.bestSplit
+
+            while not done:
+                if isinstance(current,type(leaf)):
+                    y[i] = current.classification
+                    done = True
+                    continue
+
+                if self._lessThan(current, X[i]): #type: ignore
+                    current = current.leftChild #type: ignore
+                else:
+                    current = current.rightChild #type: ignore
+        return y
+
+    def _lessThan(self, split : SplittingNode, sample):
+        if(sample[split.index] < split.val):
+            return True
+        return False
+
 
     def __str__(self):
         return "TODO"
 
-    def __validateInput(self, X,y):
+    def __validatePrediction(self, X):
 
+        # check if bestSplit has been set
+        try:
+            self.bestSplit
+        except:
+            raise Exception("Tree must be fit prior to prediction")
 
-        if X.shape[0] != y.shape[0]:
-            raise Exception(f"Incongruent array sizes. X has shape {X.shape} and y has shape {y.shape}.")
+        X = np.asarray(X)
+
+        if len(X.shape) != 2:
+            raise Exception(f"X shape {X.shape} not supported. Ensure input array is 2d.")
+
+        if np.issubdtype(X.dtype, np.str_):
+            raise Exception(f"X contains strings which is not allowed.")
         
+        if X.shape[1] != self.sampleSize:
+            raise Exception(f"Prediction sample of size {X.shape[1]}, not compatible with fitted size of {self.sampleSize}")
+
+
+        return 0
+
+    def __validateInput(self, X,y):
+
+        y = np.asarray(y)
+        X = np.asarray(X)
+
         if len(X.shape) != 2:
             raise Exception(f"X shape {X.shape} not supported. Ensure input array is 2d.")
 
-        if len(y.shape) != 1:
-            raise Exception(f"y shape {y.shape} not supported. Ensure input array is 1d.")
+        if np.issubdtype(y.dtype, np.str_):
+            raise Exception(f"y contains strings which is not allowed.")
+
+        if np.issubdtype(X.dtype, np.str_):
+            raise Exception(f"X contains strings which is not allowed.")
+
+        if np.issubdtype(y.dtype, np.floating):  
+            if not np.all(np.equal(np.floor(y), y)):  # Check if all values are whole numbers
+                raise Exception("y array contains continuous values, but classification requires discrete values")
+
+        if X.shape[0] != y.shape[0]:
+            raise Exception(f"Incongruent array sizes. X has shape {X.shape} and y has shape {y.shape}.")
+        
 
         if X.shape[0] <= 1:
             raise Exception(f"X must contain more than one sample.")
@@ -161,26 +233,29 @@ def createGraph(node, graph):
     traverseForGraph(node, graph)
     return graph
 
-def traverseForGraph(node : SplittingNode, graph):
+def traverseForGraph(node, graph):
 
     cid = str(node.__hash__())
 
-    if node.leftChild != None:
-        graph.node(str(node.leftChild.__hash__()), str(node.leftChild))
-        graph.edge(cid, str(node.leftChild.__hash__()))
+    if node.leftChild == None:
+        raise Exception("left child should never be none")
+
+    if node.rightChild == None:
+        raise Exception("right child should never be none")
+
+    graph.node(str(node.leftChild.__hash__()), str(node.leftChild))
+    graph.node(str(node.rightChild.__hash__()), str(node.rightChild))
+
+    graph.edge(cid, str(node.leftChild.__hash__()))
+    graph.edge(cid, str(node.rightChild.__hash__()))
+
+
+    # will be false in case where child is leaf node
+    if type(node.leftChild) == type(node):
         traverseForGraph(node.leftChild, graph)
-    else:
-        graph.node(str(node.__hash__() + 1), "Leaf")
-        graph.edge(str(cid), str(1 + node.__hash__()))
 
-    if node.rightChild != None:
-        graph.node(str(node.rightChild.__hash__()), str(node.rightChild))
-        graph.edge(cid, str(node.rightChild.__hash__()))
+    if type(node.rightChild) == type(node):
         traverseForGraph(node.rightChild, graph)
-    else:
-        graph.node(str(node.__hash__() - 1), "Leaf")
-        graph.edge(str(cid), str( node.__hash__() - 1))
-
 
 
 
diff --git a/classifier/SplittingNode.py b/classifier/SplittingNode.py
@@ -1,14 +1,9 @@
 import numpy as np
 import math
+from numba import njit
 
 class SplittingNode:
 
-    index = 0
-    val = 0
-    leftChild = None
-    rightChild = None
-
-
     def __init__(self, feature_index, value, rightChild = None, leftChild = None):
         self.index = feature_index
         self.val = value
@@ -71,8 +66,8 @@ class SplittingNode:
         return False
 
     # split the data by current node
-    def split(self, arr):
 
+    def split(self, arr):
 
         ltCount = 0
         gtCount = 0
diff --git a/classifier/Testing.py b/classifier/Testing.py
@@ -1,13 +1,55 @@
 from Podtc import PseudoOptimalDecisionTreeClassifier
 import numpy as np
 import plotly.express as px
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.datasets import load_digits
+import pandas as pd
+from keras.datasets import mnist
+from sklearn.metrics import accuracy_score
 
-X = np.random.random((20000, 2)) * 1000
-y = (X[:,0] + X[:,1]) > 500
 
-classifier = PseudoOptimalDecisionTreeClassifier(proportionToTrainOn=.05, proportionToValidateSplits=.2, proportionOfDimsToTrainOn=1, maxDepth=100);
+(train_X, train_y), (test_X, test_y) = mnist.load_data()
+
+train_X = train_X.reshape(-1, 784)
+test_X = test_X.reshape(-1, 784)
+
+classifier = PseudoOptimalDecisionTreeClassifier(proportionToTrainOn=.09, proportionToValidateSplits=.02, proportionOfDimsToTrainOn=.75, maxDepth=15);
+classifier.fit(train_X, train_y)
+y_pred = classifier.predict(test_X)
+
+print(accuracy_score(y_true=test_y, y_pred=y_pred))
+
+
+classifier = DecisionTreeClassifier(max_depth=15)
+classifier.fit(train_X, train_y)
+y_pred = classifier.predict(test_X)
+
+print("SECOND ACCURACY:")
+print(accuracy_score(y_true=test_y, y_pred=y_pred))
+
+
+assert False
+
+X = np.random.random((200, 2))
+y = np.random.random((200)) * 10
+y = y.round()
+# y = (X[:,0] + X[:,1]) > .5
+
+
+#clf = DecisionTreeClassifier()
+#clf.fit(X,y)
+
+classifier = PseudoOptimalDecisionTreeClassifier(proportionToTrainOn=1, proportionToValidateSplits=1, proportionOfDimsToTrainOn=1, maxDepth=2);
+
 
 classifier.fit(X,y)
+
+
+X_pred = np.random.random((20, 2)).round()
+
+print(X_pred)
+print(classifier.predict(X_pred))
+
 # classifier.predict()
 # print(classifier)

	decision-tree-classifier Unnamed repository; edit this file 'description' to name the repository.
	Log \| Files \| Refs \| README \| LICENSE

A	classifier/LeafNode.py	\|	9	+++++++++
M	classifier/Podtc.py	\|	143	++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------------
M	classifier/SplittingNode.py	\|	9	++-------
M	classifier/Testing.py	\|	48	+++++++++++++++++++++++++++++++++++++++++++++---