Did a bunch of optimization work - decision-tree-classifier - Unnamed repository; edit this file 'description' to name the repository.

commit 0668e37fc1e73ed4bda57088244ff0f8dd0b3494
parent d55ce288b320b06cea9d8df9e15fa3718d8a67e9
Author: Andrew <andrewlaack1@gmail.com>
Date:   Tue, 17 Dec 2024 22:51:23 -0600

Did a bunch of optimization work

Diffstat:
M classifier/Podtc.py  | 79 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------
M classifier/SplittingNode.py  | 7 +++++--
M classifier/Testing.py  | 4 ++--

3 files changed, 76 insertions(+), 14 deletions(-)
diff --git a/classifier/Podtc.py b/classifier/Podtc.py
@@ -1,17 +1,40 @@
 import numpy as np
+from tqdm import tqdm
 from SplittingNode import SplittingNode
+import math
 
 class PseudoOptimalDecisionTreeClassifier():
 
 
     # First is first split, last is ... well yeah.
+
+    proportionToTrainOn = 0
     splitList = []
     threshold = 0
     maxDepth = 0
+    propValSplits = 0
+    propDimsTrain = .5
 
-    def __init__(self, pruneThreshold = .2, maxDepth = 5):
+    def __init__(self, pruneThreshold = .2, maxDepth = 5, proportionToTrainOn=.5, proportionToValidateSplits=.5, proportionOfDimsToTrainOn=.5):
         self.threshold = pruneThreshold 
         self.maxDepth = maxDepth
+
+        # Each proportion below must fall in (0, 1]; anything else is rejected.
+
+        if(proportionToTrainOn > 1 or proportionToTrainOn <= 0):
+            raise Exception(f"Proportion to train on {proportionToTrainOn}, is not valid. Select a proportion in the range of (0,1]")
+
+        self.proportionToTrainOn = proportionToTrainOn
+
+        if(proportionToValidateSplits > 1 or proportionToValidateSplits <= 0):
+            raise Exception(f"Proportion to validate splits with {proportionToValidateSplits}, is not valid. Select a proportion in the range of (0,1]")
+
+        self.propValSplits = proportionToValidateSplits
+
+        if(proportionOfDimsToTrainOn > 1 or proportionOfDimsToTrainOn <= 0):
+            raise Exception(f"Proportion of dimensions to train on {proportionOfDimsToTrainOn}, is not valid. Select a proportion in the range of (0,1]")
+
+        self.propDimsTrain = proportionOfDimsToTrainOn
         return
 
     def fit(self, X,  y):
@@ -22,36 +45,51 @@ class PseudoOptimalDecisionTreeClassifier():
         # together [:,-1] == y
         together = np.append(X,y_re, axis=1)
 
-        self.splitList.append(self._best_split(together))
+        self.splitList.append(self._best_split(together, self.proportionToTrainOn, self.propDimsTrain))
 
         ltArr = np.array([])
         gtArr = np.array([])
         ltArr, gtArr = self.splitList[-1].split(X)
 
-        print(f"LESS THAN \n{ltArr}")
-        print(f"GREATER THAN \n{gtArr}")
+        print(self.splitList[0])
         return
 
+
+
     # pass in current root
     # find best options from then on
 
     # Find best split
-    def _best_split(self, together):
+    def _best_split(self, together, proportionUsed, propOfDims):
+
+        dimCount = len(together[0]) - 1
+        dims = np.arange(dimCount)
+
+        dimsToSample = math.ceil(dimCount * propOfDims)
+
+        if(dimsToSample != dimCount):
+            dims = dimsWithMostVar(dimsToSample, together)
+
+
+
         bestGini = float("inf")
         bestNode  = None
 
-        # each column (excluding y)
-        for x in range(0, len(together[0]) - 1):
+        # columns (excluding y)
+        for x in tqdm(dims):
 
             together = together[together[:,x].argsort()]
 
             # each row (sample)
             # also, we are interpolating between samples
 
-            for y in range(0, len(together) - 1):
-                splitOn = ((together[y+1][x] - together[y][x]) / 2) + together[y][x]
+            indices = np.round(np.linspace(0, len(together) - 2, math.ceil(proportionUsed * len(together)))).astype(int)
+
+
+            for currentSample in indices:
+                splitOn = ((together[currentSample+1][x] - together[currentSample][x]) / 2) + together[currentSample][x]
                 split = SplittingNode(x, splitOn)
-                current = split.gini(together)
+                current = split.gini(together, self.propValSplits)
                 if current < bestGini:
                     bestNode = split
                     bestGini = current
@@ -66,6 +104,8 @@ class PseudoOptimalDecisionTreeClassifier():
         return "TODO"
 
     def __validateInput(self, X,y):
+
+
         if X.shape[0] != y.shape[0]:
             raise Exception(f"Incongruent array sizes. X has shape {X.shape} and y has shape {y.shape}.")
         
@@ -74,4 +114,23 @@ class PseudoOptimalDecisionTreeClassifier():
 
         if len(y.shape) != 1:
             raise Exception(f"y shape {y.shape} not supported. Ensure input array is 1d.")
+
+        if X.shape[0] <= 1:
+            raise Exception(f"X must contain more than one sample.")
         return
+
+
+# Return the indices of the dimCount columns of arr (last column excluded) with the largest np.var, highest variance first.
+def dimsWithMostVar(dimCount, arr):
+    print("Selecting Split Dims")
+    
+    assert dimCount < len(arr[0]) - 1
+    
+    vars = np.var(arr[:, :-1], axis=0)
+    retArr = np.argsort(vars)[::-1]
+    retArr = retArr[:dimCount]
+    print("Split Dims Selected")
+
+    assert dimCount == retArr.shape[0]
+    return retArr
+
diff --git a/classifier/SplittingNode.py b/classifier/SplittingNode.py
@@ -1,4 +1,5 @@
 import numpy as np
+import math
 
 class SplittingNode:
 
@@ -17,7 +18,7 @@ class SplittingNode:
     # maybe add input validation???
     # do in place weighted gini calculation
 
-    def gini(self, combined):
+    def gini(self, combined, propToValWith):
 
         ltc = {}
         geqc = {}
@@ -25,7 +26,9 @@ class SplittingNode:
         ltCount = 0
         geqCount = 0
 
-        for i in range(0, len(combined)):
+        values = np.round(np.linspace(0, len(combined) - 1, math.ceil(propToValWith * len(combined)))).astype(int)
+
+        for i in values:
 
             lessThan = self._lessThan(combined[i])
             classification = int(combined[i][-1])
diff --git a/classifier/Testing.py b/classifier/Testing.py
@@ -2,10 +2,10 @@ from Podtc import PseudoOptimalDecisionTreeClassifier
 import numpy as np
 import plotly.express as px
 
-X = np.random.random((10, 2))
+X = np.random.random((60000, 2))
 y = (X[:,0] + X[:,1]) > 1
 
-classifier = PseudoOptimalDecisionTreeClassifier();
+classifier = PseudoOptimalDecisionTreeClassifier(proportionToTrainOn=.05, proportionToValidateSplits=.01, proportionOfDimsToTrainOn=1);
 
 classifier.fit(X,y)
 classifier.predict()

	decision-tree-classifier Unnamed repository; edit this file 'description' to name the repository.
	Log \| Files \| Refs \| README \| LICENSE

M	classifier/Podtc.py	\|	79	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------
M	classifier/SplittingNode.py	\|	7	+++++--
M	classifier/Testing.py	\|	4	++--