commit 0668e37fc1e73ed4bda57088244ff0f8dd0b3494
parent d55ce288b320b06cea9d8df9e15fa3718d8a67e9
Author: Andrew <andrewlaack1@gmail.com>
Date: Tue, 17 Dec 2024 22:51:23 -0600
Speed up split search by subsampling candidate thresholds, validation rows, and feature dimensions
Diffstat:
3 files changed, 76 insertions(+), 14 deletions(-)
diff --git a/classifier/Podtc.py b/classifier/Podtc.py
@@ -1,17 +1,40 @@
import numpy as np
+from tqdm import tqdm
from SplittingNode import SplittingNode
+import math
class PseudoOptimalDecisionTreeClassifier():
# First is first split, last is ... well yeah.
+
+ proportionToTrainOn = 0
splitList = []
threshold = 0
maxDepth = 0
+ propValSplits = 0
+ propDimsTrain = .5
- def __init__(self, pruneThreshold = .2, maxDepth = 5):
+ def __init__(self, pruneThreshold = .2, maxDepth = 5, proportionToTrainOn=.5, proportionToValidateSplits=.5, proportionOfDimsToTrainOn=.5):
self.threshold = pruneThreshold
self.maxDepth = maxDepth
+ # Give each instance its own list; the class-level [] would be shared by every classifier.
+ self.splitList = []
+ # Every proportion below must lie in (0, 1]; anything else raises before it is stored.
+ if(proportionToTrainOn > 1 or proportionToTrainOn <= 0):
+ raise Exception(f"Proportion to train on {proportionToTrainOn}, is not valid. Select a proportion in the range of (0,1]")
+ # Passed by fit() to _best_split as the share of candidate thresholds to evaluate.
+ self.proportionToTrainOn = proportionToTrainOn
+
+ if(proportionToValidateSplits > 1 or proportionToValidateSplits <= 0):
+ raise Exception(f"Proportion to validate splits with {proportionToValidateSplits}, is not valid. Select a proportion in the range of (0,1]")
+ # Passed to SplittingNode.gini as the share of rows used to score each candidate split.
+ self.propValSplits = proportionToValidateSplits
+
+ if(proportionOfDimsToTrainOn > 1 or proportionOfDimsToTrainOn <= 0):
+ raise Exception(f"Proportion of dimensions to train on {proportionOfDimsToTrainOn}, is not valid. Select a proportion in the range of (0,1]")
+ # Passed by fit() to _best_split as the share of feature columns to consider.
+ self.propDimsTrain = proportionOfDimsToTrainOn
return
def fit(self, X, y):
@@ -22,36 +45,51 @@ class PseudoOptimalDecisionTreeClassifier():
# together [:,-1] == y
together = np.append(X,y_re, axis=1)
- self.splitList.append(self._best_split(together))
+ self.splitList.append(self._best_split(together, self.proportionToTrainOn, self.propDimsTrain))
ltArr = np.array([])
gtArr = np.array([])
ltArr, gtArr = self.splitList[-1].split(X)
- print(f"LESS THAN \n{ltArr}")
- print(f"GREATER THAN \n{gtArr}")
+ print(self.splitList[0])
return
+
+
# pass in current root
# find best options from then on
# Find best split
- def _best_split(self, together):
+ def _best_split(self, together, proportionUsed, propOfDims):
+
+ dimCount = len(together[0]) - 1
+ dims = np.arange(dimCount)
+
+ dimsToSample = math.ceil(dimCount * propOfDims)
+
+ if(dimsToSample != dimCount):
+ dims = dimsWithMostVar(dimsToSample, together)
+
+
+
bestGini = float("inf")
bestNode = None
- # each column (excluding y)
- for x in range(0, len(together[0]) - 1):
+ # columns (excluding y)
+ for x in tqdm(dims):
together = together[together[:,x].argsort()]
# each row (sample)
# also, we are interpolating between samples
- for y in range(0, len(together) - 1):
- splitOn = ((together[y+1][x] - together[y][x]) / 2) + together[y][x]
+ indices = np.round(np.linspace(0, len(together) - 2, math.ceil(proportionUsed * len(together)))).astype(int)
+
+
+ for currentSample in indices:
+ splitOn = ((together[currentSample+1][x] - together[currentSample][x]) / 2) + together[currentSample][x]
split = SplittingNode(x, splitOn)
- current = split.gini(together)
+ current = split.gini(together, self.propValSplits)
if current < bestGini:
bestNode = split
bestGini = current
@@ -66,6 +104,8 @@ class PseudoOptimalDecisionTreeClassifier():
return "TODO"
def __validateInput(self, X,y):
+
+
if X.shape[0] != y.shape[0]:
raise Exception(f"Incongruent array sizes. X has shape {X.shape} and y has shape {y.shape}.")
@@ -74,4 +114,23 @@ class PseudoOptimalDecisionTreeClassifier():
if len(y.shape) != 1:
raise Exception(f"y shape {y.shape} not supported. Ensure input array is 1d.")
+
+ if X.shape[0] <= 1:
+ raise Exception(f"X must contain more than one sample.")
return
+
+
+# Return the indices of the dimCount feature columns of arr with the largest variance (the last column is the label and is excluded).
+def dimsWithMostVar(dimCount, arr):
+ print("Selecting Split Dims")
+ # Raise instead of assert so the check still runs under `python -O`.
+ if dimCount >= len(arr[0]) - 1:
+ raise ValueError(f"dimCount {dimCount} must be smaller than the number of feature columns {len(arr[0]) - 1}.")
+ # Named `variances` so the builtin `vars` is not shadowed.
+ variances = np.var(arr[:, :-1], axis=0)
+ # argsort is ascending, so reverse it and keep the first dimCount indices.
+ retArr = np.argsort(variances)[::-1][:dimCount]
+ print("Split Dims Selected")
+ assert dimCount == retArr.shape[0]
+ return retArr
+
diff --git a/classifier/SplittingNode.py b/classifier/SplittingNode.py
@@ -1,4 +1,5 @@
import numpy as np
+import math
class SplittingNode:
@@ -17,7 +18,7 @@ class SplittingNode:
# maybe add input validation???
# do in place weighted gini calculation
- def gini(self, combined):
+ def gini(self, combined, propToValWith):
ltc = {}
geqc = {}
@@ -25,7 +26,9 @@ class SplittingNode:
ltCount = 0
geqCount = 0
- for i in range(0, len(combined)):
+ values = np.round(np.linspace(0, len(combined) - 1, math.ceil(propToValWith * len(combined)))).astype(int)
+
+ for i in values:
lessThan = self._lessThan(combined[i])
classification = int(combined[i][-1])
diff --git a/classifier/Testing.py b/classifier/Testing.py
@@ -2,10 +2,10 @@ from Podtc import PseudoOptimalDecisionTreeClassifier
import numpy as np
import plotly.express as px
-X = np.random.random((10, 2))
+X = np.random.random((60000, 2))
y = (X[:,0] + X[:,1]) > 1
-classifier = PseudoOptimalDecisionTreeClassifier();
+classifier = PseudoOptimalDecisionTreeClassifier(proportionToTrainOn=.05, proportionToValidateSplits=.01, proportionOfDimsToTrainOn=1);
classifier.fit(X,y)
classifier.predict()