decision-tree-classifier

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs | README | LICENSE

commit 0668e37fc1e73ed4bda57088244ff0f8dd0b3494
parent d55ce288b320b06cea9d8df9e15fa3718d8a67e9
Author: Andrew <andrewlaack1@gmail.com>
Date:   Tue, 17 Dec 2024 22:51:23 -0600

Did a bunch of optimization work

Diffstat:
M classifier/Podtc.py | 79 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------
M classifier/SplittingNode.py | 7 +++++--
M classifier/Testing.py | 4 ++--
3 files changed, 76 insertions(+), 14 deletions(-)

diff --git a/classifier/Podtc.py b/classifier/Podtc.py @@ -1,17 +1,40 @@ import numpy as np +from tqdm import tqdm from SplittingNode import SplittingNode +import math class PseudoOptimalDecisionTreeClassifier(): # First is first split, last is ... well yeah. + + proportionToTrainOn = 0 splitList = [] threshold = 0 maxDepth = 0 + propValSplits = 0 + propDimsTrain = .5 - def __init__(self, pruneThreshold = .2, maxDepth = 5): + def __init__(self, pruneThreshold = .2, maxDepth = 5, proportionToTrainOn=.5, proportionToValidateSplits=.5, proportionOfDimsToTrainOn=.5): self.threshold = pruneThreshold self.maxDepth = maxDepth + + # I guess allow > 1, just d + + if(proportionToTrainOn > 1 or proportionToTrainOn <= 0): + raise Exception(f"Proportion to train on {proportionToTrainOn}, is not valid. Select a proportion in the range of (0,1]") + + self.proportionToTrainOn = proportionToTrainOn + + if(proportionToValidateSplits > 1 or proportionToValidateSplits <= 0): + raise Exception(f"Proportion to validate splits with {proportionToValidateSplits}, is not valid. Select a proportion in the range of (0,1]") + + self.propValSplits = proportionToValidateSplits + + if(proportionOfDimsToTrainOn > 1 or proportionOfDimsToTrainOn <= 0): + raise Exception(f"Proportion of dimensions to train on {proportionToValidateSplits}, is not valid. 
Select a proportion in the range of (0,1]") + + self.propDimsTrain = proportionOfDimsToTrainOn return def fit(self, X, y): @@ -22,36 +45,51 @@ class PseudoOptimalDecisionTreeClassifier(): # together [:,-1] == y together = np.append(X,y_re, axis=1) - self.splitList.append(self._best_split(together)) + self.splitList.append(self._best_split(together, self.proportionToTrainOn, self.propDimsTrain)) ltArr = np.array([]) gtArr = np.array([]) ltArr, gtArr = self.splitList[-1].split(X) - print(f"LESS THAN \n{ltArr}") - print(f"GREATER THAN \n{gtArr}") + print(self.splitList[0]) return + + # pass in current root # find best options from then on # Find best split - def _best_split(self, together): + def _best_split(self, together, proportionUsed, propOfDims): + + dimCount = len(together[0]) - 1 + dims = np.arange(dimCount) + + dimsToSample = math.ceil(dimCount * propOfDims) + + if(dimsToSample != dimCount): + dims = dimsWithMostVar(dimsToSample, together) + + + bestGini = float("inf") bestNode = None - # each column (excluding y) - for x in range(0, len(together[0]) - 1): + # columns (excluding y) + for x in tqdm(dims): together = together[together[:,x].argsort()] # each row (sample) # also, we are interpolating between samples - for y in range(0, len(together) - 1): - splitOn = ((together[y+1][x] - together[y][x]) / 2) + together[y][x] + indices = np.round(np.linspace(0, len(together) - 2, math.ceil(proportionUsed * len(together)))).astype(int) + + + for currentSample in indices: + splitOn = ((together[currentSample+1][x] - together[currentSample][x]) / 2) + together[currentSample][x] split = SplittingNode(x, splitOn) - current = split.gini(together) + current = split.gini(together, self.propValSplits) if current < bestGini: bestNode = split bestGini = current @@ -66,6 +104,8 @@ class PseudoOptimalDecisionTreeClassifier(): return "TODO" def __validateInput(self, X,y): + + if X.shape[0] != y.shape[0]: raise Exception(f"Incongruent array sizes. 
X has shape {X.shape} and y has shape {y.shape}.") @@ -74,4 +114,23 @@ class PseudoOptimalDecisionTreeClassifier(): if len(y.shape) != 1: raise Exception(f"y shape {y.shape} not supported. Ensure input array is 1d.") + + if X.shape[0] <= 1: + raise Exception(f"X must contain more than one sample.") return + + +# use np.var +def dimsWithMostVar(dimCount, arr): + print("Selecting Split Dims") + + assert dimCount < len(arr[0]) - 1 + + vars = np.var(arr[:, :-1], axis=0) + retArr = np.argsort(vars)[::-1] + retArr = retArr[:dimCount] + print("Split Dims Selected") + + assert dimCount == retArr.shape[0] + return retArr + diff --git a/classifier/SplittingNode.py b/classifier/SplittingNode.py @@ -1,4 +1,5 @@ import numpy as np +import math class SplittingNode: @@ -17,7 +18,7 @@ class SplittingNode: # maybe add input validation??? # do in place weighted gini calculation - def gini(self, combined): + def gini(self, combined, propToValWith): ltc = {} geqc = {} @@ -25,7 +26,9 @@ class SplittingNode: ltCount = 0 geqCount = 0 - for i in range(0, len(combined)): + values = np.round(np.linspace(0, len(combined) - 1, math.ceil(propToValWith * len(combined)))).astype(int) + + for i in values: lessThan = self._lessThan(combined[i]) classification = int(combined[i][-1]) diff --git a/classifier/Testing.py b/classifier/Testing.py @@ -2,10 +2,10 @@ from Podtc import PseudoOptimalDecisionTreeClassifier import numpy as np import plotly.express as px -X = np.random.random((10, 2)) +X = np.random.random((60000, 2)) y = (X[:,0] + X[:,1]) > 1 -classifier = PseudoOptimalDecisionTreeClassifier(); +classifier = PseudoOptimalDecisionTreeClassifier(proportionToTrainOn=.05, proportionToValidateSplits=.01, proportionOfDimsToTrainOn=1); classifier.fit(X,y) classifier.predict()