decision-tree-classifier

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs | README | LICENSE

commit e58a69cc634251f3800d09352630558b9fcbee59
parent bc574a4c5fab2abf6bc6eb671ab54f694505ea8c
Author: Andrew <andrewlaack1@gmail.com>
Date:   Thu, 19 Dec 2024 16:12:10 -0600

Did stuff

Diffstat:
M classifier/Podtc.py | 48 +++++++++++++++++++++++++++++++++++-------------
M classifier/SplittingNode.py | 17 +++++++++++++++--
M classifier/Testing.py | 6 +++---
3 files changed, 53 insertions(+), 18 deletions(-)

diff --git a/classifier/Podtc.py b/classifier/Podtc.py @@ -1,12 +1,11 @@ -from warnings import warn import numpy as np from tqdm import tqdm from SplittingNode import SplittingNode from SplittingNode import gini - from LeafNode import LeafNode import math import graphviz +from concurrent.futures import ProcessPoolExecutor class PseudoOptimalDecisionTreeClassifier(): @@ -104,15 +103,21 @@ class PseudoOptimalDecisionTreeClassifier(): # pass in current root # find best options from then on + + + + # Find best split def _best_split(self, together, proportionUsed, dims): bestGini = float("inf") - bestNode = None + bestNode = SplittingNode(0,0) blg = float("inf") bgg = float("inf") + + # columns (excluding y) for x in tqdm(dims): @@ -122,25 +127,26 @@ class PseudoOptimalDecisionTreeClassifier(): # also, we are interpolating between samples # indices for splits (this decides how many splits to test) - indices = np.round(np.linspace(0, len(together) - 2, math.ceil(proportionUsed * len(together)))).astype(int) + + splitList = np.round(np.linspace(0, len(together) - 2, math.ceil(proportionUsed * len(together)))).astype(int) # indices for evals. This decides which indices to check upon splitting values = np.round(np.linspace(0, len(together) - 1, math.ceil(self.propValSplits * len(together)))).astype(np.int32) + splits = [] - for currentSample in indices: + for currentSample in splitList: splitOn = ((together[currentSample+1][x] - together[currentSample][x]) / 2) + together[currentSample][x] split = SplittingNode(x, splitOn) + splits.append(split) - current = gini(together, values, split.index, split.val) - - - if current[0] < bestGini: - bestNode = split - bestGini = current[0] - blg = current[1] - bgg = current[2] + bestOf = evalSplits(splits, together, values) + if bestOf[0] < bestGini: + bestNode = bestOf[3] + bestGini = bestOf[0] + blg = bestOf[1] + bgg = bestOf[2] # Return the best node, the left gini impurity, and right gini impurity. 
# These impurities allow for us to stop if we have a pure node. @@ -283,3 +289,19 @@ def dimsWithMostVar(dimCount, arr): assert dimCount == retArr.shape[0] return retArr + +def evalSplits(splits, together, values): + bestGini = float("inf") + bestNode = SplittingNode(0,0) + blg = float("inf") + bgg = float("inf") + + for split in splits: + current = gini(together, values, split.index, split.val) + if current[0] < bestGini: + bestNode = split + bestGini = current[0] + blg = current[1] + bgg = current[2] + + return bestGini, blg, bgg , bestNode diff --git a/classifier/SplittingNode.py b/classifier/SplittingNode.py @@ -16,8 +16,19 @@ class SplittingNode: # split the data by current node + def split(self, arr): + pySplit = True + + if pySplit: + return self.__split_py(arr) + else: + assert False + + + + def __split_py(self, arr): ltCount = 0 gtCount = 0 @@ -49,6 +60,7 @@ class SplittingNode: return ltArr, gtArr + def __str__(self): return f"Splitting index: {self.index}\nSplitting value: {round(self.val,2)}" @@ -63,8 +75,8 @@ def gini(combined, values, index, val): # implement prop to val with for c++ - useCPP = False - usePy = True + useCPP = True + usePy = False # add indices and index count @@ -95,6 +107,7 @@ def gini(combined, values, index, val): ltGini = result.ltGini gtGini = result.gtGini return (weightedGini, ltGini, gtGini) + if usePy: outPy = giniPy(combined, values, index, val) return outPy diff --git a/classifier/Testing.py b/classifier/Testing.py @@ -17,13 +17,13 @@ test_X = test_X.reshape(-1, 784) # train_X = [[2,5], [5,2], [3,4], [4,4], [5,5], [10, 10], [2,2], [12,12]] # train_y = [1, 1 , 2, 1, 5, 2,1 ,3] -classifier = PseudoOptimalDecisionTreeClassifier(proportionToTrainOn=.01, proportionToValidateSplits=.1, proportionOfDimsToTrainOn=.1, maxDepth=1); +classifier = PseudoOptimalDecisionTreeClassifier(proportionToTrainOn=.005, proportionToValidateSplits=.01, proportionOfDimsToTrainOn=.01, maxDepth=2); + classifier.fit(train_X, train_y) y_pred = 
classifier.predict(test_X) - +print("MY ACCURACY:") print(accuracy_score(y_true=test_y, y_pred=y_pred)) - # classifier = DecisionTreeClassifier(max_depth=15) # classifier.fit(train_X, train_y) # y_pred = classifier.predict(test_X)