commit e58a69cc634251f3800d09352630558b9fcbee59
parent bc574a4c5fab2abf6bc6eb671ab54f694505ea8c
Author: Andrew <andrewlaack1@gmail.com>
Date: Thu, 19 Dec 2024 16:12:10 -0600
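Extract split evaluation into evalSplits, route split() through __split_py, enable the C++ gini path, and adjust Testing.py parameters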
Diffstat:
3 files changed, 53 insertions(+), 18 deletions(-)
diff --git a/classifier/Podtc.py b/classifier/Podtc.py
@@ -1,12 +1,11 @@
-from warnings import warn
import numpy as np
from tqdm import tqdm
from SplittingNode import SplittingNode
from SplittingNode import gini
-
from LeafNode import LeafNode
import math
import graphviz
+from concurrent.futures import ProcessPoolExecutor
class PseudoOptimalDecisionTreeClassifier():
@@ -104,15 +103,21 @@ class PseudoOptimalDecisionTreeClassifier():
# pass in current root
# find best options from then on
+
+
+
+
# Find best split
def _best_split(self, together, proportionUsed, dims):
bestGini = float("inf")
- bestNode = None
+ bestNode = SplittingNode(0,0)
blg = float("inf")
bgg = float("inf")
+
+
# columns (excluding y)
for x in tqdm(dims):
@@ -122,25 +127,26 @@ class PseudoOptimalDecisionTreeClassifier():
# also, we are interpolating between samples
# indices for splits (this decides how many splits to test)
- indices = np.round(np.linspace(0, len(together) - 2, math.ceil(proportionUsed * len(together)))).astype(int)
+
+ splitList = np.round(np.linspace(0, len(together) - 2, math.ceil(proportionUsed * len(together)))).astype(int)
# indices for evals. This decides which indices to check upon splitting
values = np.round(np.linspace(0, len(together) - 1, math.ceil(self.propValSplits * len(together)))).astype(np.int32)
+ splits = []
- for currentSample in indices:
+ for currentSample in splitList:
splitOn = ((together[currentSample+1][x] - together[currentSample][x]) / 2) + together[currentSample][x]
split = SplittingNode(x, splitOn)
+ splits.append(split)
- current = gini(together, values, split.index, split.val)
-
-
- if current[0] < bestGini:
- bestNode = split
- bestGini = current[0]
- blg = current[1]
- bgg = current[2]
+ bestOf = evalSplits(splits, together, values)
+ if bestOf[0] < bestGini:
+ bestNode = bestOf[3]
+ bestGini = bestOf[0]
+ blg = bestOf[1]
+ bgg = bestOf[2]
# Return the best node, the left gini impurity, and right gini impurity.
# These impurities allow for us to stop if we have a pure node.
@@ -283,3 +289,19 @@ def dimsWithMostVar(dimCount, arr):
assert dimCount == retArr.shape[0]
return retArr
+
+def evalSplits(splits, together, values):
+ bestGini = float("inf")
+ bestNode = SplittingNode(0,0)
+ blg = float("inf")
+ bgg = float("inf")
+
+ for split in splits:
+ current = gini(together, values, split.index, split.val)
+ if current[0] < bestGini:
+ bestNode = split
+ bestGini = current[0]
+ blg = current[1]
+ bgg = current[2]
+
+ return bestGini, blg, bgg , bestNode
diff --git a/classifier/SplittingNode.py b/classifier/SplittingNode.py
@@ -16,8 +16,19 @@ class SplittingNode:
# split the data by current node
+
def split(self, arr):
+ pySplit = True
+
+ if pySplit:
+ return self.__split_py(arr)
+ else:
+ assert False
+
+
+
+ def __split_py(self, arr):
ltCount = 0
gtCount = 0
@@ -49,6 +60,7 @@ class SplittingNode:
return ltArr, gtArr
+
def __str__(self):
return f"Splitting index: {self.index}\nSplitting value: {round(self.val,2)}"
@@ -63,8 +75,8 @@ def gini(combined, values, index, val):
# implement prop to val with for c++
- useCPP = False
- usePy = True
+ useCPP = True
+ usePy = False
# add indices and index count
@@ -95,6 +107,7 @@ def gini(combined, values, index, val):
ltGini = result.ltGini
gtGini = result.gtGini
return (weightedGini, ltGini, gtGini)
+
if usePy:
outPy = giniPy(combined, values, index, val)
return outPy
diff --git a/classifier/Testing.py b/classifier/Testing.py
@@ -17,13 +17,13 @@ test_X = test_X.reshape(-1, 784)
# train_X = [[2,5], [5,2], [3,4], [4,4], [5,5], [10, 10], [2,2], [12,12]]
# train_y = [1, 1 , 2, 1, 5, 2,1 ,3]
-classifier = PseudoOptimalDecisionTreeClassifier(proportionToTrainOn=.01, proportionToValidateSplits=.1, proportionOfDimsToTrainOn=.1, maxDepth=1);
+classifier = PseudoOptimalDecisionTreeClassifier(proportionToTrainOn=.005, proportionToValidateSplits=.01, proportionOfDimsToTrainOn=.01, maxDepth=2);
+
classifier.fit(train_X, train_y)
y_pred = classifier.predict(test_X)
-
+print("MY ACCURACY:")
print(accuracy_score(y_true=test_y, y_pred=y_pred))
-
# classifier = DecisionTreeClassifier(max_depth=15)
# classifier.fit(train_X, train_y)
# y_pred = classifier.predict(test_X)