Did stuff - decision-tree-classifier - Unnamed repository; edit this file 'description' to name the repository.

commit e58a69cc634251f3800d09352630558b9fcbee59
parent bc574a4c5fab2abf6bc6eb671ab54f694505ea8c
Author: Andrew <andrewlaack1@gmail.com>
Date:   Thu, 19 Dec 2024 16:12:10 -0600

Did stuff

Diffstat:
M classifier/Podtc.py  | 48 +++++++++++++++++++++++++++++++++++-------------
M classifier/SplittingNode.py  | 17 +++++++++++++++--
M classifier/Testing.py  | 6 +++---

3 files changed, 53 insertions(+), 18 deletions(-)
diff --git a/classifier/Podtc.py b/classifier/Podtc.py
@@ -1,12 +1,11 @@
-from warnings import warn
 import numpy as np
 from tqdm import tqdm
 from SplittingNode import SplittingNode
 from SplittingNode import gini
-
 from LeafNode import LeafNode
 import math
 import graphviz
+from concurrent.futures import ProcessPoolExecutor
 
 class PseudoOptimalDecisionTreeClassifier():
 
@@ -104,15 +103,21 @@ class PseudoOptimalDecisionTreeClassifier():
     # pass in current root
     # find best options from then on
 
+
+
+
+
     # Find best split
     def _best_split(self, together, proportionUsed, dims):
 
         bestGini = float("inf")
-        bestNode  = None
+        bestNode  = SplittingNode(0,0) 
         blg = float("inf")
         bgg = float("inf")
 
 
+
+
         # columns (excluding y)
         for x in tqdm(dims):
 
@@ -122,25 +127,26 @@ class PseudoOptimalDecisionTreeClassifier():
             # also, we are interpolating between samples
 
             # indices for splits (this decides how many splits to test)
-            indices = np.round(np.linspace(0, len(together) - 2, math.ceil(proportionUsed * len(together)))).astype(int)
+
+            splitList = np.round(np.linspace(0, len(together) - 2, math.ceil(proportionUsed * len(together)))).astype(int)
 
             # indices for evals. This decides which indices to check upon splitting
             values = np.round(np.linspace(0, len(together) - 1, math.ceil(self.propValSplits * len(together)))).astype(np.int32)
 
+            splits = []
 
-            for currentSample in indices:
+            for currentSample in splitList:
                 splitOn = ((together[currentSample+1][x] - together[currentSample][x]) / 2) + together[currentSample][x]
                 split = SplittingNode(x, splitOn)
+                splits.append(split)
 
-                current = gini(together, values, split.index, split.val)
-
-
-                if current[0] < bestGini:
-                    bestNode = split
-                    bestGini = current[0]
-                    blg = current[1]
-                    bgg = current[2]
+            bestOf = evalSplits(splits, together, values)
 
+            if bestOf[0] < bestGini:
+                bestNode = bestOf[3]
+                bestGini = bestOf[0]
+                blg = bestOf[1]
+                bgg = bestOf[2]
 
         # Return the best node, the left gini impurity, and right gini impurity.
         # These impurities allow for us to stop if we have a pure node.
@@ -283,3 +289,19 @@ def dimsWithMostVar(dimCount, arr):
     assert dimCount == retArr.shape[0]
     return retArr
 
+
+def evalSplits(splits, together, values):
+    bestGini = float("inf")
+    bestNode  = SplittingNode(0,0) 
+    blg = float("inf")
+    bgg = float("inf")
+
+    for split in splits:
+        current = gini(together, values, split.index, split.val)
+        if current[0] < bestGini:
+            bestNode = split
+            bestGini = current[0]
+            blg = current[1]
+            bgg = current[2]
+
+    return bestGini, blg, bgg , bestNode
diff --git a/classifier/SplittingNode.py b/classifier/SplittingNode.py
@@ -16,8 +16,19 @@ class SplittingNode:
 
     # split the data by current node
 
+
     def split(self, arr):
 
+        pySplit = True
+
+        if pySplit:
+            return self.__split_py(arr) 
+        else:
+            assert False
+
+
+
+    def __split_py(self, arr):
         ltCount = 0
         gtCount = 0
 
@@ -49,6 +60,7 @@ class SplittingNode:
 
         return ltArr, gtArr
 
+
     def __str__(self):
         return f"Splitting index: {self.index}\nSplitting value: {round(self.val,2)}"
 
@@ -63,8 +75,8 @@ def gini(combined, values, index, val):
     # implement prop to val with for c++
 
 
-    useCPP = False 
-    usePy = True 
+    useCPP = True 
+    usePy = False
 
 
     # add indices and index count
@@ -95,6 +107,7 @@ def gini(combined, values, index, val):
         ltGini = result.ltGini
         gtGini = result.gtGini
         return (weightedGini, ltGini, gtGini)
+
     if usePy:
         outPy = giniPy(combined, values, index, val)  
         return outPy
diff --git a/classifier/Testing.py b/classifier/Testing.py
@@ -17,13 +17,13 @@ test_X = test_X.reshape(-1, 784)
 # train_X = [[2,5], [5,2], [3,4], [4,4], [5,5], [10, 10], [2,2], [12,12]]
 # train_y = [1, 1 , 2, 1, 5, 2,1 ,3]
 
-classifier = PseudoOptimalDecisionTreeClassifier(proportionToTrainOn=.01, proportionToValidateSplits=.1, proportionOfDimsToTrainOn=.1, maxDepth=1);
+classifier = PseudoOptimalDecisionTreeClassifier(proportionToTrainOn=.005, proportionToValidateSplits=.01, proportionOfDimsToTrainOn=.01, maxDepth=2);
+
 classifier.fit(train_X, train_y)
 y_pred = classifier.predict(test_X)
-
+print("MY ACCURACY:")
 print(accuracy_score(y_true=test_y, y_pred=y_pred))
 
-
 # classifier = DecisionTreeClassifier(max_depth=15)
 # classifier.fit(train_X, train_y)
 # y_pred = classifier.predict(test_X)

	decision-tree-classifier Unnamed repository; edit this file 'description' to name the repository.
	Log | Files | Refs | README | LICENSE

M	classifier/Podtc.py	|	48	+++++++++++++++++++++++++++++++++++-------------
M	classifier/SplittingNode.py	|	17	+++++++++++++++--
M	classifier/Testing.py	|	6	+++---