Considering stopping with what is being done. C++ seems better; python is shit - decision-tree-classifier - Unnamed repository; edit this file 'description' to name the repository.

commit fd65d662d1afca9132e08c51d2958600a37200ec
parent e58a69cc634251f3800d09352630558b9fcbee59
Author: Andrew <andrewlaack1@gmail.com>
Date:   Fri, 20 Dec 2024 01:25:07 -0600

Considering stopping with what is being done. C++ seems better; python is shit

Diffstat:
A classifier/Makefile  | 8 ++++++++
M classifier/Podtc.py  | 66 +++++++++++++++++++++++++++++++-----------------------------------
M classifier/SplittingNode.py  | 69 +++++++++++++++++++--------------------------------------------------
M classifier/Testing.py  | 19 +++++++++++--------
M classifier/cpp/gini.cpp  | 2 --
A classifier/cpp/split.cpp  | 44 ++++++++++++++++++++++++++++++++++++++++++++

6 files changed, 113 insertions(+), 95 deletions(-)
diff --git a/classifier/Makefile b/classifier/Makefile
@@ -0,0 +1,8 @@
+split:
+	g++ -shared -o cpp/libsplit.so -fPIC -O3 cpp/split.cpp
+
+gini:
+	g++ -shared -o cpp/libgini.so -fPIC -O3 cpp/gini.cpp
+
+clean:
+	rm cpp/libgini.so
diff --git a/classifier/Podtc.py b/classifier/Podtc.py
@@ -1,3 +1,5 @@
+from warnings import warn
+import ctypes
 import numpy as np
 from tqdm import tqdm
 from SplittingNode import SplittingNode
@@ -5,7 +7,6 @@ from SplittingNode import gini
 from LeafNode import LeafNode
 import math
 import graphviz
-from concurrent.futures import ProcessPoolExecutor
 
 class PseudoOptimalDecisionTreeClassifier():
 
@@ -67,6 +68,8 @@ class PseudoOptimalDecisionTreeClassifier():
         lastCol = together[:, -1].astype('int')
         counts = np.bincount(lastCol, minlength=len(self.categories))
         majority_label = np.argmax(counts)
+        if(len(counts) == 0):
+            assert False
         return majority_label, counts
 
 
@@ -84,6 +87,11 @@ class PseudoOptimalDecisionTreeClassifier():
 
         ltArr, gtArr = bestSplit.split(arr=together)
 
+        # It might make sense to stop recursing here when either
+        # partition is empty, because an empty side means the split
+        # did not actually separate any samples.
+        # (Idea only; not implemented yet.)
+
         if len(ltArr) > 1 and ltGini > 0:
             blt = self.recurse(ltArr, depth - 1, dims)
             bestSplit.leftChild = blt
@@ -103,50 +111,54 @@ class PseudoOptimalDecisionTreeClassifier():
     # pass in current root
     # find best options from then on
 
-
-
-
-
     # Find best split
     def _best_split(self, together, proportionUsed, dims):
 
         bestGini = float("inf")
-        bestNode  = SplittingNode(0,0) 
+        bestNode  = None
         blg = float("inf")
         bgg = float("inf")
 
+        # indices for evals. This decides which indices to check upon splitting
+        values = np.round(np.linspace(0, len(together) - 1, math.ceil(self.propValSplits * len(together)))).astype(np.int32)
+        vals = values.ctypes.data_as(ctypes.POINTER(ctypes.c_int))
 
-
+        indices = np.round(np.linspace(0, len(together) - 2, math.ceil(proportionUsed * len(together)))).astype(int)
+        sample_count = len(together[:, 0])
 
         # columns (excluding y)
         for x in tqdm(dims):
 
+            # random sampling would be a lot faster
+
             together = together[together[:,x].argsort()]
 
+
             # each row (sample)
             # also, we are interpolating between samples
 
             # indices for splits (this decides how many splits to test)
 
-            splitList = np.round(np.linspace(0, len(together) - 2, math.ceil(proportionUsed * len(together)))).astype(int)
 
-            # indices for evals. This decides which indices to check upon splitting
-            values = np.round(np.linspace(0, len(together) - 1, math.ceil(self.propValSplits * len(together)))).astype(np.int32)
+            eles = together[:, x].astype(np.float32)
+            eles = eles.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
 
-            splits = []
+            classes = together[:, -1].astype(np.int32).ctypes.data_as(ctypes.POINTER(ctypes.c_int))
 
-            for currentSample in splitList:
+            
+            for currentSample in indices:
                 splitOn = ((together[currentSample+1][x] - together[currentSample][x]) / 2) + together[currentSample][x]
                 split = SplittingNode(x, splitOn)
-                splits.append(split)
 
-            bestOf = evalSplits(splits, together, values)
+                current = gini(eles , values, split.val, classes, sample_count, vals)
+
+
+                if current[0] < bestGini: # type: ignore
+                    bestNode = split
+                    bestGini = current[0]  # type: ignore
+                    blg = current[1]  # type: ignore
+                    bgg = current[2]  # type: ignore
 
-            if bestOf[0] < bestGini:
-                bestNode = bestOf[3]
-                bestGini = bestOf[0]
-                blg = bestOf[1]
-                bgg = bestOf[2]
 
         # Return the best node, the left gini impurity, and right gini impurity.
         # These impurities allow for us to stop if we have a pure node.
@@ -289,19 +301,3 @@ def dimsWithMostVar(dimCount, arr):
     assert dimCount == retArr.shape[0]
     return retArr
 
-
-def evalSplits(splits, together, values):
-    bestGini = float("inf")
-    bestNode  = SplittingNode(0,0) 
-    blg = float("inf")
-    bgg = float("inf")
-
-    for split in splits:
-        current = gini(together, values, split.index, split.val)
-        if current[0] < bestGini:
-            bestNode = split
-            bestGini = current[0]
-            blg = current[1]
-            bgg = current[2]
-
-    return bestGini, blg, bgg , bestNode
diff --git a/classifier/SplittingNode.py b/classifier/SplittingNode.py
@@ -69,63 +69,32 @@ class GiniResult(ctypes.Structure):
                 ("ltGini", ctypes.c_float),
                 ("gtGini", ctypes.c_float)]
 
-def gini(combined, values, index, val):
-
-
-    # implement prop to val with for c++
-
-
-    useCPP = True 
-    usePy = False
-
-
-    # add indices and index count
-    if useCPP:
-        gini_lib = ctypes.CDLL('./cpp/libgini.so')
-        gini_lib.gini.restype = GiniResult
-        gini_lib.gini.argtypes = [
-                ctypes.POINTER(ctypes.c_float), 
-                ctypes.POINTER(ctypes.c_int), 
-                ctypes.c_int, 
-                ctypes.c_float, 
-                ctypes.POINTER(ctypes.c_int), 
-                ctypes.c_int
-        ]
-
-        eles = combined[:, index].astype(np.float32)
-        eles = eles.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
-
-        classes = combined[:, -1].astype(np.int32).ctypes.data_as(ctypes.POINTER(ctypes.c_int))
-        sample_count = len(combined[:, index])
-        split_val = ctypes.c_float(val)
-
-        vals = values.ctypes.data_as(ctypes.POINTER(ctypes.c_int))
-
-        result = gini_lib.gini(eles, classes, sample_count, split_val, vals, len(values))
-
-        weightedGini = result.weighted
-        ltGini = result.ltGini
-        gtGini = result.gtGini
-        return (weightedGini, ltGini, gtGini)
-
-    if usePy:
-        outPy = giniPy(combined, values, index, val)  
-        return outPy
-        # return outPy
-    return
-
+def gini(eles, values, val, classes, sample_count, vals):
+
+    gini_lib = ctypes.CDLL('./cpp/libgini.so')
+    gini_lib.gini.restype = GiniResult
+    gini_lib.gini.argtypes = [
+            ctypes.POINTER(ctypes.c_float), 
+            ctypes.POINTER(ctypes.c_int), 
+            ctypes.c_int, 
+            ctypes.c_float, 
+            ctypes.POINTER(ctypes.c_int), 
+            ctypes.c_int
+    ]
+
+    split_val = ctypes.c_float(val)
+    result = gini_lib.gini(eles, classes, sample_count, split_val, vals, len(values))
+    weightedGini = result.weighted
+    ltGini = result.ltGini
+    gtGini = result.gtGini
+    return (weightedGini, ltGini, gtGini)
 
 
 def giniPy(combined , values, index, val):
-
     ltc = {}
     geqc = {}
-
     ltCount = 0
     geqCount = 0
-
-
-
     for i in values:
 
         lt = _lessThan(combined[i], index, val)
diff --git a/classifier/Testing.py b/classifier/Testing.py
@@ -17,22 +17,26 @@ test_X = test_X.reshape(-1, 784)
 # train_X = [[2,5], [5,2], [3,4], [4,4], [5,5], [10, 10], [2,2], [12,12]]
 # train_y = [1, 1 , 2, 1, 5, 2,1 ,3]
 
-classifier = PseudoOptimalDecisionTreeClassifier(proportionToTrainOn=.005, proportionToValidateSplits=.01, proportionOfDimsToTrainOn=.01, maxDepth=2);
+classifier = PseudoOptimalDecisionTreeClassifier(proportionToTrainOn=.05, proportionToValidateSplits=.1, proportionOfDimsToTrainOn=.01, maxDepth=2);
 
 classifier.fit(train_X, train_y)
+
+# classifier.graph()
+
+
 y_pred = classifier.predict(test_X)
 print("MY ACCURACY:")
 print(accuracy_score(y_true=test_y, y_pred=y_pred))
 
-# classifier = DecisionTreeClassifier(max_depth=15)
-# classifier.fit(train_X, train_y)
-# y_pred = classifier.predict(test_X)
+classifier = DecisionTreeClassifier(max_depth=4)
+classifier.fit(train_X, train_y)
+y_pred = classifier.predict(test_X)
 
-# print("SECOND ACCURACY:")
-# print(accuracy_score(y_true=test_y, y_pred=y_pred))
+print("SECOND ACCURACY:")
+print(accuracy_score(y_true=test_y, y_pred=y_pred))
 
 
-assert False
+exit()
 
 X = np.random.random((200, 2))
 y = np.random.random((200)) * 10
@@ -56,7 +60,6 @@ print(classifier.predict(X_pred))
 # classifier.predict()
 # print(classifier)
 
-classifier.graph()
 
 
 #scatter = px.scatter(x=X[:,0], y=X[:,1], color=y)
diff --git a/classifier/cpp/gini.cpp b/classifier/cpp/gini.cpp
@@ -4,8 +4,6 @@
 
 using namespace std;
 
-
-
 extern "C" {
 
     struct GiniResult {
diff --git a/classifier/cpp/split.cpp b/classifier/cpp/split.cpp
@@ -0,0 +1,44 @@
+#include <iostream>
+
+//split_lib = ctypes.CDLL('./cpp/libsplit.so')
+//split_lib.split.restype = BestSplit
+//split_lib.split.argtypes = [
+//        ctypes.POINTER(ctypes.c_float),  # together
+//        ctypes.c_float, # prop to train on
+//        ctypes.c_float, # prop to val on
+//        ctypes.POINTER(ctypes.c_int), # dims
+//]
+//bestSplit, ltGini, gtGini = split_lib.split(ctypes.POINTER(together), self.propDimsTrain, self.propValSplits, ctypes.POINTER(dims))
+//
+//
+//
+//class BestSplit(ctypes.Structure):
+//    _fields_ = [("index", ctypes.c_int),
+//                ("splitVal", ctypes.c_float),
+//                ("ltGini", ctypes.c_float),
+//                ("gtGini", ctypes.c_float)
+//                ]
+//
+//
+
+
+extern "C" {
+	struct BestSplit{
+		int index;
+		float splitVal;
+		float ltGini;
+		float gtGini;
+	};
+
+
+	BestSplit split(float* together, float propToTrainOn, float propToValWith, int* dims){
+
+		printf("%f\n", together[0]);
+		printf("%f\n", propToTrainOn);
+		printf("%f\n", propToValWith);
+		printf("%i\n", dims[0]);
+
+		return BestSplit();
+
+	}
+}

	decision-tree-classifier Unnamed repository; edit this file 'description' to name the repository.
	Log \| Files \| Refs \| README \| LICENSE

A	classifier/Makefile	\|	8	++++++++
M	classifier/Podtc.py	\|	66	+++++++++++++++++++++++++++++++-----------------------------------
M	classifier/SplittingNode.py	\|	69	+++++++++++++++++++--------------------------------------------------
M	classifier/Testing.py	\|	19	+++++++++++--------
M	classifier/cpp/gini.cpp	\|	2	--
A	classifier/cpp/split.cpp	\|	44	++++++++++++++++++++++++++++++++++++++++++++