commit fd65d662d1afca9132e08c51d2958600a37200ec
parent e58a69cc634251f3800d09352630558b9fcbee59
Author: Andrew <andrewlaack1@gmail.com>
Date: Fri, 20 Dec 2024 01:25:07 -0600
Considering stopping the current approach. C++ seems better suited to this than Python.
Diffstat:
6 files changed, 113 insertions(+), 95 deletions(-)
diff --git a/classifier/Makefile b/classifier/Makefile
@@ -0,0 +1,8 @@
+split:
+ g++ -shared -o cpp/libsplit.so -fPIC -O3 cpp/split.cpp
+
+gini:
+ g++ -shared -o cpp/libgini.so -fPIC -O3 cpp/gini.cpp
+
+clean:
+ rm cpp/libgini.so
diff --git a/classifier/Podtc.py b/classifier/Podtc.py
@@ -1,3 +1,5 @@
+from warnings import warn
+import ctypes
import numpy as np
from tqdm import tqdm
from SplittingNode import SplittingNode
@@ -5,7 +7,6 @@ from SplittingNode import gini
from LeafNode import LeafNode
import math
import graphviz
-from concurrent.futures import ProcessPoolExecutor
class PseudoOptimalDecisionTreeClassifier():
@@ -67,6 +68,8 @@ class PseudoOptimalDecisionTreeClassifier():
lastCol = together[:, -1].astype('int')
counts = np.bincount(lastCol, minlength=len(self.categories))
majority_label = np.argmax(counts)
+ if(len(counts) == 0):
+ assert False
return majority_label, counts
@@ -84,6 +87,11 @@ class PseudoOptimalDecisionTreeClassifier():
ltArr, gtArr = bestSplit.split(arr=together)
+ # It might make sense to stop recursing here
+ # if either array is empty, because that would
+ # mean the split is not separating anything.
+ # Just a thought.
+
if len(ltArr) > 1 and ltGini > 0:
blt = self.recurse(ltArr, depth - 1, dims)
bestSplit.leftChild = blt
@@ -103,50 +111,54 @@ class PseudoOptimalDecisionTreeClassifier():
# pass in current root
# find best options from then on
-
-
-
-
# Find best split
def _best_split(self, together, proportionUsed, dims):
bestGini = float("inf")
- bestNode = SplittingNode(0,0)
+ bestNode = None
blg = float("inf")
bgg = float("inf")
+ # Row indices of the samples used to evaluate each candidate split.
+ values = np.round(np.linspace(0, len(together) - 1, math.ceil(self.propValSplits * len(together)))).astype(np.int32)
+ vals = values.ctypes.data_as(ctypes.POINTER(ctypes.c_int))
-
+ indices = np.round(np.linspace(0, len(together) - 2, math.ceil(proportionUsed * len(together)))).astype(int)
+ sample_count = len(together[:, 0])
# columns (excluding y)
for x in tqdm(dims):
+ # random sampling would be a lot faster
+
together = together[together[:,x].argsort()]
+
# each row (sample)
# also, we are interpolating between samples
# indices for splits (this decides how many splits to test)
- splitList = np.round(np.linspace(0, len(together) - 2, math.ceil(proportionUsed * len(together)))).astype(int)
- # indices for evals. This decides which indices to check upon splitting
- values = np.round(np.linspace(0, len(together) - 1, math.ceil(self.propValSplits * len(together)))).astype(np.int32)
+ eles = together[:, x].astype(np.float32)
+ eles = eles.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
- splits = []
+ classes = together[:, -1].astype(np.int32).ctypes.data_as(ctypes.POINTER(ctypes.c_int))
- for currentSample in splitList:
+
+ for currentSample in indices:
splitOn = ((together[currentSample+1][x] - together[currentSample][x]) / 2) + together[currentSample][x]
split = SplittingNode(x, splitOn)
- splits.append(split)
- bestOf = evalSplits(splits, together, values)
+ current = gini(eles , values, split.val, classes, sample_count, vals)
+
+
+ if current[0] < bestGini: # type: ignore
+ bestNode = split
+ bestGini = current[0] # type: ignore
+ blg = current[1] # type: ignore
+ bgg = current[2] # type: ignore
- if bestOf[0] < bestGini:
- bestNode = bestOf[3]
- bestGini = bestOf[0]
- blg = bestOf[1]
- bgg = bestOf[2]
# Return the best node, the left gini impurity, and right gini impurity.
# These impurities allow for us to stop if we have a pure node.
@@ -289,19 +301,3 @@ def dimsWithMostVar(dimCount, arr):
assert dimCount == retArr.shape[0]
return retArr
-
-def evalSplits(splits, together, values):
- bestGini = float("inf")
- bestNode = SplittingNode(0,0)
- blg = float("inf")
- bgg = float("inf")
-
- for split in splits:
- current = gini(together, values, split.index, split.val)
- if current[0] < bestGini:
- bestNode = split
- bestGini = current[0]
- blg = current[1]
- bgg = current[2]
-
- return bestGini, blg, bgg , bestNode
diff --git a/classifier/SplittingNode.py b/classifier/SplittingNode.py
@@ -69,63 +69,32 @@ class GiniResult(ctypes.Structure):
("ltGini", ctypes.c_float),
("gtGini", ctypes.c_float)]
-def gini(combined, values, index, val):
-
-
- # implement prop to val with for c++
-
-
- useCPP = True
- usePy = False
-
-
- # add indices and index count
- if useCPP:
- gini_lib = ctypes.CDLL('./cpp/libgini.so')
- gini_lib.gini.restype = GiniResult
- gini_lib.gini.argtypes = [
- ctypes.POINTER(ctypes.c_float),
- ctypes.POINTER(ctypes.c_int),
- ctypes.c_int,
- ctypes.c_float,
- ctypes.POINTER(ctypes.c_int),
- ctypes.c_int
- ]
-
- eles = combined[:, index].astype(np.float32)
- eles = eles.ctypes.data_as(ctypes.POINTER(ctypes.c_float))
-
- classes = combined[:, -1].astype(np.int32).ctypes.data_as(ctypes.POINTER(ctypes.c_int))
- sample_count = len(combined[:, index])
- split_val = ctypes.c_float(val)
-
- vals = values.ctypes.data_as(ctypes.POINTER(ctypes.c_int))
-
- result = gini_lib.gini(eles, classes, sample_count, split_val, vals, len(values))
-
- weightedGini = result.weighted
- ltGini = result.ltGini
- gtGini = result.gtGini
- return (weightedGini, ltGini, gtGini)
-
- if usePy:
- outPy = giniPy(combined, values, index, val)
- return outPy
- # return outPy
- return
-
+def gini(eles, values, val, classes, sample_count, vals):
+
+ gini_lib = ctypes.CDLL('./cpp/libgini.so')
+ gini_lib.gini.restype = GiniResult
+ gini_lib.gini.argtypes = [
+ ctypes.POINTER(ctypes.c_float),
+ ctypes.POINTER(ctypes.c_int),
+ ctypes.c_int,
+ ctypes.c_float,
+ ctypes.POINTER(ctypes.c_int),
+ ctypes.c_int
+ ]
+
+ split_val = ctypes.c_float(val)
+ result = gini_lib.gini(eles, classes, sample_count, split_val, vals, len(values))
+ weightedGini = result.weighted
+ ltGini = result.ltGini
+ gtGini = result.gtGini
+ return (weightedGini, ltGini, gtGini)
def giniPy(combined , values, index, val):
-
ltc = {}
geqc = {}
-
ltCount = 0
geqCount = 0
-
-
-
for i in values:
lt = _lessThan(combined[i], index, val)
diff --git a/classifier/Testing.py b/classifier/Testing.py
@@ -17,22 +17,26 @@ test_X = test_X.reshape(-1, 784)
# train_X = [[2,5], [5,2], [3,4], [4,4], [5,5], [10, 10], [2,2], [12,12]]
# train_y = [1, 1 , 2, 1, 5, 2,1 ,3]
-classifier = PseudoOptimalDecisionTreeClassifier(proportionToTrainOn=.005, proportionToValidateSplits=.01, proportionOfDimsToTrainOn=.01, maxDepth=2);
+classifier = PseudoOptimalDecisionTreeClassifier(proportionToTrainOn=.05, proportionToValidateSplits=.1, proportionOfDimsToTrainOn=.01, maxDepth=2);
classifier.fit(train_X, train_y)
+
+# classifier.graph()
+
+
y_pred = classifier.predict(test_X)
print("MY ACCURACY:")
print(accuracy_score(y_true=test_y, y_pred=y_pred))
-# classifier = DecisionTreeClassifier(max_depth=15)
-# classifier.fit(train_X, train_y)
-# y_pred = classifier.predict(test_X)
+classifier = DecisionTreeClassifier(max_depth=4)
+classifier.fit(train_X, train_y)
+y_pred = classifier.predict(test_X)
-# print("SECOND ACCURACY:")
-# print(accuracy_score(y_true=test_y, y_pred=y_pred))
+print("SECOND ACCURACY:")
+print(accuracy_score(y_true=test_y, y_pred=y_pred))
-assert False
+exit()
X = np.random.random((200, 2))
y = np.random.random((200)) * 10
@@ -56,7 +60,6 @@ print(classifier.predict(X_pred))
# classifier.predict()
# print(classifier)
-classifier.graph()
#scatter = px.scatter(x=X[:,0], y=X[:,1], color=y)
diff --git a/classifier/cpp/gini.cpp b/classifier/cpp/gini.cpp
@@ -4,8 +4,6 @@
using namespace std;
-
-
extern "C" {
struct GiniResult {
diff --git a/classifier/cpp/split.cpp b/classifier/cpp/split.cpp
@@ -0,0 +1,44 @@
+#include <iostream>
+
+//split_lib = ctypes.CDLL('./cpp/libsplit.so')
+//split_lib.split.restype = BestSplit
+//split_lib.split.argtypes = [
+// ctypes.POINTER(ctypes.c_float), # together
+// ctypes.c_float, # prop to train on
+// ctypes.c_float, # prop to val on
+// ctypes.POINTER(ctypes.c_int), # dims
+//]
+//best = split_lib.split(together_ptr, self.propDimsTrain, self.propValSplits, dims_ptr)  # returns one BestSplit; *_ptr built via ndarray.ctypes.data_as(...)
+//
+//
+//
+//class BestSplit(ctypes.Structure):
+// _fields_ = [("index", ctypes.c_int),
+// ("splitVal", ctypes.c_float),
+// ("ltGini", ctypes.c_float),
+// ("gtGini", ctypes.c_float)
+// ]
+//
+//
+
+
+extern "C" {
+ struct BestSplit{
+ int index;
+ float splitVal;
+ float ltGini;
+ float gtGini;
+ };
+
+
+ BestSplit split(float* together, float propToTrainOn, float propToValWith, int* dims){
+
+ printf("%f\n", together[0]);
+ printf("%f\n", propToTrainOn);
+ printf("%f\n", propToValWith);
+ printf("%i\n", dims[0]);
+
+ return BestSplit();
+
+ }
+}