decision-tree-classifier

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs | README | LICENSE

commit fd65d662d1afca9132e08c51d2958600a37200ec
parent e58a69cc634251f3800d09352630558b9fcbee59
Author: Andrew <andrewlaack1@gmail.com>
Date:   Fri, 20 Dec 2024 01:25:07 -0600

Considering stopping with what is being done. C++ seems better; python is shit

Diffstat:
A classifier/Makefile | 8 ++++++++
M classifier/Podtc.py | 66 +++++++++++++++++++++++++++++++-----------------------------------
M classifier/SplittingNode.py | 69 +++++++++++++++++++--------------------------------------------------
M classifier/Testing.py | 19 +++++++++++--------
M classifier/cpp/gini.cpp | 2 --
A classifier/cpp/split.cpp | 44 ++++++++++++++++++++++++++++++++++++++++++++
6 files changed, 113 insertions(+), 95 deletions(-)

diff --git a/classifier/Makefile b/classifier/Makefile @@ -0,0 +1,8 @@ +split: + g++ -shared -o cpp/libsplit.so -fPIC -O3 cpp/split.cpp + +gini: + g++ -shared -o cpp/libgini.so -fPIC -O3 cpp/gini.cpp + +clean: + rm cpp/libgini.so diff --git a/classifier/Podtc.py b/classifier/Podtc.py @@ -1,3 +1,5 @@ +from warnings import warn +import ctypes import numpy as np from tqdm import tqdm from SplittingNode import SplittingNode @@ -5,7 +7,6 @@ from SplittingNode import gini from LeafNode import LeafNode import math import graphviz -from concurrent.futures import ProcessPoolExecutor class PseudoOptimalDecisionTreeClassifier(): @@ -67,6 +68,8 @@ class PseudoOptimalDecisionTreeClassifier(): lastCol = together[:, -1].astype('int') counts = np.bincount(lastCol, minlength=len(self.categories)) majority_label = np.argmax(counts) + if(len(counts) == 0): + assert False return majority_label, counts @@ -84,6 +87,11 @@ class PseudoOptimalDecisionTreeClassifier(): ltArr, gtArr = bestSplit.split(arr=together) + # might make sense to simply stop + # if the length of either array is 0 + # because that means splits aren't doing anything.. + # just a thought + if len(ltArr) > 1 and ltGini > 0: blt = self.recurse(ltArr, depth - 1, dims) bestSplit.leftChild = blt @@ -103,50 +111,54 @@ class PseudoOptimalDecisionTreeClassifier(): # pass in current root # find best options from then on - - - - # Find best split def _best_split(self, together, proportionUsed, dims): bestGini = float("inf") - bestNode = SplittingNode(0,0) + bestNode = None blg = float("inf") bgg = float("inf") + # indices for evals. 
This decides which indices to check upon splitting + values = np.round(np.linspace(0, len(together) - 1, math.ceil(self.propValSplits * len(together)))).astype(np.int32) + vals = values.ctypes.data_as(ctypes.POINTER(ctypes.c_int)) - + indices = np.round(np.linspace(0, len(together) - 2, math.ceil(proportionUsed * len(together)))).astype(int) + sample_count = len(together[:, 0]) # columns (excluding y) for x in tqdm(dims): + # random sampling would be a lot faster + together = together[together[:,x].argsort()] + # each row (sample) # also, we are interpolating between samples # indices for splits (this decides how many splits to test) - splitList = np.round(np.linspace(0, len(together) - 2, math.ceil(proportionUsed * len(together)))).astype(int) - # indices for evals. This decides which indices to check upon splitting - values = np.round(np.linspace(0, len(together) - 1, math.ceil(self.propValSplits * len(together)))).astype(np.int32) + eles = together[:, x].astype(np.float32) + eles = eles.ctypes.data_as(ctypes.POINTER(ctypes.c_float)) - splits = [] + classes = together[:, -1].astype(np.int32).ctypes.data_as(ctypes.POINTER(ctypes.c_int)) - for currentSample in splitList: + + for currentSample in indices: splitOn = ((together[currentSample+1][x] - together[currentSample][x]) / 2) + together[currentSample][x] split = SplittingNode(x, splitOn) - splits.append(split) - bestOf = evalSplits(splits, together, values) + current = gini(eles , values, split.val, classes, sample_count, vals) + + + if current[0] < bestGini: # type: ignore + bestNode = split + bestGini = current[0] # type: ignore + blg = current[1] # type: ignore + bgg = current[2] # type: ignore - if bestOf[0] < bestGini: - bestNode = bestOf[3] - bestGini = bestOf[0] - blg = bestOf[1] - bgg = bestOf[2] # Return the best node, the left gini impurity, and right gini impurity. # These impurities allow for us to stop if we have a pure node. 
@@ -289,19 +301,3 @@ def dimsWithMostVar(dimCount, arr): assert dimCount == retArr.shape[0] return retArr - -def evalSplits(splits, together, values): - bestGini = float("inf") - bestNode = SplittingNode(0,0) - blg = float("inf") - bgg = float("inf") - - for split in splits: - current = gini(together, values, split.index, split.val) - if current[0] < bestGini: - bestNode = split - bestGini = current[0] - blg = current[1] - bgg = current[2] - - return bestGini, blg, bgg , bestNode diff --git a/classifier/SplittingNode.py b/classifier/SplittingNode.py @@ -69,63 +69,32 @@ class GiniResult(ctypes.Structure): ("ltGini", ctypes.c_float), ("gtGini", ctypes.c_float)] -def gini(combined, values, index, val): - - - # implement prop to val with for c++ - - - useCPP = True - usePy = False - - - # add indices and index count - if useCPP: - gini_lib = ctypes.CDLL('./cpp/libgini.so') - gini_lib.gini.restype = GiniResult - gini_lib.gini.argtypes = [ - ctypes.POINTER(ctypes.c_float), - ctypes.POINTER(ctypes.c_int), - ctypes.c_int, - ctypes.c_float, - ctypes.POINTER(ctypes.c_int), - ctypes.c_int - ] - - eles = combined[:, index].astype(np.float32) - eles = eles.ctypes.data_as(ctypes.POINTER(ctypes.c_float)) - - classes = combined[:, -1].astype(np.int32).ctypes.data_as(ctypes.POINTER(ctypes.c_int)) - sample_count = len(combined[:, index]) - split_val = ctypes.c_float(val) - - vals = values.ctypes.data_as(ctypes.POINTER(ctypes.c_int)) - - result = gini_lib.gini(eles, classes, sample_count, split_val, vals, len(values)) - - weightedGini = result.weighted - ltGini = result.ltGini - gtGini = result.gtGini - return (weightedGini, ltGini, gtGini) - - if usePy: - outPy = giniPy(combined, values, index, val) - return outPy - # return outPy - return - +def gini(eles, values, val, classes, sample_count, vals): + + gini_lib = ctypes.CDLL('./cpp/libgini.so') + gini_lib.gini.restype = GiniResult + gini_lib.gini.argtypes = [ + ctypes.POINTER(ctypes.c_float), + ctypes.POINTER(ctypes.c_int), + 
ctypes.c_int, + ctypes.c_float, + ctypes.POINTER(ctypes.c_int), + ctypes.c_int + ] + + split_val = ctypes.c_float(val) + result = gini_lib.gini(eles, classes, sample_count, split_val, vals, len(values)) + weightedGini = result.weighted + ltGini = result.ltGini + gtGini = result.gtGini + return (weightedGini, ltGini, gtGini) def giniPy(combined , values, index, val): - ltc = {} geqc = {} - ltCount = 0 geqCount = 0 - - - for i in values: lt = _lessThan(combined[i], index, val) diff --git a/classifier/Testing.py b/classifier/Testing.py @@ -17,22 +17,26 @@ test_X = test_X.reshape(-1, 784) # train_X = [[2,5], [5,2], [3,4], [4,4], [5,5], [10, 10], [2,2], [12,12]] # train_y = [1, 1 , 2, 1, 5, 2,1 ,3] -classifier = PseudoOptimalDecisionTreeClassifier(proportionToTrainOn=.005, proportionToValidateSplits=.01, proportionOfDimsToTrainOn=.01, maxDepth=2); +classifier = PseudoOptimalDecisionTreeClassifier(proportionToTrainOn=.05, proportionToValidateSplits=.1, proportionOfDimsToTrainOn=.01, maxDepth=2); classifier.fit(train_X, train_y) + +# classifier.graph() + + y_pred = classifier.predict(test_X) print("MY ACCURACY:") print(accuracy_score(y_true=test_y, y_pred=y_pred)) -# classifier = DecisionTreeClassifier(max_depth=15) -# classifier.fit(train_X, train_y) -# y_pred = classifier.predict(test_X) +classifier = DecisionTreeClassifier(max_depth=4) +classifier.fit(train_X, train_y) +y_pred = classifier.predict(test_X) -# print("SECOND ACCURACY:") -# print(accuracy_score(y_true=test_y, y_pred=y_pred)) +print("SECOND ACCURACY:") +print(accuracy_score(y_true=test_y, y_pred=y_pred)) -assert False +exit() X = np.random.random((200, 2)) y = np.random.random((200)) * 10 @@ -56,7 +60,6 @@ print(classifier.predict(X_pred)) # classifier.predict() # print(classifier) -classifier.graph() #scatter = px.scatter(x=X[:,0], y=X[:,1], color=y) diff --git a/classifier/cpp/gini.cpp b/classifier/cpp/gini.cpp @@ -4,8 +4,6 @@ using namespace std; - - extern "C" { struct GiniResult { diff --git 
a/classifier/cpp/split.cpp b/classifier/cpp/split.cpp @@ -0,0 +1,44 @@ +#include <iostream> + +//split_lib = ctypes.CDLL('./cpp/libsplit.so') +//split_lib.split.restype = BestSplit +//split_lib.split.argtypes = [ +// ctypes.POINTER(ctypes.c_float), # together +// ctypes.c_float, # prop to train on +// ctypes.c_float, # prop to val on +// ctypes.POINTER(ctypes.c_int), # dims +//] +//bestSplit, ltGini, gtGini = split_lib.split(ctypes.POINTER(together), self.propDimsTrain, self.propValSplits, ctypes.POINTER(dims)) +// +// +// +//class BestSplit(ctypes.Structure): +// _fields_ = [("index", ctypes.c_int), +// ("splitVal", ctypes.c_int), +// ("ltGini", ctypes.c_float), +// ("gtGini", ctypes.c_float) +// ] +// +// + + +extern "C" { + struct BestSplit{ + int index; + float splitVal; + float ltGini; + float gtGini; + }; + + + BestSplit split(float* together, float propToTrainOn, float propToValWith, int* dims){ + + printf("%f\n", together[0]); + printf("%f\n", propToTrainOn); + printf("%f\n", propToValWith); + printf("%i\n", dims[0]); + + return BestSplit(); + + } +}