decision-tree-classifier

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs | README | LICENSE

commit 93a716343d4dfad27a8ca44df30ef0fc5de879e2
parent 7d9083a0c8161d3ee0671936348eefcc0c02d8d6
Author: Andrew <andrewlaack1@gmail.com>
Date:   Tue, 24 Dec 2024 12:02:28 -0600

Did stuff

Diffstat:
MNotes.md | 17+++++++++++++++++
Mrewrite/Test.py | 49++++++++++++++++++++++++++++---------------------
Mrewrite/cpp/DecisionTreeClassifier.cpp | 78++++++++++++++++++++++++++++++++++++++++++++++++++++--------------------------
Mrewrite/cpp/TreeNode.cpp | 3++-
4 files changed, 99 insertions(+), 48 deletions(-)

diff --git a/Notes.md b/Notes.md @@ -48,3 +48,20 @@ random list 1,000 elements, 784 dimensions, depth = 1 same as above but with 3,000 elements: python and c++: 1m 55s (user) just c++ : 1m 37s (user) + + +--- + +1,000 samples with 784 dimensions, depth = 1: + c++ single thread (python time): + 10.33 seconds + 10.33 seconds + 10.3526 seconds + c++ with multithreading (python time): + 1.852 seconds + 1.901 seconds + 1.886 seconds + +HOLY SHIT!! This pins all 6 cpu cores. + + diff --git a/rewrite/Test.py b/rewrite/Test.py @@ -5,6 +5,7 @@ from sklearn.datasets import fetch_openml from sklearn.model_selection import train_test_split from sklearn.decomposition import PCA import time +import matplotlib.pyplot as plt mnist = fetch_openml("mnist_784", version=1) @@ -12,34 +13,40 @@ X, y = mnist["data"], mnist["target"] X = X.astype(np.float32) # Ensure data is in float64 format for the classifier y = y.astype(np.int32) # Ensure target labels are integers +accLs = [] +pca = PCA(n_components=784) # Reduce to 50 dimensions -X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) +for i in range(1,1000): + X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) -pca = PCA(n_components=75) # Reduce to 50 dimensions -X_train_pca = pca.fit_transform(X_train) -X_test_pca = pca.transform(X_test) + X_train_pca = pca.fit_transform(X_train) + X_test_pca = pca.transform(X_test) -X_train_pca = X_train_pca[0:20000] -y_train = y_train[0:10000] -# Initialize and train the Decision Tree Classifier -clf = DecisionTreeClassifier(25) -print("TRAINING") -start_time = time.time() -clf.fit(X_train_pca, len(X_train_pca), y_train, len(X_train_pca[0])) -end_time = time.time() -elapsed_time = end_time - start_time -print(f"Time taken: {elapsed_time:.4f} seconds") + num_samples = i + np.random.seed(42) + random_indices = np.random.choice(len(X_train_pca) - 1, num_samples, replace=False) + y_train = np.array(y_train) -# Predict on the test set -print("PREDICTING") -y_pred = clf.predict(X_test_pca, len(X_test_pca), len(X_test_pca[0])) -# Evaluate the classifier's accuracy -accuracy = accuracy_score(y_test, y_pred) -print(f"Accuracy of Decision Tree Classifier on MNIST: {accuracy * 100:.2f}%") + X_train_pca = X_train_pca[random_indices] + y_train = y_train[random_indices] + clf = DecisionTreeClassifier(100) + start_time = time.time() + clf.fit(X_train_pca, len(X_train_pca), y_train, len(X_train_pca[0])) + end_time = time.time() + elapsed_time = end_time - start_time + print(f"{i}: Time taken: {elapsed_time:.4f} seconds") -exit() + + y_pred = clf.predict(X_test_pca, len(X_test_pca), len(X_test_pca[0])) + accuracy = accuracy_score(y_test, y_pred) + print(f"Accuracy of Decision Tree Classifier on MNIST: {accuracy * 100:.2f}%") + + accLs.append(accuracy) + +plt.plot(range(1,1000), accLs) +plt.show() diff --git a/rewrite/cpp/DecisionTreeClassifier.cpp b/rewrite/cpp/DecisionTreeClassifier.cpp @@ -2,6 +2,8 @@ #include <limits> #include <iostream> #include <unordered_map> +#include <thread> +#include <mutex> using namespace std; @@ -112,37 +114,61 @@ TreeNode* DecisionTreeClassifier::recurse(float* X, int rows, int* y, int column // 2 1 1 // 4 1 3 -// consider adding interpolation to this and sorting the list first. -// Also, no reason to consider the 0th split if that is the case. - -TreeNode* DecisionTreeClassifier::bestSplit(float* X, int rows, int* y, int columns){ - - TreeNode* bestNode = nullptr; - float bestGini = std::numeric_limits<float>::max(); - for(int col = 0 ; col < columns; ++col){ - for(int row = 0; row < rows; ++row){ - float val = X[row*columns + col]; - TreeNode* current = new TreeNode(val, col); - float gini = current->evalSplit(X, y, rows, columns, "gini"); - if (gini < bestGini){ - TreeNode* prevBest = bestNode; - delete prevBest; - bestNode = current; - bestGini = gini; - } - else{ - delete current; - } - } - - } - - return bestNode; +// consider adding interpolation to this and sorting the list first. +// Also, no reason to consider the 0th split if that is the case. +TreeNode* DecisionTreeClassifier::bestSplit(float* X, int rows, int* y, int columns) { + TreeNode* bestNode = nullptr; + float bestGini = std::numeric_limits<float>::max(); + std::mutex mtx; + + for(int i = 0 ; i < rows; ++i){} + + auto evalColumn = [&](int col){ + TreeNode* localBestNode = nullptr; + float localBestGini = std::numeric_limits<float>::max(); + + for (int row = 0; row < rows; ++row) { + float val = X[row * columns + col]; + TreeNode* current = new TreeNode(val, col); + float gini = current->evalSplit(X, y, rows, columns, "gini"); + + if (gini < localBestGini) { + delete localBestNode; + localBestNode = current; + localBestGini = gini; + } else { + delete current; + } + } + + // Update global best values with mutex protection + std::lock_guard<std::mutex> lock(mtx); + if (localBestGini < bestGini) { + delete bestNode; + bestNode = localBestNode; + bestGini = localBestGini; + } else { + delete localBestNode; + } + }; + + // Create a thread pool to evaluate each column in parallel + std::vector<std::thread> threads; + for (int col = 0; col < columns; ++col) { + threads.emplace_back(evalColumn, col); + } + + // Wait for all threads to finish + for (auto& thread : threads) { + thread.join(); + } + + return bestNode; } int* DecisionTreeClassifier::predict(float* X, int samples, int features) { diff --git a/rewrite/cpp/TreeNode.cpp b/rewrite/cpp/TreeNode.cpp @@ -3,7 +3,8 @@ #include "Criterion.h" #include "math.h" #include "iostream" -#include <string> //for std::string +#include <string> +#include <sstream> TreeNode::TreeNode(int classification){ leaf = true;