commit 93a716343d4dfad27a8ca44df30ef0fc5de879e2
parent 7d9083a0c8161d3ee0671936348eefcc0c02d8d6
Author: Andrew <andrewlaack1@gmail.com>
Date: Tue, 24 Dec 2024 12:02:28 -0600
Did stuff
Diffstat:
4 files changed, 99 insertions(+), 48 deletions(-)
diff --git a/Notes.md b/Notes.md
@@ -48,3 +48,20 @@ random list 1,000 elements, 784 dimensions, depth = 1
same as above but with 3,000 elements:
python and c++: 1m 55s (user)
just c++ : 1m 37s (user)
+
+
+---
+
+1,000 samples with 784 dimensions, depth = 1:
+ c++ single thread (python time):
+ 10.33 seconds
+ 10.33 seconds
+ 10.3526 seconds
+ c++ with multithreading (python time):
+ 1.852 seconds
+ 1.901 seconds
+ 1.886 seconds
+
+HOLY SHIT!! This pins all 6 cpu cores.
+
+
diff --git a/rewrite/Test.py b/rewrite/Test.py
@@ -5,6 +5,7 @@ from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
import time
+import matplotlib.pyplot as plt
mnist = fetch_openml("mnist_784", version=1)
@@ -12,34 +13,40 @@ X, y = mnist["data"], mnist["target"]
X = X.astype(np.float32) # Ensure data is in float64 format for the classifier
y = y.astype(np.int32) # Ensure target labels are integers
+accLs = []
+pca = PCA(n_components=784) # Reduce to 50 dimensions
-X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+for i in range(1,1000):
+ X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
-pca = PCA(n_components=75) # Reduce to 50 dimensions
-X_train_pca = pca.fit_transform(X_train)
-X_test_pca = pca.transform(X_test)
+ X_train_pca = pca.fit_transform(X_train)
+ X_test_pca = pca.transform(X_test)
-X_train_pca = X_train_pca[0:20000]
-y_train = y_train[0:10000]
-# Initialize and train the Decision Tree Classifier
-clf = DecisionTreeClassifier(25)
-print("TRAINING")
-start_time = time.time()
-clf.fit(X_train_pca, len(X_train_pca), y_train, len(X_train_pca[0]))
-end_time = time.time()
-elapsed_time = end_time - start_time
-print(f"Time taken: {elapsed_time:.4f} seconds")
+ num_samples = i
+ np.random.seed(42)
+ random_indices = np.random.choice(len(X_train_pca) - 1, num_samples, replace=False)
+ y_train = np.array(y_train)
-# Predict on the test set
-print("PREDICTING")
-y_pred = clf.predict(X_test_pca, len(X_test_pca), len(X_test_pca[0]))
-# Evaluate the classifier's accuracy
-accuracy = accuracy_score(y_test, y_pred)
-print(f"Accuracy of Decision Tree Classifier on MNIST: {accuracy * 100:.2f}%")
+ X_train_pca = X_train_pca[random_indices]
+ y_train = y_train[random_indices]
+ clf = DecisionTreeClassifier(100)
+ start_time = time.time()
+ clf.fit(X_train_pca, len(X_train_pca), y_train, len(X_train_pca[0]))
+ end_time = time.time()
+ elapsed_time = end_time - start_time
+ print(f"{i}: Time taken: {elapsed_time:.4f} seconds")
-exit()
+
+ y_pred = clf.predict(X_test_pca, len(X_test_pca), len(X_test_pca[0]))
+ accuracy = accuracy_score(y_test, y_pred)
+ print(f"Accuracy of Decision Tree Classifier on MNIST: {accuracy * 100:.2f}%")
+
+ accLs.append(accuracy)
+
+plt.plot(range(1,1000), accLs)
+plt.show()
diff --git a/rewrite/cpp/DecisionTreeClassifier.cpp b/rewrite/cpp/DecisionTreeClassifier.cpp
@@ -2,6 +2,8 @@
#include <limits>
#include <iostream>
#include <unordered_map>
+#include <thread>
+#include <mutex>
using namespace std;
@@ -112,37 +114,61 @@ TreeNode* DecisionTreeClassifier::recurse(float* X, int rows, int* y, int column
// 2 1 1
// 4 1 3
-// consider adding interpolation to this and sorting the list first.
-// Also, no reason to consider the 0th split if that is the case.
-
-TreeNode* DecisionTreeClassifier::bestSplit(float* X, int rows, int* y, int columns){
-
- TreeNode* bestNode = nullptr;
- float bestGini = std::numeric_limits<float>::max();
- for(int col = 0 ; col < columns; ++col){
- for(int row = 0; row < rows; ++row){
- float val = X[row*columns + col];
- TreeNode* current = new TreeNode(val, col);
- float gini = current->evalSplit(X, y, rows, columns, "gini");
- if (gini < bestGini){
- TreeNode* prevBest = bestNode;
- delete prevBest;
- bestNode = current;
- bestGini = gini;
- }
- else{
- delete current;
- }
- }
-
- }
-
- return bestNode;
+// consider adding interpolation to this and sorting the list first.
+// Also, no reason to consider the 0th split if that is the case.
+TreeNode* DecisionTreeClassifier::bestSplit(float* X, int rows, int* y, int columns) {
+ TreeNode* bestNode = nullptr;
+ float bestGini = std::numeric_limits<float>::max();
+ std::mutex mtx;
+
+ for(int i = 0 ; i < rows; ++i){}
+
+ auto evalColumn = [&](int col){
+ TreeNode* localBestNode = nullptr;
+ float localBestGini = std::numeric_limits<float>::max();
+
+ for (int row = 0; row < rows; ++row) {
+ float val = X[row * columns + col];
+ TreeNode* current = new TreeNode(val, col);
+ float gini = current->evalSplit(X, y, rows, columns, "gini");
+
+ if (gini < localBestGini) {
+ delete localBestNode;
+ localBestNode = current;
+ localBestGini = gini;
+ } else {
+ delete current;
+ }
+ }
+
+ // Update global best values with mutex protection
+ std::lock_guard<std::mutex> lock(mtx);
+ if (localBestGini < bestGini) {
+ delete bestNode;
+ bestNode = localBestNode;
+ bestGini = localBestGini;
+ } else {
+ delete localBestNode;
+ }
+ };
+
+ // Create a thread pool to evaluate each column in parallel
+ std::vector<std::thread> threads;
+ for (int col = 0; col < columns; ++col) {
+ threads.emplace_back(evalColumn, col);
+ }
+
+ // Wait for all threads to finish
+ for (auto& thread : threads) {
+ thread.join();
+ }
+
+ return bestNode;
}
int* DecisionTreeClassifier::predict(float* X, int samples, int features) {
diff --git a/rewrite/cpp/TreeNode.cpp b/rewrite/cpp/TreeNode.cpp
@@ -3,7 +3,8 @@
#include "Criterion.h"
#include "math.h"
#include "iostream"
-#include <string> //for std::string
+#include <string>
+#include <sstream>
TreeNode::TreeNode(int classification){
leaf = true;