Did stuff - decision-tree-classifier - Unnamed repository; edit this file 'description' to name the repository.

commit 93a716343d4dfad27a8ca44df30ef0fc5de879e2
parent 7d9083a0c8161d3ee0671936348eefcc0c02d8d6
Author: Andrew <andrewlaack1@gmail.com>
Date:   Tue, 24 Dec 2024 12:02:28 -0600

Did stuff

Diffstat:
M Notes.md  | 17 +++++++++++++++++
M rewrite/Test.py  | 49 ++++++++++++++++++++++++++++---------------------
M rewrite/cpp/DecisionTreeClassifier.cpp  | 78 ++++++++++++++++++++++++++++++++++++++++++++++++++++--------------------------
M rewrite/cpp/TreeNode.cpp  | 3 ++-

4 files changed, 99 insertions(+), 48 deletions(-)
diff --git a/Notes.md b/Notes.md
@@ -48,3 +48,20 @@ random list 1,000 elements, 784 dimensions, depth = 1
 same as above but with 3,000 elements:
     python and c++: 1m 55s (user)
     just c++ :  1m 37s (user)
+
+
+---
+
+1,000 samples with 784 dimensions, depth = 1:
+    c++ single thread (python time):
+        10.33 seconds
+        10.33 seconds
+        10.3526 seconds
+    c++ with multithreading (python time):
+        1.852 seconds
+        1.901 seconds
+        1.886 seconds
+
+HOLY SHIT!! This pins all 6 cpu cores.
+
+
diff --git a/rewrite/Test.py b/rewrite/Test.py
@@ -5,6 +5,7 @@ from sklearn.datasets import fetch_openml
 from sklearn.model_selection import train_test_split
 from sklearn.decomposition import PCA
 import time
+import matplotlib.pyplot as plt
 
 
 mnist = fetch_openml("mnist_784", version=1)
@@ -12,34 +13,40 @@ X, y = mnist["data"], mnist["target"]
 X = X.astype(np.float32)  # Ensure data is in float64 format for the classifier
 y = y.astype(np.int32)    # Ensure target labels are integers
 
+accLs = []
+pca = PCA(n_components=784)  # Reduce to 50 dimensions
 
-X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+for i in range(1,1000):
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
 
-pca = PCA(n_components=75)  # Reduce to 50 dimensions
-X_train_pca = pca.fit_transform(X_train)
-X_test_pca = pca.transform(X_test)
+    X_train_pca = pca.fit_transform(X_train)
+    X_test_pca = pca.transform(X_test)
 
-X_train_pca = X_train_pca[0:20000]
-y_train = y_train[0:10000]
 
-# Initialize and train the Decision Tree Classifier
-clf = DecisionTreeClassifier(25)
-print("TRAINING")
 
-start_time = time.time()
-clf.fit(X_train_pca, len(X_train_pca), y_train, len(X_train_pca[0]))
-end_time = time.time()
-elapsed_time = end_time - start_time
-print(f"Time taken: {elapsed_time:.4f} seconds")
+    num_samples = i
+    np.random.seed(42)
+    random_indices = np.random.choice(len(X_train_pca) - 1, num_samples, replace=False)
 
+    y_train = np.array(y_train)
 
-# Predict on the test set
-print("PREDICTING")
-y_pred = clf.predict(X_test_pca, len(X_test_pca), len(X_test_pca[0]))
-# Evaluate the classifier's accuracy
-accuracy = accuracy_score(y_test, y_pred)
-print(f"Accuracy of Decision Tree Classifier on MNIST: {accuracy * 100:.2f}%")
+    X_train_pca = X_train_pca[random_indices]
+    y_train = y_train[random_indices]
 
+    clf = DecisionTreeClassifier(100)
 
+    start_time = time.time()
+    clf.fit(X_train_pca, len(X_train_pca), y_train, len(X_train_pca[0]))
+    end_time = time.time()
+    elapsed_time = end_time - start_time
+    print(f"{i}: Time taken: {elapsed_time:.4f} seconds")
 
-exit()
+
+    y_pred = clf.predict(X_test_pca, len(X_test_pca), len(X_test_pca[0]))
+    accuracy = accuracy_score(y_test, y_pred)
+    print(f"Accuracy of Decision Tree Classifier on MNIST: {accuracy * 100:.2f}%")
+
+    accLs.append(accuracy)
+
+plt.plot(range(1,1000), accLs)
+plt.show()
diff --git a/rewrite/cpp/DecisionTreeClassifier.cpp b/rewrite/cpp/DecisionTreeClassifier.cpp
@@ -2,6 +2,8 @@
 #include <limits>
 #include <iostream>
 #include <unordered_map>
+#include <thread>
+#include <mutex>
 
 using namespace std;
 
@@ -112,37 +114,61 @@ TreeNode* DecisionTreeClassifier::recurse(float* X, int rows, int* y, int column
 //	2	1	1
 //	4	1	3
 
-// consider adding interpolation to this and sorting the list first.
-// Also, no reason to consider the 0th split if that is the case.
-
-TreeNode* DecisionTreeClassifier::bestSplit(float* X, int rows, int* y, int columns){
-
-	TreeNode* bestNode = nullptr;
-	float bestGini = std::numeric_limits<float>::max();
 
-	for(int col = 0 ; col < columns; ++col){
-		for(int row = 0; row < rows; ++row){
 
-			float val = X[row*columns + col];
-			TreeNode* current = new TreeNode(val, col);
-			float gini = current->evalSplit(X, y, rows, columns, "gini");
-			if (gini < bestGini){
 
-				TreeNode* prevBest = bestNode;
-				delete prevBest;
 
-				bestNode = current;
-				bestGini = gini;
-			}
-			else{
-				delete current;
-			}
-		}
-
-	}
-
-	return bestNode;
+// consider adding interpolation to this and sorting the list first.
+// Also, no reason to consider the 0th split if that is the case.
 
+TreeNode* DecisionTreeClassifier::bestSplit(float* X, int rows, int* y, int columns) {
+    TreeNode* bestNode = nullptr;
+    float bestGini = std::numeric_limits<float>::max();
+    std::mutex mtx; 
+
+	for(int i = 0 ; i < rows; ++i){}
+
+    auto evalColumn = [&](int col){
+        TreeNode* localBestNode = nullptr;
+        float localBestGini = std::numeric_limits<float>::max();
+
+        for (int row = 0; row < rows; ++row) {
+            float val = X[row * columns + col];
+            TreeNode* current = new TreeNode(val, col);
+            float gini = current->evalSplit(X, y, rows, columns, "gini");
+
+            if (gini < localBestGini) {
+                delete localBestNode;
+                localBestNode = current;
+                localBestGini = gini;
+            } else {
+                delete current;
+            }
+        }
+
+        // Update global best values with mutex protection
+        std::lock_guard<std::mutex> lock(mtx);
+        if (localBestGini < bestGini) {
+            delete bestNode;
+            bestNode = localBestNode;
+            bestGini = localBestGini;
+        } else {
+            delete localBestNode;
+        }
+    };
+
+    // Create a thread pool to evaluate each column in parallel
+    std::vector<std::thread> threads;
+    for (int col = 0; col < columns; ++col) {
+        threads.emplace_back(evalColumn, col);
+    }
+
+    // Wait for all threads to finish
+    for (auto& thread : threads) {
+        thread.join();
+    }
+
+    return bestNode;
 }
 
 int* DecisionTreeClassifier::predict(float* X, int samples, int features) {
diff --git a/rewrite/cpp/TreeNode.cpp b/rewrite/cpp/TreeNode.cpp
@@ -3,7 +3,8 @@
 #include "Criterion.h"
 #include "math.h"
 #include "iostream"
-#include <string>  //for std::string
+#include <string>
+#include <sstream>
 
 TreeNode::TreeNode(int classification){
 	leaf = true;

	decision-tree-classifier Unnamed repository; edit this file 'description' to name the repository.
	Log \| Files \| Refs \| README \| LICENSE

M	Notes.md	\|	17	+++++++++++++++++
M	rewrite/Test.py	\|	49	++++++++++++++++++++++++++++---------------------
M	rewrite/cpp/DecisionTreeClassifier.cpp	\|	78	++++++++++++++++++++++++++++++++++++++++++++++++++++--------------------------
M	rewrite/cpp/TreeNode.cpp	\|	3	++-