machinelearning

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs

commit 9e7ca9326bf012a30976c94000b52aba15e86f79
parent e73bbe68068fc543da8bdb0953cd5051b4d9fb63
Author: Andrew Laack <andrew@laack.co>
Date:   Sun, 15 Feb 2026 20:26:39 -0600

Added from scratch directory

Diffstat:
AfromScratch/KNN.py | 149+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 149 insertions(+), 0 deletions(-)

diff --git a/fromScratch/KNN.py b/fromScratch/KNN.py @@ -0,0 +1,149 @@ +# K-Nearest-Neighbors +import random +import heapq +from matplotlib import pyplot as plt +import numpy as np + + +class Neighbor: + def __init__(self, distance, index): + self.distance = distance + self.index = index + + # this is done to make min heap simulate a max heap + def __lt__(self, other): + return -self.distance < -other.distance + + def __str__(self): + return "Distance: " + str(self.distance) + " Index: " + str(self.index) + + +class KNN: + def __init__(self, n): + self.n = n + + def fit(self, X, y): + self.X = X.copy() + self.y = y.copy() + + def predict(self, X): + + preds = [] + + for i in range(len(X)): + + nearest_neighbors = [] + + for index, instance in enumerate(self.X): + euc_dist = euc_dist_squared(instance, X[i]) + neighbor = Neighbor(euc_dist, index) + + if len(nearest_neighbors) > 0: + if len(nearest_neighbors) < self.n: + heapq.heappush(nearest_neighbors, neighbor) + else: + if neighbor.distance < nearest_neighbors[0].distance: + heapq.heappop(nearest_neighbors) + heapq.heappush(nearest_neighbors, neighbor) + else: + heapq.heappush(nearest_neighbors, neighbor) + + if self.n > len(self.X): + assert len(nearest_neighbors) == self.n + + preds.append(self._most_common_vote(nearest_neighbors)) + + return preds + + def _most_common_vote(self, neighbors): + + votes = {} + + for neighbor in neighbors: + classification = self.y[neighbor.index] + votes[classification] = votes.get(classification, 0) + 1 + + max = 0 + most_popular = "" + + for key in votes: + if votes[key] > max: + max = votes[key] + most_popular = key + + return most_popular + + +def euc_dist_squared(input_1, input_2): + assert len(input_1) == len(input_2) + sum = 0 + for i in range(len(input_1)): + sum += (input_1[i] - input_2[i]) ** 2 + return sum + + +if __name__ == "__main__": + knn = KNN(5) + + samples_1 = 500 + samples_2 = 500 + + cluster_1 = [[random.gauss(0, 5), random.gauss(0, 5)] for _ in range(samples_1)] + cluster_2 = [[random.gauss(10, 5), random.gauss(10, 5)] for _ in range(samples_2)] + cluster_1.extend(cluster_2) + X = cluster_1 + + y = [0] * samples_1 + y.extend([1] * samples_2) + + knn.fit(X, y) + + test_0 = [[random.gauss(0, 5), random.gauss(0, 5)] for _ in range(samples_1)] + test_1 = [[random.gauss(10, 5), random.gauss(10, 5)] for _ in range(samples_2)] + + test_0.extend(test_1) + test_X = test_0 + + preds = knn.predict(test_X) + + X = np.array(X) + test_X = np.array(test_X) + test_y = [0] * samples_1 + test_y.extend([1] * samples_2) + + correctness = [] + + for i in range(len(test_y)): + if test_y[i] != preds[i]: + correctness.append(0) + else: + correctness.append(1) + + # ORIGINAL + plt.scatter(x=X[:, 0], y=X[:, 1], c=y, cmap="jet", alpha=0.1) + + alphas = [] + for correct in correctness: + if correct == 1: + alphas.append(0.1) + else: + alphas.append(1) + + # PREDICTIONS + plt.scatter( + x=test_X[:, 0], + y=test_X[:, 1], + c=test_y, + cmap="jet", + edgecolors="black", + alpha=alphas, + ) + + plt.show() + + correct_count = 0 + for correct in correctness: + if correct == 1: + correct_count += 1 + + print("Accuracy: ", correct_count / len(correctness))