commit 9e7ca9326bf012a30976c94000b52aba15e86f79
parent e73bbe68068fc543da8bdb0953cd5051b4d9fb63
Author: Andrew Laack <andrew@laack.co>
Date: Sun, 15 Feb 2026 20:26:39 -0600
Added from scratch directory
Diffstat:
| A | fromScratch/KNN.py | | | 149 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
1 file changed, 149 insertions(+), 0 deletions(-)
diff --git a/fromScratch/KNN.py b/fromScratch/KNN.py
@@ -0,0 +1,149 @@
+# K-Nearest-Neighbors
+import random
+import heapq
+from matplotlib import pyplot as plt
+import numpy as np
+
+
+class Neighbor:
+ def __init__(self, distance, index):
+ self.distance = distance
+ self.index = index
+
+ # this is done to make min heap simulate a max heap
+ def __lt__(self, other):
+ return -self.distance < -other.distance
+
+ def __str__(self):
+ return "Distance: " + str(self.distance) + " Index: " + str(self.index)
+
+
+class KNN:
+ def __init__(self, n):
+ self.n = n
+
+ def fit(self, X, y):
+ self.X = X.copy()
+ self.y = y.copy()
+
+ def predict(self, X):
+
+ preds = []
+
+ for i in range(len(X)):
+
+ nearest_neighbors = []
+
+ for index, instance in enumerate(self.X):
+ euc_dist = euc_dist_squared(instance, X[i])
+ neighbor = Neighbor(euc_dist, index)
+
+ if len(nearest_neighbors) > 0:
+ if len(nearest_neighbors) < self.n:
+ heapq.heappush(nearest_neighbors, neighbor)
+ else:
+ if neighbor.distance < nearest_neighbors[0].distance:
+ heapq.heappop(nearest_neighbors)
+ heapq.heappush(nearest_neighbors, neighbor)
+ else:
+ heapq.heappush(nearest_neighbors, neighbor)
+
+ if self.n > len(self.X):
+ assert len(nearest_neighbors) == self.n
+
+ preds.append(self._most_common_vote(nearest_neighbors))
+
+ return preds
+
+ def _most_common_vote(self, neighbors):
+
+ votes = {}
+
+ for neighbor in neighbors:
+ classification = self.y[neighbor.index]
+ votes[classification] = votes.get(classification, 0) + 1
+
+ max = 0
+ most_popular = ""
+
+ for key in votes:
+ if votes[key] > max:
+ max = votes[key]
+ most_popular = key
+
+ return most_popular
+
+
+def euc_dist_squared(input_1, input_2):
+ assert len(input_1) == len(input_2)
+ sum = 0
+ for i in range(len(input_1)):
+ sum += (input_1[i] - input_2[i]) ** 2
+ return sum
+
+
+if __name__ == "__main__":
+ knn = KNN(5)
+
+ samples_1 = 500
+ samples_2 = 500
+
+ cluster_1 = [[random.gauss(0, 5), random.gauss(0, 5)] for _ in range(samples_1)]
+ cluster_2 = [[random.gauss(10, 5), random.gauss(10, 5)] for _ in range(samples_2)]
+ cluster_1.extend(cluster_2)
+ X = cluster_1
+
+ y = [0] * samples_1
+ y.extend([1] * samples_2)
+
+ knn.fit(X, y)
+
+ test_0 = [[random.gauss(0, 5), random.gauss(0, 5)] for _ in range(samples_1)]
+ test_1 = [[random.gauss(10, 5), random.gauss(10, 5)] for _ in range(samples_2)]
+
+ test_0.extend(test_1)
+ test_X = test_0
+
+ preds = knn.predict(test_X)
+
+ X = np.array(X)
+ test_X = np.array(test_X)
+ test_y = [0] * samples_1
+ test_y.extend([1] * samples_2)
+
+ correctness = []
+
+ for i in range(len(test_y)):
+ if test_y[i] != preds[i]:
+ correctness.append(0)
+ else:
+ correctness.append(1)
+
+ # ORIGINAL
+ plt.scatter(x=X[:, 0], y=X[:, 1], c=y, cmap="jet", alpha=0.1)
+
+ alphas = []
+ for correct in correctness:
+ if correct == 1:
+ alphas.append(0.1)
+ else:
+ alphas.append(1)
+
+ # PREDICTIONS
+ plt.scatter(
+ x=test_X[:, 0],
+ y=test_X[:, 1],
+ c=test_y,
+ cmap="jet",
+ edgecolors="black",
+ alpha=alphas,
+ )
+
+ plt.show()
+
+ correct_count = 0
+ for correct in correctness:
+ if correct == 1:
+ correct_count += 1
+
+ print("Accuracy: ", correct_count / len(correctness))