hhcart-bright.py (8892B)
1 import numpy as np 2 import pandas as pd 3 from sklearn.model_selection import KFold 4 5 def compute_householder_matrix(d): 6 p = d.shape[0] 7 e1 = np.zeros_like(d) 8 e1[0] = 1.0 9 u = e1 - d 10 u = u / np.linalg.norm(u) 11 H = np.eye(p) - 2 * np.outer(u, u) 12 return H 13 14 def reflect_dataset(X, d): 15 H = compute_householder_matrix(d) 16 X_reflected = X @ H 17 return X_reflected, H 18 19 def compute_twoing_score(y_left, y_right): 20 # Avoid degenerate splits. 21 if len(y_left) == 0 or len(y_right) == 0: 22 return 0 23 24 total = len(y_left) + len(y_right) 25 P_L = len(y_left) / total 26 P_R = len(y_right) / total 27 28 classes = np.unique(np.concatenate([y_left, y_right])) 29 sum_diff = 0 30 for cls in classes: 31 P_Li = np.sum(y_left == cls) / len(y_left) 32 P_Ri = np.sum(y_right == cls) / len(y_right) 33 sum_diff += np.abs(P_Li - P_Ri) 34 35 return P_L * P_R * (sum_diff ** 2) 36 37 def find_best_reflection(X, y): 38 """ 39 For each class in y, compute all eigenvectors (from the class covariance), 40 reflect the dataset for each eigenvector candidate, and then for every axis in 41 the reflected feature space try all axis-aligned splits using the unique sample 42 values (without interpolation) to evaluate the twoing score. 43 Returns the parameters of the best split found. 44 """ 45 best_score = -np.inf 46 best_params = None 47 48 for cls in np.unique(y): 49 class_indices = np.where(y == cls)[0] 50 if len(class_indices) < 2: 51 continue 52 class_data = X[class_indices] 53 cov = np.cov(class_data, rowvar=False) 54 eigvals, eigvecs = np.linalg.eigh(cov) 55 56 # Evaluate all eigenvector candidates (each column in eigvecs) 57 for i in range(eigvecs.shape[1]): 58 d_candidate = eigvecs[:, i] 59 X_reflected, H = reflect_dataset(X, d_candidate) 60 p = X_reflected.shape[1] 61 # For each axis in the reflected space, check axis-aligned splits. 62 for j in range(p): 63 values = X_reflected[:, j] 64 unique_vals = np.unique(values) 65 if len(unique_vals) < 2: 66 continue 67 # Use each unique value directly as a candidate threshold. 68 for threshold in unique_vals: 69 left_idx = values <= threshold 70 right_idx = ~left_idx 71 if np.sum(left_idx) == 0 or np.sum(right_idx) == 0: 72 continue 73 score = compute_twoing_score(y[left_idx], y[right_idx]) 74 if score > best_score: 75 best_score = score 76 best_params = { 77 'H': H, 78 'd_candidate': d_candidate, # Store the eigenvector used. 79 'axis': j, 80 'threshold': threshold, 81 'left_idx': left_idx, 82 'right_idx': right_idx, 83 'score': score, 84 'class_used': cls, 85 'X_reflected': X_reflected 86 } 87 return best_params 88 89 class Node: 90 def __init__(self, is_leaf=False, prediction=None, H=None, d_candidate=None, axis=None, 91 threshold=None, score=None, left=None, right=None): 92 self.is_leaf = is_leaf 93 self.prediction = prediction # Majority class for leaf nodes. 94 self.H = H # Reflection matrix used at the node. 95 self.d_candidate = d_candidate # Eigenvector used for reflection. 96 self.axis = axis # Axis in the reflected space used for splitting. 97 self.threshold = threshold # Threshold for the axis-aligned split. 98 self.score = score # Twoing score for the chosen split. 99 self.left = left # Left child node. 100 self.right = right # Right child node. 101 102 class HHCartClassifier: 103 def __init__(self, max_depth=4, min_samples_split=2): 104 self.max_depth = max_depth 105 self.min_samples_split = min_samples_split 106 self.root = None 107 108 def fit(self, X, y): 109 self.root = self._build_tree(X, y, depth=0) 110 111 def _build_tree(self, X, y, depth): 112 # Stopping criteria: max depth reached, too few samples, or pure node. 113 if depth >= self.max_depth or len(y) < self.min_samples_split or np.unique(y).size == 1: 114 majority_class = np.bincount(y.astype(int)).argmax() 115 return Node(is_leaf=True, prediction=majority_class) 116 117 best_params = find_best_reflection(X, y) 118 if best_params is None: 119 majority_class = np.bincount(y.astype(int)).argmax() 120 return Node(is_leaf=True, prediction=majority_class) 121 122 H = best_params['H'] 123 d_candidate = best_params['d_candidate'] 124 axis = best_params['axis'] 125 threshold = best_params['threshold'] 126 left_idx = best_params['left_idx'] 127 right_idx = best_params['right_idx'] 128 score = best_params['score'] 129 130 # If the split does not actually separate the data, create a leaf. 131 if np.sum(left_idx) == 0 or np.sum(right_idx) == 0: 132 majority_class = np.bincount(y.astype(int)).argmax() 133 return Node(is_leaf=True, prediction=majority_class) 134 135 # Build left and right subtrees. 136 left_node = self._build_tree(X[left_idx], y[left_idx], depth + 1) 137 right_node = self._build_tree(X[right_idx], y[right_idx], depth + 1) 138 139 return Node(is_leaf=False, H=H, d_candidate=d_candidate, axis=axis, threshold=threshold, score=score, 140 left=left_node, right=right_node) 141 142 def _predict_sample(self, x, node): 143 if node.is_leaf: 144 return node.prediction 145 # Reflect the sample using the node's reflection matrix. 146 x_reflected = x @ node.H 147 # Use the stored axis in the reflected space for the split. 148 value = x_reflected[node.axis] 149 if value <= node.threshold: 150 return self._predict_sample(x, node.left) 151 else: 152 return self._predict_sample(x, node.right) 153 154 def predict(self, X): 155 return np.array([self._predict_sample(x, self.root) for x in X]) 156 157 def count_leaves(node): 158 """Recursively count the number of leaf nodes in the tree.""" 159 if node.is_leaf: 160 return 1 161 return count_leaves(node.left) + count_leaves(node.right) 162 163 164 import numpy as np 165 import pandas as pd 166 from sklearn.model_selection import KFold 167 from sklearn.metrics import accuracy_score 168 169 # Load dataset. 170 df = pd.read_csv('../../datasets/bright.csv') 171 y = df[df.columns[-1]] 172 df = df.drop(df.columns[-1], axis=1) 173 X = df.to_numpy(dtype=np.float64) 174 175 # Define hyperparameters 176 depths = range(1, 6) 177 n_splits = 5 178 n_trials = 10 179 seed = 15 180 181 # Store results: one list per depth collecting the mean accuracy and tree size from each iteration 182 results = {depth: [] for depth in depths} 183 tree_sizes = {depth: [] for depth in depths} 184 185 # For each iteration, generate the folds once and use them for each depth. 186 for trial in range(n_trials): 187 print("TRIAL: " + str(trial + 1)) 188 # Create KFold splits for this trial 189 kf = KFold(n_splits=n_splits, shuffle=True, random_state=(trial + seed)) 190 folds = list(kf.split(X)) 191 192 for depth in depths: 193 print("DEPTH: " + str(depth)) 194 fold_accuracies = [] 195 fold_sizes = [] 196 197 for train_index, test_index in folds: 198 X_train, X_test = X[train_index], X[test_index] 199 y_train, y_test = y[train_index], y[test_index] 200 201 # Create and train the classifier with the current depth 202 clf = HHCartClassifier(max_depth=depth, min_samples_split=2) 203 clf.fit(X_train, y_train) 204 preds = clf.predict(X_test) 205 206 # Compute and store the accuracy and tree size (number of leaf nodes) 207 fold_accuracies.append(accuracy_score(y_test, preds)) 208 fold_sizes.append(count_leaves(clf.root)) 209 210 # Store the mean accuracy and mean tree size from the folds of this trial 211 results[depth].append(np.mean(fold_accuracies)) 212 tree_sizes[depth].append(np.mean(fold_sizes)) 213 214 # Write the final averaged results over all iterations to file 215 with open("resultsBright.txt", "w") as f: 216 f.write("Grid Search Results:\n") 217 for depth in depths: 218 avg_accuracy = np.mean(results[depth]) * 100 # percent 219 std_accuracy = np.std(results[depth]) * 100 # percent 220 avg_size = np.mean(tree_sizes[depth]) 221 std_size = np.std(tree_sizes[depth]) 222 f.write(f"Depth: {depth} Avg Accuracy: {avg_accuracy:.1f} (Std: {std_accuracy:.1f}), ") 223 f.write(f"Avg # of Leaves: {avg_size:.1f} (Std: {std_size:.1f})\n")