hhcart-iris.py (8597B)
1 import numpy as np 2 import pandas as pd 3 from sklearn.model_selection import KFold 4 5 def compute_householder_matrix(d1): 6 p = d1.shape[0] 7 e1 = np.zeros_like(d1) 8 e1[0] = 1.0 9 u = e1 - d1 10 u = u / np.linalg.norm(u) 11 H = np.eye(p) - 2 * np.outer(u, u) 12 return H 13 14 def reflect_dataset(X, d1): 15 H = compute_householder_matrix(d1) 16 X_reflected = X @ H 17 return X_reflected, H 18 19 def compute_twoing_score(y_left, y_right): 20 # Avoid degenerate splits. 21 if len(y_left) == 0 or len(y_right) == 0: 22 return 0 23 24 total = len(y_left) + len(y_right) 25 P_L = len(y_left) / total 26 P_R = len(y_right) / total 27 28 classes = np.unique(np.concatenate([y_left, y_right])) 29 sum_diff = 0 30 for cls in classes: 31 P_Li = np.sum(y_left == cls) / len(y_left) 32 P_Ri = np.sum(y_right == cls) / len(y_right) 33 sum_diff += np.abs(P_Li - P_Ri) 34 35 return P_L * P_R * (sum_diff ** 2) 36 37 def find_best_reflection(X, y): 38 """ 39 For each class in y, compute the dominant eigenvector (largest eigenvalue eigenvector of the class covariance), 40 reflect the dataset, and then for every axis in the reflected feature space try all axis-aligned splits using 41 the unique sample values (without interpolation) to evaluate the twoing score. 42 Returns the parameters of the best split found. 43 """ 44 best_score = -np.inf 45 best_params = None 46 47 for cls in np.unique(y): 48 class_indices = np.where(y == cls)[0] 49 if len(class_indices) < 2: 50 continue 51 class_data = X[class_indices] 52 cov = np.cov(class_data, rowvar=False) 53 eigvals, eigvecs = np.linalg.eigh(cov) 54 # Choose the dominant eigenvector (largest eigenvalue) 55 d1 = eigvecs[:, -1] 56 57 X_reflected, H = reflect_dataset(X, d1) 58 p = X_reflected.shape[1] 59 # For each axis in the reflected space, check axis-aligned splits. 60 for j in range(p): 61 values = X_reflected[:, j] 62 unique_vals = np.unique(values) 63 if len(unique_vals) < 2: 64 continue 65 # Use each unique value directly as a candidate threshold. 66 for threshold in unique_vals: 67 left_idx = values <= threshold 68 right_idx = ~left_idx 69 if np.sum(left_idx) == 0 or np.sum(right_idx) == 0: 70 continue 71 score = compute_twoing_score(y[left_idx], y[right_idx]) 72 if score > best_score: 73 best_score = score 74 best_params = { 75 'H': H, 76 'd1': d1, 77 'axis': j, 78 'threshold': threshold, 79 'left_idx': left_idx, 80 'right_idx': right_idx, 81 'score': score, 82 'class_used': cls, 83 'X_reflected': X_reflected 84 } 85 return best_params 86 87 class Node: 88 def __init__(self, is_leaf=False, prediction=None, H=None, d1=None, axis=None, 89 threshold=None, score=None, left=None, right=None): 90 self.is_leaf = is_leaf 91 self.prediction = prediction # Majority class for leaf nodes. 92 self.H = H # Reflection matrix used at the node. 93 self.d1 = d1 # Dominant eigenvector used. 94 self.axis = axis # Axis in the reflected space used for splitting. 95 self.threshold = threshold # Threshold for the axis-aligned split. 96 self.score = score # Twoing score for the chosen split. 97 self.left = left # Left child node. 98 self.right = right # Right child node. 99 100 class HHCartClassifier: 101 def __init__(self, max_depth=4, min_samples_split=2): 102 self.max_depth = max_depth 103 self.min_samples_split = min_samples_split 104 self.root = None 105 106 def fit(self, X, y): 107 self.root = self._build_tree(X, y, depth=0) 108 109 def _build_tree(self, X, y, depth): 110 # Stopping criteria: max depth reached, too few samples, or pure node. 111 if depth >= self.max_depth or len(y) < self.min_samples_split or np.unique(y).size == 1: 112 majority_class = np.bincount(y.astype(int)).argmax() 113 return Node(is_leaf=True, prediction=majority_class) 114 115 best_params = find_best_reflection(X, y) 116 if best_params is None: 117 majority_class = np.bincount(y.astype(int)).argmax() 118 return Node(is_leaf=True, prediction=majority_class) 119 120 H = best_params['H'] 121 d1 = best_params['d1'] 122 axis = best_params['axis'] 123 threshold = best_params['threshold'] 124 left_idx = best_params['left_idx'] 125 right_idx = best_params['right_idx'] 126 score = best_params['score'] 127 128 # If the split does not actually separate the data, create a leaf. 129 if np.sum(left_idx) == 0 or np.sum(right_idx) == 0: 130 majority_class = np.bincount(y.astype(int)).argmax() 131 return Node(is_leaf=True, prediction=majority_class) 132 133 # Build left and right subtrees. 134 left_node = self._build_tree(X[left_idx], y[left_idx], depth + 1) 135 right_node = self._build_tree(X[right_idx], y[right_idx], depth + 1) 136 137 return Node(is_leaf=False, H=H, d1=d1, axis=axis, threshold=threshold, score=score, 138 left=left_node, right=right_node) 139 140 def _predict_sample(self, x, node): 141 if node.is_leaf: 142 return node.prediction 143 # Reflect the sample using the node's reflection matrix. 144 x_reflected = x @ node.H 145 # Use the stored axis in the reflected space for the split. 146 value = x_reflected[node.axis] 147 if value <= node.threshold: 148 return self._predict_sample(x, node.left) 149 else: 150 return self._predict_sample(x, node.right) 151 152 def predict(self, X): 153 return np.array([self._predict_sample(x, self.root) for x in X]) 154 155 def count_leaves(node): 156 """Recursively count the number of leaf nodes in the tree.""" 157 if node.is_leaf: 158 return 1 159 return count_leaves(node.left) + count_leaves(node.right) 160 161 import numpy as np 162 import pandas as pd 163 from sklearn.model_selection import KFold 164 from sklearn.metrics import accuracy_score 165 from sklearn.datasets import load_iris 166 167 iris = load_iris() 168 df = pd.DataFrame(iris.data, columns=iris.feature_names) 169 y = iris.target 170 X = df.to_numpy() 171 172 # Define hyperparameters 173 depths = range(1, 6) 174 n_splits = 5 175 n_trials = 10 176 seed = 15 177 178 # Store results: one list per depth collecting the mean accuracy and tree size from each iteration 179 results = {depth: [] for depth in depths} 180 tree_sizes = {depth: [] for depth in depths} 181 182 # For each iteration, generate the folds once and use them for each depth. 183 for trial in range(n_trials): 184 print("TRIAL: " + str(trial + 1)) 185 # Create KFold splits for this trial 186 kf = KFold(n_splits=n_splits, shuffle=True, random_state=(trial + seed)) 187 folds = list(kf.split(X)) 188 189 for depth in depths: 190 print("DEPTH: " + str(depth)) 191 fold_accuracies = [] 192 fold_sizes = [] 193 194 for train_index, test_index in folds: 195 X_train, X_test = X[train_index], X[test_index] 196 y_train, y_test = y[train_index], y[test_index] 197 198 # Create and train the classifier with the current depth 199 clf = HHCartClassifier(max_depth=depth, min_samples_split=2) 200 clf.fit(X_train, y_train) 201 preds = clf.predict(X_test) 202 203 # Compute and store the accuracy and tree size (number of leaf nodes) 204 fold_accuracies.append(accuracy_score(y_test, preds)) 205 fold_sizes.append(count_leaves(clf.root)) 206 207 # Store the mean accuracy and mean tree size from the folds of this trial 208 results[depth].append(np.mean(fold_accuracies)) 209 tree_sizes[depth].append(np.mean(fold_sizes)) 210 211 # Write the final averaged results over all iterations to file 212 with open("resultsIris.txt", "w") as f: 213 f.write("Grid Search Results:\n") 214 for depth in depths: 215 avg_accuracy = np.mean(results[depth]) * 100 # percent 216 std_accuracy = np.std(results[depth]) * 100 # percent 217 avg_size = np.mean(tree_sizes[depth]) 218 std_size = np.std(tree_sizes[depth]) 219 f.write(f"Depth: {depth} Avg Accuracy: {avg_accuracy:.1f} (Std: {std_accuracy:.1f}), ") 220 f.write(f"Avg # of Leaves: {avg_size:.1f} (Std: {std_size:.1f})\n")