hhcart-dim.py (8584B)
1 import numpy as np 2 import pandas as pd 3 from sklearn.model_selection import KFold 4 5 def compute_householder_matrix(d1): 6 p = d1.shape[0] 7 e1 = np.zeros_like(d1) 8 e1[0] = 1.0 9 u = e1 - d1 10 u = u / np.linalg.norm(u) 11 H = np.eye(p) - 2 * np.outer(u, u) 12 return H 13 14 def reflect_dataset(X, d1): 15 H = compute_householder_matrix(d1) 16 X_reflected = X @ H 17 return X_reflected, H 18 19 def compute_twoing_score(y_left, y_right): 20 # Avoid degenerate splits. 21 if len(y_left) == 0 or len(y_right) == 0: 22 return 0 23 24 total = len(y_left) + len(y_right) 25 P_L = len(y_left) / total 26 P_R = len(y_right) / total 27 28 classes = np.unique(np.concatenate([y_left, y_right])) 29 sum_diff = 0 30 for cls in classes: 31 P_Li = np.sum(y_left == cls) / len(y_left) 32 P_Ri = np.sum(y_right == cls) / len(y_right) 33 sum_diff += np.abs(P_Li - P_Ri) 34 35 return P_L * P_R * (sum_diff ** 2) 36 37 def find_best_reflection(X, y): 38 """ 39 For each class in y, compute the dominant eigenvector (largest eigenvalue eigenvector of the class covariance), 40 reflect the dataset, and then for every axis in the reflected feature space try all axis-aligned splits using 41 the unique sample values (without interpolation) to evaluate the twoing score. 42 Returns the parameters of the best split found. 43 """ 44 best_score = -np.inf 45 best_params = None 46 47 for cls in np.unique(y): 48 class_indices = np.where(y == cls)[0] 49 if len(class_indices) < 2: 50 continue 51 class_data = X[class_indices] 52 cov = np.cov(class_data, rowvar=False) 53 eigvals, eigvecs = np.linalg.eigh(cov) 54 # Choose the dominant eigenvector (largest eigenvalue) 55 d1 = eigvecs[:, -1] 56 57 X_reflected, H = reflect_dataset(X, d1) 58 p = X_reflected.shape[1] 59 # For each axis in the reflected space, check axis-aligned splits. 60 for j in range(p): 61 values = X_reflected[:, j] 62 unique_vals = np.unique(values) 63 if len(unique_vals) < 2: 64 continue 65 # Use each unique value directly as a candidate threshold. 66 for threshold in unique_vals: 67 left_idx = values <= threshold 68 right_idx = ~left_idx 69 if np.sum(left_idx) == 0 or np.sum(right_idx) == 0: 70 continue 71 score = compute_twoing_score(y[left_idx], y[right_idx]) 72 if score > best_score: 73 best_score = score 74 best_params = { 75 'H': H, 76 'd1': d1, 77 'axis': j, 78 'threshold': threshold, 79 'left_idx': left_idx, 80 'right_idx': right_idx, 81 'score': score, 82 'class_used': cls, 83 'X_reflected': X_reflected 84 } 85 return best_params 86 87 class Node: 88 def __init__(self, is_leaf=False, prediction=None, H=None, d1=None, axis=None, 89 threshold=None, score=None, left=None, right=None): 90 self.is_leaf = is_leaf 91 self.prediction = prediction # Majority class for leaf nodes. 92 self.H = H # Reflection matrix used at the node. 93 self.d1 = d1 # Dominant eigenvector used. 94 self.axis = axis # Axis in the reflected space used for splitting. 95 self.threshold = threshold # Threshold for the axis-aligned split. 96 self.score = score # Twoing score for the chosen split. 97 self.left = left # Left child node. 98 self.right = right # Right child node. 99 100 class HHCartClassifier: 101 def __init__(self, max_depth=4, min_samples_split=2): 102 self.max_depth = max_depth 103 self.min_samples_split = min_samples_split 104 self.root = None 105 106 def fit(self, X, y): 107 self.root = self._build_tree(X, y, depth=0) 108 109 def _build_tree(self, X, y, depth): 110 # Stopping criteria: max depth reached, too few samples, or pure node. 111 if depth >= self.max_depth or len(y) < self.min_samples_split or np.unique(y).size == 1: 112 majority_class = np.bincount(y.astype(int)).argmax() 113 return Node(is_leaf=True, prediction=majority_class) 114 115 best_params = find_best_reflection(X, y) 116 if best_params is None: 117 majority_class = np.bincount(y.astype(int)).argmax() 118 return Node(is_leaf=True, prediction=majority_class) 119 120 H = best_params['H'] 121 d1 = best_params['d1'] 122 axis = best_params['axis'] 123 threshold = best_params['threshold'] 124 left_idx = best_params['left_idx'] 125 right_idx = best_params['right_idx'] 126 score = best_params['score'] 127 128 # If the split does not actually separate the data, create a leaf. 129 if np.sum(left_idx) == 0 or np.sum(right_idx) == 0: 130 majority_class = np.bincount(y.astype(int)).argmax() 131 return Node(is_leaf=True, prediction=majority_class) 132 133 # Build left and right subtrees. 134 left_node = self._build_tree(X[left_idx], y[left_idx], depth + 1) 135 right_node = self._build_tree(X[right_idx], y[right_idx], depth + 1) 136 137 return Node(is_leaf=False, H=H, d1=d1, axis=axis, threshold=threshold, score=score, 138 left=left_node, right=right_node) 139 140 def _predict_sample(self, x, node): 141 if node.is_leaf: 142 return node.prediction 143 # Reflect the sample using the node's reflection matrix. 144 x_reflected = x @ node.H 145 # Use the stored axis in the reflected space for the split. 146 value = x_reflected[node.axis] 147 if value <= node.threshold: 148 return self._predict_sample(x, node.left) 149 else: 150 return self._predict_sample(x, node.right) 151 152 def predict(self, X): 153 return np.array([self._predict_sample(x, self.root) for x in X]) 154 155 def count_leaves(node): 156 """Recursively count the number of leaf nodes in the tree.""" 157 if node.is_leaf: 158 return 1 159 return count_leaves(node.left) + count_leaves(node.right) 160 161 import numpy as np 162 import pandas as pd 163 from sklearn.model_selection import KFold 164 from sklearn.metrics import accuracy_score 165 166 df = pd.read_csv('../../datasets/dim.csv') 167 y = df[df.columns[-1]] 168 df = df.drop(df.columns[-1], axis=1) 169 X = df.to_numpy(dtype=np.float64) 170 171 # Define hyperparameters 172 depths = range(1, 6) 173 n_splits = 5 174 n_trials = 10 175 seed = 15 176 177 # Store results: one list per depth collecting the mean accuracy and tree size from each iteration 178 results = {depth: [] for depth in depths} 179 tree_sizes = {depth: [] for depth in depths} 180 181 # For each iteration, generate the folds once and use them for each depth. 182 for trial in range(n_trials): 183 print("TRIAL: " + str(trial + 1)) 184 # Create KFold splits for this trial 185 kf = KFold(n_splits=n_splits, shuffle=True, random_state=(trial + seed)) 186 folds = list(kf.split(X)) 187 188 for depth in depths: 189 print("DEPTH: " + str(depth)) 190 fold_accuracies = [] 191 fold_sizes = [] 192 193 for train_index, test_index in folds: 194 X_train, X_test = X[train_index], X[test_index] 195 y_train, y_test = y[train_index], y[test_index] 196 197 # Create and train the classifier with the current depth 198 clf = HHCartClassifier(max_depth=depth, min_samples_split=2) 199 clf.fit(X_train, y_train) 200 preds = clf.predict(X_test) 201 202 # Compute and store the accuracy and tree size (number of leaf nodes) 203 fold_accuracies.append(accuracy_score(y_test, preds)) 204 fold_sizes.append(count_leaves(clf.root)) 205 206 # Store the mean accuracy and mean tree size from the folds of this trial 207 results[depth].append(np.mean(fold_accuracies)) 208 tree_sizes[depth].append(np.mean(fold_sizes)) 209 210 # Write the final averaged results over all iterations to file 211 with open("resultsDim.txt", "w") as f: 212 f.write("Grid Search Results:\n") 213 for depth in depths: 214 avg_accuracy = np.mean(results[depth]) * 100 # percent 215 std_accuracy = np.std(results[depth]) * 100 # percent 216 avg_size = np.mean(tree_sizes[depth]) 217 std_size = np.std(tree_sizes[depth]) 218 f.write(f"Depth: {depth} Avg Accuracy: {avg_accuracy:.1f} (Std: {std_accuracy:.1f}), ") 219 f.write(f"Avg # of Leaves: {avg_size:.1f} (Std: {std_size:.1f})\n")