hhcart-cancer.py (8693B)
1 import numpy as np 2 import pandas as pd 3 from sklearn.model_selection import KFold 4 5 def compute_householder_matrix(d1): 6 p = d1.shape[0] 7 e1 = np.zeros_like(d1) 8 e1[0] = 1.0 9 u = e1 - d1 10 u = u / np.linalg.norm(u) 11 H = np.eye(p) - 2 * np.outer(u, u) 12 return H 13 14 def reflect_dataset(X, d1): 15 H = compute_householder_matrix(d1) 16 X_reflected = X @ H 17 return X_reflected, H 18 19 def compute_twoing_score(y_left, y_right): 20 # Avoid degenerate splits. 21 if len(y_left) == 0 or len(y_right) == 0: 22 return 0 23 24 total = len(y_left) + len(y_right) 25 P_L = len(y_left) / total 26 P_R = len(y_right) / total 27 28 classes = np.unique(np.concatenate([y_left, y_right])) 29 sum_diff = 0 30 for cls in classes: 31 P_Li = np.sum(y_left == cls) / len(y_left) 32 P_Ri = np.sum(y_right == cls) / len(y_right) 33 sum_diff += np.abs(P_Li - P_Ri) 34 35 return P_L * P_R * (sum_diff ** 2) 36 37 def find_best_reflection(X, y): 38 """ 39 For each class in y, compute the dominant eigenvector (largest eigenvalue eigenvector of the class covariance), 40 reflect the dataset, and then for every axis in the reflected feature space try all axis-aligned splits using 41 the unique sample values (without interpolation) to evaluate the twoing score. 42 Returns the parameters of the best split found. 43 """ 44 best_score = -np.inf 45 best_params = None 46 47 for cls in np.unique(y): 48 class_indices = np.where(y == cls)[0] 49 if len(class_indices) < 2: 50 continue 51 class_data = X[class_indices] 52 cov = np.cov(class_data, rowvar=False) 53 eigvals, eigvecs = np.linalg.eigh(cov) 54 # Choose the dominant eigenvector (largest eigenvalue) 55 d1 = eigvecs[:, -1] 56 57 X_reflected, H = reflect_dataset(X, d1) 58 p = X_reflected.shape[1] 59 # For each axis in the reflected space, check axis-aligned splits. 60 for j in range(p): 61 values = X_reflected[:, j] 62 unique_vals = np.unique(values) 63 if len(unique_vals) < 2: 64 continue 65 # Use each unique value directly as a candidate threshold. 66 for threshold in unique_vals: 67 left_idx = values <= threshold 68 right_idx = ~left_idx 69 if np.sum(left_idx) == 0 or np.sum(right_idx) == 0: 70 continue 71 score = compute_twoing_score(y[left_idx], y[right_idx]) 72 if score > best_score: 73 best_score = score 74 best_params = { 75 'H': H, 76 'd1': d1, 77 'axis': j, 78 'threshold': threshold, 79 'left_idx': left_idx, 80 'right_idx': right_idx, 81 'score': score, 82 'class_used': cls, 83 'X_reflected': X_reflected 84 } 85 return best_params 86 87 class Node: 88 def __init__(self, is_leaf=False, prediction=None, H=None, d1=None, axis=None, 89 threshold=None, score=None, left=None, right=None): 90 self.is_leaf = is_leaf 91 self.prediction = prediction # Majority class for leaf nodes. 92 self.H = H # Reflection matrix used at the node. 93 self.d1 = d1 # Dominant eigenvector used. 94 self.axis = axis # Axis in the reflected space used for splitting. 95 self.threshold = threshold # Threshold for the axis-aligned split. 96 self.score = score # Twoing score for the chosen split. 97 self.left = left # Left child node. 98 self.right = right # Right child node. 99 100 class HHCartClassifier: 101 def __init__(self, max_depth=4, min_samples_split=2): 102 self.max_depth = max_depth 103 self.min_samples_split = min_samples_split 104 self.root = None 105 106 def fit(self, X, y): 107 self.root = self._build_tree(X, y, depth=0) 108 109 def _build_tree(self, X, y, depth): 110 # Stopping criteria: max depth reached, too few samples, or pure node. 111 if depth >= self.max_depth or len(y) < self.min_samples_split or np.unique(y).size == 1: 112 majority_class = np.bincount(y.astype(int)).argmax() 113 return Node(is_leaf=True, prediction=majority_class) 114 115 best_params = find_best_reflection(X, y) 116 if best_params is None: 117 majority_class = np.bincount(y.astype(int)).argmax() 118 return Node(is_leaf=True, prediction=majority_class) 119 120 H = best_params['H'] 121 d1 = best_params['d1'] 122 axis = best_params['axis'] 123 threshold = best_params['threshold'] 124 left_idx = best_params['left_idx'] 125 right_idx = best_params['right_idx'] 126 score = best_params['score'] 127 128 # If the split does not actually separate the data, create a leaf. 129 if np.sum(left_idx) == 0 or np.sum(right_idx) == 0: 130 majority_class = np.bincount(y.astype(int)).argmax() 131 return Node(is_leaf=True, prediction=majority_class) 132 133 # Build left and right subtrees. 134 left_node = self._build_tree(X[left_idx], y[left_idx], depth + 1) 135 right_node = self._build_tree(X[right_idx], y[right_idx], depth + 1) 136 137 return Node(is_leaf=False, H=H, d1=d1, axis=axis, threshold=threshold, score=score, 138 left=left_node, right=right_node) 139 140 def _predict_sample(self, x, node): 141 if node.is_leaf: 142 return node.prediction 143 # Reflect the sample using the node's reflection matrix. 144 x_reflected = x @ node.H 145 # Use the stored axis in the reflected space for the split. 146 value = x_reflected[node.axis] 147 if value <= node.threshold: 148 return self._predict_sample(x, node.left) 149 else: 150 return self._predict_sample(x, node.right) 151 152 def predict(self, X): 153 return np.array([self._predict_sample(x, self.root) for x in X]) 154 155 def count_leaves(node): 156 """Recursively count the number of leaf nodes in the tree.""" 157 if node.is_leaf: 158 return 1 159 return count_leaves(node.left) + count_leaves(node.right) 160 161 import numpy as np 162 import pandas as pd 163 from sklearn.model_selection import KFold 164 from sklearn.metrics import accuracy_score 165 166 df = pd.read_csv('../../datasets/cancer.csv') 167 df = df.drop(df.columns[0], axis=1) 168 df = df.dropna() 169 df = df[df[' nuc'] != '?'] 170 df = df.reset_index(drop=True) 171 y = df[' cla'] 172 df = df.drop(df.columns[-1], axis=1) 173 X = df.to_numpy(dtype=np.float64) 174 175 # Define hyperparameters 176 depths = range(1, 6) 177 n_splits = 5 178 n_trials = 10 179 seed = 15 180 181 # Store results: one list per depth collecting the mean accuracy and tree size from each iteration 182 results = {depth: [] for depth in depths} 183 tree_sizes = {depth: [] for depth in depths} 184 185 # For each iteration, generate the folds once and use them for each depth. 186 for trial in range(n_trials): 187 print("TRIAL: " + str(trial + 1)) 188 # Create KFold splits for this trial 189 kf = KFold(n_splits=n_splits, shuffle=True, random_state=(trial + seed)) 190 folds = list(kf.split(X)) 191 192 for depth in depths: 193 print("DEPTH: " + str(depth)) 194 fold_accuracies = [] 195 fold_sizes = [] 196 197 for train_index, test_index in folds: 198 X_train, X_test = X[train_index], X[test_index] 199 y_train, y_test = y[train_index], y[test_index] 200 201 # Create and train the classifier with the current depth 202 clf = HHCartClassifier(max_depth=depth, min_samples_split=2) 203 clf.fit(X_train, y_train) 204 preds = clf.predict(X_test) 205 206 # Compute and store the accuracy and tree size (number of leaf nodes) 207 fold_accuracies.append(accuracy_score(y_test, preds)) 208 fold_sizes.append(count_leaves(clf.root)) 209 210 # Store the mean accuracy and mean tree size from the folds of this trial 211 results[depth].append(np.mean(fold_accuracies)) 212 tree_sizes[depth].append(np.mean(fold_sizes)) 213 214 # Write the final averaged results over all iterations to file 215 with open("resultsCancer.txt", "w") as f: 216 f.write("Grid Search Results:\n") 217 for depth in depths: 218 avg_accuracy = np.mean(results[depth]) * 100 # percent 219 std_accuracy = np.std(results[depth]) * 100 # percent 220 avg_size = np.mean(tree_sizes[depth]) 221 std_size = np.std(tree_sizes[depth]) 222 f.write(f"Depth: {depth} Avg Accuracy: {avg_accuracy:.1f} (Std: {std_accuracy:.1f}), ") 223 f.write(f"Avg # of Leaves: {avg_size:.1f} (Std: {std_size:.1f})\n")