hhcart-housing.py (9059B)
1 import numpy as np 2 import pandas as pd 3 from sklearn.model_selection import KFold 4 5 def compute_householder_matrix(d): 6 p = d.shape[0] 7 e1 = np.zeros_like(d) 8 e1[0] = 1.0 9 u = e1 - d 10 u = u / np.linalg.norm(u) 11 H = np.eye(p) - 2 * np.outer(u, u) 12 return H 13 14 def reflect_dataset(X, d): 15 H = compute_householder_matrix(d) 16 X_reflected = X @ H 17 return X_reflected, H 18 19 def compute_twoing_score(y_left, y_right): 20 # Avoid degenerate splits. 21 if len(y_left) == 0 or len(y_right) == 0: 22 return 0 23 24 total = len(y_left) + len(y_right) 25 P_L = len(y_left) / total 26 P_R = len(y_right) / total 27 28 classes = np.unique(np.concatenate([y_left, y_right])) 29 sum_diff = 0 30 for cls in classes: 31 P_Li = np.sum(y_left == cls) / len(y_left) 32 P_Ri = np.sum(y_right == cls) / len(y_right) 33 sum_diff += np.abs(P_Li - P_Ri) 34 35 return P_L * P_R * (sum_diff ** 2) 36 37 def find_best_reflection(X, y): 38 """ 39 For each class in y, compute all eigenvectors (from the class covariance), 40 reflect the dataset for each eigenvector candidate, and then for every axis in 41 the reflected feature space try all axis-aligned splits using the unique sample 42 values (without interpolation) to evaluate the twoing score. 43 Returns the parameters of the best split found. 44 """ 45 best_score = -np.inf 46 best_params = None 47 48 for cls in np.unique(y): 49 class_indices = np.where(y == cls)[0] 50 if len(class_indices) < 2: 51 continue 52 class_data = X[class_indices] 53 cov = np.cov(class_data, rowvar=False) 54 eigvals, eigvecs = np.linalg.eigh(cov) 55 56 # Evaluate all eigenvector candidates (each column in eigvecs) 57 for i in range(eigvecs.shape[1]): 58 d_candidate = eigvecs[:, i] 59 X_reflected, H = reflect_dataset(X, d_candidate) 60 p = X_reflected.shape[1] 61 # For each axis in the reflected space, check axis-aligned splits. 62 for j in range(p): 63 values = X_reflected[:, j] 64 unique_vals = np.unique(values) 65 if len(unique_vals) < 2: 66 continue 67 # Use each unique value directly as a candidate threshold. 68 for threshold in unique_vals: 69 left_idx = values <= threshold 70 right_idx = ~left_idx 71 if np.sum(left_idx) == 0 or np.sum(right_idx) == 0: 72 continue 73 score = compute_twoing_score(y[left_idx], y[right_idx]) 74 if score > best_score: 75 best_score = score 76 best_params = { 77 'H': H, 78 'd_candidate': d_candidate, # Store the eigenvector used. 79 'axis': j, 80 'threshold': threshold, 81 'left_idx': left_idx, 82 'right_idx': right_idx, 83 'score': score, 84 'class_used': cls, 85 'X_reflected': X_reflected 86 } 87 return best_params 88 89 class Node: 90 def __init__(self, is_leaf=False, prediction=None, H=None, d_candidate=None, axis=None, 91 threshold=None, score=None, left=None, right=None): 92 self.is_leaf = is_leaf 93 self.prediction = prediction # Majority class for leaf nodes. 94 self.H = H # Reflection matrix used at the node. 95 self.d_candidate = d_candidate # Eigenvector used for reflection. 96 self.axis = axis # Axis in the reflected space used for splitting. 97 self.threshold = threshold # Threshold for the axis-aligned split. 98 self.score = score # Twoing score for the chosen split. 99 self.left = left # Left child node. 100 self.right = right # Right child node. 101 102 class HHCartClassifier: 103 def __init__(self, max_depth=4, min_samples_split=2): 104 self.max_depth = max_depth 105 self.min_samples_split = min_samples_split 106 self.root = None 107 108 def fit(self, X, y): 109 self.root = self._build_tree(X, y, depth=0) 110 111 def _build_tree(self, X, y, depth): 112 # Stopping criteria: max depth reached, too few samples, or pure node. 113 if depth >= self.max_depth or len(y) < self.min_samples_split or np.unique(y).size == 1: 114 majority_class = np.bincount(y.astype(int)).argmax() 115 return Node(is_leaf=True, prediction=majority_class) 116 117 best_params = find_best_reflection(X, y) 118 if best_params is None: 119 majority_class = np.bincount(y.astype(int)).argmax() 120 return Node(is_leaf=True, prediction=majority_class) 121 122 H = best_params['H'] 123 d_candidate = best_params['d_candidate'] 124 axis = best_params['axis'] 125 threshold = best_params['threshold'] 126 left_idx = best_params['left_idx'] 127 right_idx = best_params['right_idx'] 128 score = best_params['score'] 129 130 # If the split does not actually separate the data, create a leaf. 131 if np.sum(left_idx) == 0 or np.sum(right_idx) == 0: 132 majority_class = np.bincount(y.astype(int)).argmax() 133 return Node(is_leaf=True, prediction=majority_class) 134 135 # Build left and right subtrees. 136 left_node = self._build_tree(X[left_idx], y[left_idx], depth + 1) 137 right_node = self._build_tree(X[right_idx], y[right_idx], depth + 1) 138 139 return Node(is_leaf=False, H=H, d_candidate=d_candidate, axis=axis, threshold=threshold, score=score, 140 left=left_node, right=right_node) 141 142 def _predict_sample(self, x, node): 143 if node.is_leaf: 144 return node.prediction 145 # Reflect the sample using the node's reflection matrix. 146 x_reflected = x @ node.H 147 # Use the stored axis in the reflected space for the split. 148 value = x_reflected[node.axis] 149 if value <= node.threshold: 150 return self._predict_sample(x, node.left) 151 else: 152 return self._predict_sample(x, node.right) 153 154 def predict(self, X): 155 return np.array([self._predict_sample(x, self.root) for x in X]) 156 157 def count_leaves(node): 158 """Recursively count the number of leaf nodes in the tree.""" 159 if node.is_leaf: 160 return 1 161 return count_leaves(node.left) + count_leaves(node.right) 162 163 164 import numpy as np 165 import pandas as pd 166 from sklearn.model_selection import KFold 167 from sklearn.metrics import accuracy_score 168 169 # Load data 170 df = pd.read_csv('../../datasets/housing.csv') 171 y = df['MEDV'] 172 df = df.drop(df.columns[-1], axis=1) 173 X = df.to_numpy(dtype=np.float64) 174 175 176 y = y < 21 177 y = 2 - y 178 179 # Define hyperparameters 180 depths = range(1, 6) 181 n_splits = 5 182 n_trials = 10 183 seed = 15 184 185 # Store results: one list per depth collecting the mean accuracy and tree size from each iteration 186 results = {depth: [] for depth in depths} 187 tree_sizes = {depth: [] for depth in depths} 188 189 # For each iteration, generate the folds once and use them for each depth. 190 for trial in range(n_trials): 191 print("TRIAL: " + str(trial + 1)) 192 # Create KFold splits for this trial 193 kf = KFold(n_splits=n_splits, shuffle=True, random_state=(trial + seed)) 194 folds = list(kf.split(X)) 195 196 for depth in depths: 197 print("DEPTH: " + str(depth)) 198 fold_accuracies = [] 199 fold_sizes = [] 200 201 for train_index, test_index in folds: 202 X_train, X_test = X[train_index], X[test_index] 203 y_train, y_test = y[train_index], y[test_index] 204 205 # mean imputation 206 207 means = np.nanmean(X_train, axis=0) 208 209 X_train = np.where(np.isnan(X_train), means, X_train) 210 X_test = np.where(np.isnan(X_test), means, X_test) 211 212 clf = HHCartClassifier(max_depth=depth, min_samples_split=2) 213 clf.fit(X_train, y_train) 214 preds = clf.predict(X_test) 215 216 # Compute and store the accuracy and tree size (number of leaf nodes) 217 fold_accuracies.append(accuracy_score(y_test, preds)) 218 fold_sizes.append(count_leaves(clf.root)) 219 220 # Store the mean accuracy and mean tree size from the folds of this trial 221 results[depth].append(np.mean(fold_accuracies)) 222 tree_sizes[depth].append(np.mean(fold_sizes)) 223 224 # Write the final averaged results over all iterations to file 225 with open("resultsHousing.txt", "w") as f: 226 f.write("Grid Search Results:\n") 227 for depth in depths: 228 avg_accuracy = np.mean(results[depth]) * 100 # percent 229 std_accuracy = np.std(results[depth]) * 100 # percent 230 avg_size = np.mean(tree_sizes[depth]) 231 std_size = np.std(tree_sizes[depth]) 232 f.write(f"Depth: {depth} Avg Accuracy: {avg_accuracy:.1f} (Std: {std_accuracy:.1f}), ") 233 f.write(f"Avg # of Leaves: {avg_size:.1f} (Std: {std_size:.1f})\n")