hhcart-housing.py (8773B)
1 import numpy as np 2 import pandas as pd 3 from sklearn.model_selection import KFold 4 5 def compute_householder_matrix(d1): 6 p = d1.shape[0] 7 e1 = np.zeros_like(d1) 8 e1[0] = 1.0 9 u = e1 - d1 10 u = u / np.linalg.norm(u) 11 H = np.eye(p) - 2 * np.outer(u, u) 12 return H 13 14 def reflect_dataset(X, d1): 15 H = compute_householder_matrix(d1) 16 X_reflected = X @ H 17 return X_reflected, H 18 19 def compute_twoing_score(y_left, y_right): 20 # Avoid degenerate splits. 21 if len(y_left) == 0 or len(y_right) == 0: 22 return 0 23 24 total = len(y_left) + len(y_right) 25 P_L = len(y_left) / total 26 P_R = len(y_right) / total 27 28 classes = np.unique(np.concatenate([y_left, y_right])) 29 sum_diff = 0 30 for cls in classes: 31 P_Li = np.sum(y_left == cls) / len(y_left) 32 P_Ri = np.sum(y_right == cls) / len(y_right) 33 sum_diff += np.abs(P_Li - P_Ri) 34 35 return P_L * P_R * (sum_diff ** 2) 36 37 def find_best_reflection(X, y): 38 """ 39 For each class in y, compute the dominant eigenvector (largest eigenvalue eigenvector of the class covariance), 40 reflect the dataset, and then for every axis in the reflected feature space try all axis-aligned splits using 41 the unique sample values (without interpolation) to evaluate the twoing score. 42 Returns the parameters of the best split found. 43 """ 44 best_score = -np.inf 45 best_params = None 46 47 for cls in np.unique(y): 48 class_indices = np.where(y == cls)[0] 49 if len(class_indices) < 2: 50 continue 51 class_data = X[class_indices] 52 cov = np.cov(class_data, rowvar=False) 53 eigvals, eigvecs = np.linalg.eigh(cov) 54 # Choose the dominant eigenvector (largest eigenvalue) 55 d1 = eigvecs[:, -1] 56 57 X_reflected, H = reflect_dataset(X, d1) 58 p = X_reflected.shape[1] 59 # For each axis in the reflected space, check axis-aligned splits. 60 for j in range(p): 61 values = X_reflected[:, j] 62 unique_vals = np.unique(values) 63 if len(unique_vals) < 2: 64 continue 65 # Use each unique value directly as a candidate threshold. 66 for threshold in unique_vals: 67 left_idx = values <= threshold 68 right_idx = ~left_idx 69 if np.sum(left_idx) == 0 or np.sum(right_idx) == 0: 70 continue 71 score = compute_twoing_score(y[left_idx], y[right_idx]) 72 if score > best_score: 73 best_score = score 74 best_params = { 75 'H': H, 76 'd1': d1, 77 'axis': j, 78 'threshold': threshold, 79 'left_idx': left_idx, 80 'right_idx': right_idx, 81 'score': score, 82 'class_used': cls, 83 'X_reflected': X_reflected 84 } 85 return best_params 86 87 class Node: 88 def __init__(self, is_leaf=False, prediction=None, H=None, d1=None, axis=None, 89 threshold=None, score=None, left=None, right=None): 90 self.is_leaf = is_leaf 91 self.prediction = prediction # Majority class for leaf nodes. 92 self.H = H # Reflection matrix used at the node. 93 self.d1 = d1 # Dominant eigenvector used. 94 self.axis = axis # Axis in the reflected space used for splitting. 95 self.threshold = threshold # Threshold for the axis-aligned split. 96 self.score = score # Twoing score for the chosen split. 97 self.left = left # Left child node. 98 self.right = right # Right child node. 99 100 class HHCartClassifier: 101 def __init__(self, max_depth=4, min_samples_split=2): 102 self.max_depth = max_depth 103 self.min_samples_split = min_samples_split 104 self.root = None 105 106 def fit(self, X, y): 107 self.root = self._build_tree(X, y, depth=0) 108 109 def _build_tree(self, X, y, depth): 110 # Stopping criteria: max depth reached, too few samples, or pure node. 111 if depth >= self.max_depth or len(y) < self.min_samples_split or np.unique(y).size == 1: 112 majority_class = np.bincount(y.astype(int)).argmax() 113 return Node(is_leaf=True, prediction=majority_class) 114 115 best_params = find_best_reflection(X, y) 116 if best_params is None: 117 majority_class = np.bincount(y.astype(int)).argmax() 118 return Node(is_leaf=True, prediction=majority_class) 119 120 H = best_params['H'] 121 d1 = best_params['d1'] 122 axis = best_params['axis'] 123 threshold = best_params['threshold'] 124 left_idx = best_params['left_idx'] 125 right_idx = best_params['right_idx'] 126 score = best_params['score'] 127 128 # If the split does not actually separate the data, create a leaf. 129 if np.sum(left_idx) == 0 or np.sum(right_idx) == 0: 130 majority_class = np.bincount(y.astype(int)).argmax() 131 return Node(is_leaf=True, prediction=majority_class) 132 133 # Build left and right subtrees. 134 left_node = self._build_tree(X[left_idx], y[left_idx], depth + 1) 135 right_node = self._build_tree(X[right_idx], y[right_idx], depth + 1) 136 137 return Node(is_leaf=False, H=H, d1=d1, axis=axis, threshold=threshold, score=score, 138 left=left_node, right=right_node) 139 140 def _predict_sample(self, x, node): 141 if node.is_leaf: 142 return node.prediction 143 # Reflect the sample using the node's reflection matrix. 144 x_reflected = x @ node.H 145 # Use the stored axis in the reflected space for the split. 146 value = x_reflected[node.axis] 147 if value <= node.threshold: 148 return self._predict_sample(x, node.left) 149 else: 150 return self._predict_sample(x, node.right) 151 152 def predict(self, X): 153 return np.array([self._predict_sample(x, self.root) for x in X]) 154 155 def count_leaves(node): 156 """Recursively count the number of leaf nodes in the tree.""" 157 if node.is_leaf: 158 return 1 159 return count_leaves(node.left) + count_leaves(node.right) 160 161 import numpy as np 162 import pandas as pd 163 from sklearn.model_selection import KFold 164 from sklearn.metrics import accuracy_score 165 166 # Load data 167 df = pd.read_csv('../../datasets/housing.csv') 168 y = df['MEDV'] 169 df = df.drop(df.columns[-1], axis=1) 170 X = df.to_numpy(dtype=np.float64) 171 172 173 y = y < 21 174 y = 2 - y 175 176 # Define hyperparameters 177 depths = range(1, 6) 178 n_splits = 5 179 n_trials = 10 180 seed = 15 181 182 # Store results: one list per depth collecting the mean accuracy and tree size from each iteration 183 results = {depth: [] for depth in depths} 184 tree_sizes = {depth: [] for depth in depths} 185 186 # For each iteration, generate the folds once and use them for each depth. 187 for trial in range(n_trials): 188 print("TRIAL: " + str(trial + 1)) 189 # Create KFold splits for this trial 190 kf = KFold(n_splits=n_splits, shuffle=True, random_state=(trial + seed)) 191 folds = list(kf.split(X)) 192 193 for depth in depths: 194 print("DEPTH: " + str(depth)) 195 fold_accuracies = [] 196 fold_sizes = [] 197 198 for train_index, test_index in folds: 199 X_train, X_test = X[train_index], X[test_index] 200 y_train, y_test = y[train_index], y[test_index] 201 202 # mean imputation 203 204 means = np.nanmean(X_train, axis=0) 205 206 X_train = np.where(np.isnan(X_train), means, X_train) 207 X_test = np.where(np.isnan(X_test), means, X_test) 208 209 clf = HHCartClassifier(max_depth=depth, min_samples_split=2) 210 clf.fit(X_train, y_train) 211 preds = clf.predict(X_test) 212 213 # Compute and store the accuracy and tree size (number of leaf nodes) 214 fold_accuracies.append(accuracy_score(y_test, preds)) 215 fold_sizes.append(count_leaves(clf.root)) 216 217 # Store the mean accuracy and mean tree size from the folds of this trial 218 results[depth].append(np.mean(fold_accuracies)) 219 tree_sizes[depth].append(np.mean(fold_sizes)) 220 221 # Write the final averaged results over all iterations to file 222 with open("resultsHousing.txt", "w") as f: 223 f.write("Grid Search Results:\n") 224 for depth in depths: 225 avg_accuracy = np.mean(results[depth]) * 100 # percent 226 std_accuracy = np.std(results[depth]) * 100 # percent 227 avg_size = np.mean(tree_sizes[depth]) 228 std_size = np.std(tree_sizes[depth]) 229 f.write(f"Depth: {depth} Avg Accuracy: {avg_accuracy:.1f} (Std: {std_accuracy:.1f}), ") 230 f.write(f"Avg # of Leaves: {avg_size:.1f} (Std: {std_size:.1f})\n")