hhcart-housing.py - cart-elc - Source code for CART-ELC

hhcart-housing.py (8773B)
      1 import numpy as np
      2 import pandas as pd
      3 from sklearn.model_selection import KFold
      4 
      5 def compute_householder_matrix(d1):
      6     p = d1.shape[0]
      7     e1 = np.zeros_like(d1)
      8     e1[0] = 1.0
      9     u = e1 - d1
     10     u = u / np.linalg.norm(u)
     11     H = np.eye(p) - 2 * np.outer(u, u)
     12     return H
     13 
     14 def reflect_dataset(X, d1):
     15     H = compute_householder_matrix(d1)
     16     X_reflected = X @ H
     17     return X_reflected, H
     18 
     19 def compute_twoing_score(y_left, y_right):
     20     # Avoid degenerate splits.
     21     if len(y_left) == 0 or len(y_right) == 0:
     22         return 0
     23 
     24     total = len(y_left) + len(y_right)
     25     P_L = len(y_left) / total
     26     P_R = len(y_right) / total
     27 
     28     classes = np.unique(np.concatenate([y_left, y_right]))
     29     sum_diff = 0
     30     for cls in classes:
     31         P_Li = np.sum(y_left == cls) / len(y_left)
     32         P_Ri = np.sum(y_right == cls) / len(y_right)
     33         sum_diff += np.abs(P_Li - P_Ri)
     34 
     35     return P_L * P_R * (sum_diff ** 2)
     36 
     37 def find_best_reflection(X, y):
     38     """
     39     For each class in y, compute the dominant eigenvector (largest eigenvalue eigenvector of the class covariance),
     40     reflect the dataset, and then for every axis in the reflected feature space try all axis-aligned splits using
     41     the unique sample values (without interpolation) to evaluate the twoing score.
     42     Returns the parameters of the best split found.
     43     """
     44     best_score = -np.inf
     45     best_params = None
     46 
     47     for cls in np.unique(y):
     48         class_indices = np.where(y == cls)[0]
     49         if len(class_indices) < 2:
     50             continue
     51         class_data = X[class_indices]
     52         cov = np.cov(class_data, rowvar=False)
     53         eigvals, eigvecs = np.linalg.eigh(cov)
     54         # Choose the dominant eigenvector (largest eigenvalue)
     55         d1 = eigvecs[:, -1]
     56 
     57         X_reflected, H = reflect_dataset(X, d1)
     58         p = X_reflected.shape[1]
     59         # For each axis in the reflected space, check axis-aligned splits.
     60         for j in range(p):
     61             values = X_reflected[:, j]
     62             unique_vals = np.unique(values)
     63             if len(unique_vals) < 2:
     64                 continue
     65             # Use each unique value directly as a candidate threshold.
     66             for threshold in unique_vals:
     67                 left_idx = values <= threshold
     68                 right_idx = ~left_idx
     69                 if np.sum(left_idx) == 0 or np.sum(right_idx) == 0:
     70                     continue
     71                 score = compute_twoing_score(y[left_idx], y[right_idx])
     72                 if score > best_score:
     73                     best_score = score
     74                     best_params = {
     75                         'H': H,
     76                         'd1': d1,
     77                         'axis': j,
     78                         'threshold': threshold,
     79                         'left_idx': left_idx,
     80                         'right_idx': right_idx,
     81                         'score': score,
     82                         'class_used': cls,
     83                         'X_reflected': X_reflected
     84                     }
     85     return best_params
     86 
     87 class Node:
     88     def __init__(self, is_leaf=False, prediction=None, H=None, d1=None, axis=None,
     89                  threshold=None, score=None, left=None, right=None):
     90         self.is_leaf = is_leaf
     91         self.prediction = prediction      # Majority class for leaf nodes.
     92         self.H = H                        # Reflection matrix used at the node.
     93         self.d1 = d1                      # Dominant eigenvector used.
     94         self.axis = axis                  # Axis in the reflected space used for splitting.
     95         self.threshold = threshold        # Threshold for the axis-aligned split.
     96         self.score = score                # Twoing score for the chosen split.
     97         self.left = left                  # Left child node.
     98         self.right = right                # Right child node.
     99 
    100 class HHCartClassifier:
    101     def __init__(self, max_depth=4, min_samples_split=2):
    102         self.max_depth = max_depth
    103         self.min_samples_split = min_samples_split
    104         self.root = None
    105 
    106     def fit(self, X, y):
    107         self.root = self._build_tree(X, y, depth=0)
    108         
    109     def _build_tree(self, X, y, depth):
    110         # Stopping criteria: max depth reached, too few samples, or pure node.
    111         if depth >= self.max_depth or len(y) < self.min_samples_split or np.unique(y).size == 1:
    112             majority_class = np.bincount(y.astype(int)).argmax()
    113             return Node(is_leaf=True, prediction=majority_class)
    114         
    115         best_params = find_best_reflection(X, y)
    116         if best_params is None:
    117             majority_class = np.bincount(y.astype(int)).argmax()
    118             return Node(is_leaf=True, prediction=majority_class)
    119 
    120         H = best_params['H']
    121         d1 = best_params['d1']
    122         axis = best_params['axis']
    123         threshold = best_params['threshold']
    124         left_idx = best_params['left_idx']
    125         right_idx = best_params['right_idx']
    126         score = best_params['score']
    127 
    128         # If the split does not actually separate the data, create a leaf.
    129         if np.sum(left_idx) == 0 or np.sum(right_idx) == 0:
    130             majority_class = np.bincount(y.astype(int)).argmax()
    131             return Node(is_leaf=True, prediction=majority_class)
    132         
    133         # Build left and right subtrees.
    134         left_node = self._build_tree(X[left_idx], y[left_idx], depth + 1)
    135         right_node = self._build_tree(X[right_idx], y[right_idx], depth + 1)
    136         
    137         return Node(is_leaf=False, H=H, d1=d1, axis=axis, threshold=threshold, score=score,
    138                     left=left_node, right=right_node)
    139 
    140     def _predict_sample(self, x, node):
    141         if node.is_leaf:
    142             return node.prediction
    143         # Reflect the sample using the node's reflection matrix.
    144         x_reflected = x @ node.H
    145         # Use the stored axis in the reflected space for the split.
    146         value = x_reflected[node.axis]
    147         if value <= node.threshold:
    148             return self._predict_sample(x, node.left)
    149         else:
    150             return self._predict_sample(x, node.right)
    151 
    152     def predict(self, X):
    153         return np.array([self._predict_sample(x, self.root) for x in X])
    154 
    155 def count_leaves(node):
    156     """Recursively count the number of leaf nodes in the tree."""
    157     if node.is_leaf:
    158         return 1
    159     return count_leaves(node.left) + count_leaves(node.right)
    160 
    161 import numpy as np
    162 import pandas as pd
    163 from sklearn.model_selection import KFold
    164 from sklearn.metrics import accuracy_score
    165 
    166 # Load data
    167 df = pd.read_csv('../../datasets/housing.csv')
    168 y = df['MEDV']
    169 df = df.drop(df.columns[-1], axis=1)
    170 X = df.to_numpy(dtype=np.float64)
    171 
    172 
    173 y = y < 21
    174 y = 2 - y 
    175 
    176 # Define hyperparameters
    177 depths = range(1, 6)
    178 n_splits = 5
    179 n_trials = 10
    180 seed = 15
    181 
    182 # Store results: one list per depth collecting the mean accuracy and tree size from each iteration
    183 results = {depth: [] for depth in depths}
    184 tree_sizes = {depth: [] for depth in depths}
    185 
    186 # For each iteration, generate the folds once and use them for each depth.
    187 for trial in range(n_trials):
    188     print("TRIAL: " + str(trial + 1))
    189     # Create KFold splits for this trial
    190     kf = KFold(n_splits=n_splits, shuffle=True, random_state=(trial + seed))
    191     folds = list(kf.split(X))
    192     
    193     for depth in depths:
    194         print("DEPTH: " + str(depth))
    195         fold_accuracies = []
    196         fold_sizes = []
    197         
    198         for train_index, test_index in folds:
    199             X_train, X_test = X[train_index], X[test_index]
    200             y_train, y_test = y[train_index], y[test_index]
    201 
    202             # mean imputation
    203             
    204             means = np.nanmean(X_train, axis=0)
    205 
    206             X_train = np.where(np.isnan(X_train), means, X_train)
    207             X_test = np.where(np.isnan(X_test), means, X_test)
    208             
    209             clf = HHCartClassifier(max_depth=depth, min_samples_split=2)
    210             clf.fit(X_train, y_train)
    211             preds = clf.predict(X_test)
    212             
    213             # Compute and store the accuracy and tree size (number of leaf nodes)
    214             fold_accuracies.append(accuracy_score(y_test, preds))
    215             fold_sizes.append(count_leaves(clf.root))
    216         
    217         # Store the mean accuracy and mean tree size from the folds of this trial
    218         results[depth].append(np.mean(fold_accuracies))
    219         tree_sizes[depth].append(np.mean(fold_sizes))
    220 
    221 # Write the final averaged results over all iterations to file
    222 with open("resultsHousing.txt", "w") as f:
    223     f.write("Grid Search Results:\n")
    224     for depth in depths:
    225         avg_accuracy = np.mean(results[depth]) * 100 # percent
    226         std_accuracy = np.std(results[depth]) * 100 # percent
    227         avg_size = np.mean(tree_sizes[depth])
    228         std_size = np.std(tree_sizes[depth])
    229         f.write(f"Depth: {depth} Avg Accuracy: {avg_accuracy:.1f} (Std: {std_accuracy:.1f}), ")
    230         f.write(f"Avg # of Leaves: {avg_size:.1f} (Std: {std_size:.1f})\n")
	cart-elc Source code for CART-ELC
	git clone git://git.laack.co/cart-elc.git
	Log \| Files \| Refs \| README \| LICENSE