cart-elc

Source code for CART-ELC
git clone git://git.laack.co/cart-elc.git
Log | Files | Refs | README | LICENSE

hhcart-housing.py (9059B)


      1 import numpy as np
      2 import pandas as pd
      3 from sklearn.model_selection import KFold
      4 
      5 def compute_householder_matrix(d):
      6     p = d.shape[0]
      7     e1 = np.zeros_like(d)
      8     e1[0] = 1.0
      9     u = e1 - d
     10     u = u / np.linalg.norm(u)
     11     H = np.eye(p) - 2 * np.outer(u, u)
     12     return H
     13 
     14 def reflect_dataset(X, d):
     15     H = compute_householder_matrix(d)
     16     X_reflected = X @ H
     17     return X_reflected, H
     18 
     19 def compute_twoing_score(y_left, y_right):
     20     # Avoid degenerate splits.
     21     if len(y_left) == 0 or len(y_right) == 0:
     22         return 0
     23 
     24     total = len(y_left) + len(y_right)
     25     P_L = len(y_left) / total
     26     P_R = len(y_right) / total
     27 
     28     classes = np.unique(np.concatenate([y_left, y_right]))
     29     sum_diff = 0
     30     for cls in classes:
     31         P_Li = np.sum(y_left == cls) / len(y_left)
     32         P_Ri = np.sum(y_right == cls) / len(y_right)
     33         sum_diff += np.abs(P_Li - P_Ri)
     34 
     35     return P_L * P_R * (sum_diff ** 2)
     36 
     37 def find_best_reflection(X, y):
     38     """
     39     For each class in y, compute all eigenvectors (from the class covariance),
     40     reflect the dataset for each eigenvector candidate, and then for every axis in 
     41     the reflected feature space try all axis-aligned splits using the unique sample 
     42     values (without interpolation) to evaluate the twoing score.
     43     Returns the parameters of the best split found.
     44     """
     45     best_score = -np.inf
     46     best_params = None
     47 
     48     for cls in np.unique(y):
     49         class_indices = np.where(y == cls)[0]
     50         if len(class_indices) < 2:
     51             continue
     52         class_data = X[class_indices]
     53         cov = np.cov(class_data, rowvar=False)
     54         eigvals, eigvecs = np.linalg.eigh(cov)
     55         
     56         # Evaluate all eigenvector candidates (each column in eigvecs)
     57         for i in range(eigvecs.shape[1]):
     58             d_candidate = eigvecs[:, i]
     59             X_reflected, H = reflect_dataset(X, d_candidate)
     60             p = X_reflected.shape[1]
     61             # For each axis in the reflected space, check axis-aligned splits.
     62             for j in range(p):
     63                 values = X_reflected[:, j]
     64                 unique_vals = np.unique(values)
     65                 if len(unique_vals) < 2:
     66                     continue
     67                 # Use each unique value directly as a candidate threshold.
     68                 for threshold in unique_vals:
     69                     left_idx = values <= threshold
     70                     right_idx = ~left_idx
     71                     if np.sum(left_idx) == 0 or np.sum(right_idx) == 0:
     72                         continue
     73                     score = compute_twoing_score(y[left_idx], y[right_idx])
     74                     if score > best_score:
     75                         best_score = score
     76                         best_params = {
     77                             'H': H,
     78                             'd_candidate': d_candidate,  # Store the eigenvector used.
     79                             'axis': j,
     80                             'threshold': threshold,
     81                             'left_idx': left_idx,
     82                             'right_idx': right_idx,
     83                             'score': score,
     84                             'class_used': cls,
     85                             'X_reflected': X_reflected
     86                         }
     87     return best_params
     88 
     89 class Node:
     90     def __init__(self, is_leaf=False, prediction=None, H=None, d_candidate=None, axis=None,
     91                  threshold=None, score=None, left=None, right=None):
     92         self.is_leaf = is_leaf
     93         self.prediction = prediction      # Majority class for leaf nodes.
     94         self.H = H                        # Reflection matrix used at the node.
     95         self.d_candidate = d_candidate    # Eigenvector used for reflection.
     96         self.axis = axis                  # Axis in the reflected space used for splitting.
     97         self.threshold = threshold        # Threshold for the axis-aligned split.
     98         self.score = score                # Twoing score for the chosen split.
     99         self.left = left                  # Left child node.
    100         self.right = right                # Right child node.
    101 
    102 class HHCartClassifier:
    103     def __init__(self, max_depth=4, min_samples_split=2):
    104         self.max_depth = max_depth
    105         self.min_samples_split = min_samples_split
    106         self.root = None
    107 
    108     def fit(self, X, y):
    109         self.root = self._build_tree(X, y, depth=0)
    110         
    111     def _build_tree(self, X, y, depth):
    112         # Stopping criteria: max depth reached, too few samples, or pure node.
    113         if depth >= self.max_depth or len(y) < self.min_samples_split or np.unique(y).size == 1:
    114             majority_class = np.bincount(y.astype(int)).argmax()
    115             return Node(is_leaf=True, prediction=majority_class)
    116         
    117         best_params = find_best_reflection(X, y)
    118         if best_params is None:
    119             majority_class = np.bincount(y.astype(int)).argmax()
    120             return Node(is_leaf=True, prediction=majority_class)
    121 
    122         H = best_params['H']
    123         d_candidate = best_params['d_candidate']
    124         axis = best_params['axis']
    125         threshold = best_params['threshold']
    126         left_idx = best_params['left_idx']
    127         right_idx = best_params['right_idx']
    128         score = best_params['score']
    129 
    130         # If the split does not actually separate the data, create a leaf.
    131         if np.sum(left_idx) == 0 or np.sum(right_idx) == 0:
    132             majority_class = np.bincount(y.astype(int)).argmax()
    133             return Node(is_leaf=True, prediction=majority_class)
    134         
    135         # Build left and right subtrees.
    136         left_node = self._build_tree(X[left_idx], y[left_idx], depth + 1)
    137         right_node = self._build_tree(X[right_idx], y[right_idx], depth + 1)
    138         
    139         return Node(is_leaf=False, H=H, d_candidate=d_candidate, axis=axis, threshold=threshold, score=score,
    140                     left=left_node, right=right_node)
    141 
    142     def _predict_sample(self, x, node):
    143         if node.is_leaf:
    144             return node.prediction
    145         # Reflect the sample using the node's reflection matrix.
    146         x_reflected = x @ node.H
    147         # Use the stored axis in the reflected space for the split.
    148         value = x_reflected[node.axis]
    149         if value <= node.threshold:
    150             return self._predict_sample(x, node.left)
    151         else:
    152             return self._predict_sample(x, node.right)
    153 
    154     def predict(self, X):
    155         return np.array([self._predict_sample(x, self.root) for x in X])
    156 
    157 def count_leaves(node):
    158     """Recursively count the number of leaf nodes in the tree."""
    159     if node.is_leaf:
    160         return 1
    161     return count_leaves(node.left) + count_leaves(node.right)
    162 
    163 
    164 import numpy as np
    165 import pandas as pd
    166 from sklearn.model_selection import KFold
    167 from sklearn.metrics import accuracy_score
    168 
    169 # Load data
    170 df = pd.read_csv('../../datasets/housing.csv')
    171 y = df['MEDV']
    172 df = df.drop(df.columns[-1], axis=1)
    173 X = df.to_numpy(dtype=np.float64)
    174 
    175 
    176 y = y < 21
    177 y = 2 - y 
    178 
    179 # Define hyperparameters
    180 depths = range(1, 6)
    181 n_splits = 5
    182 n_trials = 10
    183 seed = 15
    184 
    185 # Store results: one list per depth collecting the mean accuracy and tree size from each iteration
    186 results = {depth: [] for depth in depths}
    187 tree_sizes = {depth: [] for depth in depths}
    188 
    189 # For each iteration, generate the folds once and use them for each depth.
    190 for trial in range(n_trials):
    191     print("TRIAL: " + str(trial + 1))
    192     # Create KFold splits for this trial
    193     kf = KFold(n_splits=n_splits, shuffle=True, random_state=(trial + seed))
    194     folds = list(kf.split(X))
    195     
    196     for depth in depths:
    197         print("DEPTH: " + str(depth))
    198         fold_accuracies = []
    199         fold_sizes = []
    200         
    201         for train_index, test_index in folds:
    202             X_train, X_test = X[train_index], X[test_index]
    203             y_train, y_test = y[train_index], y[test_index]
    204 
    205             # mean imputation
    206             
    207             means = np.nanmean(X_train, axis=0)
    208 
    209             X_train = np.where(np.isnan(X_train), means, X_train)
    210             X_test = np.where(np.isnan(X_test), means, X_test)
    211             
    212             clf = HHCartClassifier(max_depth=depth, min_samples_split=2)
    213             clf.fit(X_train, y_train)
    214             preds = clf.predict(X_test)
    215             
    216             # Compute and store the accuracy and tree size (number of leaf nodes)
    217             fold_accuracies.append(accuracy_score(y_test, preds))
    218             fold_sizes.append(count_leaves(clf.root))
    219         
    220         # Store the mean accuracy and mean tree size from the folds of this trial
    221         results[depth].append(np.mean(fold_accuracies))
    222         tree_sizes[depth].append(np.mean(fold_sizes))
    223 
    224 # Write the final averaged results over all iterations to file
    225 with open("resultsHousing.txt", "w") as f:
    226     f.write("Grid Search Results:\n")
    227     for depth in depths:
    228         avg_accuracy = np.mean(results[depth]) * 100 # percent
    229         std_accuracy = np.std(results[depth]) * 100 # percent
    230         avg_size = np.mean(tree_sizes[depth])
    231         std_size = np.std(tree_sizes[depth])
    232         f.write(f"Depth: {depth} Avg Accuracy: {avg_accuracy:.1f} (Std: {std_accuracy:.1f}), ")
    233         f.write(f"Avg # of Leaves: {avg_size:.1f} (Std: {std_size:.1f})\n")