hhcart-iris.py - cart-elc - Source code for CART-ELC

hhcart-iris.py (8597B)
      1 import numpy as np
      2 import pandas as pd
      3 from sklearn.model_selection import KFold
      4 
      5 def compute_householder_matrix(d1):
      6     p = d1.shape[0]
      7     e1 = np.zeros_like(d1)
      8     e1[0] = 1.0
      9     u = e1 - d1
     10     u = u / np.linalg.norm(u)
     11     H = np.eye(p) - 2 * np.outer(u, u)
     12     return H
     13 
     14 def reflect_dataset(X, d1):
     15     H = compute_householder_matrix(d1)
     16     X_reflected = X @ H
     17     return X_reflected, H
     18 
     19 def compute_twoing_score(y_left, y_right):
     20     # Avoid degenerate splits.
     21     if len(y_left) == 0 or len(y_right) == 0:
     22         return 0
     23 
     24     total = len(y_left) + len(y_right)
     25     P_L = len(y_left) / total
     26     P_R = len(y_right) / total
     27 
     28     classes = np.unique(np.concatenate([y_left, y_right]))
     29     sum_diff = 0
     30     for cls in classes:
     31         P_Li = np.sum(y_left == cls) / len(y_left)
     32         P_Ri = np.sum(y_right == cls) / len(y_right)
     33         sum_diff += np.abs(P_Li - P_Ri)
     34 
     35     return P_L * P_R * (sum_diff ** 2)
     36 
     37 def find_best_reflection(X, y):
     38     """
     39     For each class in y, compute the dominant eigenvector (largest eigenvalue eigenvector of the class covariance),
     40     reflect the dataset, and then for every axis in the reflected feature space try all axis-aligned splits using
     41     the unique sample values (without interpolation) to evaluate the twoing score.
     42     Returns the parameters of the best split found.
     43     """
     44     best_score = -np.inf
     45     best_params = None
     46 
     47     for cls in np.unique(y):
     48         class_indices = np.where(y == cls)[0]
     49         if len(class_indices) < 2:
     50             continue
     51         class_data = X[class_indices]
     52         cov = np.cov(class_data, rowvar=False)
     53         eigvals, eigvecs = np.linalg.eigh(cov)
     54         # Choose the dominant eigenvector (largest eigenvalue)
     55         d1 = eigvecs[:, -1]
     56 
     57         X_reflected, H = reflect_dataset(X, d1)
     58         p = X_reflected.shape[1]
     59         # For each axis in the reflected space, check axis-aligned splits.
     60         for j in range(p):
     61             values = X_reflected[:, j]
     62             unique_vals = np.unique(values)
     63             if len(unique_vals) < 2:
     64                 continue
     65             # Use each unique value directly as a candidate threshold.
     66             for threshold in unique_vals:
     67                 left_idx = values <= threshold
     68                 right_idx = ~left_idx
     69                 if np.sum(left_idx) == 0 or np.sum(right_idx) == 0:
     70                     continue
     71                 score = compute_twoing_score(y[left_idx], y[right_idx])
     72                 if score > best_score:
     73                     best_score = score
     74                     best_params = {
     75                         'H': H,
     76                         'd1': d1,
     77                         'axis': j,
     78                         'threshold': threshold,
     79                         'left_idx': left_idx,
     80                         'right_idx': right_idx,
     81                         'score': score,
     82                         'class_used': cls,
     83                         'X_reflected': X_reflected
     84                     }
     85     return best_params
     86 
     87 class Node:
     88     def __init__(self, is_leaf=False, prediction=None, H=None, d1=None, axis=None,
     89                  threshold=None, score=None, left=None, right=None):
     90         self.is_leaf = is_leaf
     91         self.prediction = prediction      # Majority class for leaf nodes.
     92         self.H = H                        # Reflection matrix used at the node.
     93         self.d1 = d1                      # Dominant eigenvector used.
     94         self.axis = axis                  # Axis in the reflected space used for splitting.
     95         self.threshold = threshold        # Threshold for the axis-aligned split.
     96         self.score = score                # Twoing score for the chosen split.
     97         self.left = left                  # Left child node.
     98         self.right = right                # Right child node.
     99 
    100 class HHCartClassifier:
    101     def __init__(self, max_depth=4, min_samples_split=2):
    102         self.max_depth = max_depth
    103         self.min_samples_split = min_samples_split
    104         self.root = None
    105 
    106     def fit(self, X, y):
    107         self.root = self._build_tree(X, y, depth=0)
    108         
    109     def _build_tree(self, X, y, depth):
    110         # Stopping criteria: max depth reached, too few samples, or pure node.
    111         if depth >= self.max_depth or len(y) < self.min_samples_split or np.unique(y).size == 1:
    112             majority_class = np.bincount(y.astype(int)).argmax()
    113             return Node(is_leaf=True, prediction=majority_class)
    114         
    115         best_params = find_best_reflection(X, y)
    116         if best_params is None:
    117             majority_class = np.bincount(y.astype(int)).argmax()
    118             return Node(is_leaf=True, prediction=majority_class)
    119 
    120         H = best_params['H']
    121         d1 = best_params['d1']
    122         axis = best_params['axis']
    123         threshold = best_params['threshold']
    124         left_idx = best_params['left_idx']
    125         right_idx = best_params['right_idx']
    126         score = best_params['score']
    127 
    128         # If the split does not actually separate the data, create a leaf.
    129         if np.sum(left_idx) == 0 or np.sum(right_idx) == 0:
    130             majority_class = np.bincount(y.astype(int)).argmax()
    131             return Node(is_leaf=True, prediction=majority_class)
    132         
    133         # Build left and right subtrees.
    134         left_node = self._build_tree(X[left_idx], y[left_idx], depth + 1)
    135         right_node = self._build_tree(X[right_idx], y[right_idx], depth + 1)
    136         
    137         return Node(is_leaf=False, H=H, d1=d1, axis=axis, threshold=threshold, score=score,
    138                     left=left_node, right=right_node)
    139 
    140     def _predict_sample(self, x, node):
    141         if node.is_leaf:
    142             return node.prediction
    143         # Reflect the sample using the node's reflection matrix.
    144         x_reflected = x @ node.H
    145         # Use the stored axis in the reflected space for the split.
    146         value = x_reflected[node.axis]
    147         if value <= node.threshold:
    148             return self._predict_sample(x, node.left)
    149         else:
    150             return self._predict_sample(x, node.right)
    151 
    152     def predict(self, X):
    153         return np.array([self._predict_sample(x, self.root) for x in X])
    154 
    155 def count_leaves(node):
    156     """Recursively count the number of leaf nodes in the tree."""
    157     if node.is_leaf:
    158         return 1
    159     return count_leaves(node.left) + count_leaves(node.right)
    160 
    161 import numpy as np
    162 import pandas as pd
    163 from sklearn.model_selection import KFold
    164 from sklearn.metrics import accuracy_score
    165 from sklearn.datasets import load_iris
    166 
    167 iris = load_iris()
    168 df = pd.DataFrame(iris.data, columns=iris.feature_names)
    169 y = iris.target
    170 X = df.to_numpy()
    171 
    172 # Define hyperparameters
    173 depths = range(1, 6)
    174 n_splits = 5
    175 n_trials = 10
    176 seed = 15
    177 
    178 # Store results: one list per depth collecting the mean accuracy and tree size from each iteration
    179 results = {depth: [] for depth in depths}
    180 tree_sizes = {depth: [] for depth in depths}
    181 
    182 # For each iteration, generate the folds once and use them for each depth.
    183 for trial in range(n_trials):
    184     print("TRIAL: " + str(trial + 1))
    185     # Create KFold splits for this trial
    186     kf = KFold(n_splits=n_splits, shuffle=True, random_state=(trial + seed))
    187     folds = list(kf.split(X))
    188     
    189     for depth in depths:
    190         print("DEPTH: " + str(depth))
    191         fold_accuracies = []
    192         fold_sizes = []
    193         
    194         for train_index, test_index in folds:
    195             X_train, X_test = X[train_index], X[test_index]
    196             y_train, y_test = y[train_index], y[test_index]
    197             
    198             # Create and train the classifier with the current depth
    199             clf = HHCartClassifier(max_depth=depth, min_samples_split=2)
    200             clf.fit(X_train, y_train)
    201             preds = clf.predict(X_test)
    202             
    203             # Compute and store the accuracy and tree size (number of leaf nodes)
    204             fold_accuracies.append(accuracy_score(y_test, preds))
    205             fold_sizes.append(count_leaves(clf.root))
    206         
    207         # Store the mean accuracy and mean tree size from the folds of this trial
    208         results[depth].append(np.mean(fold_accuracies))
    209         tree_sizes[depth].append(np.mean(fold_sizes))
    210 
    211 # Write the final averaged results over all iterations to file
    212 with open("resultsIris.txt", "w") as f:
    213     f.write("Grid Search Results:\n")
    214     for depth in depths:
    215         avg_accuracy = np.mean(results[depth]) * 100 # percent
    216         std_accuracy = np.std(results[depth]) * 100 # percent
    217         avg_size = np.mean(tree_sizes[depth])
    218         std_size = np.std(tree_sizes[depth])
    219         f.write(f"Depth: {depth} Avg Accuracy: {avg_accuracy:.1f} (Std: {std_accuracy:.1f}), ")
    220         f.write(f"Avg # of Leaves: {avg_size:.1f} (Std: {std_size:.1f})\n")
	cart-elc Source code for CART-ELC
	git clone git://git.laack.co/cart-elc.git
	Log \| Files \| Refs \| README \| LICENSE