cart-elc

Source code for CART-ELC
git clone git://git.laack.co/cart-elc.git
Log | Files | Refs | README | LICENSE

hhcart-iris.py (8883B)


      1 import numpy as np
      2 import pandas as pd
      3 from sklearn.model_selection import KFold
      4 
      5 def compute_householder_matrix(d):
      6     p = d.shape[0]
      7     e1 = np.zeros_like(d)
      8     e1[0] = 1.0
      9     u = e1 - d
     10     u = u / np.linalg.norm(u)
     11     H = np.eye(p) - 2 * np.outer(u, u)
     12     return H
     13 
     14 def reflect_dataset(X, d):
     15     H = compute_householder_matrix(d)
     16     X_reflected = X @ H
     17     return X_reflected, H
     18 
     19 def compute_twoing_score(y_left, y_right):
     20     # Avoid degenerate splits.
     21     if len(y_left) == 0 or len(y_right) == 0:
     22         return 0
     23 
     24     total = len(y_left) + len(y_right)
     25     P_L = len(y_left) / total
     26     P_R = len(y_right) / total
     27 
     28     classes = np.unique(np.concatenate([y_left, y_right]))
     29     sum_diff = 0
     30     for cls in classes:
     31         P_Li = np.sum(y_left == cls) / len(y_left)
     32         P_Ri = np.sum(y_right == cls) / len(y_right)
     33         sum_diff += np.abs(P_Li - P_Ri)
     34 
     35     return P_L * P_R * (sum_diff ** 2)
     36 
     37 def find_best_reflection(X, y):
     38     """
     39     For each class in y, compute all eigenvectors (from the class covariance),
     40     reflect the dataset for each eigenvector candidate, and then for every axis in 
     41     the reflected feature space try all axis-aligned splits using the unique sample 
     42     values (without interpolation) to evaluate the twoing score.
     43     Returns the parameters of the best split found.
     44     """
     45     best_score = -np.inf
     46     best_params = None
     47 
     48     for cls in np.unique(y):
     49         class_indices = np.where(y == cls)[0]
     50         if len(class_indices) < 2:
     51             continue
     52         class_data = X[class_indices]
     53         cov = np.cov(class_data, rowvar=False)
     54         eigvals, eigvecs = np.linalg.eigh(cov)
     55         
     56         # Evaluate all eigenvector candidates (each column in eigvecs)
     57         for i in range(eigvecs.shape[1]):
     58             d_candidate = eigvecs[:, i]
     59             X_reflected, H = reflect_dataset(X, d_candidate)
     60             p = X_reflected.shape[1]
     61             # For each axis in the reflected space, check axis-aligned splits.
     62             for j in range(p):
     63                 values = X_reflected[:, j]
     64                 unique_vals = np.unique(values)
     65                 if len(unique_vals) < 2:
     66                     continue
     67                 # Use each unique value directly as a candidate threshold.
     68                 for threshold in unique_vals:
     69                     left_idx = values <= threshold
     70                     right_idx = ~left_idx
     71                     if np.sum(left_idx) == 0 or np.sum(right_idx) == 0:
     72                         continue
     73                     score = compute_twoing_score(y[left_idx], y[right_idx])
     74                     if score > best_score:
     75                         best_score = score
     76                         best_params = {
     77                             'H': H,
     78                             'd_candidate': d_candidate,  # Store the eigenvector used.
     79                             'axis': j,
     80                             'threshold': threshold,
     81                             'left_idx': left_idx,
     82                             'right_idx': right_idx,
     83                             'score': score,
     84                             'class_used': cls,
     85                             'X_reflected': X_reflected
     86                         }
     87     return best_params
     88 
     89 class Node:
     90     def __init__(self, is_leaf=False, prediction=None, H=None, d_candidate=None, axis=None,
     91                  threshold=None, score=None, left=None, right=None):
     92         self.is_leaf = is_leaf
     93         self.prediction = prediction      # Majority class for leaf nodes.
     94         self.H = H                        # Reflection matrix used at the node.
     95         self.d_candidate = d_candidate    # Eigenvector used for reflection.
     96         self.axis = axis                  # Axis in the reflected space used for splitting.
     97         self.threshold = threshold        # Threshold for the axis-aligned split.
     98         self.score = score                # Twoing score for the chosen split.
     99         self.left = left                  # Left child node.
    100         self.right = right                # Right child node.
    101 
    102 class HHCartClassifier:
    103     def __init__(self, max_depth=4, min_samples_split=2):
    104         self.max_depth = max_depth
    105         self.min_samples_split = min_samples_split
    106         self.root = None
    107 
    108     def fit(self, X, y):
    109         self.root = self._build_tree(X, y, depth=0)
    110         
    111     def _build_tree(self, X, y, depth):
    112         # Stopping criteria: max depth reached, too few samples, or pure node.
    113         if depth >= self.max_depth or len(y) < self.min_samples_split or np.unique(y).size == 1:
    114             majority_class = np.bincount(y.astype(int)).argmax()
    115             return Node(is_leaf=True, prediction=majority_class)
    116         
    117         best_params = find_best_reflection(X, y)
    118         if best_params is None:
    119             majority_class = np.bincount(y.astype(int)).argmax()
    120             return Node(is_leaf=True, prediction=majority_class)
    121 
    122         H = best_params['H']
    123         d_candidate = best_params['d_candidate']
    124         axis = best_params['axis']
    125         threshold = best_params['threshold']
    126         left_idx = best_params['left_idx']
    127         right_idx = best_params['right_idx']
    128         score = best_params['score']
    129 
    130         # If the split does not actually separate the data, create a leaf.
    131         if np.sum(left_idx) == 0 or np.sum(right_idx) == 0:
    132             majority_class = np.bincount(y.astype(int)).argmax()
    133             return Node(is_leaf=True, prediction=majority_class)
    134         
    135         # Build left and right subtrees.
    136         left_node = self._build_tree(X[left_idx], y[left_idx], depth + 1)
    137         right_node = self._build_tree(X[right_idx], y[right_idx], depth + 1)
    138         
    139         return Node(is_leaf=False, H=H, d_candidate=d_candidate, axis=axis, threshold=threshold, score=score,
    140                     left=left_node, right=right_node)
    141 
    142     def _predict_sample(self, x, node):
    143         if node.is_leaf:
    144             return node.prediction
    145         # Reflect the sample using the node's reflection matrix.
    146         x_reflected = x @ node.H
    147         # Use the stored axis in the reflected space for the split.
    148         value = x_reflected[node.axis]
    149         if value <= node.threshold:
    150             return self._predict_sample(x, node.left)
    151         else:
    152             return self._predict_sample(x, node.right)
    153 
    154     def predict(self, X):
    155         return np.array([self._predict_sample(x, self.root) for x in X])
    156 
    157 def count_leaves(node):
    158     """Recursively count the number of leaf nodes in the tree."""
    159     if node.is_leaf:
    160         return 1
    161     return count_leaves(node.left) + count_leaves(node.right)
    162 
    163 
    164 import numpy as np
    165 import pandas as pd
    166 from sklearn.model_selection import KFold
    167 from sklearn.metrics import accuracy_score
    168 from sklearn.datasets import load_iris
    169 
    170 iris = load_iris()
    171 df = pd.DataFrame(iris.data, columns=iris.feature_names)
    172 y = iris.target
    173 X = df.to_numpy()
    174 
    175 # Define hyperparameters
    176 depths = range(1, 6)
    177 n_splits = 5
    178 n_trials = 10
    179 seed = 15
    180 
    181 # Store results: one list per depth collecting the mean accuracy and tree size from each iteration
    182 results = {depth: [] for depth in depths}
    183 tree_sizes = {depth: [] for depth in depths}
    184 
    185 # For each iteration, generate the folds once and use them for each depth.
    186 for trial in range(n_trials):
    187     print("TRIAL: " + str(trial + 1))
    188     # Create KFold splits for this trial
    189     kf = KFold(n_splits=n_splits, shuffle=True, random_state=(trial + seed))
    190     folds = list(kf.split(X))
    191     
    192     for depth in depths:
    193         print("DEPTH: " + str(depth))
    194         fold_accuracies = []
    195         fold_sizes = []
    196         
    197         for train_index, test_index in folds:
    198             X_train, X_test = X[train_index], X[test_index]
    199             y_train, y_test = y[train_index], y[test_index]
    200             
    201             # Create and train the classifier with the current depth
    202             clf = HHCartClassifier(max_depth=depth, min_samples_split=2)
    203             clf.fit(X_train, y_train)
    204             preds = clf.predict(X_test)
    205             
    206             # Compute and store the accuracy and tree size (number of leaf nodes)
    207             fold_accuracies.append(accuracy_score(y_test, preds))
    208             fold_sizes.append(count_leaves(clf.root))
    209         
    210         # Store the mean accuracy and mean tree size from the folds of this trial
    211         results[depth].append(np.mean(fold_accuracies))
    212         tree_sizes[depth].append(np.mean(fold_sizes))
    213 
    214 # Write the final averaged results over all iterations to file
    215 with open("resultsIris.txt", "w") as f:
    216     f.write("Grid Search Results:\n")
    217     for depth in depths:
    218         avg_accuracy = np.mean(results[depth]) * 100 # percent
    219         std_accuracy = np.std(results[depth]) * 100 # percent
    220         avg_size = np.mean(tree_sizes[depth])
    221         std_size = np.std(tree_sizes[depth])
    222         f.write(f"Depth: {depth} Avg Accuracy: {avg_accuracy:.1f} (Std: {std_accuracy:.1f}), ")
    223         f.write(f"Avg # of Leaves: {avg_size:.1f} (Std: {std_size:.1f})\n")