cart-elc

Source code for CART-ELC
git clone git://git.laack.co/cart-elc.git
Log | Files | Refs | README | LICENSE

hhcart-cancer.py (8693B)


      1 import numpy as np
      2 import pandas as pd
      3 from sklearn.model_selection import KFold
      4 
      5 def compute_householder_matrix(d1):
      6     p = d1.shape[0]
      7     e1 = np.zeros_like(d1)
      8     e1[0] = 1.0
      9     u = e1 - d1
     10     u = u / np.linalg.norm(u)
     11     H = np.eye(p) - 2 * np.outer(u, u)
     12     return H
     13 
     14 def reflect_dataset(X, d1):
     15     H = compute_householder_matrix(d1)
     16     X_reflected = X @ H
     17     return X_reflected, H
     18 
     19 def compute_twoing_score(y_left, y_right):
     20     # Avoid degenerate splits.
     21     if len(y_left) == 0 or len(y_right) == 0:
     22         return 0
     23 
     24     total = len(y_left) + len(y_right)
     25     P_L = len(y_left) / total
     26     P_R = len(y_right) / total
     27 
     28     classes = np.unique(np.concatenate([y_left, y_right]))
     29     sum_diff = 0
     30     for cls in classes:
     31         P_Li = np.sum(y_left == cls) / len(y_left)
     32         P_Ri = np.sum(y_right == cls) / len(y_right)
     33         sum_diff += np.abs(P_Li - P_Ri)
     34 
     35     return P_L * P_R * (sum_diff ** 2)
     36 
     37 def find_best_reflection(X, y):
     38     """
     39     For each class in y, compute the dominant eigenvector (largest eigenvalue eigenvector of the class covariance),
     40     reflect the dataset, and then for every axis in the reflected feature space try all axis-aligned splits using
     41     the unique sample values (without interpolation) to evaluate the twoing score.
     42     Returns the parameters of the best split found.
     43     """
     44     best_score = -np.inf
     45     best_params = None
     46 
     47     for cls in np.unique(y):
     48         class_indices = np.where(y == cls)[0]
     49         if len(class_indices) < 2:
     50             continue
     51         class_data = X[class_indices]
     52         cov = np.cov(class_data, rowvar=False)
     53         eigvals, eigvecs = np.linalg.eigh(cov)
     54         # Choose the dominant eigenvector (largest eigenvalue)
     55         d1 = eigvecs[:, -1]
     56 
     57         X_reflected, H = reflect_dataset(X, d1)
     58         p = X_reflected.shape[1]
     59         # For each axis in the reflected space, check axis-aligned splits.
     60         for j in range(p):
     61             values = X_reflected[:, j]
     62             unique_vals = np.unique(values)
     63             if len(unique_vals) < 2:
     64                 continue
     65             # Use each unique value directly as a candidate threshold.
     66             for threshold in unique_vals:
     67                 left_idx = values <= threshold
     68                 right_idx = ~left_idx
     69                 if np.sum(left_idx) == 0 or np.sum(right_idx) == 0:
     70                     continue
     71                 score = compute_twoing_score(y[left_idx], y[right_idx])
     72                 if score > best_score:
     73                     best_score = score
     74                     best_params = {
     75                         'H': H,
     76                         'd1': d1,
     77                         'axis': j,
     78                         'threshold': threshold,
     79                         'left_idx': left_idx,
     80                         'right_idx': right_idx,
     81                         'score': score,
     82                         'class_used': cls,
     83                         'X_reflected': X_reflected
     84                     }
     85     return best_params
     86 
     87 class Node:
     88     def __init__(self, is_leaf=False, prediction=None, H=None, d1=None, axis=None,
     89                  threshold=None, score=None, left=None, right=None):
     90         self.is_leaf = is_leaf
     91         self.prediction = prediction      # Majority class for leaf nodes.
     92         self.H = H                        # Reflection matrix used at the node.
     93         self.d1 = d1                      # Dominant eigenvector used.
     94         self.axis = axis                  # Axis in the reflected space used for splitting.
     95         self.threshold = threshold        # Threshold for the axis-aligned split.
     96         self.score = score                # Twoing score for the chosen split.
     97         self.left = left                  # Left child node.
     98         self.right = right                # Right child node.
     99 
    100 class HHCartClassifier:
    101     def __init__(self, max_depth=4, min_samples_split=2):
    102         self.max_depth = max_depth
    103         self.min_samples_split = min_samples_split
    104         self.root = None
    105 
    106     def fit(self, X, y):
    107         self.root = self._build_tree(X, y, depth=0)
    108         
    109     def _build_tree(self, X, y, depth):
    110         # Stopping criteria: max depth reached, too few samples, or pure node.
    111         if depth >= self.max_depth or len(y) < self.min_samples_split or np.unique(y).size == 1:
    112             majority_class = np.bincount(y.astype(int)).argmax()
    113             return Node(is_leaf=True, prediction=majority_class)
    114         
    115         best_params = find_best_reflection(X, y)
    116         if best_params is None:
    117             majority_class = np.bincount(y.astype(int)).argmax()
    118             return Node(is_leaf=True, prediction=majority_class)
    119 
    120         H = best_params['H']
    121         d1 = best_params['d1']
    122         axis = best_params['axis']
    123         threshold = best_params['threshold']
    124         left_idx = best_params['left_idx']
    125         right_idx = best_params['right_idx']
    126         score = best_params['score']
    127 
    128         # If the split does not actually separate the data, create a leaf.
    129         if np.sum(left_idx) == 0 or np.sum(right_idx) == 0:
    130             majority_class = np.bincount(y.astype(int)).argmax()
    131             return Node(is_leaf=True, prediction=majority_class)
    132         
    133         # Build left and right subtrees.
    134         left_node = self._build_tree(X[left_idx], y[left_idx], depth + 1)
    135         right_node = self._build_tree(X[right_idx], y[right_idx], depth + 1)
    136         
    137         return Node(is_leaf=False, H=H, d1=d1, axis=axis, threshold=threshold, score=score,
    138                     left=left_node, right=right_node)
    139 
    140     def _predict_sample(self, x, node):
    141         if node.is_leaf:
    142             return node.prediction
    143         # Reflect the sample using the node's reflection matrix.
    144         x_reflected = x @ node.H
    145         # Use the stored axis in the reflected space for the split.
    146         value = x_reflected[node.axis]
    147         if value <= node.threshold:
    148             return self._predict_sample(x, node.left)
    149         else:
    150             return self._predict_sample(x, node.right)
    151 
    152     def predict(self, X):
    153         return np.array([self._predict_sample(x, self.root) for x in X])
    154 
    155 def count_leaves(node):
    156     """Recursively count the number of leaf nodes in the tree."""
    157     if node.is_leaf:
    158         return 1
    159     return count_leaves(node.left) + count_leaves(node.right)
    160 
    161 import numpy as np
    162 import pandas as pd
    163 from sklearn.model_selection import KFold
    164 from sklearn.metrics import accuracy_score
    165 
    166 df = pd.read_csv('../../datasets/cancer.csv')
    167 df = df.drop(df.columns[0], axis=1)
    168 df = df.dropna()
    169 df = df[df[' nuc'] != '?']
    170 df = df.reset_index(drop=True)
    171 y = df[' cla']
    172 df = df.drop(df.columns[-1], axis=1)
    173 X = df.to_numpy(dtype=np.float64)
    174 
    175 # Define hyperparameters
    176 depths = range(1, 6)
    177 n_splits = 5
    178 n_trials = 10
    179 seed = 15
    180 
    181 # Store results: one list per depth collecting the mean accuracy and tree size from each iteration
    182 results = {depth: [] for depth in depths}
    183 tree_sizes = {depth: [] for depth in depths}
    184 
    185 # For each iteration, generate the folds once and use them for each depth.
    186 for trial in range(n_trials):
    187     print("TRIAL: " + str(trial + 1))
    188     # Create KFold splits for this trial
    189     kf = KFold(n_splits=n_splits, shuffle=True, random_state=(trial + seed))
    190     folds = list(kf.split(X))
    191     
    192     for depth in depths:
    193         print("DEPTH: " + str(depth))
    194         fold_accuracies = []
    195         fold_sizes = []
    196         
    197         for train_index, test_index in folds:
    198             X_train, X_test = X[train_index], X[test_index]
    199             y_train, y_test = y[train_index], y[test_index]
    200             
    201             # Create and train the classifier with the current depth
    202             clf = HHCartClassifier(max_depth=depth, min_samples_split=2)
    203             clf.fit(X_train, y_train)
    204             preds = clf.predict(X_test)
    205             
    206             # Compute and store the accuracy and tree size (number of leaf nodes)
    207             fold_accuracies.append(accuracy_score(y_test, preds))
    208             fold_sizes.append(count_leaves(clf.root))
    209         
    210         # Store the mean accuracy and mean tree size from the folds of this trial
    211         results[depth].append(np.mean(fold_accuracies))
    212         tree_sizes[depth].append(np.mean(fold_sizes))
    213 
    214 # Write the final averaged results over all iterations to file
    215 with open("resultsCancer.txt", "w") as f:
    216     f.write("Grid Search Results:\n")
    217     for depth in depths:
    218         avg_accuracy = np.mean(results[depth]) * 100 # percent
    219         std_accuracy = np.std(results[depth]) * 100 # percent
    220         avg_size = np.mean(tree_sizes[depth])
    221         std_size = np.std(tree_sizes[depth])
    222         f.write(f"Depth: {depth} Avg Accuracy: {avg_accuracy:.1f} (Std: {std_accuracy:.1f}), ")
    223         f.write(f"Avg # of Leaves: {avg_size:.1f} (Std: {std_size:.1f})\n")