hhcart-dim.py - cart-elc - Source code for CART-ELC

hhcart-dim.py (8584B)
      1 import numpy as np
      2 import pandas as pd
      3 from sklearn.model_selection import KFold
      4 
      5 def compute_householder_matrix(d1):
      6     p = d1.shape[0]
      7     e1 = np.zeros_like(d1)
      8     e1[0] = 1.0
      9     u = e1 - d1
     10     u = u / np.linalg.norm(u)
     11     H = np.eye(p) - 2 * np.outer(u, u)
     12     return H
     13 
     14 def reflect_dataset(X, d1):
     15     H = compute_householder_matrix(d1)
     16     X_reflected = X @ H
     17     return X_reflected, H
     18 
     19 def compute_twoing_score(y_left, y_right):
     20     # Avoid degenerate splits.
     21     if len(y_left) == 0 or len(y_right) == 0:
     22         return 0
     23 
     24     total = len(y_left) + len(y_right)
     25     P_L = len(y_left) / total
     26     P_R = len(y_right) / total
     27 
     28     classes = np.unique(np.concatenate([y_left, y_right]))
     29     sum_diff = 0
     30     for cls in classes:
     31         P_Li = np.sum(y_left == cls) / len(y_left)
     32         P_Ri = np.sum(y_right == cls) / len(y_right)
     33         sum_diff += np.abs(P_Li - P_Ri)
     34 
     35     return P_L * P_R * (sum_diff ** 2)
     36 
     37 def find_best_reflection(X, y):
     38     """
     39     For each class in y, compute the dominant eigenvector (largest eigenvalue eigenvector of the class covariance),
     40     reflect the dataset, and then for every axis in the reflected feature space try all axis-aligned splits using
     41     the unique sample values (without interpolation) to evaluate the twoing score.
     42     Returns the parameters of the best split found.
     43     """
     44     best_score = -np.inf
     45     best_params = None
     46 
     47     for cls in np.unique(y):
     48         class_indices = np.where(y == cls)[0]
     49         if len(class_indices) < 2:
     50             continue
     51         class_data = X[class_indices]
     52         cov = np.cov(class_data, rowvar=False)
     53         eigvals, eigvecs = np.linalg.eigh(cov)
     54         # Choose the dominant eigenvector (largest eigenvalue)
     55         d1 = eigvecs[:, -1]
     56 
     57         X_reflected, H = reflect_dataset(X, d1)
     58         p = X_reflected.shape[1]
     59         # For each axis in the reflected space, check axis-aligned splits.
     60         for j in range(p):
     61             values = X_reflected[:, j]
     62             unique_vals = np.unique(values)
     63             if len(unique_vals) < 2:
     64                 continue
     65             # Use each unique value directly as a candidate threshold.
     66             for threshold in unique_vals:
     67                 left_idx = values <= threshold
     68                 right_idx = ~left_idx
     69                 if np.sum(left_idx) == 0 or np.sum(right_idx) == 0:
     70                     continue
     71                 score = compute_twoing_score(y[left_idx], y[right_idx])
     72                 if score > best_score:
     73                     best_score = score
     74                     best_params = {
     75                         'H': H,
     76                         'd1': d1,
     77                         'axis': j,
     78                         'threshold': threshold,
     79                         'left_idx': left_idx,
     80                         'right_idx': right_idx,
     81                         'score': score,
     82                         'class_used': cls,
     83                         'X_reflected': X_reflected
     84                     }
     85     return best_params
     86 
     87 class Node:
     88     def __init__(self, is_leaf=False, prediction=None, H=None, d1=None, axis=None,
     89                  threshold=None, score=None, left=None, right=None):
     90         self.is_leaf = is_leaf
     91         self.prediction = prediction      # Majority class for leaf nodes.
     92         self.H = H                        # Reflection matrix used at the node.
     93         self.d1 = d1                      # Dominant eigenvector used.
     94         self.axis = axis                  # Axis in the reflected space used for splitting.
     95         self.threshold = threshold        # Threshold for the axis-aligned split.
     96         self.score = score                # Twoing score for the chosen split.
     97         self.left = left                  # Left child node.
     98         self.right = right                # Right child node.
     99 
    100 class HHCartClassifier:
    101     def __init__(self, max_depth=4, min_samples_split=2):
    102         self.max_depth = max_depth
    103         self.min_samples_split = min_samples_split
    104         self.root = None
    105 
    106     def fit(self, X, y):
    107         self.root = self._build_tree(X, y, depth=0)
    108         
    109     def _build_tree(self, X, y, depth):
    110         # Stopping criteria: max depth reached, too few samples, or pure node.
    111         if depth >= self.max_depth or len(y) < self.min_samples_split or np.unique(y).size == 1:
    112             majority_class = np.bincount(y.astype(int)).argmax()
    113             return Node(is_leaf=True, prediction=majority_class)
    114         
    115         best_params = find_best_reflection(X, y)
    116         if best_params is None:
    117             majority_class = np.bincount(y.astype(int)).argmax()
    118             return Node(is_leaf=True, prediction=majority_class)
    119 
    120         H = best_params['H']
    121         d1 = best_params['d1']
    122         axis = best_params['axis']
    123         threshold = best_params['threshold']
    124         left_idx = best_params['left_idx']
    125         right_idx = best_params['right_idx']
    126         score = best_params['score']
    127 
    128         # If the split does not actually separate the data, create a leaf.
    129         if np.sum(left_idx) == 0 or np.sum(right_idx) == 0:
    130             majority_class = np.bincount(y.astype(int)).argmax()
    131             return Node(is_leaf=True, prediction=majority_class)
    132         
    133         # Build left and right subtrees.
    134         left_node = self._build_tree(X[left_idx], y[left_idx], depth + 1)
    135         right_node = self._build_tree(X[right_idx], y[right_idx], depth + 1)
    136         
    137         return Node(is_leaf=False, H=H, d1=d1, axis=axis, threshold=threshold, score=score,
    138                     left=left_node, right=right_node)
    139 
    140     def _predict_sample(self, x, node):
    141         if node.is_leaf:
    142             return node.prediction
    143         # Reflect the sample using the node's reflection matrix.
    144         x_reflected = x @ node.H
    145         # Use the stored axis in the reflected space for the split.
    146         value = x_reflected[node.axis]
    147         if value <= node.threshold:
    148             return self._predict_sample(x, node.left)
    149         else:
    150             return self._predict_sample(x, node.right)
    151 
    152     def predict(self, X):
    153         return np.array([self._predict_sample(x, self.root) for x in X])
    154 
    155 def count_leaves(node):
    156     """Recursively count the number of leaf nodes in the tree."""
    157     if node.is_leaf:
    158         return 1
    159     return count_leaves(node.left) + count_leaves(node.right)
    160 
    161 import numpy as np
    162 import pandas as pd
    163 from sklearn.model_selection import KFold
    164 from sklearn.metrics import accuracy_score
    165 
    166 df = pd.read_csv('../../datasets/dim.csv')
    167 y = df[df.columns[-1]]
    168 df = df.drop(df.columns[-1], axis=1)
    169 X = df.to_numpy(dtype=np.float64)
    170 
    171 # Define hyperparameters
    172 depths = range(1, 6)
    173 n_splits = 5
    174 n_trials = 10
    175 seed = 15
    176 
    177 # Store results: one list per depth collecting the mean accuracy and tree size from each iteration
    178 results = {depth: [] for depth in depths}
    179 tree_sizes = {depth: [] for depth in depths}
    180 
    181 # For each iteration, generate the folds once and use them for each depth.
    182 for trial in range(n_trials):
    183     print("TRIAL: " + str(trial + 1))
    184     # Create KFold splits for this trial
    185     kf = KFold(n_splits=n_splits, shuffle=True, random_state=(trial + seed))
    186     folds = list(kf.split(X))
    187     
    188     for depth in depths:
    189         print("DEPTH: " + str(depth))
    190         fold_accuracies = []
    191         fold_sizes = []
    192         
    193         for train_index, test_index in folds:
    194             X_train, X_test = X[train_index], X[test_index]
    195             y_train, y_test = y[train_index], y[test_index]
    196             
    197             # Create and train the classifier with the current depth
    198             clf = HHCartClassifier(max_depth=depth, min_samples_split=2)
    199             clf.fit(X_train, y_train)
    200             preds = clf.predict(X_test)
    201             
    202             # Compute and store the accuracy and tree size (number of leaf nodes)
    203             fold_accuracies.append(accuracy_score(y_test, preds))
    204             fold_sizes.append(count_leaves(clf.root))
    205         
    206         # Store the mean accuracy and mean tree size from the folds of this trial
    207         results[depth].append(np.mean(fold_accuracies))
    208         tree_sizes[depth].append(np.mean(fold_sizes))
    209 
    210 # Write the final averaged results over all iterations to file
    211 with open("resultsDim.txt", "w") as f:
    212     f.write("Grid Search Results:\n")
    213     for depth in depths:
    214         avg_accuracy = np.mean(results[depth]) * 100 # percent
    215         std_accuracy = np.std(results[depth]) * 100 # percent
    216         avg_size = np.mean(tree_sizes[depth])
    217         std_size = np.std(tree_sizes[depth])
    218         f.write(f"Depth: {depth} Avg Accuracy: {avg_accuracy:.1f} (Std: {std_accuracy:.1f}), ")
    219         f.write(f"Avg # of Leaves: {avg_size:.1f} (Std: {std_size:.1f})\n")
	cart-elc Source code for CART-ELC
	git clone git://git.laack.co/cart-elc.git
	Log \| Files \| Refs \| README \| LICENSE