cart-elc

Source code for CART-ELC
git clone git://git.laack.co/cart-elc.git
Log | Files | Refs | README | LICENSE

cancerAll.py (2894B)


      1 import numpy as np
      2 import pandas as pd
      3 import decision_tree
      4 from sklearn.model_selection import KFold
      5 from sklearn.metrics import accuracy_score
      6 
      7 df = pd.read_csv('../../datasets/cancer.csv')
      8 df = df.drop(df.columns[0], axis=1)
      9 df = df.dropna()
     10 df = df[df[' nuc'] != '?']
     11 df = df.reset_index(drop=True)
     12 y = df[' cla']
     13 df = df.drop(df.columns[-1], axis=1)
     14 X = df.to_numpy(dtype=np.float64)
     15 
     16 # Define hyperparameters
     17 depths = [1,2,3,4,5]
     18 criteria = ["gini", "twoing", "information gain"]
     19 lcs = [1,2]
     20 n_splits = 5
     21 n_trials = 10
     22 seed = 71
     23 
     24 # Store results
     25 accuracies = {lc: {depth: [] for depth in depths} for lc in lcs}
     26 treeSizes = {lc: {depth: [] for depth in depths} for lc in lcs}
     27 foldSizes = []
     28 foldAccuracies = []
     29 
     30 # Store results
     31 results = {criterion: {lc: {depth: [] for depth in depths} for lc in lcs} for criterion in criteria}
     32 tree_sizes = {criterion: {lc: {depth: [] for depth in depths} for lc in lcs} for criterion in criteria}
     33 
     34 for trial in range(n_trials):
     35     kf = KFold(n_splits=n_splits, shuffle=True, random_state=(trial + seed))
     36     
     37     for criterion in criteria:
     38         for lc in lcs:
     39             for depth in depths:
     40                 fold_accuracies = []
     41                 fold_sizes = []
     42                 
     43                 for train_index, test_index in kf.split(X):
     44                     X_train, X_test = X[train_index], X[test_index]
     45                     y_train, y_test = y[train_index], y[test_index]
     46                     
     47                     clf = decision_tree.ELCClassifier(depth, lc, 100, criterion)
     48                     clf.fit(X_train.ravel(), y_train.shape[0], y_train, int(X_train.size / y_train.shape[0]))
     49                     preds = clf.predict(X_test.ravel(), y_test.shape[0], int(X_test.size / y_test.shape[0]))
     50                     
     51                     fold_accuracies.append(100 * accuracy_score(y_pred=preds, y_true=y_test))
     52                     fold_sizes.append(clf.getSplits() + 1) # leaves not splits
     53                 
     54                 results[criterion][lc][depth].append(np.mean(fold_accuracies))
     55                 tree_sizes[criterion][lc][depth].append(np.mean(fold_sizes))
     56                                                                                  
     57 with open("resultsCancer.txt", "w") as f:
     58     f.write("Results:\n")
     59     for criterion in criteria:
     60         f.write("\n")
     61         f.write(f"{criterion}:\n")
     62         for lc in lcs:
     63             for depth in depths:
     64                 avg_accuracy = np.mean(results[criterion][lc][depth])
     65                 std_accuracy = np.std(results[criterion][lc][depth])
     66                 avg_size = np.mean(tree_sizes[criterion][lc][depth])
     67                 std_size = np.std(tree_sizes[criterion][lc][depth])
     68                 
     69                 f.write(f"LCs: {lc}, Depth: {depth} Avg Accuracy: {avg_accuracy:.1f} (Std: {std_accuracy:.1f})")
     70                 f.write(f", Avg # of Leaves: {avg_size:.1f} (Std: {std_size:.1f})\n")