cancerAll.py (2894B)
"""Cross-validated evaluation of decision_tree.ELCClassifier on the cancer data.

For every combination of split criterion, ``lc`` value and ``depth`` value this
script runs repeated, shuffled K-fold cross-validation and writes the average
and standard deviation of the accuracy and of the tree size to
``resultsCancer.txt``.
"""
import itertools

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold

import decision_tree


def _cross_validate(X, y, folds, criterion, lc, depth):
    """Fit and score one classifier configuration on every fold.

    Args:
        X: 2-D float feature array, one row per sample.
        y: pandas Series of labels, aligned by position with the rows of X.
        folds: iterable of (train_index, test_index) integer-array pairs.
        criterion: split-criterion name forwarded to ELCClassifier.
        lc: value forwarded as ELCClassifier's second argument.
        depth: value forwarded as ELCClassifier's first argument.

    Returns:
        Tuple ``(mean accuracy in percent, mean of getSplits() + 1)`` taken
        over the folds.
    """
    n_features = X.shape[1]
    fold_accuracies = []
    fold_sizes = []

    for train_index, test_index in folds:
        X_train, X_test = X[train_index], X[test_index]
        # .iloc selects by position, so the split indices stay valid whatever
        # labels the Series index happens to carry.
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # NOTE(review): the meaning of the third argument (100) is defined by
        # decision_tree.ELCClassifier and is not visible from here -- confirm.
        clf = decision_tree.ELCClassifier(depth, lc, 100, criterion)
        # The classifier is handed the features as a flat 1-D array together
        # with the explicit row and column counts.
        clf.fit(X_train.ravel(), y_train.shape[0], y_train, n_features)
        preds = clf.predict(X_test.ravel(), y_test.shape[0], n_features)

        fold_accuracies.append(100 * accuracy_score(y_true=y_test, y_pred=preds))
        fold_sizes.append(clf.getSplits() + 1)  # leaves, not splits

    return np.mean(fold_accuracies), np.mean(fold_sizes)


# Load and clean the data. Note the column names carry a leading space.
df = pd.read_csv('../../datasets/cancer.csv')
# The first column is not used as a feature (presumably a sample id -- confirm).
df = df.drop(df.columns[0], axis=1)
df = df.dropna()
# Discard rows whose ' nuc' entry is the '?' placeholder so the remaining
# values can be converted to floats below.
df = df[df[' nuc'] != '?']
df = df.reset_index(drop=True)
y = df[' cla']
# NOTE(review): assumes the label column ' cla' is the last column -- confirm.
df = df.drop(df.columns[-1], axis=1)
X = df.to_numpy(dtype=np.float64)

# Define hyperparameters
depths = [1, 2, 3, 4, 5]
criteria = ["gini", "twoing", "information gain"]
lcs = [1, 2]
n_splits = 5
n_trials = 10
seed = 71

# Store results: for each configuration, one mean-over-folds value per trial.
results = {
    criterion: {lc: {depth: [] for depth in depths} for lc in lcs}
    for criterion in criteria
}
tree_sizes = {
    criterion: {lc: {depth: [] for depth in depths} for lc in lcs}
    for criterion in criteria
}

for trial in range(n_trials):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=(trial + seed))
    # With an integer random_state every kf.split(X) call yields the same
    # partition, so compute it once per trial and reuse it for all settings.
    folds = list(kf.split(X))

    for criterion, lc, depth in itertools.product(criteria, lcs, depths):
        mean_accuracy, mean_size = _cross_validate(X, y, folds, criterion, lc, depth)
        results[criterion][lc][depth].append(mean_accuracy)
        tree_sizes[criterion][lc][depth].append(mean_size)

# Write one section per criterion; each line summarises the per-trial values.
with open("resultsCancer.txt", "w", encoding="utf-8") as f:
    f.write("Results:\n")
    for criterion in criteria:
        f.write("\n")
        f.write(f"{criterion}:\n")
        for lc, depth in itertools.product(lcs, depths):
            trial_accuracies = results[criterion][lc][depth]
            trial_sizes = tree_sizes[criterion][lc][depth]

            avg_accuracy = np.mean(trial_accuracies)
            std_accuracy = np.std(trial_accuracies)
            avg_size = np.mean(trial_sizes)
            std_size = np.std(trial_sizes)

            f.write(f"LCs: {lc}, Depth: {depth} Avg Accuracy: {avg_accuracy:.1f} (Std: {std_accuracy:.1f})")
            f.write(f", Avg # of Leaves: {avg_size:.1f} (Std: {std_size:.1f})\n")