TreeNode.cpp (10707B)
1 #include "../include/TreeNode.h" 2 #include "stdexcept" 3 #include "math.h" 4 #include <string> 5 #include <sstream> 6 #include "../lib/eigen-3.4.0/Eigen/Dense" 7 #include "iostream" 8 #include "unordered_map" 9 #include "unordered_set" 10 #include <vector> 11 12 TreeNode::TreeNode(int classification){ 13 leaf = true; 14 this->classification = classification; 15 } 16 17 // pass in only the samples you want to split on. 18 TreeNode::TreeNode(float* samples, int features, int points, int* indicesOrderIn, int indicesCount){ 19 20 //std::cout << indicesCount << std::endl; 21 //std::cout << features << std::endl; 22 23 24 this->indicesOrder = new int[indicesCount]; 25 this->indicesCount = indicesCount; 26 27 for(int i = 0 ; i < indicesCount; ++i){ 28 this->indicesOrder[i] = indicesOrderIn[i]; 29 } 30 31 //std::cout << std::endl; 32 //std::cout << "INDICES: "<< std::endl; 33 //for(int i = 0 ; i < indicesCount; ++i){ 34 // std::cout << indicesOrder[i] << " "; 35 //} 36 //std::cout << std::endl; 37 38 Eigen::MatrixXd data(points, indicesCount); 39 40 41 for (int i = 0; i < points; i++) { 42 for (int j = 0; j < indicesCount; j++) { 43 data(i, j) = samples[(i * features) + indicesOrder[j]]; 44 } 45 } 46 47 // uncomment to see input matricies 48 //std::cout << data << std::endl << std::endl;; 49 50 // compute means 51 Eigen::VectorXd mean = data.colwise().mean(); 52 53 // compute difference from mean for each sample 54 Eigen::MatrixXd centered = data.rowwise() - mean.transpose(); 55 56 // create covariance matrix 57 Eigen::MatrixXd covariance = (centered.transpose() * centered) / (points- 1); 58 59 // used to compute eigen vector 60 Eigen::SelfAdjointEigenSolver<Eigen::MatrixXd> eig(covariance); 61 62 // eigen vector = normal vector 63 Eigen::VectorXd normal = eig.eigenvectors().col(0); 64 65 float offset = mean.dot(normal); 66 67 float* equ = new float[this->indicesCount + 1]; 68 for (int i = 0; i < indicesCount ; i++) { 69 equ[i] = normal[i]; 70 } 71 72 equ[indicesCount] = offset; 73 74 this->equation = equ; 75 this->leaf = false; 76 77 } 78 79 TreeNode::~TreeNode() { 80 if(this->equation != nullptr){ 81 delete[] this->equation; 82 this->equation = nullptr; 83 } 84 if(this->indicesOrder != nullptr){ 85 delete[] this->indicesOrder; 86 this->indicesOrder = nullptr; 87 } 88 } 89 90 float* TreeNode::getEquation(){ 91 return this->equation; 92 } 93 94 bool TreeNode::isLeaf(){ 95 return leaf; 96 } 97 98 // don't include classifications in feature count. 99 float TreeNode::evalSplit(float* X, int* y, int samples, int features, std::string criterion){ 100 101 if(isLeaf()){ 102 throw std::logic_error("Cannot evaluate split on leaf node."); 103 } 104 105 if(criterion == "gini"){ 106 107 return giniImpurity(X, y, samples, features); 108 } 109 110 if(criterion == "twoing"){ 111 return twoingRule(X, y, samples, features); 112 } 113 if(criterion == "information gain"){ 114 return informationGain(X,y,samples,features); 115 } 116 117 throw std::invalid_argument("supported objective functions are 'twoing', 'gini', and 'information gain'"); 118 return 0.0f; 119 } 120 121 122 float TreeNode::informationGain(float* X, int* y, int samples, int features) { 123 124 float before = entropy(y, samples); 125 126 std::vector<int> left; 127 std::vector<int> right; 128 129 float* currentSample = X; 130 131 for(int i = 0; i < samples; ++i){ 132 133 if(aboveOrOnPlane(currentSample, features)){ 134 left.push_back(y[i]); 135 } 136 else{ 137 right.push_back(y[i]); 138 } 139 140 currentSample += features; 141 } 142 143 float leftEn = entropy(left.data(), left.size()); 144 float rightEn = entropy(right.data(), right.size()); 145 146 float weightedLeft = leftEn * (static_cast<float>(left.size()) / samples); 147 float weightedRight = rightEn * (static_cast<float>(right.size()) / samples); 148 149 float after = weightedLeft + weightedRight; 150 151 float final = before - after; 152 153 // we are minimizing objective function thus 154 // invert as high entropy is good. 155 156 final *= -1; 157 158 return final; 159 } 160 161 float TreeNode::entropy(int* y, int samples){ 162 163 if(samples == 0){ 164 return 0.0f; 165 } 166 167 std::unordered_map<int, int> counts; 168 169 for (int i = 0; i < samples; ++i) { 170 counts[y[i]]++; 171 } 172 173 float entropy = 0.0f; 174 175 for (const auto& entry : counts) { 176 177 float prob = static_cast<float>(entry.second) / static_cast<float>(samples); 178 float logProb = std::log2(prob); 179 180 entropy += logProb * prob; 181 } 182 183 entropy *= -1; 184 185 return entropy; 186 } 187 188 189 // outer = (p_L * p_R) * (1/4) 190 // inner = sum(abs(p(class | t_L) - p(class | t_R))) ^ 2 191 // final = outer * inner 192 193 // note: since the elcclassifier is trying to minimize the objective function, 194 // but the twoing rule is to be maximized, we multiply this value by -1 before returning. 195 196 float TreeNode::twoingRule(float* X, int* y, int samples, int features) { 197 198 std::unordered_map<int, int> ltMap; 199 std::unordered_map<int, int> gtMap; 200 201 int ltCount = 0; 202 int gteqCount = 0; 203 204 float* currentSample = X; 205 206 for (int i = 0; i < samples; ++i) { 207 if (aboveOrOnPlane(currentSample, features)) { 208 ltMap[y[i]]++; 209 ltCount++; 210 } else { 211 gtMap[y[i]]++; 212 gteqCount++; 213 } 214 currentSample += features; 215 } 216 217 // one side is empty, avoid div by 0. 218 if (ltCount == 0 || gteqCount == 0) { 219 return 0.0f; 220 } 221 222 float pL = static_cast<float>(ltCount) / samples; 223 float pR = static_cast<float>(gteqCount) / samples; 224 float outer = (pL * pR) * (1 / 4.0f); 225 226 // union labels from both groups in case 227 // samples are only in one or the other. 228 std::unordered_set<int> classLabels; 229 for (const auto& entry : ltMap) { 230 classLabels.insert(entry.first); 231 } 232 for (const auto& entry : gtMap) { 233 classLabels.insert(entry.first); 234 } 235 236 float inner = 0.0f; 237 for (int classLabel : classLabels) { 238 int ltClassCount = (ltMap.find(classLabel) != ltMap.end()) ? ltMap[classLabel] : 0; 239 float pClassL = static_cast<float>(ltClassCount) / ltCount; 240 241 int gtClassCount = (gtMap.find(classLabel) != gtMap.end()) ? gtMap[classLabel] : 0; 242 float pClassR = static_cast<float>(gtClassCount) / gteqCount; 243 244 inner += std::abs(pClassL - pClassR); 245 } 246 247 inner = inner * inner; 248 249 // Multiply by -1 because the classifier minimizes the objective function 250 float final = -1 * outer * inner; 251 return final; 252 } 253 254 float TreeNode::giniImpurity(float* X, int* y, int samples, int features){ 255 256 // verified this is perfect. 257 // if there are issues it is with constructor of tree node class. I built it wrong. 258 259 std::unordered_map<int, int> ltMap; 260 std::unordered_map<int, int> gtMap; 261 262 int ltCount = 0; 263 int gteqCount = 0; 264 265 float* currentSample = X; 266 267 for(int i = 0; i < samples; ++i){ 268 269 if(aboveOrOnPlane(currentSample, features)){ 270 ltMap[y[i]]++; 271 ltCount++; 272 } 273 else{ 274 gtMap[y[i]]++; 275 gteqCount++; 276 } 277 278 currentSample += features; 279 } 280 281 float ltGini= 1.0f; 282 283 for (const auto& pair : ltMap) { 284 ltGini -= pow(float(pair.second) / ltCount, 2); 285 } 286 287 float gteqGini = 1.0f; 288 289 for (const auto& pair : gtMap) { 290 gteqGini -= pow(float(pair.second) / gteqCount, 2); 291 } 292 293 if(gteqCount == 0){ 294 gteqGini = 0.0f; 295 } 296 if(ltCount == 0){ 297 ltGini = 0.0f; 298 } 299 300 301 float gini = gteqGini * float(gteqCount) / samples; 302 gini += ltGini * float(ltCount) / samples; 303 304 return gini; 305 } 306 307 308 // equation form is as follows: 309 // x, y, z, d 310 // where ax + by + cz = d 311 312 313 bool TreeNode::aboveOrOnPlane(float* sample, int features){ 314 315 float dp = 0; 316 for(int i = 0 ; i < indicesCount; ++i){ 317 dp += sample[indicesOrder[i]] * equation[i]; 318 } 319 float bias = equation[indicesCount]; 320 float appliedBias = dp - bias; 321 322 // apply slight bias to ensure values on the plane evaluate to |~0|. 323 float TOLERANCE = .001; 324 appliedBias += TOLERANCE; 325 return appliedBias >= 0; 326 327 } 328 329 330 331 void TreeNode::setLeftChild(TreeNode* child){ 332 leftChild = child; 333 } 334 335 void TreeNode::setRightChild(TreeNode* child){ 336 rightChild = child; 337 } 338 339 TreeNode* TreeNode::getLeftChild(){ 340 return leftChild; 341 } 342 343 TreeNode* TreeNode::getRightChild(){ 344 return rightChild; 345 } 346 347 SplitResults TreeNode::splitOnNode(float* X, int* y, int samples, int features){ 348 349 int leftSize = 0; 350 int rightSize = 0; 351 float* current = X; 352 353 // get counts for array init 354 for(int i = 0 ; i < samples; ++i){ 355 if(aboveOrOnPlane(current, features)){ 356 rightSize += 1; 357 } 358 else{ 359 leftSize += 1; 360 } 361 362 current += features; 363 } 364 // done with counting 365 366 // init 367 SplitResults res; 368 res.leftSize = leftSize; 369 res.rightSize = rightSize; 370 371 372 float* right = new float[rightSize * features]; 373 float* left = new float[leftSize * features]; 374 int* rightLabels = new int[rightSize]; 375 int* leftLabels = new int[leftSize]; 376 377 378 379 int leftFill = 0; 380 int rightFill = 0; 381 current = X; 382 383 for (int i = 0; i < samples; ++i) { 384 if (aboveOrOnPlane(current, features)) { 385 for (int x = 0; x < features; ++x) { 386 right[rightFill * features + x] = current[x]; 387 } 388 rightLabels[rightFill] = y[i]; 389 rightFill += 1; 390 } else { 391 for (int x = 0; x < features; ++x) { 392 left[leftFill * features + x] = current[x]; 393 } 394 leftLabels[leftFill] = y[i]; 395 leftFill += 1; 396 } 397 current += features; 398 } 399 400 res.XLeft = left; 401 res.XRight = right; 402 res.yRight = rightLabels; 403 res.yLeft = leftLabels; 404 405 return res; 406 } 407 408 409 std::string TreeNode::getDotEdges(){ 410 411 if(isLeaf()){ 412 return ""; 413 } 414 415 std::string current = getDotLabel() + "->" + leftChild->getDotLabel() + ";\n"; 416 current += getDotLabel() + "->" + rightChild->getDotLabel() + ";\n"; 417 418 current += rightChild->getDotEdges(); 419 current += leftChild->getDotEdges(); 420 421 return current; 422 } 423 424 std::string toStringWithPrecision(float value, int precision = 1) { 425 float scale = std::pow(10.0f, precision); 426 value = std::round(value * scale) / scale; // Round to the desired precision 427 return std::to_string(value).substr(0, std::to_string(value).find(".") + precision + 1); 428 } 429 430 std::string TreeNode::getDotLabel(){ 431 const void * address = static_cast<const void*>(this); 432 std::stringstream ss; 433 ss << address; 434 std::string name = ss.str(); 435 if (isLeaf()){ 436 return "\"" + name + "\nCLASSIFICATION: " + std::to_string(classification) + "\""; 437 } 438 439 // equ will always have one more value than the indices. 440 // This is because we need an offset (intercept). 441 442 std::string equ = ""; 443 std::string indices = ""; 444 for(int i = 0 ; i < this->indicesCount; ++i){ 445 equ += toStringWithPrecision(this->equation[indicesOrder[i]], 2) + " "; 446 indices += std::to_string(indicesOrder[i]) + " "; 447 } 448 equ += toStringWithPrecision(this->equation[this->indicesCount], 2) + " "; 449 450 return "\"" + name + "\nINDICES:\n" + indices + "\nCOEFFICIENTS:\n" + equ + "\""; 451 } 452 453 int TreeNode::getClassification(){ 454 if(isLeaf()){ 455 return classification; 456 } 457 throw std::logic_error("Unable to call getClassification() on internal vertices."); 458 }