cart-elc

Source code for CART-ELC
git clone git://git.laack.co/cart-elc.git
Log | Files | Refs | README | LICENSE

TreeNode.cpp (10707B)


      1 #include "../include/TreeNode.h"
      2 #include "stdexcept"
      3 #include "math.h"
      4 #include <string>
      5 #include <sstream>
      6 #include "../lib/eigen-3.4.0/Eigen/Dense"
      7 #include "iostream"
      8 #include "unordered_map"
      9 #include "unordered_set"
     10 #include <vector>
     11 
     12 TreeNode::TreeNode(int classification){
     13 	leaf = true;
     14 	this->classification = classification;
     15 }
     16 
     17 // pass in only the samples you want to split on.
     18 TreeNode::TreeNode(float* samples, int features, int points, int* indicesOrderIn, int indicesCount){
     19 
     20 	//std::cout << indicesCount << std::endl;
     21 	//std::cout << features << std::endl;
     22 
     23 
     24 	this->indicesOrder = new int[indicesCount];
     25 	this->indicesCount = indicesCount;
     26 	
     27 	for(int i = 0 ; i < indicesCount; ++i){
     28 		this->indicesOrder[i] = indicesOrderIn[i];
     29 	}
     30 
     31 	//std::cout << std::endl;
     32 	//std::cout << "INDICES: "<< std::endl;
     33 	//for(int i = 0 ; i < indicesCount; ++i){
     34 	//	std::cout << indicesOrder[i] << " "; 
     35 	//}
     36 	//std::cout << std::endl;
     37 
     38     Eigen::MatrixXd data(points, indicesCount);
     39 
     40 
     41     for (int i = 0; i < points; i++) {
     42         for (int j = 0; j < indicesCount; j++) {
     43             data(i, j) = samples[(i * features) + indicesOrder[j]];
     44         }
     45     }
     46 
     47 	// uncomment to see input matricies
     48 	//std::cout << data << std::endl << std::endl;;
     49 
     50 	// compute means
     51     Eigen::VectorXd mean = data.colwise().mean();
     52 
     53 	// compute difference from mean for each sample
     54     Eigen::MatrixXd centered = data.rowwise() - mean.transpose();
     55 
     56 	// create covariance matrix
     57     Eigen::MatrixXd covariance = (centered.transpose() * centered) / (points- 1);
     58 
     59 	// used to compute eigen vector
     60     Eigen::SelfAdjointEigenSolver<Eigen::MatrixXd> eig(covariance);
     61 
     62 	// eigen vector = normal vector
     63     Eigen::VectorXd normal = eig.eigenvectors().col(0);
     64 
     65     float offset = mean.dot(normal);
     66 
     67     float* equ = new float[this->indicesCount + 1];
     68     for (int i = 0; i < indicesCount ; i++) {
     69         equ[i] = normal[i];
     70     }
     71 
     72     equ[indicesCount] = offset;
     73 
     74     this->equation = equ;
     75     this->leaf = false;
     76 
     77 }
     78 
     79 TreeNode::~TreeNode() {
     80 	if(this->equation != nullptr){
     81 		delete[] this->equation; 
     82 		this->equation = nullptr;
     83 	}
     84 	if(this->indicesOrder != nullptr){
     85 		delete[] this->indicesOrder;
     86 		this->indicesOrder = nullptr;
     87 	}
     88 }
     89 
     90 float* TreeNode::getEquation(){
     91 	return this->equation;
     92 }
     93 
     94 bool TreeNode::isLeaf(){
     95 	return leaf;
     96 }
     97 
     98 // don't include classifications in feature count.
     99 float TreeNode::evalSplit(float* X, int* y, int samples, int features, std::string criterion){
    100 
    101 	if(isLeaf()){
    102 		throw std::logic_error("Cannot evaluate split on leaf node.");
    103 	}
    104 
    105 	if(criterion == "gini"){
    106 
    107 		return giniImpurity(X, y, samples, features);
    108 	}
    109 
    110 	if(criterion == "twoing"){
    111 		return twoingRule(X, y, samples, features);
    112 	}
    113 	if(criterion == "information gain"){
    114 		return informationGain(X,y,samples,features);
    115 	}
    116 
    117 	throw std::invalid_argument("supported objective functions are 'twoing', 'gini', and 'information gain'");
    118 	return 0.0f;
    119 }
    120 
    121 
    122 float TreeNode::informationGain(float* X, int* y, int samples, int features) {
    123 
    124 	float before = entropy(y, samples);
    125 
    126 	std::vector<int> left;
    127 	std::vector<int> right;
    128 
    129 	float* currentSample = X;
    130 
    131 	for(int i = 0; i < samples; ++i){
    132 
    133 		if(aboveOrOnPlane(currentSample, features)){
    134 			left.push_back(y[i]);
    135 		}
    136 		else{
    137 			right.push_back(y[i]);
    138 		}
    139 
    140 		currentSample += features;
    141 	}
    142 
    143 	float leftEn = entropy(left.data(), left.size());
    144 	float rightEn = entropy(right.data(), right.size());
    145 
    146 	float weightedLeft = leftEn * (static_cast<float>(left.size()) / samples);
    147 	float weightedRight = rightEn * (static_cast<float>(right.size()) / samples);
    148 
    149 	float after = weightedLeft + weightedRight;
    150 
    151 	float final = before - after;
    152 
    153 	// we are minimizing objective function thus
    154 	// invert as high entropy is good.
    155 	
    156 	final *= -1;
    157 
    158 	return final;
    159 }
    160 
    161 float TreeNode::entropy(int* y, int samples){
    162 
    163 	if(samples == 0){
    164 		return 0.0f;
    165 	}
    166 
    167     std::unordered_map<int, int> counts;
    168 
    169     for (int i = 0; i < samples; ++i) {
    170         counts[y[i]]++;
    171     }
    172 
    173 	float entropy = 0.0f;
    174 
    175     for (const auto& entry : counts) {
    176 
    177 		float prob = static_cast<float>(entry.second) / static_cast<float>(samples);
    178 		float logProb = std::log2(prob);
    179 
    180 		entropy += logProb * prob;
    181     }
    182 
    183 	entropy *= -1;
    184 
    185 	return entropy;
    186 }
    187 
    188 
    189 // outer = (p_L * p_R) * (1/4)
    190 // inner = sum(abs(p(class | t_L) - p(class | t_R))) ^ 2
    191 // final = outer * inner
    192 
    193 // note: since the elcclassifier is trying to minimize the objective function,
    194 // but the twoing rule is to be maximized, we multiply this value by -1 before returning.
    195 
    196 float TreeNode::twoingRule(float* X, int* y, int samples, int features) {
    197 
    198     std::unordered_map<int, int> ltMap;
    199     std::unordered_map<int, int> gtMap;
    200 
    201     int ltCount = 0;
    202     int gteqCount = 0;
    203 
    204     float* currentSample = X;
    205 
    206     for (int i = 0; i < samples; ++i) {
    207         if (aboveOrOnPlane(currentSample, features)) {
    208             ltMap[y[i]]++;
    209             ltCount++;
    210         } else {
    211             gtMap[y[i]]++;
    212             gteqCount++;
    213         }
    214         currentSample += features;
    215     }
    216 
    217     // one side is empty, avoid div by 0.
    218     if (ltCount == 0 || gteqCount == 0) {
    219         return 0.0f;
    220     }
    221 
    222     float pL = static_cast<float>(ltCount) / samples;
    223     float pR = static_cast<float>(gteqCount) / samples;
    224     float outer = (pL * pR) * (1 / 4.0f);
    225 
    226     // union labels from both groups in case
    227 	// samples are only in one or the other.
    228     std::unordered_set<int> classLabels;
    229     for (const auto& entry : ltMap) {
    230         classLabels.insert(entry.first);
    231     }
    232     for (const auto& entry : gtMap) {
    233         classLabels.insert(entry.first);
    234     }
    235 
    236     float inner = 0.0f;
    237     for (int classLabel : classLabels) {
    238         int ltClassCount = (ltMap.find(classLabel) != ltMap.end()) ? ltMap[classLabel] : 0;
    239         float pClassL = static_cast<float>(ltClassCount) / ltCount;
    240 
    241         int gtClassCount = (gtMap.find(classLabel) != gtMap.end()) ? gtMap[classLabel] : 0;
    242         float pClassR = static_cast<float>(gtClassCount) / gteqCount;
    243 
    244         inner += std::abs(pClassL - pClassR);
    245     }
    246 
    247     inner = inner * inner;
    248 
    249     // Multiply by -1 because the classifier minimizes the objective function
    250     float final = -1 * outer * inner;
    251     return final;
    252 }
    253 
    254 float TreeNode::giniImpurity(float* X, int* y, int samples, int features){
    255 
    256 	// verified this is perfect. 
    257 	// if there are issues it is with constructor of tree node class. I built it wrong.
    258 
    259 	std::unordered_map<int, int> ltMap;
    260 	std::unordered_map<int, int> gtMap;
    261 
    262 	int ltCount = 0;
    263 	int gteqCount = 0;
    264 
    265 	float* currentSample = X;
    266 
    267 	for(int i = 0; i < samples; ++i){
    268 
    269 		if(aboveOrOnPlane(currentSample, features)){
    270 			ltMap[y[i]]++;
    271 			ltCount++;
    272 		}
    273 		else{
    274 			gtMap[y[i]]++;
    275 			gteqCount++;
    276 		}
    277 
    278 		currentSample += features;
    279 	}
    280 
    281 	float ltGini= 1.0f;
    282 
    283 	for (const auto& pair : ltMap) {
    284 		ltGini -= pow(float(pair.second) / ltCount, 2);
    285 	}
    286 
    287 	float gteqGini = 1.0f;
    288 
    289 	for (const auto& pair : gtMap) {
    290 		gteqGini -= pow(float(pair.second) / gteqCount, 2);
    291 	}
    292 
    293 	if(gteqCount == 0){
    294 		gteqGini = 0.0f;
    295 	}
    296 	if(ltCount == 0){
    297 		ltGini = 0.0f;
    298 	}
    299 
    300 
    301 	float gini = gteqGini * float(gteqCount) / samples;
    302 	gini += ltGini * float(ltCount) / samples;
    303 
    304 	return gini;
    305 }
    306 
    307 
    308 // equation form is as follows:
    309 // x, y, z, d
    310 // where ax + by + cz = d
    311 
    312 
    313 bool TreeNode::aboveOrOnPlane(float* sample, int features){
    314 
    315 	float dp = 0;
    316 	for(int i = 0 ; i < indicesCount; ++i){
    317 		dp += sample[indicesOrder[i]] * equation[i];
    318 	}
    319 	float bias = equation[indicesCount];
    320 	float appliedBias = dp - bias;
    321 
    322 	// apply slight bias to ensure values on the plane evaluate to |~0|.
    323 	float TOLERANCE = .001;
    324 	appliedBias += TOLERANCE;
    325 	return appliedBias >= 0;
    326 
    327 }
    328 
    329 
    330 
    331 void TreeNode::setLeftChild(TreeNode* child){
    332 	leftChild = child;
    333 }
    334 
    335 void TreeNode::setRightChild(TreeNode* child){
    336 	rightChild = child;
    337 }
    338 
    339 TreeNode* TreeNode::getLeftChild(){
    340 	return leftChild;
    341 }
    342 
    343 TreeNode* TreeNode::getRightChild(){
    344 	return rightChild;
    345 }
    346 
    347 SplitResults TreeNode::splitOnNode(float* X, int* y, int samples, int features){
    348 	
    349 	int leftSize = 0;
    350 	int rightSize = 0;
    351 	float* current = X;
    352 
    353 	// get counts for array init
    354 	for(int i = 0 ; i < samples; ++i){
    355 		if(aboveOrOnPlane(current, features)){
    356 			rightSize += 1;
    357 		}
    358 		else{
    359 			leftSize += 1;
    360 		}
    361 
    362 		current += features;
    363 	}
    364 	// done with counting
    365 	
    366 	// init
    367 	SplitResults res;
    368 	res.leftSize = leftSize;
    369 	res.rightSize = rightSize;
    370 
    371 
    372 	float* right = new float[rightSize * features];
    373 	float* left = new float[leftSize * features];
    374 	int* rightLabels = new int[rightSize];
    375 	int* leftLabels = new int[leftSize];
    376 
    377 
    378 
    379 	int leftFill = 0;
    380 	int rightFill = 0;
    381 	current = X;
    382 
    383     for (int i = 0; i < samples; ++i) {
    384         if (aboveOrOnPlane(current, features)) {
    385             for (int x = 0; x < features; ++x) {
    386                 right[rightFill * features + x] = current[x];
    387             }
    388             rightLabels[rightFill] = y[i];
    389             rightFill += 1;
    390         } else {
    391             for (int x = 0; x < features; ++x) {
    392                 left[leftFill * features + x] = current[x];
    393             }
    394             leftLabels[leftFill] = y[i];
    395             leftFill += 1;
    396         }
    397         current += features;
    398     }
    399 
    400 	res.XLeft = left;
    401 	res.XRight = right;
    402 	res.yRight = rightLabels;
    403 	res.yLeft = leftLabels;
    404 
    405     return res;
    406 }
    407 
    408 
    409 std::string TreeNode::getDotEdges(){
    410 
    411 	if(isLeaf()){
    412 		return "";
    413 	}
    414 
    415 	std::string current = getDotLabel() + "->" + leftChild->getDotLabel() + ";\n";
    416 	current += getDotLabel() + "->" + rightChild->getDotLabel() + ";\n";
    417 
    418 	current += rightChild->getDotEdges();
    419 	current += leftChild->getDotEdges();
    420 
    421 	return current;
    422 }
    423 
    424 std::string toStringWithPrecision(float value, int precision = 1) {
    425     float scale = std::pow(10.0f, precision);
    426     value = std::round(value * scale) / scale; // Round to the desired precision
    427     return std::to_string(value).substr(0, std::to_string(value).find(".") + precision + 1);
    428 }
    429 
    430 std::string TreeNode::getDotLabel(){
    431 	const void * address = static_cast<const void*>(this);
    432 	std::stringstream ss;
    433 	ss << address;  
    434 	std::string name = ss.str(); 
    435 	if (isLeaf()){
    436 		return "\"" + name + "\nCLASSIFICATION: " + std::to_string(classification) + "\"";
    437 	}
    438 
    439 	// equ will always have one more value than the indices.
    440 	// This is because we need an offset (intercept).
    441 	
    442 	std::string equ = "";
    443 	std::string indices = "";
    444 	for(int i = 0 ; i < this->indicesCount; ++i){
    445 		equ += toStringWithPrecision(this->equation[indicesOrder[i]], 2) + " ";
    446 		indices += std::to_string(indicesOrder[i]) + " ";
    447 	}
    448 	equ += toStringWithPrecision(this->equation[this->indicesCount], 2) + " ";
    449 
    450 	return "\"" + name + "\nINDICES:\n" + indices + "\nCOEFFICIENTS:\n" + equ + "\"";
    451 }
    452 
    453 int TreeNode::getClassification(){
    454 	if(isLeaf()){
    455 		return classification;
    456 	}
    457 	throw std::logic_error("Unable to call getClassification() on internal vertices.");
    458 }