c-programming

Simple C programs
git clone git://git.laack.co/c-programming.git
Log | Files | Refs

knn.c (2883B)


#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
      3 
      4 #define INVALID -1
      5 #define VALID 0
      6 
      7 #define TRUE 1
      8 #define FALSE 0
      9 
     10 typedef struct Inputs{
     11     char* trainingDatasetPath;
     12     char* predictionSamplesPath;
     13     int k;
     14 } Inputs;
     15 
     16 
     17 
     18 typedef struct Sample{
     19     float* features;
     20     int target;
     21 } Sample;
     22 
     23 typedef struct Dataset{
     24     Sample* samples;
     25     int sampleCount;
     26     int featureCount;
     27     int valid;
     28 } Dataset;
     29 
     30 Inputs inputParse(int argc, char ** argv){
     31     Inputs in;
     32 
     33 
     34     if (argc != 4 || (in.k = atoi(argv[3])) <= 0){
     35         in.k = INVALID;
     36         return in;
     37     }
     38 
     39     in.trainingDatasetPath = argv[1];
     40     in.predictionSamplesPath = argv[2];
     41 
     42     return in;
     43 }
     44 
     45 void printInputs(Inputs inputs){
     46     printf("training path: %40s\nprediction path: %40s\nk: %40d\n", inputs.predictionSamplesPath, inputs.trainingDatasetPath, inputs.k);
     47 }
     48 
     49 // This does not support line end commas
     50 // This also doesn't support escaped strings as the input should be float and ints
     51 int csvColumns(char* fileName){
     52     FILE* fp = fopen(fileName, "r");
     53     int length = 0;
     54     char c;
     55 
     56     while ((c = fgetc(fp)) != EOF && c != '\n'){
     57         if(c == ','){
     58             length += 1;
     59         }
     60     }
     61 
     62     fclose(fp);
     63     return length;
     64 }
     65 
     66 
     67 Dataset* readCSV(char* fileName, int containsTarget){
     68 
     69     FILE* fp = fopen(fileName, "r");
     70     Dataset* dataset = malloc(sizeof(Dataset));
     71     dataset->valid = VALID;
     72 
     73     char c;
     74     char buffered[51];
     75     int length = 0;
     76     int csvColumnCount = csvColumns(fileName);
     77     int featureCount = csvColumnCount;
     78 
     79     if(containsTarget){
     80         featureCount = csvColumnCount -1;
     81     }
     82 
     83 
     84     Sample* current = malloc(sizeof(Sample));
     85     current->features = malloc(sizeof(float) * featureCount);
     86     current->target = 0;
     87     int featureNum = 0;
     88 
     89     while((c = fgetc(fp)) != EOF){
     90 
     91         // this doesn't account for target. it would likely make more sense to just load everything in at the same time...
     92         if(c == ',' || c == "\n"){
     93             buffered[length] = 0;
     94             float currentF = atof(buffered);
     95             current->features[featureNum] = currentF;
     96             featureNum += 1;
     97             length = 0;
     98         }
     99         else if (length < 50){
    100             buffered[length] = c;
    101             length += 1;
    102         }
    103         else{
    104             dataset->valid = INVALID;
    105             fclose(fp);
    106             return dataset;
    107         }
    108 
    109     }
    110 
    111     fclose(fp);
    112     return dataset;
    113 }
    114 
    115 
    116 int main(int argc, char ** argv){
    117     Inputs input = inputParse(argc,argv);
    118     if (input.k == INVALID){
    119         printf("Usage: knn {training_dataset.csv} {prediction_samples.csv} {k}\n");
    120         return -1;
    121     }
    122 
    123     printInputs(input);
    124     Dataset* trainingData = readCSV(input.trainingDatasetPath, TRUE);
    125 
    126     if(trainingData->valid == INVALID){
    127         free(trainingData);
    128         printf("Invalid training dataset");
    129         return -1;
    130     }
    131 
    132     free(trainingData);
    133 }