machinelearning

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs

commit 9bb96b95ae2ca9a32dd5afbd536aeefae357cc08
parent e026d0e3698bb995921fa61158c09c7316c97bd1
Author: Andrew <andrewlaack1@gmail.com>
Date:   Fri, 17 May 2024 20:43:42 -0500

Completed implementation of Pearson's correlation coefficient calculation.

Diffstat:
M correlationCoefficient/CorrelationCoefficient.ipynb | 99 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----
1 file changed, 93 insertions(+), 6 deletions(-)

diff --git a/correlationCoefficient/CorrelationCoefficient.ipynb b/correlationCoefficient/CorrelationCoefficient.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 5, + "execution_count": 64, "metadata": {}, "outputs": [], "source": [ @@ -13,7 +13,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 65, "metadata": {}, "outputs": [ { @@ -181,7 +181,7 @@ "max 500001.000000 " ] }, - "execution_count": 13, + "execution_count": 65, "metadata": {}, "output_type": "execute_result" } @@ -194,10 +194,97 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 66, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.6880752079585545\n" + ] + } + ], + "source": [ + "import math\n", + "\n", + "# Find means of dataset\n", + "\n", + "meanIncome = 0\n", + "meanPrice = 0\n", + "\n", + "for x in df['median_income']:\n", + " meanIncome += x\n", + "for i in df['median_house_value']:\n", + " meanPrice += i\n", + "\n", + "meanPrice = meanPrice / len(df['median_house_value'])\n", + "meanIncome = meanIncome / len(df['median_income'])\n", + "\n", + "# Subtract means from each value\n", + "\n", + "incomes = df['median_income'].values\n", + "prices = df['median_house_value'].values\n", + "\n", + "count = 0\n", + "for i in incomes:\n", + " incomes[count] = i - meanIncome\n", + " count += 1\n", + "\n", + "count = 0\n", + "for i in prices:\n", + " prices[count] = i - meanPrice\n", + " count += 1\n", + "\n", + "# Multiply deviations for each pair to find sum\n", + "\n", + "count = 0\n", + "sumOfDeviations = 0\n", + "while count < len(incomes):\n", + " sumOfDeviations += (prices[count] * incomes[count])\n", + " count += 1\n", + "\n", + "# Sum of squared deviations\n", + "\n", + "squaredIncomes = 0\n", + "squaredPrices = 0\n", + "for i in incomes:\n", + " squaredIncomes += i * i\n", + "\n", + "for p in prices:\n", + " squaredPrices += p * p\n", + 
"\n", + "\n", + "# Sqrt of squared deviations\n", + "\n", + "squaredCombined = squaredIncomes * squaredPrices\n", + "denominator = math.sqrt(squaredCombined)\n", + "\n", + "# Final calculation\n", + "\n", + "r = sumOfDeviations / denominator\n", + "print(r)\n", + "\n", + "\n", + "# To summarize:\n", + "\n", + "# For the numerator, find the sum over all ordered pairs of the x and y deviations from the mean\n", + "# multiplied together. To do this, find the means of the x and y values, then subtract each mean\n", + "# from the actual values and multiply the corresponding x and y deviations together. You then\n", + "# add these all up. This is the numerator.\n", + "\n", + "# To find the denominator we use the means we found earlier and square the deviation from the mean\n", + "# for each x value and sum these together. We then do the same thing for all y values and\n", + "# multiply the x and y sums of squared deviations together. The last step is taking the square root.\n", + "# We now have the denominator.\n", + "\n", + "# The final step is to divide the numerator by the denominator to find the r value, which is\n", + "# Pearson's correlation coefficient, generally denoted by r, R, or the Greek letter rho (looks\n", + "# like a lowercase p).\n", + "\n", + "# I verified this code works correctly by checking the true correlation between median house\n", + "# value and median income. " + ] } ], "metadata": {