commit 9bb96b95ae2ca9a32dd5afbd536aeefae357cc08
parent e026d0e3698bb995921fa61158c09c7316c97bd1
Author: Andrew <andrewlaack1@gmail.com>
Date: Fri, 17 May 2024 20:43:42 -0500
Completed implementation of Pearson's correlation coefficient calculation.
Diffstat:
1 file changed, 93 insertions(+), 6 deletions(-)
diff --git a/correlationCoefficient/CorrelationCoefficient.ipynb b/correlationCoefficient/CorrelationCoefficient.ipynb
@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": 64,
"metadata": {},
"outputs": [],
"source": [
@@ -13,7 +13,7 @@
},
{
"cell_type": "code",
- "execution_count": 13,
+ "execution_count": 65,
"metadata": {},
"outputs": [
{
@@ -181,7 +181,7 @@
"max 500001.000000 "
]
},
- "execution_count": 13,
+ "execution_count": 65,
"metadata": {},
"output_type": "execute_result"
}
@@ -194,10 +194,97 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 66,
"metadata": {},
- "outputs": [],
- "source": []
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "0.6880752079585545\n"
+ ]
+ }
+ ],
+ "source": [
+ "import math\n",
+ "\n",
+ "# Pearson's r for median_income vs. median_house_value, computed by hand. First: the means.\n",
+ "\n",
+ "meanIncome = 0\n",
+ "meanPrice = 0\n",
+ "\n",
+ "for x in df['median_income']:\n",
+ " meanIncome += x\n",
+ "for i in df['median_house_value']:\n",
+ " meanPrice += i\n",
+ "\n",
+ "meanPrice = meanPrice / len(df['median_house_value'])\n",
+ "meanIncome = meanIncome / len(df['median_income'])\n",
+ "\n",
+ "# Subtract means from each value. Work on float copies: .values can be a view of df (so the\n",
+ "# loops below would overwrite the DataFrame) or read-only under pandas copy-on-write.\n",
+ "incomes = df['median_income'].to_numpy(dtype=float, copy=True)\n",
+ "prices = df['median_house_value'].to_numpy(dtype=float, copy=True)\n",
+ "\n",
+ "count = 0\n",
+ "for i in incomes:\n",
+ " incomes[count] = i - meanIncome\n",
+ " count += 1\n",
+ "\n",
+ "count = 0\n",
+ "for i in prices:\n",
+ " prices[count] = i - meanPrice\n",
+ " count += 1\n",
+ "\n",
+ "# Sum the products of the paired deviations (the numerator)\n",
+ "\n",
+ "count = 0\n",
+ "sumOfDeviations = 0\n",
+ "while count < len(incomes):\n",
+ " sumOfDeviations += (prices[count] * incomes[count])\n",
+ " count += 1\n",
+ "\n",
+ "# Sum of squared deviations\n",
+ "\n",
+ "squaredIncomes = 0\n",
+ "squaredPrices = 0\n",
+ "for i in incomes:\n",
+ " squaredIncomes += i * i\n",
+ "\n",
+ "for p in prices:\n",
+ " squaredPrices += p * p\n",
+ "\n",
+ "\n",
+ "# Square root of the product of the two sums of squared deviations (the denominator)\n",
+ "\n",
+ "squaredCombined = squaredIncomes * squaredPrices\n",
+ "denominator = math.sqrt(squaredCombined)\n",
+ "\n",
+ "# Final calculation\n",
+ "\n",
+ "r = sumOfDeviations / denominator\n",
+ "print(r)\n",
+ "\n",
+ "\n",
+ "# To summarize:\n",
+ "\n",
+ "# For the numerator, find the sum of the products of the deviations from the mean for each\n",
+ "# ordered pair. To do this, find the means of the x and y values, subtract each mean from\n",
+ "# the corresponding actual values, and multiply each pair of x and y deviations together.\n",
+ "# Then add these all up. This is the numerator.\n",
+ "\n",
+ "# To find the denominator we use the means we found earlier, square the deviation from the\n",
+ "# mean for each x value, and sum these together. We then do the same thing for all y values\n",
+ "# and multiply the x and y sums of squared deviations together. The last step is taking the\n",
+ "# square root. We now have the denominator.\n",
+ "\n",
+ "# The final step is to divide the numerator by the denominator to find the r value, which is\n",
+ "# Pearson's correlation coefficient, generally denoted r, R, or the Greek letter rho (looks\n",
+ "# like a lowercase p).\n",
+ "\n",
+ "# I verified this code works correctly by checking the true correlation between median house\n",
+ "# value and median income. "
+ ]
}
],
"metadata": {