CorrelationCoefficient.ipynb (10294B)
1 { 2 "cells": [ 3 { 4 "cell_type": "code", 5 "execution_count": 64, 6 "metadata": {}, 7 "outputs": [], 8 "source": [ 9 "import pandas as pd\n", 10 "import matplotlib.pyplot as plt\n", 11 "from pathlib import Path" 12 ] 13 }, 14 { 15 "cell_type": "code", 16 "execution_count": 65, 17 "metadata": {}, 18 "outputs": [ 19 { 20 "data": { 21 "text/html": [ 22 "<div>\n", 23 "<style scoped>\n", 24 " .dataframe tbody tr th:only-of-type {\n", 25 " vertical-align: middle;\n", 26 " }\n", 27 "\n", 28 " .dataframe tbody tr th {\n", 29 " vertical-align: top;\n", 30 " }\n", 31 "\n", 32 " .dataframe thead th {\n", 33 " text-align: right;\n", 34 " }\n", 35 "</style>\n", 36 "<table border=\"1\" class=\"dataframe\">\n", 37 " <thead>\n", 38 " <tr style=\"text-align: right;\">\n", 39 " <th></th>\n", 40 " <th>longitude</th>\n", 41 " <th>latitude</th>\n", 42 " <th>housing_median_age</th>\n", 43 " <th>total_rooms</th>\n", 44 " <th>total_bedrooms</th>\n", 45 " <th>population</th>\n", 46 " <th>households</th>\n", 47 " <th>median_income</th>\n", 48 " <th>median_house_value</th>\n", 49 " </tr>\n", 50 " </thead>\n", 51 " <tbody>\n", 52 " <tr>\n", 53 " <th>count</th>\n", 54 " <td>20640.000000</td>\n", 55 " <td>20640.000000</td>\n", 56 " <td>20640.000000</td>\n", 57 " <td>20640.000000</td>\n", 58 " <td>20433.000000</td>\n", 59 " <td>20640.000000</td>\n", 60 " <td>20640.000000</td>\n", 61 " <td>20640.000000</td>\n", 62 " <td>20640.000000</td>\n", 63 " </tr>\n", 64 " <tr>\n", 65 " <th>mean</th>\n", 66 " <td>-119.569704</td>\n", 67 " <td>35.631861</td>\n", 68 " <td>28.639486</td>\n", 69 " <td>2635.763081</td>\n", 70 " <td>537.870553</td>\n", 71 " <td>1425.476744</td>\n", 72 " <td>499.539680</td>\n", 73 " <td>3.870671</td>\n", 74 " <td>206855.816909</td>\n", 75 " </tr>\n", 76 " <tr>\n", 77 " <th>std</th>\n", 78 " <td>2.003532</td>\n", 79 " <td>2.135952</td>\n", 80 " <td>12.585558</td>\n", 81 " <td>2181.615252</td>\n", 82 " <td>421.385070</td>\n", 83 " <td>1132.462122</td>\n", 84 " <td>382.329753</td>\n", 85 " <td>1.899822</td>\n", 86 " <td>115395.615874</td>\n", 87 " </tr>\n", 88 " <tr>\n", 89 " <th>min</th>\n", 90 " <td>-124.350000</td>\n", 91 " <td>32.540000</td>\n", 92 " <td>1.000000</td>\n", 93 " <td>2.000000</td>\n", 94 " <td>1.000000</td>\n", 95 " <td>3.000000</td>\n", 96 " <td>1.000000</td>\n", 97 " <td>0.499900</td>\n", 98 " <td>14999.000000</td>\n", 99 " </tr>\n", 100 " <tr>\n", 101 " <th>25%</th>\n", 102 " <td>-121.800000</td>\n", 103 " <td>33.930000</td>\n", 104 " <td>18.000000</td>\n", 105 " <td>1447.750000</td>\n", 106 " <td>296.000000</td>\n", 107 " <td>787.000000</td>\n", 108 " <td>280.000000</td>\n", 109 " <td>2.563400</td>\n", 110 " <td>119600.000000</td>\n", 111 " </tr>\n", 112 " <tr>\n", 113 " <th>50%</th>\n", 114 " <td>-118.490000</td>\n", 115 " <td>34.260000</td>\n", 116 " <td>29.000000</td>\n", 117 " <td>2127.000000</td>\n", 118 " <td>435.000000</td>\n", 119 " <td>1166.000000</td>\n", 120 " <td>409.000000</td>\n", 121 " <td>3.534800</td>\n", 122 " <td>179700.000000</td>\n", 123 " </tr>\n", 124 " <tr>\n", 125 " <th>75%</th>\n", 126 " <td>-118.010000</td>\n", 127 " <td>37.710000</td>\n", 128 " <td>37.000000</td>\n", 129 " <td>3148.000000</td>\n", 130 " <td>647.000000</td>\n", 131 " <td>1725.000000</td>\n", 132 " <td>605.000000</td>\n", 133 " <td>4.743250</td>\n", 134 " <td>264725.000000</td>\n", 135 " </tr>\n", 136 " <tr>\n", 137 " <th>max</th>\n", 138 " <td>-114.310000</td>\n", 139 " <td>41.950000</td>\n", 140 " <td>52.000000</td>\n", 141 " <td>39320.000000</td>\n", 142 " <td>6445.000000</td>\n", 143 " <td>35682.000000</td>\n", 144 " <td>6082.000000</td>\n", 145 " <td>15.000100</td>\n", 146 " <td>500001.000000</td>\n", 147 " </tr>\n", 148 " </tbody>\n", 149 "</table>\n", 150 "</div>" 151 ], 152 "text/plain": [ 153 " longitude latitude housing_median_age total_rooms \\\n", 154 "count 20640.000000 20640.000000 20640.000000 20640.000000 \n", 155 "mean -119.569704 35.631861 28.639486 2635.763081 \n", 156 "std 2.003532 2.135952 12.585558 2181.615252 \n", 157 "min -124.350000 32.540000 1.000000 2.000000 \n", 158 "25% -121.800000 33.930000 18.000000 1447.750000 \n", 159 "50% -118.490000 34.260000 29.000000 2127.000000 \n", 160 "75% -118.010000 37.710000 37.000000 3148.000000 \n", 161 "max -114.310000 41.950000 52.000000 39320.000000 \n", 162 "\n", 163 " total_bedrooms population households median_income \\\n", 164 "count 20433.000000 20640.000000 20640.000000 20640.000000 \n", 165 "mean 537.870553 1425.476744 499.539680 3.870671 \n", 166 "std 421.385070 1132.462122 382.329753 1.899822 \n", 167 "min 1.000000 3.000000 1.000000 0.499900 \n", 168 "25% 296.000000 787.000000 280.000000 2.563400 \n", 169 "50% 435.000000 1166.000000 409.000000 3.534800 \n", 170 "75% 647.000000 1725.000000 605.000000 4.743250 \n", 171 "max 6445.000000 35682.000000 6082.000000 15.000100 \n", 172 "\n", 173 " median_house_value \n", 174 "count 20640.000000 \n", 175 "mean 206855.816909 \n", 176 "std 115395.615874 \n", 177 "min 14999.000000 \n", 178 "25% 119600.000000 \n", 179 "50% 179700.000000 \n", 180 "75% 264725.000000 \n", 181 "max 500001.000000 " 182 ] 183 }, 184 "execution_count": 65, 185 "metadata": {}, 186 "output_type": "execute_result" 187 } 188 ], 189 "source": [ 190 "# Read in data\n", 191 "df = pd.read_csv(Path(\"../datasets/housing/housing.csv\"))\n", 192 "df.describe()" 193 ] 194 }, 195 { 196 "cell_type": "code", 197 "execution_count": 66, 198 "metadata": {}, 199 "outputs": [ 200 { 201 "name": "stdout", 202 "output_type": "stream", 203 "text": [ 204 "0.6880752079585545\n" 205 ] 206 } 207 ], 208 "source": [ 209 "import math\n", 210 "\n", 211 "# Find means of dataset\n", 212 "\n", 213 "meanIncome = 0\n", 214 "meanPrice = 0\n", 215 "\n", 216 "for x in df['median_income']:\n", 217 " meanIncome += x\n", 218 "for i in df['median_house_value']:\n", 219 " meanPrice += i\n", 220 "\n", 221 "meanPrice = meanPrice / len(df['median_house_value'])\n", 222 "meanIncome = meanIncome / len(df['median_income'])\n", 223 "\n", 224 "# Subtract means from each value\n", 225 "\n", 226 "incomes = df['median_income'].values\n", 227 "prices = df['median_house_value'].values\n", 228 "\n", 229 "count = 0\n", 230 "for i in incomes:\n", 231 " incomes[count] = i - meanIncome\n", 232 " count += 1\n", 233 "\n", 234 "count = 0\n", 235 "for i in prices:\n", 236 " prices[count] = i - meanPrice\n", 237 " count += 1\n", 238 "\n", 239 "# Multiply deviations for each pair to find sum\n", 240 "\n", 241 "count = 0\n", 242 "sumOfDeviations = 0\n", 243 "while count < len(incomes):\n", 244 " sumOfDeviations += (prices[count] * incomes[count])\n", 245 " count += 1\n", 246 "\n", 247 "# Sum of squared deviations\n", 248 "\n", 249 "squaredIncomes = 0\n", 250 "squaredPrices = 0\n", 251 "for i in incomes:\n", 252 " squaredIncomes += i * i\n", 253 "\n", 254 "for p in prices:\n", 255 " squaredPrices += p * p\n", 256 "\n", 257 "\n", 258 "# Sqrt of squared deviations\n", 259 "\n", 260 "squaredCombined = squaredIncomes * squaredPrices\n", 261 "denominator = math.sqrt(squaredCombined)\n", 262 "\n", 263 "# Final calculation\n", 264 "\n", 265 "r = sumOfDeviations / denominator\n", 266 "print(r)\n", 267 "\n", 268 "\n", 269 "# To summarize:\n", 270 "\n", 271 "# For the numerator find the sum of the standard deviation for each ordered pair \n", 272 "# multiplied together. To do this find the mean of x and y values then subtract the actual\n", 273 "# values from the mean and multiply the corresponding x and y values together. You then\n", 274 "# add these all up. This is the numerator.\n", 275 "\n", 276 "# To find the denominator we use the mean we found earlier and square the standard deviation\n", 277 "# for each x value and sum these together. We then do the same thing for all y values and\n", 278 "# multiply the x and y squared deviations together. The last step is taking the squareroot.\n", 279 "# We now have the denominator.\n", 280 "\n", 281 "# The final step is the divide the numerator by the denominator to find the r value which is\n", 282 "# Pearsons's correlation coefficient generally defined using r, R, or the character rho (looks\n", 283 "# like a lowercase p).\n", 284 "\n", 285 "# I verified this code works correctly by checking the true correlation between median house\n", 286 "# value and median income. " 287 ] 288 } 289 ], 290 "metadata": { 291 "kernelspec": { 292 "display_name": "notebook", 293 "language": "python", 294 "name": "notebook" 295 }, 296 "language_info": { 297 "codemirror_mode": { 298 "name": "ipython", 299 "version": 3 300 }, 301 "file_extension": ".py", 302 "mimetype": "text/x-python", 303 "name": "python", 304 "nbconvert_exporter": "python", 305 "pygments_lexer": "ipython3", 306 "version": "3.11.2" 307 } 308 }, 309 "nbformat": 4, 310 "nbformat_minor": 2 311 }