machinelearning

Machine learning code
git clone git://git.laack.co/machinelearning.git
Log | Files | Refs

CorrelationCoefficient.ipynb (10294B)


      1 {
      2  "cells": [
      3   {
      4    "cell_type": "code",
      5    "execution_count": 64,
      6    "metadata": {},
      7    "outputs": [],
      8    "source": [
      9     "import pandas as pd\n",
     10     "import matplotlib.pyplot as plt\n",
     11     "from pathlib import Path"
     12    ]
     13   },
     14   {
     15    "cell_type": "code",
     16    "execution_count": 65,
     17    "metadata": {},
     18    "outputs": [
     19     {
     20      "data": {
     21       "text/html": [
     22        "<div>\n",
     23        "<style scoped>\n",
     24        "    .dataframe tbody tr th:only-of-type {\n",
     25        "        vertical-align: middle;\n",
     26        "    }\n",
     27        "\n",
     28        "    .dataframe tbody tr th {\n",
     29        "        vertical-align: top;\n",
     30        "    }\n",
     31        "\n",
     32        "    .dataframe thead th {\n",
     33        "        text-align: right;\n",
     34        "    }\n",
     35        "</style>\n",
     36        "<table border=\"1\" class=\"dataframe\">\n",
     37        "  <thead>\n",
     38        "    <tr style=\"text-align: right;\">\n",
     39        "      <th></th>\n",
     40        "      <th>longitude</th>\n",
     41        "      <th>latitude</th>\n",
     42        "      <th>housing_median_age</th>\n",
     43        "      <th>total_rooms</th>\n",
     44        "      <th>total_bedrooms</th>\n",
     45        "      <th>population</th>\n",
     46        "      <th>households</th>\n",
     47        "      <th>median_income</th>\n",
     48        "      <th>median_house_value</th>\n",
     49        "    </tr>\n",
     50        "  </thead>\n",
     51        "  <tbody>\n",
     52        "    <tr>\n",
     53        "      <th>count</th>\n",
     54        "      <td>20640.000000</td>\n",
     55        "      <td>20640.000000</td>\n",
     56        "      <td>20640.000000</td>\n",
     57        "      <td>20640.000000</td>\n",
     58        "      <td>20433.000000</td>\n",
     59        "      <td>20640.000000</td>\n",
     60        "      <td>20640.000000</td>\n",
     61        "      <td>20640.000000</td>\n",
     62        "      <td>20640.000000</td>\n",
     63        "    </tr>\n",
     64        "    <tr>\n",
     65        "      <th>mean</th>\n",
     66        "      <td>-119.569704</td>\n",
     67        "      <td>35.631861</td>\n",
     68        "      <td>28.639486</td>\n",
     69        "      <td>2635.763081</td>\n",
     70        "      <td>537.870553</td>\n",
     71        "      <td>1425.476744</td>\n",
     72        "      <td>499.539680</td>\n",
     73        "      <td>3.870671</td>\n",
     74        "      <td>206855.816909</td>\n",
     75        "    </tr>\n",
     76        "    <tr>\n",
     77        "      <th>std</th>\n",
     78        "      <td>2.003532</td>\n",
     79        "      <td>2.135952</td>\n",
     80        "      <td>12.585558</td>\n",
     81        "      <td>2181.615252</td>\n",
     82        "      <td>421.385070</td>\n",
     83        "      <td>1132.462122</td>\n",
     84        "      <td>382.329753</td>\n",
     85        "      <td>1.899822</td>\n",
     86        "      <td>115395.615874</td>\n",
     87        "    </tr>\n",
     88        "    <tr>\n",
     89        "      <th>min</th>\n",
     90        "      <td>-124.350000</td>\n",
     91        "      <td>32.540000</td>\n",
     92        "      <td>1.000000</td>\n",
     93        "      <td>2.000000</td>\n",
     94        "      <td>1.000000</td>\n",
     95        "      <td>3.000000</td>\n",
     96        "      <td>1.000000</td>\n",
     97        "      <td>0.499900</td>\n",
     98        "      <td>14999.000000</td>\n",
     99        "    </tr>\n",
    100        "    <tr>\n",
    101        "      <th>25%</th>\n",
    102        "      <td>-121.800000</td>\n",
    103        "      <td>33.930000</td>\n",
    104        "      <td>18.000000</td>\n",
    105        "      <td>1447.750000</td>\n",
    106        "      <td>296.000000</td>\n",
    107        "      <td>787.000000</td>\n",
    108        "      <td>280.000000</td>\n",
    109        "      <td>2.563400</td>\n",
    110        "      <td>119600.000000</td>\n",
    111        "    </tr>\n",
    112        "    <tr>\n",
    113        "      <th>50%</th>\n",
    114        "      <td>-118.490000</td>\n",
    115        "      <td>34.260000</td>\n",
    116        "      <td>29.000000</td>\n",
    117        "      <td>2127.000000</td>\n",
    118        "      <td>435.000000</td>\n",
    119        "      <td>1166.000000</td>\n",
    120        "      <td>409.000000</td>\n",
    121        "      <td>3.534800</td>\n",
    122        "      <td>179700.000000</td>\n",
    123        "    </tr>\n",
    124        "    <tr>\n",
    125        "      <th>75%</th>\n",
    126        "      <td>-118.010000</td>\n",
    127        "      <td>37.710000</td>\n",
    128        "      <td>37.000000</td>\n",
    129        "      <td>3148.000000</td>\n",
    130        "      <td>647.000000</td>\n",
    131        "      <td>1725.000000</td>\n",
    132        "      <td>605.000000</td>\n",
    133        "      <td>4.743250</td>\n",
    134        "      <td>264725.000000</td>\n",
    135        "    </tr>\n",
    136        "    <tr>\n",
    137        "      <th>max</th>\n",
    138        "      <td>-114.310000</td>\n",
    139        "      <td>41.950000</td>\n",
    140        "      <td>52.000000</td>\n",
    141        "      <td>39320.000000</td>\n",
    142        "      <td>6445.000000</td>\n",
    143        "      <td>35682.000000</td>\n",
    144        "      <td>6082.000000</td>\n",
    145        "      <td>15.000100</td>\n",
    146        "      <td>500001.000000</td>\n",
    147        "    </tr>\n",
    148        "  </tbody>\n",
    149        "</table>\n",
    150        "</div>"
    151       ],
    152       "text/plain": [
    153        "          longitude      latitude  housing_median_age   total_rooms  \\\n",
    154        "count  20640.000000  20640.000000        20640.000000  20640.000000   \n",
    155        "mean    -119.569704     35.631861           28.639486   2635.763081   \n",
    156        "std        2.003532      2.135952           12.585558   2181.615252   \n",
    157        "min     -124.350000     32.540000            1.000000      2.000000   \n",
    158        "25%     -121.800000     33.930000           18.000000   1447.750000   \n",
    159        "50%     -118.490000     34.260000           29.000000   2127.000000   \n",
    160        "75%     -118.010000     37.710000           37.000000   3148.000000   \n",
    161        "max     -114.310000     41.950000           52.000000  39320.000000   \n",
    162        "\n",
    163        "       total_bedrooms    population    households  median_income  \\\n",
    164        "count    20433.000000  20640.000000  20640.000000   20640.000000   \n",
    165        "mean       537.870553   1425.476744    499.539680       3.870671   \n",
    166        "std        421.385070   1132.462122    382.329753       1.899822   \n",
    167        "min          1.000000      3.000000      1.000000       0.499900   \n",
    168        "25%        296.000000    787.000000    280.000000       2.563400   \n",
    169        "50%        435.000000   1166.000000    409.000000       3.534800   \n",
    170        "75%        647.000000   1725.000000    605.000000       4.743250   \n",
    171        "max       6445.000000  35682.000000   6082.000000      15.000100   \n",
    172        "\n",
    173        "       median_house_value  \n",
    174        "count        20640.000000  \n",
    175        "mean        206855.816909  \n",
    176        "std         115395.615874  \n",
    177        "min          14999.000000  \n",
    178        "25%         119600.000000  \n",
    179        "50%         179700.000000  \n",
    180        "75%         264725.000000  \n",
    181        "max         500001.000000  "
    182       ]
    183      },
    184      "execution_count": 65,
    185      "metadata": {},
    186      "output_type": "execute_result"
    187     }
    188    ],
    189    "source": [
    190     "# Read in data\n",
    191     "df = pd.read_csv(Path(\"../datasets/housing/housing.csv\"))\n",
    192     "df.describe()"
    193    ]
    194   },
    195   {
    196    "cell_type": "code",
    197    "execution_count": 66,
    198    "metadata": {},
    199    "outputs": [
    200     {
    201      "name": "stdout",
    202      "output_type": "stream",
    203      "text": [
    204       "0.6880752079585545\n"
    205      ]
    206     }
    207    ],
    208    "source": [
    209     "import math\n",
    210     "\n",
    211     "# Manual, step-by-step computation of Pearson's correlation coefficient\n",
    212     "# between median income (x) and median house value (y). Explicit loops are\n",
    213     "# kept so that every stage of the formula stays visible.\n",
    214     "\n",
    215     "# Find means of dataset\n",
    216     "\n",
    217     "meanIncome = 0\n",
    218     "meanPrice = 0\n",
    219     "\n",
    220     "for income in df['median_income']:\n",
    221     "    meanIncome += income\n",
    222     "for price in df['median_house_value']:\n",
    223     "    meanPrice += price\n",
    224     "\n",
    225     "meanPrice = meanPrice / len(df['median_house_value'])\n",
    226     "meanIncome = meanIncome / len(df['median_income'])\n",
    227     "\n",
    228     "# Subtract means from each value\n",
    229     "# The subtraction builds new arrays, so df keeps its original values. Writing\n",
    230     "# the deviations back into `.values` would overwrite the DataFrame's own data\n",
    231     "# (or fail when pandas hands out a read-only array).\n",
    232     "\n",
    233     "incomes = df['median_income'].values - meanIncome\n",
    234     "prices = df['median_house_value'].values - meanPrice\n",
    235     "\n",
    236     "# Multiply deviations for each pair to find sum\n",
    237     "\n",
    238     "sumOfDeviations = 0\n",
    239     "for incomeDeviation, priceDeviation in zip(incomes, prices):\n",
    240     "    sumOfDeviations += (priceDeviation * incomeDeviation)\n",
    241     "\n",
    242     "# Sum of squared deviations\n",
    243     "\n",
    244     "squaredIncomes = 0\n",
    245     "squaredPrices = 0\n",
    246     "for incomeDeviation in incomes:\n",
    247     "    squaredIncomes += incomeDeviation * incomeDeviation\n",
    248     "\n",
    249     "for priceDeviation in prices:\n",
    250     "    squaredPrices += priceDeviation * priceDeviation\n",
    251     "\n",
    252     "# Sqrt of squared deviations\n",
    253     "\n",
    254     "squaredCombined = squaredIncomes * squaredPrices\n",
    255     "denominator = math.sqrt(squaredCombined)\n",
    256     "\n",
    257     "# Final calculation\n",
    258     "\n",
    259     "r = sumOfDeviations / denominator\n",
    260     "print(r)\n",
    261     "\n",
    262     "# Cross-check against pandas' built-in Pearson correlation. df is unchanged\n",
    263     "# by the steps above, so this compares against the original data.\n",
    264     "\n",
    265     "pandasR = df['median_income'].corr(df['median_house_value'])\n",
    266     "assert math.isclose(r, pandasR, rel_tol=1e-9), (r, pandasR)\n",
    267     "\n",
    268     "\n",
    269     "# To summarize:\n",
    270     "\n",
    271     "# For the numerator, take each ordered pair's deviations from the mean and sum their\n",
    272     "# products. To do this find the mean of the x and y values, subtract the mean from each\n",
    273     "# actual value, and multiply the corresponding x and y deviations together. You then\n",
    274     "# add these all up. This is the numerator.\n",
    275     "\n",
    276     "# To find the denominator we use the means we found earlier and square the deviation\n",
    277     "# of each x value and sum these together. We then do the same thing for all y values and\n",
    278     "# multiply the x and y sums of squared deviations together. The last step is taking the\n",
    279     "# square root. We now have the denominator.\n",
    280     "\n",
    281     "# The final step is to divide the numerator by the denominator to find the r value, which is\n",
    282     "# Pearson's correlation coefficient, generally denoted r, R, or the Greek letter rho (looks\n",
    283     "# like a lowercase p).\n",
    284     "\n",
    285     "# I verified this code works correctly by checking the true correlation between median house\n",
    286     "# value and median income. "
    287    ]
    288   }
    289  ],
    290  "metadata": {
    291   "kernelspec": {
    292    "display_name": "notebook",
    293    "language": "python",
    294    "name": "notebook"
    295   },
    296   "language_info": {
    297    "codemirror_mode": {
    298     "name": "ipython",
    299     "version": 3
    300    },
    301    "file_extension": ".py",
    302    "mimetype": "text/x-python",
    303    "name": "python",
    304    "nbconvert_exporter": "python",
    305    "pygments_lexer": "ipython3",
    306    "version": "3.11.2"
    307   }
    308  },
    309  "nbformat": 4,
    310  "nbformat_minor": 2
    311 }