machinelearning

Machine learning code
git clone git://git.laack.co/machinelearning.git
Log | Files | Refs

LinearRegressionHousingV3.ipynb (13709B)


      1 {
      2  "cells": [
      3   {
      4    "cell_type": "markdown",
      5    "metadata": {},
      6    "source": [
      7     "This notebook is a summary of everything I have learned about preprocessing up to this point.\n",
      8     "\n",
      9     "It will do the following transformations:\n",
     10     "\n",
     11     "    1. One hot encoding ocean proximity\n",
     12     "    2. Impute medians (this badly distorted the data, so I dropped the rows with missing values instead)\n",
     13     "    3. Get bedroom ratio\n",
     14     "    4. Get rooms per house\n",
     15     "    5. Get people per house\n",
     16     "    6. Replace Long Tails with Logs\n",
     17     "    7. Standardize all values\n",
     18     "    8. Split\n",
     19     "    9. Train\n",
     20     "    10. Profit"
     21    ]
     22   },
     23   {
     24    "cell_type": "markdown",
     25    "metadata": {},
     26    "source": [
     27     "I decided to drop rows with missing values instead of imputing, because imputing gave badly distorted values after standardization."
     28    ]
     29   },
     30   {
     31    "cell_type": "code",
     32    "execution_count": 614,
     33    "metadata": {},
     34    "outputs": [],
     35    "source": [
     36     "import pandas as pd\n",
     37     "fullDataset = pd.read_csv('../datasets/housing/housing.csv')\n",
     38     "fullDataset = fullDataset.dropna()"
     39    ]
     40   },
     41   {
     42    "cell_type": "code",
     43    "execution_count": 615,
     44    "metadata": {},
     45    "outputs": [],
     46    "source": [
     47     "from sklearn.preprocessing import OneHotEncoder\n",
     48     "\n",
     49     "encoder = OneHotEncoder(sparse_output=False)\n",
     50     "prox = fullDataset[['ocean_proximity']]\n",
     51     "encodedCol = encoder.fit_transform(prox)\n",
     52     "encodedCol_df = pd.DataFrame(encodedCol, columns=encoder.get_feature_names_out(['ocean_proximity']))\n",
     53     "\n",
     54     "fullDataset = pd.concat([fullDataset, encodedCol_df] )\n",
     55     "fullDataset.drop(columns='ocean_proximity', axis=1, inplace=True)"
     56    ]
     57   },
     58   {
     59    "cell_type": "code",
     60    "execution_count": 616,
     61    "metadata": {},
     62    "outputs": [],
     63    "source": [
     64     "from sklearn.impute import SimpleImputer\n",
     65     "\n",
     66     "imputer = SimpleImputer(strategy='median')\n",
     67     "fullDataset = imputer.fit_transform(fullDataset)\n",
     68     "\n",
     69     "fullDataset = pd.DataFrame(fullDataset, columns=imputer.get_feature_names_out())\n"
     70    ]
     71   },
     72   {
     73    "cell_type": "code",
     74    "execution_count": 617,
     75    "metadata": {},
     76    "outputs": [],
     77    "source": [
     78     "fullDataset['bedroom_ratio'] = fullDataset['total_bedrooms'] / fullDataset['total_rooms']\n",
     79     "fullDataset['rooms_per_house'] = fullDataset['total_rooms'] / fullDataset['households']\n",
     80     "fullDataset['people_per_house'] = fullDataset['population'] / fullDataset['households']"
     81    ]
     82   },
     83   {
     84    "cell_type": "code",
     85    "execution_count": 618,
     86    "metadata": {},
     87    "outputs": [],
     88    "source": [
     89     "from sklearn.preprocessing import FunctionTransformer \n",
     90     "import numpy as np\n",
     91     "ft = FunctionTransformer(np.log)\n",
     92     "\n",
     93     "fullDataset['median_income'] = ft.transform(fullDataset['median_income'])\n",
     94     "fullDataset['total_rooms'] = ft.transform(fullDataset['total_rooms'])\n",
     95     "fullDataset['total_bedrooms'] = ft.transform(fullDataset['total_bedrooms'])\n",
     96     "fullDataset['population'] = ft.transform(fullDataset['population'])\n",
     97     "fullDataset['households'] = ft.transform(fullDataset['households'])"
     98    ]
     99   },
    100   {
    101    "cell_type": "code",
    102    "execution_count": 619,
    103    "metadata": {},
    104    "outputs": [],
    105    "source": [
    106     "from sklearn.preprocessing import StandardScaler\n",
    107     "\n",
    108     "std = StandardScaler()\n",
    109     "\n",
    110     "goal = fullDataset['median_house_value']\n",
    111     "\n",
    112     "fullDataset = fullDataset.drop(axis=1, columns='median_house_value')\n",
    113     "fullDataset = std.fit_transform(fullDataset)\n",
    114     "fullDataset = pd.DataFrame(fullDataset , columns=std.get_feature_names_out())\n",
    115     "\n",
    116     "fullDataset = pd.concat([fullDataset, goal], axis=1)"
    117    ]
    118   },
    119   {
    120    "cell_type": "code",
    121    "execution_count": 620,
    122    "metadata": {},
    123    "outputs": [
    124     {
    125      "data": {
    126       "text/html": [
    127        "<div>\n",
    128        "<style scoped>\n",
    129        "    .dataframe tbody tr th:only-of-type {\n",
    130        "        vertical-align: middle;\n",
    131        "    }\n",
    132        "\n",
    133        "    .dataframe tbody tr th {\n",
    134        "        vertical-align: top;\n",
    135        "    }\n",
    136        "\n",
    137        "    .dataframe thead th {\n",
    138        "        text-align: right;\n",
    139        "    }\n",
    140        "</style>\n",
    141        "<table border=\"1\" class=\"dataframe\">\n",
    142        "  <thead>\n",
    143        "    <tr style=\"text-align: right;\">\n",
    144        "      <th></th>\n",
    145        "      <th>longitude</th>\n",
    146        "      <th>latitude</th>\n",
    147        "      <th>housing_median_age</th>\n",
    148        "      <th>total_rooms</th>\n",
    149        "      <th>total_bedrooms</th>\n",
    150        "      <th>population</th>\n",
    151        "      <th>households</th>\n",
    152        "      <th>median_income</th>\n",
    153        "      <th>ocean_proximity_&lt;1H OCEAN</th>\n",
    154        "      <th>ocean_proximity_INLAND</th>\n",
    155        "      <th>ocean_proximity_ISLAND</th>\n",
    156        "      <th>ocean_proximity_NEAR BAY</th>\n",
    157        "      <th>ocean_proximity_NEAR OCEAN</th>\n",
    158        "      <th>bedroom_ratio</th>\n",
    159        "      <th>rooms_per_house</th>\n",
    160        "      <th>people_per_house</th>\n",
    161        "      <th>median_house_value</th>\n",
    162        "    </tr>\n",
    163        "  </thead>\n",
    164        "  <tbody>\n",
    165        "    <tr>\n",
    166        "      <th>40800</th>\n",
    167        "      <td>0.356367</td>\n",
    168        "      <td>-0.413793</td>\n",
    169        "      <td>0.020600</td>\n",
    170        "      <td>0.031705</td>\n",
    171        "      <td>0.023426</td>\n",
    172        "      <td>0.035975</td>\n",
    173        "      <td>0.031885</td>\n",
    174        "      <td>0.027803</td>\n",
    175        "      <td>-0.532731</td>\n",
    176        "      <td>2.300206</td>\n",
    177        "      <td>-0.011062</td>\n",
    178        "      <td>-0.242517</td>\n",
    179        "      <td>-0.262159</td>\n",
    180        "      <td>-0.103414</td>\n",
    181        "      <td>-0.065604</td>\n",
    182        "      <td>-0.014948</td>\n",
    183        "      <td>179700.0</td>\n",
    184        "    </tr>\n",
    185        "    <tr>\n",
    186        "      <th>18285</th>\n",
    187        "      <td>-1.437524</td>\n",
    188        "      <td>2.340363</td>\n",
    189        "      <td>-0.877729</td>\n",
    190        "      <td>1.068025</td>\n",
    191        "      <td>0.768423</td>\n",
    192        "      <td>0.821632</td>\n",
    193        "      <td>0.780280</td>\n",
    194        "      <td>0.048288</td>\n",
    195        "      <td>-0.532731</td>\n",
    196        "      <td>-0.434744</td>\n",
    197        "      <td>-0.011062</td>\n",
    198        "      <td>-0.242517</td>\n",
    199        "      <td>-0.262159</td>\n",
    200        "      <td>-0.858356</td>\n",
    201        "      <td>0.457610</td>\n",
    202        "      <td>-0.006201</td>\n",
    203        "      <td>151900.0</td>\n",
    204        "    </tr>\n",
    205        "    <tr>\n",
    206        "      <th>27106</th>\n",
    207        "      <td>0.356367</td>\n",
    208        "      <td>-0.413793</td>\n",
    209        "      <td>0.020600</td>\n",
    210        "      <td>0.031705</td>\n",
    211        "      <td>0.023426</td>\n",
    212        "      <td>0.035975</td>\n",
    213        "      <td>0.031885</td>\n",
    214        "      <td>0.027803</td>\n",
    215        "      <td>1.877119</td>\n",
    216        "      <td>-0.434744</td>\n",
    217        "      <td>-0.011062</td>\n",
    218        "      <td>-0.242517</td>\n",
    219        "      <td>-0.262159</td>\n",
    220        "      <td>-0.103414</td>\n",
    221        "      <td>-0.065604</td>\n",
    222        "      <td>-0.014948</td>\n",
    223        "      <td>179700.0</td>\n",
    224        "    </tr>\n",
    225        "    <tr>\n",
    226        "      <th>12339</th>\n",
    227        "      <td>-0.843957</td>\n",
    228        "      <td>1.303788</td>\n",
    229        "      <td>1.031220</td>\n",
    230        "      <td>-0.394040</td>\n",
    231        "      <td>-0.687060</td>\n",
    232        "      <td>-0.286809</td>\n",
    233        "      <td>-0.712446</td>\n",
    234        "      <td>-1.129927</td>\n",
    235        "      <td>-0.532731</td>\n",
    236        "      <td>-0.434744</td>\n",
    237        "      <td>-0.011062</td>\n",
    238        "      <td>-0.242517</td>\n",
    239        "      <td>-0.262159</td>\n",
    240        "      <td>-0.759334</td>\n",
    241        "      <td>0.446049</td>\n",
    242        "      <td>0.078841</td>\n",
    243        "      <td>112500.0</td>\n",
    244        "    </tr>\n",
    245        "    <tr>\n",
    246        "      <th>1501</th>\n",
    247        "      <td>1.372026</td>\n",
    248        "      <td>-0.727176</td>\n",
    249        "      <td>-0.990020</td>\n",
    250        "      <td>0.123144</td>\n",
    251        "      <td>0.005602</td>\n",
    252        "      <td>-0.212510</td>\n",
    253        "      <td>-0.015836</td>\n",
    254        "      <td>-1.068128</td>\n",
    255        "      <td>-0.532731</td>\n",
    256        "      <td>-0.434744</td>\n",
    257        "      <td>-0.011062</td>\n",
    258        "      <td>-0.242517</td>\n",
    259        "      <td>-0.262159</td>\n",
    260        "      <td>-0.382393</td>\n",
    261        "      <td>0.159467</td>\n",
    262        "      <td>-0.053481</td>\n",
    263        "      <td>89400.0</td>\n",
    264        "    </tr>\n",
    265        "  </tbody>\n",
    266        "</table>\n",
    267        "</div>"
    268       ],
    269       "text/plain": [
    270        "       longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \\\n",
    271        "40800   0.356367 -0.413793            0.020600     0.031705        0.023426   \n",
    272        "18285  -1.437524  2.340363           -0.877729     1.068025        0.768423   \n",
    273        "27106   0.356367 -0.413793            0.020600     0.031705        0.023426   \n",
    274        "12339  -0.843957  1.303788            1.031220    -0.394040       -0.687060   \n",
    275        "1501    1.372026 -0.727176           -0.990020     0.123144        0.005602   \n",
    276        "\n",
    277        "       population  households  median_income  ocean_proximity_<1H OCEAN  \\\n",
    278        "40800    0.035975    0.031885       0.027803                  -0.532731   \n",
    279        "18285    0.821632    0.780280       0.048288                  -0.532731   \n",
    280        "27106    0.035975    0.031885       0.027803                   1.877119   \n",
    281        "12339   -0.286809   -0.712446      -1.129927                  -0.532731   \n",
    282        "1501    -0.212510   -0.015836      -1.068128                  -0.532731   \n",
    283        "\n",
    284        "       ocean_proximity_INLAND  ocean_proximity_ISLAND  \\\n",
    285        "40800                2.300206               -0.011062   \n",
    286        "18285               -0.434744               -0.011062   \n",
    287        "27106               -0.434744               -0.011062   \n",
    288        "12339               -0.434744               -0.011062   \n",
    289        "1501                -0.434744               -0.011062   \n",
    290        "\n",
    291        "       ocean_proximity_NEAR BAY  ocean_proximity_NEAR OCEAN  bedroom_ratio  \\\n",
    292        "40800                 -0.242517                   -0.262159      -0.103414   \n",
    293        "18285                 -0.242517                   -0.262159      -0.858356   \n",
    294        "27106                 -0.242517                   -0.262159      -0.103414   \n",
    295        "12339                 -0.242517                   -0.262159      -0.759334   \n",
    296        "1501                  -0.242517                   -0.262159      -0.382393   \n",
    297        "\n",
    298        "       rooms_per_house  people_per_house  median_house_value  \n",
    299        "40800        -0.065604         -0.014948            179700.0  \n",
    300        "18285         0.457610         -0.006201            151900.0  \n",
    301        "27106        -0.065604         -0.014948            179700.0  \n",
    302        "12339         0.446049          0.078841            112500.0  \n",
    303        "1501          0.159467         -0.053481             89400.0  "
    304       ]
    305      },
    306      "execution_count": 620,
    307      "metadata": {},
    308      "output_type": "execute_result"
    309     }
    310    ],
    311    "source": [
    312     "from sklearn.model_selection import train_test_split\n",
    313     "\n",
    314     "train, test = train_test_split(fullDataset, test_size=.1, random_state=4)\n",
    315     "\n",
    316     "train.head()"
    317    ]
    318   },
    319   {
    320    "cell_type": "code",
    321    "execution_count": 621,
    322    "metadata": {},
    323    "outputs": [],
    324    "source": [
    325     "from sklearn.linear_model import LinearRegression\n",
    326     "\n",
    327     "reg = LinearRegression()\n",
    328     "\n",
    329     "features = train.columns.to_list()\n",
    330     "prediction = ['median_house_value']\n",
    331     "features.remove('median_house_value')\n",
    332     "model = reg.fit(X=train[features] , y=train[prediction])"
    333    ]
    334   },
    335   {
    336    "cell_type": "code",
    337    "execution_count": 622,
    338    "metadata": {},
    339    "outputs": [],
    340    "source": [
    341     "actual : pd.Series = test['median_house_value']\n",
    342     "test = test.drop('median_house_value', axis=1)\n",
    343     "guesses = model.predict(X=test)"
    344    ]
    345   },
    346   {
    347    "cell_type": "code",
    348    "execution_count": 623,
    349    "metadata": {},
    350    "outputs": [
    351     {
    352      "name": "stdout",
    353      "output_type": "stream",
    354      "text": [
    355       "RMSE: 49662.046191243775\n",
    356       "MAE: 26713.07387587102\n",
    357       "Average Error: 13.65%\n"
    358      ]
    359     }
    360    ],
    361    "source": [
    362     "from sklearn.metrics import mean_squared_error\n",
    363     "import numpy as np\n",
    364     "\n",
    365     "rmse = np.sqrt(mean_squared_error(actual, guesses))\n",
    366     "\n",
    367     "total_count : float = 0\n",
    368     "total_error : float = 0\n",
    369     "\n",
    370     "for val in actual:\n",
    371     "    total_error += abs(val - guesses[total_count])\n",
    372     "    total_count += 1 \n",
    373     "\n",
    374     "mae = (total_error / total_count)[0] # Not sure why this is a list...\n",
    375     "\n",
    376     "print(\"RMSE: \" + str(rmse))\n",
    377     "print(\"MAE: \" + str(mae))\n",
    378     "print(\"Average Error: \" + str(round(100 * (mae / actual.mean()), 2)) + \"%\")\n"
    379    ]
    380   }
    381  ],
    382  "metadata": {
    383   "kernelspec": {
    384    "display_name": "notebook",
    385    "language": "python",
    386    "name": "notebook"
    387   },
    388   "language_info": {
    389    "codemirror_mode": {
    390     "name": "ipython",
    391     "version": 3
    392    },
    393    "file_extension": ".py",
    394    "mimetype": "text/x-python",
    395    "name": "python",
    396    "nbconvert_exporter": "python",
    397    "pygments_lexer": "ipython3",
    398    "version": "3.11.2"
    399   }
    400  },
    401  "nbformat": 4,
    402  "nbformat_minor": 2
    403 }