machinelearning

Machine learning code
git clone git://git.laack.co/machinelearning.git
Log | Files | Refs

RandomForestHousing.ipynb (13681B)


      1 {
      2  "cells": [
      3   {
      4    "cell_type": "code",
      5    "execution_count": 266,
      6    "metadata": {},
      7    "outputs": [],
      8    "source": [
      9     "import pandas as pd\n",
     10     "fullDataset = pd.read_csv('../datasets/housing/housing.csv')\n",
     11     "fullDataset = fullDataset.dropna()"
     12    ]
     13   },
     14   {
     15    "cell_type": "code",
     16    "execution_count": 267,
     17    "metadata": {},
     18    "outputs": [],
     19    "source": [
     20     "from sklearn.preprocessing import OneHotEncoder\n",
     21     "\n",
     22     "encoder = OneHotEncoder(sparse_output=False)\n",
     23     "prox = fullDataset[['ocean_proximity']]\n",
     24     "encodedCol = encoder.fit_transform(prox)\n",
     25     "encodedCol_df = pd.DataFrame(encodedCol, columns=encoder.get_feature_names_out(['ocean_proximity']))\n",
     26     "\n",
     27     "fullDataset = pd.concat([fullDataset, encodedCol_df] )\n",
     28     "fullDataset.drop(columns='ocean_proximity', axis=1, inplace=True)"
     29    ]
     30   },
     31   {
     32    "cell_type": "code",
     33    "execution_count": 268,
     34    "metadata": {},
     35    "outputs": [],
     36    "source": [
     37     "from sklearn.impute import SimpleImputer\n",
     38     "\n",
     39     "imputer = SimpleImputer(strategy='median')\n",
     40     "fullDataset = imputer.fit_transform(fullDataset)\n",
     41     "\n",
     42     "fullDataset = pd.DataFrame(fullDataset, columns=imputer.get_feature_names_out())\n"
     43    ]
     44   },
     45   {
     46    "cell_type": "code",
     47    "execution_count": 269,
     48    "metadata": {},
     49    "outputs": [],
     50    "source": [
     51     "fullDataset['bedroom_ratio'] = fullDataset['total_bedrooms'] / fullDataset['total_rooms']\n",
     52     "fullDataset['rooms_per_house'] = fullDataset['total_rooms'] / fullDataset['households']\n",
     53     "fullDataset['people_per_house'] = fullDataset['population'] / fullDataset['households']"
     54    ]
     55   },
     56   {
     57    "cell_type": "code",
     58    "execution_count": 270,
     59    "metadata": {},
     60    "outputs": [],
     61    "source": [
     62     "from sklearn.preprocessing import FunctionTransformer \n",
     63     "import numpy as np\n",
     64     "ft = FunctionTransformer(np.log)\n",
     65     "\n",
     66     "fullDataset['median_income'] = ft.transform(fullDataset['median_income'])\n",
     67     "fullDataset['total_rooms'] = ft.transform(fullDataset['total_rooms'])\n",
     68     "fullDataset['total_bedrooms'] = ft.transform(fullDataset['total_bedrooms'])\n",
     69     "fullDataset['population'] = ft.transform(fullDataset['population'])\n",
     70     "fullDataset['households'] = ft.transform(fullDataset['households'])"
     71    ]
     72   },
     73   {
     74    "cell_type": "code",
     75    "execution_count": 271,
     76    "metadata": {},
     77    "outputs": [],
     78    "source": [
     79     "from sklearn.preprocessing import StandardScaler\n",
     80     "\n",
     81     "std = StandardScaler()\n",
     82     "\n",
     83     "goal = fullDataset['median_house_value']\n",
     84     "\n",
     85     "fullDataset = fullDataset.drop(axis=1, columns='median_house_value')\n",
     86     "fullDataset = std.fit_transform(fullDataset)\n",
     87     "fullDataset = pd.DataFrame(fullDataset , columns=std.get_feature_names_out())\n",
     88     "\n",
     89     "fullDataset = pd.concat([fullDataset, goal], axis=1)"
     90    ]
     91   },
     92   {
     93    "cell_type": "code",
     94    "execution_count": 272,
     95    "metadata": {},
     96    "outputs": [
     97     {
     98      "data": {
     99       "text/html": [
    100        "<div>\n",
    101        "<style scoped>\n",
    102        "    .dataframe tbody tr th:only-of-type {\n",
    103        "        vertical-align: middle;\n",
    104        "    }\n",
    105        "\n",
    106        "    .dataframe tbody tr th {\n",
    107        "        vertical-align: top;\n",
    108        "    }\n",
    109        "\n",
    110        "    .dataframe thead th {\n",
    111        "        text-align: right;\n",
    112        "    }\n",
    113        "</style>\n",
    114        "<table border=\"1\" class=\"dataframe\">\n",
    115        "  <thead>\n",
    116        "    <tr style=\"text-align: right;\">\n",
    117        "      <th></th>\n",
    118        "      <th>longitude</th>\n",
    119        "      <th>latitude</th>\n",
    120        "      <th>housing_median_age</th>\n",
    121        "      <th>total_rooms</th>\n",
    122        "      <th>total_bedrooms</th>\n",
    123        "      <th>population</th>\n",
    124        "      <th>households</th>\n",
    125        "      <th>median_income</th>\n",
    126        "      <th>ocean_proximity_&lt;1H OCEAN</th>\n",
    127        "      <th>ocean_proximity_INLAND</th>\n",
    128        "      <th>ocean_proximity_ISLAND</th>\n",
    129        "      <th>ocean_proximity_NEAR BAY</th>\n",
    130        "      <th>ocean_proximity_NEAR OCEAN</th>\n",
    131        "      <th>bedroom_ratio</th>\n",
    132        "      <th>rooms_per_house</th>\n",
    133        "      <th>people_per_house</th>\n",
    134        "      <th>median_house_value</th>\n",
    135        "    </tr>\n",
    136        "  </thead>\n",
    137        "  <tbody>\n",
    138        "    <tr>\n",
    139        "      <th>40800</th>\n",
    140        "      <td>0.356367</td>\n",
    141        "      <td>-0.413793</td>\n",
    142        "      <td>0.020600</td>\n",
    143        "      <td>0.031705</td>\n",
    144        "      <td>0.023426</td>\n",
    145        "      <td>0.035975</td>\n",
    146        "      <td>0.031885</td>\n",
    147        "      <td>0.027803</td>\n",
    148        "      <td>-0.532731</td>\n",
    149        "      <td>2.300206</td>\n",
    150        "      <td>-0.011062</td>\n",
    151        "      <td>-0.242517</td>\n",
    152        "      <td>-0.262159</td>\n",
    153        "      <td>-0.103414</td>\n",
    154        "      <td>-0.065604</td>\n",
    155        "      <td>-0.014948</td>\n",
    156        "      <td>179700.0</td>\n",
    157        "    </tr>\n",
    158        "    <tr>\n",
    159        "      <th>18285</th>\n",
    160        "      <td>-1.437524</td>\n",
    161        "      <td>2.340363</td>\n",
    162        "      <td>-0.877729</td>\n",
    163        "      <td>1.068025</td>\n",
    164        "      <td>0.768423</td>\n",
    165        "      <td>0.821632</td>\n",
    166        "      <td>0.780280</td>\n",
    167        "      <td>0.048288</td>\n",
    168        "      <td>-0.532731</td>\n",
    169        "      <td>-0.434744</td>\n",
    170        "      <td>-0.011062</td>\n",
    171        "      <td>-0.242517</td>\n",
    172        "      <td>-0.262159</td>\n",
    173        "      <td>-0.858356</td>\n",
    174        "      <td>0.457610</td>\n",
    175        "      <td>-0.006201</td>\n",
    176        "      <td>151900.0</td>\n",
    177        "    </tr>\n",
    178        "    <tr>\n",
    179        "      <th>27106</th>\n",
    180        "      <td>0.356367</td>\n",
    181        "      <td>-0.413793</td>\n",
    182        "      <td>0.020600</td>\n",
    183        "      <td>0.031705</td>\n",
    184        "      <td>0.023426</td>\n",
    185        "      <td>0.035975</td>\n",
    186        "      <td>0.031885</td>\n",
    187        "      <td>0.027803</td>\n",
    188        "      <td>1.877119</td>\n",
    189        "      <td>-0.434744</td>\n",
    190        "      <td>-0.011062</td>\n",
    191        "      <td>-0.242517</td>\n",
    192        "      <td>-0.262159</td>\n",
    193        "      <td>-0.103414</td>\n",
    194        "      <td>-0.065604</td>\n",
    195        "      <td>-0.014948</td>\n",
    196        "      <td>179700.0</td>\n",
    197        "    </tr>\n",
    198        "    <tr>\n",
    199        "      <th>12339</th>\n",
    200        "      <td>-0.843957</td>\n",
    201        "      <td>1.303788</td>\n",
    202        "      <td>1.031220</td>\n",
    203        "      <td>-0.394040</td>\n",
    204        "      <td>-0.687060</td>\n",
    205        "      <td>-0.286809</td>\n",
    206        "      <td>-0.712446</td>\n",
    207        "      <td>-1.129927</td>\n",
    208        "      <td>-0.532731</td>\n",
    209        "      <td>-0.434744</td>\n",
    210        "      <td>-0.011062</td>\n",
    211        "      <td>-0.242517</td>\n",
    212        "      <td>-0.262159</td>\n",
    213        "      <td>-0.759334</td>\n",
    214        "      <td>0.446049</td>\n",
    215        "      <td>0.078841</td>\n",
    216        "      <td>112500.0</td>\n",
    217        "    </tr>\n",
    218        "    <tr>\n",
    219        "      <th>1501</th>\n",
    220        "      <td>1.372026</td>\n",
    221        "      <td>-0.727176</td>\n",
    222        "      <td>-0.990020</td>\n",
    223        "      <td>0.123144</td>\n",
    224        "      <td>0.005602</td>\n",
    225        "      <td>-0.212510</td>\n",
    226        "      <td>-0.015836</td>\n",
    227        "      <td>-1.068128</td>\n",
    228        "      <td>-0.532731</td>\n",
    229        "      <td>-0.434744</td>\n",
    230        "      <td>-0.011062</td>\n",
    231        "      <td>-0.242517</td>\n",
    232        "      <td>-0.262159</td>\n",
    233        "      <td>-0.382393</td>\n",
    234        "      <td>0.159467</td>\n",
    235        "      <td>-0.053481</td>\n",
    236        "      <td>89400.0</td>\n",
    237        "    </tr>\n",
    238        "  </tbody>\n",
    239        "</table>\n",
    240        "</div>"
    241       ],
    242       "text/plain": [
    243        "       longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \\\n",
    244        "40800   0.356367 -0.413793            0.020600     0.031705        0.023426   \n",
    245        "18285  -1.437524  2.340363           -0.877729     1.068025        0.768423   \n",
    246        "27106   0.356367 -0.413793            0.020600     0.031705        0.023426   \n",
    247        "12339  -0.843957  1.303788            1.031220    -0.394040       -0.687060   \n",
    248        "1501    1.372026 -0.727176           -0.990020     0.123144        0.005602   \n",
    249        "\n",
    250        "       population  households  median_income  ocean_proximity_<1H OCEAN  \\\n",
    251        "40800    0.035975    0.031885       0.027803                  -0.532731   \n",
    252        "18285    0.821632    0.780280       0.048288                  -0.532731   \n",
    253        "27106    0.035975    0.031885       0.027803                   1.877119   \n",
    254        "12339   -0.286809   -0.712446      -1.129927                  -0.532731   \n",
    255        "1501    -0.212510   -0.015836      -1.068128                  -0.532731   \n",
    256        "\n",
    257        "       ocean_proximity_INLAND  ocean_proximity_ISLAND  \\\n",
    258        "40800                2.300206               -0.011062   \n",
    259        "18285               -0.434744               -0.011062   \n",
    260        "27106               -0.434744               -0.011062   \n",
    261        "12339               -0.434744               -0.011062   \n",
    262        "1501                -0.434744               -0.011062   \n",
    263        "\n",
    264        "       ocean_proximity_NEAR BAY  ocean_proximity_NEAR OCEAN  bedroom_ratio  \\\n",
    265        "40800                 -0.242517                   -0.262159      -0.103414   \n",
    266        "18285                 -0.242517                   -0.262159      -0.858356   \n",
    267        "27106                 -0.242517                   -0.262159      -0.103414   \n",
    268        "12339                 -0.242517                   -0.262159      -0.759334   \n",
    269        "1501                  -0.242517                   -0.262159      -0.382393   \n",
    270        "\n",
    271        "       rooms_per_house  people_per_house  median_house_value  \n",
    272        "40800        -0.065604         -0.014948            179700.0  \n",
    273        "18285         0.457610         -0.006201            151900.0  \n",
    274        "27106        -0.065604         -0.014948            179700.0  \n",
    275        "12339         0.446049          0.078841            112500.0  \n",
    276        "1501          0.159467         -0.053481             89400.0  "
    277       ]
    278      },
    279      "execution_count": 272,
    280      "metadata": {},
    281      "output_type": "execute_result"
    282     }
    283    ],
    284    "source": [
    285     "from sklearn.model_selection import train_test_split\n",
    286     "\n",
    287     "train, test = train_test_split(fullDataset, test_size=.1, random_state=4)\n",
    288     "\n",
    289     "train.head()"
    290    ]
    291   },
    292   {
    293    "cell_type": "code",
    294    "execution_count": 273,
    295    "metadata": {},
    296    "outputs": [
    297     {
    298      "name": "stderr",
    299      "output_type": "stream",
    300      "text": [
    301       "/home/andrew/gitRepos/myvenv/lib/python3.11/site-packages/sklearn/base.py:1474: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().\n",
    302       "  return fit_method(estimator, *args, **kwargs)\n"
    303      ]
    304     }
    305    ],
    306    "source": [
    307     "from sklearn.ensemble import RandomForestRegressor\n",
    308     "\n",
    309     "reg = RandomForestRegressor()\n",
    310     "\n",
    311     "features = train.columns.to_list()\n",
    312     "prediction = ['median_house_value']\n",
    313     "features.remove('median_house_value')\n",
    314     "model = reg.fit(X=train[features] , y=train[prediction])"
    315    ]
    316   },
    317   {
    318    "cell_type": "code",
    319    "execution_count": 274,
    320    "metadata": {},
    321    "outputs": [],
    322    "source": [
    323     "actual : pd.Series = test['median_house_value']\n",
    324     "test = test.drop('median_house_value', axis=1)\n",
    325     "guesses = model.predict(X=test)"
    326    ]
    327   },
    328   {
    329    "cell_type": "code",
    330    "execution_count": 275,
    331    "metadata": {},
    332    "outputs": [
    333     {
    334      "name": "stdout",
    335      "output_type": "stream",
    336      "text": [
    337       "RMSE: 34351.84905691449\n",
    338       "MAE: 16319.86912160511\n",
    339       "Average Error: 8.34%\n"
    340      ]
    341     }
    342    ],
    343    "source": [
    344     "from sklearn.metrics import mean_squared_error\n",
    345     "import numpy as np\n",
    346     "\n",
    347     "rmse = np.sqrt(mean_squared_error(actual, guesses))\n",
    348     "\n",
    349     "total_count : float = 0\n",
    350     "total_error : float = 0\n",
    351     "\n",
    352     "for val in actual:\n",
    353     "    total_error += abs(val - guesses[total_count])\n",
    354     "    total_count += 1 \n",
    355     "\n",
    356     "mae = (total_error / total_count)\n",
    357     "\n",
    358     "print(\"RMSE: \" + str(rmse))\n",
    359     "print(\"MAE: \" + str(mae))\n",
    360     "print(\"Average Error: \" + str(round(100 * (mae / actual.mean()), 2)) + \"%\")\n"
    361    ]
    362   },
    363   {
    364    "cell_type": "code",
    365    "execution_count": 279,
    366    "metadata": {},
    367    "outputs": [
    368     {
    369      "data": {
    370       "text/plain": [
    371        "['../models/CaliforniaHousingModel.pkl']"
    372       ]
    373      },
    374      "execution_count": 279,
    375      "metadata": {},
    376      "output_type": "execute_result"
    377     }
    378    ],
    379    "source": [
    380     "import joblib\n",
    381     "\n",
    382     "joblib.dump(model, '../models/CaliforniaHousingRandomForestModel.pkl')"
    383    ]
    384   }
    385  ],
    386  "metadata": {
    387   "kernelspec": {
    388    "display_name": "notebook",
    389    "language": "python",
    390    "name": "notebook"
    391   },
    392   "language_info": {
    393    "codemirror_mode": {
    394     "name": "ipython",
    395     "version": 3
    396    },
    397    "file_extension": ".py",
    398    "mimetype": "text/x-python",
    399    "name": "python",
    400    "nbconvert_exporter": "python",
    401    "pygments_lexer": "ipython3",
    402    "version": "3.11.2"
    403   }
    404  },
    405  "nbformat": 4,
    406  "nbformat_minor": 2
    407 }