RandomForestHousing.ipynb (13681B)
# %% [markdown]
# # California housing: random forest regressor
#
# Predicts `median_house_value` for each census block in `housing.csv`.
#
# Stages: load -> drop incomplete rows -> one-hot encode `ocean_proximity` ->
# add ratio features -> log-transform skewed columns -> standardise features ->
# train/test split -> fit `RandomForestRegressor` -> report RMSE / MAE -> save the model.
#
# Every stage builds a new, named frame from the previous one, so the notebook
# survives "Restart Kernel -> Run All" and any cell can be re-run on its own.

# %%
import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# %%
# Configuration: everything a reader might want to tune lives here.
DATA_PATH = '../datasets/housing/housing.csv'
MODEL_PATH = '../models/CaliforniaHousingRandomForestModel.pkl'

TARGET = 'median_house_value'
CATEGORICAL = 'ocean_proximity'
# Columns replaced by their natural logarithm before scaling.
LOG_COLUMNS = ['median_income', 'total_rooms', 'total_bedrooms', 'population', 'households']

TEST_SIZE = .1
RANDOM_STATE = 4  # shared by the split and the forest so a re-run reproduces the metrics

# %% [markdown]
# ## Load data

# %%
housing_raw = pd.read_csv(DATA_PATH)

# Rows with any missing value are discarded. The index is reset so that frames
# rebuilt from NumPy arrays further down line up with this one row for row.
housing_complete = housing_raw.dropna().reset_index(drop=True)

# %% [markdown]
# ## One-hot encode `ocean_proximity`

# %%
encoder = OneHotEncoder(sparse_output=False)
proximity_onehot = pd.DataFrame(
    encoder.fit_transform(housing_complete[[CATEGORICAL]]),
    columns=encoder.get_feature_names_out([CATEGORICAL]),
    # Share the source index: concat aligns on index labels, not on position.
    index=housing_complete.index,
)

# axis=1 attaches the indicator columns next to the existing columns.
# The default (axis=0) would append them as extra rows, doubling the row count
# and leaving NaN in every cell the two frames do not share.
housing_encoded = pd.concat(
    [housing_complete.drop(columns=CATEGORICAL), proximity_onehot],
    axis=1,
)

# Encoding must add columns only. No imputation step is needed afterwards:
# incomplete rows were dropped on load and the encoder output is dense.
assert len(housing_encoded) == len(housing_complete), "encoding changed the row count"
assert not housing_encoded.isna().any().any(), "unexpected missing values after encoding"

# %% [markdown]
# ## Feature engineering

# %%
# Ratios are computed from the raw counts, i.e. before the log transform below.
housing_features = housing_encoded.assign(
    bedroom_ratio=lambda df: df['total_bedrooms'] / df['total_rooms'],
    rooms_per_house=lambda df: df['total_rooms'] / df['households'],
    people_per_house=lambda df: df['population'] / df['households'],
)

# %%
# np.log yields -inf / NaN for non-positive input, which would only surface
# later as an opaque error inside the scaler or the forest. Fail here instead.
assert (housing_features[LOG_COLUMNS] > 0).all().all(), "log transform needs strictly positive values"

housing_logged = housing_features.copy()
housing_logged[LOG_COLUMNS] = np.log(housing_logged[LOG_COLUMNS])

# %% [markdown]
# ## Standardise features
#
# Only the features are scaled; the target stays in its original units so the
# error metrics below are directly interpretable.

# %%
# NOTE(review): the scaler is fit on the full dataset before the train/test
# split, so test-set statistics leak into the scaling. Consider fitting on
# `train` only.
scaler = StandardScaler()

target = housing_logged[TARGET]
unscaled_features = housing_logged.drop(columns=TARGET)

scaled_features = pd.DataFrame(
    scaler.fit_transform(unscaled_features),
    columns=scaler.get_feature_names_out(),
    index=unscaled_features.index,
)

# Final modelling frame: scaled feature columns followed by the unscaled target.
fullDataset = pd.concat([scaled_features, target], axis=1)

# %% [markdown]
# ## Train / test split

# %%
train, test = train_test_split(fullDataset, test_size=TEST_SIZE, random_state=RANDOM_STATE)

assert len(train) + len(test) == len(fullDataset)

train.head()

# %% [markdown]
# ## Fit the model

# %%
features = [column for column in fullDataset.columns if column != TARGET]

# random_state makes the fitted forest (and therefore the metrics) reproducible.
reg = RandomForestRegressor(random_state=RANDOM_STATE)

# y is passed as a 1-D Series; a one-column DataFrame triggers sklearn's
# DataConversionWarning ("A column-vector y was passed ...").
model = reg.fit(X=train[features], y=train[TARGET])

# %% [markdown]
# ## Evaluate on the held-out set

# %%
# `test` itself is left untouched so this cell can be re-run safely.
actual: pd.Series = test[TARGET]
guesses = model.predict(X=test[features])

# %%
rmse = np.sqrt(mean_squared_error(actual, guesses))
mae = mean_absolute_error(actual, guesses)

print(f"RMSE: {rmse}")
print(f"MAE: {mae}")
# MAE expressed as a percentage of the mean actual house value.
print(f"Average Error: {round(100 * (mae / actual.mean()), 2)}%")

# %% [markdown]
# ## Persist the model

# %%
# NOTE(review): only the forest is saved. The fitted `encoder` and `scaler` are
# needed to prepare new inputs the same way; consider saving them alongside.
joblib.dump(model, MODEL_PATH)