LinearRegressionHousingV3.ipynb (13709B)
# %% [markdown]
# # Linear regression on the housing dataset (V3)
#
# A summary of the preprocessing techniques learned so far, applied end to end:
#
# 1. Drop rows with missing values (used instead of median imputation)
# 2. One-hot encode `ocean_proximity`
# 3. Add bedroom ratio, rooms per house and people per house
# 4. Replace long-tailed columns with their logs
# 5. Split into train and test sets
# 6. Standardize all features (scaler fitted on the training rows only)
# 7. Train
# 8. Evaluate
#
# **Note on imputation.** An earlier version of this notebook median-imputed
# the whole frame and produced very bad values after standardization. The cause
# was the one-hot step: the encoded columns were concatenated without `axis=1`,
# so they were appended as extra *rows*. The imputer then filled those rows
# with column medians, creating fabricated samples. With the concat fixed there
# is nothing left to impute after `dropna()`, so the imputer step is removed
# and replaced by an assertion.

# %%
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, StandardScaler

# %%
# Configuration: everything a reader might want to tune lives here.
DATA_PATH = '../datasets/housing/housing.csv'
TARGET = 'median_house_value'
CATEGORICAL = 'ocean_proximity'
# Long-tailed columns that are replaced by their natural log.
LOG_COLUMNS = ['median_income', 'total_rooms', 'total_bedrooms', 'population', 'households']
TEST_SIZE = .1
SEED = 4

# %% [markdown]
# ## Load and clean

# %%
housing_raw = pd.read_csv(DATA_PATH)

# dropna() leaves gaps in the index; reset it so that frames built from NumPy
# arrays further down line up row-for-row with this one.
housing_clean = housing_raw.dropna().reset_index(drop=True)

# %% [markdown]
# ## One-hot encode ocean proximity

# %%
encoder = OneHotEncoder(sparse_output=False)
proximity_encoded = pd.DataFrame(
    encoder.fit_transform(housing_clean[[CATEGORICAL]]),
    columns=encoder.get_feature_names_out([CATEGORICAL]),
    index=housing_clean.index,  # keep the rows aligned with housing_clean
)

# axis=1 is essential: the default (axis=0) stacks the encoded frame underneath
# the data as new rows full of NaN instead of adding it as new columns.
housing_encoded = pd.concat(
    [housing_clean.drop(columns=CATEGORICAL), proximity_encoded],
    axis=1,
)

assert len(housing_encoded) == len(housing_clean), "one-hot concat changed the row count"
assert not housing_encoded.isna().any().any(), "unexpected missing values after encoding"

# %% [markdown]
# ## Engineered features and log transforms
#
# The ratios are computed from the raw counts, *before* the counts are logged.

# %%
housing_ratios = housing_encoded.assign(
    bedroom_ratio=lambda df: df['total_bedrooms'] / df['total_rooms'],
    rooms_per_house=lambda df: df['total_rooms'] / df['households'],
    people_per_house=lambda df: df['population'] / df['households'],
)

log_transformer = FunctionTransformer(np.log)
housing_features = housing_ratios.assign(
    **{col: log_transformer.transform(housing_ratios[col]) for col in LOG_COLUMNS}
)

# %% [markdown]
# ## Split, then standardize
#
# The split happens before scaling so that the scaler's mean and standard
# deviation come from the training rows only; fitting it on the full dataset
# would leak information about the test rows into training.

# %%
features = housing_features.drop(columns=TARGET)
target = housing_features[TARGET]

X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    features, target, test_size=TEST_SIZE, random_state=SEED
)
assert len(X_train_raw) + len(X_test_raw) == len(features)

# %%
scaler = StandardScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(X_train_raw),
    columns=scaler.get_feature_names_out(),
    index=X_train_raw.index,
)
# transform only: the test set reuses the statistics learned from the training set.
X_test = pd.DataFrame(
    scaler.transform(X_test_raw),
    columns=scaler.get_feature_names_out(),
    index=X_test_raw.index,
)

X_train.head()

# %% [markdown]
# ## Train

# %%
# y_train is a 1-D Series, so predict() returns a 1-D array. Passing a
# one-column DataFrame as y makes predict() return a 2-D array instead, which
# is why the earlier MAE computation needed a trailing [0].
model = LinearRegression().fit(X_train, y_train)

# %% [markdown]
# ## Evaluate

# %%
actual = y_test
guesses = model.predict(X_test)

rmse = np.sqrt(mean_squared_error(actual, guesses))
mae = np.abs(actual.to_numpy() - guesses).mean()

print("RMSE: " + str(rmse))
print("MAE: " + str(mae))
# "Average Error" is the MAE expressed as a percentage of the mean actual value.
print("Average Error: " + str(round(100 * (mae / actual.mean()), 2)) + "%")