MinMaxScaling.ipynb (7680B)
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Min-max scaling of the housing dataset\n",
    "\n",
    "Load the housing CSV, drop its string column, and rescale every remaining\n",
    "column to the range [0, 1] with\n",
    "$x' = \\frac{x - \\min(x)}{\\max(x) - \\min(x)}$."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pathlib import Path\n",
    "\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Location of the saved csv data, relative to this notebook.\n",
    "HOUSING_CSV = Path('../datasets/housing/housing.csv')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load in saved csv data\n",
    "housing_raw = pd.read_csv(HOUSING_CSV)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Keep numeric columns only"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Remove the string column; the scaling below assumes every column is numeric.\n",
    "# `drop` returns a new frame, so re-running this cell does not raise a KeyError.\n",
    "housing_numeric = housing_raw.drop(columns='ocean_proximity')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Min-max scale"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def min_max_scale(frame: pd.DataFrame) -> pd.DataFrame:\n",
    "    \"\"\"Return a copy of ``frame`` with each column rescaled to [0, 1].\n",
    "\n",
    "    Each value becomes ``(value - column_min) / (column_max - column_min)``.\n",
    "    The input is left unmodified. Missing values are skipped when the\n",
    "    column minimum and maximum are computed and stay missing in the result.\n",
    "    A constant column (maximum equal to minimum) is mapped to all zeros\n",
    "    rather than dividing by zero.\n",
    "\n",
    "    Args:\n",
    "        frame: DataFrame whose columns are all numeric.\n",
    "\n",
    "    Returns:\n",
    "        A new DataFrame with the same index and columns as ``frame``.\n",
    "    \"\"\"\n",
    "    col_min = frame.min()\n",
    "    col_range = frame.max() - col_min\n",
    "    # Swap a zero range for 1 so constant columns become 0 instead of NaN.\n",
    "    safe_range = col_range.where(col_range != 0, 1)\n",
    "    return (frame - col_min) / safe_range"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "housing_scaled = min_max_scale(housing_numeric)\n",
    "\n",
    "# Sanity check: every column must now span a sub-range of [0, 1].\n",
    "assert housing_scaled.min().ge(0).all() and housing_scaled.max().le(1).all()\n",
    "\n",
    "housing_scaled.describe()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "notebook",
   "language": "python",
   "name": "notebook"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}