commit 54d99d4eb66d157bea7c1e93e1713d1623d5c61d
parent 38163490de5aeae453dacdfca14964676c0968c0
Author: Andrew <andrewlaack1@gmail.com>
Date: Sat, 25 May 2024 16:15:58 -0500
Split data loading into its own cell and standardize the numeric columns
Diffstat:
1 file changed, 44 insertions(+), 40 deletions(-)
diff --git a/standardization/standardization.ipynb b/standardization/standardization.ipynb
@@ -2,7 +2,19 @@
"cells": [
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "from pathlib import Path \n",
+ "\n",
+ "df = pd.read_csv(Path('../datasets/housing/housing.csv'))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
"metadata": {},
"outputs": [
{
@@ -10,53 +22,45 @@
"output_type": "stream",
"text": [
" longitude latitude housing_median_age total_rooms total_bedrooms \\\n",
- "0 -122.23 37.88 41.0 880.0 129.0 \n",
- "1 -122.22 37.86 21.0 7099.0 1106.0 \n",
- "2 -122.24 37.85 52.0 1467.0 190.0 \n",
- "3 -122.25 37.85 52.0 1274.0 235.0 \n",
- "4 -122.25 37.85 52.0 1627.0 280.0 \n",
+ "0 -1.327803 1.052523 0.982119 -0.804800 -0.970301 \n",
+ "1 -1.322812 1.043159 -0.607004 2.045841 1.348243 \n",
+ "2 -1.332794 1.038478 1.856137 -0.535733 -0.825541 \n",
+ "3 -1.337785 1.038478 1.856137 -0.624199 -0.718750 \n",
+ "4 -1.337785 1.038478 1.856137 -0.462393 -0.611959 \n",
"... ... ... ... ... ... \n",
- "20635 -121.09 39.48 25.0 1665.0 374.0 \n",
- "20636 -121.21 39.49 18.0 697.0 150.0 \n",
- "20637 -121.22 39.43 17.0 2254.0 485.0 \n",
- "20638 -121.32 39.43 18.0 1860.0 409.0 \n",
- "20639 -121.24 39.37 16.0 2785.0 616.0 \n",
+ "20635 -0.758808 1.801603 -0.289180 -0.444974 -0.388886 \n",
+ "20636 -0.818702 1.806285 -0.845373 -0.888682 -0.920466 \n",
+ "20637 -0.823693 1.778194 -0.924829 -0.174991 -0.125468 \n",
+ "20638 -0.873605 1.778194 -0.845373 -0.355591 -0.305826 \n",
+ "20639 -0.833676 1.750104 -1.004285 0.068407 0.185411 \n",
"\n",
- " population households median_income median_house_value \\\n",
- "0 322.0 126.0 8.3252 452600.0 \n",
- "1 2401.0 1138.0 8.3014 358500.0 \n",
- "2 496.0 177.0 7.2574 352100.0 \n",
- "3 558.0 219.0 5.6431 341300.0 \n",
- "4 565.0 259.0 3.8462 342200.0 \n",
- "... ... ... ... ... \n",
- "20635 845.0 330.0 1.5603 78100.0 \n",
- "20636 356.0 114.0 2.5568 77100.0 \n",
- "20637 1007.0 433.0 1.7000 92300.0 \n",
- "20638 741.0 349.0 1.8672 84700.0 \n",
- "20639 1387.0 530.0 2.3886 89400.0 \n",
+ " population households median_income median_house_value \n",
+ "0 -0.974405 -0.977009 2.344709 2.129580 \n",
+ "1 0.861418 1.669921 2.332181 1.314124 \n",
+ "2 -0.820757 -0.843616 1.782656 1.258663 \n",
+ "3 -0.766010 -0.733764 0.932945 1.165072 \n",
+ "4 -0.759828 -0.629142 -0.012881 1.172871 \n",
+ "... ... ... ... ... \n",
+ "20635 -0.512579 -0.443438 -1.216099 -1.115777 \n",
+ "20636 -0.944382 -1.008396 -0.691576 -1.124443 \n",
+ "20637 -0.369528 -0.174037 -1.142566 -0.992722 \n",
+ "20638 -0.604415 -0.393743 -1.054557 -1.058583 \n",
+ "20639 -0.033976 0.079670 -0.780111 -1.017853 \n",
"\n",
- " ocean_proximity \n",
- "0 NEAR BAY \n",
- "1 NEAR BAY \n",
- "2 NEAR BAY \n",
- "3 NEAR BAY \n",
- "4 NEAR BAY \n",
- "... ... \n",
- "20635 INLAND \n",
- "20636 INLAND \n",
- "20637 INLAND \n",
- "20638 INLAND \n",
- "20639 INLAND \n",
- "\n",
- "[20640 rows x 10 columns]\n"
+ "[20640 rows x 9 columns]\n"
]
}
],
"source": [
- "import pandas as pd\n",
- "from pathlib import Path \n",
+ "# Restrict the frame to its numeric columns\n",
+ "df = df.select_dtypes(include=['number'])\n",
"\n",
- "df = pd.read_csv(Path('../datasets/housing/housing.csv'))"
+ "# Z-score every column: subtract its mean, then divide by its standard deviation\n",
+ "for col in df.columns:\n",
+ "    values = df[col]\n",
+ "    df[col] = (values - values.mean()) / values.std()\n",
+ "\n",
+ "print(df)"
]
}
],