commit 38163490de5aeae453dacdfca14964676c0968c0
parent 5e94d00f154d263ad235bccf885cfb1b61d207f5
Author: Andrew <andrewlaack1@gmail.com>
Date: Sat, 25 May 2024 16:15:41 -0500
Add min-max scaling cell to the housing notebook and start the standardization notebook (loads the housing CSV into a pandas DataFrame).
Diffstat:
2 files changed, 144 insertions(+), 44 deletions(-)
diff --git a/linearRegression/LinearRegressionHousingV2.ipynb b/linearRegression/LinearRegressionHousingV2.ipynb
@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
- "execution_count": 481,
+ "execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
@@ -36,7 +36,7 @@
},
{
"cell_type": "code",
- "execution_count": 482,
+ "execution_count": 3,
"metadata": {},
"outputs": [
{
@@ -78,7 +78,7 @@
},
{
"cell_type": "code",
- "execution_count": 483,
+ "execution_count": 4,
"metadata": {},
"outputs": [
{
@@ -101,7 +101,7 @@
},
{
"cell_type": "code",
- "execution_count": 484,
+ "execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
@@ -117,7 +117,7 @@
},
{
"cell_type": "code",
- "execution_count": 485,
+ "execution_count": 6,
"metadata": {},
"outputs": [
{
@@ -155,7 +155,7 @@
},
{
"cell_type": "code",
- "execution_count": 486,
+ "execution_count": 7,
"metadata": {},
"outputs": [
{
@@ -192,7 +192,7 @@
" <Axes: title={'center': 'median_house_value'}>]], dtype=object)"
]
},
- "execution_count": 486,
+ "execution_count": 7,
"metadata": {},
"output_type": "execute_result"
},
@@ -235,7 +235,7 @@
},
{
"cell_type": "code",
- "execution_count": 487,
+ "execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
@@ -247,7 +247,7 @@
},
{
"cell_type": "code",
- "execution_count": 488,
+ "execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
@@ -260,7 +260,7 @@
},
{
"cell_type": "code",
- "execution_count": 489,
+ "execution_count": 10,
"metadata": {},
"outputs": [
{
@@ -290,7 +290,7 @@
},
{
"cell_type": "code",
- "execution_count": 490,
+ "execution_count": 11,
"metadata": {},
"outputs": [
{
@@ -308,7 +308,7 @@
"Name: median_house_value, dtype: float64"
]
},
- "execution_count": 490,
+ "execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
@@ -328,7 +328,7 @@
},
{
"cell_type": "code",
- "execution_count": 491,
+ "execution_count": 12,
"metadata": {},
"outputs": [
{
@@ -353,7 +353,7 @@
" dtype=object)"
]
},
- "execution_count": 491,
+ "execution_count": 12,
"metadata": {},
"output_type": "execute_result"
},
@@ -394,7 +394,7 @@
},
{
"cell_type": "code",
- "execution_count": 492,
+ "execution_count": 13,
"metadata": {},
"outputs": [
{
@@ -403,7 +403,7 @@
"<Axes: xlabel='median_income', ylabel='median_house_value'>"
]
},
- "execution_count": 492,
+ "execution_count": 13,
"metadata": {},
"output_type": "execute_result"
},
@@ -425,7 +425,7 @@
},
{
"cell_type": "code",
- "execution_count": 493,
+ "execution_count": 14,
"metadata": {},
"outputs": [
{
@@ -446,7 +446,7 @@
"Name: median_house_value, dtype: float64"
]
},
- "execution_count": 493,
+ "execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
@@ -462,7 +462,7 @@
},
{
"cell_type": "code",
- "execution_count": 494,
+ "execution_count": 15,
"metadata": {},
"outputs": [
{
@@ -488,7 +488,7 @@
},
{
"cell_type": "code",
- "execution_count": 495,
+ "execution_count": 16,
"metadata": {},
"outputs": [
{
@@ -510,7 +510,7 @@
"Name: median_house_value, dtype: float64"
]
},
- "execution_count": 495,
+ "execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
@@ -528,7 +528,7 @@
},
{
"cell_type": "code",
- "execution_count": 496,
+ "execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
@@ -540,7 +540,7 @@
},
{
"cell_type": "code",
- "execution_count": 497,
+ "execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
@@ -562,39 +562,55 @@
},
{
"cell_type": "code",
- "execution_count": 513,
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn.preprocessing import OneHotEncoder\n",
+ "\n",
+ "# Use one hot encoding to map each option for the ocean proximity feature\n",
+ "# to its own boolean column\n",
+ "\n",
+ "housing_cat = housing[[\"ocean_proximity\"]]\n",
+ "encoder = OneHotEncoder()\n",
+ "housing_1hot = encoder.fit_transform(housing_cat)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
- " 0\n",
- "0 (0, 0)\\t1.0\n",
- "1 (0, 0)\\t1.0\n",
- "2 (0, 0)\\t1.0\n",
- "3 (0, 0)\\t1.0\n",
- "4 (0, 0)\\t1.0\n",
- "... ...\n",
- "16507 (0, 1)\\t1.0\n",
- "16508 (0, 1)\\t1.0\n",
- "16509 (0, 4)\\t1.0\n",
- "16510 (0, 3)\\t1.0\n",
- "16511 (0, 1)\\t1.0\n",
- "\n",
- "[16512 rows x 1 columns]\n"
+ "[[ 0.23279352 -0.65568544 -0.29411765 ... -0.86955152 -0.73359645\n",
+ " -0.63126026]\n",
+ " [ 0.21659919 -0.7194474 0.33333333 ... -0.92962924 -0.88653182\n",
+ " -0.31322327]\n",
+ " [ 0.24089069 -0.7109458 0.45098039 ... -0.95266604 -0.94540372\n",
+ " -0.5640336 ]\n",
+ " ...\n",
+ " [ 0.46963563 -0.97662062 -0.25490196 ... -0.96008823 -0.89047854\n",
+ " -0.70771438]\n",
+ " [-0.61336032 0.11583422 1. ... -0.94419354 -0.84015787\n",
+ " -0.52068247]\n",
+ " [-0.38866397 0.21997875 -0.49019608 ... -0.95406645 -0.94014142\n",
+ " -0.45784196]]\n"
]
}
],
"source": [
- "from sklearn.preprocessing import OneHotEncoder\n",
+ "from sklearn.preprocessing import MinMaxScaler\n",
"\n",
- "# Use one hot encoding to map each option for the ocean proximity feature\n",
- "# to its own boolean column\n",
+ "# Built-in min-max scaling of the numeric features to the range [-1, 1].\n",
+ "# See the minMaxScaling folder for a custom implementation that scales to [0, 1].\n",
"\n",
- "housing_cat = housing[[\"ocean_proximity\"]]\n",
- "encoder = OneHotEncoder()\n",
- "housing_1hot = encoder.fit_transform(housing_cat)\n"
+ "min_max = MinMaxScaler(feature_range=(-1,1))\n",
+ "housing_num_min_max = min_max.fit_transform(housing_num)\n",
+ "\n",
+ "print(housing_num_min_max)"
]
}
],
diff --git a/standardization/standardization.ipynb b/standardization/standardization.ipynb
@@ -0,0 +1,84 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " longitude latitude housing_median_age total_rooms total_bedrooms \\\n",
+ "0 -122.23 37.88 41.0 880.0 129.0 \n",
+ "1 -122.22 37.86 21.0 7099.0 1106.0 \n",
+ "2 -122.24 37.85 52.0 1467.0 190.0 \n",
+ "3 -122.25 37.85 52.0 1274.0 235.0 \n",
+ "4 -122.25 37.85 52.0 1627.0 280.0 \n",
+ "... ... ... ... ... ... \n",
+ "20635 -121.09 39.48 25.0 1665.0 374.0 \n",
+ "20636 -121.21 39.49 18.0 697.0 150.0 \n",
+ "20637 -121.22 39.43 17.0 2254.0 485.0 \n",
+ "20638 -121.32 39.43 18.0 1860.0 409.0 \n",
+ "20639 -121.24 39.37 16.0 2785.0 616.0 \n",
+ "\n",
+ " population households median_income median_house_value \\\n",
+ "0 322.0 126.0 8.3252 452600.0 \n",
+ "1 2401.0 1138.0 8.3014 358500.0 \n",
+ "2 496.0 177.0 7.2574 352100.0 \n",
+ "3 558.0 219.0 5.6431 341300.0 \n",
+ "4 565.0 259.0 3.8462 342200.0 \n",
+ "... ... ... ... ... \n",
+ "20635 845.0 330.0 1.5603 78100.0 \n",
+ "20636 356.0 114.0 2.5568 77100.0 \n",
+ "20637 1007.0 433.0 1.7000 92300.0 \n",
+ "20638 741.0 349.0 1.8672 84700.0 \n",
+ "20639 1387.0 530.0 2.3886 89400.0 \n",
+ "\n",
+ " ocean_proximity \n",
+ "0 NEAR BAY \n",
+ "1 NEAR BAY \n",
+ "2 NEAR BAY \n",
+ "3 NEAR BAY \n",
+ "4 NEAR BAY \n",
+ "... ... \n",
+ "20635 INLAND \n",
+ "20636 INLAND \n",
+ "20637 INLAND \n",
+ "20638 INLAND \n",
+ "20639 INLAND \n",
+ "\n",
+ "[20640 rows x 10 columns]\n"
+ ]
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "from pathlib import Path \n",
+ "\n",
+ "df = pd.read_csv(Path('../datasets/housing/housing.csv'))"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "notebook",
+ "language": "python",
+ "name": "notebook"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.2"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}