implemented standardization algorithm for pandas data frame. - machinelearning - Unnamed repository; edit this file 'description' to name the repository.

commit 38163490de5aeae453dacdfca14964676c0968c0
parent 5e94d00f154d263ad235bccf885cfb1b61d207f5
Author: Andrew <andrewlaack1@gmail.com>
Date:   Sat, 25 May 2024 16:15:41 -0500

implemented standardization algorithm for pandas data frame.

Diffstat:
M linearRegression/LinearRegressionHousingV2.ipynb  | 104 ++++++++++++++++++++++++++++++++++++++++++++++----------------------------------
A standardization/standardization.ipynb  | 84 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

2 files changed, 144 insertions(+), 44 deletions(-)
diff --git a/linearRegression/LinearRegressionHousingV2.ipynb b/linearRegression/LinearRegressionHousingV2.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 481,
+   "execution_count": 2,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -36,7 +36,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 482,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [
     {
@@ -78,7 +78,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 483,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [
     {
@@ -101,7 +101,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 484,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -117,7 +117,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 485,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [
     {
@@ -155,7 +155,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 486,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [
     {
@@ -192,7 +192,7 @@
        "        <Axes: title={'center': 'median_house_value'}>]], dtype=object)"
       ]
      },
-     "execution_count": 486,
+     "execution_count": 7,
      "metadata": {},
      "output_type": "execute_result"
     },
@@ -235,7 +235,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 487,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -247,7 +247,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 488,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -260,7 +260,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 489,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [
     {
@@ -290,7 +290,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 490,
+   "execution_count": 11,
    "metadata": {},
    "outputs": [
     {
@@ -308,7 +308,7 @@
        "Name: median_house_value, dtype: float64"
       ]
      },
-     "execution_count": 490,
+     "execution_count": 11,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -328,7 +328,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 491,
+   "execution_count": 12,
    "metadata": {},
    "outputs": [
     {
@@ -353,7 +353,7 @@
        "      dtype=object)"
       ]
      },
-     "execution_count": 491,
+     "execution_count": 12,
      "metadata": {},
      "output_type": "execute_result"
     },
@@ -394,7 +394,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 492,
+   "execution_count": 13,
    "metadata": {},
    "outputs": [
     {
@@ -403,7 +403,7 @@
        "<Axes: xlabel='median_income', ylabel='median_house_value'>"
       ]
      },
-     "execution_count": 492,
+     "execution_count": 13,
      "metadata": {},
      "output_type": "execute_result"
     },
@@ -425,7 +425,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 493,
+   "execution_count": 14,
    "metadata": {},
    "outputs": [
     {
@@ -446,7 +446,7 @@
        "Name: median_house_value, dtype: float64"
       ]
      },
-     "execution_count": 493,
+     "execution_count": 14,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -462,7 +462,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 494,
+   "execution_count": 15,
    "metadata": {},
    "outputs": [
     {
@@ -488,7 +488,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 495,
+   "execution_count": 16,
    "metadata": {},
    "outputs": [
     {
@@ -510,7 +510,7 @@
        "Name: median_house_value, dtype: float64"
       ]
      },
-     "execution_count": 495,
+     "execution_count": 16,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -528,7 +528,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 496,
+   "execution_count": 17,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -540,7 +540,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 497,
+   "execution_count": 18,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -562,39 +562,55 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 513,
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.preprocessing import OneHotEncoder\n",
+    "\n",
+    "# Use one hot encoding to map each option for the ocean proximity feature\n",
+    "# to its own boolean column\n",
+    "\n",
+    "housing_cat = housing[[\"ocean_proximity\"]]\n",
+    "encoder = OneHotEncoder()\n",
+    "housing_1hot = encoder.fit_transform(housing_cat)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "                   0\n",
-      "0        (0, 0)\\t1.0\n",
-      "1        (0, 0)\\t1.0\n",
-      "2        (0, 0)\\t1.0\n",
-      "3        (0, 0)\\t1.0\n",
-      "4        (0, 0)\\t1.0\n",
-      "...              ...\n",
-      "16507    (0, 1)\\t1.0\n",
-      "16508    (0, 1)\\t1.0\n",
-      "16509    (0, 4)\\t1.0\n",
-      "16510    (0, 3)\\t1.0\n",
-      "16511    (0, 1)\\t1.0\n",
-      "\n",
-      "[16512 rows x 1 columns]\n"
+      "[[ 0.23279352 -0.65568544 -0.29411765 ... -0.86955152 -0.73359645\n",
+      "  -0.63126026]\n",
+      " [ 0.21659919 -0.7194474   0.33333333 ... -0.92962924 -0.88653182\n",
+      "  -0.31322327]\n",
+      " [ 0.24089069 -0.7109458   0.45098039 ... -0.95266604 -0.94540372\n",
+      "  -0.5640336 ]\n",
+      " ...\n",
+      " [ 0.46963563 -0.97662062 -0.25490196 ... -0.96008823 -0.89047854\n",
+      "  -0.70771438]\n",
+      " [-0.61336032  0.11583422  1.         ... -0.94419354 -0.84015787\n",
+      "  -0.52068247]\n",
+      " [-0.38866397  0.21997875 -0.49019608 ... -0.95406645 -0.94014142\n",
+      "  -0.45784196]]\n"
      ]
     }
    ],
    "source": [
-    "from sklearn.preprocessing import OneHotEncoder\n",
+    "from sklearn.preprocessing import MinMaxScaler\n",
     "\n",
-    "# Use one hot encoding to map each option for the ocean proximity feature\n",
-    "# to its own boolean column\n",
+    "# Built in min max computation ranging from -1 to 1. \n",
+    "# See minMaxScaling folder for custom implementation of this from 0 to 1.\n",
     "\n",
-    "housing_cat = housing[[\"ocean_proximity\"]]\n",
-    "encoder = OneHotEncoder()\n",
-    "housing_1hot = encoder.fit_transform(housing_cat)\n"
+    "min_max = MinMaxScaler(feature_range=(-1,1))\n",
+    "housing_num_min_max = min_max.fit_transform(housing_num)\n",
+    "\n",
+    "print(housing_num_min_max)"
    ]
   }
  ],
diff --git a/standardization/standardization.ipynb b/standardization/standardization.ipynb
@@ -0,0 +1,84 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "       longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \\\n",
+      "0        -122.23     37.88                41.0        880.0           129.0   \n",
+      "1        -122.22     37.86                21.0       7099.0          1106.0   \n",
+      "2        -122.24     37.85                52.0       1467.0           190.0   \n",
+      "3        -122.25     37.85                52.0       1274.0           235.0   \n",
+      "4        -122.25     37.85                52.0       1627.0           280.0   \n",
+      "...          ...       ...                 ...          ...             ...   \n",
+      "20635    -121.09     39.48                25.0       1665.0           374.0   \n",
+      "20636    -121.21     39.49                18.0        697.0           150.0   \n",
+      "20637    -121.22     39.43                17.0       2254.0           485.0   \n",
+      "20638    -121.32     39.43                18.0       1860.0           409.0   \n",
+      "20639    -121.24     39.37                16.0       2785.0           616.0   \n",
+      "\n",
+      "       population  households  median_income  median_house_value  \\\n",
+      "0           322.0       126.0         8.3252            452600.0   \n",
+      "1          2401.0      1138.0         8.3014            358500.0   \n",
+      "2           496.0       177.0         7.2574            352100.0   \n",
+      "3           558.0       219.0         5.6431            341300.0   \n",
+      "4           565.0       259.0         3.8462            342200.0   \n",
+      "...           ...         ...            ...                 ...   \n",
+      "20635       845.0       330.0         1.5603             78100.0   \n",
+      "20636       356.0       114.0         2.5568             77100.0   \n",
+      "20637      1007.0       433.0         1.7000             92300.0   \n",
+      "20638       741.0       349.0         1.8672             84700.0   \n",
+      "20639      1387.0       530.0         2.3886             89400.0   \n",
+      "\n",
+      "      ocean_proximity  \n",
+      "0            NEAR BAY  \n",
+      "1            NEAR BAY  \n",
+      "2            NEAR BAY  \n",
+      "3            NEAR BAY  \n",
+      "4            NEAR BAY  \n",
+      "...               ...  \n",
+      "20635          INLAND  \n",
+      "20636          INLAND  \n",
+      "20637          INLAND  \n",
+      "20638          INLAND  \n",
+      "20639          INLAND  \n",
+      "\n",
+      "[20640 rows x 10 columns]\n"
+     ]
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "from pathlib import Path \n",
+    "\n",
+    "df = pd.read_csv(Path('../datasets/housing/housing.csv'))"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "notebook",
+   "language": "python",
+   "name": "notebook"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

	machinelearning Unnamed repository; edit this file 'description' to name the repository.
	Log \| Files \| Refs

M	linearRegression/LinearRegressionHousingV2.ipynb	\|	104	++++++++++++++++++++++++++++++++++++++++++++++----------------------------------
A	standardization/standardization.ipynb	\|	84	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++