machinelearning

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs

commit 5e94d00f154d263ad235bccf885cfb1b61d207f5
parent 3196bb39b068759b5e879e3dfb28eada6978242a
Author: Andrew <andrewlaack1@gmail.com>
Date:   Fri, 24 May 2024 19:06:48 -0500

Completed some stuff

Diffstat:
MlinearRegression/LinearRegressionHousingV2.ipynb | 383++++++++++++++++++++++---------------------------------------------------------
AminMaxScaling/MinMaxScaling.ipynb | 234+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
MsigmoidFunction/Sigmoid.ipynb | 6+++---
3 files changed, 343 insertions(+), 280 deletions(-)

diff --git a/linearRegression/LinearRegressionHousingV2.ipynb b/linearRegression/LinearRegressionHousingV2.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 288, + "execution_count": 481, "metadata": {}, "outputs": [], "source": [ @@ -36,7 +36,7 @@ }, { "cell_type": "code", - "execution_count": 289, + "execution_count": 482, "metadata": {}, "outputs": [ { @@ -78,7 +78,7 @@ }, { "cell_type": "code", - "execution_count": 290, + "execution_count": 483, "metadata": {}, "outputs": [ { @@ -101,7 +101,7 @@ }, { "cell_type": "code", - "execution_count": 291, + "execution_count": 484, "metadata": {}, "outputs": [], "source": [ @@ -117,7 +117,7 @@ }, { "cell_type": "code", - "execution_count": 292, + "execution_count": 485, "metadata": {}, "outputs": [ { @@ -155,7 +155,7 @@ }, { "cell_type": "code", - "execution_count": 293, + "execution_count": 486, "metadata": {}, "outputs": [ { @@ -192,7 +192,7 @@ " <Axes: title={'center': 'median_house_value'}>]], dtype=object)" ] }, - "execution_count": 293, + "execution_count": 486, "metadata": {}, "output_type": "execute_result" }, @@ -235,7 +235,7 @@ }, { "cell_type": "code", - "execution_count": 294, + "execution_count": 487, "metadata": {}, "outputs": [], "source": [ @@ -247,7 +247,7 @@ }, { "cell_type": "code", - "execution_count": 295, + "execution_count": 488, "metadata": {}, "outputs": [], "source": [ @@ -260,7 +260,7 @@ }, { "cell_type": "code", - "execution_count": 296, + "execution_count": 489, "metadata": {}, "outputs": [ { @@ -290,7 +290,7 @@ }, { "cell_type": "code", - "execution_count": 297, + "execution_count": 490, "metadata": {}, "outputs": [ { @@ -308,7 +308,7 @@ "Name: median_house_value, dtype: float64" ] }, - "execution_count": 297, + "execution_count": 490, "metadata": {}, "output_type": "execute_result" } @@ -328,7 +328,7 @@ }, { "cell_type": "code", - "execution_count": 298, + "execution_count": 491, "metadata": {}, "outputs": [ { @@ -353,7 +353,7 @@ " dtype=object)" ] }, - "execution_count": 298, + "execution_count": 491, "metadata": {}, "output_type": "execute_result" }, @@ -394,7 +394,7 @@ }, { "cell_type": "code", - "execution_count": 299, + "execution_count": 492, "metadata": {}, "outputs": [ { @@ -403,7 +403,7 @@ "<Axes: xlabel='median_income', ylabel='median_house_value'>" ] }, - "execution_count": 299, + "execution_count": 492, "metadata": {}, "output_type": "execute_result" }, @@ -425,7 +425,7 @@ }, { "cell_type": "code", - "execution_count": 300, + "execution_count": 493, "metadata": {}, "outputs": [ { @@ -446,7 +446,7 @@ "Name: median_house_value, dtype: float64" ] }, - "execution_count": 300, + "execution_count": 493, "metadata": {}, "output_type": "execute_result" } @@ -462,7 +462,7 @@ }, { "cell_type": "code", - "execution_count": 301, + "execution_count": 494, "metadata": {}, "outputs": [ { @@ -488,271 +488,29 @@ }, { "cell_type": "code", - "execution_count": 310, + "execution_count": 495, "metadata": {}, "outputs": [ { "data": { - "text/html": [ - "<div>\n", - "<style scoped>\n", - " .dataframe tbody tr th:only-of-type {\n", - " vertical-align: middle;\n", - " }\n", - "\n", - " .dataframe tbody tr th {\n", - " vertical-align: top;\n", - " }\n", - "\n", - " .dataframe thead th {\n", - " text-align: right;\n", - " }\n", - "</style>\n", - "<table border=\"1\" class=\"dataframe\">\n", - " <thead>\n", - " <tr style=\"text-align: right;\">\n", - " <th></th>\n", - " <th>longitude</th>\n", - " <th>latitude</th>\n", - " <th>housing_median_age</th>\n", - " <th>total_rooms</th>\n", - " <th>total_bedrooms</th>\n", - " <th>population</th>\n", - " <th>households</th>\n", - " <th>median_income</th>\n", - " <th>median_house_value</th>\n", - " <th>rooms_per_house</th>\n", - " <th>bedroom_ratio</th>\n", - " <th>people_per_house</th>\n", - " <th>ocean_dist</th>\n", - " </tr>\n", - " </thead>\n", - " <tbody>\n", - " <tr>\n", - " <th>5419</th>\n", - " <td>-118.26</td>\n", - " <td>34.16</td>\n", - " <td>19.0</td>\n", - " <td>2919.0</td>\n", - " <td>857.0</td>\n", - " <td>1866.0</td>\n", - " <td>811.0</td>\n", - " <td>3.1733</td>\n", - " <td>206300.0</td>\n", - " <td>3.599260</td>\n", - " <td>0.293594</td>\n", - " <td>2.300863</td>\n", - " <td>240104.670351</td>\n", - " </tr>\n", - " <tr>\n", - " <th>3598</th>\n", - " <td>-118.34</td>\n", - " <td>33.86</td>\n", - " <td>35.0</td>\n", - " <td>1936.0</td>\n", - " <td>343.0</td>\n", - " <td>1008.0</td>\n", - " <td>346.0</td>\n", - " <td>5.4791</td>\n", - " <td>285900.0</td>\n", - " <td>5.595376</td>\n", - " <td>0.177169</td>\n", - " <td>2.913295</td>\n", - " <td>240104.670351</td>\n", - " </tr>\n", - " <tr>\n", - " <th>13269</th>\n", - " <td>-118.22</td>\n", - " <td>33.90</td>\n", - " <td>38.0</td>\n", - " <td>796.0</td>\n", - " <td>159.0</td>\n", - " <td>679.0</td>\n", - " <td>167.0</td>\n", - " <td>3.6607</td>\n", - " <td>110400.0</td>\n", - " <td>4.766467</td>\n", - " <td>0.199749</td>\n", - " <td>4.065868</td>\n", - " <td>240104.670351</td>\n", - " </tr>\n", - " <tr>\n", - " <th>16983</th>\n", - " <td>-118.15</td>\n", - " <td>34.19</td>\n", - " <td>47.0</td>\n", - " <td>1717.0</td>\n", - " <td>314.0</td>\n", - " <td>868.0</td>\n", - " <td>295.0</td>\n", - " <td>3.6094</td>\n", - " <td>160700.0</td>\n", - " <td>5.820339</td>\n", - " <td>0.182877</td>\n", - " <td>2.942373</td>\n", - " <td>240104.670351</td>\n", - " </tr>\n", - " <tr>\n", - " <th>7364</th>\n", - " <td>-117.14</td>\n", - " <td>32.93</td>\n", - " <td>14.0</td>\n", - " <td>1946.0</td>\n", - " <td>463.0</td>\n", - " <td>1205.0</td>\n", - " <td>390.0</td>\n", - " <td>4.2109</td>\n", - " <td>171200.0</td>\n", - " <td>4.989744</td>\n", - " <td>0.237924</td>\n", - " <td>3.089744</td>\n", - " <td>240104.670351</td>\n", - " </tr>\n", - " <tr>\n", - " <th>...</th>\n", - " <td>...</td>\n", - " <td>...</td>\n", - " <td>...</td>\n", - " <td>...</td>\n", - " <td>...</td>\n", - " <td>...</td>\n", - " <td>...</td>\n", - " <td>...</td>\n", - " <td>...</td>\n", - " <td>...</td>\n", - " <td>...</td>\n", - " <td>...</td>\n", - " <td>...</td>\n", - " </tr>\n", - " <tr>\n", - " <th>1153</th>\n", - " <td>-117.42</td>\n", - " <td>33.94</td>\n", - " <td>26.0</td>\n", - " <td>2420.0</td>\n", - " <td>532.0</td>\n", - " <td>1383.0</td>\n", - " <td>469.0</td>\n", - " <td>3.5403</td>\n", - " <td>113500.0</td>\n", - " <td>5.159915</td>\n", - " <td>0.219835</td>\n", - " <td>2.948827</td>\n", - " <td>125032.908322</td>\n", - " </tr>\n", - " <tr>\n", - " <th>14075</th>\n", - " <td>-121.46</td>\n", - " <td>38.60</td>\n", - " <td>29.0</td>\n", - " <td>1978.0</td>\n", - " <td>538.0</td>\n", - " <td>823.0</td>\n", - " <td>490.0</td>\n", - " <td>1.9688</td>\n", - " <td>135600.0</td>\n", - " <td>4.036735</td>\n", - " <td>0.271992</td>\n", - " <td>1.679592</td>\n", - " <td>125032.908322</td>\n", - " </tr>\n", - " <tr>\n", - " <th>7277</th>\n", - " <td>-117.09</td>\n", - " <td>32.65</td>\n", - " <td>20.0</td>\n", - " <td>1445.0</td>\n", - " <td>323.0</td>\n", - " <td>573.0</td>\n", - " <td>334.0</td>\n", - " <td>2.6190</td>\n", - " <td>145800.0</td>\n", - " <td>4.326347</td>\n", - " <td>0.223529</td>\n", - " <td>1.715569</td>\n", - " <td>250327.236143</td>\n", - " </tr>\n", - " <tr>\n", - " <th>9621</th>\n", - " <td>-122.44</td>\n", - " <td>37.79</td>\n", - " <td>52.0</td>\n", - " <td>1817.0</td>\n", - " <td>535.0</td>\n", - " <td>800.0</td>\n", - " <td>487.0</td>\n", - " <td>3.9750</td>\n", - " <td>500001.0</td>\n", - " <td>3.731006</td>\n", - " <td>0.294441</td>\n", - " <td>1.642710</td>\n", - " <td>256986.420765</td>\n", - " </tr>\n", - " <tr>\n", - " <th>9665</th>\n", - " <td>-121.33</td>\n", - " <td>38.28</td>\n", - " <td>14.0</td>\n", - " <td>980.0</td>\n", - " <td>171.0</td>\n", - " <td>659.0</td>\n", - " <td>183.0</td>\n", - " <td>4.4306</td>\n", - " <td>170100.0</td>\n", - " <td>5.355191</td>\n", - " <td>0.174490</td>\n", - " <td>3.601093</td>\n", - " <td>125032.908322</td>\n", - " </tr>\n", - " </tbody>\n", - "</table>\n", - "<p>16512 rows × 13 columns</p>\n", - "</div>" - ], "text/plain": [ - " longitude latitude housing_median_age total_rooms total_bedrooms \\\n", - "5419 -118.26 34.16 19.0 2919.0 857.0 \n", - "3598 -118.34 33.86 35.0 1936.0 343.0 \n", - "13269 -118.22 33.90 38.0 796.0 159.0 \n", - "16983 -118.15 34.19 47.0 1717.0 314.0 \n", - "7364 -117.14 32.93 14.0 1946.0 463.0 \n", - "... ... ... ... ... ... \n", - "1153 -117.42 33.94 26.0 2420.0 532.0 \n", - "14075 -121.46 38.60 29.0 1978.0 538.0 \n", - "7277 -117.09 32.65 20.0 1445.0 323.0 \n", - "9621 -122.44 37.79 52.0 1817.0 535.0 \n", - "9665 -121.33 38.28 14.0 980.0 171.0 \n", - "\n", - " population households median_income median_house_value \\\n", - "5419 1866.0 811.0 3.1733 206300.0 \n", - "3598 1008.0 346.0 5.4791 285900.0 \n", - "13269 679.0 167.0 3.6607 110400.0 \n", - "16983 868.0 295.0 3.6094 160700.0 \n", - "7364 1205.0 390.0 4.2109 171200.0 \n", - "... ... ... ... ... \n", - "1153 1383.0 469.0 3.5403 113500.0 \n", - "14075 823.0 490.0 1.9688 135600.0 \n", - "7277 573.0 334.0 2.6190 145800.0 \n", - "9621 800.0 487.0 3.9750 500001.0 \n", - "9665 659.0 183.0 4.4306 170100.0 \n", - "\n", - " rooms_per_house bedroom_ratio people_per_house ocean_dist \n", - "5419 3.599260 0.293594 2.300863 240104.670351 \n", - "3598 5.595376 0.177169 2.913295 240104.670351 \n", - "13269 4.766467 0.199749 4.065868 240104.670351 \n", - "16983 5.820339 0.182877 2.942373 240104.670351 \n", - "7364 4.989744 0.237924 3.089744 240104.670351 \n", - "... ... ... ... ... \n", - "1153 5.159915 0.219835 2.948827 125032.908322 \n", - "14075 4.036735 0.271992 1.679592 125032.908322 \n", - "7277 4.326347 0.223529 1.715569 250327.236143 \n", - "9621 3.731006 0.294441 1.642710 256986.420765 \n", - "9665 5.355191 0.174490 3.601093 125032.908322 \n", - "\n", - "[16512 rows x 13 columns]" + "median_house_value 1.000000\n", + "median_income 0.689222\n", + "ocean_dist 0.484102\n", + "rooms_per_house 0.148076\n", + "total_rooms 0.128957\n", + "housing_median_age 0.101160\n", + "households 0.059776\n", + "total_bedrooms 0.043272\n", + "people_per_house -0.021860\n", + "population -0.027846\n", + "longitude -0.046072\n", + "latitude -0.143096\n", + "bedroom_ratio -0.258569\n", + "Name: median_house_value, dtype: float64" ] }, - "execution_count": 310, + "execution_count": 495, "metadata": {}, "output_type": "execute_result" } @@ -767,6 +525,77 @@ "\n", "corr['median_house_value'].sort_values(ascending=False)\n" ] + }, + { + "cell_type": "code", + "execution_count": 496, + "metadata": {}, + "outputs": [], + "source": [ + "# Reload Data\n", + "\n", + "housing = stratTrain.drop(\"median_house_value\", axis=1)\n", + "housing_labels = stratTrain[\"median_house_value\"].copy()" + ] + }, + { + "cell_type": "code", + "execution_count": 497, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.impute import SimpleImputer\n", + "\n", + "imputer = SimpleImputer(strategy='median')\n", + "\n", + "# All sample attributes that have a type of np.number. In essence, this removes the proximity \n", + "# string attribute\n", + "housing_num = housing.select_dtypes(include=[np.number])\n", + "imputer.fit(housing_num)\n", + "\n", + "# Imputer all null values of the dataset.\n", + "X = imputer.transform(housing_num)\n", + "\n", + "# Take the numpy array and put it back into a dataframe\n", + "housing_tr = pd.DataFrame(X, columns=housing_num.columns, index=housing_num.index)" + ] + }, + { + "cell_type": "code", + "execution_count": 513, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " 0\n", + "0 (0, 0)\\t1.0\n", + "1 (0, 0)\\t1.0\n", + "2 (0, 0)\\t1.0\n", + "3 (0, 0)\\t1.0\n", + "4 (0, 0)\\t1.0\n", + "... ...\n", + "16507 (0, 1)\\t1.0\n", + "16508 (0, 1)\\t1.0\n", + "16509 (0, 4)\\t1.0\n", + "16510 (0, 3)\\t1.0\n", + "16511 (0, 1)\\t1.0\n", + "\n", + "[16512 rows x 1 columns]\n" + ] + } + ], + "source": [ + "from sklearn.preprocessing import OneHotEncoder\n", + "\n", + "# Use one hot encoding to map each option for the ocean proximity feature\n", + "# to its own boolean column\n", + "\n", + "housing_cat = housing[[\"ocean_proximity\"]]\n", + "encoder = OneHotEncoder()\n", + "housing_1hot = encoder.fit_transform(housing_cat)\n" + ] } ], "metadata": { diff --git a/minMaxScaling/MinMaxScaling.ipynb b/minMaxScaling/MinMaxScaling.ipynb @@ -0,0 +1,234 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import pathlib as path\n", + "# Load in saved csv data\n", + "df = pd.read_csv(path.Path('../datasets/housing/housing.csv'))" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# Remove string column\n", + "df.drop(columns='ocean_proximity', axis=1, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>longitude</th>\n", + " <th>latitude</th>\n", + " <th>housing_median_age</th>\n", + " <th>total_rooms</th>\n", + " <th>total_bedrooms</th>\n", + " <th>population</th>\n", + " <th>households</th>\n", + " <th>median_income</th>\n", + " <th>median_house_value</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>count</th>\n", + " <td>20640.000000</td>\n", + " <td>20640.000000</td>\n", + " <td>20640.000000</td>\n", + " <td>20640.000000</td>\n", + " <td>20433.000000</td>\n", + " <td>20640.000000</td>\n", + " <td>20640.000000</td>\n", + " <td>20640.000000</td>\n", + " <td>20640.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>mean</th>\n", + " <td>0.476125</td>\n", + " <td>0.328572</td>\n", + " <td>0.541951</td>\n", + " <td>0.066986</td>\n", + " <td>0.083313</td>\n", + " <td>0.039869</td>\n", + " <td>0.081983</td>\n", + " <td>0.232464</td>\n", + " <td>0.395579</td>\n", + " </tr>\n", + " <tr>\n", + " <th>std</th>\n", + " <td>0.199555</td>\n", + " <td>0.226988</td>\n", + " <td>0.246776</td>\n", + " <td>0.055486</td>\n", + " <td>0.065392</td>\n", + " <td>0.031740</td>\n", + " <td>0.062873</td>\n", + " <td>0.131020</td>\n", + " <td>0.237928</td>\n", + " </tr>\n", + " <tr>\n", + " <th>min</th>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>25%</th>\n", + " <td>0.253984</td>\n", + " <td>0.147715</td>\n", + " <td>0.333333</td>\n", + " <td>0.036771</td>\n", + " <td>0.045779</td>\n", + " <td>0.021974</td>\n", + " <td>0.045881</td>\n", + " <td>0.142308</td>\n", + " <td>0.215671</td>\n", + " </tr>\n", + " <tr>\n", + " <th>50%</th>\n", + " <td>0.583665</td>\n", + " <td>0.182784</td>\n", + " <td>0.549020</td>\n", + " <td>0.054046</td>\n", + " <td>0.067349</td>\n", + " <td>0.032596</td>\n", + " <td>0.067094</td>\n", + " <td>0.209301</td>\n", + " <td>0.339588</td>\n", + " </tr>\n", + " <tr>\n", + " <th>75%</th>\n", + " <td>0.631474</td>\n", + " <td>0.549416</td>\n", + " <td>0.705882</td>\n", + " <td>0.080014</td>\n", + " <td>0.100248</td>\n", + " <td>0.048264</td>\n", + " <td>0.099326</td>\n", + " <td>0.292641</td>\n", + " <td>0.514897</td>\n", + " </tr>\n", + " <tr>\n", + " <th>max</th>\n", + " <td>1.000000</td>\n", + " <td>1.000000</td>\n", + " <td>1.000000</td>\n", + " <td>1.000000</td>\n", + " <td>1.000000</td>\n", + " <td>1.000000</td>\n", + " <td>1.000000</td>\n", + " <td>1.000000</td>\n", + " <td>1.000000</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " longitude latitude housing_median_age total_rooms \\\n", + "count 20640.000000 20640.000000 20640.000000 20640.000000 \n", + "mean 0.476125 0.328572 0.541951 0.066986 \n", + "std 0.199555 0.226988 0.246776 0.055486 \n", + "min 0.000000 0.000000 0.000000 0.000000 \n", + "25% 0.253984 0.147715 0.333333 0.036771 \n", + "50% 0.583665 0.182784 0.549020 0.054046 \n", + "75% 0.631474 0.549416 0.705882 0.080014 \n", + "max 1.000000 1.000000 1.000000 1.000000 \n", + "\n", + " total_bedrooms population households median_income \\\n", + "count 20433.000000 20640.000000 20640.000000 20640.000000 \n", + "mean 0.083313 0.039869 0.081983 0.232464 \n", + "std 0.065392 0.031740 0.062873 0.131020 \n", + "min 0.000000 0.000000 0.000000 0.000000 \n", + "25% 0.045779 0.021974 0.045881 0.142308 \n", + "50% 0.067349 0.032596 0.067094 0.209301 \n", + "75% 0.100248 0.048264 0.099326 0.292641 \n", + "max 1.000000 1.000000 1.000000 1.000000 \n", + "\n", + " median_house_value \n", + "count 20640.000000 \n", + "mean 0.395579 \n", + "std 0.237928 \n", + "min 0.000000 \n", + "25% 0.215671 \n", + "50% 0.339588 \n", + "75% 0.514897 \n", + "max 1.000000 " + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# For each column (assuming they are numbers) iterate through them and set all\n", + "# features to be equal to the (current - min) / diff. \n", + "\n", + "for i in df:\n", + " min = df[i].min()\n", + " diff = df[i].max() - min\n", + " df[i] = (df[i] - min) / diff \n", + "\n", + "df.describe()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "notebook", + "language": "python", + "name": "notebook" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/sigmoidFunction/Sigmoid.ipynb b/sigmoidFunction/Sigmoid.ipynb @@ -2,16 +2,16 @@ "cells": [ { "cell_type": "code", - "execution_count": 44, + "execution_count": 45, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[<matplotlib.lines.Line2D at 0x7fe4267b2b50>]" + "[<matplotlib.lines.Line2D at 0x7fe42672a510>]" ] }, - "execution_count": 44, + "execution_count": 45, "metadata": {}, "output_type": "execute_result" },