machinelearning

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs

commit 2d7379654f2f5b1fe99e02c111c5a1ba09b3a371
parent d757eb637359b9d26dbd2cc59f06e36fa7945b69
Author: Andrew <andrewlaack1@gmail.com>
Date:   Thu,  8 Aug 2024 18:11:02 -0500

Wanted to mess with one hot encoder.

Diffstat:
A dataViz/testingOneHotEncodingAndGraphing.ipynb | 517 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 517 insertions(+), 0 deletions(-)

diff --git a/dataViz/testingOneHotEncodingAndGraphing.ipynb b/dataViz/testingOneHotEncodingAndGraphing.ipynb @@ -0,0 +1,517 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 135, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>0</th>\n", + " <th>1</th>\n", + " <th>2</th>\n", + " <th>3</th>\n", + " <th>4</th>\n", + " <th>5</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>0.947152</td>\n", + " <td>0.182652</td>\n", + " <td>0.116222</td>\n", + " <td>0.249947</td>\n", + " <td>0.212487</td>\n", + " <td>cheese</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>0.932965</td>\n", + " <td>0.404326</td>\n", + " <td>0.111577</td>\n", + " <td>0.098454</td>\n", + " <td>0.684146</td>\n", + " <td>cheese</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>0.831340</td>\n", + " <td>0.641161</td>\n", + " <td>0.722503</td>\n", + " <td>0.719412</td>\n", + " <td>0.749303</td>\n", + " <td>pepper</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>0.088983</td>\n", + " <td>0.099260</td>\n", + " <td>0.832301</td>\n", + " <td>0.269737</td>\n", + " <td>0.382743</td>\n", + " <td>cheese</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>0.569650</td>\n", + " <td>0.321217</td>\n", + " <td>0.849422</td>\n", + " <td>0.765569</td>\n", + " <td>0.082142</td>\n", + " <td>cheese</td>\n", + " </tr>\n", + " <tr>\n", + " <th>...</th>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " 
<td>...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>95</th>\n", + " <td>0.999064</td>\n", + " <td>0.421309</td>\n", + " <td>0.795260</td>\n", + " <td>0.200927</td>\n", + " <td>0.811947</td>\n", + " <td>pepper</td>\n", + " </tr>\n", + " <tr>\n", + " <th>96</th>\n", + " <td>0.913032</td>\n", + " <td>0.158652</td>\n", + " <td>0.072846</td>\n", + " <td>0.320127</td>\n", + " <td>0.847452</td>\n", + " <td>cheese</td>\n", + " </tr>\n", + " <tr>\n", + " <th>97</th>\n", + " <td>0.453406</td>\n", + " <td>0.829704</td>\n", + " <td>0.076251</td>\n", + " <td>0.327103</td>\n", + " <td>0.698135</td>\n", + " <td>cheese</td>\n", + " </tr>\n", + " <tr>\n", + " <th>98</th>\n", + " <td>0.465324</td>\n", + " <td>0.410674</td>\n", + " <td>0.752463</td>\n", + " <td>0.858177</td>\n", + " <td>0.078763</td>\n", + " <td>cheese</td>\n", + " </tr>\n", + " <tr>\n", + " <th>99</th>\n", + " <td>0.931791</td>\n", + " <td>0.349814</td>\n", + " <td>0.202655</td>\n", + " <td>0.480509</td>\n", + " <td>0.253459</td>\n", + " <td>cheese</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>100 rows × 6 columns</p>\n", + "</div>" + ], + "text/plain": [ + " 0 1 2 3 4 5\n", + "0 0.947152 0.182652 0.116222 0.249947 0.212487 cheese\n", + "1 0.932965 0.404326 0.111577 0.098454 0.684146 cheese\n", + "2 0.831340 0.641161 0.722503 0.719412 0.749303 pepper\n", + "3 0.088983 0.099260 0.832301 0.269737 0.382743 cheese\n", + "4 0.569650 0.321217 0.849422 0.765569 0.082142 cheese\n", + ".. ... ... ... ... ... 
# %% [markdown]
# ## Synthetic dataset
# Uniform random features per row plus a text label derived from the row's
# Euclidean norm.

# %%
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# %%
SEED = 42             # fixed so Restart & Run All reproduces the same rows
N_SAMPLES = 100
N_FEATURES = 5
NORM_THRESHOLD = 1.5  # rows whose norm exceeds this are labelled 'pepper'


def make_dataset(
    n_samples: int = N_SAMPLES,
    n_features: int = N_FEATURES,
    threshold: float = NORM_THRESHOLD,
    seed: int = SEED,
) -> pd.DataFrame:
    """Build the random feature table with a derived text label.

    Args:
        n_samples: Number of rows to generate.
        n_features: Number of uniform random feature columns per row.
        threshold: A row is labelled 'pepper' when the Euclidean norm of its
            features is strictly greater than this value, else 'cheese'.
        seed: Seed for the NumPy generator, so repeated calls return the
            same frame.

    Returns:
        A DataFrame with integer column labels: ``0 .. n_features - 1`` hold
        the float features and column ``n_features`` holds the label string.
    """
    rng = np.random.default_rng(seed)
    features = rng.random((n_samples, n_features))

    # Row-wise Euclidean norm. Named `norms` rather than `sum` so the
    # builtin `sum` is not shadowed for the rest of the session.
    norms = np.sqrt((features * features).sum(axis=1))
    labels = np.where(norms > threshold, 'pepper', 'cheese')

    frame = pd.DataFrame(features)
    # Integer label (5 by default) keeps later `df[[5]]` lookups working.
    frame[n_features] = labels
    return frame


# %%
df = make_dataset()
df

# %%
df.dtypes
# %% [markdown]
# ## One-hot encode the label column

# %%
# sparse_output=False makes the encoder return a dense array, which can be
# wrapped in a DataFrame directly.
oh2 = OneHotEncoder(sparse_output=False)

# Double brackets select column 5 as a one-column frame rather than a Series.
label_frame = df[[5]]
out2 = oh2.fit_transform(label_frame)
out2

# %%
# Name the encoded columns 'favorite_<category>'.
encoded_names = oh2.get_feature_names_out(['favorite'])
comb = pd.DataFrame(data=out2, columns=encoded_names)

# Put the encoded columns first, followed by the numeric features; the raw
# text label (column 5) is left out because the encoded columns replace it.
numeric_features = df.drop(columns=[5])
combined = pd.concat([comb, numeric_features], axis=1)
# %%
combined.head()

# %% [markdown]
# ## Project onto three principal components and plot

# %%
from sklearn.decomposition import PCA
import plotly.express as px

# %%
# NOTE(review): the one-hot label columns are part of the PCA input, so the
# projection is partly driven by the label itself - confirm this is intended.
# A plain array is passed, as in the original; presumably because the frame
# mixes string and integer column labels - verify before passing the frame.
pca = PCA(n_components=3)
out = pca.fit_transform(X=combined.to_numpy())

# %%
# Map the 0.0/1.0 indicator back to category names so plotly draws a discrete
# two-entry legend instead of a continuous colour bar.
favorite = (
    combined['favorite_cheese']
    .map({1.0: 'cheese', 0.0: 'pepper'})
    .rename('favorite')
)

fig = px.scatter_3d(
    x=out[:, 0],
    y=out[:, 1],
    z=out[:, 2],
    color=favorite,
    labels={'x': 'PC1', 'y': 'PC2', 'z': 'PC3'},
    title='PCA projection of the one-hot encoded dataset',
)
fig