commit 2d7379654f2f5b1fe99e02c111c5a1ba09b3a371
parent d757eb637359b9d26dbd2cc59f06e36fa7945b69
Author: Andrew <andrewlaack1@gmail.com>
Date: Thu, 8 Aug 2024 18:11:02 -0500
Add a notebook experimenting with scikit-learn's OneHotEncoder, PCA, and a 3D scatter plot.
Diffstat:
1 file changed, 517 insertions(+), 0 deletions(-)
diff --git a/dataViz/testingOneHotEncodingAndGraphing.ipynb b/dataViz/testingOneHotEncodingAndGraphing.ipynb
@@ -0,0 +1,517 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 135,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "<div>\n",
+ "<style scoped>\n",
+ " .dataframe tbody tr th:only-of-type {\n",
+ " vertical-align: middle;\n",
+ " }\n",
+ "\n",
+ " .dataframe tbody tr th {\n",
+ " vertical-align: top;\n",
+ " }\n",
+ "\n",
+ " .dataframe thead th {\n",
+ " text-align: right;\n",
+ " }\n",
+ "</style>\n",
+ "<table border=\"1\" class=\"dataframe\">\n",
+ " <thead>\n",
+ " <tr style=\"text-align: right;\">\n",
+ " <th></th>\n",
+ " <th>0</th>\n",
+ " <th>1</th>\n",
+ " <th>2</th>\n",
+ " <th>3</th>\n",
+ " <th>4</th>\n",
+ " <th>5</th>\n",
+ " </tr>\n",
+ " </thead>\n",
+ " <tbody>\n",
+ " <tr>\n",
+ " <th>0</th>\n",
+ " <td>0.947152</td>\n",
+ " <td>0.182652</td>\n",
+ " <td>0.116222</td>\n",
+ " <td>0.249947</td>\n",
+ " <td>0.212487</td>\n",
+ " <td>cheese</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>1</th>\n",
+ " <td>0.932965</td>\n",
+ " <td>0.404326</td>\n",
+ " <td>0.111577</td>\n",
+ " <td>0.098454</td>\n",
+ " <td>0.684146</td>\n",
+ " <td>cheese</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>2</th>\n",
+ " <td>0.831340</td>\n",
+ " <td>0.641161</td>\n",
+ " <td>0.722503</td>\n",
+ " <td>0.719412</td>\n",
+ " <td>0.749303</td>\n",
+ " <td>pepper</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>3</th>\n",
+ " <td>0.088983</td>\n",
+ " <td>0.099260</td>\n",
+ " <td>0.832301</td>\n",
+ " <td>0.269737</td>\n",
+ " <td>0.382743</td>\n",
+ " <td>cheese</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>4</th>\n",
+ " <td>0.569650</td>\n",
+ " <td>0.321217</td>\n",
+ " <td>0.849422</td>\n",
+ " <td>0.765569</td>\n",
+ " <td>0.082142</td>\n",
+ " <td>cheese</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>...</th>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>95</th>\n",
+ " <td>0.999064</td>\n",
+ " <td>0.421309</td>\n",
+ " <td>0.795260</td>\n",
+ " <td>0.200927</td>\n",
+ " <td>0.811947</td>\n",
+ " <td>pepper</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>96</th>\n",
+ " <td>0.913032</td>\n",
+ " <td>0.158652</td>\n",
+ " <td>0.072846</td>\n",
+ " <td>0.320127</td>\n",
+ " <td>0.847452</td>\n",
+ " <td>cheese</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>97</th>\n",
+ " <td>0.453406</td>\n",
+ " <td>0.829704</td>\n",
+ " <td>0.076251</td>\n",
+ " <td>0.327103</td>\n",
+ " <td>0.698135</td>\n",
+ " <td>cheese</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>98</th>\n",
+ " <td>0.465324</td>\n",
+ " <td>0.410674</td>\n",
+ " <td>0.752463</td>\n",
+ " <td>0.858177</td>\n",
+ " <td>0.078763</td>\n",
+ " <td>cheese</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>99</th>\n",
+ " <td>0.931791</td>\n",
+ " <td>0.349814</td>\n",
+ " <td>0.202655</td>\n",
+ " <td>0.480509</td>\n",
+ " <td>0.253459</td>\n",
+ " <td>cheese</td>\n",
+ " </tr>\n",
+ " </tbody>\n",
+ "</table>\n",
+ "<p>100 rows × 6 columns</p>\n",
+ "</div>"
+ ],
+ "text/plain": [
+ " 0 1 2 3 4 5\n",
+ "0 0.947152 0.182652 0.116222 0.249947 0.212487 cheese\n",
+ "1 0.932965 0.404326 0.111577 0.098454 0.684146 cheese\n",
+ "2 0.831340 0.641161 0.722503 0.719412 0.749303 pepper\n",
+ "3 0.088983 0.099260 0.832301 0.269737 0.382743 cheese\n",
+ "4 0.569650 0.321217 0.849422 0.765569 0.082142 cheese\n",
+ ".. ... ... ... ... ... ...\n",
+ "95 0.999064 0.421309 0.795260 0.200927 0.811947 pepper\n",
+ "96 0.913032 0.158652 0.072846 0.320127 0.847452 cheese\n",
+ "97 0.453406 0.829704 0.076251 0.327103 0.698135 cheese\n",
+ "98 0.465324 0.410674 0.752463 0.858177 0.078763 cheese\n",
+ "99 0.931791 0.349814 0.202655 0.480509 0.253459 cheese\n",
+ "\n",
+ "[100 rows x 6 columns]"
+ ]
+ },
+ "execution_count": 135,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "\n",
+ "\n",
+ "# Build 100 samples of 5 uniform features and label each one by its Euclidean norm.\n",
+ "N_ROWS, N_FEATURES, THRESHOLD = 100, 5, 1.5\n",
+ "\n",
+ "data = []\n",
+ "for _ in range(N_ROWS):\n",
+ "    # One uniform draw in [0, 1) per feature column.\n",
+ "    features = [np.random.random() for _ in range(N_FEATURES)]\n",
+ "\n",
+ "    # Named `norm` (not `sum`) so the builtin `sum` stays usable in later cells.\n",
+ "    norm = np.linalg.norm(features)\n",
+ "\n",
+ "    # Rows whose norm exceeds the threshold are 'pepper'; all others are 'cheese'.\n",
+ "    txt = 'pepper' if norm > THRESHOLD else 'cheese'\n",
+ "    data.append([*features, txt])\n",
+ "\n",
+ "# Positional construction gives integer column labels: 0-4 (features) and 5 (label).\n",
+ "df = pd.DataFrame(data=data)\n",
+ "df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 136,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0 float64\n",
+ "1 float64\n",
+ "2 float64\n",
+ "3 float64\n",
+ "4 float64\n",
+ "5 object\n",
+ "dtype: object"
+ ]
+ },
+ "execution_count": 136,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df.dtypes"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 137,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn.preprocessing import OneHotEncoder"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 138,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array([[1., 0.],\n",
+ " [1., 0.],\n",
+ " [0., 1.],\n",
+ " [1., 0.],\n",
+ " [1., 0.],\n",
+ " [1., 0.],\n",
+ " [1., 0.],\n",
+ " [1., 0.],\n",
+ " [1., 0.],\n",
+ " [1., 0.],\n",
+ " [0., 1.],\n",
+ " [0., 1.],\n",
+ " [0., 1.],\n",
+ " [0., 1.],\n",
+ " [0., 1.],\n",
+ " [1., 0.],\n",
+ " [0., 1.],\n",
+ " [1., 0.],\n",
+ " [0., 1.],\n",
+ " [0., 1.],\n",
+ " [1., 0.],\n",
+ " [1., 0.],\n",
+ " [1., 0.],\n",
+ " [1., 0.],\n",
+ " [0., 1.],\n",
+ " [1., 0.],\n",
+ " [1., 0.],\n",
+ " [1., 0.],\n",
+ " [1., 0.],\n",
+ " [1., 0.],\n",
+ " [1., 0.],\n",
+ " [1., 0.],\n",
+ " [1., 0.],\n",
+ " [1., 0.],\n",
+ " [1., 0.],\n",
+ " [1., 0.],\n",
+ " [1., 0.],\n",
+ " [1., 0.],\n",
+ " [1., 0.],\n",
+ " [1., 0.],\n",
+ " [1., 0.],\n",
+ " [1., 0.],\n",
+ " [1., 0.],\n",
+ " [0., 1.],\n",
+ " [1., 0.],\n",
+ " [1., 0.],\n",
+ " [1., 0.],\n",
+ " [1., 0.],\n",
+ " [1., 0.],\n",
+ " [1., 0.],\n",
+ " [1., 0.],\n",
+ " [1., 0.],\n",
+ " [1., 0.],\n",
+ " [1., 0.],\n",
+ " [1., 0.],\n",
+ " [1., 0.],\n",
+ " [1., 0.],\n",
+ " [1., 0.],\n",
+ " [1., 0.],\n",
+ " [0., 1.],\n",
+ " [1., 0.],\n",
+ " [1., 0.],\n",
+ " [0., 1.],\n",
+ " [1., 0.],\n",
+ " [1., 0.],\n",
+ " [0., 1.],\n",
+ " [1., 0.],\n",
+ " [1., 0.],\n",
+ " [1., 0.],\n",
+ " [0., 1.],\n",
+ " [1., 0.],\n",
+ " [1., 0.],\n",
+ " [1., 0.],\n",
+ " [1., 0.],\n",
+ " [1., 0.],\n",
+ " [1., 0.],\n",
+ " [1., 0.],\n",
+ " [1., 0.],\n",
+ " [1., 0.],\n",
+ " [1., 0.],\n",
+ " [1., 0.],\n",
+ " [1., 0.],\n",
+ " [1., 0.],\n",
+ " [0., 1.],\n",
+ " [1., 0.],\n",
+ " [1., 0.],\n",
+ " [1., 0.],\n",
+ " [1., 0.],\n",
+ " [1., 0.],\n",
+ " [1., 0.],\n",
+ " [1., 0.],\n",
+ " [1., 0.],\n",
+ " [1., 0.],\n",
+ " [1., 0.],\n",
+ " [1., 0.],\n",
+ " [0., 1.],\n",
+ " [1., 0.],\n",
+ " [1., 0.],\n",
+ " [1., 0.],\n",
+ " [1., 0.]])"
+ ]
+ },
+ "execution_count": 138,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "oh2 = OneHotEncoder(sparse_output=False).fit(df[[5]])  # dense output; column 5 holds the labels\n",
+ "out2 = oh2.transform(df[[5]])  # one indicator column per category\n",
+ "out2"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 139,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Name the indicator columns 'favorite_<category>' and swap them in for the raw label column 5.\n",
+ "comb = pd.DataFrame(data=out2, columns=oh2.get_feature_names_out(['favorite']), index=df.index)\n",
+ "combined = pd.concat([comb, df.drop(columns=[5])], axis=1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 140,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "<div>\n",
+ "<style scoped>\n",
+ " .dataframe tbody tr th:only-of-type {\n",
+ " vertical-align: middle;\n",
+ " }\n",
+ "\n",
+ " .dataframe tbody tr th {\n",
+ " vertical-align: top;\n",
+ " }\n",
+ "\n",
+ " .dataframe thead th {\n",
+ " text-align: right;\n",
+ " }\n",
+ "</style>\n",
+ "<table border=\"1\" class=\"dataframe\">\n",
+ " <thead>\n",
+ " <tr style=\"text-align: right;\">\n",
+ " <th></th>\n",
+ " <th>favorite_cheese</th>\n",
+ " <th>favorite_pepper</th>\n",
+ " <th>0</th>\n",
+ " <th>1</th>\n",
+ " <th>2</th>\n",
+ " <th>3</th>\n",
+ " <th>4</th>\n",
+ " </tr>\n",
+ " </thead>\n",
+ " <tbody>\n",
+ " <tr>\n",
+ " <th>0</th>\n",
+ " <td>1.0</td>\n",
+ " <td>0.0</td>\n",
+ " <td>0.947152</td>\n",
+ " <td>0.182652</td>\n",
+ " <td>0.116222</td>\n",
+ " <td>0.249947</td>\n",
+ " <td>0.212487</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>1</th>\n",
+ " <td>1.0</td>\n",
+ " <td>0.0</td>\n",
+ " <td>0.932965</td>\n",
+ " <td>0.404326</td>\n",
+ " <td>0.111577</td>\n",
+ " <td>0.098454</td>\n",
+ " <td>0.684146</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>2</th>\n",
+ " <td>0.0</td>\n",
+ " <td>1.0</td>\n",
+ " <td>0.831340</td>\n",
+ " <td>0.641161</td>\n",
+ " <td>0.722503</td>\n",
+ " <td>0.719412</td>\n",
+ " <td>0.749303</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>3</th>\n",
+ " <td>1.0</td>\n",
+ " <td>0.0</td>\n",
+ " <td>0.088983</td>\n",
+ " <td>0.099260</td>\n",
+ " <td>0.832301</td>\n",
+ " <td>0.269737</td>\n",
+ " <td>0.382743</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>4</th>\n",
+ " <td>1.0</td>\n",
+ " <td>0.0</td>\n",
+ " <td>0.569650</td>\n",
+ " <td>0.321217</td>\n",
+ " <td>0.849422</td>\n",
+ " <td>0.765569</td>\n",
+ " <td>0.082142</td>\n",
+ " </tr>\n",
+ " </tbody>\n",
+ "</table>\n",
+ "</div>"
+ ],
+ "text/plain": [
+ " favorite_cheese favorite_pepper 0 1 2 3 \\\n",
+ "0 1.0 0.0 0.947152 0.182652 0.116222 0.249947 \n",
+ "1 1.0 0.0 0.932965 0.404326 0.111577 0.098454 \n",
+ "2 0.0 1.0 0.831340 0.641161 0.722503 0.719412 \n",
+ "3 1.0 0.0 0.088983 0.099260 0.832301 0.269737 \n",
+ "4 1.0 0.0 0.569650 0.321217 0.849422 0.765569 \n",
+ "\n",
+ " 4 \n",
+ "0 0.212487 \n",
+ "1 0.684146 \n",
+ "2 0.749303 \n",
+ "3 0.382743 \n",
+ "4 0.082142 "
+ ]
+ },
+ "execution_count": 140,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "combined.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 141,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn.decomposition import PCA\n",
+ "\n",
+ "# Reduce every column of `combined` (indicator columns included) to 3 principal components.\n",
+ "pca = PCA(n_components=3)\n",
+ "out = pca.fit_transform(combined.to_numpy())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import plotly.express as px\n",
+ "# Map the 0/1 indicator back to category names so plotly uses discrete colors, not a continuous scale.\n",
+ "px.scatter_3d(x=out[:, 0], y=out[:, 1], z=out[:, 2], color=combined['favorite_cheese'].map({1.0: 'cheese', 0.0: 'pepper'}).rename('favorite'))"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": ".venv",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.2"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}