testingOneHotEncodingAndGraphing.ipynb (14627B)
1 { 2 "cells": [ 3 { 4 "cell_type": "code", 5 "execution_count": 135, 6 "metadata": {}, 7 "outputs": [ 8 { 9 "data": { 10 "text/html": [ 11 "<div>\n", 12 "<style scoped>\n", 13 " .dataframe tbody tr th:only-of-type {\n", 14 " vertical-align: middle;\n", 15 " }\n", 16 "\n", 17 " .dataframe tbody tr th {\n", 18 " vertical-align: top;\n", 19 " }\n", 20 "\n", 21 " .dataframe thead th {\n", 22 " text-align: right;\n", 23 " }\n", 24 "</style>\n", 25 "<table border=\"1\" class=\"dataframe\">\n", 26 " <thead>\n", 27 " <tr style=\"text-align: right;\">\n", 28 " <th></th>\n", 29 " <th>0</th>\n", 30 " <th>1</th>\n", 31 " <th>2</th>\n", 32 " <th>3</th>\n", 33 " <th>4</th>\n", 34 " <th>5</th>\n", 35 " </tr>\n", 36 " </thead>\n", 37 " <tbody>\n", 38 " <tr>\n", 39 " <th>0</th>\n", 40 " <td>0.947152</td>\n", 41 " <td>0.182652</td>\n", 42 " <td>0.116222</td>\n", 43 " <td>0.249947</td>\n", 44 " <td>0.212487</td>\n", 45 " <td>cheese</td>\n", 46 " </tr>\n", 47 " <tr>\n", 48 " <th>1</th>\n", 49 " <td>0.932965</td>\n", 50 " <td>0.404326</td>\n", 51 " <td>0.111577</td>\n", 52 " <td>0.098454</td>\n", 53 " <td>0.684146</td>\n", 54 " <td>cheese</td>\n", 55 " </tr>\n", 56 " <tr>\n", 57 " <th>2</th>\n", 58 " <td>0.831340</td>\n", 59 " <td>0.641161</td>\n", 60 " <td>0.722503</td>\n", 61 " <td>0.719412</td>\n", 62 " <td>0.749303</td>\n", 63 " <td>pepper</td>\n", 64 " </tr>\n", 65 " <tr>\n", 66 " <th>3</th>\n", 67 " <td>0.088983</td>\n", 68 " <td>0.099260</td>\n", 69 " <td>0.832301</td>\n", 70 " <td>0.269737</td>\n", 71 " <td>0.382743</td>\n", 72 " <td>cheese</td>\n", 73 " </tr>\n", 74 " <tr>\n", 75 " <th>4</th>\n", 76 " <td>0.569650</td>\n", 77 " <td>0.321217</td>\n", 78 " <td>0.849422</td>\n", 79 " <td>0.765569</td>\n", 80 " <td>0.082142</td>\n", 81 " <td>cheese</td>\n", 82 " </tr>\n", 83 " <tr>\n", 84 " <th>...</th>\n", 85 " <td>...</td>\n", 86 " <td>...</td>\n", 87 " <td>...</td>\n", 88 " <td>...</td>\n", 89 " <td>...</td>\n", 90 " <td>...</td>\n", 91 " </tr>\n", 92 " <tr>\n", 93 " <th>95</th>\n", 94 " <td>0.999064</td>\n", 95 " <td>0.421309</td>\n", 96 " <td>0.795260</td>\n", 97 " <td>0.200927</td>\n", 98 " <td>0.811947</td>\n", 99 " <td>pepper</td>\n", 100 " </tr>\n", 101 " <tr>\n", 102 " <th>96</th>\n", 103 " <td>0.913032</td>\n", 104 " <td>0.158652</td>\n", 105 " <td>0.072846</td>\n", 106 " <td>0.320127</td>\n", 107 " <td>0.847452</td>\n", 108 " <td>cheese</td>\n", 109 " </tr>\n", 110 " <tr>\n", 111 " <th>97</th>\n", 112 " <td>0.453406</td>\n", 113 " <td>0.829704</td>\n", 114 " <td>0.076251</td>\n", 115 " <td>0.327103</td>\n", 116 " <td>0.698135</td>\n", 117 " <td>cheese</td>\n", 118 " </tr>\n", 119 " <tr>\n", 120 " <th>98</th>\n", 121 " <td>0.465324</td>\n", 122 " <td>0.410674</td>\n", 123 " <td>0.752463</td>\n", 124 " <td>0.858177</td>\n", 125 " <td>0.078763</td>\n", 126 " <td>cheese</td>\n", 127 " </tr>\n", 128 " <tr>\n", 129 " <th>99</th>\n", 130 " <td>0.931791</td>\n", 131 " <td>0.349814</td>\n", 132 " <td>0.202655</td>\n", 133 " <td>0.480509</td>\n", 134 " <td>0.253459</td>\n", 135 " <td>cheese</td>\n", 136 " </tr>\n", 137 " </tbody>\n", 138 "</table>\n", 139 "<p>100 rows × 6 columns</p>\n", 140 "</div>" 141 ], 142 "text/plain": [ 143 " 0 1 2 3 4 5\n", 144 "0 0.947152 0.182652 0.116222 0.249947 0.212487 cheese\n", 145 "1 0.932965 0.404326 0.111577 0.098454 0.684146 cheese\n", 146 "2 0.831340 0.641161 0.722503 0.719412 0.749303 pepper\n", 147 "3 0.088983 0.099260 0.832301 0.269737 0.382743 cheese\n", 148 "4 0.569650 0.321217 0.849422 0.765569 0.082142 cheese\n", 149 ".. ... ... ... ... ... ...\n", 150 "95 0.999064 0.421309 0.795260 0.200927 0.811947 pepper\n", 151 "96 0.913032 0.158652 0.072846 0.320127 0.847452 cheese\n", 152 "97 0.453406 0.829704 0.076251 0.327103 0.698135 cheese\n", 153 "98 0.465324 0.410674 0.752463 0.858177 0.078763 cheese\n", 154 "99 0.931791 0.349814 0.202655 0.480509 0.253459 cheese\n", 155 "\n", 156 "[100 rows x 6 columns]" 157 ] 158 }, 159 "execution_count": 135, 160 "metadata": {}, 161 "output_type": "execute_result" 162 } 163 ], 164 "source": [ 165 "import pandas as pd\n", 166 "import numpy as np\n", 167 "\n", 168 "\n", 169 "data = []\n", 170 "for i in range(0,100):\n", 171 " rnd0 = np.random.random()\n", 172 " rnd1 = np.random.random()\n", 173 " rnd2 = np.random.random()\n", 174 " rnd3 = np.random.random()\n", 175 " rnd4 = np.random.random()\n", 176 "\n", 177 " sum = np.sqrt(rnd0 * rnd0 + rnd1 * rnd1 + rnd2 * rnd2 + rnd3 * rnd3 + rnd4 * rnd4)\n", 178 " txt = ''\n", 179 " if sum > 1.5:\n", 180 " txt = 'pepper'\n", 181 " else:\n", 182 " txt = 'cheese'\n", 183 " data.append([rnd0, rnd1,rnd2,rnd3,rnd4,txt])\n", 184 "\n", 185 "df = pd.DataFrame(data=data)\n", 186 "df" 187 ] 188 }, 189 { 190 "cell_type": "code", 191 "execution_count": 136, 192 "metadata": {}, 193 "outputs": [ 194 { 195 "data": { 196 "text/plain": [ 197 "0 float64\n", 198 "1 float64\n", 199 "2 float64\n", 200 "3 float64\n", 201 "4 float64\n", 202 "5 object\n", 203 "dtype: object" 204 ] 205 }, 206 "execution_count": 136, 207 "metadata": {}, 208 "output_type": "execute_result" 209 } 210 ], 211 "source": [ 212 "df.dtypes" 213 ] 214 }, 215 { 216 "cell_type": "code", 217 "execution_count": 137, 218 "metadata": {}, 219 "outputs": [], 220 "source": [ 221 "from sklearn.preprocessing import OneHotEncoder" 222 ] 223 }, 224 { 225 "cell_type": "code", 226 "execution_count": 138, 227 "metadata": {}, 228 "outputs": [ 229 { 230 "data": { 231 "text/plain": [ 232 "array([[1., 0.],\n", 233 " [1., 0.],\n", 234 " [0., 1.],\n", 235 " [1., 0.],\n", 236 " [1., 0.],\n", 237 " [1., 0.],\n", 238 " [1., 0.],\n", 239 " [1., 0.],\n", 240 " [1., 0.],\n", 241 " [1., 0.],\n", 242 " [0., 1.],\n", 243 " [0., 1.],\n", 244 " [0., 1.],\n", 245 " [0., 1.],\n", 246 " [0., 1.],\n", 247 " [1., 0.],\n", 248 " [0., 1.],\n", 249 " [1., 0.],\n", 250 " [0., 1.],\n", 251 " [0., 1.],\n", 252 " [1., 0.],\n", 253 " [1., 0.],\n", 254 " [1., 0.],\n", 255 " [1., 0.],\n", 256 " [0., 1.],\n", 257 " [1., 0.],\n", 258 " [1., 0.],\n", 259 " [1., 0.],\n", 260 " [1., 0.],\n", 261 " [1., 0.],\n", 262 " [1., 0.],\n", 263 " [1., 0.],\n", 264 " [1., 0.],\n", 265 " [1., 0.],\n", 266 " [1., 0.],\n", 267 " [1., 0.],\n", 268 " [1., 0.],\n", 269 " [1., 0.],\n", 270 " [1., 0.],\n", 271 " [1., 0.],\n", 272 " [1., 0.],\n", 273 " [1., 0.],\n", 274 " [1., 0.],\n", 275 " [0., 1.],\n", 276 " [1., 0.],\n", 277 " [1., 0.],\n", 278 " [1., 0.],\n", 279 " [1., 0.],\n", 280 " [1., 0.],\n", 281 " [1., 0.],\n", 282 " [1., 0.],\n", 283 " [1., 0.],\n", 284 " [1., 0.],\n", 285 " [1., 0.],\n", 286 " [1., 0.],\n", 287 " [1., 0.],\n", 288 " [1., 0.],\n", 289 " [1., 0.],\n", 290 " [1., 0.],\n", 291 " [0., 1.],\n", 292 " [1., 0.],\n", 293 " [1., 0.],\n", 294 " [0., 1.],\n", 295 " [1., 0.],\n", 296 " [1., 0.],\n", 297 " [0., 1.],\n", 298 " [1., 0.],\n", 299 " [1., 0.],\n", 300 " [1., 0.],\n", 301 " [0., 1.],\n", 302 " [1., 0.],\n", 303 " [1., 0.],\n", 304 " [1., 0.],\n", 305 " [1., 0.],\n", 306 " [1., 0.],\n", 307 " [1., 0.],\n", 308 " [1., 0.],\n", 309 " [1., 0.],\n", 310 " [1., 0.],\n", 311 " [1., 0.],\n", 312 " [1., 0.],\n", 313 " [1., 0.],\n", 314 " [1., 0.],\n", 315 " [0., 1.],\n", 316 " [1., 0.],\n", 317 " [1., 0.],\n", 318 " [1., 0.],\n", 319 " [1., 0.],\n", 320 " [1., 0.],\n", 321 " [1., 0.],\n", 322 " [1., 0.],\n", 323 " [1., 0.],\n", 324 " [1., 0.],\n", 325 " [1., 0.],\n", 326 " [1., 0.],\n", 327 " [0., 1.],\n", 328 " [1., 0.],\n", 329 " [1., 0.],\n", 330 " [1., 0.],\n", 331 " [1., 0.]])" 332 ] 333 }, 334 "execution_count": 138, 335 "metadata": {}, 336 "output_type": "execute_result" 337 } 338 ], 339 "source": [ 340 "oh2 = OneHotEncoder(sparse_output=False)\n", 341 "out2 = oh2.fit_transform(df[[5]])\n", 342 "out2" 343 ] 344 }, 345 { 346 "cell_type": "code", 347 "execution_count": 139, 348 "metadata": {}, 349 "outputs": [], 350 "source": [ 351 "comb = pd.DataFrame(data=out2, columns=oh2.get_feature_names_out(['favorite']))\n", 352 "combined = pd.concat([comb,df], axis=1)\n", 353 "combined = combined.drop(axis=1 , columns=[5])" 354 ] 355 }, 356 { 357 "cell_type": "code", 358 "execution_count": 140, 359 "metadata": {}, 360 "outputs": [ 361 { 362 "data": { 363 "text/html": [ 364 "<div>\n", 365 "<style scoped>\n", 366 " .dataframe tbody tr th:only-of-type {\n", 367 " vertical-align: middle;\n", 368 " }\n", 369 "\n", 370 " .dataframe tbody tr th {\n", 371 " vertical-align: top;\n", 372 " }\n", 373 "\n", 374 " .dataframe thead th {\n", 375 " text-align: right;\n", 376 " }\n", 377 "</style>\n", 378 "<table border=\"1\" class=\"dataframe\">\n", 379 " <thead>\n", 380 " <tr style=\"text-align: right;\">\n", 381 " <th></th>\n", 382 " <th>favorite_cheese</th>\n", 383 " <th>favorite_pepper</th>\n", 384 " <th>0</th>\n", 385 " <th>1</th>\n", 386 " <th>2</th>\n", 387 " <th>3</th>\n", 388 " <th>4</th>\n", 389 " </tr>\n", 390 " </thead>\n", 391 " <tbody>\n", 392 " <tr>\n", 393 " <th>0</th>\n", 394 " <td>1.0</td>\n", 395 " <td>0.0</td>\n", 396 " <td>0.947152</td>\n", 397 " <td>0.182652</td>\n", 398 " <td>0.116222</td>\n", 399 " <td>0.249947</td>\n", 400 " <td>0.212487</td>\n", 401 " </tr>\n", 402 " <tr>\n", 403 " <th>1</th>\n", 404 " <td>1.0</td>\n", 405 " <td>0.0</td>\n", 406 " <td>0.932965</td>\n", 407 " <td>0.404326</td>\n", 408 " <td>0.111577</td>\n", 409 " <td>0.098454</td>\n", 410 " <td>0.684146</td>\n", 411 " </tr>\n", 412 " <tr>\n", 413 " <th>2</th>\n", 414 " <td>0.0</td>\n", 415 " <td>1.0</td>\n", 416 " <td>0.831340</td>\n", 417 " <td>0.641161</td>\n", 418 " <td>0.722503</td>\n", 419 " <td>0.719412</td>\n", 420 " <td>0.749303</td>\n", 421 " </tr>\n", 422 " <tr>\n", 423 " <th>3</th>\n", 424 " <td>1.0</td>\n", 425 " <td>0.0</td>\n", 426 " <td>0.088983</td>\n", 427 " <td>0.099260</td>\n", 428 " <td>0.832301</td>\n", 429 " <td>0.269737</td>\n", 430 " <td>0.382743</td>\n", 431 " </tr>\n", 432 " <tr>\n", 433 " <th>4</th>\n", 434 " <td>1.0</td>\n", 435 " <td>0.0</td>\n", 436 " <td>0.569650</td>\n", 437 " <td>0.321217</td>\n", 438 " <td>0.849422</td>\n", 439 " <td>0.765569</td>\n", 440 " <td>0.082142</td>\n", 441 " </tr>\n", 442 " </tbody>\n", 443 "</table>\n", 444 "</div>" 445 ], 446 "text/plain": [ 447 " favorite_cheese favorite_pepper 0 1 2 3 \\\n", 448 "0 1.0 0.0 0.947152 0.182652 0.116222 0.249947 \n", 449 "1 1.0 0.0 0.932965 0.404326 0.111577 0.098454 \n", 450 "2 0.0 1.0 0.831340 0.641161 0.722503 0.719412 \n", 451 "3 1.0 0.0 0.088983 0.099260 0.832301 0.269737 \n", 452 "4 1.0 0.0 0.569650 0.321217 0.849422 0.765569 \n", 453 "\n", 454 " 4 \n", 455 "0 0.212487 \n", 456 "1 0.684146 \n", 457 "2 0.749303 \n", 458 "3 0.382743 \n", 459 "4 0.082142 " 460 ] 461 }, 462 "execution_count": 140, 463 "metadata": {}, 464 "output_type": "execute_result" 465 } 466 ], 467 "source": [ 468 "combined.head()" 469 ] 470 }, 471 { 472 "cell_type": "code", 473 "execution_count": 141, 474 "metadata": {}, 475 "outputs": [], 476 "source": [ 477 "from sklearn.decomposition import PCA\n", 478 "\n", 479 "pca = PCA(n_components=3)\n", 480 "\n", 481 "out = pca.fit_transform(X=np.array(combined))" 482 ] 483 }, 484 { 485 "cell_type": "code", 486 "execution_count": null, 487 "metadata": {}, 488 "outputs": [], 489 "source": [ 490 "import plotly.express as px\n", 491 "\n", 492 "px.scatter_3d(x=out[:,0], y=out[:,1], z=out[:,2], color=combined['favorite_cheese'])" 493 ] 494 } 495 ], 496 "metadata": { 497 "kernelspec": { 498 "display_name": ".venv", 499 "language": "python", 500 "name": "python3" 501 }, 502 "language_info": { 503 "codemirror_mode": { 504 "name": "ipython", 505 "version": 3 506 }, 507 "file_extension": ".py", 508 "mimetype": "text/x-python", 509 "name": "python", 510 "nbconvert_exporter": "python", 511 "pygments_lexer": "ipython3", 512 "version": "3.11.2" 513 } 514 }, 515 "nbformat": 4, 516 "nbformat_minor": 2 517 }