KerasEmbeddingPG.ipynb (22248B)
1 { 2 "cells": [ 3 { 4 "cell_type": "markdown", 5 "metadata": {}, 6 "source": [ 7 "Embedding playground" 8 ] 9 }, 10 { 11 "cell_type": "code", 12 "execution_count": 2, 13 "metadata": {}, 14 "outputs": [ 15 { 16 "name": "stderr", 17 "output_type": "stream", 18 "text": [ 19 "2024-07-01 07:58:30.477487: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.\n", 20 "2024-07-01 07:58:30.480784: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.\n", 21 "2024-07-01 07:58:30.522090: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", 22 "To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", 23 "2024-07-01 07:58:31.202746: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n" 24 ] 25 } 26 ], 27 "source": [ 28 "import tensorflow as tf\n", 29 "import pandas as pd\n", 30 "import keras\n", 31 "import numpy as np" 32 ] 33 }, 34 { 35 "cell_type": "code", 36 "execution_count": 3, 37 "metadata": {}, 38 "outputs": [ 39 { 40 "name": "stderr", 41 "output_type": "stream", 42 "text": [ 43 "2024-07-01 07:58:31.966844: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n", 44 "2024-07-01 07:58:31.967654: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2251] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. 
Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.\n", 45 "Skipping registering GPU devices...\n" 46 ] 47 } 48 ], 49 "source": [ 50 "arrText = np.array(['test', 'why', 'would', 'this', 'work', 'weird', 'array', 'method', 'returns', 'math'])\n", 51 "\n", 52 "# Give each distinct word the next free integer id, in order of first appearance.\n", 53 "mapping = {}\n", 54 "for word in arrText:\n", 55 "    mapping.setdefault(word, len(mapping))\n", 56 "nextMap = len(mapping)  # next unused id, which is also the vocabulary size\n", 57 "\n", 58 "# Build the id array in one pass with an integer dtype: np.append inside a loop re-copies\n", 59 "# the whole array on every call, and np.array([]) starts out as float64.\n", 60 "arrNums = np.array([mapping[word] for word in arrText], dtype=np.int64)\n", 61 "\n", 62 "# input_dim must be larger than the biggest id, so derive it from the vocabulary\n", 63 "# instead of hard-coding 10.\n", 64 "embedding = keras.layers.Embedding(input_dim=nextMap, output_dim=2)\n", 65 "embedded = embedding(arrNums)" 66 ] 67 }, 68 { 69 "cell_type": "code", 70 "execution_count": 4, 71 "metadata": {}, 72 "outputs": [ 73 { 74 "data": { 75 "text/plain": [ 76 "<tf.Tensor: shape=(10, 2), dtype=float32, numpy=\n", 77 "array([[ 0.03480979, 0.0453563 ],\n", 78 " [ 0.01358359, -0.00183941],\n", 79 " [-0.01757333, -0.04817909],\n", 80 " [-0.0360611 , 0.04445745],\n", 81 " [-0.00211517, -0.04308406],\n", 82 " [ 0.01702977, 0.01570695],\n", 83 " [ 0.04625987, 0.00359092],\n", 84 " [-0.0155 , 0.02139652],\n", 85 " [ 0.04769083, 0.00969797],\n", 86 " [ 0.01138154, -0.04589012]], dtype=float32)>" 87 ] 88 }, 89 "execution_count": 4, 90 "metadata": {}, 91 "output_type": "execute_result" 92 } 93 ], 94 "source": [ 95 "embedded" 96 ] 97 }, 98 { 99 "cell_type": "code", 100 "execution_count": 5, 101 "metadata": {}, 102 "outputs": [ 103 { 104 "data": { 105 "text/plain": [ 106 "<matplotlib.collections.PathCollection at 0x7fb3f02b8410>" 107 ] 108 }, 109 "execution_count": 5, 110 "metadata": {}, 111 "output_type": "execute_result" 112 }, 113 { 114 "data": { 115 "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAjcAAAGdCAYAAADuR1K7AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy80BEi2AAAACXBIWXMAAA9hAAAPYQGoP6dpAAAmxUlEQVR4nO3df3TUVX7/8ddMQjKskknDj0yCiaC1CzEUCmxCPHuOrcRNdi1KF49sDohSjlQW0C6UAsqSZtse6rpWcFU4u6ceapGVYrfuhtLsocHuustIIFGXEMKxHhYQMomQzQTR/DBzv3/wzeiYSQhhJpm583yc8zmcufP+zNybT+K8/NzP547DGGMEAABgCedIdwAAACCSCDcAAMAqhBsAAGAVwg0AALAK4QYAAFiFcAMAAKxCuAEAAFYh3AAAAKskj3QHRkIgEND58+c1ZswYORyOke4OAAAYBGOMLl26pOzsbDmd/Z+fSchwc/78eeXk5Ix0NwAAwBCcPXtWN910U7/PJ2S4GTNmjKQrP5y0tLQR7g0AABiM9vZ25eTkBD/H+5OQ4aZ3KiotLY1wAwBAnLnaJSVcUAwAAKxCuAEAAFYh3AAAAKsQbgAAgFUINwAAwCqEGwAAYBXCDQAAsArhBgAAWCUhF/EDAOB69QSMak61quVShyaMcalgcoaSnHxfYSwg3AAAcI2q6ptUUdmgJn9HsC3L7VL5vDyV5meNYM8gMS0FAMA1qapv0opddSHBRpJ8/g6t2FWnqvqmEeoZehFuAAAYpJ6AUUVlg0yY53rbKiob1BMIV4HhQriJoJ6Akff9i/rZO+fkff8iv9wAYJmaU619zth8npHU5O9QzanW4esU+uCamwhh/hUA7Ndyqf9gM5Q6RAdnbiKA+VcASAwTxrgiWofoINxcJ+ZfASBxFEzOUJbbpf5u+Hboyln7gskZw9ktfAHh5jox/woAiSPJ6VD5vDxJ6hNweh+Xz8tjvZsRRri5Tsy/AkBiKc3P0vbFM+Vxh049edwubV88k+ssYwAXFF8n5l8BIPGU5mfp7jwPKxTHKMLNdeqdf/X5O8Jed+PQlTTP/CsA2CXJ6VDRrWNHuhsIg2mp68T8KwAAsYVwEwHMvwIAEDuYlooQ5l8BAIgNhJsIYv4VAICRx7QUAACwCuEGAABYhXADAACsQrgBAABWIdwAAACrDEu4eeGFFzRp0iS5XC4VFhaqpqZmwPq9e/dqypQpcrlcmjZtmvbv399v7aOPPiqHw6GtW7dGuNcAACAeRT3c7NmzR2vWrFF5ebnq6uo0ffp0lZSUqKWlJWz9oUOHVFZWpmXLluntt9/W/PnzNX/+fNXX1/ep/c///E+99dZbys7OjvYwAABAnIh6uPnnf/5nPfLII1q6dKny8vK0Y8cOfelLX9JLL70Utn7btm0qLS3VunXrNHXqVP393/+9Zs6cqeeffz6k7ty5c1q9erVeeeUVjRo1KtrDAAAAcSKq4aarq0u1tbUqLi7+7A2dThUXF8vr9Ybdx+v1htRLUklJSUh9IBDQgw8+qHXr1un222+/aj86OzvV3t4esgEAADtFNdxcuHBBPT09yszMDGnPzMyUz+cLu4/P57tq/VNPPaXk5GQ99thjg+rHli1b5Ha7g1tOTs41jgQAAMSLuLtbqra2Vtu2bdPOnTvlcAzue5s2btwov98f3M6ePRvlXgIAgJES1XAzbtw4JSUlqbm5OaS9ublZHo8n7D4ej2fA+jfffFMtLS3Kzc1VcnKykpOTdfr0aa1du1aTJk0K+5qpqalKS0sL2QAAgJ2iGm5SUlI0a9YsVVdXB9sCgYCqq6tVVFQUdp+ioqKQekk6cOBAsP7BBx/Ub3/7W73zzjvBLTs7W+vWrdMvfvGL6A0GAADEhah/K/iaNWv00EMPafbs2SooKNDWrVt1+fJlLV26VJK0ZMkSTZw4UVu2bJEkPf7447rzzjv1zDPP6J577tGrr76qo0eP6kc/+pEkaez
YsRo7NvSbt0eNGiWPx6Mvf/nL0R4OAACIcVEPNwsXLtSHH36ozZs3y+fzacaMGaqqqgpeNHzmzBk5nZ+dQLrjjju0e/dubdq0SU888YRuu+02vf7668rPz492VwEAgAUcxhgz0p0Ybu3t7XK73fL7/Vx/AwBAnBjs53fc3S0FAAAwEMINAACwCuEGAABYhXADAACsQrgBAABWIdwAAACrEG4AAIBVCDcAAMAqhBsAAGAVwg0AALAK4QYAAFiFcAMAAKxCuAEAAFYh3AAAAKsQbgAAgFUINwAAwCqEGwAAYBXCDQAAsArhBgAAWIVwAwAArEK4AQAAViHcAAAAqxBuAACAVQg3AADAKoQbAABgFcINAACwCuEGAABYhXADAACsQrgBAABWIdwAAACrEG4AAIBVCDcAAMAqhBsAAGAVwg0AALAK4QYAAFiFcAMAAKxCuAEAAFYh3AAAAKsQbgAAgFUINwAAwCqEGwAAYBXCDQAAsArhBgAAWIVwAwAArEK4AQAAViHcAAAAqxBuAACAVQg3AADAKoQbAABgFcINAACwCuEGAABYhXADAACsQrgBAABWIdwAAACrEG4AAIBVkke6A8BI6wkY1ZxqVculDk0Y41LB5AwlOR0j3S0AwBARbpDQquqbVFHZoCZ/R7Aty+1S+bw8leZnjWDPAABDxbQUElZVfZNW7KoLCTaS5PN3aMWuOlXVN41QzwAA14Nwg4TUEzCqqGyQCfNcb1tFZYN6AuEqAACxjHCDhFRzqrXPGZvPM5Ka/B2qOdU6fJ0CAETEsISbF154QZMmTZLL5VJhYaFqamoGrN+7d6+mTJkil8uladOmaf/+/cHnuru7tX79ek2bNk033HCDsrOztWTJEp0/fz7aw4BFWi71H2yGUgcAiB1RDzd79uzRmjVrVF5errq6Ok2fPl0lJSVqaWkJW3/o0CGVlZVp2bJlevvttzV//nzNnz9f9fX1kqSPP/5YdXV1+u53v6u6ujr99Kc/1cmTJ3XvvfdGeyiwyIQxrojWAQBih8MYE9WLCgoLC/WVr3xFzz//vCQpEAgoJydHq1ev1oYNG/rUL1y4UJcvX9a+ffuCbXPmzNGMGTO0Y8eOsO9x5MgRFRQU6PTp08rNzb1qn9rb2+V2u+X3+5WWljbEkSGe9QSMvvrUQfn8HWGvu3FI8rhd+vX6u7gtHABixGA/v6N65qarq0u1tbUqLi7+7A2dThUXF8vr9Ybdx+v1htRLUklJSb/1kuT3++VwOJSenh6RfsN+SU6HyuflSboSZD6v93H5vDyCDQDEoaiGmwsXLqinp0eZmZkh7ZmZmfL5fGH38fl811Tf0dGh9evXq6ysrN8U19nZqfb29pANKM3P0vbFM+Vxh049edwubV88k3VuACBOxfUift3d3XrggQdkjNH27dv7rduyZYsqKiqGsWeIF6X5Wbo7z8MKxQBgkaiGm3HjxikpKUnNzc0h7c3NzfJ4PGH38Xg8g6rvDTanT5/WwYMHB5x727hxo9asWRN83N7erpycnGsdDiyV5HSo6NaxI90NAECERHVaKiUlRbNmzVJ1dXWwLRAIqLq6WkVFRWH3KSoqCqmXpAMHDoTU9wab9957T//zP/+jsWMH/mBKTU1VWlpayAYAAOwU9WmpNWvW6KGHHtLs2bNVUFCgrVu36vLly1q6dKkkacmSJZo4caK2bNkiSXr88cd155136plnntE999yjV199VUePHtWPfvQjSVeCzf3336+6ujrt27dPPT09wetxMjIylJKSEu0hAQCAGBb1cLNw4UJ9+OGH2rx5s3w+n2bMmKGqqqrgRcNnzpyR0/nZCaQ77rhDu3fv1qZNm/TEE0/otttu0+uvv678/HxJ0rlz5/Tzn/9ckjRjxoyQ93rjjTf0p3/6p9EeEgAAiGFRX+cmFrHODQAA8Scm1rkBAAAYboQbAABgFcINAACwCuEGAABYJa5XKAaARNYTMKyuDYRBuAGAOFRV36S
KygY1+TuCbVlul8rn5fG9aEh4TEsBQJypqm/Sil11IcFGknz+Dq3YVaeq+qYR6hkQGwg3ABBHegJGFZUNCrdAWW9bRWWDegIJt4QZEES4AYA4UnOqtc8Zm88zkpr8Hao51Tp8nQJiDOEGAOJIy6X+g81Q6gAbEW4AII5MGOOKaB1gI8INAMSRgskZynK71N8N3w5duWuqYHLGcHYLiCmEGwCII0lOh8rn5UlSn4DT+7h8Xh7r3SChEW4AIM6U5mdp++KZ8rhDp548bpe2L57JOjdIeCziBwBxqDQ/S3fneVihGAiDcAMAcSrJ6VDRrWNHuhtAzGFaCgAAWIVwAwAArEK4AQAAViHcAAAAqxBuAACAVQg3AADAKoQbAABgFcINAACwCuEGAABYhXADAACswtcvAACAiOgJmJj4vjPCDQAAuG5V9U2qqGxQk78j2Jbldql8Xt6wf1M901IAAOC6VNU3acWuupBgI0k+f4dW7KpTVX3TsPaHcAMAAIasJ2BUUdkgE+a53raKygb1BMJVRAfhBgAADFnNqdY+Z2w+z0hq8neo5lTrsPWJcAMAAIas5VL/wWYodZFAuAEAAEM2YYwronWRQLgBAABDVjA5Q1lul/q74duhK3dNFUzOGLY+EW4AAMCQJTkdKp+XJ0l9Ak7v4/J5ecO63g3hBgAAXJfS/CxtXzxTHnfo1JPH7dL2xTOHfZ0bFvEDAADXrTQ/S3fneVihGAAA2CPJ6VDRrWNHuhtMSwEAALsQbgAAgFUINwAAwCqEGwAAYBXCDQAAsArhBgAAWIVwAwAArEK4AQAAViHcAAAAqxBuAACAVfj6BQAAMGg9ARMT3x81EMINAAAYlKr6JlVUNqjJ3xFsy3K7VD4vb9i/+XsgTEsBAICrqqpv0opddSHBRpJ8/g6t2FWnqvqmEepZX4QbAAAwoJ6AUUVlg0yY53rbKiob1BMIVzH8CDcAAGBANada+5yx+TwjqcnfoZpTrcPXqQEQbgAAwIBaLvUfbIZSF22EGwAAMKAJY1wRrYs2wg0AABhQweQMZbld6u+Gb4eu3DVVMDljOLvVL8INAAAYUJLTofJ5eZLUJ+D0Pi6flxcz690QbgAAwFWV5mdp++KZ8rhDp548bpe2L54ZU+vcsIgfAAAYlNL8LN2d52GFYgAAYI8kp0NFt44d6W4MiGkpAABglWEJNy+88IImTZokl8ulwsJC1dTUDFi/d+9eTZkyRS6XS9OmTdP+/ftDnjfGaPPmzcrKytLo0aNVXFys9957L5pDAAAAcSLq4WbPnj1as2aNysvLVVdXp+nTp6ukpEQtLS1h6w8dOqSysjItW7ZMb7/9tubPn6/58+ervr4+WPP9739fzz33nHbs2KHDhw/rhhtuUElJiTo6YmPxIAAAMHIcxpiofhFEYWGhvvKVr+j555+XJAUCAeXk5Gj16tXasGFDn/qFCxfq8uXL2rdvX7Btzpw5mjFjhnbs2CFjjLKzs7V27Vr9zd/8jSTJ7/crMzNTO3fu1Le+9a2r9qm9vV1ut1t+v19paWkRGikAAIimwX5+R/XMTVdXl2pra1VcXPzZGzqdKi4ultfrDbuP1+sNqZekkpKSYP2pU6fk8/lCatxutwoLC/t9zc7OTrW3t4dsAADATlENNxcuXFBPT48yMzND2jMzM+Xz+cLu4/P5Bqzv/fdaXnPLli1yu93BLScnZ0jjAQAAsS8h7pbauHGj/H5/cDt79uxIdwkAAERJVNe5GTdunJKSktTc3BzS3tzcLI/HE3Yfj8czYH3vv83NzcrKygqpmTFjRtjXTE1NVWpq6lCHAQAR1RMwMb8IGhDPonrmJiUlRbNmzVJ1dXWwLRAIqLq6WkVFRWH3KSoqCqmXpAMHDgTrJ0+eLI/HE1LT3t6uw4cP9/uaABArquqb9NWnDqrsx2/p8VffUdmP39JXnzqoqvqmke4aYI2oT0utWbNGP/7xj/Wv//qvOnHihFasWKHLly9r6dK
lkqQlS5Zo48aNwfrHH39cVVVVeuaZZ9TY2Ki/+7u/09GjR7Vq1SpJksPh0F//9V/rH/7hH/Tzn/9cx44d05IlS5Sdna358+dHezgAMGRV9U1asatOTf7QZSt8/g6t2FVHwAEiJOpfv7Bw4UJ9+OGH2rx5s3w+n2bMmKGqqqrgBcFnzpyR0/lZxrrjjju0e/dubdq0SU888YRuu+02vf7668rPzw/W/O3f/q0uX76s5cuXq62tTV/96ldVVVUll8vV5/0BIBb0BIwqKhsUbu0NoyvfrFxR2aC78zxMUQHXKerr3MQi1rkBMNy8719U2Y/fumrdTx6ZE/Pf2wOMlJhY5wYAcEXLpcGtoD7YOgD9I9wAwDCYMGZw0+aDrQPQP8INAAyDgskZynK71N/VNA5JWe4rt4UDuD6EGwAYBklOh8rn5UlSn4DT+7h8Xh4XEwMRQLgBgGFSmp+l7YtnyuMOnXryuF3avnimSvOz+tkTwLWI+q3gAIDPlOZn6e48DysUA1FEuAGAYZbkdHC7NxBFTEsBAACrEG4AAIBVCDcAAMAqhBsAAGAVwg0AALAK4QYAAFiFcAMAAKxCuAEAAFYh3AAAAKsQbgAAgFUINwAAwCqEGwAAYBXCDQAAsArhBgAAWIVwAwAArEK4AQAAViHcAAAAqxBuAACAVQg3AADAKoQbAABgFcINAACwCuEGAABYhXADAACsQrgBAABWIdwAAACrEG4AAIBVCDcAAMAqhBsAAGAVwg0AALAK4QYAAFiFcAMAAKxCuAEAAFYh3AAAAKsQbgAAgFUINwAAwCqEGwAAYBXCDQAAsArhBgAAWIVwAwAArEK4AQAAViHcAAAAqxBuAACAVQg3AADAKoQbAABgFcINAACwCuEGAABYhXADAACsQrgBAABWIdwAAACrEG4AAIBVCDcAAMAqhBsAAGAVwg0AALAK4QYAAFglauGmtbVVixYtUlpamtLT07Vs2TJ99NFHA+7T0dGhlStXauzYsbrxxhu1YMECNTc3B59/9913VVZWppycHI0ePVpTp07Vtm3bojUEAAAQh6IWbhYtWqTjx4/rwIED2rdvn371q19p+fLlA+7zne98R5WVldq7d69++ctf6vz58/rmN78ZfL62tlYTJkzQrl27dPz4cT355JPauHGjnn/++WgNAwAAxBmHMcZE+kVPnDihvLw8HTlyRLNnz5YkVVVV6Rvf+IY++OADZWdn99nH7/dr/Pjx2r17t+6//35JUmNjo6ZOnSqv16s5c+aEfa+VK1fqxIkTOnjw4KD7197eLrfbLb/fr7S0tCGMEAAADLfBfn5H5cyN1+tVenp6MNhIUnFxsZxOpw4fPhx2n9raWnV3d6u4uDjYNmXKFOXm5srr9fb7Xn6/XxkZGQP2p7OzU+3t7SEbAACwU1TCjc/n04QJE0LakpOTlZGRIZ/P1+8+KSkpSk9PD2nPzMzsd59Dhw5pz549V53u2rJli9xud3DLyckZ/GAAAEBcuaZws2HDBjkcjgG3xsbGaPU1RH19ve677z6Vl5fra1/72oC1GzdulN/vD25nz54dlj4CAIDhl3wtxWvXrtXDDz88YM0tt9wij8ejlpaWkPZPP/1Ura2t8ng8YffzeDzq6upSW1tbyNmb5ubmPvs0NDRo7ty5Wr58uTZt2nTVfqempio1NfWqdQAAIP5dU7gZP368xo8ff9W6oqIitbW1qba2VrNmzZIkHTx4UIFAQIWFhWH3mTVrlkaNGqXq6motWLBAknTy5EmdOXNGRUVFwbrjx4/rrrvu0kMPPaR//Md/vJbuAwCABBCVu6Uk6etf/7qam5u1Y8cOdXd3a+nSpZo9e7Z2794tSTp37pzmzp2rl19+WQUFBZKkFStWaP/+/dq5c6fS0tK0evVqSVeurZGuTEXdddddKikp0dNPPx18r6SkpEGFrl7cLQUAQPwZ7Of3NZ25uRavvPKKVq1apblz58rpdGrBggV67rnngs93d3f
r5MmT+vjjj4Ntzz77bLC2s7NTJSUlevHFF4PPv/baa/rwww+1a9cu7dq1K9h+880363e/+120hgIAAOJI1M7cxDLO3AAAEH9GdJ0bAACAkUK4AQAAViHcAAAAqxBuAACAVQg3AADAKoQbAABgFcINAACwCuEGAABYhXADAACsQrgBAABWIdwAAACrEG4AAIBVCDcAAMAqhBsAAGAVwg0AALAK4QYAAFiFcAMAAKxCuAEAAFYh3AAAAKsQbgAAgFUINwAAwCqEGwAAYBXCDQAAsArhBgAAWIVwAwAArEK4AQAAViHcAAAAqxBuAACAVQg3AADAKoQbAABgFcINAACwCuEGAABYhXADAACsQrgBAABWIdwAAACrEG4AAIBVCDcAAMAqhBsAAGAVwg0AALAK4QYAAFiFcAMAAKxCuAEAAFYh3AAAAKsQbgAAgFUINwAAwCqEGwAAYBXCDQAAsArhBgAAWIVwAwAArEK4AQAAViHcAAAAqxBuAACAVQg3AADAKoQbAABgFcINAACwCuEGAABYhXADAACsQrgBAABWIdwAAACrEG4AAIBVCDcAAMAqUQs3ra2tWrRokdLS0pSenq5ly5bpo48+GnCfjo4OrVy5UmPHjtWNN96oBQsWqLm5OWztxYsXddNNN8nhcKitrS0KIwAAAPEoauFm0aJFOn78uA4cOKB9+/bpV7/6lZYvXz7gPt/5zndUWVmpvXv36pe//KXOnz+vb37zm2Frly1bpj/+4z+ORtcBAEAccxhjTKRf9MSJE8rLy9ORI0c0e/ZsSVJVVZW+8Y1v6IMPPlB2dnafffx+v8aPH6/du3fr/vvvlyQ1NjZq6tSp8nq9mjNnTrB2+/bt2rNnjzZv3qy5c+fq97//vdLT0wfdv/b2drndbvn9fqWlpV3fYAEAwLAY7Od3VM7ceL1epaenB4ONJBUXF8vpdOrw4cNh96mtrVV3d7eKi4uDbVOmTFFubq68Xm+wraGhQd/73vf08ssvy+kcXPc7OzvV3t4esgEAADtFJdz4fD5NmDAhpC05OVkZGRny+Xz97pOSktLnDExmZmZwn87OTpWVlenpp59Wbm7uoPuzZcsWud3u4JaTk3NtAwIAAHHjmsLNhg0b5HA4BtwaGxuj1Vdt3LhRU6dO1eLFi695P7/fH9zOnj0bpR4CAICRlnwtxWvXrtXDDz88YM0tt9wij8ejlpaWkPZPP/1Ura2t8ng8YffzeDzq6upSW1tbyNmb5ubm4D4HDx7UsWPH9Nprr0mSei8XGjdunJ588klVVFSEfe3U1FSlpqYOZogAACDOXVO4GT9+vMaPH3/VuqKiIrW1tam2tlazZs2SdCWYBAIBFRYWht1n1qxZGjVqlKqrq7VgwQJJ0smTJ3XmzBkVFRVJkv7jP/5Dn3zySXCfI0eO6C//8i/15ptv6tZbb72WoQAAAEtdU7gZrKlTp6q0tFSPPPKIduzYoe7ubq1atUrf+ta3gndKnTt3TnPnztXLL7+sgoICud1uLVu2TGvWrFFGRobS0tK0evVqFRUVBe+U+mKAuXDhQvD9ruVuKQAAYK+ohBtJeuWVV7Rq1SrNnTtXTqdTCxYs0HPPPRd8vru7WydPntTHH38cbHv22WeDtZ2dnSopKdGLL74YrS4CAAALRWWdm1jHOjfA4PUEjGpOtarlUocmjHGpYHKGkpyOke4WgAQ02M/vqJ25ARD/quqbVFHZoCZ/R7Aty+1S+bw8leZnjWDPAKB/fHEmgLCq6pu0YlddSLCRJJ+/Qyt21amqvmmEegYAAyPcAOijJ2BUUdmgcHPWvW0VlQ3qCSTcrDaAOEC4AdBHzanWPmdsPs9IavJ3qOZU6/B1CgAGiXADoI+WS/0Hm6HUAcBwItwA6GPCGFdE6wBgOBFuAPRRMDlDWW6X+rvh26Erd00VTM4Yzm4BwKAQbgD0keR0qHxeniT1CTi9j8vn5bHeDYCYRLgBEFZpfpa2L54pjzt06snjdmn
74pmscwMgZrGIH4B+leZn6e48DysUXwWrOAOxhXADYEBJToeKbh070t2IWaziDMQepqUAYIhYxRmITYQbABgCVnEGYhfhBgCGgFWcgdhFuAGAIWAVZyB2EW4AYAhYxRmIXYQbABgCVnEGYhfhBlbqCRh537+on71zTt73L3JRJyKOVZyB2MU6N7AO645guPSu4vzF3zcPv2/AiHIYYxLuf2nb29vldrvl9/uVlpY20t1BBPWuO/LFX+re/3fmawMQDaxQDAyPwX5+c+YG1rjauiMOXVl35O48Dx88iChWcQZiC9fcwBqsOwIAkAg3sAjrjgAAJMINLMK6IwAAiXADi7DuCABAItzAIqw7AgCQCDewTO+6Ix536NSTx+3iNnAASBDcCg7rlOZn6e48D+uOAECCItzASqw7AgCJi2kpAABgFcINAACwCuEGAABYhXADAACsQrgBAABWIdwAAACrEG4AAIBVCDcAAMAqhBsAAGCVhFyh2BgjSWpvbx/hngAAgMHq/dzu/RzvT0KGm0uXLkmScnJyRrgnAADgWl26dElut7vf5x3mavHHQoFAQCdPnlReXp7Onj2rtLS0ke4SrqK9vV05OTkcrzjB8YofHKv4kujHyxijS5cuKTs7W05n/1fWJOSZG6fTqYkTJ0qS0tLSEvIXJF5xvOILxyt+cKziSyIfr4HO2PTigmIAAGAVwg0AALBKwoab1NRUlZeXKzU1daS7gkHgeMUXjlf84FjFF47X4CTkBcUAAMBeCXvmBgAA2IlwAwAArEK4AQAAViHcAAAAq1gdblpbW7Vo0SKlpaUpPT1dy5Yt00cffTTgPh0dHVq5cqXGjh2rG2+8UQsWLFBzc3PY2osXL+qmm26Sw+FQW1tbFEaQOKJxrN59912VlZUpJydHo0eP1tSpU7Vt27ZoD8VKL7zwgiZNmiSXy6XCwkLV1NQMWL93715NmTJFLpdL06ZN0/79+0OeN8Zo8+bNysrK0ujRo1VcXKz33nsvmkNIKJE8Xt3d3Vq/fr2mTZumG264QdnZ2VqyZInOnz8f7WEkjEj/fX3eo48+KofDoa1bt0a41zHOWKy0tNRMnz7dvPXWW+bNN980f/iHf2jKysoG3OfRRx81OTk5prq62hw9etTMmTPH3HHHHWFr77vvPvP1r3/dSDK///3vozCCxBGNY/Uv//Iv5rHHHjP/+7//a95//33zb//2b2b06NHmhz/8YbSHY5VXX33VpKSkmJdeeskcP37cPPLIIyY9Pd00NzeHrf/Nb35jkpKSzPe//33T0NBgNm3aZEaNGmWOHTsWrPmnf/on43a7zeuvv27effddc++995rJkyebTz75ZLiGZa1IH6+2tjZTXFxs9uzZYxobG43X6zUFBQVm1qxZwzksa0Xj76vXT3/6UzN9+nSTnZ1tnn322SiPJLZYG24aGhqMJHPkyJFg23//938bh8Nhzp07F3aftrY2M2rUKLN3795g24kTJ4wk4/V6Q2pffPFFc+edd5rq6mrCzXWK9rH6vG9/+9vmz/7szyLX+QRQUFBgVq5cGXzc09NjsrOzzZYtW8LWP/DAA+aee+4JaSssLDR/9Vd/ZYwxJhAIGI/HY55++ung821tbSY1NdX85Cc/icIIEkukj1c4NTU1RpI5ffp0ZDqdwKJ1vD744AMzceJEU19fb26++eaECzfWTkt5vV6lp6dr9uzZwbbi4mI5nU4dPnw47D61tbXq7u5WcXFxsG3KlCnKzc2V1+sNtjU0NOh73/ueXn755QG/uAuDE81j9UV+v18ZGRmR67zlurq6VFtbG/JzdjqdKi4u7vfn7PV6Q+olqaSkJFh/6tQp+Xy+kBq3263CwsIBjx2uLhrHKxy/3y+Hw6H09PSI9DtRRet4BQIBPfjgg1q3bp1uv/326HQ+xln7yezz+TRhwoSQtuTkZGVkZMjn8/W7T0pKSp8/2MzMzOA+nZ2dKisr09NPP63c3Nyo9D3RROtYfdG
hQ4e0Z88eLV++PCL9TgQXLlxQT0+PMjMzQ9oH+jn7fL4B63v/vZbXxOBE43h9UUdHh9avX6+ysrKE/eLGSInW8XrqqaeUnJysxx57LPKdjhNxF242bNggh8Mx4NbY2Bi199+4caOmTp2qxYsXR+09bDHSx+rz6uvrdd9996m8vFxf+9rXhuU9Adt0d3frgQcekDFG27dvH+nuIIza2lpt27ZNO3fulMPhGOnujJjkke7AtVq7dq0efvjhAWtuueUWeTwetbS0hLR/+umnam1tlcfjCbufx+NRV1eX2traQs4INDc3B/c5ePCgjh07ptdee03Slbs+JGncuHF68sknVVFRMcSR2Wekj1WvhoYGzZ07V8uXL9emTZuGNJZENW7cOCUlJfW5YzDcz7mXx+MZsL733+bmZmVlZYXUzJgxI4K9TzzROF69eoPN6dOndfDgQc7aREA0jtebb76plpaWkJmFnp4erV27Vlu3btXvfve7yA4iVo30RT/R0nuR6tGjR4Ntv/jFLwZ1keprr70WbGtsbAy5SPX//u//zLFjx4LbSy+9ZCSZQ4cO9Xt1OwYWrWNljDH19fVmwoQJZt26ddEbgOUKCgrMqlWrgo97enrMxIkTB7zg8c///M9D2oqKivpcUPyDH/wg+Lzf7+eC4giJ9PEyxpiuri4zf/58c/vtt5uWlpbodDxBRfp4XbhwIeQz6tixYyY7O9usX7/eNDY2Rm8gMcbacGPMlduL/+RP/sQcPnzY/PrXvza33XZbyO3FH3zwgfnyl79sDh8+HGx79NFHTW5urjl48KA5evSoKSoqMkVFRf2+xxtvvMHdUhEQjWN17NgxM378eLN48WLT1NQU3PiP87V59dVXTWpqqtm5c6dpaGgwy5cvN+np6cbn8xljjHnwwQfNhg0bgvW/+c1vTHJysvnBD35gTpw4YcrLy8PeCp6enm5+9rOfmd/+9rfmvvvu41bwCIn08erq6jL33nuvuemmm8w777wT8rfU2dk5ImO0STT+vr4oEe+WsjrcXLx40ZSVlZkbb7zRpKWlmaVLl5pLly4Fnz916pSRZN54441g2yeffGK+/e1vmz/4gz8wX/rSl8xf/MVfmKampn7fg3ATGdE4VuXl5UZSn+3mm28expHZ4Yc//KHJzc01KSkppqCgwLz11lvB5+68807z0EMPhdT/+7//u/mjP/ojk5KSYm6//XbzX//1XyHPBwIB893vftdkZmaa1NRUM3fuXHPy5MnhGEpCiOTx6v3bC7d9/u8RQxfpv68vSsRw4zDm/180AgAAYIG4u1sKAABgIIQbAABgFcINAACwCuEGAABYhXADAACsQrgBAABWIdwAAACrEG4AAIBVCDcAAMAqhBsAAGAVwg0AALAK4QYAAFjl/wFvqYjCOnCfGwAAAABJRU5ErkJggg==", 116 "text/plain": [ 117 "<Figure size 640x480 with 1 Axes>" 118 ] 119 }, 120 "metadata": {}, 121 "output_type": "display_data" 122 } 123 ], 124 "source": [ 125 "import matplotlib.pyplot as plt\n", 126 "def plotEmbedding(embedded):\n", 127 " return plt.scatter(x=embedded[:,0], y=embedded[:,1])\n", 128 "\n", 129 "plotEmbedding(embedded)" 130 ] 131 }, 132 { 133 "cell_type": "markdown", 134 "metadata": {}, 135 "source": [ 136 "There is a built in string lookup layer so you don't have to use your own dictionary for this:" 137 ] 138 }, 139 { 140 "cell_type": 
"code", 141 "execution_count": 6, 142 "metadata": {}, 143 "outputs": [ 144 { 145 "data": { 146 "text/plain": [ 147 "<tf.Tensor: shape=(2,), dtype=int64, numpy=array([0, 4])>" 148 ] 149 }, 150 "execution_count": 6, 151 "metadata": {}, 152 "output_type": "execute_result" 153 } 154 ], 155 "source": [ 156 "str_lookup = keras.layers.StringLookup()\n", 157 "str_lookup.adapt(arrText)\n", 158 "# 0 is reserved for unrecognized stuff. \n", 159 "str_lookup(['rnd', 'weird'])" 160 ] 161 }, 162 { 163 "cell_type": "markdown", 164 "metadata": {}, 165 "source": [ 166 "Textvectorization is just a better string lookup that removes punctuation, sets lowercase, and splits by whitespace. \n", 167 "\n", 168 "To preserve case and punctuation set Standardize=None\n", 169 "\n", 170 "Unknown words are now encoded as 1" 171 ] 172 }, 173 { 174 "cell_type": "code", 175 "execution_count": 7, 176 "metadata": {}, 177 "outputs": [], 178 "source": [ 179 "training = ['the distant realm of Lumina, where the sky shimmered with hues unknown to the mundane world, an ancient prophecy began to unfold. 
The Great Tree of Elaria, standing tall and majestic in the heart of the enchanted forest, whispered secrets to']\n", 180 "\n", 181 "text_vec = keras.layers.TextVectorization()\n", 182 "text_vec.adapt(training)" 183 ] 184 }, 185 { 186 "cell_type": "code", 187 "execution_count": 8, 188 "metadata": {}, 189 "outputs": [ 190 { 191 "data": { 192 "text/plain": [ 193 "<tf.Tensor: shape=(1, 6), dtype=int64, numpy=array([[29, 17, 4, 21, 8, 2]])>" 194 ] 195 }, 196 "execution_count": 8, 197 "metadata": {}, 198 "output_type": "execute_result" 199 } 200 ], 201 "source": [ 202 "out = text_vec(['distant realm of Lumina, where the '])\n", 203 "out" 204 ] 205 }, 206 { 207 "cell_type": "code", 208 "execution_count": 9, 209 "metadata": {}, 210 "outputs": [ 211 { 212 "data": { 213 "text/plain": [ 214 "array(['', '[UNK]', 'the', 'to', 'of', 'world', 'with', 'whispered',\n", 215 " 'where', 'unknown', 'unfold', 'tree', 'tall', 'standing', 'sky',\n", 216 " 'shimmered', 'secrets', 'realm', 'prophecy', 'mundane', 'majestic',\n", 217 " 'lumina', 'in', 'hues', 'heart', 'great', 'forest', 'enchanted',\n", 218 " 'elaria', 'distant', 'began', 'and', 'ancient', 'an'], dtype='<U9')" 219 ] 220 }, 221 "execution_count": 9, 222 "metadata": {}, 223 "output_type": "execute_result" 224 } 225 ], 226 "source": [ 227 "vocab = np.array(text_vec.get_vocabulary())\n", 228 "vocab" 229 ] 230 }, 231 { 232 "cell_type": "code", 233 "execution_count": 10, 234 "metadata": {}, 235 "outputs": [ 236 { 237 "data": { 238 "text/plain": [ 239 "'distant realm of lumina where the'" 240 ] 241 }, 242 "execution_count": 10, 243 "metadata": {}, 244 "output_type": "execute_result" 245 } 246 ], 247 "source": [ 248 "# Turn each id sequence back into words, skipping id 0 (the '' padding slot in vocab).\n", 249 "decoded_texts = [[vocab[token_id] for token_id in id_row if token_id != 0]\n", 250 "                 for id_row in out]\n", 251 "' '.join(decoded_texts[0])" 252 ] 253 }, 254 { 255 "cell_type": "code", 256 "execution_count": 16, 257 "metadata": {}, 258 "outputs": [ 259 { 260 "data": { 261 "text/plain": [ 262 "<tf.Tensor: 
shape=(1, 17), dtype=int64, numpy=array([[0, 3, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])>" 263 ] 264 }, 265 "execution_count": 16, 266 "metadata": {}, 267 "output_type": "execute_result" 268 } 269 ], 270 "source": [ 271 "# By default (output_mode='int') the layer returns a sequence of integer token ids, not embeddings.\n", 272 "# 'count' returns one vector per input holding how often each vocabulary word occurs, 'multi_hot' the\n", 273 "# binary has/does-not-have version, and 'tf_idf' downweights common words and upweights rare ones.\n", 274 "\n", 275 "vect = keras.layers.TextVectorization(output_mode='count')\n", 276 "st = 'what would that seem to be what would why where when how they are not there when you try that would'\n", 277 "vect.adapt([st])  # adapt on a batch (list) of strings, matching the vect([st]) call below\n", 278 "\n", 279 "vect([st])" 280 ] 281 } 282 ], 283 "metadata": { 284 "kernelspec": { 285 "display_name": ".venv", 286 "language": "python", 287 "name": "python3" 288 }, 289 "language_info": { 290 "codemirror_mode": { 291 "name": "ipython", 292 "version": 3 293 }, 294 "file_extension": ".py", 295 "mimetype": "text/x-python", 296 "name": "python", 297 "nbconvert_exporter": "python", 298 "pygments_lexer": "ipython3", 299 "version": "3.11.2" 300 } 301 }, 302 "nbformat": 4, 303 "nbformat_minor": 2 304 }