NNSpamFilter.ipynb (20441B)
# %% [markdown]
# # Neural-network spam filter
#
# This dataset is from Kaggle:
#
# https://www.kaggle.com/datasets/abdallahwagih/spam-emails
#
# Download it and save the CSV as `../datasets/spamEmails/emails.csv`
# (the path read by the load cell below) before running the notebook.

# %% [markdown]
# ## 86% accuracy using a neural network
#
# Technically, this is better than my Naive Bayes approach, but not even close
# to the sklearn Naive Bayes.
#
# This was a fun project, but a stupid idea for anyone being serious.
#
# NOTE: the 86% figure comes from the run saved with the original notebook;
# re-run all cells to refresh it.

# %%
import re

import numpy as np
import pandas as pd
import tensorflow as tf  # kept from the original notebook; not referenced in the visible cells
import keras
from keras.layers import BatchNormalization
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# %%
MSG_LEN = 5000  # every message is truncated / space-padded to this many characters
SEED = 10       # the original random_state, now also used to seed Keras

# Seed Python, NumPy and the Keras backend so that weight initialisation and
# batch shuffling repeat from one run to the next.
keras.utils.set_random_seed(SEED)

# %%
emails = pd.read_csv('../datasets/spamEmails/emails.csv')
assert {'Message', 'Category'} <= set(emails.columns), "unexpected CSV schema"


# %%
def cut(text, msg_len=MSG_LEN):
    """Encode one message as a fixed-length list of character codes.

    Every character that is not an ASCII letter or whitespace is removed, the
    text is lower-cased, truncated to ``msg_len`` characters and right-padded
    with spaces, and each remaining character is mapped to ``ord(char)``.

    Args:
        text: Raw message. A non-string value (for example a missing cell
            read as NaN) is treated as an empty message instead of raising.
        msg_len: Length of the returned list; defaults to ``MSG_LEN``.

    Returns:
        A list of exactly ``msg_len`` integers.
    """
    if not isinstance(text, str):
        text = ''
    text = re.sub(r'[^a-zA-Z\s]', '', text).lower()
    text = text[:msg_len].ljust(msg_len)
    return [ord(char) for char in text]


# %%
# X: integer matrix with one row per message and MSG_LEN columns (unscaled).
# y: boolean vector, True where the category is 'spam'.
X = np.array(emails['Message'].apply(cut).tolist())
y = (emails['Category'] == 'spam').values
assert X.shape == (len(emails), MSG_LEN)

# %%
# First split: 75% train / 25% hold-out (train_test_split's default test_size).
# Second split: the hold-out is divided again into 75% test / 25% validation.
X_train_raw, X_holdout_raw, y_train, y_holdout = train_test_split(
    X, y, random_state=SEED
)
X_test_raw, X_val_raw, y_test, y_val = train_test_split(
    X_holdout_raw, y_holdout, random_state=SEED
)
assert len(X_train_raw) + len(X_val_raw) + len(X_test_raw) == len(X)

# %%
# Fit the scaler on the training rows only. The original notebook called
# fit_transform on the full matrix before splitting, which let validation and
# test statistics leak into training.
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_val = scaler.transform(X_val_raw)
X_test = scaler.transform(X_test_raw)

# %%
# Fully connected classifier: one input feature per character position,
# four ReLU layers of decreasing width, and a single sigmoid output unit.
model = keras.Sequential(layers=[
    keras.layers.Input(shape=X_train.shape[1:]),

    keras.layers.Dense(512, activation='relu'),
    BatchNormalization(),

    keras.layers.Dense(256, activation='relu'),
    BatchNormalization(),

    keras.layers.Dense(128, activation='relu'),
    BatchNormalization(),

    keras.layers.Dense(64, activation='relu'),

    keras.layers.Dense(1, activation='sigmoid'),
])
# %%
# Show the layer-by-layer architecture. The output saved with the original
# notebook listed 2,736,641 parameters in total (2,734,849 trainable,
# 1,792 non-trainable).
model.summary()

# %%
# Adam with its default settings; binary cross-entropy is the loss for the
# network's single output unit, and accuracy is the metric reported per epoch.
optimizer = keras.optimizers.Adam()
model.compile(
    optimizer=optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
# %%
EPOCHS = 10
BATCH_SIZE = 16

# Keep the History object: its per-epoch metrics stay available for inspection
# and its repr is no longer echoed as the cell's output.
#
# In the run saved with the original notebook, val_loss was lowest at epoch 4
# (0.2420) and had risen to 0.2816 by epoch 10, so early stopping is worth
# considering if this model is developed further.
history = model.fit(
    X_train,
    y_train,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_data=(X_val, y_val),
)

# %%
# return_dict=True labels each value instead of returning a bare
# [loss, accuracy] list. The saved run reported loss 0.3342 and
# accuracy 0.8688 on the test split.
model.evaluate(X_test, y_test, return_dict=True)