Created Neural Network for spam filtering achieving 86 accuracy. - machinelearning - Unnamed repository; edit this file 'description' to name the repository.

commit 0f032ea4bd7948aa203143d9f464b8aeb311ca42
parent 0fdeb834773677c2c28705ff33ae8fcc7c5340da
Author: Andrew <andrewlaack1@gmail.com>
Date:   Tue, 25 Jun 2024 17:31:17 -0500

Created Neural Network for spam filtering achieving 86 accuracy.

Diffstat:
A spamFilter/NNSpamFilter.ipynb  | 342 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

1 file changed, 342 insertions(+), 0 deletions(-)
diff --git a/spamFilter/NNSpamFilter.ipynb b/spamFilter/NNSpamFilter.ipynb
@@ -0,0 +1,342 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This dataset is from kaggle:\n",
+    "\n",
+    "https://www.kaggle.com/datasets/abdallahwagih/spam-emails\n",
+    "Download this, move it to the correct location and then start working. "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "86% Accuracy Using Neural Network\n",
+    "\n",
+    "Technically, this is better than my Naive Bayes approach, but not even close to the sklearn Naive Bayes. \n",
+    "\n",
+    "This was a fun project, but stupid idea for anyone being serious."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 371,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "\n",
+    "emails = pd.read_csv('../datasets/spamEmails/emails.csv')\n",
+    "\n",
+    "MSG_LEN = 5000"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 372,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import re\n",
+    "import numpy as np\n",
+    "X = emails['Message']\n",
+    "y = emails['Category']\n",
+    "\n",
+    "y = y == 'spam'\n",
+    "\n",
+    "def cut(text):\n",
+    "    text = re.sub(r'[^a-zA-Z\\s]', '',text)\n",
+    "    text = text.lower()\n",
+    "    text = text[:MSG_LEN]\n",
+    "    text = text.ljust(MSG_LEN)\n",
+    "    text = [ord(char) for char in text]\n",
+    "    return text\n",
+    "\n",
+    "\n",
+    "X = X.apply(cut)\n",
+    "\n",
+    "X = np.array(X.tolist())\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 373,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.preprocessing import StandardScaler\n",
+    "\n",
+    "scaler = StandardScaler()\n",
+    "X = scaler.fit_transform(X)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 374,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.model_selection import train_test_split\n",
+    "import numpy as np\n",
+    "\n",
+    "\n",
+    "y = y.values\n",
+    "X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=10)\n",
+    "X_test, X_val, y_test, y_val = train_test_split(X_test,y_test, random_state=10)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 375,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import tensorflow as tf\n",
+    "import keras\n",
+    "from keras.layers import BatchNormalization \n",
+    "\n",
+    "model = keras.Sequential(layers=[\n",
+    "    keras.layers.Input(shape=X_train[0].shape),\n",
+    "\n",
+    "    keras.layers.Dense(512, activation='relu'),\n",
+    "    BatchNormalization(),\n",
+    "\n",
+    "    keras.layers.Dense(256, activation='relu'),\n",
+    "    BatchNormalization(),\n",
+    "\n",
+    "    keras.layers.Dense(128, activation='relu'),\n",
+    "    BatchNormalization(),\n",
+    "\n",
+    "    keras.layers.Dense(64, activation='relu'),\n",
+    "\n",
+    "    keras.layers.Dense(1, activation='sigmoid')\n",
+    "])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 376,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-weight: bold\">Model: \"sequential_34\"</span>\n",
+       "</pre>\n"
+      ],
+      "text/plain": [
+       "\u001b[1mModel: \"sequential_34\"\u001b[0m\n"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓\n",
+       "┃<span style=\"font-weight: bold\"> Layer (type)                    </span>┃<span style=\"font-weight: bold\"> Output Shape           </span>┃<span style=\"font-weight: bold\">       Param # </span>┃\n",
+       "┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩\n",
+       "│ dense_204 (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">Dense</span>)               │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">512</span>)            │     <span style=\"color: #00af00; text-decoration-color: #00af00\">2,560,512</span> │\n",
+       "├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
+       "│ batch_normalization_80          │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">512</span>)            │         <span style=\"color: #00af00; text-decoration-color: #00af00\">2,048</span> │\n",
+       "│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">BatchNormalization</span>)            │                        │               │\n",
+       "├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
+       "│ dense_205 (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">Dense</span>)               │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">256</span>)            │       <span style=\"color: #00af00; text-decoration-color: #00af00\">131,328</span> │\n",
+       "├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
+       "│ batch_normalization_81          │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">256</span>)            │         <span style=\"color: #00af00; text-decoration-color: #00af00\">1,024</span> │\n",
+       "│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">BatchNormalization</span>)            │                        │               │\n",
+       "├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
+       "│ dense_206 (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">Dense</span>)               │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">128</span>)            │        <span style=\"color: #00af00; text-decoration-color: #00af00\">32,896</span> │\n",
+       "├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
+       "│ batch_normalization_82          │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">128</span>)            │           <span style=\"color: #00af00; text-decoration-color: #00af00\">512</span> │\n",
+       "│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">BatchNormalization</span>)            │                        │               │\n",
+       "├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
+       "│ dense_207 (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">Dense</span>)               │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">64</span>)             │         <span style=\"color: #00af00; text-decoration-color: #00af00\">8,256</span> │\n",
+       "├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
+       "│ dense_208 (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">Dense</span>)               │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">1</span>)              │            <span style=\"color: #00af00; text-decoration-color: #00af00\">65</span> │\n",
+       "└─────────────────────────────────┴────────────────────────┴───────────────┘\n",
+       "</pre>\n"
+      ],
+      "text/plain": [
+       "┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓\n",
+       "┃\u001b[1m \u001b[0m\u001b[1mLayer (type)                   \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mOutput Shape          \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1m      Param #\u001b[0m\u001b[1m \u001b[0m┃\n",
+       "┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩\n",
+       "│ dense_204 (\u001b[38;5;33mDense\u001b[0m)               │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m512\u001b[0m)            │     \u001b[38;5;34m2,560,512\u001b[0m │\n",
+       "├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
+       "│ batch_normalization_80          │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m512\u001b[0m)            │         \u001b[38;5;34m2,048\u001b[0m │\n",
+       "│ (\u001b[38;5;33mBatchNormalization\u001b[0m)            │                        │               │\n",
+       "├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
+       "│ dense_205 (\u001b[38;5;33mDense\u001b[0m)               │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m256\u001b[0m)            │       \u001b[38;5;34m131,328\u001b[0m │\n",
+       "├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
+       "│ batch_normalization_81          │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m256\u001b[0m)            │         \u001b[38;5;34m1,024\u001b[0m │\n",
+       "│ (\u001b[38;5;33mBatchNormalization\u001b[0m)            │                        │               │\n",
+       "├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
+       "│ dense_206 (\u001b[38;5;33mDense\u001b[0m)               │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m128\u001b[0m)            │        \u001b[38;5;34m32,896\u001b[0m │\n",
+       "├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
+       "│ batch_normalization_82          │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m128\u001b[0m)            │           \u001b[38;5;34m512\u001b[0m │\n",
+       "│ (\u001b[38;5;33mBatchNormalization\u001b[0m)            │                        │               │\n",
+       "├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
+       "│ dense_207 (\u001b[38;5;33mDense\u001b[0m)               │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m64\u001b[0m)             │         \u001b[38;5;34m8,256\u001b[0m │\n",
+       "├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
+       "│ dense_208 (\u001b[38;5;33mDense\u001b[0m)               │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m1\u001b[0m)              │            \u001b[38;5;34m65\u001b[0m │\n",
+       "└─────────────────────────────────┴────────────────────────┴───────────────┘\n"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-weight: bold\"> Total params: </span><span style=\"color: #00af00; text-decoration-color: #00af00\">2,736,641</span> (10.44 MB)\n",
+       "</pre>\n"
+      ],
+      "text/plain": [
+       "\u001b[1m Total params: \u001b[0m\u001b[38;5;34m2,736,641\u001b[0m (10.44 MB)\n"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-weight: bold\"> Trainable params: </span><span style=\"color: #00af00; text-decoration-color: #00af00\">2,734,849</span> (10.43 MB)\n",
+       "</pre>\n"
+      ],
+      "text/plain": [
+       "\u001b[1m Trainable params: \u001b[0m\u001b[38;5;34m2,734,849\u001b[0m (10.43 MB)\n"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-weight: bold\"> Non-trainable params: </span><span style=\"color: #00af00; text-decoration-color: #00af00\">1,792</span> (7.00 KB)\n",
+       "</pre>\n"
+      ],
+      "text/plain": [
+       "\u001b[1m Non-trainable params: \u001b[0m\u001b[38;5;34m1,792\u001b[0m (7.00 KB)\n"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
+   "source": [
+    "model.summary()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 377,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "optimizer = keras.optimizers.Adam()\n",
+    "\n",
+    "model.compile(loss='binary_crossentropy', metrics=['accuracy'], optimizer=optimizer)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 378,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Epoch 1/10\n",
+      "\u001b[1m262/262\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m7s\u001b[0m 19ms/step - accuracy: 0.8253 - loss: 0.3683 - val_accuracy: 0.8883 - val_loss: 0.2601\n",
+      "Epoch 2/10\n",
+      "\u001b[1m262/262\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m5s\u001b[0m 18ms/step - accuracy: 0.8861 - loss: 0.2529 - val_accuracy: 0.8825 - val_loss: 0.2631\n",
+      "Epoch 3/10\n",
+      "\u001b[1m262/262\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m5s\u001b[0m 18ms/step - accuracy: 0.8820 - loss: 0.2517 - val_accuracy: 0.8911 - val_loss: 0.2546\n",
+      "Epoch 4/10\n",
+      "\u001b[1m262/262\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m5s\u001b[0m 19ms/step - accuracy: 0.8981 - loss: 0.2273 - val_accuracy: 0.8940 - val_loss: 0.2420\n",
+      "Epoch 5/10\n",
+      "\u001b[1m262/262\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m5s\u001b[0m 18ms/step - accuracy: 0.9014 - loss: 0.2243 - val_accuracy: 0.8883 - val_loss: 0.2450\n",
+      "Epoch 6/10\n",
+      "\u001b[1m262/262\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m5s\u001b[0m 20ms/step - accuracy: 0.8987 - loss: 0.2212 - val_accuracy: 0.8911 - val_loss: 0.2732\n",
+      "Epoch 7/10\n",
+      "\u001b[1m262/262\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m5s\u001b[0m 20ms/step - accuracy: 0.9030 - loss: 0.2168 - val_accuracy: 0.8940 - val_loss: 0.2593\n",
+      "Epoch 8/10\n",
+      "\u001b[1m262/262\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m5s\u001b[0m 19ms/step - accuracy: 0.9063 - loss: 0.2100 - val_accuracy: 0.8883 - val_loss: 0.2651\n",
+      "Epoch 9/10\n",
+      "\u001b[1m262/262\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m5s\u001b[0m 18ms/step - accuracy: 0.9045 - loss: 0.2176 - val_accuracy: 0.8825 - val_loss: 0.2709\n",
+      "Epoch 10/10\n",
+      "\u001b[1m262/262\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m5s\u001b[0m 18ms/step - accuracy: 0.9090 - loss: 0.2078 - val_accuracy: 0.8911 - val_loss: 0.2816\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "<keras.src.callbacks.history.History at 0x7f3de0bfd290>"
+      ]
+     },
+     "execution_count": 378,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "model.fit(X_train,y_train, epochs=10, batch_size=16, validation_data=(X_val,y_val))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 379,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\u001b[1m33/33\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 3ms/step - accuracy: 0.8616 - loss: 0.3335\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "[0.33415472507476807, 0.8687739372253418]"
+      ]
+     },
+     "execution_count": 379,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "model.evaluate(X_test, y_test)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

	machinelearning Unnamed repository; edit this file 'description' to name the repository.
	Log \| Files \| Refs