commit 0f032ea4bd7948aa203143d9f464b8aeb311ca42
parent 0fdeb834773677c2c28705ff33ae8fcc7c5340da
Author: Andrew <andrewlaack1@gmail.com>
Date: Tue, 25 Jun 2024 17:31:17 -0500
Created Neural Network for spam filtering achieving 86% accuracy.
Diffstat:
1 file changed, 342 insertions(+), 0 deletions(-)
diff --git a/spamFilter/NNSpamFilter.ipynb b/spamFilter/NNSpamFilter.ipynb
@@ -0,0 +1,342 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "This dataset is from Kaggle:\n",
+ "\n",
+ "https://www.kaggle.com/datasets/abdallahwagih/spam-emails\n",
+ "\n",
+ "Download it and save it as `../datasets/spamEmails/emails.csv` (the path the first code cell reads) before running the notebook."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "86% Accuracy Using a Neural Network\n",
+ "\n",
+ "Technically, this is better than my own Naive Bayes approach, but not even close to sklearn's Naive Bayes.\n",
+ "\n",
+ "This was a fun project, but feeding raw character codes into a dense network is a stupid idea for anyone being serious about spam filtering."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 371,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "\n",
+ "# Fixed number of characters every message is truncated or space-padded to.\n",
+ "MSG_LEN = 5000\n",
+ "\n",
+ "# Kaggle spam-emails CSV; later cells read its 'Message' and 'Category' columns.\n",
+ "emails = pd.read_csv('../datasets/spamEmails/emails.csv')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 372,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import re\n",
+ "import numpy as np\n",
+ "# Raw message text is the feature column; the target is True where Category is 'spam'.\n",
+ "X = emails['Message']\n",
+ "y = emails['Category'] == 'spam'\n",
+ "\n",
+ "# Compiled once: matches every character that is neither an ASCII letter nor whitespace.\n",
+ "_NON_LETTER_OR_SPACE = re.compile(r'[^a-zA-Z\\s]')\n",
+ "\n",
+ "def cut(text, length=None):\n",
+ "    \"\"\"Encode a message as a fixed-length list of character codes.\n",
+ "\n",
+ "    Characters that are neither ASCII letters nor whitespace are removed,\n",
+ "    the remainder is lower-cased, and the result is truncated or\n",
+ "    right-padded with spaces to `length` characters.\n",
+ "\n",
+ "    Args:\n",
+ "        text: Message string to encode.\n",
+ "        length: Number of characters to keep. When omitted, the\n",
+ "            notebook-wide MSG_LEN is used.\n",
+ "\n",
+ "    Returns:\n",
+ "        A list of `length` integers, one `ord()` value per character.\n",
+ "    \"\"\"\n",
+ "    if length is None:\n",
+ "        length = MSG_LEN\n",
+ "    cleaned = _NON_LETTER_OR_SPACE.sub('', text).lower()\n",
+ "    # Truncate first, then pad, so every row has the same width.\n",
+ "    fixed_width = cleaned[:length].ljust(length)\n",
+ "    return [ord(char) for char in fixed_width]\n",
+ "\n",
+ "\n",
+ "# Encode every message with cut() and stack the equal-length rows into a 2-D array.\n",
+ "X = np.array([cut(message) for message in X])\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 373,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn.model_selection import train_test_split\n",
+ "import numpy as np\n",
+ "\n",
+ "# Split BEFORE scaling so validation/test rows never influence the scaler\n",
+ "# statistics. Nothing is rebound in place, so this cell can be re-run safely.\n",
+ "y_all = y.values\n",
+ "\n",
+ "# First split: train vs. hold-out (train_test_split's default test_size).\n",
+ "X_train_raw, X_holdout_raw, y_train, y_holdout = train_test_split(X, y_all, random_state=10)\n",
+ "# Second split: the hold-out part is divided into the test and validation sets.\n",
+ "X_test_raw, X_val_raw, y_test, y_val = train_test_split(X_holdout_raw, y_holdout, random_state=10)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 374,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from sklearn.preprocessing import StandardScaler\n",
+ "\n",
+ "# Learn mean/variance from the training rows only, then apply that same\n",
+ "# transform to the validation and test rows.\n",
+ "scaler = StandardScaler()\n",
+ "X_train = scaler.fit_transform(X_train_raw)\n",
+ "X_val = scaler.transform(X_val_raw)\n",
+ "X_test = scaler.transform(X_test_raw)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 375,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import tensorflow as tf\n",
+ "import keras\n",
+ "from keras.layers import BatchNormalization \n",
+ "\n",
+ "# Seed Python, NumPy and the backend RNGs so weight initialisation and batch\n",
+ "# shuffling repeat across Restart & Run All (some GPU kernels may still vary).\n",
+ "keras.utils.set_random_seed(10)\n",
+ "\n",
+ "# Funnel of fully connected layers ending in one sigmoid unit that outputs P(spam).\n",
+ "model = keras.Sequential(layers=[\n",
+ "    # One input feature per character position of the encoded message.\n",
+ "    keras.layers.Input(shape=X_train.shape[1:]),\n",
+ "\n",
+ "    keras.layers.Dense(512, activation='relu'),\n",
+ "    BatchNormalization(),\n",
+ "\n",
+ "    keras.layers.Dense(256, activation='relu'),\n",
+ "    BatchNormalization(),\n",
+ "\n",
+ "    keras.layers.Dense(128, activation='relu'),\n",
+ "    BatchNormalization(),\n",
+ "\n",
+ "    keras.layers.Dense(64, activation='relu'),\n",
+ "\n",
+ "    keras.layers.Dense(1, activation='sigmoid'),\n",
+ "])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 376,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-weight: bold\">Model: \"sequential_34\"</span>\n",
+ "</pre>\n"
+ ],
+ "text/plain": [
+ "\u001b[1mModel: \"sequential_34\"\u001b[0m\n"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓\n",
+ "┃<span style=\"font-weight: bold\"> Layer (type) </span>┃<span style=\"font-weight: bold\"> Output Shape </span>┃<span style=\"font-weight: bold\"> Param # </span>┃\n",
+ "┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩\n",
+ "│ dense_204 (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">Dense</span>) │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">512</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">2,560,512</span> │\n",
+ "├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
+ "│ batch_normalization_80 │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">512</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">2,048</span> │\n",
+ "│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">BatchNormalization</span>) │ │ │\n",
+ "├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
+ "│ dense_205 (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">Dense</span>) │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">256</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">131,328</span> │\n",
+ "├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
+ "│ batch_normalization_81 │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">256</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">1,024</span> │\n",
+ "│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">BatchNormalization</span>) │ │ │\n",
+ "├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
+ "│ dense_206 (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">Dense</span>) │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">128</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">32,896</span> │\n",
+ "├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
+ "│ batch_normalization_82 │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">128</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">512</span> │\n",
+ "│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">BatchNormalization</span>) │ │ │\n",
+ "├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
+ "│ dense_207 (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">Dense</span>) │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">64</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">8,256</span> │\n",
+ "├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
+ "│ dense_208 (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">Dense</span>) │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">1</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">65</span> │\n",
+ "└─────────────────────────────────┴────────────────────────┴───────────────┘\n",
+ "</pre>\n"
+ ],
+ "text/plain": [
+ "┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓\n",
+ "┃\u001b[1m \u001b[0m\u001b[1mLayer (type) \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mOutput Shape \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1m Param #\u001b[0m\u001b[1m \u001b[0m┃\n",
+ "┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩\n",
+ "│ dense_204 (\u001b[38;5;33mDense\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m512\u001b[0m) │ \u001b[38;5;34m2,560,512\u001b[0m │\n",
+ "├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
+ "│ batch_normalization_80 │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m512\u001b[0m) │ \u001b[38;5;34m2,048\u001b[0m │\n",
+ "│ (\u001b[38;5;33mBatchNormalization\u001b[0m) │ │ │\n",
+ "├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
+ "│ dense_205 (\u001b[38;5;33mDense\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m256\u001b[0m) │ \u001b[38;5;34m131,328\u001b[0m │\n",
+ "├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
+ "│ batch_normalization_81 │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m256\u001b[0m) │ \u001b[38;5;34m1,024\u001b[0m │\n",
+ "│ (\u001b[38;5;33mBatchNormalization\u001b[0m) │ │ │\n",
+ "├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
+ "│ dense_206 (\u001b[38;5;33mDense\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m128\u001b[0m) │ \u001b[38;5;34m32,896\u001b[0m │\n",
+ "├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
+ "│ batch_normalization_82 │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m128\u001b[0m) │ \u001b[38;5;34m512\u001b[0m │\n",
+ "│ (\u001b[38;5;33mBatchNormalization\u001b[0m) │ │ │\n",
+ "├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
+ "│ dense_207 (\u001b[38;5;33mDense\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m64\u001b[0m) │ \u001b[38;5;34m8,256\u001b[0m │\n",
+ "├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
+ "│ dense_208 (\u001b[38;5;33mDense\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m1\u001b[0m) │ \u001b[38;5;34m65\u001b[0m │\n",
+ "└─────────────────────────────────┴────────────────────────┴───────────────┘\n"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-weight: bold\"> Total params: </span><span style=\"color: #00af00; text-decoration-color: #00af00\">2,736,641</span> (10.44 MB)\n",
+ "</pre>\n"
+ ],
+ "text/plain": [
+ "\u001b[1m Total params: \u001b[0m\u001b[38;5;34m2,736,641\u001b[0m (10.44 MB)\n"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-weight: bold\"> Trainable params: </span><span style=\"color: #00af00; text-decoration-color: #00af00\">2,734,849</span> (10.43 MB)\n",
+ "</pre>\n"
+ ],
+ "text/plain": [
+ "\u001b[1m Trainable params: \u001b[0m\u001b[38;5;34m2,734,849\u001b[0m (10.43 MB)\n"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "text/html": [
+ "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-weight: bold\"> Non-trainable params: </span><span style=\"color: #00af00; text-decoration-color: #00af00\">1,792</span> (7.00 KB)\n",
+ "</pre>\n"
+ ],
+ "text/plain": [
+ "\u001b[1m Non-trainable params: \u001b[0m\u001b[38;5;34m1,792\u001b[0m (7.00 KB)\n"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Show the layer stack with output shapes and parameter counts.\n",
+ "model.summary()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 377,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Adam with its default settings; binary cross-entropy matches the single\n",
+ "# sigmoid output unit, and accuracy is tracked as the reported metric.\n",
+ "optimizer = keras.optimizers.Adam()\n",
+ "\n",
+ "model.compile(\n",
+ "    optimizer=optimizer,\n",
+ "    loss='binary_crossentropy',\n",
+ "    metrics=['accuracy'],\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 378,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Epoch 1/10\n",
+ "\u001b[1m262/262\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m7s\u001b[0m 19ms/step - accuracy: 0.8253 - loss: 0.3683 - val_accuracy: 0.8883 - val_loss: 0.2601\n",
+ "Epoch 2/10\n",
+ "\u001b[1m262/262\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m5s\u001b[0m 18ms/step - accuracy: 0.8861 - loss: 0.2529 - val_accuracy: 0.8825 - val_loss: 0.2631\n",
+ "Epoch 3/10\n",
+ "\u001b[1m262/262\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m5s\u001b[0m 18ms/step - accuracy: 0.8820 - loss: 0.2517 - val_accuracy: 0.8911 - val_loss: 0.2546\n",
+ "Epoch 4/10\n",
+ "\u001b[1m262/262\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m5s\u001b[0m 19ms/step - accuracy: 0.8981 - loss: 0.2273 - val_accuracy: 0.8940 - val_loss: 0.2420\n",
+ "Epoch 5/10\n",
+ "\u001b[1m262/262\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m5s\u001b[0m 18ms/step - accuracy: 0.9014 - loss: 0.2243 - val_accuracy: 0.8883 - val_loss: 0.2450\n",
+ "Epoch 6/10\n",
+ "\u001b[1m262/262\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m5s\u001b[0m 20ms/step - accuracy: 0.8987 - loss: 0.2212 - val_accuracy: 0.8911 - val_loss: 0.2732\n",
+ "Epoch 7/10\n",
+ "\u001b[1m262/262\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m5s\u001b[0m 20ms/step - accuracy: 0.9030 - loss: 0.2168 - val_accuracy: 0.8940 - val_loss: 0.2593\n",
+ "Epoch 8/10\n",
+ "\u001b[1m262/262\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m5s\u001b[0m 19ms/step - accuracy: 0.9063 - loss: 0.2100 - val_accuracy: 0.8883 - val_loss: 0.2651\n",
+ "Epoch 9/10\n",
+ "\u001b[1m262/262\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m5s\u001b[0m 18ms/step - accuracy: 0.9045 - loss: 0.2176 - val_accuracy: 0.8825 - val_loss: 0.2709\n",
+ "Epoch 10/10\n",
+ "\u001b[1m262/262\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m5s\u001b[0m 18ms/step - accuracy: 0.9090 - loss: 0.2078 - val_accuracy: 0.8911 - val_loss: 0.2816\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "<keras.src.callbacks.history.History at 0x7f3de0bfd290>"
+ ]
+ },
+ "execution_count": 378,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Keep the returned History object: it holds the per-epoch training and\n",
+ "# validation metrics, and assigning it stops the cell from echoing a bare\n",
+ "# object repr as its output.\n",
+ "history = model.fit(\n",
+ "    X_train,\n",
+ "    y_train,\n",
+ "    epochs=10,\n",
+ "    batch_size=16,\n",
+ "    validation_data=(X_val, y_val),\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 379,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\u001b[1m33/33\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 3ms/step - accuracy: 0.8616 - loss: 0.3335\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "[0.33415472507476807, 0.8687739372253418]"
+ ]
+ },
+ "execution_count": 379,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Final check on the held-out test split; the cell displays [loss, accuracy].\n",
+ "model.evaluate(x=X_test, y=y_test)"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": ".venv",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.2"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}