machinelearning

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs

commit 0f032ea4bd7948aa203143d9f464b8aeb311ca42
parent 0fdeb834773677c2c28705ff33ae8fcc7c5340da
Author: Andrew <andrewlaack1@gmail.com>
Date:   Tue, 25 Jun 2024 17:31:17 -0500

Created Neural Network for spam filtering achieving 86 accuracy.

Diffstat:
AspamFilter/NNSpamFilter.ipynb | 342+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 342 insertions(+), 0 deletions(-)

diff --git a/spamFilter/NNSpamFilter.ipynb b/spamFilter/NNSpamFilter.ipynb @@ -0,0 +1,342 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This dataset is from kaggle:\n", + "\n", + "https://www.kaggle.com/datasets/abdallahwagih/spam-emails\n", + "Download this, move it to the correct location and then start working. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "86% Accuracy Using Neural Network\n", + "\n", + "Technically, this is better than my Naive Bayes approach, but not even close to the sklearn Naive Bayes. \n", + "\n", + "This was a fun project, but stupid idea for anyone being serious." + ] + }, + { + "cell_type": "code", + "execution_count": 371, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "emails = pd.read_csv('../datasets/spamEmails/emails.csv')\n", + "\n", + "MSG_LEN = 5000" + ] + }, + { + "cell_type": "code", + "execution_count": 372, + "metadata": {}, + "outputs": [], + "source": [ + "import re\n", + "import numpy as np\n", + "X = emails['Message']\n", + "y = emails['Category']\n", + "\n", + "y = y == 'spam'\n", + "\n", + "def cut(text):\n", + " text = re.sub(r'[^a-zA-Z\\s]', '',text)\n", + " text = text.lower()\n", + " text = text[:MSG_LEN]\n", + " text = text.ljust(MSG_LEN)\n", + " text = [ord(char) for char in text]\n", + " return text\n", + "\n", + "\n", + "X = X.apply(cut)\n", + "\n", + "X = np.array(X.tolist())\n" + ] + }, + { + "cell_type": "code", + "execution_count": 373, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.preprocessing import StandardScaler\n", + "\n", + "scaler = StandardScaler()\n", + "X = scaler.fit_transform(X)" + ] + }, + { + "cell_type": "code", + "execution_count": 374, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "import numpy as np\n", + "\n", + "\n", + "y = y.values\n", + "X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=10)\n", + "X_test, X_val, y_test, y_val = train_test_split(X_test,y_test, random_state=10)" + ] + }, + { + "cell_type": "code", + "execution_count": 375, + "metadata": {}, + "outputs": [], + "source": [ + "import tensorflow as tf\n", + "import keras\n", + "from keras.layers import BatchNormalization \n", + "\n", + "model = keras.Sequential(layers=[\n", + " keras.layers.Input(shape=X_train[0].shape),\n", + "\n", + " keras.layers.Dense(512, activation='relu'),\n", + " BatchNormalization(),\n", + "\n", + " keras.layers.Dense(256, activation='relu'),\n", + " BatchNormalization(),\n", + "\n", + " keras.layers.Dense(128, activation='relu'),\n", + " BatchNormalization(),\n", + "\n", + " keras.layers.Dense(64, activation='relu'),\n", + "\n", + " keras.layers.Dense(1, activation='sigmoid')\n", + "])" + ] + }, + { + "cell_type": "code", + "execution_count": 376, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-weight: bold\">Model: \"sequential_34\"</span>\n", + "</pre>\n" + ], + "text/plain": [ + "\u001b[1mModel: \"sequential_34\"\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓\n", + "┃<span style=\"font-weight: bold\"> Layer (type) </span>┃<span style=\"font-weight: bold\"> Output Shape </span>┃<span style=\"font-weight: bold\"> Param # </span>┃\n", + "┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩\n", + "│ dense_204 (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">Dense</span>) │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">512</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">2,560,512</span> │\n", + "├─────────────────────────────────┼────────────────────────┼───────────────┤\n", + "│ batch_normalization_80 │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">512</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">2,048</span> │\n", + "│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">BatchNormalization</span>) │ │ │\n", + "├─────────────────────────────────┼────────────────────────┼───────────────┤\n", + "│ dense_205 (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">Dense</span>) │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">256</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">131,328</span> │\n", + "├─────────────────────────────────┼────────────────────────┼───────────────┤\n", + "│ batch_normalization_81 │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">256</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">1,024</span> │\n", + "│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">BatchNormalization</span>) │ │ │\n", + "├─────────────────────────────────┼────────────────────────┼───────────────┤\n", + "│ dense_206 (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">Dense</span>) │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">128</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">32,896</span> │\n", + "├─────────────────────────────────┼────────────────────────┼───────────────┤\n", + "│ batch_normalization_82 │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">128</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">512</span> │\n", + "│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">BatchNormalization</span>) │ │ │\n", + "├─────────────────────────────────┼────────────────────────┼───────────────┤\n", + "│ dense_207 (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">Dense</span>) │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">64</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">8,256</span> │\n", + "├─────────────────────────────────┼────────────────────────┼───────────────┤\n", + "│ dense_208 (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">Dense</span>) │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">1</span>) │ <span style=\"color: #00af00; text-decoration-color: #00af00\">65</span> │\n", + "└─────────────────────────────────┴────────────────────────┴───────────────┘\n", + "</pre>\n" + ], + "text/plain": [ + "┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓\n", + "┃\u001b[1m \u001b[0m\u001b[1mLayer (type) \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mOutput Shape \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1m Param #\u001b[0m\u001b[1m \u001b[0m┃\n", + "┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩\n", + "│ dense_204 (\u001b[38;5;33mDense\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m512\u001b[0m) │ \u001b[38;5;34m2,560,512\u001b[0m │\n", + "├─────────────────────────────────┼────────────────────────┼───────────────┤\n", + "│ batch_normalization_80 │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m512\u001b[0m) │ \u001b[38;5;34m2,048\u001b[0m │\n", + "│ (\u001b[38;5;33mBatchNormalization\u001b[0m) │ │ │\n", + "├─────────────────────────────────┼────────────────────────┼───────────────┤\n", + "│ dense_205 (\u001b[38;5;33mDense\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m256\u001b[0m) │ \u001b[38;5;34m131,328\u001b[0m │\n", + "├─────────────────────────────────┼────────────────────────┼───────────────┤\n", + "│ batch_normalization_81 │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m256\u001b[0m) │ \u001b[38;5;34m1,024\u001b[0m │\n", + "│ (\u001b[38;5;33mBatchNormalization\u001b[0m) │ │ │\n", + "├─────────────────────────────────┼────────────────────────┼───────────────┤\n", + "│ dense_206 (\u001b[38;5;33mDense\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m128\u001b[0m) │ \u001b[38;5;34m32,896\u001b[0m │\n", + "├─────────────────────────────────┼────────────────────────┼───────────────┤\n", + "│ batch_normalization_82 │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m128\u001b[0m) │ \u001b[38;5;34m512\u001b[0m │\n", + "│ (\u001b[38;5;33mBatchNormalization\u001b[0m) │ │ │\n", + "├─────────────────────────────────┼────────────────────────┼───────────────┤\n", + "│ dense_207 (\u001b[38;5;33mDense\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m64\u001b[0m) │ \u001b[38;5;34m8,256\u001b[0m │\n", + "├─────────────────────────────────┼────────────────────────┼───────────────┤\n", + "│ dense_208 (\u001b[38;5;33mDense\u001b[0m) │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m1\u001b[0m) │ \u001b[38;5;34m65\u001b[0m │\n", + "└─────────────────────────────────┴────────────────────────┴───────────────┘\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-weight: bold\"> Total params: </span><span style=\"color: #00af00; text-decoration-color: #00af00\">2,736,641</span> (10.44 MB)\n", + "</pre>\n" + ], + "text/plain": [ + "\u001b[1m Total params: \u001b[0m\u001b[38;5;34m2,736,641\u001b[0m (10.44 MB)\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-weight: bold\"> Trainable params: </span><span style=\"color: #00af00; text-decoration-color: #00af00\">2,734,849</span> (10.43 MB)\n", + "</pre>\n" + ], + "text/plain": [ + "\u001b[1m Trainable params: \u001b[0m\u001b[38;5;34m2,734,849\u001b[0m (10.43 MB)\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-weight: bold\"> Non-trainable params: </span><span style=\"color: #00af00; text-decoration-color: #00af00\">1,792</span> (7.00 KB)\n", + "</pre>\n" + ], + "text/plain": [ + "\u001b[1m Non-trainable params: \u001b[0m\u001b[38;5;34m1,792\u001b[0m (7.00 KB)\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "model.summary()" + ] + }, + { + "cell_type": "code", + "execution_count": 377, + "metadata": {}, + "outputs": [], + "source": [ + "optimizer = keras.optimizers.Adam()\n", + "\n", + "model.compile(loss='binary_crossentropy', metrics=['accuracy'], optimizer=optimizer)" + ] + }, + { + "cell_type": "code", + "execution_count": 378, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 1/10\n", + "\u001b[1m262/262\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m7s\u001b[0m 19ms/step - accuracy: 0.8253 - loss: 0.3683 - val_accuracy: 0.8883 - val_loss: 0.2601\n", + "Epoch 2/10\n", + "\u001b[1m262/262\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m5s\u001b[0m 18ms/step - accuracy: 0.8861 - loss: 0.2529 - val_accuracy: 0.8825 - val_loss: 0.2631\n", + "Epoch 3/10\n", + "\u001b[1m262/262\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m5s\u001b[0m 18ms/step - accuracy: 0.8820 - loss: 0.2517 - val_accuracy: 0.8911 - val_loss: 0.2546\n", + "Epoch 4/10\n", + "\u001b[1m262/262\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m5s\u001b[0m 19ms/step - accuracy: 0.8981 - loss: 0.2273 - val_accuracy: 0.8940 - val_loss: 0.2420\n", + "Epoch 5/10\n", + "\u001b[1m262/262\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m5s\u001b[0m 18ms/step - accuracy: 0.9014 - loss: 0.2243 - val_accuracy: 0.8883 - val_loss: 0.2450\n", + "Epoch 6/10\n", + "\u001b[1m262/262\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m5s\u001b[0m 20ms/step - accuracy: 0.8987 - loss: 0.2212 - val_accuracy: 0.8911 - val_loss: 0.2732\n", + "Epoch 7/10\n", + "\u001b[1m262/262\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m5s\u001b[0m 20ms/step - accuracy: 0.9030 - loss: 0.2168 - val_accuracy: 0.8940 - val_loss: 0.2593\n", + "Epoch 8/10\n", + "\u001b[1m262/262\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m5s\u001b[0m 19ms/step - accuracy: 0.9063 - loss: 0.2100 - val_accuracy: 0.8883 - val_loss: 0.2651\n", + "Epoch 9/10\n", + "\u001b[1m262/262\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m5s\u001b[0m 18ms/step - accuracy: 0.9045 - loss: 0.2176 - val_accuracy: 0.8825 - val_loss: 0.2709\n", + "Epoch 10/10\n", + "\u001b[1m262/262\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m5s\u001b[0m 18ms/step - accuracy: 0.9090 - loss: 0.2078 - val_accuracy: 0.8911 - val_loss: 0.2816\n" + ] + }, + { + "data": { + "text/plain": [ + "<keras.src.callbacks.history.History at 0x7f3de0bfd290>" + ] + }, + "execution_count": 378, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.fit(X_train,y_train, epochs=10, batch_size=16, validation_data=(X_val,y_val))" + ] + }, + { + "cell_type": "code", + "execution_count": 379, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[1m33/33\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 3ms/step - accuracy: 0.8616 - loss: 0.3335\n" + ] + }, + { + "data": { + "text/plain": [ + "[0.33415472507476807, 0.8687739372253418]" + ] + }, + "execution_count": 379, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.evaluate(X_test, y_test)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}