NNSpamFilter.ipynb - machinelearning

NNSpamFilter.ipynb (20441B)
      1 {
      2  "cells": [
      3   {
      4    "cell_type": "markdown",
      5    "metadata": {},
      6    "source": [
      7     "This dataset is from Kaggle:\n",
      8     "\n",
      9     "https://www.kaggle.com/datasets/abdallahwagih/spam-emails\n",
     10     "Download it and save it as `../datasets/spamEmails/emails.csv` (the path the first code cell reads) before running the notebook."
     11    ]
     12   },
     13   {
     14    "cell_type": "markdown",
     15    "metadata": {},
     16    "source": [
     17     "86% Accuracy Using a Neural Network\n",
     18     "\n",
     19     "Technically, this is better than my own Naive Bayes approach, but it is not even close to sklearn's Naive Bayes.\n",
     20     "\n",
     21     "This was a fun project, but it is a bad idea for anyone who is serious about spam filtering."
     22    ]
     23   },
     24   {
     25    "cell_type": "code",
     26    "execution_count": 371,
     27    "metadata": {},
     28    "outputs": [],
     29    "source": [
     30     "import pandas as pd\n",
     31     "\n",
     32     "emails = pd.read_csv('../datasets/spamEmails/emails.csv')\n",
     33     "\n",
     34     "MSG_LEN = 5000"
     35    ]
     36   },
     37   {
     38    "cell_type": "code",
     39    "execution_count": 372,
     40    "metadata": {},
     41    "outputs": [],
     42    "source": [
     43     "import re\n",
     44     "import numpy as np\n",
     45     "X = emails['Message']\n",
     46     "y = emails['Category']\n",
     47     "\n",
     48     "y = y == 'spam'\n",
     49     "\n",
     50     "def cut(text):\n",
     51     "    \"\"\"Encode a message as a list of exactly MSG_LEN character codes.\"\"\"\n",
     52     "    # Drop everything except ASCII letters and whitespace, then lowercase.\n",
     53     "    letters = re.sub(r'[^a-zA-Z\\s]', '', text).lower()\n",
     54     "    # Clip to MSG_LEN, then right-pad with spaces so every message has the same width.\n",
     55     "    fixed_width = letters[:MSG_LEN].ljust(MSG_LEN)\n",
     56     "    return list(map(ord, fixed_width))\n",
     57     "\n",
     58     "\n",
     59     "X = X.apply(cut)\n",
     60     "\n",
     61     "X = np.array(X.tolist())\n"
     62    ]
     63   },
     64   {
     65    "cell_type": "code",
     66    "execution_count": 373,
     67    "metadata": {},
     68    "outputs": [],
     69    "source": [
     70     "from sklearn.preprocessing import StandardScaler\n",
     71     "\n",
     72     "# Only construct the scaler here; it is fitted in the next cell, after the split, on the training rows only.\n",
     73     "scaler = StandardScaler()"
     74    ]
     75   },
     76   {
     77    "cell_type": "code",
     78    "execution_count": 374,
     79    "metadata": {},
     80    "outputs": [],
     81    "source": [
     82     "from sklearn.model_selection import train_test_split\n",
     83     "import numpy as np\n",
     84     "# Split first, then fit the scaler on the training rows only so validation/test statistics never leak into the features.\n",
     85     "y = np.asarray(y)  # same array as y.values, but re-running this cell no longer raises AttributeError\n",
     86     "X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=10)\n",
     87     "X_test, X_val, y_test, y_val = train_test_split(X_test,y_test, random_state=10)\n",
     88     "X_train, X_val, X_test = scaler.fit_transform(X_train), scaler.transform(X_val), scaler.transform(X_test)"
     89    ]
     89    ]
     90   },
     91   {
     92    "cell_type": "code",
     93    "execution_count": 375,
     94    "metadata": {},
     95    "outputs": [],
     96    "source": [
     97     "import tensorflow as tf\n",
     98     "import keras\n",
     99     "from keras.layers import BatchNormalization \n",
    100     "keras.utils.set_random_seed(10)  # seed Python, NumPy and the backend so weight init and batch shuffling are repeatable\n",
    101     "model = keras.Sequential(layers=[\n",
    102     "    keras.layers.Input(shape=X_train[0].shape),\n",
    103     "    # Three Dense + BatchNormalization stages narrowing 512 -> 256 -> 128, then a 64-unit layer and a sigmoid output.\n",
    104     "    keras.layers.Dense(512, activation='relu'),\n",
    105     "    BatchNormalization(),\n",
    106     "\n",
    107     "    keras.layers.Dense(256, activation='relu'),\n",
    108     "    BatchNormalization(),\n",
    109     "\n",
    110     "    keras.layers.Dense(128, activation='relu'),\n",
    111     "    BatchNormalization(),\n",
    112     "\n",
    113     "    keras.layers.Dense(64, activation='relu'),\n",
    114     "\n",
    115     "    keras.layers.Dense(1, activation='sigmoid')\n",
    116     "])"
    117    ]
    118   },
    119   {
    120    "cell_type": "code",
    121    "execution_count": 376,
    122    "metadata": {},
    123    "outputs": [
    124     {
    125      "data": {
    126       "text/html": [
    127        "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-weight: bold\">Model: \"sequential_34\"</span>\n",
    128        "</pre>\n"
    129       ],
    130       "text/plain": [
    131        "\u001b[1mModel: \"sequential_34\"\u001b[0m\n"
    132       ]
    133      },
    134      "metadata": {},
    135      "output_type": "display_data"
    136     },
    137     {
    138      "data": {
    139       "text/html": [
    140        "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓\n",
    141        "┃<span style=\"font-weight: bold\"> Layer (type)                    </span>┃<span style=\"font-weight: bold\"> Output Shape           </span>┃<span style=\"font-weight: bold\">       Param # </span>┃\n",
    142        "┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩\n",
    143        "│ dense_204 (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">Dense</span>)               │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">512</span>)            │     <span style=\"color: #00af00; text-decoration-color: #00af00\">2,560,512</span> │\n",
    144        "├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
    145        "│ batch_normalization_80          │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">512</span>)            │         <span style=\"color: #00af00; text-decoration-color: #00af00\">2,048</span> │\n",
    146        "│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">BatchNormalization</span>)            │                        │               │\n",
    147        "├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
    148        "│ dense_205 (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">Dense</span>)               │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">256</span>)            │       <span style=\"color: #00af00; text-decoration-color: #00af00\">131,328</span> │\n",
    149        "├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
    150        "│ batch_normalization_81          │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">256</span>)            │         <span style=\"color: #00af00; text-decoration-color: #00af00\">1,024</span> │\n",
    151        "│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">BatchNormalization</span>)            │                        │               │\n",
    152        "├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
    153        "│ dense_206 (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">Dense</span>)               │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">128</span>)            │        <span style=\"color: #00af00; text-decoration-color: #00af00\">32,896</span> │\n",
    154        "├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
    155        "│ batch_normalization_82          │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">128</span>)            │           <span style=\"color: #00af00; text-decoration-color: #00af00\">512</span> │\n",
    156        "│ (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">BatchNormalization</span>)            │                        │               │\n",
    157        "├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
    158        "│ dense_207 (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">Dense</span>)               │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">64</span>)             │         <span style=\"color: #00af00; text-decoration-color: #00af00\">8,256</span> │\n",
    159        "├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
    160        "│ dense_208 (<span style=\"color: #0087ff; text-decoration-color: #0087ff\">Dense</span>)               │ (<span style=\"color: #00d7ff; text-decoration-color: #00d7ff\">None</span>, <span style=\"color: #00af00; text-decoration-color: #00af00\">1</span>)              │            <span style=\"color: #00af00; text-decoration-color: #00af00\">65</span> │\n",
    161        "└─────────────────────────────────┴────────────────────────┴───────────────┘\n",
    162        "</pre>\n"
    163       ],
    164       "text/plain": [
    165        "┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┓\n",
    166        "┃\u001b[1m \u001b[0m\u001b[1mLayer (type)                   \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mOutput Shape          \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1m      Param #\u001b[0m\u001b[1m \u001b[0m┃\n",
    167        "┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━┩\n",
    168        "│ dense_204 (\u001b[38;5;33mDense\u001b[0m)               │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m512\u001b[0m)            │     \u001b[38;5;34m2,560,512\u001b[0m │\n",
    169        "├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
    170        "│ batch_normalization_80          │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m512\u001b[0m)            │         \u001b[38;5;34m2,048\u001b[0m │\n",
    171        "│ (\u001b[38;5;33mBatchNormalization\u001b[0m)            │                        │               │\n",
    172        "├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
    173        "│ dense_205 (\u001b[38;5;33mDense\u001b[0m)               │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m256\u001b[0m)            │       \u001b[38;5;34m131,328\u001b[0m │\n",
    174        "├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
    175        "│ batch_normalization_81          │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m256\u001b[0m)            │         \u001b[38;5;34m1,024\u001b[0m │\n",
    176        "│ (\u001b[38;5;33mBatchNormalization\u001b[0m)            │                        │               │\n",
    177        "├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
    178        "│ dense_206 (\u001b[38;5;33mDense\u001b[0m)               │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m128\u001b[0m)            │        \u001b[38;5;34m32,896\u001b[0m │\n",
    179        "├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
    180        "│ batch_normalization_82          │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m128\u001b[0m)            │           \u001b[38;5;34m512\u001b[0m │\n",
    181        "│ (\u001b[38;5;33mBatchNormalization\u001b[0m)            │                        │               │\n",
    182        "├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
    183        "│ dense_207 (\u001b[38;5;33mDense\u001b[0m)               │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m64\u001b[0m)             │         \u001b[38;5;34m8,256\u001b[0m │\n",
    184        "├─────────────────────────────────┼────────────────────────┼───────────────┤\n",
    185        "│ dense_208 (\u001b[38;5;33mDense\u001b[0m)               │ (\u001b[38;5;45mNone\u001b[0m, \u001b[38;5;34m1\u001b[0m)              │            \u001b[38;5;34m65\u001b[0m │\n",
    186        "└─────────────────────────────────┴────────────────────────┴───────────────┘\n"
    187       ]
    188      },
    189      "metadata": {},
    190      "output_type": "display_data"
    191     },
    192     {
    193      "data": {
    194       "text/html": [
    195        "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-weight: bold\"> Total params: </span><span style=\"color: #00af00; text-decoration-color: #00af00\">2,736,641</span> (10.44 MB)\n",
    196        "</pre>\n"
    197       ],
    198       "text/plain": [
    199        "\u001b[1m Total params: \u001b[0m\u001b[38;5;34m2,736,641\u001b[0m (10.44 MB)\n"
    200       ]
    201      },
    202      "metadata": {},
    203      "output_type": "display_data"
    204     },
    205     {
    206      "data": {
    207       "text/html": [
    208        "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-weight: bold\"> Trainable params: </span><span style=\"color: #00af00; text-decoration-color: #00af00\">2,734,849</span> (10.43 MB)\n",
    209        "</pre>\n"
    210       ],
    211       "text/plain": [
    212        "\u001b[1m Trainable params: \u001b[0m\u001b[38;5;34m2,734,849\u001b[0m (10.43 MB)\n"
    213       ]
    214      },
    215      "metadata": {},
    216      "output_type": "display_data"
    217     },
    218     {
    219      "data": {
    220       "text/html": [
    221        "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-weight: bold\"> Non-trainable params: </span><span style=\"color: #00af00; text-decoration-color: #00af00\">1,792</span> (7.00 KB)\n",
    222        "</pre>\n"
    223       ],
    224       "text/plain": [
    225        "\u001b[1m Non-trainable params: \u001b[0m\u001b[38;5;34m1,792\u001b[0m (7.00 KB)\n"
    226       ]
    227      },
    228      "metadata": {},
    229      "output_type": "display_data"
    230     }
    231    ],
    232    "source": [
    233     "model.summary()"
    234    ]
    235   },
    236   {
    237    "cell_type": "code",
    238    "execution_count": 377,
    239    "metadata": {},
    240    "outputs": [],
    241    "source": [
    242     "optimizer = keras.optimizers.Adam()\n",
    243     "\n",
    244     "model.compile(loss='binary_crossentropy', metrics=['accuracy'], optimizer=optimizer)"
    245    ]
    246   },
    247   {
    248    "cell_type": "code",
    249    "execution_count": 378,
    250    "metadata": {},
    251    "outputs": [
    252     {
    253      "name": "stdout",
    254      "output_type": "stream",
    255      "text": [
    256       "Epoch 1/10\n",
    257       "\u001b[1m262/262\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m7s\u001b[0m 19ms/step - accuracy: 0.8253 - loss: 0.3683 - val_accuracy: 0.8883 - val_loss: 0.2601\n",
    258       "Epoch 2/10\n",
    259       "\u001b[1m262/262\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m5s\u001b[0m 18ms/step - accuracy: 0.8861 - loss: 0.2529 - val_accuracy: 0.8825 - val_loss: 0.2631\n",
    260       "Epoch 3/10\n",
    261       "\u001b[1m262/262\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m5s\u001b[0m 18ms/step - accuracy: 0.8820 - loss: 0.2517 - val_accuracy: 0.8911 - val_loss: 0.2546\n",
    262       "Epoch 4/10\n",
    263       "\u001b[1m262/262\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m5s\u001b[0m 19ms/step - accuracy: 0.8981 - loss: 0.2273 - val_accuracy: 0.8940 - val_loss: 0.2420\n",
    264       "Epoch 5/10\n",
    265       "\u001b[1m262/262\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m5s\u001b[0m 18ms/step - accuracy: 0.9014 - loss: 0.2243 - val_accuracy: 0.8883 - val_loss: 0.2450\n",
    266       "Epoch 6/10\n",
    267       "\u001b[1m262/262\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m5s\u001b[0m 20ms/step - accuracy: 0.8987 - loss: 0.2212 - val_accuracy: 0.8911 - val_loss: 0.2732\n",
    268       "Epoch 7/10\n",
    269       "\u001b[1m262/262\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m5s\u001b[0m 20ms/step - accuracy: 0.9030 - loss: 0.2168 - val_accuracy: 0.8940 - val_loss: 0.2593\n",
    270       "Epoch 8/10\n",
    271       "\u001b[1m262/262\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m5s\u001b[0m 19ms/step - accuracy: 0.9063 - loss: 0.2100 - val_accuracy: 0.8883 - val_loss: 0.2651\n",
    272       "Epoch 9/10\n",
    273       "\u001b[1m262/262\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m5s\u001b[0m 18ms/step - accuracy: 0.9045 - loss: 0.2176 - val_accuracy: 0.8825 - val_loss: 0.2709\n",
    274       "Epoch 10/10\n",
    275       "\u001b[1m262/262\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m5s\u001b[0m 18ms/step - accuracy: 0.9090 - loss: 0.2078 - val_accuracy: 0.8911 - val_loss: 0.2816\n"
    276      ]
    277     },
    278     {
    279      "data": {
    280       "text/plain": [
    281        "<keras.src.callbacks.history.History at 0x7f3de0bfd290>"
    282       ]
    283      },
    284      "execution_count": 378,
    285      "metadata": {},
    286      "output_type": "execute_result"
    287     }
    288    ],
    289    "source": [
    290     "model.fit(X_train,y_train, epochs=10, batch_size=16, validation_data=(X_val,y_val))"
    291    ]
    292   },
    293   {
    294    "cell_type": "code",
    295    "execution_count": 379,
    296    "metadata": {},
    297    "outputs": [
    298     {
    299      "name": "stdout",
    300      "output_type": "stream",
    301      "text": [
    302       "\u001b[1m33/33\u001b[0m \u001b[32m━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[37m\u001b[0m \u001b[1m0s\u001b[0m 3ms/step - accuracy: 0.8616 - loss: 0.3335\n"
    303      ]
    304     },
    305     {
    306      "data": {
    307       "text/plain": [
    308        "[0.33415472507476807, 0.8687739372253418]"
    309       ]
    310      },
    311      "execution_count": 379,
    312      "metadata": {},
    313      "output_type": "execute_result"
    314     }
    315    ],
    316    "source": [
    317     "model.evaluate(X_test, y_test)"
    318    ]
    319   }
    320  ],
    321  "metadata": {
    322   "kernelspec": {
    323    "display_name": ".venv",
    324    "language": "python",
    325    "name": "python3"
    326   },
    327   "language_info": {
    328    "codemirror_mode": {
    329     "name": "ipython",
    330     "version": 3
    331    },
    332    "file_extension": ".py",
    333    "mimetype": "text/x-python",
    334    "name": "python",
    335    "nbconvert_exporter": "python",
    336    "pygments_lexer": "ipython3",
    337    "version": "3.11.2"
    338   }
    339  },
    340  "nbformat": 4,
    341  "nbformat_minor": 2
    342 }
	machinelearning Machine learning code
	git clone git://git.laack.co/machinelearning.git
	Log \| Files \| Refs