machinelearning

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs

commit 5790628433062ffa03bd8d30a1f530ac644c5f50
parent 67078fcf74755da213222eaf84e3c842e2e8bdb5
Author: Andrew <andrewlaack1@gmail.com>
Date:   Fri, 28 Jun 2024 15:03:50 -0500

stuff

Diffstat:
Aembeddings/KerasEmbeddingPG.ipynb | 254+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
AphishingClassification/PhishingClassification.ipynb | 273+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 527 insertions(+), 0 deletions(-)

diff --git a/embeddings/KerasEmbeddingPG.ipynb b/embeddings/KerasEmbeddingPG.ipynb @@ -0,0 +1,254 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Embedding playground" + ] + }, + { + "cell_type": "code", + "execution_count": 124, + "metadata": {}, + "outputs": [], + "source": [ + "import tensorflow as tf\n", + "import pandas as pd\n", + "import keras\n", + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": 125, + "metadata": {}, + "outputs": [], + "source": [ + "arrText = np.array(['test', 'why', 'would', 'this', 'work', 'weird', 'array', 'method', 'returns', 'math'])\n", + "arrNums = np.array([])\n", + "mapping = {}\n", + "nextMap = 0\n", + "\n", + "for i in arrText:\n", + " if i in mapping:\n", + " arrNums = np.append(arrNums, mapping[i])\n", + " else:\n", + " mapping[i] = nextMap\n", + " arrNums = np.append(arrNums, mapping[i])\n", + " nextMap += 1\n", + "\n", + "\n", + "embedding = keras.layers.Embedding(input_dim=10, output_dim=2)\n", + "embedded = embedding(arrNums)" + ] + }, + { + "cell_type": "code", + "execution_count": 126, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "<tf.Tensor: shape=(10, 2), dtype=float32, numpy=\n", + "array([[ 0.03479927, 0.0350842 ],\n", + " [-0.01141354, 0.02443799],\n", + " [ 0.04973641, 0.01413956],\n", + " [-0.03284754, -0.04128627],\n", + " [-0.04843548, 0.03706617],\n", + " [-0.02948942, 0.04148373],\n", + " [-0.04496865, -0.04993447],\n", + " [ 0.01115872, -0.04161409],\n", + " [ 0.01248763, 0.01584405],\n", + " [ 0.02125626, -0.00353973]], dtype=float32)>" + ] + }, + "execution_count": 126, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "embedded" + ] + }, + { + "cell_type": "code", + "execution_count": 127, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "<matplotlib.collections.PathCollection at 0x7f7e9d4def50>" + ] + }, + "execution_count": 127, + "metadata": {}, + "output_type": 
"execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAjcAAAGdCAYAAADuR1K7AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy80BEi2AAAACXBIWXMAAA9hAAAPYQGoP6dpAAAnEUlEQVR4nO3df1DUd2L/8dcuCOslshQVFhQjSe0pwWrVA8nc1OtJDi45E3tm4jEajXVi4xmTHtaqiZHh2g7N5dLoXTydzDTjpMZqTVOvWMuNxfQuPYkoJDkRddIbTo2yEMOxEBN+hH1///DLJhsXBGTZ5c3zMfMZZ9/7fu++3+8h2dd+Pp/3ex3GGCMAAABLOCPdAQAAgKFEuAEAAFYh3AAAAKsQbgAAgFUINwAAwCqEGwAAYBXCDQAAsArhBgAAWCU20h2IBL/frytXrmjcuHFyOByR7g4AAOgHY4za2tqUlpYmp7P38zOjMtxcuXJF6enpke4GAAAYhEuXLmny5Mm9Pj8qw824ceMkXZ+chISECPcGAAD0R2trq9LT0wOf470ZleGm51JUQkIC4QYAgBHmZreUcEMxAACwCuEGAABYhXADAACsQrgBAABWIdwAAACrEG4AAIBVCDcAAMAqhBsAAGCVUbmJH0aebr9RVX2zmtralTzOpeyMJMU4+V0wAMCNCDeIeuW1DSopq1ODrz1Qlup2qXhRpgqyUiPYMwBANOKyFKJaeW2D1u6tCQo2kuT1tWvt3hqV1zZEqGcAgGhFuEHU6vYblZTVyYR4rqespKxO3f5QNQAAoxXhBlGrqr75hjM2X2QkNfjaVVXfPHydAgBEPcINolZTW+/BZjD1AACjA+EGUSt5nGtI6wEARodhCTc7d+7U1KlT5XK5lJOTo6qqqj7rHzx4UNOnT5fL5dLMmTN15MiRXus+/vjjcjgc2r59+xD3GpGWnZGkVLdLvS34duj6qqnsjKTh7BYAIMqFPdwcOHBARUVFKi4uVk1NjWbNmqX8/Hw1NTWFrH/8+HEVFhZq9erVeuedd7R48WItXrxYtbW1N9T993//d7399ttKS0sL9zAQATFOh4oXZUrSDQGn53Hxokz2uwEABAl7uPnHf/xHPfbYY1q1apUyMzO1e/dufeUrX9Err7wSsv6OHTtUUFCgjRs3asaMGfrbv/1bzZkzRy+99FJQvcuXL2v9+vV67bXXNGbMmHAPAxFSkJWqXcvnyOMOvvTkcbu0a/kc9rkBANwgrJv4dXZ2qrq6Wlu2bAmUOZ1O5eXlqbKyMmSbyspKFRUVBZXl5+fr0KFDgcd+v1+PPPKINm7cqLvvvvum/ejo6FBHR0fgcWtr6wBHgkgqyErVvZkedigGAPRLWMPN1atX1d3drZSUlKDylJQUnTt3LmQbr9cbsr7X6w08fu655xQbG6snn3yyX/0oLS1VSUnJAHuPaBLjdCj3rvGR7gYAYAQYcaulqqurtWPHDu3Zs0cOR/++uW/ZskU+ny9wXLp0Kcy9BAAAkRLWcDNhwgTFxMSosbExqLyxsVEejydkG4/H02f9t956S01NTZoyZYpiY2MVGxurCxcuaMOGDZo6dWrI14yPj1dCQkLQAQAA7BTWcBMXF6e5c+eqoqIiUOb3+1VRUaHc3NyQbXJzc4PqS9LRo0cD9R955BH95je/0bvvvhs40tLStHHjRv3iF78I32AAAMCIEPZfBS8qKtLKlSs1b948ZWdna/v27bp27ZpWrVolSVqxYoUmTZqk0tJSSdJTTz2lBQsW6IUXXtD999+v/fv369SpU3r55ZclSePHj9f48cH3XowZM0Yej0df/epXwz0cAAAQ5cIebpYuXaoPP/xQ27Ztk9fr1ezZs1VeXh64afjixYtyOj8/gXTPPfdo37592rp1q55++mlNmzZNhw4dUlZWVri7CgAALOAwxoy6n1RubW
2V2+2Wz+cb0vtvuv2G5coAAIRJfz+/w37mZrQor21QSVld0K9Yp7pdKl6UyUZzAAAMoxG3FDwaldc2aO3emqBgI0leX7vW7q1ReW1DhHoGAMDoQ7i5Rd1+o5KyOoW6ttdTVlJWp27/qLv6BwBARBBublFVffMNZ2y+yEhq8LWrqr55+DoFAMAoRri5RU1tvQebwdQDAAC3hnBzi5LHuW5eaQD1AADArWG11C3KzkhSqtslr6895H03Dkke9/Vl4QAAu7AFSHQi3NyiGKdDxYsytXZvjRxSUMDp+fMuXpTJHzsAWIYtQKIXl6WGQEFWqnYtnyOPO/jSk8ft0q7lc/gjBwDLsAVIdOPMzRApyErVvZkeTk8CgOVutgWIQ9e3ALk308NnQIQQboZQjNOh3LvG37wiAGDEGsgWIHwmRAaXpQAAGAC2AIl+hBsAAAaALUCiH+EGAIAB6NkCpLe7aRy6vmqKLUAih3ADAMAA9GwBIumGgMMWINGBcAMAwACxBUh0Y7UUAACDwBYg0YtwAwDAILEFSHTishQAALAK4QYAAFiFcAMAAKxCuAEAAFYh3AAAAKsQbgAAgFUINwAAwCqEGwAAYBXCDQAAsArhBgAAWIVwAwAArEK4AQAAViHcAAAAqxBuAACAVQg3AADAKoQbAABgFcINAACwCuEGAABYhXADAACsQrgBAABWIdwAAACrEG4AAIBVCDcAAMAqhBsAAGAVwg0AALAK4QYAAFiFcAMAAKwSG+kOABhe3X6jqvpmNbW1K3mcS9kZSYpxOiLdLQAYMoQbYBQpr21QSVmdGnztgbJUt0vFizJVkJUawZ4BwNDhshQwSpTXNmjt3pqgYCNJXl+71u6tUXltQ4R6BgBDi3ADjALdfqOSsjqZEM/1lJWU1anbH6oGAIwshBtgFKiqb77hjM0XGUkNvnZV1TcPX6cAIEwIN8Ao0NTWe7AZTD0AiGaEG2AUSB7nGtJ6ABDNCDfAKJCdkaRUt0u9Lfh26PqqqeyMpOHsFgCEBeEGGAVinA4VL8qUpBsCTs/j4kWZ7HcDwAqEG2CUKMhK1a7lc+RxB1968rhd2rV8DvvcALAGm/gBo0hBVqruzfSwQzEAqxFugFEmxulQ7l3jI90NAAgbLksBAACrEG4AAIBVCDcAAMAqhBsAAGAVwg0AALDKsISbnTt3aurUqXK5XMrJyVFVVVWf9Q8ePKjp06fL5XJp5syZOnLkSOC5rq4ubdq0STNnztRtt92mtLQ0rVixQleuXAn3MAAAwAgQ9nBz4MABFRUVqbi4WDU1NZo1a5by8/PV1NQUsv7x48dVWFio1atX65133tHixYu1ePFi1dbWSpI++eQT1dTU6Nlnn1VNTY3eeOMNnT9/Xg888EC4hwIAAEYAhzHGhPMNcnJy9LWvfU0vvfSSJMnv9ys9PV3r16/X5s2bb6i/dOlSXbt2TYcPHw6UzZ8/X7Nnz9bu3btDvsfJkyeVnZ2tCxcuaMqUKTftU2trq9xut3w+nxISEgY5MgAAMJz6+/kd1jM3nZ2dqq6uVl5e3udv6HQqLy9PlZWVIdtUVlYG1Zek/Pz8XutLks/nk8PhUGJiYsjnOzo61NraGnQAAAA7hTXcXL16Vd3d3UpJSQkqT0lJkdfrDdnG6/UOqH57e7s2bdqkwsLCXlNcaWmp3G534EhPTx/EaAAAwEgwoldLdXV16eGHH5YxRrt27eq13pYtW+Tz+QLHpUuXhrGXAABgOIX1t6UmTJigmJgYNTY2BpU3NjbK4/GEbOPxePpVvyfYXLhwQceOHevz2lt8fLzi4+MHOQoAADCShPXMTVxcnObOnauKiopAmd/vV0VFhXJzc0O2yc3NDaovSUePHg2q3xNs3n//ff33f/+3xo/nRwABAMB1Yf9V8KKiIq1cuVLz5s1Tdna2tm/frmvXrmnVqlWSpBUrVmjSpEkqLS2VJD311FNasGCBXnjhBd1///3av3+/Tp06pZdfflnS9W
Dz0EMPqaamRocPH1Z3d3fgfpykpCTFxcWFe0gAACCKhT3cLF26VB9++KG2bdsmr9er2bNnq7y8PHDT8MWLF+V0fn4C6Z577tG+ffu0detWPf3005o2bZoOHTqkrKwsSdLly5f1H//xH5Kk2bNnB73Xm2++qW984xvhHhIAAIhiYd/nJhqxzw0AACNPVOxzAwAAMNwINwAAwCqEGwAAYBXCDQAAsErYV0sBAG5Nt9+oqr5ZTW3tSh7nUnZGkmKcjkh3C4hahBsAiGLltQ0qKatTg689UJbqdql4UaYKslIj2DMgenFZCgCiVHltg9burQkKNpLk9bVr7d4aldc2RKhnQHQj3ABAFOr2G5WU1SnURmQ9ZSVlder2j7qtyoCbItwAQBSqqm++4YzNFxlJDb52VdU3D1+ngBGCcAMAUaiprfdgM5h6wGjCDcUAEIWSx7mGtB4wHKJlZR/hBgCiUHZGklLdLnl97SHvu3FI8rivf3gA0SCaVvZxWQoAolCM06HiRZmSrgeZL+p5XLwok/1uEBWibWUf4QYAolRBVqp2LZ8jjzv40pPH7dKu5XPY5wZRIRpX9nFZCgCiWEFWqu7N9ETFfQxAKANZ2Zd71/hh6RPhBgCiXIzTMWwfCsBARePKPi5LAQCAQYvGlX2EGwAAMGg9K/t6u1Dq0PVVU8O5so9wAwAABi0aV/YRbgAAwC2JtpV93FAMAABuWTSt7CPcAACAIREtK/u4LAUAAKxCuAEAAFYh3AAAAKsQbgAAgFUINwAAwCqEGwAAYBXCDQAAsArhBgAAWIVwAwAArEK4AQAAViHcAAAAqxBuAACAVQg3AADAKoQbAABgFcINAACwCuEGAABYhXADAACsQrgBAABWIdwAAACrEG4AAIBVCDcAAMAqhBsAAGAVwg0AALAK4QYAAFiFcAMAAKxCuAEAAFYh3AAAAKsQbgAAgFUINwAAwCqEGwAAYBXCDQAAsArhBgAAWIVwAwAArEK4AQAAViHcAAAAqxBuAACAVQg3AADAKoQbAABgFcINAACwCuEGAABYhXADAACsMizhZufOnZo6dapcLpdycnJUVVXVZ/2DBw9q+vTpcrlcmjlzpo4cORL0vDFG27ZtU2pqqsaOHau8vDy9//774RwCAAAYIcIebg4cOKCioiIVFxerpqZGs2bNUn5+vpqamkLWP378uAoLC7V69Wq98847Wrx4sRYvXqza2tpAnR/96Ef6yU9+ot27d+vEiRO67bbblJ+fr/b29nAPBwAARDmHMcaE8w1ycnL0ta99TS+99JIkye/3Kz09XevXr9fmzZtvqL906VJdu3ZNhw8fDpTNnz9fs2fP1u7du2WMUVpamjZs2KC//uu/liT5fD6lpKRoz549+t73vnfTPrW2tsrtdsvn8ykhIWGIRgoAAMKpv5/fYT1z09nZqerqauXl5X3+hk6n8vLyVFlZGbJNZWVlUH1Jys/PD9Svr6+X1+sNquN2u5WTk9Pra3Z0dKi1tTXoAAAAdgpruLl69aq6u7uVkpISVJ6SkiKv1xuyjdfr7bN+z78Dec3S0lK53e7AkZ6ePqjxAACA6DcqVktt2bJFPp8vcFy6dCnSXQIAAGES1nAzYcIExcTEqLGxMai8sbFRHo8nZBuPx9Nn/Z5/B/Ka8fHxSkhICDoAAICdwhpu4uLiNHfuXFVUVATK/H6/KioqlJubG7JNbm5uUH1JOnr0aKB+RkaGPB5PUJ3W1ladOHGi19cEAACjR2y436CoqEgrV67UvHnzlJ2dre3bt+vatWtatWqVJGnFihWaNGmSSktLJUlPPfWUFixYoBdeeEH333+/9u/fr1OnTunll1+WJDkcDv3VX/2V/u7v/k7Tpk1TRkaGnn32WaWlpWnx4sXhHg4AAIhyYQ83S5cu1Ycffqht27bJ6/Vq9uzZKi8vD9wQfPHiRTmdn59Auueee7Rv3z5t3bpVTz/9tKZNm6ZDhw4pKysrUOdv/uZvdO3aNa
1Zs0YtLS36+te/rvLycrlcrnAPBwAARLmw73MTjdjnBgCAkScq9rkBAAAYbmG/LAUAsF+336iqvllNbe1KHudSdkaSYpyOSHcLoxThBgBwS8prG1RSVqcG3+e/75fqdql4UaYKslIj2DOMVlyWAgAMWnltg9burQkKNpLk9bVr7d4aldc2RKhnGM0INwCAQen2G5WU1SnUqpSespKyOnX7R926FUQY4QYAMChV9c03nLH5IiOpwdeuqvrm4esUIMINAGCQmtp6DzaDqQcMFcINAGBQksf1b+PU/tYDhgrhBgAwKNkZSUp1u9Tbgm+Hrq+ays5IGs5uAYQbAMDgxDgdKl6UKUk3BJyex8WLMtnvBsOOcAMAGLSCrFTtWj5HHnfwpSeP26Vdy+ewzw0igk38AAC3pCArVfdmetihGFGDcAMAuGUxTody7xof6W4AkrgsBQAALEO4AQAAViHcAAAAqxBuAACAVQg3AADAKoQbAABgFcINAACwCuEGAABYhXADAACsQrgBAABWIdwAAACrEG4AAIBVCDcAAMAqhBsAAGAVwg0AALAK4QYAAFiFcAMAAKxCuAEAAFYh3AAAAKsQbgAAgFUINwAAwCqEGwAAYBXCDQAAsArhBgAAWIVwAwAArEK4AQAAViHcAAAAqxBuAACAVQg3AADAKoQbAABgFcINAACwCuEGAABYhXADAACsQrgBAABWIdwAAACrEG4AAIBVCDcAAMAqhBsAAGAVwg0AALAK4QYAAFiFcAMAAKxCuAEAAFYh3AAAAKsQbgAAgFUINwAAwCqEGwAAYBXCDQAAsArhBgAAWIVwAwAArBK2cNPc3Kxly5YpISFBiYmJWr16tT7++OM+27S3t2vdunUaP368br/9di1ZskSNjY2B59977z0VFhYqPT1dY8eO1YwZM7Rjx45wDQEAAIxAYQs3y5Yt05kzZ3T06FEdPnxYv/rVr7RmzZo+2/zgBz9QWVmZDh48qF/+8pe6cuWKvvvd7waer66uVnJysvbu3aszZ87omWee0ZYtW/TSSy+FaxgAAGCEcRhjzFC/6NmzZ5WZmamTJ09q3rx5kqTy8nLdd999+uCDD5SWlnZDG5/Pp4kTJ2rfvn166KGHJEnnzp3TjBkzVFlZqfnz54d8r3Xr1uns2bM6duxYv/vX2toqt9stn8+nhISEQYwQAAAMt/5+foflzE1lZaUSExMDwUaS8vLy5HQ6deLEiZBtqqur1dXVpby8vEDZ9OnTNWXKFFVWVvb6Xj6fT0lJSX32p6OjQ62trUEHAACwU1jCjdfrVXJyclBZbGyskpKS5PV6e20TFxenxMTEoPKUlJRe2xw/flwHDhy46eWu0tJSud3uwJGent7/wQAAgBFlQOFm8+bNcjgcfR7nzp0LV1+D1NbW6sEHH1RxcbG+9a1v9Vl3y5Yt8vl8gePSpUvD0kcAADD8YgdSecOGDXr00Uf7rHPnnXfK4/GoqakpqPyzzz5Tc3OzPB5PyHYej0ednZ1qaWkJOnvT2Nh4Q5u6ujotXLhQa9as0datW2/a7/j4eMXHx9+0HgAAGPkGFG4mTpyoiRMn3rRebm6uWlpaVF1drblz50qSjh07Jr/fr5ycnJBt5s6dqzFjxqiiokJLliyRJJ0/f14XL15Ubm5uoN6ZM2f0zW9+UytXrtTf//3fD6T7AABgFAjLailJ+va3v63Gxkbt3r1bXV1dWrVqlebNm6d9+/ZJki5fvqyFCxfq1VdfVXZ2tiRp7dq1OnLkiPbs2aOEhAStX79e0vV7a6Trl6K++c1vKj8/X88//3zgvWJiYvoVunqwWgoAgJGnv5/fAzpzMxCvvfaannjiCS1cuFBOp1NLlizRT37yk8DzXV1dOn/+vD755JNA2Ysvvhio29HRofz8fP3sZz8LPP/666/rww8/1N69e7V3795A+R133KHf/e534RoKAAAYQcJ25iaaceYGAICRJ6L73AAAAEQK4QYAAFiFcAMAAK
xCuAEAAFYh3AAAAKsQbgAAgFUINwAAwCqEGwAAYBXCDQAAsArhBgAAWIVwAwAArEK4AQAAViHcAAAAqxBuAACAVQg3AADAKoQbAABgFcINAACwCuEGAABYhXADAACsQrgBAABWIdwAAACrEG4AAIBVCDcAAMAqhBsAAGAVwg0AALAK4QYAAFiFcAMAAKxCuAEAAFYh3AAAAKsQbgAAgFUINwAAwCqEGwAAYBXCDQAAsArhBgAAWIVwAwAArEK4AQAAViHcAAAAqxBuAACAVQg3AADAKoQbAABgFcINAACwCuEGAABYhXADAACsQrgBAABWIdwAAACrEG4AAIBVCDcAAMAqhBsAAGAVwg0AALAK4QYAAFiFcAMAAKxCuAEAAFYh3AAAAKsQbgAAgFUINwAAwCqEGwAAYBXCDQAAsArhBgAAWIVwAwAArEK4AQAAViHcAAAAqxBuAACAVQg3AADAKmELN83NzVq2bJkSEhKUmJio1atX6+OPP+6zTXt7u9atW6fx48fr9ttv15IlS9TY2Biy7kcffaTJkyfL4XCopaUlDCMAAAAjUdjCzbJly3TmzBkdPXpUhw8f1q9+9SutWbOmzzY/+MEPVFZWpoMHD+qXv/ylrly5ou9+97sh665evVp//Md/HI6uAwCAEcxhjDFD/aJnz55VZmamTp48qXnz5kmSysvLdd999+mDDz5QWlraDW18Pp8mTpyoffv26aGHHpIknTt3TjNmzFBlZaXmz58fqLtr1y4dOHBA27Zt08KFC/X73/9eiYmJ/e5fa2ur3G63fD6fEhISbm2wAABgWPT38zssZ24qKyuVmJgYCDaSlJeXJ6fTqRMnToRsU11dra6uLuXl5QXKpk+frilTpqiysjJQVldXpx/+8Id69dVX5XT2r/sdHR1qbW0NOgAAgJ3CEm68Xq+Sk5ODymJjY5WUlCSv19trm7i4uBvOwKSkpATadHR0qLCwUM8//7ymTJnS7/6UlpbK7XYHjvT09IENCLes229U+duP9PN3L6vytx+p2z/kJwwBAJAkxQ6k8ubNm/Xcc8/1Wefs2bO31KG+bNmyRTNmzNDy5csH3K6oqCjwuLW1lYAzjMprG1RSVqcGX3ugLNXtUvGiTBVkpUawZ0B06fYbVdU3q6mtXcnjXMrOSFKM0xHpbgEjzoDCzYYNG/Too4/2WefOO++Ux+NRU1NTUPlnn32m5uZmeTyekO08Ho86OzvV0tISdPamsbEx0ObYsWM6ffq0Xn/9dUlSz+1CEyZM0DPPPKOSkpKQrx0fH6/4+Pj+DBFDrLy2QWv31ujL52m8vnat3VujXcvnEHAA8SUAGEoDCjcTJ07UxIkTb1ovNzdXLS0tqq6u1ty5cyVdDyZ+v185OTkh28ydO1djxoxRRUWFlixZIkk6f/68Ll68qNzcXEnSv/3bv+nTTz8NtDl58qT+4i/+Qm+99ZbuuuuugQwFw6Dbb1RSVndDsJEkI8khqaSsTvdmevh2ilGNLwHA0BpQuOmvGTNmqKCgQI899ph2796trq4uPfHEE/re974XWCl1+fJlLVy4UK+++qqys7Pldru1evVqFRUVKSkpSQkJCVq/fr1yc3MDK6W+HGCuXr0aeL+BrJbC8Kiqbw76FvplRlKDr11V9c3KvWv88HUMiCJ8CQCGXtj2uXnttdc0ffp0LVy4UPfdd5++/vWv6+WXXw4839XVpfPnz+uTTz4JlL344ov6zne+oyVLluhP//RP5fF49MYbb4Sriwizprbeg81g6gE2GsiXAAD9E5YzN5KUlJSkffv29fr81KlT9eUtdlwul3bu3KmdO3f26z2+8Y1v3PAaiB7J41xDWg+wEV8CgKHHb0shbLIzkpTqdqm3E+kOXb9hMjsjaTi7BUQVvgQAQ49wg7CJcTpUvChTkm4IOD2Pixdlch8BRjW+BABDj3CDsCrIStWu5XPkcQd/6/S4XawAAcSXACAcwvLbUtGO35YafmxOBvSNfW
6Am+vv5zfhhnADIErwJQDoW38/v8O2WgoAMDAxTgd7PgFDgHtuAACAVQg3AADAKoQbAABgFcINAACwCuEGAABYhXADAACsQrgBAABWIdwAAACrEG4AAIBVCDcAAMAqhBsAAGAVwg0AALAK4QYAAFiFcAMAAKxCuAEAAFYh3AAAAKsQbgAAgFUINwAAwCqxke4Aruv2G1XVN6uprV3J41zKzkhSjNMR6W4BADDiEG6iQHltg0rK6tTgaw+UpbpdKl6UqYKs1Aj2DACAkYfLUhFWXtugtXtrgoKNJHl97Vq7t0bltQ0R6hkAACMT4SaCuv1GJWV1MiGe6ykrKatTtz9UDQAAEArhJoKq6ptvOGPzRUZSg69dVfXNw9cpAABGOMJNBDW19R5sBlMPAAAQbiIqeZxrSOsBAADCTURlZyQp1e1Sbwu+Hbq+aio7I2k4uwUAwIhGuImgGKdDxYsyJemGgNPzuHhRJvvdAAAwAISbCCvIStWu5XPkcQdfevK4Xdq1fA773AAAMEBs4hcFCrJSdW+mhx2KAQAYAoSbKBHjdCj3rvGR7gYAACMel6UAAIBVCDcAAMAqhBsAAGAVwg0AALAK4QYAAFiFcAMAAKxCuAEAAFYh3AAAAKsQbgAAgFVG5Q7FxhhJUmtra4R7AgAA+qvnc7vnc7w3ozLctLW1SZLS09Mj3BMAADBQbW1tcrvdvT7vMDeLPxby+/26cuWKxo0bJ4djdPw4ZWtrq9LT03Xp0iUlJCREujvWY76HF/M9vJjv4cV8f84Yo7a2NqWlpcnp7P3OmlF55sbpdGry5MmR7kZEJCQkjPr/OIYT8z28mO/hxXwPL+b7ur7O2PTghmIAAGAVwg0AALAK4WaUiI+PV3FxseLj4yPdlVGB+R5ezPfwYr6HF/M9cKPyhmIAAGAvztwAAACrEG4AAIBVCDcAAMAqhBsAAGAVwo1FmpubtWzZMiUkJCgxMVGrV6/Wxx9/3Geb9vZ2rVu3TuPHj9ftt9+uJUuWqLGxMWTdjz76SJMnT5bD4VBLS0sYRjByhGOu33vvPRUWFio9PV1jx47VjBkztGPHjnAPJWrt3LlTU6dOlcvlUk5Ojqqqqvqsf/DgQU2fPl0ul0szZ87UkSNHgp43xmjbtm1KTU3V2LFjlZeXp/fffz+cQxgxhnKuu7q6tGnTJs2cOVO33Xab0tLStGLFCl25ciXcwxgxhvpv+4sef/xxORwObd++fYh7PcIYWKOgoMDMmjXLvP322+att94yf/iHf2gKCwv7bPP444+b9PR0U1FRYU6dOmXmz59v7rnnnpB1H3zwQfPtb3/bSDK///3vwzCCkSMcc/1P//RP5sknnzT/8z//Y37729+af/7nfzZjx441P/3pT8M9nKizf/9+ExcXZ1555RVz5swZ89hjj5nExETT2NgYsv6vf/1rExMTY370ox+Zuro6s3XrVjNmzBhz+vTpQJ1/+Id/MG632xw6dMi899575oEHHjAZGRnm008/Ha5hRaWhnuuWlhaTl5dnDhw4YM6dO2cqKytNdna2mTt37nAOK2qF42+7xxtvvGFmzZpl0tLSzIsvvhjmkUQ3wo0l6urqjCRz8uTJQNl//dd/GYfDYS5fvhyyTUtLixkzZow5ePBgoOzs2bNGkqmsrAyq+7Of/cwsWLDAVFRUjPpwE+65/qLvf//75s/+7M+GrvMjRHZ2tlm3bl3gcXd3t0lLSzOlpaUh6z/88MPm/vvvDyrLyckxf/mXf2mMMcbv9xuPx2Oef/75wPMtLS0mPj7e/Mu//EsYRjByDPVch1JVVWUkmQsXLgxNp0ewcM33Bx98YCZNmmRqa2vNHXfcMerDDZelLFFZWanExETNmzcvUJaXlyen06kTJ06EbFNdXa2uri7l5eUFyqZPn64pU6aosrIyUFZXV6cf/vCHevXVV/v8obLRIpxz/WU+n09JSUlD1/kRoLOzU9XV1UFz5XQ6lZeX1+tcVVZWBtWXpP
z8/ED9+vp6eb3eoDput1s5OTl9zr/twjHXofh8PjkcDiUmJg5Jv0eqcM233+/XI488oo0bN+ruu+8OT+dHGD6pLOH1epWcnBxUFhsbq6SkJHm93l7bxMXF3fA/nJSUlECbjo4OFRYW6vnnn9eUKVPC0veRJlxz/WXHjx/XgQMHtGbNmiHp90hx9epVdXd3KyUlJai8r7nyer191u/5dyCvORqEY66/rL29XZs2bVJhYeGo/9HHcM33c889p9jYWD355JND3+kRinAT5TZv3iyHw9Hnce7cubC9/5YtWzRjxgwtX748bO8RLSI9119UW1urBx98UMXFxfrWt741LO8JDLWuri49/PDDMsZo165dke6Olaqrq7Vjxw7t2bNHDocj0t2JGrGR7gD6tmHDBj366KN91rnzzjvl8XjU1NQUVP7ZZ5+publZHo8nZDuPx6POzk61tLQEnVFobGwMtDl27JhOnz6t119/XdL1FSeSNGHCBD3zzDMqKSkZ5MiiT6TnukddXZ0WLlyoNWvWaOvWrYMay0g2YcIExcTE3LBqL9Rc9fB4PH3W7/m3sbFRqampQXVmz549hL0fWcIx1z16gs2FCxd07NixUX/WRgrPfL/11ltqamoKOrPe3d2tDRs2aPv27frd7343tIMYKSJ90w+GRs9NrqdOnQqU/eIXv+jXTa6vv/56oOzcuXNBN7n+3//9nzl9+nTgeOWVV4wkc/z48V7v7rdduObaGGNqa2tNcnKy2bhxY/gGMAJkZ2ebJ554IvC4u7vbTJo0qc+bLr/zne8EleXm5t5wQ/GPf/zjwPM+n48bis3Qz7UxxnR2dprFixebu+++2zQ1NYWn4yPUUM/31atXg/4fffr0aZOWlmY2bdpkzp07F76BRDnCjUUKCgrMn/zJn5gTJ06Y//3f/zXTpk0LWp78wQcfmK9+9avmxIkTgbLHH3/cTJkyxRw7dsycOnXK5Obmmtzc3F7f48033xz1q6WMCc9cnz592kycONEsX77cNDQ0BI7R+OGwf/9+Ex8fb/bs2WPq6urMmjVrTGJiovF6vcYYYx555BGzefPmQP1f//rXJjY21vz4xz82Z8+eNcXFxSGXgicmJpqf//zn5je/+Y158MEHWQpuhn6uOzs7zQMPPGAmT55s3n333aC/5Y6OjoiMMZqE42/7y1gtRbixykcffWQKCwvN7bffbhISEsyqVatMW1tb4Pn6+nojybz55puBsk8//dR8//vfN3/wB39gvvKVr5g///M/Nw0NDb2+B+HmunDMdXFxsZF0w3HHHXcM48iix09/+lMzZcoUExcXZ7Kzs83bb78deG7BggVm5cqVQfX/9V//1fzRH/2RiYuLM3fffbf5z//8z6Dn/X6/efbZZ01KSoqJj483CxcuNOfPnx+OoUS9oZzrnr/9UMcX/3sYzYb6b/vLCDfGOIz5/zdRAAAAWIDVUgAAwCqEGwAAYBXCDQAAsArhBgAAWIVwAwAArEK4AQAAViHcAAAAqxBuAACAVQg3AADAKoQbAABgFcINAACwCuEGAABY5f8BJEyIo9fm7mMAAAAASUVORK5CYII=", + "text/plain": [ + "<Figure size 640x480 with 1 Axes>" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import matplotlib.pyplot as plt\n", + "def plotEmbedding(embedded):\n", + " return plt.scatter(x=embedded[:,0], y=embedded[:,1])\n", + "\n", + "plotEmbedding(embedded)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "There is a built in 
string lookup layer so you don't have to use your own dictionary for this:" + ] + }, + { + "cell_type": "code", + "execution_count": 130, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "<tf.Tensor: shape=(2,), dtype=int64, numpy=array([0, 4])>" + ] + }, + "execution_count": 130, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "str_lookup = keras.layers.StringLookup()\n", + "str_lookup.adapt(arrText)\n", + "# 0 is reserved for unrecognized stuff. \n", + "str_lookup(['rnd', 'weird'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "TextVectorization is just a better string lookup that removes punctuation, sets lowercase, and splits by whitespace. \n", + "\n", + "To preserve case and punctuation, set standardize=None\n", + "\n", + "Unknown words are now encoded as 1" + ] + }, + { + "cell_type": "code", + "execution_count": 161, + "metadata": {}, + "outputs": [], + "source": [ + "training = ['the distant realm of Lumina, where the sky shimmered with hues unknown to the mundane world, an ancient prophecy began to unfold. 
The Great Tree of Elaria, standing tall and majestic in the heart of the enchanted forest, whispered secrets to']\n", + "\n", + "text_vec = keras.layers.TextVectorization()\n", + "text_vec.adapt(training)" + ] + }, + { + "cell_type": "code", + "execution_count": 177, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "<tf.Tensor: shape=(1, 6), dtype=int64, numpy=array([[29, 17, 4, 21, 8, 2]])>" + ] + }, + "execution_count": 177, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "out = text_vec(['distant realm of Lumina, where the '])\n", + "out" + ] + }, + { + "cell_type": "code", + "execution_count": 178, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['', '[UNK]', 'the', 'to', 'of', 'world', 'with', 'whispered',\n", + " 'where', 'unknown', 'unfold', 'tree', 'tall', 'standing', 'sky',\n", + " 'shimmered', 'secrets', 'realm', 'prophecy', 'mundane', 'majestic',\n", + " 'lumina', 'in', 'hues', 'heart', 'great', 'forest', 'enchanted',\n", + " 'elaria', 'distant', 'began', 'and', 'ancient', 'an'], dtype='<U9')" + ] + }, + "execution_count": 178, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "vocab = np.array(text_vec.get_vocabulary())\n", + "vocab" + ] + }, + { + "cell_type": "code", + "execution_count": 186, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'distant realm of lumina where the'" + ] + }, + "execution_count": 186, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "decoded_texts = []\n", + "for sequence in out:\n", + " decoded_texts.append([vocab[i] for i in sequence if i != 0])\n", + "' '.join(decoded_texts[0])" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + 
"nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/phishingClassification/PhishingClassification.ipynb b/phishingClassification/PhishingClassification.ipynb @@ -0,0 +1,273 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "https://www.kaggle.com/datasets/subhajournal/phishingemails" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Unnamed: 0</th>\n", + " <th>Email Text</th>\n", + " <th>Email Type</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>0</td>\n", + " <td>re : 6 . 
1100 , disc : uniformitarianism , re ...</td>\n", + " <td>Safe Email</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>1</td>\n", + " <td>the other side of * galicismos * * galicismo *...</td>\n", + " <td>Safe Email</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>2</td>\n", + " <td>re : equistar deal tickets are you still avail...</td>\n", + " <td>Safe Email</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>3</td>\n", + " <td>\\nHello I am your hot lil horny toy.\\n I am...</td>\n", + " <td>Phishing Email</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>4</td>\n", + " <td>software at incredibly low prices ( 86 % lower...</td>\n", + " <td>Phishing Email</td>\n", + " </tr>\n", + " <tr>\n", + " <th>...</th>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>18645</th>\n", + " <td>18646</td>\n", + " <td>date a lonely housewife always wanted to date ...</td>\n", + " <td>Phishing Email</td>\n", + " </tr>\n", + " <tr>\n", + " <th>18646</th>\n", + " <td>18647</td>\n", + " <td>request submitted : access request for anita ....</td>\n", + " <td>Safe Email</td>\n", + " </tr>\n", + " <tr>\n", + " <th>18647</th>\n", + " <td>18648</td>\n", + " <td>re : important - prc mtg hi dorn &amp; john , as y...</td>\n", + " <td>Safe Email</td>\n", + " </tr>\n", + " <tr>\n", + " <th>18648</th>\n", + " <td>18649</td>\n", + " <td>press clippings - letter on californian utilit...</td>\n", + " <td>Safe Email</td>\n", + " </tr>\n", + " <tr>\n", + " <th>18649</th>\n", + " <td>18650</td>\n", + " <td>empty</td>\n", + " <td>Phishing Email</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>18650 rows × 3 columns</p>\n", + "</div>" + ], + "text/plain": [ + " Unnamed: 0 Email Text \\\n", + "0 0 re : 6 . 1100 , disc : uniformitarianism , re ... \n", + "1 1 the other side of * galicismos * * galicismo *... \n", + "2 2 re : equistar deal tickets are you still avail... 
\n", + "3 3 \\nHello I am your hot lil horny toy.\\n I am... \n", + "4 4 software at incredibly low prices ( 86 % lower... \n", + "... ... ... \n", + "18645 18646 date a lonely housewife always wanted to date ... \n", + "18646 18647 request submitted : access request for anita .... \n", + "18647 18648 re : important - prc mtg hi dorn & john , as y... \n", + "18648 18649 press clippings - letter on californian utilit... \n", + "18649 18650 empty \n", + "\n", + " Email Type \n", + "0 Safe Email \n", + "1 Safe Email \n", + "2 Safe Email \n", + "3 Phishing Email \n", + "4 Phishing Email \n", + "... ... \n", + "18645 Phishing Email \n", + "18646 Safe Email \n", + "18647 Safe Email \n", + "18648 Safe Email \n", + "18649 Phishing Email \n", + "\n", + "[18650 rows x 3 columns]" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd \n", + "\n", + "df = pd.read_csv('../datasets/phishing/Phishing_Email.csv')\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.model_selection import train_test_split\n", + "\n", + "X_train, X_test, y_train, y_test = train_test_split(df.drop('Email Type', axis=1), df['Email Type'])\n", + "X_val , X_test , y_val , y_test = train_test_split(X_test, y_test, test_size=.5)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-06-28 15:01:02.519344: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.\n", + "2024-06-28 15:01:02.522426: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.\n", + "2024-06-28 15:01:02.563951: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in 
performance-critical operations.\n", + "To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", + "2024-06-28 15:01:03.357473: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n", + "2024-06-28 15:01:03.896173: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355\n", + "2024-06-28 15:01:03.896878: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2251] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.\n", + "Skipping registering GPU devices...\n" + ] + } + ], + "source": [ + "from sklearn.naive_bayes import CategoricalNB\n", + "import keras\n", + "import numpy as np\n", + "\n", + "def toStr(inp):\n", + " return str(inp)\n", + "\n", + "layer = keras.layers.TextVectorization()\n", + "\n", + "X_train['Email Text'] = X_train['Email Text'].apply(toStr)\n", + "arr = ' '.join(np.array(X_train['Email Text']).tolist())" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "layer.adapt(arr)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "166954" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# All the words vectorized\n", + "len(layer.get_vocabulary())" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + 
"<tf.Tensor: shape=(1, 5), dtype=int64, numpy=array([[ 2, 15274, 9, 12, 11579]])>" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "layer(['the fuck is that shit'])" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}