notes

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs

commit d4332493dafa7398afca298c663fc6f3b31234c6
parent 80924d21b2a706cb2c6da0bc38f560aabc6de391
Author: Andrew Laack <andrew@laack.co>
Date:   Thu, 12 Jun 2025 16:02:34 -0500

Did work with pytorch, took notes on DDIA

Diffstat:
M.gitignore | 4++--
Mlatex/designing/DesigningDataIntensiveApplications.tex | 76+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
Alatex/pytorch/PyTorch.tex | 136+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Apytorch/ch2/pretrained-image-classifiers.ipynb | 159+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Apytorch/ch2/tensor-multiplication.ipynb | 41+++++++++++++++++++++++++++++++++++++++++
Dpytorch/tensor-multiplication.ipynb | 54------------------------------------------------------
6 files changed, 413 insertions(+), 57 deletions(-)

diff --git a/.gitignore b/.gitignore @@ -3,4 +3,5 @@ *.log *.toc .venv/* -myenv- \ No newline at end of file +myenv +data/ diff --git a/latex/designing/DesigningDataIntensiveApplications.tex b/latex/designing/DesigningDataIntensiveApplications.tex @@ -41,7 +41,7 @@ } \lstnewenvironment{code}{ - \hspace{.45cm}\textbf{Code:} + \textbf{Code:} \lstset{ basicstyle=\ttfamily, columns=fullflexible, @@ -105,6 +105,79 @@ Imperative languages state the steps needed to perform a task. Declarative langu Additionally, given the necessity of execution order for imperative languages, it can be easier to parallelize execution when using a declarative style. +\subsection{MapReduce} + +MapReduce is a model for the bulk processing of large amounts of data across distributed systems. The model was popularized by Google and is implemented, to varying degrees, in some NoSQL datastores like MongoDB and CouchDB. + +MapReduce allows the definition of the functions map and reduce. Map defines the key and value to emit from each document, and reduce is called for each grouping of keys returned from map. + +This is exceptionally useful for big datasets because map can be parallelized (locally ran on each system), the results can be shuffled together based on keys, and then reduced to outputs. + +\vspace{.25cm} + +\noindent\begin{code} + +db.observations.mapReduce( + function map() { + var year = this.observationTimestamp.getFullYear(); + var month = this.observationTimestamp.getMonth() + 1; + emit(year + "-" + month, this.numAnimals); + }, + function reduce(key, values) { + return Array.sum(values); + }, + { + query: { family: "Sharks" }, + out: "monthlySharkReport" + } +); + +\end{code} + +The map and reduce functions must also be \textit{pure} functions, functions that don't have any side effects and always have the same output for the same inputs. 
+ +Additionally, given the low-level nature of MapReduce, it is possible to create SQL implementations that use a pipeline of MapReduce steps. + + +\subsection{Aggregation Pipeline} + +Some more recent versions of MongoDB support the aggregation pipeline query language, which is similar to SQL but built with documents in mind. This allows a declarative, query-optimizer-based approach to be used with document datastores. + +\begin{verbatim} +db.observations.aggregate([ + { $match: { family: "Sharks" } }, + { $group: { + _id: { + year: { $year: "$observationTimestamp" }, + month: { $month: "$observationTimestamp" } + }, + totalAnimals: { $sum: "$numAnimals" } + } } +]) +\end{verbatim} + +\subsection{Graph-Like Data Models} + +Graph models are often useful when there are lots of many-to-many relationships, as is the case with a social network, a road network, or a graph of the web. Despite the homogeneity of data in each of these situations, it needn't be so. In the case of Facebook, they may have a single graph where vertices could represent friends, events, comments, and everything else that relates individuals to each other. + +\subsubsection{Property Graph Model} + +In the property graph model, as implemented by Neo4j, Titan, and InfiniteGraph, each vertex contains an identifier, a set of outgoing edges, a set of incoming edges, and a collection of key-value pairs (properties). Each edge has a unique identifier, a tail vertex (where it starts), a head vertex (where it ends), a label to describe the relationship, and a collection of properties (key-value pairs). + +As we see, this is a directed graph and each object (edges and vertices) contains its own collection of properties and identifier. Conceptually, this can be implemented by a SQL database where we have a table for vertices and a table for edges, each of which contains respective columns for attributes of said records, presuming we have a json datatype, which is supported in PostgreSQL. 
+ + + +\subsubsection{Triple-Store Model} + +The triple-store model, as implemented by Datomic, AllegroGraph, and others, + +\subsection{Graph Query Languages} + +\subsubsection{Cypher} +\subsubsection{SPARQL} +\subsubsection{Datalog} + \section{Storage and Retrieval} \section{Encoding and Evolution} \section{Replication} @@ -116,4 +189,5 @@ Additionally, given the necessity of execution order for imperative languages, i \section{Stream Processing} \section{The Future of Data Systems} + \end{document} diff --git a/latex/pytorch/PyTorch.tex b/latex/pytorch/PyTorch.tex @@ -0,0 +1,136 @@ +\documentclass[12pt, letterpaper]{article} +\usepackage{xcolor} + +\setlength{\parindent}{0pt} +\setlength{\parskip}{.5em} + +\usepackage{enumitem} +\usepackage{graphicx} +\usepackage{listings} +\usepackage{caption} +\usepackage{tcolorbox} +\usepackage{datetime} +\usepackage{amsfonts} +\usepackage{amsmath} +\usepackage{geometry} +\geometry{verbose,tmargin=1in,bmargin=1in,lmargin=1in,rmargin=1in} +\usepackage{amssymb,enumerate} +\usepackage{amsthm,stmaryrd} +\usepackage[all]{xy} + +\newenvironment{definition}{ + \begin{quote} + \textbf{Definition:} + }{ + \end{quote} +} + + +\newenvironment{explanation}{ + \begin{quote} + \textbf{Explanation:} + }{ + \end{quote} +} + +\newenvironment{example}{ + \begin{quote} + \textbf{Example:} + }{ + \end{quote} +} + +\lstnewenvironment{code}{ + \hspace{.45cm}\textbf{Code:} + \lstset{ + basicstyle=\ttfamily, + columns=fullflexible, + breaklines=true + } +}{ +} + + +\begin{document} + +\noindent{\large \textbf{Deep Learning with PyTorch by Eli Stevens}} + +\noindent Notes by Andrew Laack + +\tableofcontents + +\section{Introducing Deep Learning and PyTorch} + +\subsection{PyTorch} + +PyTorch is a deep learning library, but more generally, is an optimization library. + +\subsection{torch.nn} + +torch.nn provides common functionality for neural network layers and architecture. Some common functionality are activation functions, conv. 
+layers, loss functions, and connected layers. + +\subsection{torch.Tensor} + +torch.Tensor is a multidimensional array class that provides tensors for use with PyTorch's optimization functionality. + +\subsection{torch.optim} + +In our training loop, PyTorch has an autograd engine to compute gradients quickly, but it does not supply a way to update the weights and biases of our model. torch.optim provides these optimizers to update the weights of our models. + +\subsection{Dataset (torch.utils.data)} + +The Dataset class is used to store data in a PyTorch-compliant way. + +\subsection{DataLoader (torch.utils.data)} + +DataLoader wraps our dataset, facilitating batching and other fast operations. + +\subsection{Distributed Training} + +To facilitate distributed training, PyTorch provides torch.nn.parallel.Distributed\-DataParallel and torch.distributed to use additional hardware. + +\subsection{TorchScript} + +TorchScript is a way to compile a model ahead of time into a set of instructions that can be run without a Python runtime. + +\subsection{Modules} + +Modules are the building blocks for NN architectures in PyTorch; other libraries often call modules \textit{layers}. + +\section{Datasets} + +\subsection{torchvision.transforms} + +torchvision.transforms allows us to define pipelines to do basic preprocessing. + +\subsection{ImageNet} + +ImageNet is a dataset of over 14 million labeled images, maintained by Stanford University. Each of the images is labeled with words from the WordNet dataset, which is an English word dataset. + +The ImageNet Large Scale Visual Recognition Challenge (ILSVRC) is a varying competition, held each year, to test models on different tasks related to the dataset. The image classification task consists of taking an input image and giving 5 labels out of 1,000 total labels, ranked by confidence, to describe the image. The training set for ILSVRC is 1.2 million images labeled with one of 1,000 nouns. 
+ +\section{Pretrained Networks} + +\subsection{torchvision.models} + +torchvision.models provides a bunch of pretrained vision models. The capitalized options are classes, and the lower case options are functions that return objects of the specified architecture. When we create objects using the functions, we can also specify if we want \texttt{pretrained=True} (the default is \texttt{False}). + +\subsection{AlexNet} + +AlexNet won ILSVRC in 2012. It is a CNN for image based classification. + +\subsection{ResNet} + +ResNet was the first mainstream residual network, achieving stable training at depths that were previously extremely difficult. The ResNet network was trained on the ImageNet dataset with 1.2 million images and 1,000 categories. + +\section{Architectures} + +\subsection{GAN} + +GANs (generative adversarial networks) are networks that play the \textit{GAN game}, where a generator network tries to create images to trick a discriminator into thinking it is a real image, and the discriminator network tries to predict which images are real and which come from the other network. 
+ +\subsection{CycleGAN} + + + +\end{document} diff --git a/pytorch/ch2/pretrained-image-classifiers.ipynb b/pytorch/ch2/pretrained-image-classifiers.ipynb @@ -0,0 +1,159 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 8, + "id": "1452c9e5-2ed1-4318-920a-6513ba106493", + "metadata": {}, + "outputs": [], + "source": [ + "from torchvision import models\n", + "from torchvision import transforms\n", + "\n", + "resnet = models.resnet101(pretrained=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "1df38383-5c33-4887-ae4d-126feb4ae8d8", + "metadata": {}, + "outputs": [], + "source": [ + "preprocess = transforms.Compose([\n", + " transforms.Resize(256),\n", + " transforms.CenterCrop(224),\n", + " transforms.ToTensor(),\n", + " transforms.Normalize(\n", + " mean = [.485, .456, .406],\n", + " std = [.229, .224, .225]\n", + " )\n", + "])" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "162d49a9-1c26-4da6-9fe7-83673d9f381f", + "metadata": {}, + "outputs": [], + "source": [ + "from PIL import Image\n", + "\n", + "img = Image.open('../data/p1ch2/dog.jpeg')" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "91509f42-efb0-4cd1-86ef-607e0ba3f514", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "\n", + "dog_img_tensor = preprocess(img)\n", + "batch_tensor = torch.unsqueeze(dog_img_tensor, 0)\n", + "\n", + "# put network into eval mode (always do this before inference)\n", + "\n", + "resnet.eval()\n", + "out = resnet(batch_tensor)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "ba143165-f758-4ed5-b834-7d55e314acd6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "207, golden_retriever\n" + ] + } + ], + "source": [ + "\n", + "_, most_likely = torch.max(out, 1)\n", + "\n", + "with open('../data/p1ch2/imagenet_classes.txt') as f:\n", + " labels = [line.strip() for line in f.readlines()]\n", + " 
print(labels[most_likely])" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "f0fc8fc5-bdfe-463f-897c-247c4dd73845", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([97.6346], grad_fn=<IndexBackward0>)" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\n", + "proba = torch.nn.functional.softmax(out, dim=1)[0] * 100\n", + "proba[most_likely]" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "298c308d-ff8e-4316-84c0-7af7c01a3538", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[('207, golden_retriever', 97.63463592529297),\n", + " ('208, Labrador_retriever', 1.6066557168960571),\n", + " ('852, tennis_ball', 0.30485954880714417),\n", + " ('222, kuvasz', 0.06295711547136307),\n", + " ('205, flat-coated_retriever', 0.05671220272779465)]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\n", + "_, indices = torch.sort(out, descending=True)\n", + "[(labels[idx], proba[idx].item()) for idx in indices[0][:5]]" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "myenv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/pytorch/ch2/tensor-multiplication.ipynb b/pytorch/ch2/tensor-multiplication.ipynb @@ -0,0 +1,40 @@ +{ + "cells":[{ + "metadata":{ + }, + "outputs":[{ + "data":{ + "text/plain":["tensor([[ 2, 2, 1],\n"," [ 4, 9, 5],\n"," [ 4, 6, 10]])"] + }, + "output_type":"execute_result", + "execution_count":24, + "metadata":{ + } + }], + "id":"4c9b6198", + "cell_type":"code", + "execution_count":24, + "source":["import torch\n","\n","t1 = 
torch.tensor(data=[[2,2,1], [2, 3, 1], [1, 3, 5]])\n","t2 = torch.tensor(data=[[1,1,1], [2, 3, 5], [4, 2, 2]])\n","\n","torch.mul(t1, t2)"] + }], + "metadata":{ + "kernelspec":{ + "name":"python3", + "display_name":"myenv", + "language":"python" + }, + "language_info":{ + "codemirror_mode":{ + "version":3, + "name":"ipython" + }, + "file_extension":".py", + "pygments_lexer":"ipython3", + "version":"3.11.5", + "name":"python", + "nbconvert_exporter":"python", + "mimetype":"text/x-python" + } + }, + "nbformat":4, + "nbformat_minor":5 +}+ \ No newline at end of file diff --git a/pytorch/tensor-multiplication.ipynb b/pytorch/tensor-multiplication.ipynb @@ -1,54 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 15, - "id": "4c9b6198", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "tensor([[ 2, 2, 1],\n", - " [ 4, 9, 5],\n", - " [ 4, 6, 10]])" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import torch\n", - "\n", - "\n", - "t1 = torch.tensor(data=[[2,2,1], [2, 3, 1], [1, 3, 5]])\n", - "t2 = torch.tensor(data=[[1,1,1], [2, 3, 5], [4, 2, 2]])\n", - "\n", - "torch.mul(t1, t2)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "myenv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.5" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -}