\documentclass[11pt]{beamer}
\usetheme{Madrid}
\usecolortheme{seagull}
\usefonttheme{professionalfonts}

\usepackage[utf8]{inputenc}
\usepackage{amsmath,amssymb,mathtools}
\usepackage{graphicx}
\usepackage{tikz}
\usetikzlibrary{positioning}
\usepackage{hyperref}
\usepackage{listings}
\usepackage{xcolor}
\usepackage{booktabs}
\usepackage{caption}

% Listings (Python) style
\lstset{language=Python,
basicstyle=\ttfamily\scriptsize,
keywordstyle=\color{blue},
commentstyle=\color{gray},
breaklines=true,
frame=single,
columns=fullflexible
}

% TikZ styles
\tikzset{encoder/.style={rectangle, draw=black!80, fill=blue!10, rounded corners, minimum width=2.2cm, minimum height=6mm},
decoder/.style={rectangle, draw=black!80, fill=green!10, rounded corners, minimum width=2.2cm, minimum height=6mm},
arrow/.style={->, >=stealth, thick}}

\title{Autoencoders: Theory, Variants and Applications}
\author{FYS-STK3155/4155}
\date{\today}
\begin{document}

\begin{frame}
\titlepage
\end{frame}

\begin{frame}{Outline}
\tableofcontents
\end{frame}

\begin{frame}{Learning Goals}
\begin{itemize}
\item Understand the basic autoencoder architecture (encoder, latent space, decoder).
\item Derive the linear autoencoder and its connection to PCA.
\item See simple implementations and start hands-on examples.
\end{itemize}
\end{frame}

\begin{frame}{What is an Autoencoder?}
\begin{itemize}
\item An autoencoder (AE) is a neural network trained to reconstruct its input: $\hat{x}=\mathrm{Dec}(\mathrm{Enc}(x))$.
\item Components:
\begin{itemize}
\item \textbf{Encoder} $f_\theta:\mathbb{R}^d\to\mathbb{R}^m$ compresses the input to a latent code $z=f_\theta(x)$.
\item \textbf{Decoder} $g_\phi:\mathbb{R}^m\to\mathbb{R}^d$ reconstructs $\hat{x}=g_\phi(z)$.
\end{itemize}
\item Training objective: minimize a reconstruction loss, e.g.\ the MSE
\[
\mathcal{L}(\theta,\phi)=\frac{1}{N}\sum_{i=1}^N \bigl\|x^{(i)}-g_\phi\bigl(f_\theta(x^{(i)})\bigr)\bigr\|^2_2.
\]
\end{itemize}
\end{frame}

\begin{frame}{Simple Diagram: Autoencoder}
\centering
\begin{tikzpicture}[node distance=12mm, auto, scale=0.95]
\node (x) [draw, rectangle, rounded corners, minimum width=1.6cm, minimum height=6mm] {$x\in\mathbb{R}^d$};
\node (enc1) [encoder, right=12mm of x] {Encoder layers};
\node (z) [draw, rectangle, rounded corners, minimum width=1.6cm, minimum height=6mm, right=12mm of enc1] {$z\in\mathbb{R}^m$};
\node (dec1) [decoder, right=12mm of z] {Decoder layers};
\node (xhat) [draw, rectangle, rounded corners, minimum width=1.6cm, minimum height=6mm, right=12mm of dec1] {$\hat{x}\in\mathbb{R}^d$};
\draw[arrow] (x) -- (enc1);
\draw[arrow] (enc1) -- (z);
\draw[arrow] (z) -- (dec1);
\draw[arrow] (dec1) -- (xhat);
\end{tikzpicture}
\vspace{2mm}
\begin{itemize}
\item Encoder and decoder may be \emph{linear} or \emph{nonlinear}.
\item The bottleneck dimension $m$ controls compression; if $m<d$ the network is forced to learn a compressed representation.
\end{itemize}
\end{frame}

\begin{frame}{Linear Autoencoder}
\begin{itemize}
\item Consider a linear encoder and decoder, without biases for simplicity:
\[
z=W_e x,\qquad \hat{x}=W_d z = W_d W_e x.
\]
\item Minimize the reconstruction error over the dataset $X\in\mathbb{R}^{d\times N}$ (columns are data points):
\[
\min_{W_e,W_d} \|X - W_d W_e X\|_F^2.
\]
\item The product $W = W_d W_e$ has rank at most $m$, so the AE seeks the best rank-$m$ approximation of the identity mapping on the data.
\end{itemize}
\end{frame}

\begin{frame}{Linear AE $\leftrightarrow$ PCA}
\begin{itemize}
\item If we constrain $W_e$ to have orthonormal rows and set $W_d=W_e^T$, minimizing the Frobenius-norm loss yields the PCA projection onto the top-$m$ principal components.
\item Proof sketch (informal): take the SVD of the data, $X = U \Sigma V^T$. The best rank-$m$ approximation in Frobenius norm is the truncation $U_m\Sigma_m V_m^T$ (Eckart--Young theorem). At its optimum, the linear AE spans the same subspace.
\item Therefore a linear AE with MSE loss and these constraints recovers PCA.
\end{itemize}
\end{frame}

\begin{frame}{Derivation (Sketch)}
\begin{align*}
&\min_{W:\ \operatorname{rank}(W)\le m} \|X - W X\|_F^2 \\[2mm]
&\text{SVD: } X = U \Sigma V^T, \quad X_m = U_m \Sigma_m V_m^T \\
&\Rightarrow \operatorname*{argmin}_{\operatorname{rank}(W)\le m} \|X - W X\|_F^2 = P_{U_m} = U_m U_m^T,
\end{align*}
which is the projection onto the leading $m$ principal components. A linear AE with encoder $W_e=U_m^T$ and decoder $W_d=U_m$ achieves this. A numerical check follows on the next slide.
\end{frame}
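
\begin{frame}[fragile]{Sanity Check: Linear AE Optimum via SVD (Sketch)}
A minimal NumPy sketch (not part of the derivation above; names and sizes are illustrative): build the rank-$m$ projection $U_m U_m^T$ from the SVD and compare its reconstruction error with that of a random rank-$m$ map.
\begin{lstlisting}
import numpy as np

rng = np.random.default_rng(0)
d, N, m = 10, 500, 3
X = rng.normal(size=(d, N))           # columns are data points

# Optimal linear AE: project onto the top-m left singular vectors
U, S, Vt = np.linalg.svd(X, full_matrices=False)
W_e = U[:, :m].T                      # encoder
W_d = U[:, :m]                        # decoder
err_svd = np.linalg.norm(X - W_d @ (W_e @ X))**2

# A random rank-m linear map for comparison
A = rng.normal(size=(d, m))
B = rng.normal(size=(m, d))
err_rand = np.linalg.norm(X - A @ (B @ X))**2

print("SVD projection error:", err_svd, " random rank-m error:", err_rand)
\end{lstlisting}
\end{frame}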

\begin{frame}{Practical remarks}
\begin{itemize}
\item The linear AE is good for building intuition; nonlinear AEs (with activation functions) can learn more complex manifolds.
\item The bottleneck dimension $m$ should balance reconstruction fidelity against compression.
\item Regularization (weight decay, sparsity) helps the network learn meaningful features; see the sketch on the next slide.
\end{itemize}
\end{frame}
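
\begin{frame}[fragile]{Weight Decay in the Training Loop (Sketch)}
A minimal, self-contained sketch with an illustrative toy model (not the lecture code): $L_2$ weight decay can be added through the optimizer, or as an explicit penalty that makes the regularized objective visible.
\begin{lstlisting}
import torch
from torch import nn, optim

model = nn.Sequential(nn.Linear(784, 64), nn.ReLU(), nn.Linear(64, 784))

# Option 1: built-in L2 penalty on all parameters
optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)

# Option 2: explicit penalty added to the reconstruction loss
def l2_penalty(model, lam=1e-5):
    return lam * sum(p.pow(2).sum() for p in model.parameters())
\end{lstlisting}
Both options shrink the weights; the explicit version shows exactly what is being minimized.
\end{frame}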

\begin{frame}{Nonlinear Autoencoder (Single Hidden Layer)}
\begin{align*}
z &= \sigma(W_e x + b_e),\\
\hat{x} &= \sigma'(W_d z + b_d),
\end{align*}
where $\sigma,\sigma'$ are activation functions (ReLU, tanh, sigmoid). The loss is typically MSE, or binary cross-entropy for inputs scaled to $[0,1]$. A small numerical sketch follows on the next slide.
\end{frame}
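
\begin{frame}[fragile]{Numerical Sketch: One Forward Pass (NumPy)}
A minimal sketch of the two equations above, with illustrative shapes and untrained random weights:
\begin{lstlisting}
import numpy as np

def relu(a):
    return np.maximum(a, 0.0)

def sigmoid(a):
    return 1.0 / (1.0 + np.exp(-a))

rng = np.random.default_rng(0)
d, m = 8, 3                       # input and latent dimensions
x = rng.random(d)                 # one input in [0, 1]

W_e, b_e = rng.normal(size=(m, d)), np.zeros(m)
W_d, b_d = rng.normal(size=(d, m)), np.zeros(d)

z    = relu(W_e @ x + b_e)        # encoder
xhat = sigmoid(W_d @ z + b_d)     # decoder

mse = np.mean((x - xhat)**2)      # reconstruction loss for this sample
print(z.shape, xhat.shape, mse)
\end{lstlisting}
Training adjusts $W_e, b_e, W_d, b_d$ to reduce this loss over the whole dataset.
\end{frame}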

\begin{frame}{Backpropagation Through the Autoencoder}
\begin{itemize}
\item Loss $\mathcal{L}(\theta,\phi) = \frac{1}{N}\sum_i \ell(x^{(i)},\hat{x}^{(i)})$ with $\hat{x}=g_\phi(f_\theta(x))$.
\item Gradients follow from the chain rule through decoder and encoder: with $z=f_\theta(x)$ and $\hat{x}=g_\phi(z)$,
\[
\frac{\partial \ell}{\partial \phi} = \frac{\partial \ell}{\partial \hat{x}}\,\frac{\partial \hat{x}}{\partial \phi},
\qquad
\frac{\partial \ell}{\partial \theta} = \frac{\partial \ell}{\partial \hat{x}}\,\frac{\partial \hat{x}}{\partial z}\,\frac{\partial z}{\partial \theta}.
\]
\item Standard autodiff in PyTorch / TensorFlow handles these computations.
\end{itemize}
\end{frame}

\begin{frame}[fragile]{PyTorch: Full Autoencoder (MNIST) --- Code}
\begin{lstlisting}
# PyTorch AE for MNIST (self-contained snippet)
import torch
from torch import nn, optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

# Data: ToTensor() scales pixels to [0, 1], matching the sigmoid output below
transform = transforms.ToTensor()
train_ds = datasets.MNIST('.', train=True, download=True, transform=transform)
train_loader = DataLoader(train_ds, batch_size=128, shuffle=True)

# Model
class Autoencoder(nn.Module):
    def __init__(self, input_dim=28*28, hidden_dim=64, latent_dim=16):
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Flatten(),
            nn.Linear(input_dim, hidden_dim), nn.ReLU(),
            nn.Linear(hidden_dim, latent_dim), nn.ReLU())
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, hidden_dim), nn.ReLU(),
            nn.Linear(hidden_dim, input_dim), nn.Sigmoid(),
            nn.Unflatten(1, (1, 28, 28)))
    def forward(self, x):
        z = self.encoder(x)
        return self.decoder(z)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Autoencoder().to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Training loop (10 epochs)
model.train()
for epoch in range(1, 11):
    epoch_loss = 0.0
    for xb, _ in train_loader:
        xb = xb.to(device)
        optimizer.zero_grad()
        xhat = model(xb)
        loss = criterion(xhat, xb)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item() * xb.size(0)
    epoch_loss /= len(train_loader.dataset)
    print(f"Epoch {epoch}, Loss: {epoch_loss:.6f}")
\end{lstlisting}
\end{frame}

\begin{frame}[fragile]{TensorFlow: Full Autoencoder (MNIST) --- Code}
\begin{lstlisting}
# TensorFlow/Keras AE for MNIST
import tensorflow as tf
from tensorflow.keras import layers, models

# Data: scale to [0, 1] and add a channel axis
(x_train, _), (x_test, _) = tf.keras.datasets.mnist.load_data()
x_train = x_train.astype('float32') / 255.0
x_test  = x_test.astype('float32') / 255.0
x_train = x_train[..., None]
x_test  = x_test[..., None]

# Model
input_shape = (28, 28, 1)
encoder_inputs = layers.Input(shape=input_shape)
x = layers.Flatten()(encoder_inputs)
x = layers.Dense(64, activation='relu')(x)
latent = layers.Dense(16, activation='relu')(x)

x = layers.Dense(64, activation='relu')(latent)
x = layers.Dense(28*28, activation='sigmoid')(x)
outputs = layers.Reshape((28, 28, 1))(x)

autoencoder = models.Model(encoder_inputs, outputs)
autoencoder.compile(optimizer='adam', loss='mse')

# Training
autoencoder.fit(x_train, x_train, epochs=10, batch_size=128)
\end{lstlisting}
\end{frame}

\begin{frame}{Discussion: Architectures and Choices}
\begin{itemize}
\item Activation functions: ReLU is a common choice for hidden layers; use a sigmoid output when inputs are scaled to $[0,1]$.
\item Loss: MSE for continuous-valued inputs; binary cross-entropy if inputs are binary or scaled to $[0,1]$ (see the sketch on the next slide).
\item The bottleneck (latent) size trades off compression against reconstruction quality.
\item Use dropout, batch normalization, or weight decay as needed.
\end{itemize}
\end{frame}
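
\begin{frame}[fragile]{Switching to Binary Cross-Entropy (Sketch)}
A minimal sketch, reusing the PyTorch model defined earlier (sigmoid outputs in $[0,1]$ are required for BCE); only the criterion changes in the training loop:
\begin{lstlisting}
import torch
from torch import nn

criterion = nn.BCELoss()   # expects predictions and targets in [0, 1]

# inside the training loop, exactly as before:
#   xhat = model(xb)
#   loss = criterion(xhat, xb)
\end{lstlisting}
If the decoder outputs logits (no final sigmoid), \texttt{nn.BCEWithLogitsLoss} is the numerically safer choice.
\end{frame}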

\begin{frame}{Learning Goals (Lecture 3)}
\begin{itemize}
\item Understand denoising and sparse autoencoders.
\item Learn regularization techniques to improve latent representations.
\item See code examples for denoising and sparsity penalties.
\end{itemize}
\end{frame}

\begin{frame}{Denoising Autoencoder (DAE)}
\begin{itemize}
\item Idea: corrupt the input $x$ into $\tilde{x}$ and train the AE to reconstruct the clean $x$.
\item Objective:
\[
\mathbb{E}_{x\sim p_{\mathrm{data}}}\,\mathbb{E}_{\tilde{x}\sim q(\tilde{x}\mid x)}\bigl[\ell\bigl(x,\,g_\phi(f_\theta(\tilde{x}))\bigr)\bigr].
\]
\item Denoising forces the encoder to learn robust features and the manifold structure of the data.
\end{itemize}
\end{frame}

\begin{frame}[fragile]{PyTorch: Denoising Autoencoder (Snippet)}
\begin{lstlisting}
# Add Gaussian noise to the inputs and train the AE to reconstruct clean images
noise_std = 0.3
for xb, _ in train_loader:
    xb = xb.to(device)
    xb_noisy = xb + noise_std * torch.randn_like(xb)
    xb_noisy = torch.clamp(xb_noisy, 0., 1.)
    optimizer.zero_grad()
    xhat = model(xb_noisy)
    loss = criterion(xhat, xb)   # target is the original clean xb
    loss.backward()
    optimizer.step()
\end{lstlisting}
\end{frame}

\begin{frame}{Sparse Autoencoders}
\begin{itemize}
\item Encourage the latent activations to be sparse (many near zero).
\item Add a penalty, e.g.\ $L_1$ on the activations, or a KL divergence towards a small target sparsity $\rho$ (implementations follow on the next two slides):
\[
\mathcal{L} = \frac{1}{N}\sum_i \|x^{(i)}-\hat{x}^{(i)}\|^2 + \beta \sum_j \mathrm{KL}(\rho \,\|\, \hat{\rho}_j),
\]
where $\hat{\rho}_j$ is the average activation of unit $j$ across the data.
\item Sparsity encourages disentanglement and interpretable features.
\end{itemize}
\end{frame}

\begin{frame}[fragile]{Implementation: Sparsity via $L_1$ Penalty (PyTorch)}
\begin{lstlisting}
# Inside the training loop: compute the latent code explicitly
l1_lambda = 1e-5
optimizer.zero_grad()
z = model.encoder(xb)
reconstruction = model.decoder(z)
mse = criterion(reconstruction, xb)
l1_penalty = l1_lambda * torch.mean(torch.abs(z))
loss = mse + l1_penalty
loss.backward()
optimizer.step()
\end{lstlisting}
\end{frame}
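
\begin{frame}[fragile]{Implementation: KL Sparsity Penalty (Sketch)}
A minimal sketch of the KL penalty from the formula two slides back, assuming the encoder ends in a sigmoid so that activations lie in $(0,1)$; \texttt{rho} and \texttt{beta} are illustrative values:
\begin{lstlisting}
import torch

def kl_sparsity(z, rho=0.05, eps=1e-8):
    # z: (batch, m) sigmoid activations; rho_hat: mean activation per latent unit
    rho_hat = torch.mean(z, dim=0).clamp(eps, 1 - eps)
    kl = rho * torch.log(rho / rho_hat) \
         + (1 - rho) * torch.log((1 - rho) / (1 - rho_hat))
    return torch.sum(kl)

# inside the training loop:
#   z = model.encoder(xb)
#   loss = criterion(model.decoder(z), xb) + beta * kl_sparsity(z)
\end{lstlisting}
\end{frame}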

\begin{frame}{Contractive Autoencoders (Brief)}
\begin{itemize}
\item Penalize the sensitivity of the encoder to its input by adding the squared Frobenius norm of the Jacobian $J_{f}(x)$:
\[
\mathcal{L} = \text{reconstruction loss} + \lambda \|J_f(x)\|_F^2 .
\]
\item Encourages robustness and local invariance around the training points.
\item More expensive: requires the Jacobian or a tractable approximation (a sketch follows on the next slide).
\end{itemize}
\end{frame}
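
\begin{frame}[fragile]{Contractive Penalty for a Sigmoid Encoder (Sketch)}
A minimal sketch, assuming a single-layer sigmoid encoder $z=\sigma(W_e x + b_e)$, for which $J_f(x)=\operatorname{diag}\bigl(z\odot(1-z)\bigr)W_e$ and the penalty has a closed form; names are illustrative:
\begin{lstlisting}
import torch

def contractive_penalty(z, W_e):
    # z: (batch, m) sigmoid activations, W_e: (m, d) encoder weight matrix
    dz = z * (1 - z)                       # elementwise sigmoid derivative
    w_row_sq = torch.sum(W_e**2, dim=1)    # squared norm of each row of W_e, shape (m,)
    # ||J_f(x)||_F^2 = sum_j (z_j (1 - z_j))^2 * ||row_j of W_e||^2, averaged over the batch
    return torch.mean(torch.sum(dz**2 * w_row_sq, dim=1))

# loss = reconstruction_loss + lam * contractive_penalty(z, encoder_linear.weight)
\end{lstlisting}
For deeper encoders one typically falls back to automatic differentiation of the Jacobian.
\end{frame}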

\begin{frame}{Applications: Denoising, Dimensionality Reduction, Anomaly Detection}
\begin{itemize}
\item \textbf{Denoising}: reconstruct clean signals from noisy inputs (images, sensor data).
\item \textbf{Dimensionality reduction}: use the latent code $z$ for visualization and clustering (a sketch follows on the next slide).
\item \textbf{Anomaly detection}: a large reconstruction error indicates outliers or anomalies.
\end{itemize}
\end{frame}
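
\begin{frame}[fragile]{Latent Codes for Visualization (Sketch)}
A minimal sketch, reusing the trained PyTorch model and \texttt{train\_loader} from earlier; the 2-D PCA projection of the latent codes is only for plotting:
\begin{lstlisting}
import torch
import numpy as np
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Collect latent codes and labels for a few batches
zs, ys = [], []
with torch.no_grad():
    for i, (xb, yb) in enumerate(train_loader):
        zs.append(model.encoder(xb.to(device)).cpu().numpy())
        ys.append(yb.numpy())
        if i >= 10:
            break
Z, Y = np.concatenate(zs), np.concatenate(ys)

Z2 = PCA(n_components=2).fit_transform(Z)      # 2-D view of the latent space
plt.scatter(Z2[:, 0], Z2[:, 1], c=Y, s=5, cmap='tab10')
plt.colorbar(); plt.show()
\end{lstlisting}
\end{frame}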

\begin{frame}{Learning Goals (Lecture 4)}
\begin{itemize}
\item Put together applications: an anomaly-detection pipeline and denoising in practice.
\item Discuss evaluation metrics and practical tips (regularization, early stopping, hyperparameters).
\item Provide complete runnable examples in PyTorch and TensorFlow that illustrate anomaly detection and denoising.
\end{itemize}
\end{frame}

\begin{frame}{Autoencoders for Anomaly Detection}
\begin{itemize}
\item Train the AE on ``normal'' data only.
\item For a test point $x^\ast$, compute the reconstruction error $r(x^\ast)=\|x^\ast-\hat{x}^\ast\|^2$.
\item If $r(x^\ast)$ exceeds a threshold $\tau$, flag the point as an anomaly. The threshold is chosen via a validation set or a percentile of the training errors.
\end{itemize}
\end{frame}

\begin{frame}[fragile]{PyTorch: Anomaly Detection Example (Synthetic)}
\begin{lstlisting}
# Train the AE on normal sine-wave data (X_train, X_test as in the earlier lecture).
# After training, score samples by their reconstruction error.
with torch.no_grad():
    recon = model(X_test)
    errors = torch.mean((recon - X_test)**2, dim=(1, 2))   # per-sample MSE

# Choose a threshold, e.g. the 95th percentile of the training errors
with torch.no_grad():
    recon_train = model(X_train)
    train_errors = torch.mean((recon_train - X_train)**2, dim=(1, 2))
threshold = torch.quantile(train_errors, 0.95)

anomalies = errors > threshold
print('Detected', anomalies.sum().item(), 'anomalies out of', len(errors))
\end{lstlisting}
\end{frame}

\begin{frame}[fragile]{TensorFlow: Denoising AE Example (MNIST) --- Code}
\begin{lstlisting}
# Using the autoencoder and the normalized x_train, x_test defined earlier
import numpy as np

noise_factor = 0.5
x_train_noisy = x_train + noise_factor * np.random.normal(loc=0.0, scale=1.0, size=x_train.shape)
x_train_noisy = np.clip(x_train_noisy, 0., 1.)
autoencoder.fit(x_train_noisy, x_train, epochs=10, batch_size=128, validation_split=0.1)

# Then denoise noisy test images
x_test_noisy = x_test + noise_factor * np.random.normal(size=x_test.shape)
x_test_noisy = np.clip(x_test_noisy, 0., 1.)
reconstructions = autoencoder.predict(x_test_noisy)
\end{lstlisting}
\end{frame}

\begin{frame}{Evaluation Metrics and Model Selection}
\begin{itemize}
\item Reconstruction MSE / MAE: direct measures of reconstruction quality.
\item For anomaly detection: precision/recall and ROC AUC, using the reconstruction error as the anomaly score (see the sketch on the next slide).
\item Visualization: t-SNE or PCA of the latent vectors $z$ to inspect cluster structure.
\item Threshold selection via cross-validation or a held-out validation set containing normal data only.
\end{itemize}
\end{frame}
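
\begin{frame}[fragile]{ROC AUC from Reconstruction Errors (Sketch)}
A minimal sketch, assuming per-sample reconstruction errors \texttt{errors} (as in the anomaly-detection example) and binary ground-truth labels \texttt{y\_true} with 1 = anomaly; both names are illustrative:
\begin{lstlisting}
import numpy as np
from sklearn.metrics import roc_auc_score, precision_recall_curve

scores = errors.numpy() if hasattr(errors, 'numpy') else np.asarray(errors)

auc = roc_auc_score(y_true, scores)          # higher error = more anomalous
prec, rec, thresholds = precision_recall_curve(y_true, scores)
print('ROC AUC:', auc)
\end{lstlisting}
Precision and recall at a chosen threshold then quantify the trade-off between missed anomalies and false alarms.
\end{frame}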

\begin{frame}{Practical Tips}
\begin{itemize}
\item Normalize inputs consistently (e.g., to $[0,1]$ or to zero mean and unit variance).
\item Small learning rates and early stopping help prevent overfitting to noise (a sketch follows on the next slide).
\item Monitor the validation reconstruction error and the latent-space structure.
\item Consider deeper architectures or convolutional autoencoders for images.
\end{itemize}
\end{frame}
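
\begin{frame}[fragile]{Early Stopping on Validation Loss (Sketch)}
A minimal sketch, assuming illustrative helpers \texttt{train\_one\_epoch} and \texttt{evaluate} and a \texttt{val\_loader}; none of these are part of the earlier code:
\begin{lstlisting}
import torch

best_val, patience, wait = float('inf'), 5, 0
for epoch in range(1, 101):
    # train_one_epoch / evaluate are illustrative helpers, not defined earlier
    train_one_epoch(model, train_loader, optimizer, criterion)
    val_loss = evaluate(model, val_loader, criterion)
    if val_loss < best_val:
        best_val, wait = val_loss, 0
        torch.save(model.state_dict(), 'best_ae.pt')   # keep the best weights
    else:
        wait += 1
        if wait >= patience:
            print('Stopping early at epoch', epoch)
            break
\end{lstlisting}
In Keras, \texttt{tf.keras.callbacks.EarlyStopping} provides the same behaviour.
\end{frame}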

\end{document}