\documentclass[11pt]{beamer}
\usetheme{Madrid}
\usecolortheme{seagull}
\usefonttheme{professionalfonts}

\usepackage[utf8]{inputenc}
\usepackage{amsmath,amssymb,mathtools}
\usepackage{graphicx}
\usepackage{tikz}
\usetikzlibrary{positioning}
\usepackage{hyperref}
\usepackage{listings}
\usepackage{xcolor}
\usepackage{booktabs}
\usepackage{caption}

% Listings (Python) style
\lstset{language=Python,
basicstyle=\ttfamily\scriptsize,
keywordstyle=\color{blue},
commentstyle=\color{gray},
breaklines=true,
frame=single,
columns=fullflexible
}

% TikZ styles
\tikzset{encoder/.style={rectangle, draw=black!80, fill=blue!10, rounded corners, minimum width=2.2cm, minimum height=6mm},
decoder/.style={rectangle, draw=black!80, fill=green!10, rounded corners, minimum width=2.2cm, minimum height=6mm},
arrow/.style={->, >=stealth, thick}}

\title{Autoencoders: Theory, Variants and Applications}
\author{FYS-STK3155/4155}
\date{\today}
\begin{document}

\begin{frame}
\titlepage
\end{frame}

\begin{frame}{Outline}
\tableofcontents
\end{frame}

\begin{frame}{Learning Goals}
\begin{itemize}
\item Understand the basic autoencoder architecture (encoder, latent space, decoder).
\item Derive the linear autoencoder and its connection to PCA.
\item See simple implementations and start hands-on examples.
\end{itemize}
\end{frame}

\begin{frame}{What is an Autoencoder?}
\begin{itemize}
\item An autoencoder (AE) is a neural network trained to reconstruct its input: $\hat{x}=\mathrm{Dec}(\mathrm{Enc}(x))$.
\item Components:
\begin{itemize}
\item \textbf{Encoder} $f_\theta:\mathbb{R}^d\to\mathbb{R}^m$ compresses the input to a latent code $z=f_\theta(x)$.
\item \textbf{Decoder} $g_\phi:\mathbb{R}^m\to\mathbb{R}^d$ reconstructs $\hat{x}=g_\phi(z)$.
\end{itemize}
\item Training objective: minimize a reconstruction loss, e.g.\ the MSE
\[
\mathcal{L}(\theta,\phi)=\frac{1}{N}\sum_{i=1}^N \bigl\|x^{(i)}-g_\phi\bigl(f_\theta(x^{(i)})\bigr)\bigr\|^2_2.
\]
\end{itemize}
\end{frame}

\begin{frame}{Simple Diagram: Autoencoder}
\centering
\begin{tikzpicture}[node distance=12mm, auto, scale=0.95]
\node (x) [draw, rectangle, rounded corners, minimum width=1.6cm, minimum height=6mm] {$x\in\mathbb{R}^d$};
\node (enc1) [encoder, right=12mm of x] {Encoder layers};
\node (z) [draw, rectangle, rounded corners, minimum width=1.6cm, minimum height=6mm, right=12mm of enc1] {$z\in\mathbb{R}^m$};
\node (dec1) [decoder, right=12mm of z] {Decoder layers};
\node (xhat) [draw, rectangle, rounded corners, minimum width=1.6cm, minimum height=6mm, right=12mm of dec1] {$\hat{x}\in\mathbb{R}^d$};
\draw[arrow] (x) -- (enc1);
\draw[arrow] (enc1) -- (z);
\draw[arrow] (z) -- (dec1);
\draw[arrow] (dec1) -- (xhat);
\end{tikzpicture}
\vspace{2mm}
\begin{itemize}
\item Encoder and decoder may be \emph{linear} or \emph{nonlinear}.
\item The bottleneck dimension $m$ controls compression; if $m<d$ the network is forced to learn a compressed representation.
\end{itemize}
\end{frame}

\begin{frame}{Linear Autoencoder}
\begin{itemize}
\item Consider a linear encoder and decoder, without biases for simplicity:
\[
z=W_e x,\qquad \hat{x}=W_d z = W_d W_e x.
\]
\item Minimize the reconstruction error over the dataset $X\in\mathbb{R}^{d\times N}$ (columns are data points):
\[
\min_{W_e,W_d} \|X - W_d W_e X\|_F^2.
\]
\item The product $W = W_d W_e$ has rank at most $m$, so the AE seeks the best rank-$m$ approximation of the identity mapping on the data.
\end{itemize}
\end{frame}

\begin{frame}{Linear AE $\leftrightarrow$ PCA}
\begin{itemize}
\item If we constrain $W_e$ to have orthonormal rows and set $W_d=W_e^T$, minimizing the Frobenius-norm loss yields the PCA projection onto the top-$m$ principal components.
\item Proof sketch (informal): take the SVD of the data, $X = U \Sigma V^T$. The best rank-$m$ approximation in Frobenius norm is the truncation $U_m\Sigma_m V_m^T$ (Eckart--Young theorem). At its optimum, the linear AE spans the same subspace.
\item Therefore a linear AE with MSE loss and these constraints recovers PCA.
\end{itemize}
\end{frame}

\begin{frame}{Derivation (Sketch)}
\begin{align*}
&\min_{W:\ \operatorname{rank}(W)\le m} \|X - W X\|_F^2 \\[2mm]
&\text{SVD: } X = U \Sigma V^T, \quad X_m = U_m \Sigma_m V_m^T \\
&\Rightarrow \operatorname*{argmin}_{\operatorname{rank}(W)\le m} \|X - W X\|_F^2 = P_{U_m} = U_m U_m^T,
\end{align*}
which is the projection onto the leading $m$ principal components. A linear AE with encoder $W_e=U_m^T$ and decoder $W_d=U_m$ achieves this. A numerical check follows on the next slide.
\end{frame}
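
\begin{frame}[fragile]{Sanity Check: Linear AE Optimum via SVD (Sketch)}
A minimal NumPy sketch (not part of the derivation above; names and sizes are illustrative): build the rank-$m$ projection $U_m U_m^T$ from the SVD and compare its reconstruction error with that of a random rank-$m$ map.
\begin{lstlisting}
import numpy as np

rng = np.random.default_rng(0)
d, N, m = 10, 500, 3
X = rng.normal(size=(d, N))           # columns are data points

# Optimal linear AE: project onto the top-m left singular vectors
U, S, Vt = np.linalg.svd(X, full_matrices=False)
W_e = U[:, :m].T                      # encoder
W_d = U[:, :m]                        # decoder
err_svd = np.linalg.norm(X - W_d @ (W_e @ X))**2

# A random rank-m linear map for comparison
A = rng.normal(size=(d, m))
B = rng.normal(size=(m, d))
err_rand = np.linalg.norm(X - A @ (B @ X))**2

print("SVD projection error:", err_svd, " random rank-m error:", err_rand)
\end{lstlisting}
\end{frame}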

\begin{frame}{Practical remarks}
\begin{itemize}
\item The linear AE is good for building intuition; nonlinear AEs (with activation functions) can learn more complex manifolds.
\item The bottleneck dimension $m$ should balance reconstruction fidelity against compression.
\item Regularization (weight decay, sparsity) helps the network learn meaningful features; see the sketch on the next slide.
\end{itemize}
\end{frame}
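
\begin{frame}[fragile]{Weight Decay in the Training Loop (Sketch)}
A minimal, self-contained sketch with an illustrative toy model (not the lecture code): $L_2$ weight decay can be added through the optimizer, or as an explicit penalty that makes the regularized objective visible.
\begin{lstlisting}
import torch
from torch import nn, optim

model = nn.Sequential(nn.Linear(784, 64), nn.ReLU(), nn.Linear(64, 784))

# Option 1: built-in L2 penalty on all parameters
optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)

# Option 2: explicit penalty added to the reconstruction loss
def l2_penalty(model, lam=1e-5):
    return lam * sum(p.pow(2).sum() for p in model.parameters())
\end{lstlisting}
Both options shrink the weights; the explicit version shows exactly what is being minimized.
\end{frame}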

\begin{frame}{Nonlinear Autoencoder (Single Hidden Layer)}
\begin{align*}
z &= \sigma(W_e x + b_e),\\
\hat{x} &= \sigma'(W_d z + b_d),
\end{align*}
where $\sigma,\sigma'$ are activation functions (ReLU, tanh, sigmoid). The loss is typically MSE, or binary cross-entropy for inputs scaled to $[0,1]$. A small numerical sketch follows on the next slide.
\end{frame}
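
\begin{frame}[fragile]{Numerical Sketch: One Forward Pass (NumPy)}
A minimal sketch of the two equations above, with illustrative shapes and untrained random weights:
\begin{lstlisting}
import numpy as np

def relu(a):
    return np.maximum(a, 0.0)

def sigmoid(a):
    return 1.0 / (1.0 + np.exp(-a))

rng = np.random.default_rng(0)
d, m = 8, 3                       # input and latent dimensions
x = rng.random(d)                 # one input in [0, 1]

W_e, b_e = rng.normal(size=(m, d)), np.zeros(m)
W_d, b_d = rng.normal(size=(d, m)), np.zeros(d)

z    = relu(W_e @ x + b_e)        # encoder
xhat = sigmoid(W_d @ z + b_d)     # decoder

mse = np.mean((x - xhat)**2)      # reconstruction loss for this sample
print(z.shape, xhat.shape, mse)
\end{lstlisting}
Training adjusts $W_e, b_e, W_d, b_d$ to reduce this loss over the whole dataset.
\end{frame}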

\begin{frame}{Backpropagation Through the Autoencoder}
\begin{itemize}
\item Loss $\mathcal{L}(\theta,\phi) = \frac{1}{N}\sum_i \ell(x^{(i)},\hat{x}^{(i)})$ with $\hat{x}=g_\phi(f_\theta(x))$.
\item Gradients follow from the chain rule through decoder and encoder: with $z=f_\theta(x)$ and $\hat{x}=g_\phi(z)$,
\[
\frac{\partial \ell}{\partial \phi} = \frac{\partial \ell}{\partial \hat{x}}\,\frac{\partial \hat{x}}{\partial \phi},
\qquad
\frac{\partial \ell}{\partial \theta} = \frac{\partial \ell}{\partial \hat{x}}\,\frac{\partial \hat{x}}{\partial z}\,\frac{\partial z}{\partial \theta}.
\]
\item Standard autodiff in PyTorch / TensorFlow handles these computations.
\end{itemize}
\end{frame}

\begin{frame}[fragile]{PyTorch: Full Autoencoder (MNIST) --- Code}
\begin{lstlisting}
# PyTorch AE for MNIST (self-contained snippet)
import torch
from torch import nn, optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

# Data: ToTensor() scales pixels to [0, 1], matching the sigmoid output below
transform = transforms.ToTensor()
train_ds = datasets.MNIST('.', train=True, download=True, transform=transform)
train_loader = DataLoader(train_ds, batch_size=128, shuffle=True)

# Model
class Autoencoder(nn.Module):
    def __init__(self, input_dim=28*28, hidden_dim=64, latent_dim=16):
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Flatten(),
            nn.Linear(input_dim, hidden_dim), nn.ReLU(),
            nn.Linear(hidden_dim, latent_dim), nn.ReLU())
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, hidden_dim), nn.ReLU(),
            nn.Linear(hidden_dim, input_dim), nn.Sigmoid(),
            nn.Unflatten(1, (1, 28, 28)))
    def forward(self, x):
        z = self.encoder(x)
        return self.decoder(z)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Autoencoder().to(device)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Training loop (10 epochs)
model.train()
for epoch in range(1, 11):
    epoch_loss = 0.0
    for xb, _ in train_loader:
        xb = xb.to(device)
        optimizer.zero_grad()
        xhat = model(xb)
        loss = criterion(xhat, xb)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item() * xb.size(0)
    epoch_loss /= len(train_loader.dataset)
    print(f"Epoch {epoch}, Loss: {epoch_loss:.6f}")
\end{lstlisting}
\end{frame}

\begin{frame}[fragile]{TensorFlow: Full Autoencoder (MNIST) --- Code}
\begin{lstlisting}
# TensorFlow/Keras AE for MNIST
import tensorflow as tf
from tensorflow.keras import layers, models

# Data: scale to [0, 1] and add a channel axis
(x_train, _), (x_test, _) = tf.keras.datasets.mnist.load_data()
x_train = x_train.astype('float32') / 255.0
x_test  = x_test.astype('float32') / 255.0
x_train = x_train[..., None]
x_test  = x_test[..., None]

# Model
input_shape = (28, 28, 1)
encoder_inputs = layers.Input(shape=input_shape)
x = layers.Flatten()(encoder_inputs)
x = layers.Dense(64, activation='relu')(x)
latent = layers.Dense(16, activation='relu')(x)

x = layers.Dense(64, activation='relu')(latent)
x = layers.Dense(28*28, activation='sigmoid')(x)
outputs = layers.Reshape((28, 28, 1))(x)

autoencoder = models.Model(encoder_inputs, outputs)
autoencoder.compile(optimizer='adam', loss='mse')

# Training
autoencoder.fit(x_train, x_train, epochs=10, batch_size=128)
\end{lstlisting}
\end{frame}

\begin{frame}{Discussion: Architectures and Choices}
\begin{itemize}
\item Activation functions: ReLU is a common choice for hidden layers; use a sigmoid output when inputs are scaled to $[0,1]$.
\item Loss: MSE for continuous-valued inputs; binary cross-entropy if inputs are binary or scaled to $[0,1]$ (see the sketch on the next slide).
\item The bottleneck (latent) size trades off compression against reconstruction quality.
\item Use dropout, batch normalization, or weight decay as needed.
\end{itemize}
\end{frame}
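
\begin{frame}[fragile]{Switching to Binary Cross-Entropy (Sketch)}
A minimal sketch, reusing the PyTorch model defined earlier (sigmoid outputs in $[0,1]$ are required for BCE); only the criterion changes in the training loop:
\begin{lstlisting}
import torch
from torch import nn

criterion = nn.BCELoss()   # expects predictions and targets in [0, 1]

# inside the training loop, exactly as before:
#   xhat = model(xb)
#   loss = criterion(xhat, xb)
\end{lstlisting}
If the decoder outputs logits (no final sigmoid), \texttt{nn.BCEWithLogitsLoss} is the numerically safer choice.
\end{frame}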

\begin{frame}{Learning Goals (Lecture 3)}
\begin{itemize}
\item Understand denoising and sparse autoencoders.
\item Learn regularization techniques to improve latent representations.
\item See code examples for denoising and sparsity penalties.
\end{itemize}
\end{frame}

\begin{frame}{Denoising Autoencoder (DAE)}
\begin{itemize}
\item Idea: corrupt the input $x$ into $\tilde{x}$ and train the AE to reconstruct the clean $x$.
\item Objective:
\[
\mathbb{E}_{x\sim p_{\mathrm{data}}}\,\mathbb{E}_{\tilde{x}\sim q(\tilde{x}\mid x)}\bigl[\ell\bigl(x,\,g_\phi(f_\theta(\tilde{x}))\bigr)\bigr].
\]
\item Denoising forces the encoder to learn robust features and the manifold structure of the data.
\end{itemize}
\end{frame}

\begin{frame}[fragile]{PyTorch: Denoising Autoencoder (Snippet)}
\begin{lstlisting}
# Add Gaussian noise to the inputs and train the AE to reconstruct clean images
noise_std = 0.3
for xb, _ in train_loader:
    xb = xb.to(device)
    xb_noisy = xb + noise_std * torch.randn_like(xb)
    xb_noisy = torch.clamp(xb_noisy, 0., 1.)
    optimizer.zero_grad()
    xhat = model(xb_noisy)
    loss = criterion(xhat, xb)   # target is the original clean xb
    loss.backward()
    optimizer.step()
\end{lstlisting}
\end{frame}

\begin{frame}{Sparse Autoencoders}
\begin{itemize}
\item Encourage the latent activations to be sparse (many near zero).
\item Add a penalty, e.g.\ $L_1$ on the activations, or a KL divergence towards a small target sparsity $\rho$ (implementations follow on the next two slides):
\[
\mathcal{L} = \frac{1}{N}\sum_i \|x^{(i)}-\hat{x}^{(i)}\|^2 + \beta \sum_j \mathrm{KL}(\rho \,\|\, \hat{\rho}_j),
\]
where $\hat{\rho}_j$ is the average activation of unit $j$ across the data.
\item Sparsity encourages disentanglement and interpretable features.
\end{itemize}
\end{frame}

\begin{frame}[fragile]{Implementation: Sparsity via $L_1$ Penalty (PyTorch)}
\begin{lstlisting}
# Inside the training loop: compute the latent code explicitly
l1_lambda = 1e-5
optimizer.zero_grad()
z = model.encoder(xb)
reconstruction = model.decoder(z)
mse = criterion(reconstruction, xb)
l1_penalty = l1_lambda * torch.mean(torch.abs(z))
loss = mse + l1_penalty
loss.backward()
optimizer.step()
\end{lstlisting}
\end{frame}
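
\begin{frame}[fragile]{Implementation: KL Sparsity Penalty (Sketch)}
A minimal sketch of the KL penalty from the formula two slides back, assuming the encoder ends in a sigmoid so that activations lie in $(0,1)$; \texttt{rho} and \texttt{beta} are illustrative values:
\begin{lstlisting}
import torch

def kl_sparsity(z, rho=0.05, eps=1e-8):
    # z: (batch, m) sigmoid activations; rho_hat: mean activation per latent unit
    rho_hat = torch.mean(z, dim=0).clamp(eps, 1 - eps)
    kl = rho * torch.log(rho / rho_hat) \
         + (1 - rho) * torch.log((1 - rho) / (1 - rho_hat))
    return torch.sum(kl)

# inside the training loop:
#   z = model.encoder(xb)
#   loss = criterion(model.decoder(z), xb) + beta * kl_sparsity(z)
\end{lstlisting}
\end{frame}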

\begin{frame}{Contractive Autoencoders (Brief)}
\begin{itemize}
\item Penalize the sensitivity of the encoder to its input by adding the squared Frobenius norm of the Jacobian $J_{f}(x)$:
\[
\mathcal{L} = \text{reconstruction loss} + \lambda \|J_f(x)\|_F^2 .
\]
\item Encourages robustness and local invariance around the training points.
\item More expensive: requires the Jacobian or a tractable approximation (a sketch follows on the next slide).
\end{itemize}
\end{frame}
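
\begin{frame}[fragile]{Contractive Penalty for a Sigmoid Encoder (Sketch)}
A minimal sketch, assuming a single-layer sigmoid encoder $z=\sigma(W_e x + b_e)$, for which $J_f(x)=\operatorname{diag}\bigl(z\odot(1-z)\bigr)W_e$ and the penalty has a closed form; names are illustrative:
\begin{lstlisting}
import torch

def contractive_penalty(z, W_e):
    # z: (batch, m) sigmoid activations, W_e: (m, d) encoder weight matrix
    dz = z * (1 - z)                       # elementwise sigmoid derivative
    w_row_sq = torch.sum(W_e**2, dim=1)    # squared norm of each row of W_e, shape (m,)
    # ||J_f(x)||_F^2 = sum_j (z_j (1 - z_j))^2 * ||row_j of W_e||^2, averaged over the batch
    return torch.mean(torch.sum(dz**2 * w_row_sq, dim=1))

# loss = reconstruction_loss + lam * contractive_penalty(z, encoder_linear.weight)
\end{lstlisting}
For deeper encoders one typically falls back to automatic differentiation of the Jacobian.
\end{frame}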

\begin{frame}{Applications: Denoising, Dimensionality Reduction, Anomaly Detection}
\begin{itemize}
\item \textbf{Denoising}: reconstruct clean signals from noisy inputs (images, sensor data).
\item \textbf{Dimensionality reduction}: use the latent code $z$ for visualization and clustering (a sketch follows on the next slide).
\item \textbf{Anomaly detection}: a large reconstruction error indicates outliers or anomalies.
\end{itemize}
\end{frame}
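
\begin{frame}[fragile]{Latent Codes for Visualization (Sketch)}
A minimal sketch, reusing the trained PyTorch model and \texttt{train\_loader} from earlier; the 2-D PCA projection of the latent codes is only for plotting:
\begin{lstlisting}
import torch
import numpy as np
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Collect latent codes and labels for a few batches
zs, ys = [], []
with torch.no_grad():
    for i, (xb, yb) in enumerate(train_loader):
        zs.append(model.encoder(xb.to(device)).cpu().numpy())
        ys.append(yb.numpy())
        if i >= 10:
            break
Z, Y = np.concatenate(zs), np.concatenate(ys)

Z2 = PCA(n_components=2).fit_transform(Z)      # 2-D view of the latent space
plt.scatter(Z2[:, 0], Z2[:, 1], c=Y, s=5, cmap='tab10')
plt.colorbar(); plt.show()
\end{lstlisting}
\end{frame}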

\begin{frame}{Learning Goals (Lecture 4)}
\begin{itemize}
\item Put together applications: an anomaly-detection pipeline and denoising in practice.
\item Discuss evaluation metrics and practical tips (regularization, early stopping, hyperparameters).
\item Provide complete runnable examples in PyTorch and TensorFlow that illustrate anomaly detection and denoising.
\end{itemize}
\end{frame}

\begin{frame}{Autoencoders for Anomaly Detection}
\begin{itemize}
\item Train the AE on ``normal'' data only.
\item For a test point $x^\ast$, compute the reconstruction error $r(x^\ast)=\|x^\ast-\hat{x}^\ast\|^2$.
\item If $r(x^\ast)$ exceeds a threshold $\tau$, flag the point as an anomaly. The threshold is chosen via a validation set or a percentile of the training errors.
\end{itemize}
\end{frame}

\begin{frame}[fragile]{PyTorch: Anomaly Detection Example (Synthetic)}
\begin{lstlisting}
# Train the AE on normal sine-wave data (X_train, X_test as in the earlier lecture).
# After training, score samples by their reconstruction error.
with torch.no_grad():
    recon = model(X_test)
    errors = torch.mean((recon - X_test)**2, dim=(1, 2))   # per-sample MSE

# Choose a threshold, e.g. the 95th percentile of the training errors
with torch.no_grad():
    recon_train = model(X_train)
    train_errors = torch.mean((recon_train - X_train)**2, dim=(1, 2))
threshold = torch.quantile(train_errors, 0.95)

anomalies = errors > threshold
print('Detected', anomalies.sum().item(), 'anomalies out of', len(errors))
\end{lstlisting}
\end{frame}

\begin{frame}[fragile]{TensorFlow: Denoising AE Example (MNIST) --- Code}
\begin{lstlisting}
# Using the autoencoder and the normalized x_train, x_test defined earlier
import numpy as np

noise_factor = 0.5
x_train_noisy = x_train + noise_factor * np.random.normal(loc=0.0, scale=1.0, size=x_train.shape)
x_train_noisy = np.clip(x_train_noisy, 0., 1.)
autoencoder.fit(x_train_noisy, x_train, epochs=10, batch_size=128, validation_split=0.1)

# Then denoise noisy test images
x_test_noisy = x_test + noise_factor * np.random.normal(size=x_test.shape)
x_test_noisy = np.clip(x_test_noisy, 0., 1.)
reconstructions = autoencoder.predict(x_test_noisy)
\end{lstlisting}
\end{frame}

\begin{frame}{Evaluation Metrics and Model Selection}
\begin{itemize}
\item Reconstruction MSE / MAE: direct measures of reconstruction quality.
\item For anomaly detection: precision/recall and ROC AUC, using the reconstruction error as the anomaly score (see the sketch on the next slide).
\item Visualization: t-SNE or PCA of the latent vectors $z$ to inspect cluster structure.
\item Threshold selection via cross-validation or a held-out validation set containing normal data only.
\end{itemize}
\end{frame}
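
\begin{frame}[fragile]{ROC AUC from Reconstruction Errors (Sketch)}
A minimal sketch, assuming per-sample reconstruction errors \texttt{errors} (as in the anomaly-detection example) and binary ground-truth labels \texttt{y\_true} with 1 = anomaly; both names are illustrative:
\begin{lstlisting}
import numpy as np
from sklearn.metrics import roc_auc_score, precision_recall_curve

scores = errors.numpy() if hasattr(errors, 'numpy') else np.asarray(errors)

auc = roc_auc_score(y_true, scores)          # higher error = more anomalous
prec, rec, thresholds = precision_recall_curve(y_true, scores)
print('ROC AUC:', auc)
\end{lstlisting}
Precision and recall at a chosen threshold then quantify the trade-off between missed anomalies and false alarms.
\end{frame}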

\begin{frame}{Practical Tips}
\begin{itemize}
\item Normalize inputs consistently (e.g., to $[0,1]$ or to zero mean and unit variance).
\item Small learning rates and early stopping help prevent overfitting to noise (a sketch follows on the next slide).
\item Monitor the validation reconstruction error and the latent-space structure.
\item Consider deeper architectures or convolutional autoencoders for images.
\end{itemize}
\end{frame}
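
\begin{frame}[fragile]{Early Stopping on Validation Loss (Sketch)}
A minimal sketch, assuming illustrative helpers \texttt{train\_one\_epoch} and \texttt{evaluate} and a \texttt{val\_loader}; none of these are part of the earlier code:
\begin{lstlisting}
import torch

best_val, patience, wait = float('inf'), 5, 0
for epoch in range(1, 101):
    # train_one_epoch / evaluate are illustrative helpers, not defined earlier
    train_one_epoch(model, train_loader, optimizer, criterion)
    val_loss = evaluate(model, val_loader, criterion)
    if val_loss < best_val:
        best_val, wait = val_loss, 0
        torch.save(model.state_dict(), 'best_ae.pt')   # keep the best weights
    else:
        wait += 1
        if wait >= patience:
            print('Stopping early at epoch', epoch)
            break
\end{lstlisting}
In Keras, \texttt{tf.keras.callbacks.EarlyStopping} provides the same behaviour.
\end{frame}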

\end{document}