Commit 76f3221 ("update"), 1 parent 1155ebf
2 files changed: +819 -0 lines changed
doc/src/week47/Latexslides/ae.tex

Lines changed: 399 additions & 0 deletions
\documentclass[11pt]{beamer}
\usetheme{Madrid}
\usecolortheme{seagull}
\usefonttheme{professionalfonts}

\usepackage[utf8]{inputenc}
\usepackage{amsmath,amssymb,mathtools}
\usepackage{graphicx}
\usepackage{tikz}
\usetikzlibrary{positioning} % needed for the "right=... of ..." node syntax below
\usepackage{hyperref}
\usepackage{listings}
\usepackage{xcolor}
\usepackage{booktabs}
\usepackage{caption}

% Listings (Python) style
\lstset{language=Python,
  basicstyle=\ttfamily\scriptsize,
  keywordstyle=\color{blue},
  commentstyle=\color{gray},
  breaklines=true,
  frame=single,
  columns=fullflexible
}

% TikZ styles
\tikzset{encoder/.style={rectangle, draw=black!80, fill=blue!10, rounded corners, minimum width=2.2cm, minimum height=6mm},
  decoder/.style={rectangle, draw=black!80, fill=green!10, rounded corners, minimum width=2.2cm, minimum height=6mm},
  arrow/.style={->, >=stealth, thick}}

\title{Autoencoders: Theory, Variants and Applications}
\author{FYS-STK3155/4155}
\date{\today}

\begin{document}

\begin{frame}
\titlepage
\end{frame}

\begin{frame}{Outline}
\tableofcontents
\end{frame}

\begin{frame}{Learning Goals}
\begin{itemize}
\item Understand the basic autoencoder architecture (encoder, latent space, decoder).
\item Derive the linear autoencoder and its connection to PCA.
\item See simple implementations and start hands-on examples.
\end{itemize}
\end{frame}

\begin{frame}{What is an Autoencoder?}
\begin{itemize}
\item An autoencoder (AE) is a neural network trained to reconstruct its input: $\hat{x}=\mathrm{Dec}(\mathrm{Enc}(x))$.
\item Components:
\begin{itemize}
\item \textbf{Encoder} $f_\theta:\mathbb{R}^d\to\mathbb{R}^m$ compresses the input to a latent code $z=f_\theta(x)$
\item \textbf{Decoder} $g_\phi:\mathbb{R}^m\to\mathbb{R}^d$ reconstructs $\hat{x}=g_\phi(z)$
\end{itemize}
\item Training objective: minimize a reconstruction loss, e.g.\ the MSE
\[
\mathcal{L}(\theta,\phi)=\frac{1}{N}\sum_{i=1}^N \|x^{(i)}-g_\phi(f_\theta(x^{(i)}))\|^2_2.
\]
\end{itemize}
\end{frame}

\begin{frame}{Simple Diagram: Autoencoder}
\centering
\begin{tikzpicture}[node distance=12mm, auto, scale=0.95]
\node (x) [draw, rectangle, rounded corners, minimum width=1.6cm, minimum height=6mm] {$x\in\mathbb{R}^d$};
\node (enc1) [encoder, right=12mm of x] {Encoder layers};
\node (z) [draw, rectangle, rounded corners, minimum width=1.6cm, minimum height=6mm, right=12mm of enc1] {$z\in\mathbb{R}^m$};
\node (dec1) [decoder, right=12mm of z] {Decoder layers};
\node (xhat) [draw, rectangle, rounded corners, minimum width=1.6cm, minimum height=6mm, right=12mm of dec1] {$\hat{x}\in\mathbb{R}^d$};
\draw[arrow] (x) -- (enc1);
\draw[arrow] (enc1) -- (z);
\draw[arrow] (z) -- (dec1);
\draw[arrow] (dec1) -- (xhat);
\end{tikzpicture}
\vspace{2mm}
\begin{itemize}
\item Encoder and decoder may be \emph{linear} or \emph{nonlinear}.
\item Bottleneck dimension $m$ controls compression; if $m<d$ we force representation learning.
\end{itemize}
\end{frame}

\begin{frame}{Linear Autoencoder}
\begin{itemize}
\item Consider a linear encoder and decoder with no biases for simplicity:
\[
z=W_e x,\qquad \hat{x}=W_d z = W_d W_e x.
\]
\item Minimize the reconstruction error over the dataset $X\in\mathbb{R}^{d\times N}$ (columns are datapoints):
\[
\min_{W_e,W_d} \|X - W_d W_e X\|_F^2.
\]
\item Let $W= W_d W_e$ be a rank-$m$ matrix approximation of the identity mapping on the data subspace.
\end{itemize}
\end{frame}

\begin{frame}{Linear AE $\leftrightarrow$ PCA}
\begin{itemize}
\item If we constrain $W_e$ to have orthonormal rows and set $W_d=W_e^T$, minimizing the Frobenius norm leads to the PCA projection onto the top-$m$ principal components.
\item Proof sketch (informal): take the SVD of the data, $X = U \Sigma V^T$. The best rank-$m$ approximation in Frobenius norm is the truncation $U_m\Sigma_m V_m^T$ (Eckart--Young theorem). The linear AE learns the same subspace at the optimum.
\item Therefore a linear AE with MSE loss and appropriate constraints recovers PCA.
\end{itemize}
\end{frame}

\begin{frame}{Derivation (Sketch)}
\begin{align*}
&\min_{W\,:\,\mathrm{rank}(W)\le m} \|X - W X\|_F^2 \\[2mm]
&\text{SVD: } X = U \Sigma V^T, \quad X_m = U_m \Sigma_m V_m^T \\
&\Rightarrow \operatorname*{arg\,min}_{W} \|X - W X\|_F^2 = P_{U_m} = U_m U_m^T,
\end{align*}
which is the projection onto the leading $m$ principal components. A linear AE with encoder $W_e=U_m^T$ and decoder $W_d=U_m$ achieves this. A small numerical check with NumPy follows on the next slide.
\end{frame}
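
\begin{frame}[fragile]{Numerical Check: Optimal Linear AE vs.\ PCA (Sketch)}
A minimal NumPy sketch (not part of the course code) comparing the rank-$m$ PCA projection $U_m U_m^T X$ from the SVD with the reconstruction of a linear AE whose weights are set to the optimum $W_e=U_m^T$, $W_d=U_m$; columns of $X$ are datapoints, as on the previous slides.
\begin{lstlisting}
import numpy as np

rng = np.random.default_rng(0)
d, N, m = 10, 500, 3

# Synthetic data that lies (approximately) on an m-dimensional subspace
X = rng.normal(size=(d, m)) @ rng.normal(size=(m, N)) + 0.01 * rng.normal(size=(d, N))

# PCA via SVD: projection onto the leading m left singular vectors
U, S, Vt = np.linalg.svd(X, full_matrices=False)
U_m = U[:, :m]
X_pca = U_m @ U_m.T @ X

# Linear AE at its optimum: encoder W_e = U_m^T, decoder W_d = U_m
W_e, W_d = U_m.T, U_m
X_ae = W_d @ (W_e @ X)

# The two reconstructions coincide up to floating-point error
print(np.max(np.abs(X_pca - X_ae)))
\end{lstlisting}
\end{frame}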

\begin{frame}{Practical remarks}
\begin{itemize}
\item Linear AEs are good for building intuition; nonlinear AEs (with activation functions) can learn more complex manifolds.
\item The bottleneck dimension $m$ should balance reconstruction fidelity against compression.
\item Regularization (weight decay, sparsity) helps to learn meaningful features.
\end{itemize}
\end{frame}


\begin{frame}{Nonlinear Autoencoder (Single Hidden Layer)}
\begin{align*}
z &= \sigma(W_e x + b_e),\\
\hat{x} &= \sigma'(W_d z + b_d),
\end{align*}
where $\sigma,\sigma'$ are activation functions (ReLU, tanh, sigmoid) and the loss is typically MSE, or binary cross-entropy for normalized inputs.
\end{frame}

\begin{frame}{Backpropagation Through Autoencoder}
\begin{itemize}
\item Loss $\mathcal{L}(\theta,\phi) = \frac{1}{N}\sum_i \ell(x^{(i)},\hat{x}^{(i)})$ with $\hat{x}=g_\phi(f_\theta(x))$.
\item Gradients are computed via the chain rule through the decoder and encoder:
\begin{align*}
\nabla_{\phi}\mathcal{L} &= \frac{1}{N}\sum_i \Big(\tfrac{\partial \hat{x}^{(i)}}{\partial \phi}\Big)^{T}\nabla_{\hat{x}}\,\ell, &
\nabla_{\theta}\mathcal{L} &= \frac{1}{N}\sum_i \Big(\tfrac{\partial z^{(i)}}{\partial \theta}\Big)^{T}\Big(\tfrac{\partial \hat{x}^{(i)}}{\partial z}\Big)^{T}\nabla_{\hat{x}}\,\ell.
\end{align*}
\item Standard autodiff in PyTorch / TensorFlow handles these computations.
\end{itemize}
\end{frame}

\begin{frame}[fragile]{PyTorch: Full Autoencoder (MNIST) — Code}
\begin{lstlisting}
# PyTorch AE for MNIST (fully self-contained snippet)
import torch
from torch import nn, optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

# Data: keep pixels in [0,1] so targets match the Sigmoid output of the decoder
transform = transforms.ToTensor()
train_ds = datasets.MNIST('.', train=True, download=True, transform=transform)
train_loader = DataLoader(train_ds, batch_size=128, shuffle=True)

# Model
class Autoencoder(nn.Module):
    def __init__(self, input_dim=28*28, hidden_dim=64, latent_dim=16):
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Flatten(),
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, latent_dim),
            nn.ReLU()
        )
        self.decoder = nn.Sequential(
            nn.Linear(latent_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, input_dim),
            nn.Sigmoid(),
            nn.Unflatten(1, (1, 28, 28))
        )
    def forward(self, x):
        z = self.encoder(x)
        xhat = self.decoder(z)
        return xhat

model = Autoencoder()
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Training loop
model.train()
for epoch in range(1, 11):
    epoch_loss = 0.0
    for xb, _ in train_loader:
        optimizer.zero_grad()
        xhat = model(xb)
        loss = criterion(xhat, xb)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item() * xb.size(0)
    epoch_loss /= len(train_loader.dataset)
    print(f"Epoch {epoch}, Loss: {epoch_loss:.6f}")
\end{lstlisting}
\end{frame}

\begin{frame}[fragile]{TensorFlow: Full Autoencoder (MNIST) — Code}
\begin{lstlisting}
# TensorFlow/Keras AE for MNIST
import tensorflow as tf
from tensorflow.keras import layers, models

# Data: scale to [0,1] and add a channel axis (x_test is reused on a later slide)
(x_train, _), (x_test, _) = tf.keras.datasets.mnist.load_data()
x_train = x_train.astype('float32') / 255.0
x_train = x_train[..., None]
x_test = x_test.astype('float32') / 255.0
x_test = x_test[..., None]

# Model
input_shape = (28, 28, 1)
encoder_inputs = layers.Input(shape=input_shape)
x = layers.Flatten()(encoder_inputs)
x = layers.Dense(64, activation='relu')(x)
latent = layers.Dense(16, activation='relu')(x)

x = layers.Dense(64, activation='relu')(latent)
x = layers.Dense(28*28, activation='sigmoid')(x)
outputs = layers.Reshape((28, 28, 1))(x)

autoencoder = models.Model(encoder_inputs, outputs)
autoencoder.compile(optimizer='adam', loss='mse')

# Training
autoencoder.fit(x_train, x_train, epochs=10, batch_size=128)
\end{lstlisting}
\end{frame}

\begin{frame}{Discussion: Architectures and Choices}
\begin{itemize}
\item Activation functions: ReLU often for hidden layers, sigmoid for the output when inputs are scaled to $[0,1]$.
\item Loss: MSE for continuous-valued inputs; binary cross-entropy if inputs are binary or normalized to $[0,1]$.
\item The bottleneck (latent) size trades off compression against reconstruction quality.
\item Use dropout, batch normalization, or weight decay as needed.
\end{itemize}
\end{frame}


\begin{frame}{Learning Goals (Lecture 3)}
\begin{itemize}
\item Understand denoising and sparse autoencoders.
\item Learn regularization techniques to improve latent representations.
\item See code examples for denoising and sparse penalties.
\end{itemize}
\end{frame}

\begin{frame}{Denoising Autoencoder (DAE)}
\begin{itemize}
\item Idea: Corrupt input $x$ with noise to $\tilde{x}$ and train the AE to reconstruct the clean $x$.
\item Objective: $\mathbb{E}_{x\sim p_{\mathrm{data}}}\,\mathbb{E}_{\tilde{x}\sim q(\tilde{x}\mid x)}\big[\ell(x,\,g_\phi(f_\theta(\tilde{x})))\big]$.
\item Denoising forces the encoder to learn robust features and manifold structure.
\end{itemize}
\end{frame}

\begin{frame}[fragile]{PyTorch: Denoising Autoencoder (Snippet)}
\begin{lstlisting}
# Add Gaussian noise to the inputs and train the AE to reconstruct clean images
# (assumes model, criterion, optimizer and train_loader from the earlier slide)
noise_std = 0.3
for xb, _ in train_loader:
    xb_noisy = xb + noise_std * torch.randn_like(xb)
    xb_noisy = torch.clip(xb_noisy, 0., 1.)  # inputs are in [0,1]
    optimizer.zero_grad()
    xhat = model(xb_noisy)
    loss = criterion(xhat, xb)  # target is the original clean xb
    loss.backward()
    optimizer.step()
\end{lstlisting}
\end{frame}

\begin{frame}{Sparse Autoencoders}
\begin{itemize}
\item Encourage activations in the latent code to be sparse (many near zero).
\item Add a penalty, e.g.\ $L_1$ on the activations, or a KL divergence towards a small target sparsity $\rho$:
\[
\mathcal{L} = \frac{1}{N}\sum_i \|x^{(i)}-\hat{x}^{(i)}\|^2 + \beta \sum_j \mathrm{KL}(\rho \,\|\, \hat{\rho}_j),
\]
where $\hat{\rho}_j$ is the average activation of unit $j$ across the data. Implementations of both penalties follow on the next slides.
\item Sparsity encourages disentanglement and interpretable features.
\end{itemize}
\end{frame}

\begin{frame}[fragile]{Implementation: Sparsity via $L_1$ Penalty (PyTorch)}
\begin{lstlisting}
# Inside the training loop, calling encoder and decoder separately
optimizer.zero_grad()
z = model.encoder(xb)
reconstruction = model.decoder(z)

l1_lambda = 1e-5
mse = criterion(reconstruction, xb)
l1_penalty = l1_lambda * torch.mean(torch.abs(z))
loss = mse + l1_penalty
loss.backward()
optimizer.step()
\end{lstlisting}
\end{frame}
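
\begin{frame}[fragile]{Implementation: Sparsity via KL Penalty (Sketch)}
A minimal sketch of the KL-divergence penalty from the sparse-AE slide, assuming the same \texttt{model}, \texttt{criterion}, \texttt{optimizer} and batch \texttt{xb} as before, and latent activations in $(0,1)$ (e.g.\ a sigmoid bottleneck); \texttt{rho} and \texttt{beta} are illustrative values.
\begin{lstlisting}
rho, beta = 0.05, 1e-3   # target sparsity and penalty weight (illustrative)

optimizer.zero_grad()
z = model.encoder(xb)                # latent activations, assumed in (0,1)
reconstruction = model.decoder(z)

# Average activation of each latent unit over the batch, clamped away from 0 and 1
rho_hat = torch.clamp(z.mean(dim=0), 1e-6, 1 - 1e-6)

# KL(rho || rho_hat), summed over latent units
kl = (rho * torch.log(rho / rho_hat)
      + (1 - rho) * torch.log((1 - rho) / (1 - rho_hat))).sum()

loss = criterion(reconstruction, xb) + beta * kl
loss.backward()
optimizer.step()
\end{lstlisting}
\end{frame}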

\begin{frame}{Contractive Autoencoders (Brief)}
\begin{itemize}
\item Penalize the sensitivity of the encoder to its input by adding the Frobenius norm of the Jacobian $J_{f}(x)$:
\[
\mathcal{L} = \text{reconstruction loss} + \lambda \|J_f(x)\|_F^2.
\]
\item Encourages robustness and local invariance around training points.
\item More expensive: requires computing the Jacobian or tractable approximations (an autograd sketch follows).
\end{itemize}
\end{frame}
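
\begin{frame}[fragile]{Contractive Penalty via Autograd (Sketch)}
A minimal autograd-based sketch of the contractive penalty $\lambda\|J_f(x)\|_F^2$, assuming the \texttt{model}, \texttt{criterion}, \texttt{optimizer} and batch \texttt{xb} from the earlier PyTorch slides; \texttt{lam} is an illustrative value and the unit-by-unit loop favours clarity over speed.
\begin{lstlisting}
lam = 1e-4                            # illustrative penalty weight

optimizer.zero_grad()
xb = xb.clone().requires_grad_(True)  # track gradients w.r.t. the input
z = model.encoder(xb)                 # shape (batch, latent_dim)
reconstruction = model.decoder(z)

# Squared Frobenius norm of the encoder Jacobian, accumulated unit by unit
frob = 0.0
for j in range(z.size(1)):
    grad_j, = torch.autograd.grad(z[:, j].sum(), xb, create_graph=True)
    frob = frob + (grad_j ** 2).sum()
frob = frob / xb.size(0)              # average over the batch

loss = criterion(reconstruction, xb.detach()) + lam * frob
loss.backward()
optimizer.step()
\end{lstlisting}
\end{frame}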

\begin{frame}{Applications: Denoising, Dimensionality Reduction, Anomaly Detection}
\begin{itemize}
\item \textbf{Denoising}: reconstruct clean signals from noisy inputs (images, sensor data).
\item \textbf{Dimensionality reduction}: use the latent $z$ for visualization and clustering.
\item \textbf{Anomaly detection}: large reconstruction error indicates outliers or anomalies.
\end{itemize}
\end{frame}


\begin{frame}{Learning Goals (Lecture 4)}
\begin{itemize}
\item Put together applications: anomaly detection pipeline, denoising in practice.
\item Discuss evaluation metrics and practical tips (regularization, early stopping, hyperparameters).
\item Provide complete runnable examples in PyTorch and TensorFlow that illustrate anomaly detection and denoising.
\end{itemize}
\end{frame}

\begin{frame}{Autoencoders for Anomaly Detection}
\begin{itemize}
\item Train the AE on ``normal'' data only.
\item For a test point $x^\ast$, compute the reconstruction error $r(x^\ast)=\|x^\ast-\hat{x}^\ast\|^2$.
\item If $r(x^\ast)$ exceeds a threshold $\tau$, flag it as an anomaly. The threshold is chosen via a validation set or a percentile of the training errors.
\end{itemize}
\end{frame}

\begin{frame}[fragile]{PyTorch: Anomaly Detection Example (Synthetic)}
\begin{lstlisting}
# Train the AE on normal sine-wave data (from an earlier lecture). After training:
with torch.no_grad():
    recon = model(X_test)
    errors = torch.mean((recon - X_test)**2, dim=(1,2))  # per-sample MSE

# Choose a threshold (e.g., the 95th percentile of the training errors)
with torch.no_grad():
    recon_train = model(X_train)
    train_errors = torch.mean((recon_train - X_train)**2, dim=(1,2))
threshold = torch.quantile(train_errors, 0.95)
anomalies = (errors > threshold)
print('Detected', anomalies.sum().item(), 'anomalies out of', len(errors))
\end{lstlisting}
\end{frame}

\begin{frame}[fragile]{TensorFlow: Denoising AE Example (MNIST) — Code}
\begin{lstlisting}
# Using the autoencoder and the normalized x_train, x_test defined earlier
import numpy as np

noise_factor = 0.5
x_train_noisy = x_train + noise_factor * np.random.normal(loc=0.0, scale=1.0, size=x_train.shape)
x_train_noisy = np.clip(x_train_noisy, 0., 1.)
autoencoder.fit(x_train_noisy, x_train, epochs=10, batch_size=128, validation_split=0.1)

# Then predict on noisy test images
x_test_noisy = x_test + noise_factor * np.random.normal(size=x_test.shape)
x_test_noisy = np.clip(x_test_noisy, 0., 1.)
reconstructions = autoencoder.predict(x_test_noisy)
\end{lstlisting}
\end{frame}

\begin{frame}{Evaluation Metrics and Model Selection}
\begin{itemize}
\item Reconstruction MSE / MAE: direct measures of reconstruction quality.
\item For anomaly detection: precision/recall and ROC-AUC, using the reconstruction error as the score (see the sketch on the next slide).
\item Visualization: t-SNE or PCA of the latent vectors $z$ to inspect cluster structure.
\item Cross-validation / held-out validation using normal data only for threshold selection.
\end{itemize}
\end{frame}
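
\begin{frame}[fragile]{Sketch: ROC-AUC from Reconstruction Errors}
A minimal sketch of scoring anomaly detection with scikit-learn, assuming the per-sample reconstruction errors \texttt{errors} and threshold \texttt{threshold} from the PyTorch anomaly example, plus hypothetical ground-truth labels \texttt{y\_true} with 1 marking an anomaly.
\begin{lstlisting}
import numpy as np
from sklearn.metrics import roc_auc_score, precision_score, recall_score

scores = errors.numpy()               # reconstruction error as anomaly score
auc = roc_auc_score(y_true, scores)   # threshold-free ranking quality

# Precision/recall at a concrete threshold (here the 95th train percentile)
y_pred = (scores > threshold.item()).astype(int)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
print(f"ROC-AUC: {auc:.3f}, precision: {precision:.3f}, recall: {recall:.3f}")
\end{lstlisting}
\end{frame}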

\begin{frame}{Practical Tips}
\begin{itemize}
\item Normalize inputs consistently (e.g., $[0,1]$ or zero mean and unit variance).
\item Small learning rates and early stopping help prevent overfitting to noise.
\item Monitor the validation reconstruction error and the latent-space structure.
\item Consider deeper architectures or convolutional autoencoders for images (see the sketch on the next slide).
\end{itemize}
\end{frame}
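
\begin{frame}[fragile]{Sketch: Convolutional Autoencoder (PyTorch)}
A minimal convolutional AE sketch for $28\times 28$ images, intended as a drop-in replacement for the fully connected \texttt{Autoencoder} above (it can be trained with the same loop and \texttt{train\_loader}); the layer sizes are illustrative choices, not tuned values.
\begin{lstlisting}
from torch import nn, optim

class ConvAutoencoder(nn.Module):
    def __init__(self):
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Conv2d(1, 16, 3, stride=2, padding=1),   # 28x28 -> 14x14
            nn.ReLU(),
            nn.Conv2d(16, 32, 3, stride=2, padding=1),  # 14x14 -> 7x7
            nn.ReLU()
        )
        self.decoder = nn.Sequential(
            nn.ConvTranspose2d(32, 16, 3, stride=2, padding=1, output_padding=1),  # 7x7 -> 14x14
            nn.ReLU(),
            nn.ConvTranspose2d(16, 1, 3, stride=2, padding=1, output_padding=1),   # 14x14 -> 28x28
            nn.Sigmoid()
        )
    def forward(self, x):
        return self.decoder(self.encoder(x))

model = ConvAutoencoder()
optimizer = optim.Adam(model.parameters(), lr=1e-3)  # rebuild optimizer for the new parameters
\end{lstlisting}
\end{frame}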

\end{document}
