diff --git a/README.md b/README.md new file mode 100644 index 00000000..93b61660 --- /dev/null +++ b/README.md @@ -0,0 +1,372 @@ +
+ + + +
+ +[![GSoC 2026](https://img.shields.io/badge/GSoC-2026%20HumanAI-F6AE2D?style=for-the-badge&logo=google&logoColor=white)](https://summerofcode.withgoogle.com/) +[![HumanAI](https://img.shields.io/badge/Org-HumanAI-E94560?style=for-the-badge)](https://human-ai.org/) +[![PyTorch](https://img.shields.io/badge/PyTorch-2.x-EE4C2C?style=for-the-badge&logo=pytorch&logoColor=white)](https://pytorch.org) +[![WikiArt](https://img.shields.io/badge/Data-WikiArt%203K-F59E0B?style=for-the-badge)]() +[![NGA](https://img.shields.io/badge/Data-NGA%20Open%20API-10B981?style=for-the-badge)]() +[![License](https://img.shields.io/badge/License-MIT-22C55E?style=for-the-badge)](LICENSE) + +
+
+> **🎨 When a master hides another painting beneath their canvas – can AI find it?**
+>
+> ArtExtract trains a **multi-task CNN-RNN** to classify art by style, artist, and genre on WikiArt, then builds a **Siamese Network** that retrieves visually similar paintings from the National Gallery of Art – together enabling the detection of hidden underpaintings by identifying anomalous style signatures that don't belong.
+
+
+[🚀 Quick Start](#-quick-start) · [🏗 Architecture](#-architecture) · [📊 Results](#-results) · [🔍 The Mystery](#-the-mystery--painting-in-a-painting) · [🎯 GSoC Vision](#-gsoc-2026-vision)
+
+
+---
+
+## 🖼️ The Mystery – Painting In A Painting
+
+Art historians have long known that masters like **Rembrandt, Vermeer, and El Greco** sometimes painted over earlier works – their own or others'. X-ray scans reveal hidden faces, different compositions, even stolen paintings lurking beneath centuries of varnish.
+
+**ArtExtract teaches AI to detect these anomalies computationally:**
+
+```
+Step 1 ─ Train a CNN-RNN to recognise each artist's unique visual signature
+         (brushstroke, composition, palette, spatial grammar)
+
+Step 2 ─ Build Siamese similarity embeddings for the full NGA collection
+
+Step 3 ─ Scan a painting: does any region's style NOT match the declared artist?
+              ↓
+         Outlier detected → possible underpainting / hidden work 🎭
+```
+
+---
+
+## 👨‍🎨 Mentors
+
+| Mentor | Affiliation | Expertise |
+|---|---|---|
+| **Emanuele Usai** | University of Alabama | Computer Vision, Art Analysis |
+| **Sergei Gleyzer** | University of Alabama | ML4Sci, Physics-Informed ML |
+
+---
+
+## ✨ Two Tasks, One Vision
+
+| | Task 1 | Task 2 |
+|---|---|---|
+| **Goal** | Classify WikiArt paintings by Style + Artist + Genre | Retrieve similar paintings from the NGA collection |
+| **Model** | CNN-RNN Multi-task Classifier | Siamese Network + Triplet Loss |
+| **Backbone** | ResNet-50 (ImageNet pretrained) | EfficientNet-B2 |
+| **Key Extra** | Outlier detection via embedding distance | Hard negative triplet mining |
+| **Dataset** | WikiArt, 3,000 paintings (HuggingFace) | NGA Open Dataset + IIIF API |
+| **Metrics** | Style/Genre Accuracy | Precision@K · mAP |
+
+---
+
+## 🏗 Architecture
+
+### Task 1 – CNN-RNN Multi-Task Art Classifier
+
+```
+Input: Painting (B, 3, 224, 224)
+        │
+        ▼
+┌──────────────────────────────────────────────────────────────┐
+│  ResNet-50 Backbone (ImageNet pretrained)                    │
+│  Remove avgpool + fc layers – keep the spatial feature map   │
+│  Output: (B, 2048, 7, 7)   ← rich spatial features           │
+└─────────────────────┬────────────────────────────────────────┘
+                      │  Treat the 7×7 grid as 49 spatial tokens
+                      ▼
+┌──────────────────────────────────────────────────────────────┐
+│  Reshape: (B, 49, 2048)                                      │
+│  BiLSTM (hidden=512, layers=2)                               │
+│  ← captures GLOBAL composition: how regions sit relative     │
+│    to one another                                            │
+│  Output: (B, 49, 1024)                                       │
+└─────────────────────┬────────────────────────────────────────┘
+                      │
+                      ▼
+┌──────────────────────────────────────────────────────────────┐
+│  Attention Pooling                                           │
+│  Score each of the 49 positions → weighted sum               │
+│  Output: (B, 1024)   ← painting embedding                    │
+└───────┬──────────────┬──────────────┬────────────────────────┘
+        │              │              │
+        ▼              ▼              ▼
+   Style Head     Artist Head     Genre Head
+  Linear→softmax  Linear→softmax  Linear→softmax
+```
+
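+In code, the diagram above maps to a compact PyTorch module. The sketch below follows the shapes in the diagram; the class counts and the exact head layout are illustrative placeholders rather than the notebook's verbatim values:
+
+```python
+import torch
+import torch.nn as nn
+from torchvision.models import resnet50, ResNet50_Weights
+
+class CNNRNNClassifier(nn.Module):
+    """Minimal sketch of the CNN-RNN classifier diagrammed above."""
+    def __init__(self, n_styles=27, n_artists=50, n_genres=10, hidden=512):
+        super().__init__()
+        backbone = resnet50(weights=ResNet50_Weights.IMAGENET1K_V2)
+        self.cnn = nn.Sequential(*list(backbone.children())[:-2])   # drop avgpool + fc
+        self.rnn = nn.LSTM(2048, hidden, num_layers=2,
+                           batch_first=True, bidirectional=True)
+        self.attn = nn.Linear(2 * hidden, 1)                        # one score per token
+        self.style_head = nn.Linear(2 * hidden, n_styles)
+        self.artist_head = nn.Linear(2 * hidden, n_artists)
+        self.genre_head = nn.Linear(2 * hidden, n_genres)
+
+    def forward(self, x):                        # x: (B, 3, 224, 224)
+        f = self.cnn(x)                          # (B, 2048, 7, 7)
+        f = f.flatten(2).transpose(1, 2)         # (B, 49, 2048) – 49 spatial tokens
+        h, _ = self.rnn(f)                       # (B, 49, 1024)
+        w = torch.softmax(self.attn(h), dim=1)   # attention weights over 49 positions
+        emb = (w * h).sum(dim=1)                 # (B, 1024) painting embedding
+        return self.style_head(emb), self.artist_head(emb), self.genre_head(emb), emb
+```
+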
+**Why CNN + RNN for paintings?**
+```
+CNN alone  → sees local texture, brushstroke, colour
+RNN alone  → no visual features
+CNN + RNN  → CNN captures "what is here" at each of the 49 grid cells
+             RNN captures "how things relate across the composition"
+           = how art historians actually read a painting
+```
+
+**Multi-Task Loss (weighted):**
+```python
+loss = (1.0 * F.cross_entropy(style_pred,  style_true,  label_smoothing=0.1)   # primary task
+      + 0.8 * F.cross_entropy(artist_pred, artist_true, label_smoothing=0.1)   # secondary
+      + 0.6 * F.cross_entropy(genre_pred,  genre_true,  label_smoothing=0.1))  # tertiary
+
+# label_smoothing=0.1 prevents overconfidence on ambiguous art styles
+```
+
+**Training Strategy:**
+```
+Phase 1 – Freeze the CNN backbone (10 epochs):
+   → Train only the BiLSTM + attention + classification heads
+   → LR = 1e-3 (fast convergence on the unfrozen layers)
+
+Phase 2 – Unfreeze the full model (15 epochs):
+   → Fine-tune the ResNet-50 layers too
+   → Reduced LR for the backbone (avoid forgetting ImageNet features)
+   → Cosine annealing LR schedule
+```
+
+---
+
+### Task 1 – Outlier Detection (The Core Discovery Engine)
+
+```
+After training, every painting has a 1024-dim embedding vector
+
+Step 1: Compute the centroid of each style class
+        centroid_s = mean(embeddings of all style-s paintings)
+
+Step 2: For each painting, measure the distance to its own centroid
+        d(painting_i) = ||embedding_i - centroid_{style_i}||₂
+
+Step 3: Threshold at the 95th percentile of all distances
+        distance > threshold → OUTLIER 🚨
+
+Interpretation:
+  Normal painting  → embedding sits close to its style cluster
+  Outlier painting → embedding is far from its declared style
+                   → its visual language "doesn't belong"
+                   → possible underpainting / misattribution
+```
+
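+The same recipe fits in a few lines of NumPy – a sketch of the centroid-and-percentile rule above (the function name and signature are illustrative, not taken from the notebook):
+
+```python
+import numpy as np
+
+def flag_style_outliers(embeddings, style_ids, percentile=95):
+    """embeddings: (N, 1024) float array; style_ids: (N,) integer style labels."""
+    centroids = {s: embeddings[style_ids == s].mean(axis=0)
+                 for s in np.unique(style_ids)}
+    # distance of every painting to the centroid of its *declared* style
+    dists = np.array([np.linalg.norm(e - centroids[s])
+                      for e, s in zip(embeddings, style_ids)])
+    threshold = np.percentile(dists, percentile)   # 95th percentile → top 5% flagged
+    return dists > threshold, dists
+```
+
+Paintings flagged here are the candidates worth a closer look – and, eventually, a physical X-ray.
+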
+---
+
+### Task 2 – Siamese Network + Triplet Loss
+
+```
+Training with Triplet Mining:
+
+  Anchor ── same artist ──►  Positive  (pulled together)
+  Anchor ── diff artist ──►  Negative  (pushed apart)
+
+  Triplet Loss:
+    L = max(0, d(anchor, positive) - d(anchor, negative) + margin)
+
+  Hard Negative Mining:
+    Pick the negative CLOSEST to the anchor
+    → forces the model to learn fine-grained artist distinctions
+```
+
+**Siamese Backbone:**
+```
+Input: Painting (B, 3, 224, 224)
+        │
+        ▼
+  EfficientNet-B2 (pretrained) → Global Average Pool → FC(256) → L2-norm
+        │
+        ▼
+  256-dimensional embedding on the unit sphere
+  (cosine similarity ≈ angular distance between painting styles)
+```
+
+**Retrieval at inference:**
+```
+Query painting → Siamese embed → cosine similarity to all NGA paintings
+               → Top-K nearest neighbours = similar paintings 🎨
+```
+
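+Both halves of Task 2 fit in a short sketch. The loss below uses batch-hard mining – each anchor is paired with its hardest in-batch positive and negative, a common tightening of the hard-negative scheme above – and assumes L2-normalised embeddings and batches that contain several works per artist:
+
+```python
+import torch
+import torch.nn.functional as F
+
+def batch_hard_triplet_loss(emb, labels, margin=0.2):
+    """emb: (B, 256) L2-normalised embeddings; labels: (B,) artist ids."""
+    d = torch.cdist(emb, emb)                          # (B, B) pairwise distances
+    same = labels.unsqueeze(0) == labels.unsqueeze(1)  # same-artist mask
+    eye = torch.eye(len(labels), dtype=torch.bool, device=emb.device)
+    pos = (d * (same & ~eye)).max(dim=1).values        # hardest (farthest) positive
+    neg = d.masked_fill(same, float("inf")).min(dim=1).values  # closest negative
+    return F.relu(pos - neg + margin).mean()
+
+def topk_similar(query_emb, gallery_emb, k=5):
+    """Cosine similarity reduces to a dot product for L2-normalised vectors."""
+    sims = gallery_emb @ query_emb                     # (N,) similarity to each painting
+    return sims.topk(k)                                # top-K scores + gallery indices
+```
+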
+---
+
+## 📊 Results
+
+### Task 1 – WikiArt Classification
+
+```
+══════════════════════════════════════════════════════
+   TASK 1 TEST RESULTS – CNN-RNN Art Classifier
+══════════════════════════════════════════════════════
+   Style Accuracy  :  [run notebook →] %
+   Genre Accuracy  :  [run notebook →] %
+──────────────────────────────────────────────────────
+   Outliers found  :  [run notebook →] paintings
+   (top 5% by embedding distance)
+══════════════════════════════════════════════════════
+```
+
+### Task 2 – NGA Painting Similarity
+
+```
+══════════════════════════════════════════════════════
+   TASK 2 TEST RESULTS – Siamese Retrieval
+══════════════════════════════════════════════════════
+   Precision@1   :  [run notebook →]
+   Precision@3   :  [run notebook →]
+   Precision@5   :  [run notebook →]
+   Precision@10  :  [run notebook →]
+   mAP           :  [run notebook →]
+══════════════════════════════════════════════════════
+```
+
+### Visual Retrieval – What It Looks Like
+
+```
+┌──────────────┬──────────────┬──────────────┬──────────────┬──────────────┬──────────────┐
+│ QUERY 🔴     │ Result 1     │ Result 2     │ Result 3     │ Result 4     │ Result 5     │
+│ Rembrandt    │ ✓ Sim:0.94   │ ✓ Sim:0.91   │ ✗ Sim:0.78   │ ✓ Sim:0.76   │ ✓ Sim:0.74   │
+│ (portrait)   │ Rembrandt    │ Rembrandt    │ Hals         │ Rembrandt    │ Rembrandt    │
+└──────────────┴──────────────┴──────────────┴──────────────┴──────────────┴──────────────┘
+  ✓ = same artist   ✗ = different artist   Similarity = 1 - cosine_distance
+```
+
+*See `outputs/08_similarity_retrieval.png`, generated on run.*
+
+---
+
+## 🚀 Quick Start
+
+### ▶️ Run on Google Colab (Recommended)
+
+[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/abhiram123467/artextract-deeplense/blob/main/ArtExtract_PaintingInAPainting.ipynb)
+
+1. Click **Open in Colab**
+2. `Runtime` → `Change runtime type` → **T4 GPU** → Save
+3. `Runtime` → `Run all`
+4. ☕ ~25–35 min – all visualisations are auto-saved to `outputs/`
+
+### 💻 Run Locally
+
+```bash
+# Clone
+git clone https://github.com/abhiram123467/artextract-deeplense
+cd artextract-deeplense
+
+# Install
+pip install torch torchvision timm matplotlib scikit-learn seaborn \
+            tqdm Pillow requests pandas datasets
+
+# Launch the notebook
+jupyter notebook ArtExtract_PaintingInAPainting.ipynb
+```
+
+---
+
+## 📁 Project Structure
+
+```
+artextract-deeplense/
+│
+├── 📓 ArtExtract_PaintingInAPainting.ipynb   # Complete pipeline
+│
+├── outputs/
+│   ├── 01_wikiart_samples.png        # Sample paintings from WikiArt
+│   ├── 02_class_distribution.png     # Style/Artist/Genre class counts
+│   ├── 03_training_curves.png        # Loss + accuracy over epochs
+│   ├── 04_confusion_matrix.png       # Style + Genre confusion matrices
+│   ├── 05_tsne_embeddings.png        # t-SNE of painting embeddings
+│   ├── 06_outlier_paintings.png      # Top outlier paintings detected
+│   ├── 07_nga_samples.png            # NGA dataset sample paintings
+│   ├── 08_similarity_retrieval.png   # Query → Top-5 similar paintings
+│   ├── 09_summary_dashboard.png      # Full results dashboard
+│   ├── best_cnn_rnn.pth              # Task 1 trained weights
+│   └── best_siamese.pth              # Task 2 trained weights
+│
+├── README.md
+└── requirements.txt
+```
+
+---
+
+## 🔧 Technical Stack
+
+```
+🤖 Deep Learning    : PyTorch 2.x, torchvision
+🏗 Task 1 Backbone  : ResNet-50 (ImageNet pretrained) + BiLSTM + Attention
+🏗 Task 2 Backbone  : EfficientNet-B2 (timm) + Triplet Loss
+📊 Metrics          : Accuracy, mAP, Precision@K, NearestNeighbors (sklearn)
+📉 Loss T1          : CrossEntropyLoss (label_smoothing=0.1) · multi-task weighted
+📉 Loss T2          : Triplet Margin Loss (hard negative mining)
+🎨 Dataset 1        : WikiArt via HuggingFace `huggan/wikiart` (3,000 samples)
+🖼️ Dataset 2        : National Gallery of Art Open Dataset + IIIF Image API
+⚙️ Optimizer        : AdamW (weight_decay=1e-4) + CosineAnnealingLR
+📏 Image Size       : 224×224 px (ImageNet standard)
+🌈 Augmentation     : RandomFlip · RandomRotation · ColorJitter
+☁️ Compute          : Google Colab T4 GPU
+```
+
+---
+
+## 🎯 GSoC 2026 Vision
+
+> **Target Organisation: HumanAI**
+> **Project: ArtExtract – Painting In A Painting**
+> **Mentors: Emanuele Usai (Alabama) · Sergei Gleyzer (Alabama)**
+
+Proposed 12-week GSoC sprint:
+
+| Phase | Weeks | Deliverable |
+|---|---|---|
+| **Foundation** | 1–2 | Reproduce + extend with an EfficientNet-B3 backbone |
+| **Patch-level Analysis** | 3–5 | Detect *regions* of anomaly (not just whole paintings) |
+| **Deeper Similarity** | 6–7 | CLIP embeddings for zero-shot style retrieval |
+| **Real Underpaintings** | 8–9 | Validate on known X-ray scan datasets (Ghent Altarpiece) |
+| **Web App** | 10–11 | Interactive Streamlit app – upload a painting, find its hidden layers |
+| **GSoC Final** | 12 | Paper draft + public dataset of detected painting-in-painting candidates |
+
+**Why this matters for art history:**
+- 🎭 **The Louvre alone displays some 38,000 works** – manual X-ray analysis is impossible at that scale
+- 🔬 **AI can flag candidates** for physical X-ray investigation, saving time and cost
+- 🏛 **NGA Open Dataset + WikiArt together offer 200,000+ digitised works** for automated analysis
+- 🌍 Hidden underpaintings have rewritten the attribution of works by Caravaggio, Raphael, and Van Eyck
+
+---
+
+## 📚 References
+
+- [ResNet – He et al. 2015](https://arxiv.org/abs/1512.03385) – Deep Residual Learning
+- [EfficientNet – Tan & Le 2019](https://arxiv.org/abs/1905.11946) – backbone for Task 2
+- [BiLSTM – Schuster & Paliwal 1997](https://ieeexplore.ieee.org/document/650093) – bidirectional sequence modelling
+- [Triplet Loss – Schroff et al. 2015](https://arxiv.org/abs/1503.03832) – FaceNet / Siamese
+- [WikiArt on HuggingFace](https://huggingface.co/datasets/huggan/wikiart) – training data
+- [NGA Open Data](https://github.com/NationalGalleryOfArt/opendata) – National Gallery of Art
+- [Painting-in-painting research](https://www.courtauld.ac.uk/) – Courtauld Institute art science
+
+---
+
+
+## 👨‍🎨 About the Author
+
+**Abhi Ramg** – AI/ML Researcher & GSoC 2026 Applicant
+
+📍 Hyderabad, India | 🎨 Art AI | 🔭 Astrophysics ML | 🧠 Physics-Informed DL
+
+[![GitHub](https://img.shields.io/badge/GitHub-abhiram123467-181717?style=for-the-badge&logo=github)](https://github.com/abhiram123467)
+[![ArtExtract](https://img.shields.io/badge/Repo-ArtExtract-F59E0B?style=for-the-badge&logo=github)](https://github.com/abhiram123467/artextract-deeplense)
+[![DeepLense8](https://img.shields.io/badge/Also%20See-DeepLense8%20DDPM-8B5CF6?style=for-the-badge&logo=github)](https://github.com/abhiram123467/DeepLense8)
+[![SIRA](https://img.shields.io/badge/Also%20See-SIRA%20Neural%20ODE-E94560?style=for-the-badge&logo=github)](https://github.com/abhiram123467/sira-deeplense)
+
+ +*"Every painting hides a secret. Every master leaves a ghost. We teach machines to listen for them."* + +
+ +**⭐ Star this repo if AI-powered art forensics excites you!** + + + +
diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 00000000..b7d3f2cc --- /dev/null +++ b/requirements.txt @@ -0,0 +1,11 @@ +torch>=2.0.0 +torchvision>=0.15.0 +timm>=0.9.0 +scikit-learn>=1.3.0 +matplotlib>=3.7.0 +numpy>=1.24.0 +Pillow>=10.0.0 +tqdm>=4.65.0 +seaborn>=0.12.0 +einops>=0.6.0 +datasets>=2.14.0 diff --git a/unet_extractor.py b/unet_extractor.py new file mode 100644 index 00000000..2d8173d5 --- /dev/null +++ b/unet_extractor.py @@ -0,0 +1,103 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +class DoubleConv(nn.Module): + """(convolution => [BN] => ReLU) * 2""" + def __init__(self, in_channels, out_channels, mid_channels=None): + super().__init__() + if not mid_channels: + mid_channels = out_channels + self.double_conv = nn.Sequential( + nn.Conv2d(in_channels, mid_channels, kernel_size=3, padding=1, bias=False), + nn.BatchNorm2d(mid_channels), + nn.ReLU(inplace=True), + nn.Conv2d(mid_channels, out_channels, kernel_size=3, padding=1, bias=False), + nn.BatchNorm2d(out_channels), + nn.ReLU(inplace=True) + ) + + def forward(self, x): + return self.double_conv(x) + +class Down(nn.Module): + """Downscaling with maxpool then double conv""" + def __init__(self, in_channels, out_channels): + super().__init__() + self.maxpool_conv = nn.Sequential( + nn.MaxPool2d(2), + DoubleConv(in_channels, out_channels) + ) + + def forward(self, x): + return self.maxpool_conv(x) + +class Up(nn.Module): + """Upscaling then double conv""" + def __init__(self, in_channels, out_channels, bilinear=True): + super().__init__() + # if bilinear, use the normal convolutions to reduce the number of channels + if bilinear: + self.up = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True) + self.conv = DoubleConv(in_channels, out_channels, in_channels // 2) + else: + self.up = nn.ConvTranspose2d(in_channels, in_channels // 2, kernel_size=2, stride=2) + self.conv = DoubleConv(in_channels, out_channels) + + def forward(self, x1, x2): + x1 = self.up(x1) + # input is CHW + diffY = x2.size()[2] - x1.size()[2] + diffX = x2.size()[3] - x1.size()[3] + + x1 = F.pad(x1, [diffX // 2, diffX - diffX // 2, + diffY // 2, diffY - diffY // 2]) + # concatenate along the channels axis + x = torch.cat([x2, x1], dim=1) + return self.conv(x) + +class OutConv(nn.Module): + def __init__(self, in_channels, out_channels): + super(OutConv, self).__init__() + self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=1) + + def forward(self, x): + return self.conv(x) + +class UNetExtractor(nn.Module): + def __init__(self, n_channels=3, n_classes=3, bilinear=False): + super(UNetExtractor, self).__init__() + self.n_channels = n_channels + self.n_classes = n_classes + self.bilinear = bilinear + + self.inc = DoubleConv(n_channels, 64) + self.down1 = Down(64, 128) + self.down2 = Down(128, 256) + self.down3 = Down(256, 512) + factor = 2 if bilinear else 1 + self.down4 = Down(512, 1024 // factor) + + self.up1 = Up(1024, 512 // factor, bilinear) + self.up2 = Up(512, 256 // factor, bilinear) + self.up3 = Up(256, 128 // factor, bilinear) + self.up4 = Up(128, 64, bilinear) + self.outc = OutConv(64, n_classes) + + def forward(self, x, extract_features=False): + x1 = self.inc(x) + x2 = self.down1(x1) + x3 = self.down2(x2) + x4 = self.down3(x3) + x5 = self.down4(x4) # Bottleneck layer (latent space) + + if extract_features: + # Flatten the bottleneck to use as a feature vector for Cosine Similarity + return torch.flatten(x5, start_dim=1) + + x = self.up1(x5, x4) + x = self.up2(x, x3) + x = 
self.up3(x, x2) + x = self.up4(x, x1) + logits = self.outc(x) + return logits \ No newline at end of file