Skip to content

Commit 27079dd

Browse files
committed
Add the first training-stability measure from the CogView paper, hidden behind a feature flag
1 parent 093b9ef commit 27079dd

File tree

4 files changed

+28
-4
lines changed

4 files changed

+28
-4
lines changed

README.md

+11
Original file line numberDiff line numberDiff line change
@@ -544,4 +544,15 @@ $ python generate.py --chinese --text '追老鼠的猫'
544544
}
545545
```
546546

547+
```bibtex
548+
@misc{ding2021cogview,
549+
title = {CogView: Mastering Text-to-Image Generation via Transformers},
550+
author = {Ming Ding and Zhuoyi Yang and Wenyi Hong and Wendi Zheng and Chang Zhou and Da Yin and Junyang Lin and Xu Zou and Zhou Shao and Hongxia Yang and Jie Tang},
551+
year = {2021},
552+
eprint = {2105.13290},
553+
archivePrefix = {arXiv},
554+
primaryClass = {cs.CV}
555+
}
556+
```
557+
547558
*Those who do not want to imitate anything, produce nothing.* - Dali

dalle_pytorch/dalle_pytorch.py

+5-2
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
from dalle_pytorch import distributed_utils
1010
from dalle_pytorch.vae import OpenAIDiscreteVAE
1111
from dalle_pytorch.vae import VQGanVAE1024
12-
from dalle_pytorch.transformer import Transformer
12+
from dalle_pytorch.transformer import Transformer, DivideMax
1313

1414
# helpers
1515

@@ -322,6 +322,7 @@ def __init__(
322322
sparse_attn = False,
323323
attn_types = None,
324324
loss_img_weight = 7,
325+
stable = False
325326
):
326327
super().__init__()
327328
assert isinstance(vae, (DiscreteVAE, OpenAIDiscreteVAE, VQGanVAE1024)), 'vae must be an instance of DiscreteVAE'
@@ -365,10 +366,12 @@ def __init__(
365366
ff_dropout = ff_dropout,
366367
attn_types = attn_types,
367368
image_fmap_size = image_fmap_size,
368-
sparse_attn = sparse_attn
369+
sparse_attn = sparse_attn,
370+
stable = stable
369371
)
370372

371373
self.to_logits = nn.Sequential(
374+
DivideMax(dim = -1) if stable else nn.Identity(),
372375
nn.LayerNorm(dim),
373376
nn.Linear(dim, self.total_tokens),
374377
)

dalle_pytorch/transformer.py

+11-1
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,15 @@ def cast_tuple(val, depth = 1):
2626

2727
# classes
2828

29+
class DivideMax(nn.Module):
30+
def __init__(self, dim):
31+
super().__init__()
32+
self.dim = dim
33+
34+
def forward(self, x):
35+
maxes = x.amax(dim = self.dim, keepdim = True)
36+
return x / maxes
37+
2938
# https://arxiv.org/abs/2103.17239
3039
class LayerScale(nn.Module):
3140
def __init__(self, dim, depth, fn):
@@ -86,7 +95,8 @@ def __init__(
8695
ff_dropout = 0.,
8796
attn_types = None,
8897
image_fmap_size = None,
89-
sparse_attn = False
98+
sparse_attn = False,
99+
stable = False
90100
):
91101
super().__init__()
92102
layers = nn.ModuleList([])

setup.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
name = 'dalle-pytorch',
55
packages = find_packages(),
66
include_package_data = True,
7-
version = '0.12.0',
7+
version = '0.12.1',
88
license='MIT',
99
description = 'DALL-E - Pytorch',
1010
author = 'Phil Wang',

0 commit comments

Comments (0)