Add modality for images that compresses pixels and can be used for generation tasks.

Niki Parmar · Ryan Sepassi · commit 4084c5c1cfbc · 2017-11-10T17:41:17.000-08:00
PiperOrigin-RevId: 175313670
diff --git a/tensor2tensor/layers/common_layers.py b/tensor2tensor/layers/common_layers.py
@@ -151,6 +151,16 @@ def standardize_images(x):
     return x
 
 
+def convert_rgb_to_real(x):
+  """Conversion of pixel values to real numbers."""
+  with tf.name_scope("rgb_to_real", [x]):
+    x = tf.to_float(x)
+    # Use the formula (value/128) - 1 to convert each channel value into a
+    # real number in the range -1 to 1.
+    x = (x /128) - 1
+    return x
+
+
 def image_augmentation(images, do_colors=False):
   """Image augmentation: cropping, flipping, and color transforms."""
   images = tf.random_crop(images, [299, 299, 3])
diff --git a/tensor2tensor/layers/modalities.py b/tensor2tensor/layers/modalities.py
@@ -229,6 +229,76 @@ def loss(self, top_out, targets, weights_fn=common_layers.weights_all):
         top_out, targets, weights_fn=weights_fn)
 
 
+@registry.register_image_modality("image_identity_compress")
+class ImageIdentityCompressModality(modality.Modality):
+  """Modality for images used in generation."""
+
+  @property
+  def top_dimensionality(self):
+    return 256
+
+  def bottom_compress(self, inputs, name="bottom"):
+    """Transform input from data space to model space.
+
+    Perform conversion of RGB pixel values to a real number and combine values
+    for each pixel to form representation of image_length x image_length dims.
+
+    Args:
+      inputs: A Tensor with shape [batch, ...]
+      name: string, scope.
+    Returns:
+      body_input: A Tensor with shape [batch, ?, ?, body_input_depth].
+    """
+    with tf.variable_scope(name):
+      inputs = common_layers.convert_rgb_to_real(inputs)
+      ishape = tf.shape(inputs)
+      inputs = tf.reshape(inputs, [-1, ishape[1], ishape[2]*ishape[3], 1])
+      inputs.set_shape([None, None, None, 1])
+      # We compress RGB intensities for each pixel using a conv.
+      x = common_layers.conv_block(
+          inputs,
+          self._body_input_depth, [((1, 1), (1, 3))],
+          first_relu=False,
+          padding="VALID",
+          strides=(1, 3),
+          force2d=True,
+          name="conv_input")
+      return x
+
+  def bottom(self, inputs):
+    return self.bottom_compress(inputs, "input_bottom")
+
+  def targets_bottom(self, inputs):
+    return self.bottom_compress(inputs, "output_bottom")
+
+  def top(self, body_output, _):
+    with tf.variable_scope(self.name):
+      hidden_dim = self._model_hparams.hidden_size
+      img_len = self._model_hparams.img_len
+      channels = self._model_hparams.num_channels
+      batch = tf.shape(body_output)[0]
+      x = common_layers.conv(
+          body_output,
+          hidden_dim*channels, (1, 1),
+          padding="VALID",
+          activation=tf.nn.relu,
+          name="decompress_conv")
+      x = tf.reshape(x, [batch, img_len, img_len*channels, hidden_dim])
+      x.set_shape([None, None, None, hidden_dim])
+      x = common_layers.conv(x,
+                             self.top_dimensionality,
+                             (1, 1), name="output_conv")
+      x = tf.reshape(x, [-1, img_len, img_len,
+                         channels, self.top_dimensionality])
+      return x
+
+  def loss(self, top_out, targets, weights_fn=common_layers.weights_all):
+    # Call the default implementation, but weight 1.0 on 0s by default.
+    # (Since we're processing images and so have no padding and some pixel 0s.)
+    return super(ImageIdentityCompressModality, self).loss(
+        top_out, targets, weights_fn=weights_fn)
+
+
 @registry.register_audio_modality("default")
 class AudioModality(modality.Modality):
   """Performs strided conv compressions for audio data."""