Skip to content
This repository was archived by the owner on Jul 7, 2023. It is now read-only.

Commit fa46070

Browse files
Lukasz Kaiser and Ryan Sepassi
authored and committed
Get rid of "last_position_only" by adding the corresponding property to Modality.
PiperOrigin-RevId: 175342179
1 parent 4084c5c commit fa46070

File tree

8 files changed

+52
-62
lines changed

8 files changed

+52
-62
lines changed

tensor2tensor/layers/modalities.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,10 @@ def name(self):
4949
def top_dimensionality(self):
5050
return self._vocab_size
5151

52+
@property
53+
def top_is_pointwise(self):
54+
return True
55+
5256
def _get_weights(self, hidden_dim=None):
5357
"""Create or get concatenated embedding or softmax variable.
5458

tensor2tensor/models/transformer.py

Lines changed: 4 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -143,52 +143,46 @@ def model_fn_body(self, features):
143143
encoder_decoder_attention_bias,
144144
decoder_self_attention_bias, hparams)
145145

146-
def _greedy_infer(self, features, decode_length, last_position_only=True):
146+
def _greedy_infer(self, features, decode_length):
147147
"""Fast version of greedy decoding.
148148
149149
Args:
150150
features: a map of string to `Tensor`
151151
decode_length: an integer. How many additional timesteps to decode.
152-
last_position_only: MUST be true for fast decoding!
153152
154153
Returns:
155154
samples: [batch_size, input_length + decode_length]
156155
logits: Not returned
157156
losses: Not returned
158157
159158
Raises:
160-
ValueError: If last_position_only is False
161159
NotImplementedError: If there are multiple data shards.
162160
"""
163-
decoded_ids, _ = self._fast_decode(
164-
features, decode_length, last_position_only)
161+
decoded_ids, _ = self._fast_decode(features, decode_length)
165162
return decoded_ids, None, None
166163

167164
def _beam_decode(self, features, decode_length, beam_size, top_beams,
168-
last_position_only, alpha):
165+
alpha):
169166
"""Beam search decoding.
170167
171168
Args:
172169
features: a map of string to `Tensor`
173170
decode_length: an integer. How many additional timesteps to decode.
174171
beam_size: number of beams.
175172
top_beams: an integer. How many of the beams to return.
176-
last_position_only: MUST be true for fast decoding!
177173
alpha: Float that controls the length penalty. larger the alpha, stronger
178174
the preference for longer translations.
179175
180176
Returns:
181177
samples: an integer `Tensor`. Top samples from the beam search
182178
"""
183179
decoded_ids, scores = self._fast_decode(
184-
features, decode_length, last_position_only, beam_size, top_beams,
185-
alpha)
180+
features, decode_length, beam_size, top_beams, alpha)
186181
return {"outputs": decoded_ids, "scores": scores}
187182

188183
def _fast_decode(self,
189184
features,
190185
decode_length,
191-
last_position_only=True,
192186
beam_size=1,
193187
top_beams=1,
194188
alpha=1.0):
@@ -200,7 +194,6 @@ def _fast_decode(self,
200194
Args:
201195
features: a map of string to model features.
202196
decode_length: an integer. How many additional timesteps to decode.
203-
last_position_only: MUST be true for fast decoding!
204197
beam_size: number of beams.
205198
top_beams: an integer. How many of the beams to return.
206199
alpha: Float that controls the length penalty. larger the alpha, stronger
@@ -210,11 +203,8 @@ def _fast_decode(self,
210203
samples: an integer `Tensor`. Top samples from the beam search
211204
212205
Raises:
213-
ValueError: If last_position_only is False
214206
NotImplementedError: If there are multiple data shards.
215207
"""
216-
if not last_position_only:
217-
raise ValueError("Fast decoding only deals with the last positions!")
218208
if self._num_datashards != 1:
219209
raise NotImplementedError("Fast decoding only supports a single shard.")
220210
dp = self._data_parallelism

tensor2tensor/models/transformer_adv.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -166,7 +166,7 @@ def model_fn_body(self, features):
166166
features["target_space_id"], self._hparams)
167167

168168
def infer(self, features=None, decode_length=50, beam_size=1, top_beams=1,
169-
last_position_only=False, alpha=0.0):
169+
alpha=0.0):
170170
"""Produce predictions from the model."""
171171
if not features:
172172
features = {}
@@ -184,8 +184,7 @@ def infer(self, features=None, decode_length=50, beam_size=1, top_beams=1,
184184
initial_output = tf.zeros((batch_size, 2 * length, 1, 1), dtype=tf.int64)
185185

186186
features["targets"] = initial_output
187-
sharded_logits, _ = self.model_fn(
188-
features, False, last_position_only=last_position_only)
187+
sharded_logits, _ = self.model_fn(features, False)
189188
sharded_samples = self._data_parallelism(tf.argmax, sharded_logits, 4)
190189
samples = tf.concat(sharded_samples, 0)
191190

@@ -194,8 +193,7 @@ def infer(self, features=None, decode_length=50, beam_size=1, top_beams=1,
194193
for _ in xrange(how_many_more_steps):
195194
with tf.variable_scope(tf.get_variable_scope(), reuse=True):
196195
features["targets"] = samples
197-
sharded_logits, _ = self.model_fn(
198-
features, False, last_position_only=last_position_only)
196+
sharded_logits, _ = self.model_fn(features, False)
199197
sharded_samples = self._data_parallelism(tf.argmax, sharded_logits, 4)
200198
samples = tf.concat(sharded_samples, 0)
201199

tensor2tensor/models/transformer_test.py

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -56,8 +56,7 @@ def getModel(self, hparams, mode=tf.estimator.ModeKeys.TRAIN):
5656
"target_space_id": tf.constant(1, dtype=tf.int32),
5757
}
5858

59-
return transformer.Transformer(
60-
hparams, tf.estimator.ModeKeys.PREDICT, p_hparams), features
59+
return transformer.Transformer(hparams, mode, p_hparams), features
6160

6261
def testTransformer(self):
6362
model, features = self.getModel(transformer.transformer_small())
@@ -99,8 +98,7 @@ def testGreedyVsFast(self):
9998
mode=tf.estimator.ModeKeys.PREDICT)
10099

101100
with tf.variable_scope(tf.get_variable_scope(), reuse=True):
102-
greedy_result, _, _ = model._slow_greedy_infer(
103-
features, decode_length, last_position_only=True)
101+
greedy_result, _, _ = model._slow_greedy_infer(features, decode_length)
104102
greedy_result = tf.squeeze(greedy_result, axis=[2, 3])
105103

106104
fast_result, _, _ = model._greedy_infer(features, decode_length)
@@ -139,15 +137,13 @@ def testBeamVsFast(self):
139137
decode_length,
140138
beam_size=4,
141139
top_beams=1,
142-
last_position_only=True,
143140
alpha=1.0)["outputs"]
144141

145142
fast_result = model._beam_decode(
146143
features,
147144
decode_length,
148145
beam_size=4,
149146
top_beams=1,
150-
last_position_only=True,
151147
alpha=1.0)["outputs"]
152148

153149
with self.test_session():

tensor2tensor/utils/decoding.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,6 @@
4242
def decode_hparams(overrides=""):
4343
"""Hyperparameters for decoding."""
4444
hp = tf.contrib.training.HParams(
45-
use_last_position_only=False,
4645
save_images=False,
4746
problem_idx=0,
4847
extra_length=50,

tensor2tensor/utils/modality.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,22 @@ def top_dimensionality(self):
7171
def _body_input_depth(self):
7272
return self._model_hparams.hidden_size
7373

74+
@property
75+
def top_is_pointwise(self):
76+
"""Whether the top mapping of the modality is pointwise.
77+
78+
An example of a pointwise top mapping is a linear layer followed by
79+
a softmax. Given a tensor [batch, length, height, depth] it operates
80+
only on the last axis, on every point in [batch, length, height] fully
81+
independently. In contrast, a classifier that first averages over length
82+
and height is not pointwise, as it depends on the whole field. It is useful
83+
to know if a top is pointwise to speed up decoding in certain models.
84+
85+
Returns:
86+
A Boolean, True if the modality is pointwise, False otherwise (default).
87+
"""
88+
return False
89+
7490
def bottom(self, x):
7591
"""Transform one shard of input.
7692

tensor2tensor/utils/model_builder.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,6 @@ def nth_model(n):
115115
features,
116116
beam_size=decode_hp.beam_size,
117117
top_beams=(decode_hp.beam_size if decode_hp.return_beams else 1),
118-
last_position_only=decode_hp.use_last_position_only,
119118
alpha=decode_hp.alpha,
120119
decode_length=decode_hp.extra_length)
121120
# In distributed mode, we build graph for problem=0 and problem=worker_id.

0 commit comments

Comments (0)