Commit 39c986e

Updated the attention mechanisms so that they support arbitrary attention state types.
1 parent 5ac0ccd commit 39c986e

5 files changed: +138 -74 lines changed


api/src/main/scala/org/platanios/tensorflow/api/ops/rnn/attention/Attention.scala

Lines changed: 64 additions & 21 deletions
@@ -15,8 +15,10 @@
 
 package org.platanios.tensorflow.api.ops.rnn.attention
 
+import org.platanios.tensorflow.api.core.Shape
 import org.platanios.tensorflow.api.core.exception.InvalidShapeException
 import org.platanios.tensorflow.api.implicits.Implicits._
+import org.platanios.tensorflow.api.ops.control_flow.WhileLoopVariable
 import org.platanios.tensorflow.api.ops.{Basic, Checks, Math, NN, Op, Output}
 import org.platanios.tensorflow.api.types.{DataType, INT32}
 
@@ -38,27 +40,31 @@ import scala.language.postfixOps
   *
   * @author Emmanouil Antonios Platanios
   */
-abstract class Attention(
+abstract class Attention[AS, ASS](
     protected val memory: Output,
     protected val memorySequenceLengths: Output = null,
     val checkInnerDimensionsDefined: Boolean = true,
     val scoreMaskValue: Output = Float.NegativeInfinity,
     val name: String = "Attention"
+)(implicit
+    evAS: WhileLoopVariable.Aux[AS, ASS]
 ) {
-  lazy val values: Output = Op.createWithNameScope(s"$name/Initialization") {
+  lazy val values: Output = Op.createWithNameScope(s"$name/Values") {
     Attention.maybeMaskValues(memory, memorySequenceLengths, checkInnerDimensionsDefined)
   }
 
   lazy val keys: Output = values
 
-  lazy val batchSize: Output = Op.createWithNameScope(s"$name/Initialization") {
+  lazy val batchSize: Output = Op.createWithNameScope(s"$name/BatchSize") {
     Attention.dimSize(keys, 0)
   }
 
-  lazy val alignmentSize: Output = Op.createWithNameScope(s"$name/Initialization") {
+  lazy val alignmentSize: Output = Op.createWithNameScope(s"$name/AlignmentSize") {
     Attention.dimSize(keys, 1)
   }
 
+  def stateSize: ASS
+
   lazy val dataType: DataType = keys.dataType
 
   /** Initial alignment value.
@@ -69,42 +75,79 @@ abstract class Attention(
     * The default behavior is to return a tensor of all zeros.
     */
   lazy val initialAlignment: Output = {
-    Op.createWithNameScope(s"$name/InitialAlignments", Set(batchSize.op)) {
+    Op.createWithNameScope(s"$name/InitialAlignment", Set(batchSize.op)) {
       val fullShape = Basic.stack(Seq(batchSize, alignmentSize.cast(batchSize.dataType)), axis = 0)
       Basic.zeros(dataType, fullShape)
     }
   }
 
+  /** Initial state value.
+    *
+    * This is important for attention mechanisms that use the previous alignment to calculate the alignment at the
+    * next time step (e.g., monotonic attention).
+    *
+    * The default behavior is to return the same output as `initialAlignment`.
+    */
+  def initialState: AS
+
   /** Computes an alignment tensor given the provided query and previous alignment tensor.
     *
     * The previous alignment tensor is important for attention mechanisms that use the previous alignment to calculate
     * the attention at the next time step, such as monotonic attention mechanisms.
     *
-    * @param  query             Query tensor.
-    * @param  previousAlignment Previous alignment tensor.
-    * @return Alignment tensor.
+    * TODO: Figure out how to generalize the "next state" functionality.
+    *
+    * @param  query         Query tensor.
+    * @param  previousState Previous alignment tensor.
+    * @return Tuple containing the alignment tensor and the next attention state.
     */
-  final def alignment(query: Output, previousAlignment: Output): Output = Op.createWithNameScope(name) {
-    val unmaskedScore = score(query, previousAlignment)
-    val maskedScore = Attention.maybeMaskScore(unmaskedScore, memorySequenceLengths, scoreMaskValue)
-    probability(maskedScore, previousAlignment)
-  }
+  def alignment(query: Output, previousState: AS): (Output, AS)
 
   /** Computes an alignment score for `query`.
     *
-    * @param  query             Query tensor.
-    * @param  previousAlignment Previous alignment tensor.
+    * @param  query Query tensor.
+    * @param  state Current attention mechanism state (defaults to the previous alignment tensor). The data type of
+    *               this tensor matches that of `values` and its shape is `[batchSize, alignmentSize]`, where
+    *               `alignmentSize` is the memory's maximum time.
     * @return Score tensor.
     */
-  protected def score(query: Output, previousAlignment: Output): Output
+  protected def score(query: Output, state: AS): Output
 
   /** Computes alignment probabilities for `score`.
     *
-    * @param  score             Alignment score tensor.
-    * @param  previousAlignment Previous alignment tensor.
+    * @param  score Alignment score tensor.
+    * @param  state Current attention mechanism state (defaults to the previous alignment tensor). The data type of
+    *               this tensor matches that of `values` and its shape is `[batchSize, alignmentSize]`, where
+    *               `alignmentSize` is the memory's maximum time.
     * @return Alignment probabilities tensor.
     */
-  protected def probability(score: Output, previousAlignment: Output): Output = NN.softmax(score, name = "Probability")
+  protected def probability(score: Output, state: AS): Output = NN.softmax(score, name = "Probability")
+}
+
+/** Base class for attention models that use as state the previous alignment. */
+abstract class SimpleAttention(
+    override protected val memory: Output,
+    override protected val memorySequenceLengths: Output = null,
+    override val checkInnerDimensionsDefined: Boolean = true,
+    override val scoreMaskValue: Output = Float.NegativeInfinity,
+    override val name: String = "SimpleAttention"
+) extends Attention[Output, Shape](memory, memorySequenceLengths, checkInnerDimensionsDefined, scoreMaskValue, name) {
+  override def stateSize: Shape = {
+    Output.constantValueAsShape(alignmentSize).getOrElse(Shape.unknown())
+  }
+
+  override def initialState: Output = {
+    Op.createWithNameScope(s"$name/InitialState", Set(batchSize.op)) {
+      Basic.identity(initialAlignment)
+    }
+  }
+
+  override def alignment(query: Output, previousState: Output): (Output, Output) = Op.createWithNameScope(name) {
+    val unmaskedScore = score(query, previousState)
+    val maskedScore = Attention.maybeMaskScore(unmaskedScore, memorySequenceLengths, scoreMaskValue)
+    val alignment = probability(maskedScore, previousState)
+    (alignment, alignment)
+  }
 }
 
 object Attention {
@@ -117,7 +160,7 @@ object Attention {
 
   /** Potentially masks the provided values tensor based on the provided sequence lengths. */
   @throws[InvalidShapeException]
-  private[Attention] def maybeMaskValues(
+  private[attention] def maybeMaskValues(
       values: Output, sequenceLengths: Output, checkInnerDimensionsDefined: Boolean
   ): Output = {
     if (checkInnerDimensionsDefined && !values.shape(2 ::).isFullyDefined)
@@ -152,7 +195,7 @@ object Attention {
   }
 
   /** Potentially masks the provided score tensor based on the provided sequence lengths. */
-  private[Attention] def maybeMaskScore(
+  private[attention] def maybeMaskScore(
      score: Output, sequenceLengths: Output, scoreMaskValue: Output
  ): Output = {
    if (sequenceLengths != null) {
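
With this change, a scoring-only attention mechanism no longer threads the previous alignment through `alignment` itself: it can extend `SimpleAttention` and implement only `score`, inheriting the `Output`/`Shape` state handling. The following is a minimal, hypothetical sketch, not part of this commit; the `DotProductAttention` class, its constructor, and the particular `Math.matmul`/`Basic.squeeze` score computation are illustrative assumptions about the usual shapes ([batchSize, numUnits] queries against [batchSize, memoryTime, numUnits] keys).

import org.platanios.tensorflow.api.implicits.Implicits._
import org.platanios.tensorflow.api.ops.{Basic, Math, Output}
import org.platanios.tensorflow.api.ops.rnn.attention.SimpleAttention

// Hypothetical example (not in the repository): an unscaled dot-product mechanism. Its attention
// state is simply the previous alignment tensor, as provided by the SimpleAttention base class.
class DotProductAttention(
    override protected val memory: Output,
    override protected val memorySequenceLengths: Output = null,
    override val name: String = "DotProductAttention"
) extends SimpleAttention(memory, memorySequenceLengths, checkInnerDimensionsDefined = true, name = name) {
  // `query` is assumed to have shape [batchSize, numUnits] and `keys` shape
  // [batchSize, memoryTime, numUnits]; the un-normalized score is their inner product over `numUnits`.
  override protected def score(query: Output, state: Output): Output = {
    Basic.squeeze(Math.matmul(keys, query.expandDims(-1)), Seq(2))
  }
}

Mechanisms that genuinely need a different state (e.g., monotonic attention) would instead extend `Attention[AS, ASS]` directly and provide `stateSize`, `initialState`, and an `alignment` that returns the next state alongside the alignment.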

api/src/main/scala/org/platanios/tensorflow/api/ops/rnn/attention/AttentionWrapperCell.scala

Lines changed: 25 additions & 20 deletions
@@ -43,17 +43,18 @@ import org.platanios.tensorflow.api.types.{DataType, INT32}
   *
   * @author Emmanouil Antonios Platanios
   */
-class AttentionWrapperCell[S, SS] private[attention] (
+class AttentionWrapperCell[S, SS, AS, ASS] private[attention] (
     val cell: RNNCell[Output, Shape, S, SS],
-    val attentions: Seq[Attention],
+    val attentions: Seq[Attention[AS, ASS]], // TODO: Allow for varying supported types in the sequence.
     val attentionLayerWeights: Seq[Output] = null,
     val cellInputFn: (Output, Output) => Output = (input, attention) => Basic.concatenate(Seq(input, attention), -1),
     val outputAttention: Boolean = true,
     val storeAlignmentsHistory: Boolean = false,
     val name: String = "AttentionWrapperCell"
 )(implicit
-    evS: WhileLoopVariable.Aux[S, SS]
-) extends RNNCell[Output, Shape, AttentionWrapperState[S, SS], (SS, Shape, Shape, Seq[Shape], Seq[Shape])] {
+    evS: WhileLoopVariable.Aux[S, SS],
+    evAS: WhileLoopVariable.Aux[AS, ASS]
+) extends RNNCell[Output, Shape, AttentionWrapperState[S, SS, Seq[AS], Seq[ASS]], (SS, Shape, Shape, Seq[Shape], Seq[Shape], Seq[ASS])] {
   private[this] val attentionLayersSize: Int = {
     if (attentionLayerWeights != null) {
       require(attentionLayerWeights.lengthCompare(attentions.size) == 0,
@@ -74,7 +75,7 @@ class AttentionWrapperCell[S, SS] private[attention] (
     *         `initialCellState`.
     * @return Initial state for this attention cell wrapper.
     */
-  def initialState(initialCellState: S, dataType: DataType = null): AttentionWrapperState[S, SS] = {
+  def initialState(initialCellState: S, dataType: DataType = null): AttentionWrapperState[S, SS, Seq[AS], Seq[ASS]] = {
     if (initialCellState == null) {
       null
     } else {
@@ -101,17 +102,18 @@ class AttentionWrapperCell[S, SS] private[attention] (
            attentions.map(_ => TensorArray.create(0, inferredDataType, dynamicSize = true))
          else
            Seq.empty
-        })
+        },
+        attentionState = attentions.map(_.initialState))
       }
     }
   }
 
   override def outputShape: Shape = if (outputAttention) Shape(attentionLayersSize) else cell.outputShape
 
-  override def stateShape: (SS, Shape, Shape, Seq[Shape], Seq[Shape]) = {
+  override def stateShape: (SS, Shape, Shape, Seq[Shape], Seq[Shape], Seq[ASS]) = {
     (cell.stateShape, Shape(1), Shape(attentionLayersSize),
         attentions.map(a => Output.constantValueAsShape(a.alignmentSize.expandDims(0)).getOrElse(Shape.unknown())),
-        attentions.map(_ => Shape.scalar()))
+        attentions.map(_ => Shape.scalar()), attentions.map(_.stateSize))
   }
 
   /** Performs a step using this attention-wrapped RNN cell.
@@ -129,7 +131,8 @@ class AttentionWrapperCell[S, SS] private[attention] (
     * @return Next tuple.
     */
   override def forward(
-      input: Tuple[Output, AttentionWrapperState[S, SS]]): Tuple[Output, AttentionWrapperState[S, SS]] = {
+      input: Tuple[Output, AttentionWrapperState[S, SS, Seq[AS], Seq[ASS]]]
+  ): Tuple[Output, AttentionWrapperState[S, SS, Seq[AS], Seq[ASS]]] = {
     // Step 1: Calculate the true inputs to the cell based on the previous attention value.
     val cellInput = cellInputFn(input.output, input.state.attention)
     val nextTuple = cell.forward(Tuple(cellInput, input.state.cellState))
@@ -142,9 +145,9 @@ class AttentionWrapperCell[S, SS] private[attention] (
       Basic.identity(output, "CheckedCellOutput")
     }
     val weights = if (attentionLayerWeights != null) attentionLayerWeights else attentions.map(_ => null)
-    val (allAttentions, allAlignments) = (attentions, input.state.alignments, weights).zipped.map {
-      case (mechanism, previous, w) =>
-        val alignments = mechanism.alignment(checkedOutput, previous)
+    val (allAttentions, allAlignments, allStates) = (attentions, input.state.attentionState, weights).zipped.map {
+      case (mechanism, previousState, w) =>
+        val (alignments, state) = mechanism.alignment(checkedOutput, previousState)
         // Reshape from [batchSize, memoryTime] to [batchSize, 1, memoryTime]
         val expandedAlignments = alignments.expandDims(1)
         // Context is the inner product of alignments and values along the memory time dimension.
@@ -159,8 +162,8 @@ class AttentionWrapperCell[S, SS] private[attention] (
           else
             context
         }
-        (attention, alignments)
-    }.unzip
+        (attention, alignments, state)
+    }.unzip3
     val histories = {
       if (storeAlignmentsHistory)
         input.state.alignmentsHistory.zip(allAlignments).map(p => p._1.write(input.state.time, p._2))
@@ -169,7 +172,8 @@ class AttentionWrapperCell[S, SS] private[attention] (
     }
     val one = Basic.constant(1)
     val attention = Basic.concatenate(allAttentions, one)
-    val nextState = AttentionWrapperState(nextTuple.state, input.state.time + one, attention, allAlignments, histories)
+    val nextState = AttentionWrapperState(
+      nextTuple.state, input.state.time + one, attention, allAlignments, histories, allStates)
     if (outputAttention)
       Tuple(attention, nextState)
     else
@@ -178,18 +182,19 @@ class AttentionWrapperCell[S, SS] private[attention] (
 }
 
 object AttentionWrapperCell {
-  def apply[S, SS](
+  def apply[S, SS, AS, ASS](
       cell: RNNCell[Output, Shape, S, SS],
-      attentions: Seq[Attention],
+      attentions: Seq[Attention[AS, ASS]],
       attentionLayerWeights: Seq[Output] = null,
       cellInputFn: (Output, Output) => Output = (input, attention) => Basic.concatenate(Seq(input, attention), -1),
       outputAttention: Boolean = true,
       storeAlignmentsHistory: Boolean = false,
       name: String = "AttentionWrapperCell"
   )(implicit
-      evS: WhileLoopVariable.Aux[S, SS]
-  ): AttentionWrapperCell[S, SS] = {
-    new AttentionWrapperCell[S, SS](
+      evS: WhileLoopVariable.Aux[S, SS],
+      evAS: WhileLoopVariable.Aux[AS, ASS]
+  ): AttentionWrapperCell[S, SS, AS, ASS] = {
+    new AttentionWrapperCell[S, SS, AS, ASS](
       cell, attentions, attentionLayerWeights, cellInputFn, outputAttention, storeAlignmentsHistory, name)
   }
 }
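
Because `SimpleAttention` fixes the attention state types to `Output`/`Shape`, wrapping a cell with such mechanisms yields an `AttentionWrapperCell[S, SS, Output, Shape]`. A rough usage sketch under that assumption follows; the `wrapWithAttention` helper is illustrative and not part of this commit, and it presumes that `RNNCell` lives under `org.platanios.tensorflow.api.ops.rnn.cell` and that implicit `WhileLoopVariable` evidence for `Output`/`Shape` is available.

import org.platanios.tensorflow.api.core.Shape
import org.platanios.tensorflow.api.ops.Output
import org.platanios.tensorflow.api.ops.control_flow.WhileLoopVariable
import org.platanios.tensorflow.api.ops.rnn.attention.{Attention, AttentionWrapperCell}
import org.platanios.tensorflow.api.ops.rnn.cell.RNNCell

// Hypothetical helper: wraps any RNN cell with alignment-state attention mechanisms
// (e.g., LuongAttention or BahdanauAttention, which now extend SimpleAttention).
def wrapWithAttention[S, SS](
    cell: RNNCell[Output, Shape, S, SS],
    attentions: Seq[Attention[Output, Shape]]
)(implicit evS: WhileLoopVariable.Aux[S, SS]): AttentionWrapperCell[S, SS, Output, Shape] = {
  // The evidence for the attention state type (evAS) is resolved implicitly for Output/Shape,
  // mirroring how the cell state evidence (evS) is resolved.
  AttentionWrapperCell(cell, attentions, outputAttention = true)
}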

api/src/main/scala/org/platanios/tensorflow/api/ops/rnn/attention/BahdanauAttention.scala

Lines changed: 1 addition & 1 deletion
@@ -64,7 +64,7 @@ class BahdanauAttention(
     protected val probabilityFn: (Output) => Output = NN.softmax(_, name = "Probability"),
     override val scoreMaskValue: Output = Float.NegativeInfinity,
     override val name: String = "BahdanauAttention"
-) extends Attention(memory, memorySequenceLengths, checkInnerDimensionsDefined = true, scoreMaskValue, name) {
+) extends SimpleAttention(memory, memorySequenceLengths, checkInnerDimensionsDefined = true, scoreMaskValue, name) {
   override lazy val keys: Output = NN.linear(values, memoryWeights)
 
   @throws[InvalidArgumentException]

api/src/main/scala/org/platanios/tensorflow/api/ops/rnn/attention/LuongAttention.scala

Lines changed: 1 addition & 1 deletion
@@ -50,7 +50,7 @@ class LuongAttention(
     protected val probabilityFn: (Output) => Output = NN.softmax(_, name = "Probability"),
     override val scoreMaskValue: Output = Float.NegativeInfinity,
     override val name: String = "LuongAttention"
-) extends Attention(memory, memorySequenceLengths, checkInnerDimensionsDefined = true, scoreMaskValue, name) {
+) extends SimpleAttention(memory, memorySequenceLengths, checkInnerDimensionsDefined = true, scoreMaskValue, name) {
   override lazy val keys: Output = NN.linear(values, memoryWeights)
 
   @throws[InvalidArgumentException]
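
Since both `BahdanauAttention` and `LuongAttention` now extend `SimpleAttention`, their attention state is concretely the previous alignment: an `Output` of shape `[batchSize, alignmentSize]` initialized from `initialAlignment`. A small, illustrative sketch of what a caller sees (the helper below is hypothetical, and `mechanism` stands for any configured instance of either class):

import org.platanios.tensorflow.api.ops.Output
import org.platanios.tensorflow.api.ops.rnn.attention.SimpleAttention

// For SimpleAttention subclasses, `alignment` returns the alignment twice: once as the output and
// once as the next state, so the state fed into the next step is just the current alignment.
def firstAlignment(mechanism: SimpleAttention, query: Output): Output =
  mechanism.alignment(query, mechanism.initialState)._1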
