@@ -83,7 +83,7 @@ def register_layer(
     default_args (list): The default parameters to add to the function.
     default_kwargs (dict): The default parameters to add to the function.
       Those arguments can be overwritten when calling the function.
-    use_dp (bool): Wrap the function call within a dataparalellism object if
+    use_dp (bool): Wrap the function call within a dataparallelism object if
       dp is available. Some layers (like MOE) must be called without dp.
     recompute_grad (bool): If True, recompute the function during the
       backward pass to save memory
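
A rough sketch of the dispatch the use_dp flag describes, where dp stands in for a data-parallelism object such as expert_utils.Parallelism; this is an illustration of the idea, not the registered wrapper itself:

# Illustrative only: "dp" is assumed to be a callable data-parallelism object.
def call_registered_layer(fct, dp, use_dp, *args, **kwargs):
  if use_dp and dp is not None:
    return dp(fct, *args, **kwargs)  # run the layer once per data-parallel shard
  return fct(*args, **kwargs)        # e.g. MOE layers manage devices themselves
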
@@ -1378,7 +1378,7 @@ def _relative_attention_inner(x, y, z, transpose):
     x: Tensor with shape [batch_size, heads, length, length or depth].
     y: Tensor with shape [batch_size, heads, length, depth].
     z: Tensor with shape [length, length, depth].
-    transpose: Whether to tranpose inner matrices of y and z. Should be true if
+    transpose: Whether to transpose inner matrices of y and z. Should be true if
       last dimension of x is depth, not length.
 
   Returns:
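
To make the transpose flag concrete, here is an einsum sketch of the two cases the docstring describes; it illustrates the math, not the function body itself:

import tensorflow as tf

def relative_attention_inner_sketch(x, y, z, transpose):
  """Illustrative einsum version of the computation described above."""
  if transpose:
    # x holds queries [batch, heads, length, depth]; the result is logits
    # [batch, heads, length, length], mixing in the relative embeddings z.
    return (tf.einsum("bhid,bhjd->bhij", x, y) +
            tf.einsum("bhid,ijd->bhij", x, z))
  # x holds attention weights [batch, heads, length, length]; the result is
  # the attended values [batch, heads, length, depth].
  return (tf.einsum("bhij,bhjd->bhid", x, y) +
          tf.einsum("bhij,ijd->bhid", x, z))
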
@@ -1422,7 +1422,7 @@ def dot_product_attention_relative(q,
     k: a Tensor with shape [batch, heads, length, depth].
     v: a Tensor with shape [batch, heads, length, depth].
     bias: bias Tensor.
-    max_relative_position: an integer specifying the maxmimum distance between
+    max_relative_position: an integer specifying the maximum distance between
       inputs that unique position embeddings should be learned for.
     dropout_rate: a floating point number.
     image_shapes: optional tuple of integer scalars.
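
A short sketch of what max_relative_position controls: pairwise distances are clipped to this radius, so only 2 * max_relative_position + 1 distinct position embeddings are ever learned. This is an illustration of the clipping, not the helper used in the file:

import tensorflow as tf

def relative_position_ids_sketch(length, max_relative_position):
  """Maps pairwise distances j - i to embedding ids in [0, 2 * max_relative_position]."""
  rng = tf.range(length)
  distance = rng[None, :] - rng[:, None]        # [length, length]
  clipped = tf.clip_by_value(distance, -max_relative_position,
                             max_relative_position)
  return clipped + max_relative_position        # shift so ids are non-negative
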
@@ -2141,7 +2141,7 @@ def gather_indices_2d(x, block_shape, block_stride):
 
 
 def make_2d_block_raster_mask(query_shape, memory_flange):
-  """creates a mask for 2d block raster scany.
+  """creates a mask for 2d block raster scan.
 
   The query mask can look to the left, top left, top, and top right, but
   not to the right. Inside the query, we have the standard raster scan
@@ -2661,7 +2661,7 @@ def ffn_self_attention_layer(x,
   We use self-attention to do feedforward computations. We apply this function
   positionwise where for each position, we linearly transform the output to have
   depth filter_depth, and break up the result depth-wise into num_parts
-  contiguous parts. The parts self-attentd, we concatenate the results
+  contiguous parts. The parts self-attend, we concatenate the results
   depth-wise, and we linearly transform to a depth of output_depth. The
   goal is to get multiplicative interactions between components of a
   representation.
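
A shape-level sketch of that scheme in TF2-style code; the helper name is hypothetical, x is assumed to have shape [batch, length, channels], and the real layer handles its projections and scaling differently:

import tensorflow as tf

def ffn_self_attention_sketch(x, filter_depth, num_parts, output_depth):
  """Each position's depth parts attend to each other, then get re-merged."""
  batch, length = tf.shape(x)[0], tf.shape(x)[1]
  part_depth = filter_depth // num_parts
  h = tf.keras.layers.Dense(filter_depth)(x)                  # [b, l, filter_depth]
  h = tf.reshape(h, [batch, length, num_parts, part_depth])   # split into parts
  logits = tf.einsum("blpd,blqd->blpq", h, h) / part_depth ** 0.5
  h = tf.einsum("blpq,blqd->blpd", tf.nn.softmax(logits), h)  # parts self-attend
  h = tf.reshape(h, [batch, length, filter_depth])            # concatenate depth-wise
  return tf.keras.layers.Dense(output_depth)(h)               # [b, l, output_depth]
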
@@ -2764,7 +2764,7 @@ def parameter_attention(x,
         x, total_key_depth, use_bias=False, name="q_transform")
     if dropout_rate:
       # This is a cheaper form of attention dropout where we use to use
-      # the same dropout decisions across batch elemets and query positions,
+      # the same dropout decisions across batch elements and query positions,
       # but different decisions across heads and memory positions.
       v = tf.nn.dropout(
           v, 1.0 - dropout_rate, noise_shape=[num_heads, memory_rows, 1])