FluxML
diff --git a/‎src/convnets/inception.jl
+26-23 b/‎src/convnets/inception.jl
+26-23
diff --git a/‎src/convnets/resnet.jl
+71-75 b/‎src/convnets/resnet.jl
+71-75
@@ -279,18 +279,18 @@ function inceptionv4_c()
 end
 
 """
-    inceptionv4(; inchannels = 3, dropout = 0.0, nclasses = 1000)
+    inceptionv4(; inchannels = 3, drop_rate = 0.0, nclasses = 1000)
 
 Create an Inceptionv4 model.
 ([reference](https://arxiv.org/abs/1602.07261))
 
 # Arguments
 
   - `inchannels`: number of input channels.
-  - `dropout`: rate of dropout in classifier head.
+  - `drop_rate`: rate of dropout in classifier head.
   - `nclasses`: the number of output classes.
 """
-function inceptionv4(; inchannels = 3, dropout = 0.0, nclasses = 1000)
+function inceptionv4(; inchannels = 3, drop_rate = 0.0, nclasses = 1000)
     body = Chain(conv_bn((3, 3), inchannels, 32; stride = 2)...,
                  conv_bn((3, 3), 32, 32)...,
                  conv_bn((3, 3), 32, 64; pad = 1)...,
@@ -313,12 +313,13 @@ function inceptionv4(; inchannels = 3, dropout = 0.0, nclasses = 1000)
                  inceptionv4_c(),
                  inceptionv4_c(),
                  inceptionv4_c())
-    head = Chain(GlobalMeanPool(), MLUtils.flatten, Dropout(dropout), Dense(1536, nclasses))
+    head = Chain(GlobalMeanPool(), MLUtils.flatten, Dropout(drop_rate),
+                 Dense(1536, nclasses))
     return Chain(body, head)
 end
 
 """
-    Inceptionv4(; pretrain = false, inchannels = 3, dropout = 0.0, nclasses = 1000)
+    Inceptionv4(; pretrain = false, inchannels = 3, drop_rate = 0.0, nclasses = 1000)
 
 Creates an Inceptionv4 model.
 ([reference](https://arxiv.org/abs/1602.07261))
@@ -327,7 +328,7 @@ Creates an Inceptionv4 model.
 
   - `pretrain`: set to `true` to load the pre-trained weights for ImageNet
   - `inchannels`: number of input channels.
-  - `dropout`: rate of dropout in classifier head.
+  - `drop_rate`: rate of dropout in classifier head.
   - `nclasses`: the number of output classes.
 
 !!! warning
@@ -338,7 +339,7 @@ struct Inceptionv4
     layers::Any
 end
 
-function Inceptionv4(; pretrain = false, inchannels = 3, dropout = 0.0, nclasses = 1000)
-    layers = inceptionv4(; inchannels, dropout, nclasses)
+function Inceptionv4(; pretrain = false, inchannels = 3, drop_rate = 0.0, nclasses = 1000)
+    layers = inceptionv4(; inchannels, drop_rate, nclasses)
     pretrain && loadpretrain!(layers, "Inceptionv4")
     return Inceptionv4(layers)
@@ -419,18 +420,18 @@ function block8(scale = 1.0f0; activation = identity)
 end
 
 """
-    inceptionresnetv2(; inchannels = 3, dropout = 0.0, nclasses = 1000)
+    inceptionresnetv2(; inchannels = 3, drop_rate = 0.0, nclasses = 1000)
 
 Creates an InceptionResNetv2 model.
 ([reference](https://arxiv.org/abs/1602.07261))
 
 # Arguments
 
   - `inchannels`: number of input channels.
-  - `dropout`: rate of dropout in classifier head.
+  - `drop_rate`: rate of dropout in classifier head.
   - `nclasses`: the number of output classes.
 """
-function inceptionresnetv2(; inchannels = 3, dropout = 0.0, nclasses = 1000)
+function inceptionresnetv2(; inchannels = 3, drop_rate = 0.0, nclasses = 1000)
     body = Chain(conv_bn((3, 3), inchannels, 32; stride = 2)...,
                  conv_bn((3, 3), 32, 32)...,
                  conv_bn((3, 3), 32, 64; pad = 1)...,
@@ -446,12 +447,13 @@ function inceptionresnetv2(; inchannels = 3, dropout = 0.0, nclasses = 1000)
                  [block8(0.20f0) for _ in 1:9]...,
                  block8(; activation = relu),
                  conv_bn((1, 1), 2080, 1536)...)
-    head = Chain(GlobalMeanPool(), MLUtils.flatten, Dropout(dropout), Dense(1536, nclasses))
+    head = Chain(GlobalMeanPool(), MLUtils.flatten, Dropout(drop_rate),
+                 Dense(1536, nclasses))
     return Chain(body, head)
 end
 
 """
-    InceptionResNetv2(; pretrain = false, inchannels = 3, dropout = 0.0, nclasses = 1000)
+    InceptionResNetv2(; pretrain = false, inchannels = 3, drop_rate = 0.0, nclasses = 1000)
 
 Creates an InceptionResNetv2 model.
 ([reference](https://arxiv.org/abs/1602.07261))
@@ -460,7 +462,7 @@ Creates an InceptionResNetv2 model.
 
   - `pretrain`: set to `true` to load the pre-trained weights for ImageNet
   - `inchannels`: number of input channels.
-  - `dropout`: rate of dropout in classifier head.
+  - `drop_rate`: rate of dropout in classifier head.
   - `nclasses`: the number of output classes.
 
 !!! warning
@@ -471,9 +473,9 @@ struct InceptionResNetv2
     layers::Any
 end
 
-function InceptionResNetv2(; pretrain = false, inchannels = 3, dropout = 0.0,
+function InceptionResNetv2(; pretrain = false, inchannels = 3, drop_rate = 0.0,
                            nclasses = 1000)
-    layers = inceptionresnetv2(; inchannels, dropout, nclasses)
+    layers = inceptionresnetv2(; inchannels, drop_rate, nclasses)
     pretrain && loadpretrain!(layers, "InceptionResNetv2")
     return InceptionResNetv2(layers)
 end
@@ -533,18 +535,18 @@ function xception_block(inchannels, outchannels, nrepeats; stride = 1,
 end
 
 """
-    xception(; inchannels = 3, dropout = 0.0, nclasses = 1000)
+    xception(; inchannels = 3, drop_rate = 0.0, nclasses = 1000)
 
 Creates an Xception model.
 ([reference](https://arxiv.org/abs/1610.02357))
 
 # Arguments
 
   - `inchannels`: number of input channels.
-  - `dropout`: rate of dropout in classifier head.
+  - `drop_rate`: rate of dropout in classifier head.
   - `nclasses`: the number of output classes.
 """
-function xception(; inchannels = 3, dropout = 0.0, nclasses = 1000)
+function xception(; inchannels = 3, drop_rate = 0.0, nclasses = 1000)
     body = Chain(conv_bn((3, 3), inchannels, 32; stride = 2, bias = false)...,
                  conv_bn((3, 3), 32, 64; bias = false)...,
                  xception_block(64, 128, 2; stride = 2, start_with_relu = false),
@@ -554,7 +556,8 @@ function xception(; inchannels = 3, dropout = 0.0, nclasses = 1000)
                  xception_block(728, 1024, 2; stride = 2, grow_at_start = false),
                  depthwise_sep_conv_bn((3, 3), 1024, 1536; pad = 1)...,
                  depthwise_sep_conv_bn((3, 3), 1536, 2048; pad = 1)...)
-    head = Chain(GlobalMeanPool(), MLUtils.flatten, Dropout(dropout), Dense(2048, nclasses))
+    head = Chain(GlobalMeanPool(), MLUtils.flatten, Dropout(drop_rate),
+                 Dense(2048, nclasses))
     return Chain(body, head)
 end
 
@@ -563,7 +566,7 @@ struct Xception
 end
 
 """
-    Xception(; pretrain = false, inchannels = 3, dropout = 0.0, nclasses = 1000)
+    Xception(; pretrain = false, inchannels = 3, drop_rate = 0.0, nclasses = 1000)
 
 Creates an Xception model.
 ([reference](https://arxiv.org/abs/1610.02357))
@@ -572,15 +575,15 @@ Creates an Xception model.
 
   - `pretrain`: set to `true` to load the pre-trained weights for ImageNet.
   - `inchannels`: number of input channels.
-  - `dropout`: rate of dropout in classifier head.
+  - `drop_rate`: rate of dropout in classifier head.
   - `nclasses`: the number of output classes.
 
 !!! warning
     
     `Xception` does not currently support pretrained weights.
 """
-function Xception(; pretrain = false, inchannels = 3, dropout = 0.0, nclasses = 1000)
-    layers = xception(; inchannels, dropout, nclasses)
+function Xception(; pretrain = false, inchannels = 3, drop_rate = 0.0, nclasses = 1000)
+    layers = xception(; inchannels, drop_rate, nclasses)
     pretrain && loadpretrain!(layers, "xception")
     return Xception(layers)
 end
 
@@ -1,9 +1,42 @@
+function drop_blocks(drop_prob = 0.0)
+    # Four entries, consumed one per stage by `make_blocks`: the first two stages are
+    # left untouched (`identity`), the last two each get a `DropBlock` from `drop_prob`.
+    # NOTE(review): the (5, 0.25) / (3, 1.00) arguments are presumably block size and a
+    # rate scale; confirm against the `DropBlock` constructor.
+    late_stages = [DropBlock(drop_prob, args...) for args in ((5, 0.25), (3, 1.00))]
+    return [identity, identity, late_stages...]
+end
+
+function downsample_conv(kernel_size, in_channels, out_channels; stride = 1, dilation = 1,
+                         first_dilation = nothing, norm_layer = BatchNorm)
+    # Conv + norm shortcut. Without stride or dilation the kernel is forced to 1x1; it must
+    # be a tuple because `Conv` takes its kernel size as a tuple, not a bare integer.
+    kernel_size = stride == 1 && dilation == 1 ? (1, 1) : kernel_size
+    first_dilation = kernel_size[1] > 1 ? something(first_dilation, dilation) : 1
+    pad = ((stride - 1) + dilation * (kernel_size[1] - 1)) ÷ 2
+    return Chain(Conv(kernel_size, in_channels => out_channels; stride, pad,
+                      dilation = first_dilation, bias = false), norm_layer(out_channels))
+end
+
+function downsample_avg(kernel_size, in_channels, out_channels; stride = 1, dilation = 1,
+                        first_dilation = nothing, norm_layer = BatchNorm)
+    # Shortcut branch: optional 2x2 average pooling, then a bias-free 1x1 convolution and
+    # a normalisation layer. `kernel_size` and `first_dilation` are accepted but unused.
+    # NOTE(review): `avg_pool_fn` is not defined in the visible code; confirm it exists.
+    pool_stride = dilation == 1 ? stride : 1
+    pool_pad = pool_stride == 1 && dilation > 1 ? SamePad() : 0
+    # no pooling is needed when the shortcut neither strides nor dilates
+    pool = stride == 1 && dilation == 1 ? identity :
+           avg_pool_fn((2, 2); stride = pool_stride, pad = pool_pad)
+    projection = Conv((1, 1), in_channels => out_channels; bias = false)
+    return Chain(pool, projection, norm_layer(out_channels))
+end
+
 function basicblock(inplanes, planes; stride = 1, downsample = identity, cardinality = 1,
-                    base_width = 64,
-                    reduce_first = 1, dilation = 1, first_dilation = nothing,
-                    act_layer = relu, norm_layer = BatchNorm,
+                    base_width = 64, reduce_first = 1, dilation = 1,
+                    first_dilation = nothing, activation = relu, norm_layer = BatchNorm,
                     drop_block = identity, drop_path = identity)
-    expansion = 1
+    expansion = expansion_factor(basicblock)
     @assert cardinality==1 "BasicBlock only supports cardinality of 1"
     @assert base_width==64 "BasicBlock does not support changing base width"
     first_planes = planes ÷ reduce_first
@@ -17,16 +50,16 @@ function basicblock(inplanes, planes; stride = 1, downsample = identity, cardina
                           dilation = dilation, bias = false),
                      norm_layer(outplanes))
     return Chain(Parallel(+, downsample,
-                          Chain(conv_bn1, drop_block, act_layer, conv_bn2, drop_path)),
-                 act_layer)
+                          Chain(conv_bn1, drop_block, activation, conv_bn2, drop_path)),
+                 activation)
 end
+expansion_factor(::typeof(basicblock)) = 1  # channel multiplier; callers use `planes * expansion`
 
 function bottleneck(inplanes, planes; stride = 1, downsample = identity, cardinality = 1,
-                    base_width = 64,
-                    reduce_first = 1, dilation = 1, first_dilation = nothing,
-                    act_layer = relu, norm_layer = BatchNorm,
+                    base_width = 64, reduce_first = 1, dilation = 1,
+                    first_dilation = nothing, activation = relu, norm_layer = BatchNorm,
                     drop_block = identity, drop_path = identity)
-    expansion = 4
+    expansion = expansion_factor(bottleneck)
     width = floor(Int, planes * (base_width / 64)) * cardinality
     first_planes = width ÷ reduce_first
     outplanes = planes * expansion
@@ -39,62 +72,33 @@ function bottleneck(inplanes, planes; stride = 1, downsample = identity, cardina
     drop_block = drop_block === identity ? identity : drop_block()
     conv_bn3 = Chain(Conv((1, 1), width => outplanes; bias = false), norm_layer(outplanes))
     return Chain(Parallel(+, downsample,
-                          Chain(conv_bn1, drop_block, act_layer, conv_bn2, drop_block,
-                                act_layer, conv_bn3, drop_path)),
-                 act_layer)
-end
-
-function drop_blocks(drop_prob = 0.0)
-    return [identity, identity,
-        drop_prob == 0.0 ? DropBlock(drop_prob, 5, 0.25) : identity,
-        drop_prob == 0.0 ? DropBlock(drop_prob, 3, 1.00) : identity]
+                          Chain(conv_bn1, drop_block, activation, conv_bn2, drop_block,
+                                activation, conv_bn3, drop_path)),
+                 activation)
 end
+expansion_factor(::typeof(bottleneck)) = 4  # channel multiplier; `outplanes = planes * expansion`
 
-function downsample_conv(kernel_size, in_channels, out_channels; stride = 1, dilation = 1,
-                         first_dilation = nothing, norm_layer = BatchNorm)
-    kernel_size = stride == 1 && dilation == 1 ? 1 : kernel_size
-    first_dilation = kernel_size[1] > 1 ?
-                     (!isnothing(first_dilation) ? first_dilation : dilation) : 1
-    pad = ((stride - 1) + dilation * (kernel_size[1] - 1)) ÷ 2
-    return Chain(Conv(kernel_size, in_channels => out_channels; stride, pad,
-                      dilation = first_dilation, bias = false),
-                 norm_layer(out_channels))
-end
-
-function downsample_avg(kernel_size, in_channels, out_channels; stride = 1, dilation = 1,
-                        first_dilation = nothing, norm_layer = BatchNorm)
-    avg_stride = dilation == 1 ? stride : 1
-    if stride == 1 && dilation == 1
-        pool = identity
-    else
-        pad = avg_stride == 1 && dilation > 1 ? SamePad() : 0
-        pool = avg_pool_fn((2, 2); stride = avg_stride, pad)
-    end
-
-    return Chain(pool,
-                 Conv((1, 1), in_channels => out_channels; stride = 1, pad = 0,
-                      bias = false),
-                 norm_layer(out_channels))
-end
-
-function make_blocks(block_fn, channels, block_repeats, inplanes; expansion = 1,
-                     reduce_first = 1, output_stride = 32,
-                     down_kernel_size = 1, avg_down = false, drop_block_rate = 0.0,
-                     drop_path_rate = 0.0, kwargs...)
+function make_blocks(block_fn, channels, block_repeats, inplanes;
+                     reduce_first = 1, output_stride = 32, down_kernel_size = 1,
+                     avg_down = false, drop_block_rate = 0.0, drop_path_rate = 0.0,
+                     kwargs...)
+    expansion = expansion_factor(block_fn)
     kwarg_dict = Dict(kwargs...)
     stages = []
     net_block_idx = 1
     net_stride = 4
     dilation = prev_dilation = 1
-    for (stage_idx, (planes, num_blocks, db)) in enumerate(zip(channels, block_repeats,
-                                                               drop_blocks(drop_block_rate)))
+    for (stage_idx, (planes, num_blocks, drop_block)) in enumerate(zip(channels,
+                                                                       block_repeats,
+                                                                       drop_blocks(drop_block_rate)))
         stride = stage_idx == 1 ? 1 : 2
         if net_stride >= output_stride
             dilation *= stride
             stride = 1
         else
             net_stride *= stride
         end
+        # first block needs to be handled differently for downsampling
         downsample = identity
         if stride != 1 || inplanes != planes * expansion
             downsample = avg_down ?
@@ -106,7 +110,7 @@ function make_blocks(block_fn, channels, block_repeats, inplanes; expansion = 1,
                                          norm_layer = kwarg_dict[:norm_layer])
         end
         block_kwargs = Dict(:reduce_first => reduce_first, :dilation => dilation,
-                            :drop_block => db, kwargs...)
+                            :drop_block => drop_block, kwargs...)
         blocks = []
         for block_idx in 1:num_blocks
             downsample = block_idx == 1 ? downsample : identity
@@ -127,15 +131,13 @@ function make_blocks(block_fn, channels, block_repeats, inplanes; expansion = 1,
 end
 
 function resnet(block, layers; num_classes = 1000, inchannels = 3, output_stride = 32,
-                expansion = 1,
                 cardinality = 1, base_width = 64, stem_width = 64, stem_type = :default,
-                replace_stem_pool = false, reduce_first = 1,
-                down_kernel_size = (1, 1), avg_down = false, act_layer = relu,
-                norm_layer = BatchNorm,
+                replace_stem_pool = false, reduce_first = 1, down_kernel_size = (1, 1),
+                avg_down = false, activation = relu, norm_layer = BatchNorm,
                 drop_rate = 0.0, drop_path_rate = 0.0, drop_block_rate = 0.0,
                 block_kwargs...)
-    @assert output_stride in (8, 16, 32)
-    @assert stem_type in [:default, :deep, :deep_tiered]
+    @assert output_stride in (8, 16, 32) "Invalid `output_stride`. Must be one of (8, 16, 32)"
+    @assert stem_type in [:default, :deep, :deep_tiered] "Stem type must be one of [:default, :deep, :deep_tiered]"
     # Stem
     inplanes = stem_type == :deep ? stem_width * 2 : 64
     if stem_type == :deep
@@ -145,38 +147,32 @@ function resnet(block, layers; num_classes = 1000, inchannels = 3, output_stride
         end
         conv1 = Chain(Conv((3, 3), inchannels => stem_channels[0]; stride = 2, pad = 1,
                            bias = false),
-                      norm_layer(stem_channels[1]),
-                      act_layer(),
-                      Conv((3, 3), stem_channels[1] => stem_channels[1]; stride = 1,
-                           pad = 1, bias = false),
-                      norm_layer(stem_channels[2]),
-                      act_layer(),
-                      Conv((3, 3), stem_channels[2] => inplanes; stride = 1, pad = 1,
-                           bias = false))
+                      norm_layer(stem_channels[1], activation),
+                      Conv((3, 3), stem_channels[1] => stem_channels[1]; pad = 1,
+                           bias = false),
+                      norm_layer(stem_channels[2], activation),
+                      Conv((3, 3), stem_channels[2] => inplanes; pad = 1, bias = false))
     else
         conv1 = Conv((7, 7), inchannels => inplanes; stride = 2, pad = 3, bias = false)
     end
-    bn1 = norm_layer(inplanes)
-    act1 = act_layer
+    bn1 = norm_layer(inplanes, activation)
     # Stem pooling
     if replace_stem_pool
         stempool = Chain(Conv((3, 3), inplanes => inplanes; stride = 2, pad = 1,
                               bias = false),
-                         norm_layer(inplanes),
-                         act_layer)
+                         norm_layer(inplanes, activation))
     else
         stempool = MaxPool((3, 3); stride = 2, pad = 1)
     end
-    stem = Chain(conv1, bn1, act1, stempool)
-
+    stem = Chain(conv1, bn1, stempool)
     # Feature Blocks
     channels = [64, 128, 256, 512]
     stage_blocks = make_blocks(block, channels, layers, inplanes; cardinality, base_width,
                                output_stride, reduce_first, avg_down,
-                               down_kernel_size, act_layer, norm_layer,
+                               down_kernel_size, activation, norm_layer,
                                drop_block_rate, drop_path_rate, block_kwargs...)
-
     # Head (Pooling and Classifier)
+    expansion = expansion_factor(block)
     num_features = 512 * expansion
     classifier = Chain(GlobalMeanPool(), Dropout(drop_rate), MLUtils.flatten,
                        Dense(num_features, num_classes))