From e0ecd2ed93794d089cb9fd4f6f6f45606ad09e69 Mon Sep 17 00:00:00 2001
From: Le-Zheng <30695225+Le-Zheng@users.noreply.github.com>
Date: Mon, 6 Dec 2021 01:18:27 +0000
Subject: [PATCH] add Dllib python style check (#3654)

---
 .github/workflows/style-check.yml | 3 +
 python/dllib/dev/lint-python | 2 +-
 python/dllib/dev/pep8-1.7.0.py | 2 +-
 .../dllib/src/bigdl/dllib/contrib/__init__.py | 1 -
 .../dllib/contrib/onnx/converter_utils.py | 77 +-
 .../bigdl/dllib/contrib/onnx/onnx_loader.py | 6 +-
 .../bigdl/dllib/contrib/onnx/ops_converter.py | 387 +++----
 .../bigdl/dllib/contrib/onnx/ops_mapping.py | 26 +-
 .../src/bigdl/dllib/estimator/estimator.py | 1 +
 .../src/bigdl/dllib/feature/dataset/base.py | 6 +-
 .../bigdl/dllib/feature/dataset/dataset.py | 11 +-
 .../src/bigdl/dllib/feature/dataset/mnist.py | 8 +-
 .../bigdl/dllib/feature/dataset/movielens.py | 20 +-
 .../bigdl/dllib/feature/dataset/sentence.py | 4 +
 .../dllib/feature/transform/vision/image.py | 169 +++-
 .../src/bigdl/dllib/keras/ToBigDLHelper.py | 2 +-
 .../dllib/src/bigdl/dllib/keras/converter.py | 377 ++++---
 .../src/bigdl/dllib/keras/layers/torch.py | 35 +-
 .../src/bigdl/dllib/keras/layers/wrappers.py | 4 +
 .../bigdl/dllib/models/inception/__init__.py | 1 -
 .../bigdl/dllib/models/inception/inception.py | 87 +-
 .../src/bigdl/dllib/models/lenet/lenet5.py | 18 +-
 .../dllib/models/local_lenet/__init__.py | 1 -
 .../dllib/models/ml_pipeline/__init__.py | 1 -
 .../dllib/models/ml_pipeline/dl_classifier.py | 24 +-
 .../src/bigdl/dllib/models/rnn/__init__.py | 1 -
 .../src/bigdl/dllib/models/rnn/rnnexample.py | 30 +-
 .../dllib/models/textclassifier/__init__.py | 1 -
 .../models/textclassifier/textclassifier.py | 20 +-
 .../src/bigdl/dllib/models/utils/__init__.py | 1 -
 .../dllib/models/utils/model_broadcast.py | 3 +
 python/dllib/src/bigdl/dllib/nn/criterion.py | 79 +-
 .../bigdl/dllib/nn/initialization_method.py | 21 +-
 .../dllib/src/bigdl/dllib/nn/keras/backend.py | 3 +-
 .../src/bigdl/dllib/nn/keras/layers/layer.py | 444 +++++---
 .../bigdl/dllib/nn/keras/layers/topology.py | 9 +-
 .../src/bigdl/dllib/nn/keras/optimization.py | 3 +-
 python/dllib/src/bigdl/dllib/nn/layer.py | 949 ++++++++++--------
 python/dllib/src/bigdl/dllib/nn/onnx/layer.py | 10 +-
 python/dllib/src/bigdl/dllib/nncontext.py | 70 +-
 .../dllib/src/bigdl/dllib/optim/optimizer.py | 174 +++-
 .../src/bigdl/dllib/utils/bigdl_export.py | 30 +-
 python/dllib/src/bigdl/dllib/utils/common.py | 51 +-
 python/dllib/src/bigdl/dllib/utils/engine.py | 14 +-
 python/dllib/src/bigdl/dllib/utils/spark.py | 43 +-
 python/dllib/src/bigdl/dllib/utils/tf.py | 4 +-
 .../dllib/src/bigdl/dllib/utils/tf_utils.py | 13 +-
 python/dllib/src/bigdl/dllib/utils/utils.py | 8 +-
 .../dllib/src/bigdl/dllib/utils/zoo_engine.py | 2 +-
 python/dllib/src/setup.py | 11 +-
 50 files changed, 1914 insertions(+), 1353 deletions(-)

diff --git a/.github/workflows/style-check.yml b/.github/workflows/style-check.yml
index ff85ce4127e..616d13dbc2b 100644
--- a/.github/workflows/style-check.yml
+++ b/.github/workflows/style-check.yml
@@ -38,6 +38,9 @@ jobs:
         run: bash python/nano/test/run-nano-codestyle-test.sh
         env:
           ANALYTICS_ZOO_ROOT: ${{ github.workspace }}
+
+      - name: Dllib style checking
+        run: bash python/dllib/dev/lint-python
 
       - name: Orca style checking
         run: bash python/orca/dev/test/lint-python
diff --git a/python/dllib/dev/lint-python b/python/dllib/dev/lint-python
index 7d0c81dde2e..ea78744b388 100755
--- a/python/dllib/dev/lint-python
+++ b/python/dllib/dev/lint-python
@@ -20,7 +20,7 @@
 SCRIPT_DIR="$( cd "$( dirname "$0" )" && pwd )"
PYTHON_ROOT_DIR="$SCRIPT_DIR/.." echo $PYTHON_ROOT_DIR -PATHS_TO_CHECK="." +PATHS_TO_CHECK="$SCRIPT_DIR/../src" PEP8_REPORT_PATH="$PYTHON_ROOT_DIR/dev/pep8-report.txt" PYLINT_REPORT_PATH="$PYTHON_ROOT_DIR/dev/pylint-report.txt" PYLINT_INSTALL_INFO="$PYTHON_ROOT_DIR/dev/pylint-info.txt" diff --git a/python/dllib/dev/pep8-1.7.0.py b/python/dllib/dev/pep8-1.7.0.py index 4d9ad36ef4a..a608ff5a562 100644 --- a/python/dllib/dev/pep8-1.7.0.py +++ b/python/dllib/dev/pep8-1.7.0.py @@ -106,7 +106,7 @@ RERAISE_COMMA_REGEX = re.compile(r'raise\s+\w+\s*,.*,\s*\w+\s*$') ERRORCODE_REGEX = re.compile(r'\b[A-Z]\d{3}\b') DOCSTRING_REGEX = re.compile(r'u?r?["\']') -EXTRANEOUS_WHITESPACE_REGEX = re.compile(r'[[({] | []}),;:]') +EXTRANEOUS_WHITESPACE_REGEX = re.compile(r'[\[({] | [\]}),;:]') WHITESPACE_AFTER_COMMA_REGEX = re.compile(r'[,;:]\s*(?: |\t)') COMPARE_SINGLETON_REGEX = re.compile(r'(\bNone|\bFalse|\bTrue)?\s*([=!]=)' r'\s*(?(1)|(None|False|True))\b') diff --git a/python/dllib/src/bigdl/dllib/contrib/__init__.py b/python/dllib/src/bigdl/dllib/contrib/__init__.py index 1d7a97cae5e..2151a805423 100644 --- a/python/dllib/src/bigdl/dllib/contrib/__init__.py +++ b/python/dllib/src/bigdl/dllib/contrib/__init__.py @@ -13,4 +13,3 @@ # See the License for the specific language governing permissions and # limitations under the License. # - diff --git a/python/dllib/src/bigdl/dllib/contrib/onnx/converter_utils.py b/python/dllib/src/bigdl/dllib/contrib/onnx/converter_utils.py index e7d1417200a..2edb91a1727 100644 --- a/python/dllib/src/bigdl/dllib/contrib/onnx/converter_utils.py +++ b/python/dllib/src/bigdl/dllib/contrib/onnx/converter_utils.py @@ -19,52 +19,53 @@ def calc_output_shape(input, kernel, padding=0, stride=1, dilation=1, ceil_mode=False): - def dilated_kernel_size(kernel, dilation): - return kernel + (kernel - 1) * (dilation - 1) - rounding = math.ceil if ceil_mode else math.floor - out = (input + 2 * padding - dilated_kernel_size(kernel, dilation)) / stride + 1 - out = int(rounding(out)) - return out + def dilated_kernel_size(kernel, dilation): + return kernel + (kernel - 1) * (dilation - 1) + + rounding = math.ceil if ceil_mode else math.floor + out = (input + 2 * padding - dilated_kernel_size(kernel, dilation)) / stride + 1 + out = int(rounding(out)) + return out def parse_node_attr(node_proto): - attrs = {} - attr_proto = node_proto.attribute + attrs = {} + attr_proto = node_proto.attribute - for attr in attr_proto: - for field in ['f', 'i', 's']: - if attr.HasField(field): - attrs[attr.name] = getattr(attr, field) + for attr in attr_proto: + for field in ['f', 'i', 's']: + if attr.HasField(field): + attrs[attr.name] = getattr(attr, field) - # Needed for supporting python version > 3.5 - if isinstance(attrs[attr.name], bytes): - attrs[attr.name] = attrs[attr.name].decode(encoding='utf-8') + # Needed for supporting python version > 3.5 + if isinstance(attrs[attr.name], bytes): + attrs[attr.name] = attrs[attr.name].decode(encoding='utf-8') - for field in ['floats', 'ints', 'strings']: - if list(getattr(attr, field)): - assert attr.name not in attrs, "Only one type of attr is allowed" - attrs[attr.name] = tuple(getattr(attr, field)) + for field in ['floats', 'ints', 'strings']: + if list(getattr(attr, field)): + assert attr.name not in attrs, "Only one type of attr is allowed" + attrs[attr.name] = tuple(getattr(attr, field)) - for field in ['t', 'g']: - if attr.HasField(field): - attrs[attr.name] = getattr(attr, field) - for field in ['tensors', 'graphs']: - if list(getattr(attr, field)): - raise 
NotImplementedError() - if attr.name not in attrs: - raise ValueError("Cannot parse attribute: \n{}\n.".format(attr)) + for field in ['t', 'g']: + if attr.HasField(field): + attrs[attr.name] = getattr(attr, field) + for field in ['tensors', 'graphs']: + if list(getattr(attr, field)): + raise NotImplementedError() + if attr.name not in attrs: + raise ValueError("Cannot parse attribute: \n{}\n.".format(attr)) - return attrs + return attrs def parse_tensor_data(tensor_proto): - try: - from onnx.numpy_helper import to_array - except ImportError: - raise ImportError("Onnx and protobuf need to be installed.") - if len(tuple(tensor_proto.dims)) > 0: - np_array = to_array(tensor_proto).reshape(tuple(tensor_proto.dims)) - else: - # If it is a scalar tensor - np_array = np.array([to_array(tensor_proto)]) - return np_array + try: + from onnx.numpy_helper import to_array + except ImportError: + raise ImportError("Onnx and protobuf need to be installed.") + if len(tuple(tensor_proto.dims)) > 0: + np_array = to_array(tensor_proto).reshape(tuple(tensor_proto.dims)) + else: + # If it is a scalar tensor + np_array = np.array([to_array(tensor_proto)]) + return np_array diff --git a/python/dllib/src/bigdl/dllib/contrib/onnx/onnx_loader.py b/python/dllib/src/bigdl/dllib/contrib/onnx/onnx_loader.py index b4b29d6e172..24611c5b7c5 100644 --- a/python/dllib/src/bigdl/dllib/contrib/onnx/onnx_loader.py +++ b/python/dllib/src/bigdl/dllib/contrib/onnx/onnx_loader.py @@ -77,7 +77,9 @@ def load_graph(self, graph_proto): root_nodes.append((name, op_type)) prev_modules = [dummy_root] - bigdl_module, outputs_shape = self._make_module_from_onnx_node(op_type, inputs, prev_modules, attrs, outputs) + bigdl_module, outputs_shape = self._make_module_from_onnx_node(op_type, inputs, + prev_modules, attrs, + outputs) assert len(outputs) == len(outputs_shape) @@ -108,4 +110,4 @@ def load(model_path): def load_model_proto(model_proto): loader = OnnxLoader() - return loader.load_graph(model_proto.graph) \ No newline at end of file + return loader.load_graph(model_proto.graph) diff --git a/python/dllib/src/bigdl/dllib/contrib/onnx/ops_converter.py b/python/dllib/src/bigdl/dllib/contrib/onnx/ops_converter.py index a2acce3b832..59e118ea53a 100644 --- a/python/dllib/src/bigdl/dllib/contrib/onnx/ops_converter.py +++ b/python/dllib/src/bigdl/dllib/contrib/onnx/ops_converter.py @@ -22,235 +22,240 @@ def average_pool(inputs, prev_modules, attrs, outputs): - # extract attributes - auto_pad = attrs.get('auto_pad', 'NOTSET') - ceil_mode = True if attrs.get('ceil_mode', 0) == 1 else False - count_include_pad = True if attrs.get('count_include_pad', 0) == 1 else False - kernel_width, kernel_height = map(int, attrs.get('kernel_shape', (1, 1))[:2]) - stride_width, stride_height = map(int, attrs.get('strides', (1, 1))[:2]) - padding_width, padding_height = map(int, attrs.get('pads', (0, 0))[:2]) - # extract inputs - _, data_tensor_shape = inputs[0] - # calc output tensor shape - input_height, input_width = data_tensor_shape[-2:] - output_height = calc_output_shape(input_height, kernel_height, - padding = padding_height, stride = stride_height, ceil_mode = ceil_mode) - output_width = calc_output_shape(input_width, kernel_width, - padding = padding_width, stride = stride_width, ceil_mode = ceil_mode) - out_tensor_shape = list(data_tensor_shape) - out_tensor_shape[-2] = output_height - out_tensor_shape[-1] = output_width - out_tensor_shape = tuple(out_tensor_shape) - # create module node - module = SpatialAveragePooling(kw=kernel_width, 
kh=kernel_height, - dw=stride_width, dh=stride_height, - pad_w=padding_width, pad_h=padding_height, - ceil_mode=ceil_mode, count_include_pad=count_include_pad - )(prev_modules) - return module, [out_tensor_shape] + # extract attributes + auto_pad = attrs.get('auto_pad', 'NOTSET') + ceil_mode = True if attrs.get('ceil_mode', 0) == 1 else False + count_include_pad = True if attrs.get('count_include_pad', 0) == 1 else False + kernel_width, kernel_height = map(int, attrs.get('kernel_shape', (1, 1))[:2]) + stride_width, stride_height = map(int, attrs.get('strides', (1, 1))[:2]) + padding_width, padding_height = map(int, attrs.get('pads', (0, 0))[:2]) + # extract inputs + _, data_tensor_shape = inputs[0] + # calc output tensor shape + input_height, input_width = data_tensor_shape[-2:] + output_height = calc_output_shape(input_height, kernel_height, + padding=padding_height, stride=stride_height, + ceil_mode=ceil_mode) + output_width = calc_output_shape(input_width, kernel_width, + padding=padding_width, stride=stride_width, + ceil_mode=ceil_mode) + out_tensor_shape = list(data_tensor_shape) + out_tensor_shape[-2] = output_height + out_tensor_shape[-1] = output_width + out_tensor_shape = tuple(out_tensor_shape) + # create module node + module = SpatialAveragePooling(kw=kernel_width, kh=kernel_height, + dw=stride_width, dh=stride_height, + pad_w=padding_width, pad_h=padding_height, + ceil_mode=ceil_mode, count_include_pad=count_include_pad + )(prev_modules) + return module, [out_tensor_shape] def batch_norm(inputs, prev_modules, attrs, outputs): - # extract attributes - epsilon = float(attrs.get('epsilon', 1e-05)) - momentum = float(attrs.get('momentum', 0.9)) - # extract inputs - _, data_tensor_shape = inputs[0] - scale_tensor_val, _ = inputs[1] - bias_tensor_val, _ = inputs[2] - mean_tensor_val, _ = inputs[3] - var_tensor_val, _ = inputs[4] - # calc output tensor shape - out_tensor_shape = data_tensor_shape - # create module node - n_output = int(data_tensor_shape[1]) + # extract attributes + epsilon = float(attrs.get('epsilon', 1e-05)) + momentum = float(attrs.get('momentum', 0.9)) + # extract inputs + _, data_tensor_shape = inputs[0] + scale_tensor_val, _ = inputs[1] + bias_tensor_val, _ = inputs[2] + mean_tensor_val, _ = inputs[3] + var_tensor_val, _ = inputs[4] + # calc output tensor shape + out_tensor_shape = data_tensor_shape + # create module node + n_output = int(data_tensor_shape[1]) - temp_module = SpatialBatchNormalization(n_output=n_output, eps=epsilon, - momentum=momentum, init_weight=scale_tensor_val, init_bias=bias_tensor_val) - if mean_tensor_val is not None: - temp_module.set_running_mean(mean_tensor_val) - if var_tensor_val is not None: - temp_module.set_running_std(var_tensor_val) - module = temp_module(prev_modules[0]) - return module, [out_tensor_shape] + temp_module = SpatialBatchNormalization(n_output=n_output, eps=epsilon, + momentum=momentum, init_weight=scale_tensor_val, + init_bias=bias_tensor_val) + if mean_tensor_val is not None: + temp_module.set_running_mean(mean_tensor_val) + if var_tensor_val is not None: + temp_module.set_running_std(var_tensor_val) + module = temp_module(prev_modules[0]) + return module, [out_tensor_shape] def concat(inputs, prev_modules, attrs, outputs): - # extract attributes - axis = int(attrs.get('axis')) - # extract inputs - _, data_tensor_shape = inputs[0] - # calc output tensor shape - dim_rank = 0 - for i in range(len(inputs)): - _, curr_input_shape = inputs[i] - for j in range(len(data_tensor_shape)): - if axis != j: - if 
curr_input_shape[i] != data_tensor_shape[i]: - raise ValueError("Input shape mismatch. Expect receive input shape " + - data_tensor_shape[i] + " but got " + curr_input_shape[i]) - else: - dim_rank += curr_input_shape[axis] - out_tensor_shape = list(data_tensor_shape) - out_tensor_shape[axis] = dim_rank - out_tensor_shape = tuple(out_tensor_shape) - # create module node - module = JoinTable(dimension=axis+1, n_input_dims=len(data_tensor_shape))(prev_modules) - return module, [out_tensor_shape] + # extract attributes + axis = int(attrs.get('axis')) + # extract inputs + _, data_tensor_shape = inputs[0] + # calc output tensor shape + dim_rank = 0 + for i in range(len(inputs)): + _, curr_input_shape = inputs[i] + for j in range(len(data_tensor_shape)): + if axis != j: + if curr_input_shape[i] != data_tensor_shape[i]: + raise ValueError("Input shape mismatch. Expect receive input shape " + + data_tensor_shape[i] + " but got " + curr_input_shape[i]) + else: + dim_rank += curr_input_shape[axis] + out_tensor_shape = list(data_tensor_shape) + out_tensor_shape[axis] = dim_rank + out_tensor_shape = tuple(out_tensor_shape) + # create module node + module = JoinTable(dimension=axis + 1, n_input_dims=len(data_tensor_shape))(prev_modules) + return module, [out_tensor_shape] def constant(inputs, prev_modules, attrs, outputs): - # extract attributes - value = parse_tensor_data(attrs.get('value')) + # extract attributes + value = parse_tensor_data(attrs.get('value')) - # calc output tensor shape - out_tensor_shape = value.shape - # create module node - module = Constant(value)(prev_modules[0]) - return module, [out_tensor_shape] + # calc output tensor shape + out_tensor_shape = value.shape + # create module node + module = Constant(value)(prev_modules[0]) + return module, [out_tensor_shape] def conv(inputs, prev_modules, attrs, outputs): - # extract attributes - auto_pad = attrs.get('auto_pad', 'NOTSET') - padW, padH = map(int, attrs.get('pads', (0, 0))[:2]) - kernelW, kernelH = map(int, attrs.get('kernel_shape', (0, 0))[:2]) - strideW, strideH = map(int, attrs.get('strides', (1, 1))[:2]) - dilationW, dilationH = map(int, attrs.get('dilations', (1, 1))[:2]) - group = int(attrs.get('group', 1)) - withBias = len(inputs) == 3 and inputs[2] is not None - # extract inputs - data_tensor_val, data_tensor_shape = inputs[0] - weight_tensor_val, weight_tensor_shape = inputs[1] - bias_tensor_val = None - if withBias: - bias_tensor_val, _ = inputs[2] - # calc output tensor shape - input_batch_size, n_input_plane = map(int, data_tensor_shape[:2]) - n_output_plane = weight_tensor_shape[0] - input_height, input_width = data_tensor_shape[-2:] - output_height = calc_output_shape(input_height, kernelH, padding = padH, stride=strideH) - output_width = calc_output_shape(input_width, kernelW, padding = padW, stride=strideW) - out_tensor_shape = (input_batch_size, n_output_plane, output_height, output_width) - # create module node - module = SpatialConvolution( - n_input_plane=n_input_plane, n_output_plane=n_output_plane, - kernel_w=kernelW, kernel_h=kernelH, stride_w=strideW, stride_h=strideH, - pad_w=padW, pad_h=padH, n_group=group, init_weight=weight_tensor_val, - init_bias=bias_tensor_val, with_bias=withBias - )(prev_modules[0]) - return module, [out_tensor_shape] + # extract attributes + auto_pad = attrs.get('auto_pad', 'NOTSET') + padW, padH = map(int, attrs.get('pads', (0, 0))[:2]) + kernelW, kernelH = map(int, attrs.get('kernel_shape', (0, 0))[:2]) + strideW, strideH = map(int, attrs.get('strides', (1, 1))[:2]) + 
dilationW, dilationH = map(int, attrs.get('dilations', (1, 1))[:2]) + group = int(attrs.get('group', 1)) + withBias = len(inputs) == 3 and inputs[2] is not None + # extract inputs + data_tensor_val, data_tensor_shape = inputs[0] + weight_tensor_val, weight_tensor_shape = inputs[1] + bias_tensor_val = None + if withBias: + bias_tensor_val, _ = inputs[2] + # calc output tensor shape + input_batch_size, n_input_plane = map(int, data_tensor_shape[:2]) + n_output_plane = weight_tensor_shape[0] + input_height, input_width = data_tensor_shape[-2:] + output_height = calc_output_shape(input_height, kernelH, padding=padH, stride=strideH) + output_width = calc_output_shape(input_width, kernelW, padding=padW, stride=strideW) + out_tensor_shape = (input_batch_size, n_output_plane, output_height, output_width) + # create module node + module = SpatialConvolution( + n_input_plane=n_input_plane, n_output_plane=n_output_plane, + kernel_w=kernelW, kernel_h=kernelH, stride_w=strideW, stride_h=strideH, + pad_w=padW, pad_h=padH, n_group=group, init_weight=weight_tensor_val, + init_bias=bias_tensor_val, with_bias=withBias + )(prev_modules[0]) + return module, [out_tensor_shape] def gather(inputs, prev_modules, attrs, outputs): - # extract attributes - axis = int(attrs.get('axis', 0)) - if axis != 0: - raise ValueError("Gather layer axis value") - # extract inputs - data_tensor_val, data_tensor_shape = inputs[0] - indices_val, indices = inputs[1] - # calc output tensor shape - out_tensor_shape = tuple(data_tensor_shape[:axis] + indices + data_tensor_shape[axis + 1:]) - # create module node - module = Gather()(prev_modules) - return module, [out_tensor_shape] + # extract attributes + axis = int(attrs.get('axis', 0)) + if axis != 0: + raise ValueError("Gather layer axis value") + # extract inputs + data_tensor_val, data_tensor_shape = inputs[0] + indices_val, indices = inputs[1] + # calc output tensor shape + out_tensor_shape = tuple(data_tensor_shape[:axis] + indices + data_tensor_shape[axis + 1:]) + # create module node + module = Gather()(prev_modules) + return module, [out_tensor_shape] def gemm(inputs, prev_modules, attrs, outputs): - # extract attributes - alpha = float(attrs.get("alpha", 1.0)) - beta = float(attrs.get("beta", 1.0)) - trans_a = int(attrs.get("transA", 0)) - trans_b = int(attrs.get("transB", 0)) - # extract inputs - _, tensor_a_shape = inputs[0] - tensor_b_val, tensor_b_shape = inputs[1] - tensor_c_val, tensor_c_shape = inputs[2] - # create module node - module = Gemm(alpha=alpha, beta=beta, trans_a=trans_a, trans_b=trans_b, - matrix_b=tensor_b_val, matrix_c=tensor_c_val)(prev_modules) - return module, [tensor_c_shape] + # extract attributes + alpha = float(attrs.get("alpha", 1.0)) + beta = float(attrs.get("beta", 1.0)) + trans_a = int(attrs.get("transA", 0)) + trans_b = int(attrs.get("transB", 0)) + # extract inputs + _, tensor_a_shape = inputs[0] + tensor_b_val, tensor_b_shape = inputs[1] + tensor_c_val, tensor_c_shape = inputs[2] + # create module node + module = Gemm(alpha=alpha, beta=beta, trans_a=trans_a, trans_b=trans_b, + matrix_b=tensor_b_val, matrix_c=tensor_c_val)(prev_modules) + return module, [tensor_c_shape] def max_pool(inputs, prev_modules, attrs, outputs): - # extract attributes - auto_pad = attrs.get("auto_pad", 'NOTSET') - kernelW, kernelH = map(int, attrs.get("kernel_shape")[:2]) - strideW, strideH = map(int, attrs.get("strides", (1, 1))[:2]) - dilationW, dilationH = map(int, attrs.get('dilations', (1, 1))[:2]) - padW, padH = map(int, attrs.get("pads", (0, 0))[:2]) - 
ceil_mode = True if (attrs.get("ceil_mode", 0) != 0) else False - storage_order = int(attrs.get("storage_order", 0)) - # extract inputs - _, data_tensor_shape = inputs[0] - input_width, input_height = data_tensor_shape[-2:] - # calc output tensor shape - output_width = calc_output_shape(input_width, kernelW, - padding=padW, stride=strideW, dilation=dilationW, ceil_mode=ceil_mode) - output_height = calc_output_shape(input_height, kernelH, - padding=padH, stride=strideH, dilation=dilationH, ceil_mode=ceil_mode) - out_tensor_shape_list = list(data_tensor_shape) - out_tensor_shape_list[2] = output_height - out_tensor_shape_list[3] = output_width - out_tensor_shape = tuple(out_tensor_shape_list) - # create module node - module = SpatialMaxPooling(kw=kernelW, kh=kernelH, dw=strideW, dh=strideH, - pad_w=padW, pad_h=padH, to_ceil=ceil_mode)(prev_modules[0]) - return module, [out_tensor_shape] + # extract attributes + auto_pad = attrs.get("auto_pad", 'NOTSET') + kernelW, kernelH = map(int, attrs.get("kernel_shape")[:2]) + strideW, strideH = map(int, attrs.get("strides", (1, 1))[:2]) + dilationW, dilationH = map(int, attrs.get('dilations', (1, 1))[:2]) + padW, padH = map(int, attrs.get("pads", (0, 0))[:2]) + ceil_mode = True if (attrs.get("ceil_mode", 0) != 0) else False + storage_order = int(attrs.get("storage_order", 0)) + # extract inputs + _, data_tensor_shape = inputs[0] + input_width, input_height = data_tensor_shape[-2:] + # calc output tensor shape + output_width = calc_output_shape(input_width, kernelW, + padding=padW, stride=strideW, dilation=dilationW, + ceil_mode=ceil_mode) + output_height = calc_output_shape(input_height, kernelH, + padding=padH, stride=strideH, dilation=dilationH, + ceil_mode=ceil_mode) + out_tensor_shape_list = list(data_tensor_shape) + out_tensor_shape_list[2] = output_height + out_tensor_shape_list[3] = output_width + out_tensor_shape = tuple(out_tensor_shape_list) + # create module node + module = SpatialMaxPooling(kw=kernelW, kh=kernelH, dw=strideW, dh=strideH, + pad_w=padW, pad_h=padH, to_ceil=ceil_mode)(prev_modules[0]) + return module, [out_tensor_shape] def relu(inputs, prev_modules, attrs, outputs): - # extract inputs - _, data_tensor_shape = inputs[0] - # calc output tensor shape - output_shape = data_tensor_shape - # create module node - module = ReLU()(prev_modules[0]) - return module, [output_shape] + # extract inputs + _, data_tensor_shape = inputs[0] + # calc output tensor shape + output_shape = data_tensor_shape + # create module node + module = ReLU()(prev_modules[0]) + return module, [output_shape] def reshape(inputs, prev_modules, attrs, outputs): - # extract inputs - _, data_tensor_shape = inputs[0] - shape_tensor_val, _ = inputs[1] - shape_arry = None - if shape_tensor_val is not None: - shape_arry = np.squeeze(shape_tensor_val).astype(int).tolist() - # create module node - module = Reshape(shape_arry)(prev_modules) - return module, [shape_tensor_val] + # extract inputs + _, data_tensor_shape = inputs[0] + shape_tensor_val, _ = inputs[1] + shape_arry = None + if shape_tensor_val is not None: + shape_arry = np.squeeze(shape_tensor_val).astype(int).tolist() + # create module node + module = Reshape(shape_arry)(prev_modules) + return module, [shape_tensor_val] def shape(inputs, prev_modules, attrs, outputs): - # extract inputs - _, data_tensor_shape = inputs[0] - # create module node - module = Shape()(prev_modules[0]) - return module, [(len(data_tensor_shape),)] + # extract inputs + _, data_tensor_shape = inputs[0] + # create module node + module = 
Shape()(prev_modules[0]) + return module, [(len(data_tensor_shape),)] def softmax(inputs, prev_modules, attrs, outputs): - _, data_tensor_shape = inputs[0] - out_tensor_shape = data_tensor_shape - axis = int(attrs.get('axis', 1)) - module = SoftMax()(prev_modules[0]) - return module, [out_tensor_shape] + _, data_tensor_shape = inputs[0] + out_tensor_shape = data_tensor_shape + axis = int(attrs.get('axis', 1)) + module = SoftMax()(prev_modules[0]) + return module, [out_tensor_shape] def _sum(inputs, prev_modules, attrs, outputs): - _, data_tensor_shape = inputs[0] - out_tensor_shape = data_tensor_shape - module = CAddTable()(prev_modules) - return module, [data_tensor_shape] + _, data_tensor_shape = inputs[0] + out_tensor_shape = data_tensor_shape + module = CAddTable()(prev_modules) + return module, [data_tensor_shape] def unsqueeze(inputs, prev_modules, attrs, outputs): - axes = list(map(int, attrs.get('axes'))) - data_tensor_val, data_tensor_shape = inputs[0] - out_tensor_shape = list(data_tensor_shape) - for idx in axes: - out_tensor_shape.insert(idx, 1) - out_tensor_shape = tuple(out_tensor_shape) - module = Unsqueeze(axes[0], len(data_tensor_shape))(prev_modules) - return module, [out_tensor_shape] + axes = list(map(int, attrs.get('axes'))) + data_tensor_val, data_tensor_shape = inputs[0] + out_tensor_shape = list(data_tensor_shape) + for idx in axes: + out_tensor_shape.insert(idx, 1) + out_tensor_shape = tuple(out_tensor_shape) + module = Unsqueeze(axes[0], len(data_tensor_shape))(prev_modules) + return module, [out_tensor_shape] diff --git a/python/dllib/src/bigdl/dllib/contrib/onnx/ops_mapping.py b/python/dllib/src/bigdl/dllib/contrib/onnx/ops_mapping.py index cce517b82af..30451eb6840 100644 --- a/python/dllib/src/bigdl/dllib/contrib/onnx/ops_mapping.py +++ b/python/dllib/src/bigdl/dllib/contrib/onnx/ops_mapping.py @@ -25,7 +25,7 @@ # defined in the op_translations module. _convert_map = { # Generator Functions - 'Constant' : constant, + 'Constant': constant, # 'RandomUniform' : random_uniform, # 'RandomNormal' : random_normal, # 'RandomUniformLike' : random_uniform, @@ -38,20 +38,20 @@ # 'Div' : divide, # 'Abs' : absolute, # 'Neg' : negative, - 'Sum' : _sum, #elemwise sum + 'Sum': _sum, # elemwise sum # #Hyperbolic functions # 'Tanh' : tanh, # # Rounding # 'Ceil' : ceil, # 'Floor' : floor, # # Joining and spliting - 'Concat' : concat, + 'Concat': concat, # # Basic neural network functions # 'Sigmoid' : sigmoid, - 'Relu' : relu, + 'Relu': relu, # 'Pad' : pad, # 'MatMul' : matrix_multiplication, #linalg_gemm2 - 'Conv' : conv, + 'Conv': conv, # 'ConvTranspose' : deconv, 'BatchNormalization': batch_norm, # 'SpatialBN' : batch_norm, @@ -59,22 +59,22 @@ # 'Elu' : _elu, # 'PRelu' : _prelu, # 'Selu' : _selu, - 'Softmax' : softmax, + 'Softmax': softmax, # 'FC' : fully_connected, # 'GlobalAveragePool' : global_avgpooling, # 'GlobalMaxPool' : global_maxpooling, # 'GlobalLpPool' : global_lppooling, - 'Gemm' : gemm, + 'Gemm': gemm, # 'LRN' : local_response_norm, # 'Dropout' : dropout, # # Changing shape and type. 
- 'Reshape' : reshape, + 'Reshape': reshape, # 'Cast' : cast, # 'Split' : split, # 'Slice' : _slice, # 'Transpose' : transpose, # 'Squeeze' : squeeze, - 'Unsqueeze' : unsqueeze, + 'Unsqueeze': unsqueeze, # 'Flatten' : flatten, # 'Identity' : identity, # #Powers @@ -89,8 +89,8 @@ # 'ReduceMin' : reduce_min, # 'ReduceSum' : reduce_sum, # 'ReduceProd' : reduce_prod, - 'AveragePool' : average_pool, - 'MaxPool' : max_pool, + 'AveragePool': average_pool, + 'MaxPool': max_pool, # # Sorting and Searching # 'ArgMax' : argmax, # 'ArgMin' : argmin, @@ -121,9 +121,9 @@ # 'Sin' : _sin, # 'Softplus' : softplus, # 'Tan' : _tan, - 'Shape' : shape, + 'Shape': shape, # 'Size' : size, - 'Gather' : gather, + 'Gather': gather, # 'HardSigmoid' : hardsigmoid, # 'LpPool' : lp_pooling, # 'DepthToSpace' : depthtospace, diff --git a/python/dllib/src/bigdl/dllib/estimator/estimator.py b/python/dllib/src/bigdl/dllib/estimator/estimator.py index f95bca89a96..70d70ecb83e 100644 --- a/python/dllib/src/bigdl/dllib/estimator/estimator.py +++ b/python/dllib/src/bigdl/dllib/estimator/estimator.py @@ -18,6 +18,7 @@ from bigdl.dllib.utils.file_utils import callZooFunc + class Estimator(JavaValue): """ Estimator class for training and evaluation BigDL models. diff --git a/python/dllib/src/bigdl/dllib/feature/dataset/base.py b/python/dllib/src/bigdl/dllib/feature/dataset/base.py index 771786841c1..be6c4c77c79 100644 --- a/python/dllib/src/bigdl/dllib/feature/dataset/base.py +++ b/python/dllib/src/bigdl/dllib/feature/dataset/base.py @@ -46,9 +46,9 @@ def __init__(self, target, width=30, verbose=1, interval=0.01): def update(self, current, values=[], force=False): ''' - :param current: index of current step - :param values: list of tuples (name, value_for_last_step).The progress bar will display averages for these values. - :param force: force visual progress update + :param current: index of current step :param values: list of tuples (name, + value_for_last_step).The progress bar will display averages for these values. 
:param + force: force visual progress update ''' for k, v in values: if k not in self.sum_values: diff --git a/python/dllib/src/bigdl/dllib/feature/dataset/dataset.py b/python/dllib/src/bigdl/dllib/feature/dataset/dataset.py index 86b59f45307..146fb697826 100644 --- a/python/dllib/src/bigdl/dllib/feature/dataset/dataset.py +++ b/python/dllib/src/bigdl/dllib/feature/dataset/dataset.py @@ -24,9 +24,10 @@ long = int unicode = str + class DataSet(JavaValue): - def __init__(self, jvalue=None, image_frame = None, bigdl_type="float"): + def __init__(self, jvalue=None, image_frame=None, bigdl_type="float"): self.bigdl_type = bigdl_type if jvalue: self.value = jvalue @@ -36,11 +37,13 @@ def __init__(self, jvalue=None, image_frame = None, bigdl_type="float"): @classmethod def image_frame(cls, image_frame, bigdl_type="float"): jvalue = callBigDlFunc(bigdl_type, "createDatasetFromImageFrame", image_frame) - return DataSet(jvalue=jvalue, image_frame = image_frame) + return DataSet(jvalue=jvalue, image_frame=image_frame) def transform(self, transformer): if isinstance(transformer, FeatureTransformer): - jvalue = callBigDlFunc(self.bigdl_type, "featureTransformDataset", self.value, transformer) + jvalue = callBigDlFunc(self.bigdl_type, "featureTransformDataset", self.value, + transformer) return DataSet(jvalue=jvalue) + def get_image_frame(self): - return self.image_frame \ No newline at end of file + return self.image_frame diff --git a/python/dllib/src/bigdl/dllib/feature/dataset/mnist.py b/python/dllib/src/bigdl/dllib/feature/dataset/mnist.py index 95709530766..e1b3fefdcbd 100644 --- a/python/dllib/src/bigdl/dllib/feature/dataset/mnist.py +++ b/python/dllib/src/bigdl/dllib/feature/dataset/mnist.py @@ -84,11 +84,9 @@ def read_data_sets(train_dir, data_type="train"): :return: - ``` - (ndarray, ndarray) representing (features, labels) - features is a 4D unit8 numpy array [index, y, x, depth] representing each pixel valued from 0 to 255. - labels is 1D unit8 nunpy array representing the label valued from 0 to 9. - ``` + ``` (ndarray, ndarray) representing (features, labels) features is a 4D unit8 numpy array [ + index, y, x, depth] representing each pixel valued from 0 to 255. labels is 1D unit8 nunpy + array representing the label valued from 0 to 9. ``` """ TRAIN_IMAGES = 'train-images-idx3-ubyte.gz' diff --git a/python/dllib/src/bigdl/dllib/feature/dataset/movielens.py b/python/dllib/src/bigdl/dllib/feature/dataset/movielens.py index bf8da2c92bc..c798769af64 100644 --- a/python/dllib/src/bigdl/dllib/feature/dataset/movielens.py +++ b/python/dllib/src/bigdl/dllib/feature/dataset/movielens.py @@ -22,12 +22,14 @@ from bigdl.dllib.feature.dataset import base SOURCE_URL = 'http://files.grouplens.org/datasets/movielens/' + + def read_data_sets(data_dir): """ Parse or download movielens 1m data if train_dir is empty. 
:param data_dir: The directory storing the movielens data - :return: a 2D numpy array with user index and item index in each row + :return: a 2D numpy array with user index and item index in each row """ WHOLE_DATA = 'ml-1m.zip' local_file = base.maybe_download(WHOLE_DATA, data_dir, SOURCE_URL + WHOLE_DATA) @@ -37,19 +39,21 @@ def read_data_sets(data_dir): print("Extracting %s to %s" % (local_file, data_dir)) zip_ref.extractall(data_dir) zip_ref.close() - rating_files = os.path.join(extracted_to,"ratings.dat") + rating_files = os.path.join(extracted_to, "ratings.dat") - rating_list = [i.strip().split("::") for i in open(rating_files,"r").readlines()] + rating_list = [i.strip().split("::") for i in open(rating_files, "r").readlines()] movielens_data = np.array(rating_list).astype(int) - return movielens_data + return movielens_data + def get_id_pairs(data_dir): - movielens_data = read_data_sets(data_dir) - return movielens_data[:, 0:2] + movielens_data = read_data_sets(data_dir) + return movielens_data[:, 0:2] + def get_id_ratings(data_dir): - movielens_data = read_data_sets(data_dir) - return movielens_data[:, 0:3] + movielens_data = read_data_sets(data_dir) + return movielens_data[:, 0:3] if __name__ == "__main__": diff --git a/python/dllib/src/bigdl/dllib/feature/dataset/sentence.py b/python/dllib/src/bigdl/dllib/feature/dataset/sentence.py index 960c31bdf1d..2bc21efe109 100644 --- a/python/dllib/src/bigdl/dllib/feature/dataset/sentence.py +++ b/python/dllib/src/bigdl/dllib/feature/dataset/sentence.py @@ -20,6 +20,7 @@ import sys from nltk.tokenize import word_tokenize + def read_localfile(fileName): lines = [] with open(fileName) as f: @@ -28,15 +29,18 @@ def read_localfile(fileName): f.close() return lines + def sentences_split(line): nltk.data.path.append(os.environ.get('PWD')) sent_tokenizer = nltk.tokenize.PunktSentenceTokenizer() sentenized = sent_tokenizer.tokenize(line) return sentenized + def sentences_bipadding(sent): return "SENTENCESTART " + sent + " SENTENCEEND" + def sentence_tokenizer(sentences): tokenized_sents = nltk.word_tokenize(sentences) return tokenized_sents diff --git a/python/dllib/src/bigdl/dllib/feature/transform/vision/image.py b/python/dllib/src/bigdl/dllib/feature/transform/vision/image.py index 19cd0fbaedb..b529b48243e 100644 --- a/python/dllib/src/bigdl/dllib/feature/transform/vision/image.py +++ b/python/dllib/src/bigdl/dllib/feature/transform/vision/image.py @@ -31,7 +31,7 @@ class FeatureTransformer(JavaValue): def __init__(self, bigdl_type="float", *args): self.value = callBigDlFunc( - bigdl_type, JavaValue.jvm_class_constructor(self), *args) + bigdl_type, JavaValue.jvm_class_constructor(self), *args) def transform(self, image_feature, bigdl_type="float"): """ @@ -45,9 +45,10 @@ def __call__(self, image_frame, bigdl_type="float"): transform ImageFrame """ jframe = callBigDlFunc(bigdl_type, - "transformImageFrame", self.value, image_frame) + "transformImageFrame", self.value, image_frame) return ImageFrame(jvalue=jframe) + class Pipeline(FeatureTransformer): """ Pipeline of FeatureTransformer @@ -55,10 +56,12 @@ class Pipeline(FeatureTransformer): def __init__(self, transformers, bigdl_type="float"): for transfomer in transformers: - assert transfomer.__class__.__bases__[0].__name__ == "FeatureTransformer", "the transformer should be " \ - "subclass of FeatureTransformer" + assert transfomer.__class__.__bases__[0].__name__ == "FeatureTransformer", \ + "the transformer should be " \ + "subclass of FeatureTransformer" super(Pipeline, 
self).__init__(bigdl_type, transformers) + class ImageFeature(JavaValue): """ Each ImageFeature keeps information about single image, @@ -97,6 +100,7 @@ def keys(self): """ return callBigDlFunc(self.bigdl_type, "imageFeatureGetKeys", self.value) + class ImageFrame(JavaValue): """ ImageFrame wraps a set of ImageFeature @@ -110,7 +114,6 @@ def __init__(self, jvalue, bigdl_type="float"): else: self.image_frame = DistributedImageFrame(jvalue=self.value) - @classmethod def read(cls, path, sc=None, min_partitions=1, bigdl_type="float"): """ @@ -134,7 +137,7 @@ def read_parquet(cls, path, sc, bigdl_type="float"): return DistributedImageFrame(jvalue=callBigDlFunc(bigdl_type, "readParquet", path, sc)) @classmethod - def write_parquet(cls, path, output, sc, partition_num = 1, bigdl_type="float"): + def write_parquet(cls, path, output, sc, partition_num=1, bigdl_type="float"): """ write ImageFrame as parquet file """ @@ -157,7 +160,7 @@ def transform(self, transformer, bigdl_type="float"): transformImageFrame """ self.value = callBigDlFunc(bigdl_type, - "transformImageFrame", transformer, self.value) + "transformImageFrame", transformer, self.value) return self def get_image(self, float_key="floats", to_chw=True): @@ -201,15 +204,17 @@ def random_split(self, weights): """ Random split imageframes according to weights :param weights: weights for each ImageFrame - :return: + :return: """ - jvalues = self.image_frame.random_split(weights) + jvalues = self.image_frame.random_split(weights) return [ImageFrame(jvalue) for jvalue in jvalues] + class LocalImageFrame(ImageFrame): """ LocalImageFrame wraps a list of ImageFeature """ + def __init__(self, image_list=None, label_list=None, jvalue=None, bigdl_type="float"): assert jvalue or image_list, "jvalue and image_list cannot be None in the same time" if jvalue: @@ -217,7 +222,8 @@ def __init__(self, image_list=None, label_list=None, jvalue=None, bigdl_type="fl else: # init from image ndarray list and label rdd(optional) image_tensor_list = map(lambda image: JTensor.from_ndarray(image), image_list) - label_tensor_list = map(lambda label: JTensor.from_ndarray(label), label_list) if label_list else None + label_tensor_list = map(lambda label: JTensor.from_ndarray(label), + label_list) if label_list else None self.value = callBigDlFunc(bigdl_type, JavaValue.jvm_class_constructor(self), image_tensor_list, label_tensor_list) @@ -228,7 +234,7 @@ def get_image(self, float_key="floats", to_chw=True): get image list from ImageFrame """ tensors = callBigDlFunc(self.bigdl_type, - "localImageFrameToImageTensor", self.value, float_key, to_chw) + "localImageFrameToImageTensor", self.value, float_key, to_chw) return map(lambda tensor: tensor.to_ndarray(), tensors) def get_label(self): @@ -243,17 +249,19 @@ def get_predict(self, key="predict"): get prediction list from ImageFrame """ predicts = callBigDlFunc(self.bigdl_type, "localImageFrameToPredict", self.value, key) - return map(lambda predict: (predict[0], predict[1].to_ndarray()) if predict[1] else (predict[0], None), predicts) + return map(lambda predict: (predict[0], predict[1].to_ndarray()) if predict[1] else ( + predict[0], None), predicts) - def get_sample(self, key="sample"): + def get_sample(self, key="sample"): return callBigDlFunc(self.bigdl_type, "localImageFrameToSample", self.value, key) - def get_uri(self, key = "uri"): + def get_uri(self, key="uri"): return callBigDlFunc(self.bigdl_type, "localImageFrameToUri", self.value, key) def random_split(self, weights): raise "random split not supported in 
LocalImageFrame" + class DistributedImageFrame(ImageFrame): """ DistributedImageFrame wraps an RDD of ImageFeature @@ -266,7 +274,8 @@ def __init__(self, image_rdd=None, label_rdd=None, jvalue=None, bigdl_type="floa else: # init from image ndarray rdd and label rdd(optional) image_tensor_rdd = image_rdd.map(lambda image: JTensor.from_ndarray(image)) - label_tensor_rdd = label_rdd.map(lambda label: JTensor.from_ndarray(label)) if label_rdd else None + label_tensor_rdd = label_rdd.map( + lambda label: JTensor.from_ndarray(label)) if label_rdd else None self.value = callBigDlFunc(bigdl_type, JavaValue.jvm_class_constructor(self), image_tensor_rdd, label_tensor_rdd) @@ -277,14 +286,16 @@ def get_image(self, float_key="floats", to_chw=True): get image rdd from ImageFrame """ tensor_rdd = callBigDlFunc(self.bigdl_type, - "distributedImageFrameToImageTensorRdd", self.value, float_key, to_chw) + "distributedImageFrameToImageTensorRdd", self.value, float_key, + to_chw) return tensor_rdd.map(lambda tensor: tensor.to_ndarray()) def get_label(self): """ get label rdd from ImageFrame """ - tensor_rdd = callBigDlFunc(self.bigdl_type, "distributedImageFrameToLabelTensorRdd", self.value) + tensor_rdd = callBigDlFunc(self.bigdl_type, "distributedImageFrameToLabelTensorRdd", + self.value) return tensor_rdd.map(lambda tensor: tensor.to_ndarray()) def get_predict(self, key="predict"): @@ -292,16 +303,20 @@ def get_predict(self, key="predict"): get prediction rdd from ImageFrame """ predicts = callBigDlFunc(self.bigdl_type, "distributedImageFrameToPredict", self.value, key) - return predicts.map(lambda predict: (predict[0], predict[1].to_ndarray()) if predict[1] else (predict[0], None)) + return predicts.map( + lambda predict: (predict[0], predict[1].to_ndarray()) if predict[1] else ( + predict[0], None)) - def get_sample(self, key="sample"): + def get_sample(self, key="sample"): return callBigDlFunc(self.bigdl_type, "distributedImageFrameToSample", self.value, key) - def get_uri(self, key = "uri"): + def get_uri(self, key="uri"): return callBigDlFunc(self.bigdl_type, "distributedImageFrameToUri", self.value, key) def random_split(self, weights): - return callBigDlFunc(self.bigdl_type, "distributedImageFrameRandomSplit", self.value, weights) + return callBigDlFunc(self.bigdl_type, "distributedImageFrameRandomSplit", self.value, + weights) + class HFlip(FeatureTransformer): """ @@ -309,7 +324,8 @@ class HFlip(FeatureTransformer): """ def __init__(self, bigdl_type="float"): - super(HFlip, self).__init__(bigdl_type) + super(HFlip, self).__init__(bigdl_type) + class Resize(FeatureTransformer): """ @@ -324,10 +340,11 @@ class Resize(FeatureTransformer): Imgproc.resize(mat, mat, new Size(resizeWH, resizeWH)) """ - def __init__(self, resize_h, resize_w, resize_mode = 1, use_scale_factor=True, + def __init__(self, resize_h, resize_w, resize_mode=1, use_scale_factor=True, bigdl_type="float"): super(Resize, self).__init__(bigdl_type, resize_h, resize_w, resize_mode, use_scale_factor) + class Brightness(FeatureTransformer): """ adjust the image brightness @@ -338,6 +355,7 @@ class Brightness(FeatureTransformer): def __init__(self, delta_low, delta_high, bigdl_type="float"): super(Brightness, self).__init__(bigdl_type, delta_low, delta_high) + class ChannelOrder(FeatureTransformer): """ random change the channel of an image @@ -346,6 +364,7 @@ class ChannelOrder(FeatureTransformer): def __init__(self, bigdl_type="float"): super(ChannelOrder, self).__init__(bigdl_type) + class Contrast(FeatureTransformer): """ Adjust the 
image contrast @@ -356,6 +375,7 @@ class Contrast(FeatureTransformer): def __init__(self, delta_low, delta_high, bigdl_type="float"): super(Contrast, self).__init__(bigdl_type, delta_low, delta_high) + class Saturation(FeatureTransformer): """ Adjust image saturation @@ -364,6 +384,7 @@ class Saturation(FeatureTransformer): def __init__(self, delta_low, delta_high, bigdl_type="float"): super(Saturation, self).__init__(bigdl_type, delta_low, delta_high) + class Hue(FeatureTransformer): """ Adjust image hue @@ -374,6 +395,7 @@ class Hue(FeatureTransformer): def __init__(self, delta_low, delta_high, bigdl_type="float"): super(Hue, self).__init__(bigdl_type, delta_low, delta_high) + class ChannelNormalize(FeatureTransformer): """ image channel normalize @@ -384,16 +406,19 @@ class ChannelNormalize(FeatureTransformer): :param std_g std value in G channel :param std_b std value in B channel """ + def __init__(self, mean_r, mean_g, mean_b, std_r=1.0, std_g=1.0, std_b=1.0, bigdl_type="float"): - super(ChannelNormalize, self).__init__(bigdl_type, mean_r, mean_g, mean_b, std_r, std_g, std_b) - + super(ChannelNormalize, self).__init__(bigdl_type, mean_r, mean_g, mean_b, std_r, std_g, + std_b) + + class PixelNormalize(FeatureTransformer): """ Pixel level normalizer, data(i) = data(i) - mean(i) :param means pixel level mean, following H * W * C order """ - + def __init__(self, means, bigdl_type="float"): super(PixelNormalize, self).__init__(bigdl_type, means) @@ -402,15 +427,16 @@ class RandomCrop(FeatureTransformer): """ Random crop a `cropWidth` x `cropHeight` patch from an image. The patch size should be less than the image size. - + :param crop_width width after crop :param crop_height height after crop :param is_clip whether to clip the roi to image boundaries """ - + def __init__(self, crop_width, crop_height, is_clip=True, bigdl_type="float"): super(RandomCrop, self).__init__(bigdl_type, crop_width, crop_height, is_clip) + class CenterCrop(FeatureTransformer): """ Crop a `cropWidth` x `cropHeight` patch from center of image. 
@@ -423,6 +449,7 @@ class CenterCrop(FeatureTransformer): def __init__(self, crop_width, crop_height, is_clip=True, bigdl_type="float"): super(CenterCrop, self).__init__(bigdl_type, crop_width, crop_height, is_clip) + class FixedCrop(FeatureTransformer): """ Crop a fixed area of image @@ -438,6 +465,7 @@ class FixedCrop(FeatureTransformer): def __init__(self, x1, y1, x2, y2, normalized=True, is_clip=True, bigdl_type="float"): super(FixedCrop, self).__init__(bigdl_type, x1, y1, x2, y2, normalized, is_clip) + class DetectionCrop(FeatureTransformer): """ Crop from object detections, each image should has a tensor detection, @@ -466,7 +494,8 @@ def __init__(self, means_r=123, means_g=117, means_b=104, max_expand_ratio=4.0, bigdl_type="float"): super(Expand, self).__init__(bigdl_type, means_r, means_g, means_b, min_expand_ratio, max_expand_ratio) - + + class Filler(FeatureTransformer): """ Fill part of image with certain pixel value @@ -476,14 +505,15 @@ class Filler(FeatureTransformer): :param end_y end y ratio :param value filling value """ - - def __init__(self, start_x, start_y, end_x, end_y, value = 255, bigdl_type="float"): + + def __init__(self, start_x, start_y, end_x, end_y, value=255, bigdl_type="float"): super(Filler, self).__init__(bigdl_type, start_x, start_y, end_x, end_y, value) + class RandomTransformer(FeatureTransformer): """ It is a wrapper for transformers to control the transform probability @@ -511,18 +541,19 @@ class ColorJitter(FeatureTransformer): :param random_order_prob random order for different operation :param shuffle shuffle the transformers """ - def __init__(self, brightness_prob = 0.5, - brightness_delta = 32.0, - contrast_prob = 0.5, - contrast_lower = 0.5, - contrast_upper = 1.5, - hue_prob = 0.5, - hue_delta = 18.0, - saturation_prob = 0.5, - saturation_lower = 0.5, - saturation_upper = 1.5, - random_order_prob = 0.0, - shuffle = False, + + def __init__(self, brightness_prob=0.5, + brightness_delta=32.0, + contrast_prob=0.5, + contrast_lower=0.5, + contrast_upper=1.5, + hue_prob=0.5, + hue_delta=18.0, + saturation_prob=0.5, + saturation_lower=0.5, + saturation_upper=1.5, + random_order_prob=0.0, + shuffle=False, bigdl_type="float"): super(ColorJitter, self).__init__(bigdl_type, brightness_prob, brightness_delta, @@ -537,6 +568,7 @@ def __init__(self, brightness_prob = 0.5, random_order_prob, shuffle) + class RandomSampler(FeatureTransformer): """ Random sample a bounding box given some constraints and crop the image @@ -546,15 +578,18 @@ class RandomSampler(FeatureTransformer): def __init__(self): super(RandomSampler, self).__init__(bigdl_type) + class RoiProject(FeatureTransformer): """ Project gt boxes onto the coordinate system defined by image boundary - :param need_meet_center_constraint whether need to meet center constraint, i.e., the center of gt box need be within image boundary + :param need_meet_center_constraint whether need to meet center constraint, i.e., the center of + gt box need be within image boundary """ def __init__(self, need_meet_center_constraint, bigdl_type="float"): super(RoiProject, self).__init__(bigdl_type, need_meet_center_constraint) + class RoiHFlip(FeatureTransformer): """ horizontally flip the roi @@ -563,15 +598,18 @@ class RoiHFlip(FeatureTransformer): def __init__(self, normalized=True, bigdl_type="float"): super(RoiHFlip, self).__init__(bigdl_type, normalized) - + + class RoiResize(FeatureTransformer): """ resize the roi according to scale :param normalized whether the roi is normalized, i.e. 
in range [0, 1] """ + def __init__(self, normalized=True, bigdl_type="float"): super(RoiResize, self).__init__(bigdl_type, normalized) + class RoiNormalize(FeatureTransformer): """ Normalize Roi to [0, 1] @@ -580,6 +618,7 @@ class RoiNormalize(FeatureTransformer): def __init__(self, bigdl_type="float"): super(RoiNormalize, self).__init__(bigdl_type) + class MatToFloats(FeatureTransformer): """ Transform OpenCVMat to float array, note that in this transformer, the mat is released @@ -591,10 +630,11 @@ class MatToFloats(FeatureTransformer): """ def __init__(self, valid_height=300, valid_width=300, valid_channel=300, - out_key = "floats", share_buffer=True, bigdl_type="float"): + out_key="floats", share_buffer=True, bigdl_type="float"): super(MatToFloats, self).__init__(bigdl_type, valid_height, valid_width, valid_channel, out_key, share_buffer) + class MatToTensor(FeatureTransformer): """ transform opencv mat to tensor @@ -605,6 +645,7 @@ class MatToTensor(FeatureTransformer): def __init__(self, to_rgb=False, tensor_key="imageTensor", bigdl_type="float"): super(MatToTensor, self).__init__(bigdl_type, to_rgb, tensor_key) + class AspectScale(FeatureTransformer): """ Resize the image, keep the aspect ratio. scale according to the short edge @@ -618,12 +659,13 @@ class AspectScale(FeatureTransformer): :aram min_scale control the minimum scale up for image """ - def __init__(self, min_size, scale_multiple_of = 1, max_size = 1000, - resize_mode = 1, use_scale_factor=True, min_scale=-1.0, + def __init__(self, min_size, scale_multiple_of=1, max_size=1000, + resize_mode=1, use_scale_factor=True, min_scale=-1.0, bigdl_type="float"): super(AspectScale, self).__init__(bigdl_type, min_size, scale_multiple_of, max_size, resize_mode, use_scale_factor, min_scale) - + + class RandomAspectScale(FeatureTransformer): """ resize the image by randomly choosing a scale @@ -631,17 +673,21 @@ class RandomAspectScale(FeatureTransformer): :param scaleMultipleOf Resize test images so that its width and height are multiples of :param maxSize Max pixel size of the longest side of a scaled input image """ - def __init__(self, scales, scale_multiple_of = 1, max_size = 1000, bigdl_type="float"): + + def __init__(self, scales, scale_multiple_of=1, max_size=1000, bigdl_type="float"): super(RandomAspectScale, self).__init__(bigdl_type, scales, scale_multiple_of, max_size) + class BytesToMat(FeatureTransformer): """ Transform byte array(original image file in byte) to OpenCVMat :param byte_key key that maps byte array """ - def __init__(self, byte_key = "bytes", bigdl_type="float"): + + def __init__(self, byte_key="bytes", bigdl_type="float"): super(BytesToMat, self).__init__(bigdl_type, byte_key) + class ImageFrameToSample(FeatureTransformer): """ transform imageframe to samples @@ -649,18 +695,22 @@ class ImageFrameToSample(FeatureTransformer): :param target_keys keys that maps targets (each target should be a tensor) :param sample_key key to store sample """ + def __init__(self, input_keys=["imageTensor"], target_keys=None, sample_key="sample", bigdl_type="float"): super(ImageFrameToSample, self).__init__(bigdl_type, input_keys, target_keys, sample_key) + class PixelBytesToMat(FeatureTransformer): """ Transform byte array(pixels in byte) to OpenCVMat :param byte_key key that maps byte array """ - def __init__(self, byte_key = "bytes", bigdl_type="float"): + + def __init__(self, byte_key="bytes", bigdl_type="float"): super(PixelBytesToMat, self).__init__(bigdl_type, byte_key) + class FixExpand(FeatureTransformer): """ Expand 
image with given expandHeight and expandWidth, @@ -668,9 +718,11 @@ class FixExpand(FeatureTransformer): :param expand_height height expand to :param expand_width width expand to """ + def __init__(self, expand_height, expand_width, bigdl_type="float"): super(FixExpand, self).__init__(bigdl_type, expand_height, expand_width) + class ChannelScaledNormalizer(FeatureTransformer): """ Scaled image at channel level with offset and scale @@ -679,9 +731,11 @@ class ChannelScaledNormalizer(FeatureTransformer): :param mean_b: offset for B channel :param scale: scaling factor for all channels """ + def __init__(self, mean_r, mean_g, mean_b, scale, bigdl_type="float"): super(ChannelScaledNormalizer, self).__init__(bigdl_type, mean_r, mean_g, mean_b, scale) + class RandomAlterAspect(FeatureTransformer): """ Apply random crop based on area ratio and resize to cropLenth size @@ -691,6 +745,7 @@ class RandomAlterAspect(FeatureTransformer): :param interp_mode interp mode applied in resize :param crop_length final size resized to """ + def __init__(self, min_area_ratio, max_area_ratio, min_aspect_ratio_change, @@ -702,6 +757,7 @@ def __init__(self, min_area_ratio, interp_mode, crop_length) + class RandomCropper(FeatureTransformer): """ Random cropper on uniform distribution with fixed height & width @@ -711,8 +767,11 @@ class RandomCropper(FeatureTransformer): :param cropper_method crop method :param channels total channels """ + def __init__(self, crop_w, crop_h, mirror, cropper_method, channels, bigdl_type="float"): - super(RandomCropper, self).__init__(bigdl_type, crop_w, crop_h, mirror, cropper_method, channels) + super(RandomCropper, self).__init__(bigdl_type, crop_w, crop_h, mirror, cropper_method, + channels) + class RandomResize(FeatureTransformer): """ @@ -720,9 +779,11 @@ class RandomResize(FeatureTransformer): :param min_size min size to resize to :param max_size max size to resize to """ + def __init__(self, min_size, max_size, bigdl_type="float"): super(RandomResize, self).__init__(bigdl_type, min_size, max_size) + class SeqFileFolder(JavaValue): @classmethod @@ -746,5 +807,3 @@ def files_to_image_frame(cls, class_num, partition_num) return ImageFrame(jvalue=jvalue) - - diff --git a/python/dllib/src/bigdl/dllib/keras/ToBigDLHelper.py b/python/dllib/src/bigdl/dllib/keras/ToBigDLHelper.py index bfbe7d21dce..e72a1ad8603 100644 --- a/python/dllib/src/bigdl/dllib/keras/ToBigDLHelper.py +++ b/python/dllib/src/bigdl/dllib/keras/ToBigDLHelper.py @@ -92,4 +92,4 @@ def to_bigdl_reg(reg): # reg is a dict if reg: return BRegularizer(reg['l1'], reg['l2']) else: - return None \ No newline at end of file + return None diff --git a/python/dllib/src/bigdl/dllib/keras/converter.py b/python/dllib/src/bigdl/dllib/keras/converter.py index b7659a2f1f7..511c3f3c205 100644 --- a/python/dllib/src/bigdl/dllib/keras/converter.py +++ b/python/dllib/src/bigdl/dllib/keras/converter.py @@ -62,7 +62,6 @@ def load_weights_from_json_hdf5(def_json, weights_hdf5, by_name=False): WeightLoader.load_weights_from_hdf5(bmodel, kmodel, weights_hdf5, by_name) return bmodel - @staticmethod def load_weights_from_hdf5(bmodel, kmodel, filepath, by_name=False): '''Loads all layer weights from a HDF5 save file. 
@@ -88,9 +87,11 @@ def __keras_name_to_Layers(model, with_weights=False): if with_weights: layers = [l for l in total_layers - if l.get_weights() and not isinstance(l, Model) and not isinstance(l, Sequential)] # noqa + if l.get_weights() and not isinstance(l, Model) and not isinstance(l, + Sequential)] # noqa else: - layers = [l for l in total_layers if not isinstance(l, Model) and not isinstance(l, Sequential)] # noqa + layers = [l for l in total_layers if + not isinstance(l, Model) and not isinstance(l, Sequential)] # noqa return dict([(layer.name, layer) for layer in layers]) @@ -254,7 +255,7 @@ def convert_maxoutdense(klayer, weights): b_weights = np.concatenate((b_weights, k_weights[i].T)) if len(weights) == 1: # if without bias return [b_weights] - return [b_weights, weights[1].reshape(k_weights.shape[0]*k_weights.shape[2], )] + return [b_weights, weights[1].reshape(k_weights.shape[0] * k_weights.shape[2], )] @staticmethod def convert_srelu(klayer, weights): @@ -275,15 +276,16 @@ def convert_locallyconnected1d(klayer, weights): bweights1 = np.transpose(weights[0], (0, 2, 1)) if len(weights) == 1: # if without bias return [bweights1] - return[bweights1, weights[1]] + return [bweights1, weights[1]] @staticmethod def convert_locallyconnected2d(klayer, weights): bweights1 = np.transpose(weights[0], (0, 2, 1)) if len(weights) == 1: # if without bias return [bweights1] - bweights2 = weights[1].reshape(weights[1].shape[0]*weights[1].shape[1], weights[1].shape[2]) - return[bweights1, bweights2] + bweights2 = weights[1].reshape(weights[1].shape[0] * weights[1].shape[1], + weights[1].shape[2]) + return [bweights1, bweights2] class DefinitionLoader: @@ -296,12 +298,14 @@ def __build_node_id_2_klayer(kmodel, node_id_to_config_layer): :param node_id_to_config_layer: a container to store the result """ node_id_to_config_layer[kmodel.name] = kmodel # include itself as well + def gather_result(layers): if layers: # layers maybe None here. for layer in layers: if layer.name not in node_id_to_config_layer: node_id_to_config_layer[layer.name] = layer DefinitionLoader.__build_node_id_2_klayer(layer, node_id_to_config_layer) + if hasattr(kmodel, "layers"): gather_result(kmodel.layers) if hasattr(kmodel, "flattened_layers"): @@ -350,7 +354,8 @@ def from_kmodel(cls, kmodel): @classmethod def from_hdf5_path(cls, hdf5_path): """ - :param hdf5_path: hdf5 path which can be stored in a local file system, HDFS, S3, or any Hadoop-supported file system. + :param hdf5_path: hdf5 path which can be stored in a local file system, + HDFS, S3, or any Hadoop-supported file system. :return: BigDL Model """ from keras.models import load_model @@ -361,7 +366,8 @@ def from_hdf5_path(cls, hdf5_path): @classmethod def from_json_path(cls, json_path): """ - :param json_path: definition path which can be stored in a local file system, HDFS, S3, or any Hadoop-supported file system. + :param json_path: definition path which can be stored in a local file system, + HDFS, S3, or any Hadoop-supported file system. :return: BigDL Model """ json_str = BCommon.text_from_path(json_path) @@ -375,7 +381,7 @@ def from_json_str(cls, json_str): def _do_create_node(self, layer, clayer): if clayer["class_name"] == "InputLayer": input = BLayer.Input() - input.element().set_name(layer.name) # cannot set name for node? + input.element().set_name(layer.name) # cannot set name for node? 
self.node_id_to_instance[layer.name] = input return input bigdl_in_nodes = [] @@ -417,6 +423,7 @@ def _construct_bigdl_sequence(self): bseq.add(blayer) return bseq + class LayerConverter: def __init__(self, klayer, kclayer, input_shape=None): @@ -459,7 +466,8 @@ def __check_is_share_weights(self): # ], if "inbound_nodes" in self.kclayer and len(self.kclayer["inbound_nodes"]) > 1: raise Exception( - "%s doesn't support multiple inputs with shared weights" % self.kclayer["class_name"]) + "%s doesn't support multiple inputs with shared weights" % self.kclayer[ + "class_name"]) def create(self): class_name = self.klayer.__class__.__name__ @@ -467,7 +475,7 @@ def create(self): self.__check_is_share_weights() if (hasattr(self.klayer, "b_constraint") and self.klayer.b_constraint) or \ - (hasattr(self.klayer, "W_constraint") and self.klayer.W_constraint): + (hasattr(self.klayer, "W_constraint") and self.klayer.W_constraint): raise Exception("We don't support constraint for now") if hasattr(self.klayer, "activity_regularizer") and self.klayer.activity_regularizer: @@ -475,7 +483,7 @@ def create(self): function_name = "create_" + class_name.lower() if not hasattr(self, function_name): - raise Exception("We don't support layer: %s for now" % class_name ) + raise Exception("We don't support layer: %s for now" % class_name) blayer_creator = getattr(self, function_name) blayer = blayer_creator() @@ -527,7 +535,7 @@ def create_timedistributeddense(self): def create_timedistributed(self): # input_shape is (batch, time, other dims) - inner_input_shape = (self.input_shape[0], ) + self.input_shape[2:] + inner_input_shape = (self.input_shape[0],) + self.input_shape[2:] blayer = LayerConverter(self.klayer.layer, self.config['layer'], inner_input_shape).create() return BLayer.TimeDistributed(blayer) @@ -560,18 +568,18 @@ def create_embedding(self): if hasattr(self.klayer, "dropout") and self.klayer.dropout != 0: raise Exception("We don't support dropout for now") - if hasattr(self.klayer, "mask_zero") and self.klayer.mask_zero != False: + if hasattr(self.klayer, "mask_zero") and self.klayer.mask_zero is not False: raise Exception("We don't support mask_zero for now") bseq = BLayer.Sequential() blayer = BLayer.LookupTable( - n_index=self.klayer.input_dim, - n_output=self.klayer.output_dim, - padding_value=0.0, - norm_type=2.0, - should_scale_grad_by_freq=False, - wRegularizer=to_bigdl_reg(self.config["W_regularizer"]), - bigdl_type="float") + n_index=self.klayer.input_dim, + n_output=self.klayer.output_dim, + padding_value=0.0, + norm_type=2.0, + should_scale_grad_by_freq=False, + wRegularizer=to_bigdl_reg(self.config["W_regularizer"]), + bigdl_type="float") bseq.add(BLayer.AddConstant(1.0)) # Add 1 as BigDL is one-based index bseq.add(blayer) blayer.set_init_method(to_bigdl_init(self.config["init"])) @@ -598,7 +606,7 @@ def create_flatten(self): def create_permute(self): swaps = self.__perm_to_pair(list(self.klayer.dims)) swaps.reverse() - swaps = map(lambda pair: (pair[0]+1, pair[1]+1), swaps) + swaps = map(lambda pair: (pair[0] + 1, pair[1] + 1), swaps) return BLayer.Transpose(list(swaps)) def __perm_to_pair(self, perm): @@ -647,7 +655,9 @@ def create_repeatvector(self): bigdl_type="float") def __is_from_sequential(self): - return "layers" in self.kclayer["config"] and hasattr(self.klayer, "layers") and self.klayer.layers is not None # noqa + return "layers" in self.kclayer["config"] and hasattr(self.klayer, + "layers") and self.klayer.layers\ + is not None # noqa def create_merge(self): if 
self.klayer.output_shape and not isinstance(self.klayer.output_shape, tuple): @@ -675,7 +685,7 @@ def create_merge(self): if self.klayer.dot_axes != [1, 1]: raise Exception("For merge mode dot, only dot_axes=1 is supported for now.") model = BLayer.Sequential() - blayer = model.add(BLayer.DotProduct(bigdl_type="float"))\ + blayer = model.add(BLayer.DotProduct(bigdl_type="float")) \ .add(BLayer.Reshape([1], True)) elif self.klayer.mode == "ave": blayer = BLayer.CAveTable( @@ -689,8 +699,9 @@ def create_merge(self): blayer = BLayer.Sequential() blayer.add(BLayer.CosineDistance(bigdl_type="float")).add(BLayer.Reshape([1, 1], True)) else: # invalid mode or lambda functions - raise Exception("Invalid merge mode: `%s`. Lambda/function as merge mode is not supported for now." - % self.klayer.mode) + raise Exception( + "Invalid merge mode: `%s`. Lambda/function as merge mode is not supported for now." + % self.klayer.mode) if self.__is_from_sequential(): bseq = BLayer.Sequential() parallel_table = BLayer.ParallelTable() @@ -746,7 +757,8 @@ def create_zeropadding1d(self): if isinstance(padding, int): return self.__generate_zeropadding1d(padding, padding) elif isinstance(padding, dict): - return self.__generate_zeropadding1d(padding.get('left_pad', 0), padding.get('right_pad', 0)) + return self.__generate_zeropadding1d(padding.get('left_pad', 0), + padding.get('right_pad', 0)) else: # tuple of int (length 2) padding = tuple(padding) return self.__generate_zeropadding1d(padding[0], padding[1]) @@ -788,28 +800,34 @@ def create_zeropadding2d(self): padding = self.klayer.padding dim = 1 if "dim_ordering" not in self.config: - warnings.warn("Cannot find dim_ordering from json definition. Using the default instead.") + warnings.warn( + "Cannot find dim_ordering from json definition. Using the default instead.") if self.klayer.dim_ordering == "th": dim = 2 if isinstance(padding, dict): # dictionary - return self.__generate_zeropadding2d(dim, dim+1, len(self.input_shape) - 1, - -padding.get('top_pad', 0), padding.get('bottom_pad', 0), - -padding.get('left_pad', 0), padding.get('right_pad', 0)) + return self.__generate_zeropadding2d(dim, dim + 1, len(self.input_shape) - 1, + -padding.get('top_pad', 0), + padding.get('bottom_pad', 0), + -padding.get('left_pad', 0), + padding.get('right_pad', 0)) else: # tuple of int padding = tuple(padding) if len(padding) == 2: - return self.__generate_zeropadding2d(dim, dim+1, len(self.input_shape) - 1, - -padding[0], padding[0], -padding[1], padding[1]) + return self.__generate_zeropadding2d(dim, dim + 1, len(self.input_shape) - 1, + -padding[0], padding[0], -padding[1], + padding[1]) elif len(padding) == 4: - return self.__generate_zeropadding2d(dim, dim+1, len(self.input_shape) - 1, - -padding[0], padding[1], -padding[2], padding[3]) + return self.__generate_zeropadding2d(dim, dim + 1, len(self.input_shape) - 1, + -padding[0], padding[1], -padding[2], + padding[3]) # NB: zeropadding doesn't serialize dim_ording to json file def create_zeropadding3d(self): padding = tuple(self.klayer.padding) dim = 1 if "dim_ordering" not in self.config: - warnings.warn("Cannot find dim_ordering from json definition. Using the default instead.") + warnings.warn( + "Cannot find dim_ordering from json definition. 
Using the default instead.") if self.klayer.dim_ordering == "th": dim = 2 model = BLayer.Sequential() @@ -825,25 +843,25 @@ def create_zeropadding3d(self): value=0.0, n_index=1, bigdl_type="float") - paddinglayer3 = BLayer.Padding(dim=dim+1, + paddinglayer3 = BLayer.Padding(dim=dim + 1, pad=-padding[1], n_input_dim=len(self.input_shape) - 1, value=0.0, n_index=1, bigdl_type="float") - paddinglayer4 = BLayer.Padding(dim=dim+1, + paddinglayer4 = BLayer.Padding(dim=dim + 1, pad=padding[1], n_input_dim=len(self.input_shape) - 1, value=0.0, n_index=1, bigdl_type="float") - paddinglayer5 = BLayer.Padding(dim=dim+2, + paddinglayer5 = BLayer.Padding(dim=dim + 2, pad=-padding[2], n_input_dim=len(self.input_shape) - 1, value=0.0, n_index=1, bigdl_type="float") - paddinglayer6 = BLayer.Padding(dim=dim+2, + paddinglayer6 = BLayer.Padding(dim=dim + 2, pad=padding[2], n_input_dim=len(self.input_shape) - 1, value=0.0, @@ -921,7 +939,8 @@ def generate_lstm_cell(self, klayer, kclayer, input_shape): # create a lstm cel activation = get_activation_by_name(config["activation"], "%s_%s" % (config["name"], config["activation"])) inner_activation = get_activation_by_name(config["inner_activation"], - "%s_%s" % (config["name"], config["inner_activation"])) + "%s_%s" % ( + config["name"], config["inner_activation"])) lstm = BLayer.LSTM(input_size=int(input_shape[2]), hidden_size=klayer.output_dim, p=0.0, @@ -939,13 +958,15 @@ def create_lstm(self): return self.__process_recurrent_layer(self.klayer.return_sequences, self.klayer.go_backwards, rec.add(lstm)) - def generate_convlstm2d_cell(self, klayer, kclayer, input_shape): # create a convlstm2d cell only + def generate_convlstm2d_cell(self, klayer, kclayer, + input_shape): # create a convlstm2d cell only self.__check_recurrent_parameters(klayer) config = kclayer["config"] activation = get_activation_by_name(config["activation"], "%s_%s" % (config["name"], config["activation"])) inner_activation = get_activation_by_name(config["inner_activation"], - "%s_%s" % (config["name"], config["inner_activation"])) + "%s_%s" % ( + config["name"], config["inner_activation"])) convlstm = BLayer.ConvLSTMPeephole(input_size=int(input_shape[2]), output_size=config["nb_filter"], @@ -956,7 +977,8 @@ def generate_convlstm2d_cell(self, klayer, kclayer, input_shape): # create a co padding=-1, activation=activation, inner_activation=inner_activation, - # NB: ConvLSTM doesn't serialize regularizers to json file + # NB: ConvLSTM doesn't serialize regularizers to json + # file # wRegularizer=to_bigdl_reg(config["W_regularizer"]), # uRegularizer=to_bigdl_reg(config["U_regularizer"]), # bRegularizer=to_bigdl_reg(config["b_regularizer"]), @@ -991,7 +1013,8 @@ def generate_gru_cell(self, klayer, kclayer, input_shape): # create a gru cell activation = get_activation_by_name(config["activation"], "%s_%s" % (config["name"], config["activation"])) inner_activation = get_activation_by_name(config["inner_activation"], - "%s_%s" % (config["name"], config["inner_activation"])) + "%s_%s" % ( + config["name"], config["inner_activation"])) gru = BLayer.GRU(input_size=int(input_shape[2]), hidden_size=klayer.output_dim, p=0.0, @@ -1040,16 +1063,16 @@ def create_batchnormalization(self): beta = self.get_value_from_init(self.klayer.beta_init.__name__, (n_input_channel,)) blayer = BLayer.SpatialBatchNormalization( - n_output=n_input_channel, - eps=self.klayer.epsilon, - momentum=self.klayer.momentum, - affine=True, - init_weight=gamma, - init_bias=beta, - init_grad_weight=None, - init_grad_bias=None, - 
data_format=bigdl_order, - bigdl_type="float") + n_output=n_input_channel, + eps=self.klayer.epsilon, + momentum=self.klayer.momentum, + affine=True, + init_weight=gamma, + init_bias=beta, + init_grad_weight=None, + init_grad_bias=None, + data_format=bigdl_order, + bigdl_type="float") k_running_mean = keras.backend.eval(self.klayer.running_mean) k_running_std = keras.backend.eval(self.klayer.running_std) @@ -1061,7 +1084,8 @@ def get_bdim_order(self, dim="2D"): # get bigdl dim_ordering from keras dim_ord if "dim_ordering" in self.config: order = self.config["dim_ordering"] else: - warnings.warn("Cannot find dim_ordering from json definition. Using the default instead.") + warnings.warn( + "Cannot find dim_ordering from json definition. Using the default instead.") order = keras.backend.image_dim_ordering() if dim == "3D": return to_bigdl_3d_ordering(order) @@ -1075,25 +1099,25 @@ def create_convolution1d(self): seq = BLayer.Sequential() seq.add(BLayer.Reshape([int(self.input_shape[1]), 1, int(self.input_shape[2])], True)) blayer = BLayer.SpatialConvolution( - n_input_plane=stack_size, - n_output_plane=self.klayer.nb_filter, - kernel_w=1, - kernel_h=self.klayer.filter_length, - stride_w=1, - stride_h=self.klayer.subsample_length, - pad_w=bpadW, - pad_h=bpadH, - n_group=1, - propagate_back=True, - wRegularizer=to_bigdl_reg(self.config["W_regularizer"]), - bRegularizer=to_bigdl_reg(self.config["b_regularizer"]), - init_weight=None, - init_bias=None, - init_grad_weight=None, - init_grad_bias=None, - with_bias=self.config["bias"], - data_format="NHWC", - bigdl_type="float") + n_input_plane=stack_size, + n_output_plane=self.klayer.nb_filter, + kernel_w=1, + kernel_h=self.klayer.filter_length, + stride_w=1, + stride_h=self.klayer.subsample_length, + pad_w=bpadW, + pad_h=bpadH, + n_group=1, + propagate_back=True, + wRegularizer=to_bigdl_reg(self.config["W_regularizer"]), + bRegularizer=to_bigdl_reg(self.config["b_regularizer"]), + init_weight=None, + init_bias=None, + init_grad_weight=None, + init_grad_bias=None, + with_bias=self.config["bias"], + data_format="NHWC", + bigdl_type="float") seq.add(blayer) seq.add(BLayer.Squeeze(3)) return self.combo_parameter_layer(seq, self.config) @@ -1108,31 +1132,33 @@ def create_convolution2d(self): bpadW, bpadH = to_bigdl_2d_padding(self.klayer.border_mode) blayer = BLayer.SpatialConvolution( - n_input_plane=stack_size, - n_output_plane=self.klayer.nb_filter, - kernel_w=self.klayer.nb_col, - kernel_h=self.klayer.nb_row, - stride_w=self.klayer.subsample[1], - stride_h=self.klayer.subsample[0], - pad_w=bpadW, - pad_h=bpadH, - n_group=1, - propagate_back=True, - wRegularizer=to_bigdl_reg(self.config["W_regularizer"]), - bRegularizer=to_bigdl_reg(self.config["b_regularizer"]), - init_weight=None, - init_bias=None, - init_grad_weight=None, - init_grad_bias=None, - with_bias=self.config["bias"], - data_format=bigdl_order, - bigdl_type="float") + n_input_plane=stack_size, + n_output_plane=self.klayer.nb_filter, + kernel_w=self.klayer.nb_col, + kernel_h=self.klayer.nb_row, + stride_w=self.klayer.subsample[1], + stride_h=self.klayer.subsample[0], + pad_w=bpadW, + pad_h=bpadH, + n_group=1, + propagate_back=True, + wRegularizer=to_bigdl_reg(self.config["W_regularizer"]), + bRegularizer=to_bigdl_reg(self.config["b_regularizer"]), + init_weight=None, + init_bias=None, + init_grad_weight=None, + init_grad_bias=None, + with_bias=self.config["bias"], + data_format=bigdl_order, + bigdl_type="float") return self.combo_parameter_layer(blayer, self.config) def 
create_convolution3d(self): if self.klayer.dim_ordering != "th": - raise Exception("Please use `th` for `dim_ordering`. `%s` is not supported for now." % self.klayer.dim_ordering) + raise Exception( + "Please use `th` for `dim_ordering`. `%s` is not supported for now." + % self.klayer.dim_ordering) bpadT, bpadW, bpadH = to_bigdl_3d_padding(self.klayer.border_mode) blayer = BLayer.VolumetricConvolution( @@ -1188,7 +1214,9 @@ def create_atrousconvolution1d(self): def create_atrousconvolution2d(self): if self.klayer.dim_ordering != "th": - raise Exception("Please use `th` for `dim_ordering`. `%s` is not supported for now." % self.klayer.dim_ordering) + raise Exception( + "Please use `th` for `dim_ordering`. `%s` is not supported for now." + % self.klayer.dim_ordering) if not self.config["bias"]: raise Exception("Only bias=True is supported for AtrousConvolution2D") @@ -1201,7 +1229,7 @@ def create_atrousconvolution2d(self): dilation_h = self.config["atrous_rate"][0] dilation_w = self.config["atrous_rate"][1] pad_h, pad_w = to_bigdl_2d_padding(self.config["border_mode"], h, kh, dh, dilation_h, - w, kw, dw, dilation_w) + w, kw, dw, dilation_w) blayer = BLayer.SpatialDilatedConvolution( n_input_plane=int(self.input_shape[1]), n_output_plane=self.config["nb_filter"], @@ -1221,7 +1249,9 @@ def create_atrousconvolution2d(self): def create_deconvolution2d(self): if self.klayer.dim_ordering != "th": - raise Exception("Please use `th` for `dim_ordering`. `%s` is not supported for now." % self.klayer.dim_ordering) + raise Exception( + "Please use `th` for `dim_ordering`. `%s` is not supported for now." + % self.klayer.dim_ordering) output_shape = self.config["output_shape"] h = int(self.input_shape[2]) @@ -1241,12 +1271,14 @@ def create_deconvolution2d(self): pad_h = int(two_pad_h / 2) else: raise Exception("For same padding, we only support padding on both sides for now. " - "Please make `(input_row - 1) * subsample[0] + nb_row - output_row` an even integer.") + "Please make `(input_row - 1) * subsample[0] + nb_row - output_row`" + " an even integer.") if two_pad_w % 2 == 0: # we only support pad_w as an int pad_w = int(two_pad_w / 2) else: raise Exception("For same padding, we only support padding on both sides for now. " - "Please make `(input_col - 1) * subsample[1] + nb_col - output_col` an even integer.") + "Please make `(input_col - 1) * subsample[1] + nb_col - output_col`" + " an even integer.") blayer = BLayer.SpatialFullConvolution( n_input_plane=int(self.input_shape[1]), n_output_plane=self.klayer.nb_filter, @@ -1268,38 +1300,40 @@ def create_deconvolution2d(self): def create_maxpooling3d(self): if self.klayer.dim_ordering != "th": - raise Exception("Please use `th` for `dim_ordering`. `%s` is not supported for now." % klayer.dim_ordering) + raise Exception( + "Please use `th` for `dim_ordering`. `%s` is not supported for now." 
+ % klayer.dim_ordering) # TODO: border_mode = 'same' if self.klayer.border_mode == 'same': raise Exception("Unsupported border_mode: same") bpadT, bpadW, bpadH = to_bigdl_3d_padding(self.klayer.border_mode) blayer = BLayer.VolumetricMaxPooling( - k_t=self.klayer.pool_size[0], - k_w=self.klayer.pool_size[2], - k_h=self.klayer.pool_size[1], - d_t=self.klayer.strides[0], - d_w=self.klayer.strides[2], - d_h=self.klayer.strides[1], - pad_t=bpadT, - pad_w=bpadW, - pad_h=bpadH, - bigdl_type="float") + k_t=self.klayer.pool_size[0], + k_w=self.klayer.pool_size[2], + k_h=self.klayer.pool_size[1], + d_t=self.klayer.strides[0], + d_w=self.klayer.strides[2], + d_h=self.klayer.strides[1], + pad_t=bpadT, + pad_w=bpadW, + pad_h=bpadH, + bigdl_type="float") return blayer def create_maxpooling2d(self): bigdl_order = self.get_bdim_order() bpadW, bpadH = to_bigdl_2d_padding(self.klayer.border_mode) blayer = BLayer.SpatialMaxPooling( - kw=self.klayer.pool_size[1], - kh=self.klayer.pool_size[0], - dw=self.klayer.strides[1], - dh=self.klayer.strides[0], - pad_w=bpadW, - pad_h=bpadH, - to_ceil=False, - format=bigdl_order, - bigdl_type="float") + kw=self.klayer.pool_size[1], + kh=self.klayer.pool_size[0], + dw=self.klayer.strides[1], + dh=self.klayer.strides[0], + pad_w=bpadW, + pad_h=bpadH, + to_ceil=False, + format=bigdl_order, + bigdl_type="float") return blayer def create_globalmaxpooling3d(self): @@ -1308,20 +1342,22 @@ def create_globalmaxpooling3d(self): b_kw = int(self.input_shape[4]) b_kh = int(self.input_shape[3]) else: - raise Exception("Please use `th` for dim_ordering. `%s` is not supported for now." % self.klayer.dim_ordering) + raise Exception( + "Please use `th` for dim_ordering. `%s` is not supported for now." % + self.klayer.dim_ordering) seq = BLayer.Sequential() blayer = BLayer.VolumetricMaxPooling( - k_t=b_kt, - k_w=b_kw, - k_h=b_kh, - d_t=1, - d_w=1, - d_h=1, - pad_t=0, - pad_w=0, - pad_h=0, - bigdl_type="float" + k_t=b_kt, + k_w=b_kw, + k_h=b_kh, + d_t=1, + d_w=1, + d_h=1, + pad_t=0, + pad_w=0, + pad_h=0, + bigdl_type="float" ) seq.add(blayer) seq.add(BLayer.Squeeze(5)) @@ -1336,21 +1372,23 @@ def create_globalaveragepooling3d(self): b_kw = int(self.input_shape[4]) b_kh = int(self.input_shape[3]) else: - raise Exception("Please use `th` for dim_ordering. `%s` is not supported for now." % self.klayer.dim_ordering) + raise Exception( + "Please use `th` for dim_ordering. `%s` is not supported for now." % + self.klayer.dim_ordering) seq = BLayer.Sequential() blayer = BLayer.VolumetricAveragePooling( - k_t=b_kt, - k_w=b_kw, - k_h=b_kh, - d_t=1, - d_w=1, - d_h=1, - pad_t=0, - pad_w=0, - pad_h=0, - count_include_pad=False, - bigdl_type="float" + k_t=b_kt, + k_w=b_kw, + k_h=b_kh, + d_t=1, + d_w=1, + d_h=1, + pad_t=0, + pad_w=0, + pad_h=0, + count_include_pad=False, + bigdl_type="float" ) seq.add(blayer) seq.add(BLayer.Squeeze(5)) @@ -1380,24 +1418,26 @@ def create_averagepooling2d(self): def create_averagepooling3d(self): if self.klayer.dim_ordering != "th": - raise Exception("Please use `th` for `dim_ordering`. `%s` is not supported for now." % klayer.dim_ordering) + raise Exception( + "Please use `th` for `dim_ordering`. `%s` is not supported for now." 
+ % klayer.dim_ordering) # TODO: border_mode = 'same' if self.klayer.border_mode == 'same': raise Exception("Unsupported border_mode: same") bpadT, bpadW, bpadH = to_bigdl_3d_padding(self.klayer.border_mode) blayer = BLayer.VolumetricAveragePooling( - k_t=self.klayer.pool_size[0], - k_w=self.klayer.pool_size[2], - k_h=self.klayer.pool_size[1], - d_t=self.klayer.strides[0], - d_w=self.klayer.strides[2], - d_h=self.klayer.strides[1], - pad_t=bpadT, - pad_w=bpadW, - pad_h=bpadH, - count_include_pad=False, - bigdl_type="float") + k_t=self.klayer.pool_size[0], + k_w=self.klayer.pool_size[2], + k_h=self.klayer.pool_size[1], + d_t=self.klayer.strides[0], + d_w=self.klayer.strides[2], + d_h=self.klayer.strides[1], + pad_t=bpadT, + pad_w=bpadW, + pad_h=bpadH, + count_include_pad=False, + bigdl_type="float") return blayer def create_globalmaxpooling2d(self): @@ -1563,10 +1603,13 @@ def create_upsampling2d(self): def create_upsampling3d(self): if self.klayer.dim_ordering != "th": - raise Exception("Please use th for dim_ordering. %s is not supported for now." % self.klayer.dim_ordering) + raise Exception( + "Please use th for dim_ordering. %s is not supported for now." + % self.klayer.dim_ordering) if "dim_ordering" not in self.config: - warnings.warn("Cannot find dim_ordering from json definition. Using the default instead." - "We only support th for now.") + warnings.warn( + "Cannot find dim_ordering from json definition. Using the default instead." + "We only support th for now.") return BLayer.UpSampling3D(self.klayer.size) def create_gaussiannoise(self): @@ -1580,7 +1623,8 @@ def create_highway(self): activation = None else: activation = get_activation_by_name(self.config["activation"], - "%s_%s" % (self.config["name"], self.config["activation"])) + "%s_%s" % ( + self.config["name"], self.config["activation"])) blayer = BLayer.Highway(size=int(self.input_shape[1]), with_bias=self.klayer.bias, activation=activation, @@ -1602,7 +1646,8 @@ def create_masking(self): def create_srelu(self): if "shared_axes" not in self.config: - warnings.warn("Cannot find shared_axes from json definition. Using shared_axes=None instead.") + warnings.warn( + "Cannot find shared_axes from json definition. Using shared_axes=None instead.") shape = self.input_shape[1:] t_left_init = to_bigdl_init(self.config["t_left_init"]) a_left_init = to_bigdl_init(self.config["a_left_init"]) @@ -1686,7 +1731,8 @@ def create_locallyconnected1d(self): seq.add(BLayer.Squeeze(3)) if self.config["activation"] != "linear": activation = get_activation_by_name(self.config["activation"], - "%s_%s" % (self.config["name"], self.config["activation"])) + "%s_%s" % ( + self.config["name"], self.config["activation"])) return self.fuse(seq, activation) else: return seq @@ -1721,7 +1767,8 @@ def create_locallyconnected2d(self): if self.config["activation"] != "linear": activation = get_activation_by_name(self.config["activation"], - "%s_%s" % (self.config["name"], self.config["activation"])) + "%s_%s" % ( + self.config["name"], self.config["activation"])) return self.fuse(blayer, activation) else: return blayer @@ -1734,7 +1781,7 @@ def combo_parameter_layer(self, blayer, config): BInit.Zeros()) # Keras always set this to be zeros except Exception: warning_msg = "We don't support initialization " + config["init"] + " for now. " \ - + "Using the default instead." + + "Using the default instead." 
warnings.warn(warning_msg) # "linear" means doing nothing if config["activation"] != "linear": @@ -1757,4 +1804,4 @@ def fuse(self, src_blayer, activation): # activation is a layer seq.add(src_blayer) seq.add(activation) seq.set_name(src_blayer.name()) - return seq \ No newline at end of file + return seq diff --git a/python/dllib/src/bigdl/dllib/keras/layers/torch.py b/python/dllib/src/bigdl/dllib/keras/layers/torch.py index 79ef0c94051..bfe597ae747 100644 --- a/python/dllib/src/bigdl/dllib/keras/layers/torch.py +++ b/python/dllib/src/bigdl/dllib/keras/layers/torch.py @@ -18,7 +18,7 @@ import sys -#from ..engine.topology import ZooKerasLayer +# from ..engine.topology import ZooKerasLayer from bigdl.dllib.keras.engine import ZooKerasLayer if sys.version >= '3': @@ -51,6 +51,7 @@ class Select(ZooKerasLayer): >>> select = Select(0, -1, input_shape=(3, 4), name="select1") creating: createZooKerasSelect """ + def __init__(self, dim, index, input_shape=None, **kwargs): super(Select, self).__init__(None, dim, @@ -83,6 +84,7 @@ class Narrow(ZooKerasLayer): >>> narrow = Narrow(1, 3, input_shape=(5, 6, 7), name="narrow1") creating: createZooKerasNarrow """ + def __init__(self, dim, offset, length=1, input_shape=None, **kwargs): super(Narrow, self).__init__(None, dim, @@ -119,9 +121,10 @@ class Squeeze(ZooKerasLayer): >>> squeeze3 = Squeeze((1, 2), input_shape=(1, 1, 1, 32)) creating: createZooKerasSqueeze """ + def __init__(self, dim=None, input_shape=None, **kwargs): if isinstance(dim, int): - dim = (dim, ) + dim = (dim,) super(Squeeze, self).__init__(None, dim, list(input_shape) if input_shape else None, @@ -144,6 +147,7 @@ class AddConstant(ZooKerasLayer): >>> addconstant = AddConstant(1, input_shape=(1, 4, 5)) creating: createZooKerasAddConstant """ + def __init__(self, constant, input_shape=None, **kwargs): super(AddConstant, self).__init__(None, float(constant), @@ -167,6 +171,7 @@ class MulConstant(ZooKerasLayer): >>> mulconstant = MulConstant(2.2, input_shape=(3, 4)) creating: createZooKerasMulConstant """ + def __init__(self, constant, input_shape=None, **kwargs): super(MulConstant, self).__init__(None, float(constant), @@ -195,6 +200,7 @@ class LRN2D(ZooKerasLayer): >>> lrn2d = LRN2D(1e-3, 1.2, 0.4, 4, dim_ordering="tf", input_shape=(4, 5, 6)) creating: createZooKerasLRN2D """ + def __init__(self, alpha=1e-4, k=1.0, beta=0.75, n=5, dim_ordering="th", input_shape=None, **kwargs): super(LRN2D, self).__init__(None, @@ -244,6 +250,7 @@ class ShareConvolution2D(ZooKerasLayer): >>> shareconv2d = ShareConvolution2D(32, 3, 4, activation="tanh", input_shape=(3, 128, 128)) creating: createZooKerasShareConvolution2D """ + def __init__(self, nb_filter, nb_row, nb_col, init="glorot_uniform", activation=None, subsample=(1, 1), pad_h=0, pad_w=0, propagate_back=True, dim_ordering="th", W_regularizer=None, b_regularizer=None, @@ -292,6 +299,7 @@ class CAdd(ZooKerasLayer): >>> cadd = CAdd((2, 1), input_shape=(3, )) creating: createZooKerasCAdd """ + def __init__(self, size, b_regularizer=None, input_shape=None, **kwargs): super(CAdd, self).__init__(None, size, @@ -324,6 +332,7 @@ class CMul(ZooKerasLayer): >>> cmul = CMul((2, 1), input_shape=(3, )) creating: createZooKerasCMul """ + def __init__(self, size, W_regularizer=None, input_shape=None, **kwargs): super(CMul, self).__init__(None, size, @@ -347,6 +356,7 @@ class Exp(ZooKerasLayer): >>> exp = Exp(input_shape=(2, 3, 4)) creating: createZooKerasExp """ + def __init__(self, input_shape=None, **kwargs): super(Exp, self).__init__(None, list(input_shape) 
if input_shape else None, @@ -366,6 +376,7 @@ class Identity(ZooKerasLayer): >>> identity = Identity(input_shape=(3, )) creating: createZooKerasIdentity """ + def __init__(self, input_shape=None, **kwargs): super(Identity, self).__init__(None, list(input_shape) if input_shape else None, @@ -387,6 +398,7 @@ class Log(ZooKerasLayer): >>> log = Log(input_shape=(4, 8, 8)) creating: createZooKerasLog """ + def __init__(self, input_shape=None, **kwargs): super(Log, self).__init__(None, list(input_shape) if input_shape else None, @@ -408,6 +420,7 @@ class Mul(ZooKerasLayer): >>> mul = Mul(input_shape=(3, 4, 5)) creating: createZooKerasMul """ + def __init__(self, input_shape=None, **kwargs): super(Mul, self).__init__(None, list(input_shape) if input_shape else None, @@ -434,6 +447,7 @@ class Power(ZooKerasLayer): >>> power = Power(3, input_shape=(3, )) creating: createZooKerasPower """ + def __init__(self, power, scale=1, shift=0, input_shape=None, **kwargs): super(Power, self).__init__(None, float(power), @@ -463,6 +477,7 @@ class Scale(ZooKerasLayer): >>> scale = Scale((2, 1), input_shape=(3, )) creating: createZooKerasScale """ + def __init__(self, size, input_shape=None, **kwargs): super(Scale, self).__init__(None, size, @@ -485,6 +500,7 @@ class Sqrt(ZooKerasLayer): >>> sqrt = Sqrt(input_shape=(3, )) creating: createZooKerasSqrt """ + def __init__(self, input_shape=None, **kwargs): super(Sqrt, self).__init__(None, list(input_shape) if input_shape else None, @@ -506,6 +522,7 @@ class Square(ZooKerasLayer): >>> square = Square(input_shape=(5, )) creating: createZooKerasSquare """ + def __init__(self, input_shape=None, **kwargs): super(Square, self).__init__(None, list(input_shape) if input_shape else None, @@ -528,6 +545,7 @@ class HardShrink(ZooKerasLayer): >>> hardshrink = HardShrink(input_shape=(2, 4, 8)) creating: createZooKerasHardShrink """ + def __init__(self, value=0.5, input_shape=None, **kwargs): super(HardShrink, self).__init__(None, float(value), @@ -552,6 +570,7 @@ class HardTanh(ZooKerasLayer): >>> hardtanh = HardTanh(input_shape=(3, 4)) creating: createZooKerasHardTanh """ + def __init__(self, min_value=-1, max_value=1, input_shape=None, **kwargs): super(HardTanh, self).__init__(None, float(min_value), @@ -575,6 +594,7 @@ class Negative(ZooKerasLayer): >>> negative = Negative(input_shape=(4, 5, 8)) creating: createZooKerasNegative """ + def __init__(self, input_shape=None, **kwargs): super(Negative, self).__init__(None, list(input_shape) if input_shape else None, @@ -600,6 +620,7 @@ class PReLU(ZooKerasLayer): >>> prelu = PReLU(input_shape=(3, 4, 8, 8)) creating: createZooKerasPReLU """ + def __init__(self, n_output_plane=0, input_shape=None, **kwargs): super(PReLU, self).__init__(None, n_output_plane, @@ -634,7 +655,8 @@ class RReLU(ZooKerasLayer): >>> rrelu = RReLU(input_shape=(3, 4)) creating: createZooKerasRReLU """ - def __init__(self, lower=1.0/8, upper=1.0/3, input_shape=None, **kwargs): + + def __init__(self, lower=1.0 / 8, upper=1.0 / 3, input_shape=None, **kwargs): super(RReLU, self).__init__(None, float(lower), float(upper), @@ -658,6 +680,7 @@ class SoftShrink(ZooKerasLayer): >>> softshrink = SoftShrink(input_shape=(4, 4, 8, 8)) creating: createZooKerasSoftShrink """ + def __init__(self, value=0.5, input_shape=None, **kwargs): super(SoftShrink, self).__init__(None, float(value), @@ -685,6 +708,7 @@ class WithinChannelLRN2D(ZooKerasLayer): >>> withinchannellrn2d = WithinChannelLRN2D(input_shape=(2, 3, 8, 8)) creating: createZooKerasWithinChannelLRN2D """ + def 
__init__(self, size=5, alpha=1.0, beta=0.75, input_shape=None, **kwargs): super(WithinChannelLRN2D, self).__init__(None, size, @@ -712,6 +736,7 @@ class BinaryThreshold(ZooKerasLayer): >>> binarythreshold = BinaryThreshold(input_shape=(2, 3, 4, 5)) creating: createZooKerasBinaryThreshold """ + def __init__(self, value=1e-6, input_shape=None, **kwargs): super(BinaryThreshold, self).__init__(None, float(value), @@ -734,6 +759,7 @@ class Threshold(ZooKerasLayer): >>> threshold = Threshold(input_shape=(2, 3, 4, 5)) creating: createZooKerasThreshold """ + def __init__(self, th=1e-6, v=0.0, input_shape=None, **kwargs): super(Threshold, self).__init__(None, float(th), @@ -755,6 +781,7 @@ class GaussianSampler(ZooKerasLayer): >>> gaussianSampler = GaussianSampler(input_shape=[(3,),(3,)]) creating: createZooKerasGaussianSampler """ + def __init__(self, input_shape=None, **kwargs): super(GaussianSampler, self).__init__(None, list(input_shape) if input_shape else None, @@ -780,6 +807,7 @@ class ResizeBilinear(ZooKerasLayer): >>> resizeBilinear = ResizeBilinear(10, 20, input_shape=(2, 3, 5, 7)) creating: createZooKerasResizeBilinear """ + def __init__(self, output_height, output_width, align_corner=False, dim_ordering="th", input_shape=None, **kwargs): super(ResizeBilinear, self).__init__(None, @@ -802,6 +830,7 @@ class SelectTable(ZooKerasLayer): >>> selectTable = SelectTable(0, input_shape=[[2, 3], [5, 7]]) creating: createZooKerasSelectTable """ + def __init__(self, index, input_shape=None, **kwargs): super(SelectTable, self).__init__(None, index, diff --git a/python/dllib/src/bigdl/dllib/keras/layers/wrappers.py b/python/dllib/src/bigdl/dllib/keras/layers/wrappers.py index 04dd4ec00bd..f7c35342b2b 100644 --- a/python/dllib/src/bigdl/dllib/keras/layers/wrappers.py +++ b/python/dllib/src/bigdl/dllib/keras/layers/wrappers.py @@ -22,6 +22,7 @@ long = int unicode = str + class TimeDistributed(ZooKerasLayer): """ TimeDistributed wrapper. @@ -44,6 +45,7 @@ class TimeDistributed(ZooKerasLayer): creating: createZooKerasDense creating: createZooKerasTimeDistributed """ + def __init__(self, layer, input_shape=None, **kwargs): super(TimeDistributed, self).__init__(None, layer, @@ -74,6 +76,7 @@ class Bidirectional(ZooKerasLayer): creating: createZooKerasLSTM creating: createZooKerasBidirectional """ + def __init__(self, layer, merge_mode="concat", input_shape=None, **kwargs): super(Bidirectional, self).__init__(None, layer, @@ -99,6 +102,7 @@ class KerasLayerWrapper(ZooKerasLayer): >>> kerasLayer = KerasLayerWrapper(linear, input_shape=(100, )) creating: createZooKerasKerasLayerWrapper """ + def __init__(self, torch_layer, input_shape=None, **kwargs): super(KerasLayerWrapper, self).__init__(None, torch_layer, diff --git a/python/dllib/src/bigdl/dllib/models/inception/__init__.py b/python/dllib/src/bigdl/dllib/models/inception/__init__.py index 1d7a97cae5e..2151a805423 100644 --- a/python/dllib/src/bigdl/dllib/models/inception/__init__.py +++ b/python/dllib/src/bigdl/dllib/models/inception/__init__.py @@ -13,4 +13,3 @@ # See the License for the specific language governing permissions and # limitations under the License. 
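Every torch-style layer touched above follows the same pattern: positional arguments plus an optional input_shape, with the backing JVM layer created at construction time (hence the "creating: createZooKeras..." lines in the doctests). A small sketch that simply reuses the doctest calls, assuming the classes are imported directly from the torch.py module shown here:

    from bigdl.dllib.keras.layers.torch import Select, Narrow, Squeeze, AddConstant, RReLU

    select = Select(0, -1, input_shape=(3, 4), name="select1")    # index -1 along dim 0
    narrow = Narrow(1, 3, input_shape=(5, 6, 7), name="narrow1")  # length-1 slice of dim 1 at offset 3
    squeeze = Squeeze((1, 2), input_shape=(1, 1, 1, 32))          # drop the singleton dims 1 and 2
    add_one = AddConstant(1, input_shape=(1, 4, 5))               # element-wise +1
    rrelu = RReLU(input_shape=(3, 4))                             # defaults lower=1/8, upper=1/3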
# - diff --git a/python/dllib/src/bigdl/dllib/models/inception/inception.py b/python/dllib/src/bigdl/dllib/models/inception/inception.py index 31fbee0ed1e..97a0c788f8d 100644 --- a/python/dllib/src/bigdl/dllib/models/inception/inception.py +++ b/python/dllib/src/bigdl/dllib/models/inception/inception.py @@ -41,7 +41,7 @@ def inception_layer_v1(input_size, config, name_prefix=""): concat = Concat(2) conv1 = Sequential() conv1.add(SpatialConvolution(input_size, config[1][1], 1, 1, 1, 1) - .set_init_method(weight_init_method=Xavier(),bias_init_method=ConstInitMethod(0.1)) + .set_init_method(weight_init_method=Xavier(), bias_init_method=ConstInitMethod(0.1)) .set_name(name_prefix + "1x1")) conv1.add(ReLU(True).set_name(name_prefix + "relu_1x1")) concat.add(conv1) @@ -128,7 +128,8 @@ def inception_v1_no_aux_classifier(class_num, has_dropout=True): def inception_v1(class_num, has_dropout=True): feature1 = Sequential() feature1.add(SpatialConvolution(3, 64, 7, 7, 2, 2, 3, 3, 1, False) - .set_init_method(weight_init_method=Xavier(), bias_init_method=ConstInitMethod(0.1)) + .set_init_method(weight_init_method=Xavier(), + bias_init_method=ConstInitMethod(0.1)) .set_name("conv1/7x7_s2")) feature1.add(ReLU(True).set_name("conv1/relu_7x7")) feature1.add( @@ -137,26 +138,31 @@ def inception_v1(class_num, has_dropout=True): feature1.add(SpatialCrossMapLRN(5, 0.0001, 0.75) .set_name("pool1/norm1")) feature1.add(SpatialConvolution(64, 64, 1, 1, 1, 1) - .set_init_method(weight_init_method=Xavier(), bias_init_method=ConstInitMethod(0.1)) + .set_init_method(weight_init_method=Xavier(), + bias_init_method=ConstInitMethod(0.1)) .set_name("conv2/3x3_reduce")) feature1.add(ReLU(True).set_name("conv2/relu_3x3_reduce")) feature1.add(SpatialConvolution(64, 192, 3, 3, 1, 1, 1, 1) - .set_init_method(weight_init_method=Xavier(), bias_init_method=ConstInitMethod(0.1)) + .set_init_method(weight_init_method=Xavier(), + bias_init_method=ConstInitMethod(0.1)) .set_name("conv2/3x3")) feature1.add(ReLU(True).set_name("conv2/relu_3x3")) feature1.add(SpatialCrossMapLRN(5, 0.0001, 0.75).set_name("conv2/norm2")) feature1.add( SpatialMaxPooling(3, 3, 2, 2, to_ceil=True).set_name("pool2/3x3_s2")) - feature1.add(inception_layer_v1(192, t([ - t([64]), t([96, 128]), t([16, 32]), t([32])]), + feature1.add(inception_layer_v1(192, + t([t([64]), t([96, 128]), t([16, 32]), t([32])]), "inception_3a/")) - feature1.add(inception_layer_v1(256, t([ - t([128]), t([128, 192]), t([32, 96]), t([64])]), + + feature1.add(inception_layer_v1(256, + t([t([128]), t([128, 192]), t([32, 96]), t([64])]), "inception_3b/")) + feature1.add( SpatialMaxPooling(3, 3, 2, 2, to_ceil=True).set_name("pool3/3x3_s2")) - feature1.add(inception_layer_v1(480, t([ - t([192]), t([96, 208]), t([16, 48]), t([64])]), + + feature1.add(inception_layer_v1(480, + t([t([192]), t([96, 208]), t([16, 48]), t([64])]), "inception_4a/")) output1 = Sequential() @@ -249,27 +255,41 @@ def config_option_parser(): help="url of hdfs folder store the hadoop sequence files") parser.add_option("--model", type=str, dest="model", default="", help="model snapshot location") parser.add_option("--state", type=str, dest="state", default="", help="state snapshot location") - parser.add_option("--checkpoint", type=str, dest="checkpoint", default="", help="where to cache the model") + parser.add_option("--checkpoint", type=str, dest="checkpoint", default="", + help="where to cache the model") parser.add_option("-o", "--overwrite", action="store_true", dest="overwrite", default=False, help="overwrite 
checkpoint files") - parser.add_option("-e", "--maxEpoch", type=int, dest="maxEpoch", default=0, help="epoch numbers") - parser.add_option("-i", "--maxIteration", type=int, dest="maxIteration", default=62000, help="iteration numbers") - parser.add_option("-l", "--learningRate", type=float, dest="learningRate", default=0.01, help="learning rate") - parser.add_option("--warmupEpoch", type=int, dest="warmupEpoch", default=0, help="warm up epoch numbers") + parser.add_option("-e", "--maxEpoch", type=int, dest="maxEpoch", default=0, + help="epoch numbers") + parser.add_option("-i", "--maxIteration", type=int, dest="maxIteration", default=62000, + help="iteration numbers") + parser.add_option("-l", "--learningRate", type=float, dest="learningRate", default=0.01, + help="learning rate") + parser.add_option("--warmupEpoch", type=int, dest="warmupEpoch", default=0, + help="warm up epoch numbers") parser.add_option("--maxLr", type=float, dest="maxLr", default=0.0, help="max Lr after warm up") parser.add_option("-b", "--batchSize", type=int, dest="batchSize", help="batch size") parser.add_option("--classNum", type=int, dest="classNum", default=1000, help="class number") - parser.add_option("--weightDecay", type=float, dest="weightDecay", default=0.0001, help="weight decay") + parser.add_option("--weightDecay", type=float, dest="weightDecay", default=0.0001, + help="weight decay") parser.add_option("--checkpointIteration", type=int, dest="checkpointIteration", default=620, help="checkpoint interval of iterations") - parser.add_option("--gradientMin", type=float, dest="gradientMin", default=0.0, help="min gradient clipping by") - parser.add_option("--gradientMax", type=float, dest="gradientMax", default=0.0, help="max gradient clipping by") - parser.add_option("--gradientL2NormThreshold", type=float, dest="gradientL2NormThreshold", default=0.0, help="gradient L2-Norm threshold") - parser.add_option("--executor-cores", type=int, dest="cores", default=4, help="number of executor cores") - parser.add_option("--num-executors", type=int, dest="executors", default=16, help="number of executors") - parser.add_option("--executor-memory", type=str, dest="executorMemory", default="30g", help="executor memory") - parser.add_option("--driver-memory", type=str, dest="driverMemory", default="30g", help="driver memory") - parser.add_option("--deploy-mode", type=str, dest="deployMode", default="yarn-client", help="yarn deploy mode, yarn-client or yarn-cluster") + parser.add_option("--gradientMin", type=float, dest="gradientMin", default=0.0, + help="min gradient clipping by") + parser.add_option("--gradientMax", type=float, dest="gradientMax", default=0.0, + help="max gradient clipping by") + parser.add_option("--gradientL2NormThreshold", type=float, dest="gradientL2NormThreshold", + default=0.0, help="gradient L2-Norm threshold") + parser.add_option("--executor-cores", type=int, dest="cores", default=4, + help="number of executor cores") + parser.add_option("--num-executors", type=int, dest="executors", default=16, + help="number of executors") + parser.add_option("--executor-memory", type=str, dest="executorMemory", default="30g", + help="executor memory") + parser.add_option("--driver-memory", type=str, dest="driverMemory", default="30g", + help="driver memory") + parser.add_option("--deploy-mode", type=str, dest="deployMode", default="yarn-client", + help="yarn deploy mode, yarn-client or yarn-cluster") return parser @@ -286,11 +306,11 @@ def config_option_parser(): # init hadoop_conf = 
os.environ.get("HADOOP_CONF_DIR") assert hadoop_conf, "Directory path to hadoop conf not found for yarn-client mode. Please " \ - "set the environment variable HADOOP_CONF_DIR" + "set the environment variable HADOOP_CONF_DIR" - conf = create_spark_conf().set("spark.executor.memory", options.executorMemory)\ - .set("spark.executor.cores", options.cores)\ - .set("spark.executor.instances", options.executors)\ + conf = create_spark_conf().set("spark.executor.memory", options.executorMemory) \ + .set("spark.executor.cores", options.cores) \ + .set("spark.executor.instances", options.executors) \ .set("spark.driver.memory", options.driverMemory) sc = init_nncontext(conf, cluster_mode=options.deployMode, hadoop_conf=hadoop_conf) @@ -301,7 +321,8 @@ def config_option_parser(): RandomCropper(image_size, image_size, True, "Random", 3), ChannelNormalize(123.0, 117.0, 104.0), MatToTensor(to_rgb=False), - ImageFrameToSample(input_keys=["imageTensor"], target_keys=["label"]) + ImageFrameToSample(input_keys=["imageTensor"], + target_keys=["label"]) ]) raw_train_data = get_inception_data(options.folder, sc, "train") train_data = DataSet.image_frame(raw_train_data).transform(train_transformer) @@ -311,7 +332,8 @@ def config_option_parser(): RandomCropper(image_size, image_size, False, "Center", 3), ChannelNormalize(123.0, 117.0, 104.0), MatToTensor(to_rgb=False), - ImageFrameToSample(input_keys=["imageTensor"], target_keys=["label"]) + ImageFrameToSample(input_keys=["imageTensor"], + target_keys=["label"]) ]) raw_val_data = get_inception_data(options.folder, sc, "val") val_data = DataSet.image_frame(raw_val_data).transform(val_transformer) @@ -341,12 +363,13 @@ def config_option_parser(): maxlr = options.maxLr else: maxlr = options.learningRate - warmupDelta = (maxlr - options.learningRate)/warmup_iteration + warmupDelta = (maxlr - options.learningRate) / warmup_iteration polyIteration = maxIteration - warmup_iteration lrSchedule = SequentialSchedule(iterationPerEpoch) lrSchedule.add(Warmup(warmupDelta), warmup_iteration) lrSchedule.add(Poly(0.5, maxIteration), polyIteration) - optim = SGD(learningrate=options.learningRate, learningrate_decay=0.0, weightdecay=options.weightDecay, + optim = SGD(learningrate=options.learningRate, learningrate_decay=0.0, + weightdecay=options.weightDecay, momentum=0.9, dampening=0.0, nesterov=False, leaningrate_schedule=lrSchedule) diff --git a/python/dllib/src/bigdl/dllib/models/lenet/lenet5.py b/python/dllib/src/bigdl/dllib/models/lenet/lenet5.py index 5033d175f1e..96e386c6f89 100644 --- a/python/dllib/src/bigdl/dllib/models/lenet/lenet5.py +++ b/python/dllib/src/bigdl/dllib/models/lenet/lenet5.py @@ -48,8 +48,8 @@ def build_model(class_num): # The format index of input or output format can be checked # in: ${BigDL-core}/native-dnn/src/main/java/com/intel/analytics/bigdl/mkl/Memory.java - model.set_input_formats([7]) # Set input format to nchw - model.set_output_formats([4]) # Set output format to nc + model.set_input_formats([7]) # Set input format to nchw + model.set_output_formats([4]) # Set output format to nc return model @@ -65,18 +65,19 @@ def build_model(class_num): parser.add_option("-d", "--dataPath", dest="dataPath", default="/tmp/mnist") parser.add_option("--optimizerVersion", dest="optimizerVersion", default="optimizerV1") parser.add_option("--cluster-mode", dest="clusterMode", default="local") - parser.add_option("--mkl-dnn", action="store_true", dest="mklDnn", default=False, help="if enable mkldnn") + parser.add_option("--mkl-dnn", action="store_true", 
dest="mklDnn", default=False, + help="if enable mkldnn") (options, args) = parser.parse_args(sys.argv) - conf={} + conf = {} if options.mklDnn: conf["spark.driver.extraJavaOptions"] = "-Dbigdl.engineType=mkldnn" conf["spark.executor.extraJavaOptions"] = "-Dbigdl.engineType=mkldnn" if options.clusterMode.startswith("yarn"): hadoop_conf = os.environ.get("HADOOP_CONF_DIR") - assert hadoop_conf, "Directory path to hadoop conf not found for yarn-client mode. Please " \ - "set the environment variable HADOOP_CONF_DIR" + assert hadoop_conf, "Directory path to hadoop conf not found for yarn-client mode. Please" \ + "set the environment variable HADOOP_CONF_DIR" spark_conf = create_spark_conf().set("spark.executor.memory", "5g") \ .set("spark.executor.cores", 2) \ .set("spark.executor.instances", 2) \ @@ -88,7 +89,7 @@ def build_model(class_num): else: sc = init_nncontext(spark_conf, cluster_mode="yarn-cluster", hadoop_conf=hadoop_conf) elif options.clusterMode == "local": - spark_conf = SparkConf().set("spark.driver.memory", "10g")\ + spark_conf = SparkConf().set("spark.driver.memory", "10g") \ .set("spark.driver.cores", 4) sc = init_nncontext(spark_conf, cluster_mode="local") else: @@ -97,7 +98,8 @@ def build_model(class_num): set_optimizer_version(options.optimizerVersion) # In order to use MklDnn as the backend, you should: - # 1. Define a graph model with Model(graph container) or convert a sequential model to a graph model + # 1. Define a graph model with Model(graph container) or convert a sequential model to a graph + # model # 2. Specify the input and output formats of it. # BigDL needs these format information to build a graph running with MKL-DNN backend # 3. Run spark-submit command with correct configurations diff --git a/python/dllib/src/bigdl/dllib/models/local_lenet/__init__.py b/python/dllib/src/bigdl/dllib/models/local_lenet/__init__.py index 1d7a97cae5e..2151a805423 100644 --- a/python/dllib/src/bigdl/dllib/models/local_lenet/__init__.py +++ b/python/dllib/src/bigdl/dllib/models/local_lenet/__init__.py @@ -13,4 +13,3 @@ # See the License for the specific language governing permissions and # limitations under the License. # - diff --git a/python/dllib/src/bigdl/dllib/models/ml_pipeline/__init__.py b/python/dllib/src/bigdl/dllib/models/ml_pipeline/__init__.py index 1d7a97cae5e..2151a805423 100644 --- a/python/dllib/src/bigdl/dllib/models/ml_pipeline/__init__.py +++ b/python/dllib/src/bigdl/dllib/models/ml_pipeline/__init__.py @@ -13,4 +13,3 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# - diff --git a/python/dllib/src/bigdl/dllib/models/ml_pipeline/dl_classifier.py b/python/dllib/src/bigdl/dllib/models/ml_pipeline/dl_classifier.py index 36d295e1167..1f7a723d56b 100644 --- a/python/dllib/src/bigdl/dllib/models/ml_pipeline/dl_classifier.py +++ b/python/dllib/src/bigdl/dllib/models/ml_pipeline/dl_classifier.py @@ -53,6 +53,7 @@ def getBatchSize(self): """ return self.getOrDefault(self.batchSize) + class HasMaxEpoch(Params): maxEpoch = Param(Params._dummy(), "maxEpoch", "number of max Epoch") @@ -73,6 +74,7 @@ def getMaxEpoch(self): """ return self.getOrDefault(self.maxEpoch) + class HasFeatureSize(Params): featureSize = Param(Params._dummy(), "featureSize", "size of the feature") @@ -90,6 +92,7 @@ def setFeatureSize(self, val): def getFeatureSize(self): return self.getOrDefault(self.featureSize) + class HasLearningRate(Params): learningRate = Param(Params._dummy(), "learningRate", "learning rate") @@ -110,13 +113,16 @@ def getLearningRate(self): """ return self.getOrDefault(self.learningRate) -class DLEstimator(Estimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, HasBatchSize, HasMaxEpoch, HasLearningRate, JavaValue): + +class DLEstimator(Estimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, HasBatchSize, + HasMaxEpoch, HasLearningRate, JavaValue): """ .. note:: Deprecated in 0.5.0. `DLEstimator` has been migrated to package `bigdl.dlframes`. This will be removed in BigDL 0.6. """ - def __init__(self, model, criterion, feature_size, label_size, jvalue=None, bigdl_type="float"): + + def __init__(self, model, criterion, feature_size, label_size, jvalue=None, bigdl_type="float"): super(DLEstimator, self).__init__() self.value = jvalue if jvalue else callBigDlFunc( bigdl_type, self.jvm_class_constructor(), model, criterion, feature_size, label_size) @@ -124,7 +130,7 @@ def __init__(self, model, criterion, feature_size, label_size, jvalue=None, big self.featureSize = feature_size def _fit(self, dataset): - #self._transfer_params_to_java() + # self._transfer_params_to_java() jmodel = callBigDlFunc(self.bigdl_type, "fitEstimator", self.value, dataset) model = DLModel.of(jmodel, self.featureSize, self.bigdl_type) return model @@ -136,7 +142,8 @@ class DLModel(Model, HasFeaturesCol, HasPredictionCol, HasBatchSize, HasFeatureS `bigdl.dlframes`. This will be removed in BigDL 0.6. """ - def __init__(self, model, featureSize, jvalue=None, bigdl_type="float"): + + def __init__(self, model, featureSize, jvalue=None, bigdl_type="float"): super(DLModel, self).__init__() self.value = jvalue if jvalue else callBigDlFunc( bigdl_type, self.jvm_class_constructor(), model, featureSize) @@ -158,7 +165,8 @@ class DLClassifier(DLEstimator): `bigdl.dlframes`. This will be removed in BigDL 0.6. """ - def __init__(self, model, criterion, feature_size, bigdl_type="float"): + + def __init__(self, model, criterion, feature_size, bigdl_type="float"): super(DLClassifier, self).__init__(model, criterion, feature_size, [1], None, bigdl_type) def _fit(self, dataset): @@ -173,7 +181,8 @@ class DLClassifierModel(DLModel): `bigdl.dlframes`. This will be removed in BigDL 0.6. 
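The deprecated wrappers above still follow the standard Spark ML Estimator/Model contract: fit on a DataFrame, then transform for prediction. A hedged sketch of that flow; the toy network, the [28, 28] feature size, the setter names and the DataFrame df are illustrative assumptions rather than values taken from this file:

    from bigdl.dllib.models.ml_pipeline.dl_classifier import DLClassifier
    from bigdl.dllib.nn.layer import Sequential, Reshape, Linear, LogSoftMax
    from bigdl.dllib.nn.criterion import ClassNLLCriterion

    # df: a DataFrame with "features" (flattened 28x28 images) and "label" columns.
    net = Sequential().add(Reshape([28 * 28])).add(Linear(28 * 28, 10)).add(LogSoftMax())
    classifier = DLClassifier(net, ClassNLLCriterion(), [28, 28])
    classifier.setBatchSize(64)      # setters assumed from the Has* mixins above
    classifier.setMaxEpoch(10)
    dl_model = classifier.fit(df)    # returns a DLClassifierModel
    predictions = dl_model.transform(df)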
""" - def __init__(self, model, featureSize, jvalue=None, bigdl_type="float"): + + def __init__(self, model, featureSize, jvalue=None, bigdl_type="float"): super(DLClassifierModel, self).__init__(model, featureSize, jvalue, bigdl_type) def _transform(self, dataset): @@ -181,5 +190,6 @@ def _transform(self, dataset): @classmethod def of(self, jvalue, feature_size=None, bigdl_type="float"): - model = DLClassifierModel(model=None, featureSize=feature_size, jvalue=jvalue, bigdl_type=bigdl_type) + model = DLClassifierModel(model=None, featureSize=feature_size, jvalue=jvalue, + bigdl_type=bigdl_type) return model diff --git a/python/dllib/src/bigdl/dllib/models/rnn/__init__.py b/python/dllib/src/bigdl/dllib/models/rnn/__init__.py index 1d7a97cae5e..2151a805423 100644 --- a/python/dllib/src/bigdl/dllib/models/rnn/__init__.py +++ b/python/dllib/src/bigdl/dllib/models/rnn/__init__.py @@ -13,4 +13,3 @@ # See the License for the specific language governing permissions and # limitations under the License. # - diff --git a/python/dllib/src/bigdl/dllib/models/rnn/rnnexample.py b/python/dllib/src/bigdl/dllib/models/rnn/rnnexample.py index f2f19a94c20..5e1d7cde5f1 100644 --- a/python/dllib/src/bigdl/dllib/models/rnn/rnnexample.py +++ b/python/dllib/src/bigdl/dllib/models/rnn/rnnexample.py @@ -27,14 +27,16 @@ from bigdl.dllib.utils.common import * from bigdl.dllib.utils.common import Sample + def download_data(dest_dir): TINYSHAKESPEARE_URL = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt' # noqa file_name = "input.txt" file_abs_path = base.maybe_download(file_name, dest_dir, TINYSHAKESPEARE_URL) return file_abs_path + def prepare_data(sc, folder, vocabsize, training_split): - if not folder.startswith( 'hdfs://' ): + if not folder.startswith('hdfs://'): file = download_data(folder) else: file = folder @@ -51,17 +53,19 @@ def prepare_data(sc, folder, vocabsize, training_split): print("max length %s" % train_max_len) words = train_tokens.flatMap(lambda x: x) - print("%s words and %s sentences processed in train data" % (words.count(), train_tokens.count())) + print( + "%s words and %s sentences processed in train data" % (words.count(), train_tokens.count())) val_max_len = val_tokens.map(lambda x: len(x)).max() print("val max length %s" % val_max_len) val_words = val_tokens.flatMap(lambda x: x) - print("%s words and %s sentences processed in validation data" % (val_words.count(), val_tokens.count())) + print("%s words and %s sentences processed in validation data" % ( + val_words.count(), val_tokens.count())) sort_words = words.map(lambda w: (w, 1)) \ - .reduceByKey(lambda a, b: a + b) \ - .sortBy(lambda w_c: w_c[1]) + .reduceByKey(lambda a, b: a + b) \ + .sortBy(lambda w_c: w_c[1]) vocabulary = np.array(sort_words.map(lambda w: w[0]).collect()) fre_len = vocabulary.size @@ -69,8 +73,8 @@ def prepare_data(sc, folder, vocabsize, training_split): length = fre_len else: length = vocabsize - discard_vocab = vocabulary[: fre_len-length] - used_vocab = vocabulary[fre_len-length: fre_len] + discard_vocab = vocabulary[: fre_len - length] + used_vocab = vocabulary[fre_len - length: fre_len] used_vocab_size = used_vocab.size index = np.arange(used_vocab_size) index2word = dict(enumerate(used_vocab)) @@ -86,7 +90,7 @@ def text2labeled(sent): return data, label def labeled2onehotformat(labeled_sent): - label = [x+1 for x in labeled_sent[1]] + label = [x + 1 for x in labeled_sent[1]] size = len(labeled_sent[0]) feature_onehot = np.zeros(size * total_vocab_len, 
dtype='int').reshape( [size, total_vocab_len]) @@ -116,12 +120,13 @@ def padding(features, label, length): return sample_rdd, val_sample_rdd, total_vocab_len + def build_model(input_size, hidden_size, output_size, model_type): # Model Type is simple RNN if model_type == "rnn": model = Sequential() model.add(Recurrent() - .add(RnnCell(input_size, hidden_size, Tanh())))\ + .add(RnnCell(input_size, hidden_size, Tanh()))) \ .add(TimeDistributed(Linear(hidden_size, output_size))) model.reset() @@ -133,7 +138,7 @@ def build_model(input_size, hidden_size, output_size, model_type): elif model_type == "lstm": model = Sequential() model.add(Recurrent() - .add(LSTM(input_size, hidden_size)))\ + .add(LSTM(input_size, hidden_size))) \ .add(TimeDistributed(Linear(hidden_size, output_size))) model.reset() @@ -146,11 +151,12 @@ def build_model(input_size, hidden_size, output_size, model_type): # The format index of input or output format can be checked # in: ${BigDL-core}/native-dnn/src/main/java/com/intel/analytics/bigdl/mkl/Memory.java - model.set_input_formats([27]) # Set input format to ntc - model.set_output_formats([27]) # Set output format to ntc + model.set_input_formats([27]) # Set input format to ntc + model.set_output_formats([27]) # Set output format to ntc return model + if __name__ == "__main__": parser = OptionParser() diff --git a/python/dllib/src/bigdl/dllib/models/textclassifier/__init__.py b/python/dllib/src/bigdl/dllib/models/textclassifier/__init__.py index 1d7a97cae5e..2151a805423 100644 --- a/python/dllib/src/bigdl/dllib/models/textclassifier/__init__.py +++ b/python/dllib/src/bigdl/dllib/models/textclassifier/__init__.py @@ -13,4 +13,3 @@ # See the License for the specific language governing permissions and # limitations under the License. # - diff --git a/python/dllib/src/bigdl/dllib/models/textclassifier/textclassifier.py b/python/dllib/src/bigdl/dllib/models/textclassifier/textclassifier.py index da422f24552..f10e03d8526 100644 --- a/python/dllib/src/bigdl/dllib/models/textclassifier/textclassifier.py +++ b/python/dllib/src/bigdl/dllib/models/textclassifier/textclassifier.py @@ -14,7 +14,6 @@ # limitations under the License. 
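build_model in rnnexample.py above wires every cell type into the same skeleton: a Recurrent container holding the cell, followed by a TimeDistributed linear decoder. A trimmed sketch of the LSTM branch with illustrative sizes, assuming the layer classes come from bigdl.dllib.nn.layer (the example scripts pull them in with wildcard imports):

    from bigdl.dllib.nn.layer import Sequential, Recurrent, LSTM, TimeDistributed, Linear

    vocab_size, hidden_size = 4000, 40   # illustrative values, not taken from the script
    model = Sequential()
    model.add(Recurrent()
              .add(LSTM(vocab_size, hidden_size))) \
         .add(TimeDistributed(Linear(hidden_size, vocab_size)))
    model.reset()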
# - import itertools import re from optparse import OptionParser @@ -40,6 +39,7 @@ def analyze_texts(data_rdd): def index(w_c_i): ((w, c), i) = w_c_i return (w, (i + 1, c)) + return data_rdd.flatMap(lambda text_label: text_to_words(text_label[0])) \ .map(lambda word: (word, 1)).reduceByKey(lambda a, b: a + b) \ .sortBy(lambda w_c: - w_c[1]).zipWithIndex() \ @@ -127,7 +127,7 @@ def train(sc, data_path, lambda vectors_label: to_sample(vectors_label[0], vectors_label[1], embedding_dim)) train_rdd, val_rdd = sample_rdd.randomSplit( - [training_split, 1-training_split]) + [training_split, 1 - training_split]) optimizer = Optimizer.create( model=build_model(news20.CLASS_NUM), @@ -167,7 +167,7 @@ def train(sc, data_path, parser.add_option("-p", "--p", dest="p", default="0.0") parser.add_option("-d", "--data_path", dest="data_path", default="/tmp/news20/") parser.add_option("--optimizerVersion", dest="optimizerVersion", default="optimizerV1") - parser.add_option("--on-yarn", action="store_true", dest="onYarn", default=False) + parser.add_option("--on-yarn", action="store_true", dest="onYarn", default=False) (options, args) = parser.parse_args(sys.argv) if options.action == "train": @@ -183,15 +183,15 @@ def train(sc, data_path, data_path = options.data_path if options.onYarn: hadoop_conf = os.environ.get("HADOOP_CONF_DIR") - assert hadoop_conf, "Directory path to hadoop conf not found for yarn-client mode. Please " \ - "set the environment variable HADOOP_CONF_DIR" + assert hadoop_conf, "Directory path to hadoop conf not found for yarn-client mode." \ + " Please set the environment variable HADOOP_CONF_DIR" conda_env_name = detect_conda_env_name() sc = init_spark_on_yarn(hadoop_conf=hadoop_conf, - conda_name=conda_env_name, - num_executors=2, - executor_cores=2, - executor_memory="20g", - driver_memory="10g") + conda_name=conda_env_name, + num_executors=2, + executor_cores=2, + executor_memory="20g", + driver_memory="10g") else: conf = {"spark.driver.memory": "40g"} sc = init_spark_on_local(cores=4, conf=conf) diff --git a/python/dllib/src/bigdl/dllib/models/utils/__init__.py b/python/dllib/src/bigdl/dllib/models/utils/__init__.py index 1d7a97cae5e..2151a805423 100644 --- a/python/dllib/src/bigdl/dllib/models/utils/__init__.py +++ b/python/dllib/src/bigdl/dllib/models/utils/__init__.py @@ -13,4 +13,3 @@ # See the License for the specific language governing permissions and # limitations under the License. # - diff --git a/python/dllib/src/bigdl/dllib/models/utils/model_broadcast.py b/python/dllib/src/bigdl/dllib/models/utils/model_broadcast.py index 7aa16b8be58..7d60fc0b680 100644 --- a/python/dllib/src/bigdl/dllib/models/utils/model_broadcast.py +++ b/python/dllib/src/bigdl/dllib/models/utils/model_broadcast.py @@ -23,13 +23,16 @@ from pyspark.broadcast import _from_id from bigdl.dllib.nn.layer import Model + def _from_id_and_type(bid, bigdl_type): result = _from_id(bid) return ModelBroadcast(path=result._path, bigdl_type=bigdl_type) + def broadcast_model(sc, layer): return ModelBroadcast(sc, layer, sc._pickled_broadcast_vars) + class ModelBroadcast(Broadcast): def __init__(self, sc=None, layer=None, pickle_registry=None, path=None, bigdl_type="float"): diff --git a/python/dllib/src/bigdl/dllib/nn/criterion.py b/python/dllib/src/bigdl/dllib/nn/criterion.py index 642637f1689..a05be547df0 100644 --- a/python/dllib/src/bigdl/dllib/nn/criterion.py +++ b/python/dllib/src/bigdl/dllib/nn/criterion.py @@ -33,6 +33,7 @@ class Criterion(JavaValue): Criterion is helpful to train a neural network. 
Given an input and a target, they compute a gradient according to a given loss function. """ + def __init__(self, jvalue, bigdl_type, *args): self.value = jvalue if jvalue else callBigDlFunc( bigdl_type, JavaValue.jvm_class_constructor(self), *args) @@ -97,7 +98,6 @@ def of(cls, jcriterion, bigdl_type="float"): class ClassNLLCriterion(Criterion): - ''' The negative log likelihood criterion. It is useful to train a classification problem with n classes. If provided, the optional argument weights should be a 1D Tensor assigning weight to @@ -153,7 +153,6 @@ def __init__(self, class MSECriterion(Criterion): - ''' Creates a criterion that measures the mean squared error between n elements in the input x and output y: @@ -179,7 +178,6 @@ def __init__(self, bigdl_type="float"): class AbsCriterion(Criterion): - ''' measures the mean absolute value of the element-wise difference between input @@ -196,7 +194,6 @@ def __init__(self, class ClassSimplexCriterion(Criterion): - ''' ClassSimplexCriterion implements a criterion for classification. It learns an embedding per class, where each class' embedding is a @@ -217,7 +214,6 @@ def __init__(self, class CosineDistanceCriterion(Criterion): - """ Creates a criterion that measures the loss given an input and target, Loss = 1 - cos(x, y) @@ -238,7 +234,6 @@ def __init__(self, class CosineEmbeddingCriterion(Criterion): - """ Creates a criterion that measures the loss given an input x = {x1, x2}, a table of two Tensors, and a Tensor label y with values 1 or -1. @@ -265,7 +260,6 @@ def __init__(self, class DistKLDivCriterion(Criterion): - ''' The Kullback-Leibler divergence criterion @@ -283,6 +277,7 @@ def __init__(self, super(DistKLDivCriterion, self).__init__(None, bigdl_type, size_average) + class CategoricalCrossEntropy(Criterion): """ This criterion is same with cross entropy criterion, except it takes a one-hot format target @@ -290,11 +285,12 @@ class CategoricalCrossEntropy(Criterion): >>> cce = CategoricalCrossEntropy() creating: createCategoricalCrossEntropy """ + def __init__(self, bigdl_type="float"): super(CategoricalCrossEntropy, self).__init__(None, bigdl_type) -class HingeEmbeddingCriterion(Criterion): +class HingeEmbeddingCriterion(Criterion): ''' Creates a criterion that measures the loss given an input x which is a 1-dimensional vector and a label y (1 or -1). @@ -324,7 +320,6 @@ def __init__(self, class L1HingeEmbeddingCriterion(Criterion): - ''' Creates a criterion that measures the loss given an input x = {x1, x2}, a table of two Tensors, and a label y (1 or -1): @@ -354,13 +349,14 @@ def __init__(self, class MarginCriterion(Criterion): - ''' Creates a criterion that optimizes a two-class classification hinge loss (margin-based loss) between input x (a Tensor of dimension 1) and output y. - When margin = 1, size_average = True and squared = False, this is the same as hinge loss in keras; - When margin = 1, size_average = False and squared = True, this is the same as squared_hinge loss in keras. + When margin = 1, size_average = True and squared = False, this is the same as hinge loss in + keras; + When margin = 1, size_average = False and squared = True, this is the same as squared_hinge loss + in keras. :param margin: if unspecified, is by default 1. 
:param size_average: size average in a mini-batch @@ -383,7 +379,6 @@ def __init__(self, class MarginRankingCriterion(Criterion): - ''' Creates a criterion that measures the loss given an input x = {x1, x2}, a table of two Tensors of size 1 (they contain only scalars), and a label y (1 or -1). @@ -410,7 +405,6 @@ def __init__(self, class MultiCriterion(Criterion): - ''' a weighted sum of other criterions each applied to the same input and target @@ -433,7 +427,6 @@ def add(self, criterion, weight=1.0): class MultiLabelMarginCriterion(Criterion): - ''' Creates a criterion that optimizes a multi-class multi-classification hinge loss ( margin-based loss) between input x and output y (which is a Tensor of target class indices) @@ -454,7 +447,6 @@ def __init__(self, class ParallelCriterion(Criterion): - ''' ParallelCriterion is a weighted sum of other criterions each applied to a different input and target. Set repeatTarget = true to share the target for criterions. @@ -484,8 +476,8 @@ def add(self, criterion, weight=1.0): self.value.add(criterion.value, weight) return self -class KLDCriterion(Criterion): +class KLDCriterion(Criterion): ''' Computes the KL-divergence of the input normal distribution to a standard normal distribution. The input has to be a table. The first element of input is the mean of the distribution, @@ -500,7 +492,6 @@ def __init__(self, size_average=True, bigdl_type="float"): class GaussianCriterion(Criterion): - ''' Computes the log-likelihood of a sample x given a Gaussian distribution p. >>> GaussianCriterion = GaussianCriterion() @@ -510,8 +501,8 @@ class GaussianCriterion(Criterion): def __init__(self, bigdl_type="float"): super(GaussianCriterion, self).__init__(None, bigdl_type) -class SmoothL1Criterion(Criterion): +class SmoothL1Criterion(Criterion): ''' Creates a criterion that can be thought of as a smooth version of the AbsCriterion. It uses a squared term if the absolute element-wise error falls below 1. @@ -542,7 +533,6 @@ def __init__(self, class SmoothL1CriterionWithWeights(Criterion): - ''' a smooth version of the AbsCriterion It uses a squared term if the absolute element-wise error falls below 1. @@ -571,14 +561,15 @@ def __init__(self, class SoftmaxWithCriterion(Criterion): - ''' Computes the multinomial logistic loss for a one-of-many classification task, - passing real-valued predictions through a softmax to get a probability distribution over classes. + passing real-valued predictions through a softmax to get a probability distribution over + classes. It should be preferred over separate SoftmaxLayer + MultinomialLogisticLossLayer as its gradient computation is more numerically stable. - :param ignoreLabel: (optional) Specify a label value thatshould be ignored when computing the loss. + :param ignoreLabel: (optional) Specify a label value thatshould be ignored when computing the + loss. :param normalizeMode: How to normalize the output loss. @@ -596,6 +587,7 @@ def __init__(self, ignore_label, normalize_mode) + class TimeDistributedMaskCriterion(Criterion): ''' This class is intended to support inputs with 3 or more dimensions. @@ -620,6 +612,7 @@ def __init__(self, criterion, padding_value=0, bigdl_type="float"): super(TimeDistributedMaskCriterion, self).__init__( None, bigdl_type, criterion, padding_value) + class TimeDistributedCriterion(Criterion): ''' This class is intended to support inputs with 3 or more dimensions. 
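A minimal usage sketch for the criterion classes reflowed in the hunks above (not part of the patch), assuming a BigDL engine/SparkContext has already been initialized and using illustrative tensor shapes; Criterion.forward/backward are the local, debug-oriented entry points:

    import numpy as np
    from bigdl.dllib.nn.criterion import MarginCriterion

    # Hinge loss on a toy mini-batch of 4 scalar predictions with +/-1 targets.
    criterion = MarginCriterion()                  # margin=1, size_average=True by default
    pred = np.random.rand(4, 1)                    # model scores (shapes are illustrative)
    target = np.where(np.random.rand(4, 1) > 0.5, 1.0, -1.0)
    loss = criterion.forward(pred, target)         # scalar loss value
    grad = criterion.backward(pred, target)        # gradient w.r.t. pred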
@@ -766,7 +759,8 @@ class SoftMarginCriterion(Criterion): loss(x, y) = sum_i (log(1 + exp(-y[i]*x[i]))) / x:nElement() ``` - :param sizeaverage: The normalization by the number of elements in the inputcan be disabled by setting + :param sizeaverage: The normalization by the number of elements in the inputcan be disabled by + setting >>> softMarginCriterion = SoftMarginCriterion(False) @@ -782,7 +776,6 @@ def __init__(self, class DiceCoefficientCriterion(Criterion): - ''' The Dice-Coefficient criterion input: Tensor,target: Tensor @@ -807,8 +800,8 @@ def __init__(self, size_average, epsilon) -class L1Cost(Criterion): +class L1Cost(Criterion): ''' compute L1 norm for input, and sign of input @@ -820,8 +813,8 @@ def __init__(self, bigdl_type="float"): super(L1Cost, self).__init__(None, bigdl_type) -class CosineProximityCriterion(Criterion): +class CosineProximityCriterion(Criterion): ''' compute the negative of the mean cosine proximity between predictions and targets. ``` @@ -838,8 +831,8 @@ def __init__(self, bigdl_type="float"): super(CosineProximityCriterion, self).__init__(None, bigdl_type) -class MeanAbsolutePercentageCriterion(Criterion): +class MeanAbsolutePercentageCriterion(Criterion): ''' This method is same as `mean_absolute_percentage_error` loss in keras. It caculates diff = K.abs((y - x) / K.clip(K.abs(y), K.epsilon(), Double.MaxValue)) @@ -852,13 +845,14 @@ def __init__(self, bigdl_type="float"): super(MeanAbsolutePercentageCriterion, self).__init__(None, bigdl_type) -class MeanSquaredLogarithmicCriterion(Criterion): +class MeanSquaredLogarithmicCriterion(Criterion): ''' This method is same as `mean_squared_logarithmic_error` loss in keras. It calculates: first_log = K.log(K.clip(y, K.epsilon(), Double.MaxValue) + 1.) second_log = K.log(K.clip(x, K.epsilon(), Double.MaxValue) + 1.) - and output K.mean(K.square(first_log - second_log)). Here, the x and y can have or not have a batch. + and output K.mean(K.square(first_log - second_log)). Here, the x and y can have or not have a + batch. >>> error = MeanSquaredLogarithmicCriterion() creating: createMeanSquaredLogarithmicCriterion ''' @@ -867,8 +861,8 @@ def __init__(self, bigdl_type="float"): super(MeanSquaredLogarithmicCriterion, self).__init__(None, bigdl_type) -class KullbackLeiblerDivergenceCriterion(Criterion): +class KullbackLeiblerDivergenceCriterion(Criterion): ''' compute Kullback Leibler DivergenceCriterion error for intput and target This method is same as `kullback_leibler_divergence` loss in keras. Loss calculated as: @@ -884,8 +878,8 @@ def __init__(self, bigdl_type="float"): super(KullbackLeiblerDivergenceCriterion, self).__init__(None, bigdl_type) -class PoissonCriterion(Criterion): +class PoissonCriterion(Criterion): ''' compute Poisson error for input and target, loss calculated as: mean(input - target * K.log(input + K.epsilon()), axis=-1) @@ -897,18 +891,19 @@ def __init__(self, bigdl_type="float"): super(PoissonCriterion, self).__init__(None, bigdl_type) + class TransformerCriterion(Criterion): ''' The criterion that takes two modules to transform input and target, and take one criterion to compute the loss with the transformed input and target. - + This criterion can be used to construct complex criterion. 
For example, the `inputTransformer` and `targetTransformer` can be pre-trained CNN networks, and we can use the networks' output to compute the high-level feature reconstruction loss, which is commonly used in areas like neural style transfer (https://arxiv.org/abs/1508.06576), texture synthesis (https://arxiv.org/abs/1505.07376), .etc. - + >>> trans = TransformerCriterion(MSECriterion()) creating: createMSECriterion creating: createTransformerCriterion @@ -916,8 +911,8 @@ class TransformerCriterion(Criterion): def __init__(self, criterion, - input_transformer = None, - target_transformer = None, + input_transformer=None, + target_transformer=None, bigdl_type="float"): super(TransformerCriterion, self).__init__(None, bigdl_type, @@ -925,23 +920,25 @@ def __init__(self, input_transformer, target_transformer) + class DotProductCriterion(Criterion): ''' Compute the dot product of input and target tensor. Input and target are required to have the same size. :param size_average: whether to average over each observations in the same batch - + >>> dp =DotProductCriterion(False) creating: createDotProductCriterion ''' def __init__(self, - size_average = False, + size_average=False, bigdl_type="float"): super(DotProductCriterion, self).__init__(None, bigdl_type, size_average) + class PGCriterion(Criterion): ''' The Criterion to compute the negative policy gradient given a @@ -954,7 +951,7 @@ class PGCriterion(Criterion): represents the sampled action and the non-zero element itself represents the reward. If the action is space is large, you should consider using SparseTensor for target. - + The loss computed is simple the standard policy gradient, loss = - 1/n * sum(R_{n} dot_product log(P_{n})) @@ -962,13 +959,13 @@ class PGCriterion(Criterion): where R_{n} is the reward vector, and P_{n} is the input distribution. :param sizeAverage whether to average over each observations in the same batch - + >>> pg = PGCriterion() creating: createPGCriterion ''' def __init__(self, - sizeAverage = False, + sizeAverage=False, bigdl_type="float"): super(PGCriterion, self).__init__(None, bigdl_type, diff --git a/python/dllib/src/bigdl/dllib/nn/initialization_method.py b/python/dllib/src/bigdl/dllib/nn/initialization_method.py index fe4154ec619..902b6c7d8fa 100644 --- a/python/dllib/src/bigdl/dllib/nn/initialization_method.py +++ b/python/dllib/src/bigdl/dllib/nn/initialization_method.py @@ -18,31 +18,36 @@ from bigdl.dllib.utils.common import JavaValue - if sys.version >= '3': long = int unicode = str + class InitializationMethod(JavaValue): """ Initialization method to initialize bias and weight. The init method will be called in Module.reset() """ + class Zeros(InitializationMethod): """ Initializer that generates tensors with zeros. """ + def __init__(self, bigdl_type="float"): JavaValue.__init__(self, None, bigdl_type) + class Ones(InitializationMethod): """ Initializer that generates tensors with ones. """ + def __init__(self, bigdl_type="float"): JavaValue.__init__(self, None, bigdl_type) + class RandomUniform(InitializationMethod): """ Initializer that generates tensors with a uniform distribution. 
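A minimal sketch of how the initialization methods above are attached to a layer (illustrative only; the Linear(3, 4) dimensions and the positional set_init_method call are assumptions based on the usual Layer.set_init_method(weight_init_method, bias_init_method) signature):

    from bigdl.dllib.nn.layer import Linear
    from bigdl.dllib.nn.initialization_method import RandomUniform, Zeros

    # Replace the default initializers, then re-draw the parameters.
    linear = Linear(3, 4)
    linear.set_init_method(RandomUniform(), Zeros())   # weight init, bias init
    linear.reset()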
@@ -50,6 +55,7 @@ class RandomUniform(InitializationMethod): If lower and upper is not specified, it draws samples form a uniform distribution within [-limit, limit] where "limit" is "1/sqrt(fan_in)" """ + def __init__(self, upper=None, lower=None, bigdl_type="float"): if upper is not None and lower is not None: upper = upper + 0.0 @@ -58,38 +64,48 @@ def __init__(self, upper=None, lower=None, bigdl_type="float"): else: JavaValue.__init__(self, None, bigdl_type) + class RandomNormal(InitializationMethod): """ Initializer that generates tensors with a normal distribution. """ + def __init__(self, mean, stdv, bigdl_type="float"): mean = mean + 0.0 stdv = stdv + 0.0 JavaValue.__init__(self, None, bigdl_type, mean, stdv) + class ConstInitMethod(InitializationMethod): """ Initializer that generates tensors with certain constant double. """ + def __init__(self, value, bigdl_type="float"): value = value + 0.0 JavaValue.__init__(self, None, bigdl_type, value) + class Xavier(InitializationMethod): """ Xavier Initializer. See http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf """ + def __init__(self, bigdl_type="float"): JavaValue.__init__(self, None, bigdl_type) + class MsraFiller(InitializationMethod): """ MsraFiller Initializer. - See https://www.cv-foundation.org/openaccess/content_iccv_2015/papers/He_Delving_Deep_into_ICCV_2015_paper.pdf + See https://www.cv-foundation.org/openaccess/content_iccv_2015/papers/He_Delving_Deep_into_ICCV_ + 2015_paper.pdf """ + def __init__(self, varianceNormAverage=True, bigdl_type="float"): JavaValue.__init__(self, None, bigdl_type, varianceNormAverage) + class BilinearFiller(InitializationMethod): """ Initialize the weight with coefficients for bilinear interpolation. @@ -98,5 +114,6 @@ class BilinearFiller(InitializationMethod): The variable tensor passed in the init function should have 5 dimensions of format [nGroup, nInput, nOutput, kH, kW], and kH should be equal to kW """ + def __init__(self, bigdl_type="float"): JavaValue.__init__(self, None, bigdl_type) diff --git a/python/dllib/src/bigdl/dllib/nn/keras/backend.py b/python/dllib/src/bigdl/dllib/nn/keras/backend.py index 821de0804f9..d59af65cafa 100644 --- a/python/dllib/src/bigdl/dllib/nn/keras/backend.py +++ b/python/dllib/src/bigdl/dllib/nn/keras/backend.py @@ -104,7 +104,7 @@ def fit(self, x, y=None, batch_size=32, nb_epoch=10, verbose=1, callbacks=None, unsupport_exp("sample_weight") if initial_epoch != 0: unsupport_exp("initial_epoch") - if shuffle != True: + if shuffle is not True: unsupport_exp("shuffle") if validation_split != 0.: unsupport_exp("validation_split") @@ -178,4 +178,3 @@ def __create_distributed_optimizer(self, training_rdd, def with_bigdl_backend(kmodel): init_engine() return KerasModelWrapper(kmodel) - diff --git a/python/dllib/src/bigdl/dllib/nn/keras/layers/layer.py b/python/dllib/src/bigdl/dllib/nn/keras/layers/layer.py index 8eb90be9261..64bef675f45 100644 --- a/python/dllib/src/bigdl/dllib/nn/keras/layers/layer.py +++ b/python/dllib/src/bigdl/dllib/nn/keras/layers/layer.py @@ -97,7 +97,8 @@ class Input(Node, KerasCreator): # Arguments shape: A shape tuple, not including batch. - name: String to set the name of the input node. If not specified, its name will by default to be a generated string. + name: String to set the name of the input node. If not specified, its name will by default to be + a generated string. 
>>> input = Input(name="input1", shape=(3, 5)) creating: createKerasInput @@ -117,7 +118,8 @@ class InputLayer(KerasLayer): # Arguments input_shape: A shape tuple, not including batch. - name: String to set the name of the input layer. If not specified, its name will by default to be a generated string. + name: String to set the name of the input layer. If not specified, its name will by default to + be a generated string. >>> inputlayer = InputLayer(input_shape=(3, 5)) creating: createKerasInputLayer @@ -143,16 +145,19 @@ class Dense(KerasLayer): output_dim: The size of output dimension. init: String representation of the initialization method for the weights of the layer. Default is 'glorot_uniform'. - activation: String representation of the activation function to use (such as 'relu' or 'sigmoid'). + activation: String representation of the activation function to use (such as 'relu' or + 'sigmoid'). Default is None. W_regularizer: An instance of [[Regularizer]], (eg. L1 or L2 regularization), applied to the input weights matrices. Default is None. b_regularizer: An instance of [[Regularizer]], applied to the bias. Default is None. - bias: Whether to include a bias (i.e. make the layer affine rather than linear). Default is True. + bias: Whether to include a bias (i.e. make the layer affine rather than linear). Default is + True. input_dim: Dimensionality of the input for 2D input. For nD input, you can alternatively specify 'input_shape' when using this layer as the first layer. input_shape: A shape tuple, not including batch. - name: String to set the name of the layer. If not specified, its name will by default to be a generated string. + name: String to set the name of the layer. If not specified, its name will by default to be a + generated string. >>> dense = Dense(10, input_dim=8, name="dense1") creating: createKerasDense @@ -191,11 +196,13 @@ class MaxoutDense(KerasLayer): W_regularizer: An instance of [[Regularizer]], (eg. L1 or L2 regularization), applied to the input weights matrices. Default is None. b_regularizer: An instance of [[Regularizer]], applied to the bias. Default is None. - bias: Whether to include a bias (i.e. make the layer affine rather than linear). Default is True. + bias: Whether to include a bias (i.e. make the layer affine rather than linear). Default is + True. input_dim: Dimensionality of the input. Alternatively, you can specify 'input_shape' when using this layer as the first layer. input_shape: A shape tuple, not including batch. - name: String to set the name of the layer. If not specified, its name will by default to be a generated string. + name: String to set the name of the layer. If not specified, its name will by default to be a + generated string. >>> maxoutdense = MaxoutDense(6, input_shape=(10, )) creating: createKerasMaxoutDense @@ -233,7 +240,8 @@ class Embedding(KerasLayer): W_regularizer: An instance of [[Regularizer]], (eg. L1 or L2 regularization), applied to the embedding matrix. Default is None. input_shape: A shape tuple, not including batch. - name: String to set the name of the layer. If not specified, its name will by default to be a generated string. + name: String to set the name of the layer. If not specified, its name will by default to be a + generated string. >>> embedding = Embedding(1000, 32, input_shape=(10, ), name="embedding1") creating: createKerasEmbedding @@ -269,10 +277,11 @@ class BatchNormalization(KerasLayer): standard deviation of the data, for feature-wise normalization. Default is 0.99. 
beta_init: Name of the initialization function for shift parameter. Default is 'zero'. gamma_init: Name of the initialization function for scale parameter. Default is 'one'. - dim_ordering: Format of input data. Either 'th' (Channel First) or 'tf' (Channel Last). Default is 'th'. - For 'th', axis along which to normalize is 1. For 'tf', axis is 3. + dim_ordering: Format of input data. Either 'th' (Channel First) or 'tf' (Channel Last). Default + is 'th'. For 'th', axis along which to normalize is 1. For 'tf', axis is 3. input_shape: A shape tuple, not including batch. - name: String to set the name of the layer. If not specified, its name will by default to be a generated string. + name: String to set the name of the layer. If not specified, its name will by default to be a + generated string. >>> batchnormalization = BatchNormalization(input_shape=(3, 12, 12), name="bn1") creating: createKerasBatchNormalization @@ -280,14 +289,17 @@ class BatchNormalization(KerasLayer): .. note:: `bigdl.dllib.keras` is deprecated in 0.11. This will be removed in future releases. """ - def __init__(self, epsilon=0.001, mode=0, axis=1, momentum=0.99, beta_init="zero", gamma_init="one", + def __init__(self, epsilon=0.001, mode=0, axis=1, momentum=0.99, beta_init="zero", + gamma_init="one", dim_ordering="th", input_shape=None, **kwargs): if mode != 0: raise ValueError("For BatchNormalization, only mode=0 is supported for now") if dim_ordering == "th" and axis != 1: - raise ValueError("For BatchNormalization with th dim ordering, only axis=1 is supported for now") + raise ValueError("For BatchNormalization with th dim ordering, only axis=1 is" + " supported for now") if dim_ordering == "tf" and axis != -1 and axis != 3: - raise ValueError("For BatchNormalization with tf dim ordering, only axis=-1 is supported for now") + raise ValueError("For BatchNormalization with tf dim ordering, only axis=-1 is" + " supported for now") super(BatchNormalization, self).__init__(None, float(epsilon), float(momentum), @@ -342,10 +354,12 @@ class Merge(KerasLayer): layers: A list of layer instances. Must be more than one layer. mode: Merge mode. String, must be one of: 'sum', 'mul', 'concat', 'ave', 'cos', 'dot', 'max'. Default is 'sum'. - concat_axis: Int, axis to use when concatenating layers. Only specify this when merge mode is 'concat'. + concat_axis: Int, axis to use when concatenating layers. Only specify this when merge mode is + 'concat'. Default is -1, meaning the last axis of the input. input_shape: A list of shape tuples, each not including batch. - name: String to set the name of the layer. If not specified, its name will by default to be a generated string. + name: String to set the name of the layer. If not specified, its name will by default to be a + generated string. >>> l1 = InputLayer(input_shape=(3, 5)) creating: createKerasInputLayer @@ -377,9 +391,11 @@ def merge(inputs, mode="sum", concat_axis=-1, name=None): inputs: A list of node instances. Must be more than one node. mode: Merge mode. String, must be one of: 'sum', 'mul', 'concat', 'ave', 'cos', 'dot', 'max'. Default is 'sum'. - concat_axis: Int, axis to use when concatenating nodes. Only specify this when merge mode is 'concat'. + concat_axis: Int, axis to use when concatenating nodes. Only specify this when merge mode is + 'concat'. Default is -1, meaning the last axis of the input. - name: String to set the name of the merge. If not specified, its name will by default to be a generated string. + name: String to set the name of the merge. 
If not specified, its name will by default to be a + generated string. .. note:: `bigdl.dllib.keras` is deprecated in 0.11. This will be removed in future releases. @@ -398,7 +414,8 @@ class Dropout(KerasLayer): # Arguments p: Fraction of the input units to drop. Float between 0 and 1. input_shape: A shape tuple, not including batch. - name: String to set the name of the layer. If not specified, its name will by default to be a generated string. + name: String to set the name of the layer. If not specified, its name will by default to be a + generated string. >>> dropout = Dropout(0.25, input_shape=(2, 3)) creating: createKerasDropout @@ -422,7 +439,8 @@ class Flatten(KerasLayer): # Arguments input_shape: A shape tuple, not including batch. - name: String to set the name of the layer. If not specified, its name will by default to be a generated string. + name: String to set the name of the layer. If not specified, its name will by default to be a + generated string. >>> flatten = Flatten(input_shape=(3, 10, 2)) creating: createKerasFlatten @@ -447,9 +465,11 @@ class Reshape(KerasLayer): input_shape (a shape tuple, does not include the batch dimension). # Arguments - target_shape: A shape tuple. The target shape that you desire to have. Batch dimension should be excluded. + target_shape: A shape tuple. The target shape that you desire to have. Batch dimension should be + excluded. input_shape: A shape tuple, not including batch. - name: String to set the name of the layer. If not specified, its name will by default to be a generated string. + name: String to set the name of the layer. If not specified, its name will by default to be a + generated string. >>> reshape = Reshape((2, 10), input_shape=(5, 4)) creating: createKerasReshape @@ -467,7 +487,8 @@ def __init__(self, target_shape, input_shape=None, **kwargs): class Activation(KerasLayer): """ Simple activation function to be applied to the output. - Available activations: 'tanh', 'relu', 'sigmoid', 'softmax', 'softplus', 'softsign', 'hard_sigmoid'. + Available activations: 'tanh', 'relu', 'sigmoid', 'softmax', 'softplus', 'softsign', + 'hard_sigmoid'. When you use this layer as the first layer of a model, you need to provide the argument input_shape (a shape tuple, does not include the batch dimension). @@ -475,7 +496,8 @@ class Activation(KerasLayer): # Arguments activation: Name of the activation function as string. input_shape: A shape tuple, not including batch. - name: String to set the name of the layer. If not specified, its name will by default to be a generated string. + name: String to set the name of the layer. If not specified, its name will by default to be a + generated string. >>> activation = Activation("relu", input_shape=(3, 4)) creating: createKerasActivation @@ -503,7 +525,8 @@ class RepeatVector(KerasLayer): input_dim: Dimensionality of the input. Alternatively, you can specify 'input_shape' when using this layer as the first layer. input_shape: A shape tuple, not including batch. - name: String to set the name of the layer. If not specified, its name will by default to be a generated string. + name: String to set the name of the layer. If not specified, its name will by default to be a + generated string. >>> repeatvector = RepeatVector(5, input_shape=(3, )) creating: createKerasRepeatVector @@ -529,9 +552,11 @@ class Permute(KerasLayer): input_shape (a shape tuple, does not include the batch dimension). # Arguments - dims: Tuple of int. Permutation pattern, does not include the batch dimension. 
Indexing starts at 1. + dims: Tuple of int. Permutation pattern, does not include the batch dimension. Indexing starts + at 1. input_shape: A shape tuple, not including batch. - name: String to set the name of the layer. If not specified, its name will by default to be a generated string. + name: String to set the name of the layer. If not specified, its name will by default to be a + generated string. >>> permute = Permute((2, 1, 3), input_shape=(3, 4, 5)) creating: createKerasPermute @@ -548,14 +573,16 @@ def __init__(self, dims, input_shape=None, **kwargs): class Highway(KerasLayer): """ - Densely connected highway network. Highway layers are a natural extension of LSTMs to feedforward networks. + Densely connected highway network. Highway layers are a natural extension of LSTMs to + feedforward networks. The input of this layer should be 2D, i.e. (batch, input dim). When you use this layer as the first layer of a model, you need to provide the argument input_shape (a shape tuple, does not include the batch dimension). # Arguments - activation: String representation of the activation function to use (such as 'relu' or 'sigmoid'). + activation: String representation of the activation function to use (such as 'relu' or + 'sigmoid'). Default is None. W_regularizer: An instance of [[Regularizer]], (eg. L1 or L2 regularization), applied to the input weights matrices. Default is None. @@ -565,7 +592,8 @@ class Highway(KerasLayer): input_dim: Dimensionality of the input. Alternatively, you can specify 'input_shape' when using this layer as the first layer. input_shape: A shape tuple, not including batch. - name: String to set the name of the layer. If not specified, its name will by default to be a generated string. + name: String to set the name of the layer. If not specified, its name will by default to be a + generated string. >>> highway = Highway(activation='relu', input_shape=(8, )) creating: createKerasHighway @@ -600,7 +628,8 @@ class Convolution1D(KerasLayer): filter_length: The extension (spatial or temporal) of each filter. init: String representation of the initialization method for the weights of the layer. Default is 'glorot_uniform'. - activation: String representation of the activation function to use (such as 'relu' or 'sigmoid'). + activation: String representation of the activation function to use (such as 'relu' or + 'sigmoid'). Default is None. border_mode: Either 'valid' or 'same'. Default is 'valid'. subsample_length: Factor by which to subsample output. Int. Default is 1. @@ -610,7 +639,8 @@ class Convolution1D(KerasLayer): bias: Whether to include a bias (i.e. make the layer affine rather than linear). Default is True. input_shape: A shape tuple, not including batch. - name: String to set the name of the layer. If not specified, its name will by default to be a generated string. + name: String to set the name of the layer. If not specified, its name will by default to be a + generated string. >>> conv1d = Convolution1D(12, 4, input_shape=(3, 16)) creating: createKerasConvolution1D @@ -652,19 +682,22 @@ class Convolution2D(KerasLayer): nb_col: Number of cols in the convolution kernel. init: String representation of the initialization method for the weights of the layer. Default is 'glorot_uniform'. - activation: String representation of the activation function to use (such as 'relu' or 'sigmoid'). + activation: String representation of the activation function to use (such as 'relu' or + 'sigmoid'). Default is None. border_mode: Either 'valid' or 'same'. 
Default is 'valid'. subsample: Int tuple of length 2 corresponding to the step of the convolution in the height and width dimension. Also called strides elsewhere. Default is (1, 1). - dim_ordering: Format of input data. Either 'th' (Channel First) or 'tf' (Channel Last). Default is 'th'. + dim_ordering: Format of input data. Either 'th' (Channel First) or 'tf' (Channel Last). + Default is 'th'. W_regularizer: An instance of [[Regularizer]], (eg. L1 or L2 regularization), applied to the input weights matrices. Default is None. b_regularizer: An instance of [[Regularizer]], applied to the bias. Default is None. bias: Whether to include a bias (i.e. make the layer affine rather than linear). Default is True. input_shape: A shape tuple, not including batch. - name: String to set the name of the layer. If not specified, its name will by default to be a generated string. + name: String to set the name of the layer. If not specified, its name will by default to be a + generated string. >>> conv2d = Convolution2D(32, 3, 3, input_shape=(3, 128, 128), name="convolution2d_1") creating: createKerasConvolution2D @@ -710,7 +743,8 @@ class Convolution3D(KerasLayer): kernel_dim3: Length of the third dimension in the convolution kernel. init: String representation of the initialization method for the weights of the layer. Default is 'glorot_uniform'. - activation: String representation of the activation function to use (such as 'relu' or 'sigmoid'). + activation: String representation of the activation function to use (such as 'relu' or + 'sigmoid'). Default is None. border_mode: Either 'valid' or 'same'. Default is 'valid'. subsample: Int tuple of length 3. Factor by which to subsample output. @@ -722,7 +756,8 @@ class Convolution3D(KerasLayer): bias: Whether to include a bias (i.e. make the layer affine rather than linear). Default is True. input_shape: A shape tuple, not including batch. - name: String to set the name of the layer. If not specified, its name will by default to be a generated string. + name: String to set the name of the layer. If not specified, its name will by default to be a + generated string. >>> conv3d = Convolution3D(32, 3, 4, 5, input_shape=(3, 64, 64, 64)) creating: createKerasConvolution3D @@ -768,17 +803,20 @@ class AtrousConvolution1D(KerasLayer): filter_length: The extension (spatial or temporal) of each filter. init: String representation of the initialization method for the weights of the layer. Default is 'glorot_uniform'. - activation: String representation of the activation function to use (such as 'relu' or 'sigmoid'). + activation: String representation of the activation function to use (such as 'relu' or + 'sigmoid'). Default is None. border_mode: Only 'valid' is supported for now. subsample_length: Factor by which to subsample output. Int. Default is 1. - atrous_rate: Factor for kernel dilation. Also called filter_dilation elsewhere. Int. Default is 1. + atrous_rate: Factor for kernel dilation. Also called filter_dilation elsewhere. Int. Default + is 1. W_regularizer: An instance of [[Regularizer]], (eg. L1 or L2 regularization), applied to the input weights matrices. Default is None. b_regularizer: An instance of [[Regularizer]], applied to the bias. Default is None. bias: Only 'True' is supported for now. input_shape: A shape tuple, not including batch. - name: String to set the name of the layer. If not specified, its name will by default to be a generated string. + name: String to set the name of the layer. 
If not specified, its name will by default to be a + generated string. >>> atrousconv1d = AtrousConvolution1D(8, 3, input_shape=(3, 12)) creating: createKerasAtrousConvolution1D @@ -790,7 +828,8 @@ def __init__(self, nb_filter, filter_length, init="glorot_uniform", activation=N border_mode="valid", subsample_length=1, atrous_rate=1, W_regularizer=None, b_regularizer=None, bias=True, input_shape=None, **kwargs): if border_mode != "valid": - raise ValueError("For AtrousConvolution1D, only border_mode='valid' is supported for now") + raise ValueError("For AtrousConvolution1D, only border_mode='valid' is supported for" + " now") if not bias: raise ValueError("For AtrousConvolution1D, only bias=True is supported for now") super(AtrousConvolution1D, self).__init__(None, @@ -826,7 +865,8 @@ class AtrousConvolution2D(KerasLayer): nb_col: Number of cols in the convolution kernel. init: String representation of the initialization method for the weights of the layer. Default is 'glorot_uniform'. - activation: String representation of the activation function to use (such as 'relu' or 'sigmoid'). + activation: String representation of the activation function to use (such as 'relu' or + 'sigmoid'). Default is None. border_mode: Only 'valid' is supported for now. subsample: Int tuple of length 2 corresponding to the step of the convolution in the @@ -839,7 +879,8 @@ class AtrousConvolution2D(KerasLayer): b_regularizer: An instance of [[Regularizer]], applied to the bias. Default is None. bias: Only 'True' is supported for now. input_shape: A shape tuple, not including batch. - name: String to set the name of the layer. If not specified, its name will by default to be a generated string. + name: String to set the name of the layer. If not specified, its name will by default to be a + generated string. >>> atrousconv2d = AtrousConvolution2D(12, 4, 3, input_shape=(3, 64, 64)) creating: createKerasAtrousConvolution2D @@ -852,7 +893,8 @@ def __init__(self, nb_filter, nb_row, nb_col, init="glorot_uniform", atrous_rate=(1, 1), dim_ordering="th", W_regularizer=None, b_regularizer=None, bias=True, input_shape=None, **kwargs): if border_mode != "valid": - raise ValueError("For AtrousConvolution2D, only border_mode='valid' is supported for now") + raise ValueError("For AtrousConvolution2D, only border_mode='valid' is supported for" + " now") if not bias: raise ValueError("For AtrousConvolution2D, only bias=True is supported for now") super(AtrousConvolution2D, self).__init__(None, @@ -893,7 +935,8 @@ class Deconvolution2D(KerasLayer): output_shape: Output shape of the transposed convolution operation. Tuple of int. init: String representation of the initialization method for the weights of the layer. Default is 'glorot_uniform'. - activation: String representation of the activation function to use (such as 'relu' or 'sigmoid'). + activation: String representation of the activation function to use (such as 'relu' or + 'sigmoid'). Default is None. border_mode: Only 'valid' is supported for now. subsample: Int tuple of length 2 corresponding to the step of the convolution in the @@ -905,7 +948,8 @@ class Deconvolution2D(KerasLayer): bias: Whether to include a bias (i.e. make the layer affine rather than linear). Default is True. input_shape: A shape tuple, not including batch. - name: String to set the name of the layer. If not specified, its name will by default to be a generated string. + name: String to set the name of the layer. If not specified, its name will by default to be a + generated string. 
>>> deconv2d = Deconvolution2D(3, 3, 3, output_shape=(None, 3, 14, 14), input_shape=(3, 12, 12)) creating: createKerasDeconvolution2D @@ -953,23 +997,28 @@ class SeparableConvolution2D(KerasLayer): nb_col: Number of cols in the convolution kernel. init: String representation of the initialization method for the weights of the layer. Default is 'glorot_uniform'. - activation: String representation of the activation function to use (such as 'relu' or 'sigmoid'). + activation: String representation of the activation function to use (such as 'relu' or + 'sigmoid'). Default is None. border_mode: Either 'valid' or 'same'. Default is 'valid'. subsample: Int tuple of length 2 corresponding to the step of the convolution in the height and width dimension. Also called strides elsewhere. Default is (1, 1). - depth_multiplier: How many output channel to use per input channel for the depthwise convolution step. + depth_multiplier: How many output channel to use per input channel for the depthwise convolution + step. Int. Default is 1. - dim_ordering: Format of input data. Either 'th' (Channel First) or 'tf' (Channel Last). Default is 'th'. + dim_ordering: Format of input data. Either 'th' (Channel First) or 'tf' (Channel Last). Default + is 'th'. depthwise_regularizer: An instance of [[Regularizer]], (eg. L1 or L2 regularization), applied to the depthwise weights matrices. Default is None. - pointwise_regularizer: An instance of [[Regularizer]], applied to the pointwise weights matrices. + pointwise_regularizer: An instance of [[Regularizer]], applied to the pointwise weights + matrices. Default is None. b_regularizer: An instance of [[Regularizer]], applied to the bias. Default is None. bias: Whether to include a bias (i.e. make the layer affine rather than linear). Default is True. input_shape: A shape tuple, not including batch. - name: String to set the name of the layer. If not specified, its name will by default to be a generated string. + name: String to set the name of the layer. If not specified, its name will by default to be a + generated string. >>> separableconv2d = SeparableConvolution2D(12, 3, 4, input_shape=(3, 32, 32)) creating: createKerasSeparableConvolution2D @@ -1020,7 +1069,8 @@ class Cropping1D(KerasLayer): cropping: Int tuple of length 2. How many units should be trimmed off at the beginning and end of the cropping dimension. Default is (1, 1). input_shape: A shape tuple, not including batch. - name: String to set the name of the layer. If not specified, its name will by default to be a generated string. + name: String to set the name of the layer. If not specified, its name will by default to be + a generated string. >>> cropping1d = Cropping1D(cropping=(1, 2), input_shape=(8, 8)) creating: createKerasCropping1D @@ -1044,10 +1094,12 @@ class Cropping2D(KerasLayer): input_shape (a shape tuple, does not include the batch dimension). # Arguments - cropping: Int tuple of tuple of length 2. How many units should be trimmed off at the beginning and - end of the 2 cropping dimensions (i.e. height and width). Default is ((0, 0), (0, 0)). + cropping: Int tuple of tuple of length 2. How many units should be trimmed off at the beginning + and end of the 2 cropping dimensions (i.e. height and width). Default is ((0, 0), + (0, 0)). input_shape: A shape tuple, not including batch. - name: String to set the name of the layer. If not specified, its name will by default to be a generated string. + name: String to set the name of the layer. 
If not specified, its name will by default to be a + generated string. >>> cropping2d = Cropping2D(cropping=((1, 2), (0, 1)), input_shape=(12, 12, 12)) creating: createKerasCropping2D @@ -1074,11 +1126,12 @@ class Cropping3D(KerasLayer): input_shape (a shape tuple, does not include the batch dimension). # Arguments - cropping: Int tuple of tuple of length 3. How many units should be trimmed off at the beginning and - end of the 3 cropping dimensions (i.e. kernel_dim1, kernel_dim2 and kernel_dim3). + cropping: Int tuple of tuple of length 3. How many units should be trimmed off at the beginning + and end of the 3 cropping dimensions (i.e. kernel_dim1, kernel_dim2 and kernel_dim3). Default is ((1, 1), (1, 1), (1, 1)). input_shape: A shape tuple, not including batch. - name: String to set the name of the layer. If not specified, its name will by default to be a generated string. + name: String to set the name of the layer. If not specified, its name will by default to be a + generated string. >>> cropping3d = Cropping3D(cropping=((0, 2), (1, 1), (3, 1)), input_shape=(4, 12, 12, 16)) creating: createKerasCropping3D @@ -1109,7 +1162,8 @@ class UpSampling1D(KerasLayer): # Arguments length: Int. UpSampling factor. Default is 2. input_shape: A shape tuple, not including batch. - name: String to set the name of the layer. If not specified, its name will by default to be a generated string. + name: String to set the name of the layer. If not specified, its name will by default to be a + generated string. >>> upsampling1d = UpSampling1D(length=3, input_shape=(3, 12)) creating: createKerasUpSampling1D @@ -1135,9 +1189,11 @@ class UpSampling2D(KerasLayer): # Arguments size: Int tuple of length 2. UpSampling factors for rows and columns. Default is (2, 2). - dim_ordering: Format of input data. Either 'th' (Channel First) or 'tf' (Channel Last). Default is 'th'. + dim_ordering: Format of input data. Either 'th' (Channel First) or 'tf' (Channel Last). Default + is 'th'. input_shape: A shape tuple, not including batch. - name: String to set the name of the layer. If not specified, its name will by default to be a generated string. + name: String to set the name of the layer. If not specified, its name will by default to be a + generated string. >>> upsampling2d = UpSampling2D(size=(1, 3), input_shape=(3, 16, 16)) creating: createKerasUpSampling2D @@ -1156,7 +1212,8 @@ def __init__(self, size=(2, 2), dim_ordering="th", input_shape=None, **kwargs): class UpSampling3D(KerasLayer): """ UpSampling layer for 2D inputs. - Repeats the 1st, 2nd and 3rd dimensions of the data by size[0], size[1] and size[2] respectively. + Repeats the 1st, 2nd and 3rd dimensions of the data by size[0], size[1] and size[2] + respectively. Data format currently supported for this layer is dim_ordering='th' (Channel First). The input of this layer should be 5D. @@ -1167,7 +1224,8 @@ class UpSampling3D(KerasLayer): size: Int tuple of length 3. UpSampling factors for dim1, dim2 and dim3. Default is (2, 2, 2). dim_ordering: Format of input data. Only 'th' (Channel First) is supported for now. input_shape: A shape tuple, not including batch. - name: String to set the name of the layer. If not specified, its name will by default to be a generated string. + name: String to set the name of the layer. If not specified, its name will by default to be a + generated string. 
>>> upsampling3d = UpSampling3D(size=(1, 2, 3), input_shape=(3, 16, 16, 16)) creating: createKerasUpSampling3D @@ -1193,11 +1251,13 @@ class ZeroPadding1D(KerasLayer): # Arguments padding: Int or int tuple of length 2. - If int, how many zeros to add both at the beginning and at the end of the padding dimension. + If int, how many zeros to add both at the beginning and at the end of the padding + dimension. If tuple of length 2, how many zeros to add in the order '(left_pad, right_pad)'. Default is 1. input_shape: A shape tuple, not including batch. - name: String to set the name of the layer. If not specified, its name will by default to be a generated string. + name: String to set the name of the layer. If not specified, its name will by default to be a + generated string. >>> zeropadding1d = ZeroPadding1D(padding=2, input_shape=(3, 6)) creating: createKerasZeroPadding1D @@ -1224,12 +1284,16 @@ class ZeroPadding2D(KerasLayer): # Arguments padding: Int tuple of length 2 or length 4. - If tuple of length 2, how many zeros to add both at the beginning and at the end of rows and cols. - If tuple of length 4, how many zeros to add in the order '(top_pad, bottom_pad, left_pad, right_pad)'. + If tuple of length 2, how many zeros to add both at the beginning and at the end of + rows and cols. + If tuple of length 4, how many zeros to add in the order '(top_pad, bottom_pad, + left_pad, right_pad)'. Default is (1, 1). - dim_ordering: Format of input data. Either 'th' (Channel First) or 'tf' (Channel Last). Default is 'th'. + dim_ordering: Format of input data. Either 'th' (Channel First) or 'tf' (Channel Last). + Default is 'th'. input_shape: A shape tuple, not including batch. - name: String to set the name of the layer. If not specified, its name will by default to be a generated string. + name: String to set the name of the layer. If not specified, its name will by default to be a + generated string. >>> zeropadding2d = ZeroPadding2D(padding=(2, 1), input_shape=(2, 8, 8)) creating: createKerasZeroPadding2D @@ -1256,11 +1320,14 @@ class ZeroPadding3D(KerasLayer): input_shape (a shape tuple, does not include the batch dimension). # Arguments - padding: Int tuple of length 3. How many zeros to add at the beginning and at the end of the 3 padding dimensions. + padding: Int tuple of length 3. How many zeros to add at the beginning and at the end of the 3 + padding dimensions. Symmetric padding will be applied to each dimension. Default is (1, 1, 1). - dim_ordering: Format of input data. Either 'th' (Channel First) or 'tf' (Channel Last). Default is 'th'. + dim_ordering: Format of input data. Either 'th' (Channel First) or 'tf' (Channel Last). + Default is 'th'. input_shape: A shape tuple, not including batch. - name: String to set the name of the layer. If not specified, its name will by default to be a generated string. + name: String to set the name of the layer. If not specified, its name will by default to be a + generated string. >>> zeropadding3d = ZeroPadding3D(padding=(2, 1, 2), input_shape=(2, 8, 8, 10)) creating: createKerasZeroPadding3D @@ -1290,7 +1357,8 @@ class MaxPooling1D(KerasLayer): Default is None, and in this case it will be equal to pool_length.. border_mode: Either 'valid' or 'same'. Default is 'valid'. input_shape: A shape tuple, not including batch. - name: String to set the name of the layer. If not specified, its name will by default to be a generated string. + name: String to set the name of the layer. If not specified, its name will by default to be a + generated string. 
>>> maxpooling1d = MaxPooling1D(3, input_shape=(3, 24)) creating: createKerasMaxPooling1D @@ -1321,11 +1389,14 @@ class MaxPooling2D(KerasLayer): # Arguments pool_size: Int tuple of length 2 corresponding to the downscale vertically and horizontally. Default is (2, 2), which will halve the image in each dimension. - strides: Int tuple of length 2. Stride values. Default is None, and in this case it will be equal to pool_size. + strides: Int tuple of length 2. Stride values. Default is None, and in this case it will be + equal to pool_size. border_mode: Either 'valid' or 'same'. Default is 'valid'. - dim_ordering: Format of input data. Either 'th' (Channel First) or 'tf' (Channel Last). Default is 'th'. + dim_ordering: Format of input data. Either 'th' (Channel First) or 'tf' (Channel Last). Default + is 'th'. input_shape: A shape tuple, not including batch. - name: String to set the name of the layer. If not specified, its name will by default to be a generated string. + name: String to set the name of the layer. If not specified, its name will by default to be a + generated string. >>> maxpooling2d = MaxPooling2D((2, 2), input_shape=(3, 32, 32), name="maxpooling2d_1") creating: createKerasMaxPooling2D @@ -1358,11 +1429,13 @@ class MaxPooling3D(KerasLayer): # Arguments pool_size: Int tuple of length 3. Factors by which to downscale (dim1, dim2, dim3). Default is (2, 2, 2), which will halve the image in each dimension. - strides: Int tuple of length 3. Stride values. Default is None, and in this case it will be equal to pool_size. + strides: Int tuple of length 3. Stride values. Default is None, and in this case it will be + equal to pool_size. border_mode: Only 'valid' is supported for now. dim_ordering: Format of input data. Only 'th' (Channel First) is supported for now. input_shape: A shape tuple, not including batch. - name: String to set the name of the layer. If not specified, its name will by default to be a generated string. + name: String to set the name of the layer. If not specified, its name will by default to be a + generated string. >>> maxpooling3d = MaxPooling3D((2, 1, 3), input_shape=(3, 32, 32, 32)) creating: createKerasMaxPooling3D @@ -1396,7 +1469,8 @@ class AveragePooling1D(KerasLayer): Default is None, and in this case it will be equal to pool_length.. border_mode: Either 'valid' or 'same'. Default is 'valid'. input_shape: A shape tuple, not including batch. - name: String to set the name of the layer. If not specified, its name will by default to be a generated string. + name: String to set the name of the layer. If not specified, its name will by default to be a + generated string. >>> averagepooling1d = AveragePooling1D(input_shape=(3, 24)) creating: createKerasAveragePooling1D @@ -1427,11 +1501,14 @@ class AveragePooling2D(KerasLayer): # Arguments pool_size: Int tuple of length 2 corresponding to the downscale vertically and horizontally. Default is (2, 2), which will halve the image in each dimension. - strides: Int tuple of length 2. Stride values. Default is None, and in this case it will be equal to pool_size. + strides: Int tuple of length 2. Stride values. Default is None, and in this case it will be + equal to pool_size. border_mode: Either 'valid' or 'same'. Default is 'valid'. - dim_ordering: Format of input data. Either 'th' (Channel First) or 'tf' (Channel Last). Default is 'th'. + dim_ordering: Format of input data. Either 'th' (Channel First) or 'tf' (Channel Last). + Default is 'th'. input_shape: A shape tuple, not including batch. 
- name: String to set the name of the layer. If not specified, its name will by default to be a generated string. + name: String to set the name of the layer. If not specified, its name will by default to be a + generated string. >>> averagepooling2d = AveragePooling2D((1, 2), input_shape=(2, 28, 32)) creating: createKerasAveragePooling2D @@ -1463,11 +1540,13 @@ class AveragePooling3D(KerasLayer): # Arguments pool_size: Int tuple of length 3. Factors by which to downscale (dim1, dim2, dim3). Default is (2, 2, 2), which will halve the image in each dimension. - strides: Int tuple of length 3. Stride values. Default is None, and in this case it will be equal to pool_size. + strides: Int tuple of length 3. Stride values. Default is None, and in this case it will be + equal to pool_size. border_mode: Only 'valid' is supported for now. dim_ordering: Format of input data. Only 'th' (Channel First) is supported for now. input_shape: A shape tuple, not including batch. - name: String to set the name of the layer. If not specified, its name will by default to be a generated string. + name: String to set the name of the layer. If not specified, its name will by default to be a + generated string. >>> averagepooling3d = AveragePooling3D((1, 1, 2), input_shape=(3, 28, 32, 36)) creating: createKerasAveragePooling3D @@ -1497,7 +1576,8 @@ class GlobalMaxPooling1D(KerasLayer): # Arguments input_shape: A shape tuple, not including batch. - name: String to set the name of the layer. If not specified, its name will by default to be a generated string. + name: String to set the name of the layer. If not specified, its name will by default to be a + generated string. >>> globalmaxpooling1d = GlobalMaxPooling1D(input_shape=(4, 8)) creating: createKerasGlobalMaxPooling1D @@ -1521,7 +1601,8 @@ class GlobalAveragePooling1D(KerasLayer): # Arguments input_shape: A shape tuple, not including batch. - name: String to set the name of the layer. If not specified, its name will by default to be a generated string. + name: String to set the name of the layer. If not specified, its name will by default to be a + generated string. >>> globalaveragepooling1d = GlobalAveragePooling1D(input_shape=(12, 12)) creating: createKerasGlobalAveragePooling1D @@ -1544,9 +1625,11 @@ class GlobalMaxPooling2D(KerasLayer): input_shape (a shape tuple, does not include the batch dimension). # Arguments - dim_ordering: Format of input data. Either 'th' (Channel First) or 'tf' (Channel Last). Default is 'th'. + dim_ordering: Format of input data. Either 'th' (Channel First) or 'tf' (Channel Last). Default + is 'th'. input_shape: A shape tuple, not including batch. - name: String to set the name of the layer. If not specified, its name will by default to be a generated string. + name: String to set the name of the layer. If not specified, its name will by default to be a + generated string. >>> globalmaxpooling2d = GlobalMaxPooling2D(input_shape=(4, 32, 32)) creating: createKerasGlobalMaxPooling2D @@ -1570,9 +1653,11 @@ class GlobalAveragePooling2D(KerasLayer): input_shape (a shape tuple, does not include the batch dimension). # Arguments - dim_ordering: Format of input data. Either 'th' (Channel First) or 'tf' (Channel Last). Default is 'th'. + dim_ordering: Format of input data. Either 'th' (Channel First) or 'tf' (Channel Last). Default + is 'th'. input_shape: A shape tuple, not including batch. - name: String to set the name of the layer. If not specified, its name will by default to be a generated string. 
+ name: String to set the name of the layer. If not specified, its name will by default to be a + generated string. >>> globalaveragepooling2d = GlobalAveragePooling2D(input_shape=(4, 32, 32)) creating: createKerasGlobalAveragePooling2D @@ -1600,7 +1685,8 @@ class GlobalMaxPooling3D(KerasLayer): # Arguments dim_ordering: Format of input data. Only 'th' (Channel First) is supported for now. input_shape: A shape tuple, not including batch. - name: String to set the name of the layer. If not specified, its name will by default to be a generated string. + name: String to set the name of the layer. If not specified, its name will by default to be a + generated string. >>> globalmaxpooling3d = GlobalMaxPooling3D(input_shape=(4, 32, 32, 32)) creating: createKerasGlobalMaxPooling3D @@ -1628,7 +1714,8 @@ class GlobalAveragePooling3D(KerasLayer): # Arguments dim_ordering: Format of input data. Only 'th' (Channel First) is supported for now. input_shape: A shape tuple, not including batch. - name: String to set the name of the layer. If not specified, its name will by default to be a generated string. + name: String to set the name of the layer. If not specified, its name will by default to be a + generated string. >>> globalaveragepooling3d = GlobalAveragePooling3D(input_shape=(4, 16, 16, 20)) creating: createKerasGlobalAveragePooling3D @@ -1653,17 +1740,21 @@ class SimpleRNN(KerasLayer): # Arguments output_dim: Hidden unit size. Dimension of internal projections and final output. - activation: String representation of the activation function to use (such as 'relu' or 'sigmoid'). + activation: String representation of the activation function to use (such as 'relu' or + 'sigmoid'). Default is 'tanh'. - return_sequences: Whether to return the full sequence or only return the last output in the output sequence. + return_sequences: Whether to return the full sequence or only return the last output in the + output sequence. Default is False. go_backwards: Whether the input sequence will be processed backwards. Default is False. W_regularizer: An instance of [[Regularizer]], (eg. L1 or L2 regularization), applied to the input weights matrices. Default is None. - U_regularizer: An instance of [[Regularizer]], applied the recurrent weights matrices. Default is None. + U_regularizer: An instance of [[Regularizer]], applied the recurrent weights matrices. + Default is None. b_regularizer: An instance of [[Regularizer]], applied to the bias. Default is None. input_shape: A shape tuple, not including batch. - name: String to set the name of the layer. If not specified, its name will by default to be a generated string. + name: String to set the name of the layer. If not specified, its name will by default to be a + generated string. >>> simplernn = SimpleRNN(16, input_shape=(3, 32)) creating: createKerasSimpleRNN @@ -1696,18 +1787,23 @@ class LSTM(KerasLayer): # Arguments output_dim: Hidden unit size. Dimension of internal projections and final output. - activation: String representation of the activation function to use (such as 'relu' or 'sigmoid'). + activation: String representation of the activation function to use (such as 'relu' or + 'sigmoid'). Default is 'tanh'. - inner_activation: String representation of the activation function for inner cells. Default is 'hard_sigmoid'. - return_sequences: Whether to return the full sequence or only return the last output in the output sequence. + inner_activation: String representation of the activation function for inner cells. Default is + 'hard_sigmoid'. 
+ return_sequences: Whether to return the full sequence or only return the last output in the + output sequence. Default is False. go_backwards: Whether the input sequence will be processed backwards. Default is False. W_regularizer: An instance of [[Regularizer]], (eg. L1 or L2 regularization), applied to the input weights matrices. Default is None. - U_regularizer: An instance of [[Regularizer]], applied the recurrent weights matrices. Default is None. + U_regularizer: An instance of [[Regularizer]], applied the recurrent weights matrices. + Default is None. b_regularizer: An instance of [[Regularizer]], applied to the bias. Default is None. input_shape: A shape tuple, not including batch. - name: String to set the name of the layer. If not specified, its name will by default to be a generated string. + name: String to set the name of the layer. If not specified, its name will by default to be + a generated string. >>> lstm = LSTM(32, input_shape=(8, 16), name="lstm1") creating: createKerasLSTM @@ -1741,18 +1837,23 @@ class GRU(KerasLayer): # Arguments output_dim: Hidden unit size. Dimension of internal projections and final output. - activation: String representation of the activation function to use (such as 'relu' or 'sigmoid'). + activation: String representation of the activation function to use (such as 'relu' or + 'sigmoid'). Default is 'tanh'. - inner_activation: String representation of the activation function for inner cells. Default is 'hard_sigmoid'. - return_sequences: Whether to return the full sequence or only return the last output in the output sequence. + inner_activation: String representation of the activation function for inner cells. Default is + 'hard_sigmoid'. + return_sequences: Whether to return the full sequence or only return the last output in the + output sequence. Default is False. go_backwards: Whether the input sequence will be processed backwards. Default is False. W_regularizer: An instance of [[Regularizer]], (eg. L1 or L2 regularization), applied to the input weights matrices. Default is None. - U_regularizer: An instance of [[Regularizer]], applied the recurrent weights matrices. Default is None. + U_regularizer: An instance of [[Regularizer]], applied the recurrent weights matrices. Default + is None. b_regularizer: An instance of [[Regularizer]], applied to the bias. Default is None. input_shape: A shape tuple, not including batch. - name: String to set the name of the layer. If not specified, its name will by default to be a generated string. + name: String to set the name of the layer. If not specified, its name will by default to be a + generated string. >>> gru = GRU(24, input_shape=(32, 32)) creating: createKerasGRU @@ -1790,23 +1891,30 @@ class ConvLSTM2D(KerasLayer): # Arguments nb_filter: Number of convolution filters to use. nb_row: Number of rows in the convolution kernel. - nb_col: Number of cols in the convolution kernel. Should be equal to nb_row as for a square kernel. - activation: String representation of the activation function to use (such as 'relu' or 'sigmoid'). + nb_col: Number of cols in the convolution kernel. Should be equal to nb_row as for a square + kernel. + activation: String representation of the activation function to use (such as 'relu' or + 'sigmoid'). Default is 'tanh'. - inner_activation: String representation of the activation function for inner cells. Default is 'hard_sigmoid'. + inner_activation: String representation of the activation function for inner cells. Default is + 'hard_sigmoid'. 
dim_ordering: Format of input data. Only 'th' (Channel First) is supported for now. border_mode: Only 'same' is supported for now. - subsample: Tuple of length 2. Factor by which to subsample output. Also called strides elsewhere. + subsample: Tuple of length 2. Factor by which to subsample output. Also called strides + elsewhere. Only support subsample[0] equal to subsample[1] for now. Default is (1, 1). W_regularizer: An instance of [[Regularizer]], (eg. L1 or L2 regularization), applied to the input weights matrices. Default is None. - U_regularizer: An instance of [[Regularizer]], applied the recurrent weights matrices. Default is None. + U_regularizer: An instance of [[Regularizer]], applied the recurrent weights matrices. + Default is None. b_regularizer: An instance of [[Regularizer]], applied to the bias. Default is None. - return_sequences: Whether to return the full sequence or only return the last output in the output sequence. + return_sequences: Whether to return the full sequence or only return the last output in the + output sequence. Default is False. go_backwards: Whether the input sequence will be processed backwards. Default is False. input_shape: A shape tuple, not including batch. - name: String to set the name of the layer. If not specified, its name will by default to be a generated string. + name: String to set the name of the layer. If not specified, its name will by default to be a + generated string. >>> convlstm2d = ConvLSTM2D(24, 3, 3, input_shape=(4, 32, 32, 32)) creating: createKerasConvLSTM2D @@ -1842,8 +1950,10 @@ def __init__(self, nb_filter, nb_row, nb_col, activation="tanh", class LocallyConnected1D(KerasLayer): """ - Locally-connected layer for 1D inputs which works similarly to the TemporalConvolution layer, except that - weights are unshared, that is, a different set of filters is applied at each different patch of the input. + Locally-connected layer for 1D inputs which works similarly to the TemporalConvolution layer, + except that + weights are unshared, that is, a different set of filters is applied at each different patch of + the input. Border mode currently supported for this layer is 'valid'. The input of this layer should be 3D. @@ -1853,7 +1963,8 @@ class LocallyConnected1D(KerasLayer): # Arguments nb_filter: Dimensionality of the output. filter_length: The extension (spatial or temporal) of each filter. - activation: String representation of the activation function to use (such as 'relu' or 'sigmoid'). + activation: String representation of the activation function to use (such as 'relu' or + 'sigmoid'). Default is None. border_mode: Only 'valid' is supported for now. subsample_length: Factor by which to subsample output. Int. Default is 1. @@ -1863,7 +1974,8 @@ class LocallyConnected1D(KerasLayer): bias: Whether to include a bias (i.e. make the layer affine rather than linear). Default is True. input_shape: A shape tuple, not including batch. - name: String to set the name of the layer. If not specified, its name will by default to be a generated string. + name: String to set the name of the layer. If not specified, its name will by default to be a + generated string. 
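To make the unshared-weights remark above concrete, a rough, purely illustrative parameter count for the configuration used in the doctest that follows (valid border mode, bias ignored; variable names are chosen here, not taken from the code):

steps, input_dim = 8, 12                  # input_shape=(8, 12)
nb_filter, filter_length = 6, 3
patches = steps - filter_length + 1                            # 6 output positions with 'valid'
shared_conv_weights = nb_filter * filter_length * input_dim    # 216 if filters were shared
locally_connected_weights = patches * shared_conv_weights      # 1296: one filter set per patch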
>>> locallyconnected1d = LocallyConnected1D(6, 3, input_shape=(8, 12)) creating: createKerasLocallyConnected1D @@ -1875,7 +1987,8 @@ def __init__(self, nb_filter, filter_length, activation=None, border_mode="valid subsample_length=1, W_regularizer=None, b_regularizer=None, bias=True, input_shape=None, **kwargs): if border_mode != "valid": - raise ValueError("For LocallyConnected1D, only border_mode='valid' is supported for now") + raise ValueError("For LocallyConnected1D, only border_mode='valid' is" + " supported for now") super(LocallyConnected1D, self).__init__(None, nb_filter, filter_length, @@ -1890,8 +2003,9 @@ def __init__(self, nb_filter, filter_length, activation=None, border_mode="valid class LocallyConnected2D(KerasLayer): """ - Locally-connected layer for 2D inputs that works similarly to the SpatialConvolution layer, except that - weights are unshared, that is, a different set of filters is applied at each different patch of the input. + Locally-connected layer for 2D inputs that works similarly to the SpatialConvolution layer, + except that weights are unshared, that is, a different set of filters is applied at each + different patch of the input. The input of this layer should be 4D. When you use this layer as the first layer of a model, you need to provide the argument @@ -1901,19 +2015,22 @@ class LocallyConnected2D(KerasLayer): nb_filter: Number of convolution filters to use. nb_row: Number of rows in the convolution kernel. nb_col: Number of cols in the convolution kernel. - activation: String representation of the activation function to use (such as 'relu' or 'sigmoid'). + activation: String representation of the activation function to use (such as 'relu' or + 'sigmoid'). Default is None. border_mode: Either 'valid' or 'same'. Default is 'valid'. subsample: Int tuple of length 2 corresponding to the step of the convolution in the height and width dimension. Also called strides elsewhere. Default is (1, 1). - dim_ordering: Format of input data. Either 'th' (Channel First) or 'tf' (Channel Last). Default is 'th'. + dim_ordering: Format of input data. Either 'th' (Channel First) or 'tf' (Channel Last). + Default is 'th'. W_regularizer: An instance of [[Regularizer]], (eg. L1 or L2 regularization), applied to the input weights matrices. Default is None. b_regularizer: An instance of [[Regularizer]], applied to the bias. Default is None. bias: Whether to include a bias (i.e. make the layer affine rather than linear). Default is True. input_shape: A shape tuple, not including batch. - name: String to set the name of the layer. If not specified, its name will by default to be a generated string. + name: String to set the name of the layer. If not specified, its name will by default to be a + generated string. >>> locallyconnected2d = LocallyConnected2D(12, 3, 4, input_shape=(3, 128, 128)) creating: createKerasLocallyConnected2D @@ -1945,9 +2062,10 @@ class SpatialDropout1D(KerasLayer): Spatial 1D version of Dropout. This version performs the same function as Dropout, however it drops entire 1D feature maps instead of individual elements. If adjacent frames within feature maps are strongly correlated - (as is normally the case in early convolution layers) then regular dropout will not regularize the - activations and will otherwise just result in an effective learning rate decrease. - In this case, SpatialDropout1D will help promote independence between feature maps and should be used instead. 
+ (as is normally the case in early convolution layers) then regular dropout will not regularize + the activations and will otherwise just result in an effective learning rate decrease. + In this case, SpatialDropout1D will help promote independence between feature maps and should be + used instead. The input of this layer should be 3D. When you use this layer as the first layer of a model, you need to provide the argument @@ -1956,7 +2074,8 @@ class SpatialDropout1D(KerasLayer): # Arguments p: Fraction of the input units to drop. Float between 0 and 1. input_shape: A shape tuple, not including batch. - name: String to set the name of the layer. If not specified, its name will by default to be a generated string. + name: String to set the name of the layer. If not specified, its name will by default to be a + generated string. >>> spatialdropout1d = SpatialDropout1D(0.4, input_shape=(10, 12)) creating: createKerasSpatialDropout1D @@ -1976,9 +2095,10 @@ class SpatialDropout2D(KerasLayer): Spatial 2D version of Dropout. This version performs the same function as Dropout, however it drops entire 2D feature maps instead of individual elements. If adjacent pixels within feature maps are strongly correlated - (as is normally the case in early convolution layers) then regular dropout will not regularize the - activations and will otherwise just result in an effective learning rate decrease. - In this case, SpatialDropout2D will help promote independence between feature maps and should be used instead. + (as is normally the case in early convolution layers) then regular dropout will not regularize + the activations and will otherwise just result in an effective learning rate decrease. + In this case, SpatialDropout2D will help promote independence between feature maps and should be + used instead. The input of this layer should be 4D. When you use this layer as the first layer of a model, you need to provide the argument @@ -1986,9 +2106,11 @@ class SpatialDropout2D(KerasLayer): # Arguments p: Fraction of the input units to drop. Float between 0 and 1. - dim_ordering: Format of input data. Either 'th' (Channel First) or 'tf' (Channel Last). Default is 'th'. + dim_ordering: Format of input data. Either 'th' (Channel First) or 'tf' (Channel Last). Default + is 'th'. input_shape: A shape tuple, not including batch. - name: String to set the name of the layer. If not specified, its name will by default to be a generated string. + name: String to set the name of the layer. If not specified, its name will by default to be a + generated string. >>> spatialdropout2d = SpatialDropout2D(0.25, input_shape=(5, 12, 12)) creating: createKerasSpatialDropout2D @@ -2009,9 +2131,10 @@ class SpatialDropout3D(KerasLayer): Spatial 3D version of Dropout. This version performs the same function as Dropout, however it drops entire 3D feature maps instead of individual elements. If adjacent voxels within feature maps are strongly correlated - (as is normally the case in early convolution layers) then regular dropout will not regularize the - activations and will otherwise just result in an effective learning rate decrease. - In this case, SpatialDropout3D will help promote independence between feature maps and should be used instead. + (as is normally the case in early convolution layers) then regular dropout will not regularize + the activations and will otherwise just result in an effective learning rate decrease. 
+ In this case, SpatialDropout3D will help promote independence between feature maps and should be + used instead. The input of this layer should be 5D. When you use this layer as the first layer of a model, you need to provide the argument @@ -2019,9 +2142,11 @@ class SpatialDropout3D(KerasLayer): # Arguments p: Fraction of the input units to drop. Float between 0 and 1. - dim_ordering: Format of input data. Either 'th' (Channel First) or 'tf' (Channel Last). Default is 'th'. + dim_ordering: Format of input data. Either 'th' (Channel First) or 'tf' (Channel Last). Default + is 'th'. input_shape: A shape tuple, not including batch. - name: String to set the name of the layer. If not specified, its name will by default to be a generated string. + name: String to set the name of the layer. If not specified, its name will by default to be a + generated string. >>> spatialdropout3d = SpatialDropout3D(0.6, input_shape=(4, 12, 12, 16)) creating: createKerasSpatialDropout3D @@ -2049,7 +2174,8 @@ class GaussianDropout(KerasLayer): p: Drop probability. Float between 0 and 1. The multiplicative noise will have standard deviation 'sqrt(p/(1-p))'. input_shape: A shape tuple, not including batch. - name: String to set the name of the layer. If not specified, its name will by default to be a generated string. + name: String to set the name of the layer. If not specified, its name will by default to be a + generated string. >>> gaussiandropout = GaussianDropout(0.45, input_shape=(4, 8)) creating: createKerasGaussianDropout @@ -2077,7 +2203,8 @@ class GaussianNoise(KerasLayer): # Arguments sigma: Float, standard deviation of the noise distribution. input_shape: A shape tuple, not including batch. - name: String to set the name of the layer. If not specified, its name will by default to be a generated string. + name: String to set the name of the layer. If not specified, its name will by default to be a + generated string. >>> gaussiannoise = GaussianNoise(0.45, input_shape=(3, 4, 5), name="gaussiannoise1") creating: createKerasGaussianNoise @@ -2105,7 +2232,8 @@ class Masking(KerasLayer): if all values in the input at that timestep are equal to 'mask_value', then the timestep will masked (skipped) in all downstream layers. input_shape: A shape tuple, not including batch. - name: String to set the name of the layer. If not specified, its name will by default to be a generated string. + name: String to set the name of the layer. If not specified, its name will by default to be a + generated string. >>> masking = Masking(0.3, input_shape=(6, 8)) creating: createKerasMasking @@ -2140,13 +2268,15 @@ class SReLU(KerasLayer): Default is 'glorot_uniform'. a_right_init: String representation of the initialization method for the right part slope. Default is 'one'. - shared_axes: Int tuple. The axes along which to share learnable parameters for the activation function. + shared_axes: Int tuple. The axes along which to share learnable parameters for the activation + function. Default is None. - For example, if the incoming feature maps are from a 2D convolution with output shape - (batch, height, width, channels), and you wish to share parameters across space so that - each filter only has one set of parameters, set 'shared_axes=(1,2)'. + For example, if the incoming feature maps are from a 2D convolution with output + shape (batch, height, width, channels), and you wish to share parameters across + space so that each filter only has one set of parameters, set 'shared_axes=(1,2)'. 
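The SpatialDropout docstrings above all describe dropping whole feature maps rather than single activations; a NumPy-only sketch of that behaviour in channel-first ('th') layout, with names and the drop probability chosen purely for illustration:

import numpy as np

rng = np.random.default_rng(0)
x = np.ones((1, 5, 12, 12))                # (batch, channels, height, width)
keep = rng.random((1, 5, 1, 1)) >= 0.25    # one Bernoulli draw per channel (p = 0.25)
spatial_dropped = x * keep                 # whole channels are zeroed together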
input_shape: A shape tuple, not including batch. - name: String to set the name of the layer. If not specified, its name will by default to be a generated string. + name: String to set the name of the layer. If not specified, its name will by default to be a + generated string. >>> srelu = SReLU(input_shape=(4, 5)) creating: createKerasSReLU @@ -2180,7 +2310,8 @@ class ELU(KerasLayer): # Arguments alpha: Float, scale for the negative factor. Default is 1.0. input_shape: A shape tuple, not including batch. - name: String to set the name of the layer. If not specified, its name will by default to be a generated string. + name: String to set the name of the layer. If not specified, its name will by default to be a + generated string. >>> elu = ELU(1.2, input_shape=(4, 5)) creating: createKerasELU @@ -2208,7 +2339,8 @@ class LeakyReLU(KerasLayer): # Arguments alpha: Float >= 0. Negative slope coefficient. Default is 0.3. input_shape: A shape tuple, not including batch. - name: String to set the name of the layer. If not specified, its name will by default to be a generated string. + name: String to set the name of the layer. If not specified, its name will by default to be a + generated string. >>> leakyrelu = LeakyReLU(0.02, input_shape=(4, 5)) creating: createKerasLeakyReLU @@ -2236,7 +2368,8 @@ class ThresholdedReLU(KerasLayer): # Arguments theta: Float >= 0. Threshold location of activation. Default is 1.0. input_shape: A shape tuple, not including batch. - name: String to set the name of the layer. If not specified, its name will by default to be a generated string. + name: String to set the name of the layer. If not specified, its name will by default to be a + generated string. >>> thresholdedrelu = ThresholdedReLU(input_shape=(10, 12)) creating: createKerasThresholdedReLU @@ -2255,7 +2388,8 @@ class TimeDistributed(KerasLayer): """ TimeDistributed wrapper. Apply a layer to every temporal slice of an input. - The input should be at least 3D, and the dimension of index one will be considered to be the temporal dimension. + The input should be at least 3D, and the dimension of index one will be considered to be the + temporal dimension. When you use this layer as the first layer of a model, you need to provide the argument input_shape (a shape tuple, does not include the batch dimension). @@ -2264,9 +2398,11 @@ class TimeDistributed(KerasLayer): # Arguments layer: A layer instance. input_shape: A shape tuple, not including batch. - name: String to set the name of the wrapper. If not specified, its name will by default to be a generated string. + name: String to set the name of the wrapper. If not specified, its name will by default to be a + generated string. - >>> timedistributed = TimeDistributed(Dense(8), input_shape=(10, 12), name="timedistributeddense") + >>> timedistributed = TimeDistributed(Dense(8), input_shape=(10, 12), + ... name="timedistributeddense") creating: createKerasDense creating: createKerasTimeDistributed @@ -2296,9 +2432,11 @@ class Bidirectional(KerasLayer): merge_mode: Mode by which outputs of the forward and backward RNNs will be combined. Must be one of: 'sum', 'mul', 'concat', 'ave'. Default is 'concat'. input_shape: A shape tuple, not including batch. - name: String to set the name of the wrapper. If not specified, its name will by default to be a generated string. + name: String to set the name of the wrapper. If not specified, its name will by default to be a + generated string. 
- >>> bidiretional = Bidirectional(LSTM(10, return_sequences=True), input_shape=(12, 16), name="bidirectionallstm") + >>> bidiretional = Bidirectional(LSTM(10, return_sequences=True), input_shape=(12, 16), + ... name="bidirectionallstm") creating: createKerasLSTM creating: createKerasBidirectional diff --git a/python/dllib/src/bigdl/dllib/nn/keras/layers/topology.py b/python/dllib/src/bigdl/dllib/nn/keras/layers/topology.py index 52b80f2ed44..a1530d58445 100644 --- a/python/dllib/src/bigdl/dllib/nn/keras/layers/topology.py +++ b/python/dllib/src/bigdl/dllib/nn/keras/layers/topology.py @@ -28,6 +28,7 @@ class KerasModel(KerasLayer, Container, SharedStaticUtils): .. note:: `bigdl.dllib.keras` is deprecated in 0.11. This will be removed in future releases. """ + def __convert_optim_method(self, optimizer): optimizer = optimizer.lower() if optimizer == "adagrad": @@ -93,7 +94,8 @@ def compile(self, optimizer, loss, metrics=None): string representation, such as 'sgd'. loss: Criterion to be used. One can alternatively pass in the corresponding string representation, such as 'mse'. - metrics: List of validation methods to be used. Default is None. One can alternatively use ['accuracy']. + metrics: List of validation methods to be used. Default is None. One can alternatively use + ['accuracy']. """ if isinstance(optimizer, six.string_types): optimizer = self.__convert_optim_method(optimizer) @@ -209,6 +211,7 @@ class Sequential(KerasModel): .. note:: `bigdl.dllib.keras` is deprecated in 0.11. This will be removed in future releases. """ + def __init__(self, jvalue=None, **kwargs): warnings.warn("bigdl.dllib.keras is deprecated in 0.11. " "Recommend to use Analytics Zoo's Keras API.") @@ -242,13 +245,15 @@ class Model(KerasModel): .. note:: `bigdl.dllib.keras` is deprecated in 0.11. This will be removed in future releases. """ - def __init__(self, input, output, jvalue=None, **kwargs): + + def __init__(self, input, output, jvalue=None, **kwargs): warnings.warn("bigdl.dllib.keras is deprecated in BigDL 0.11." "Recommend to use Analytics Zoo's Keras API.") super(Model, self).__init__(jvalue, to_list(input), to_list(output), **kwargs) + @staticmethod def from_jvalue(jvalue, bigdl_type="float"): """ diff --git a/python/dllib/src/bigdl/dllib/nn/keras/optimization.py b/python/dllib/src/bigdl/dllib/nn/keras/optimization.py index 89add68c51b..4024f382ad0 100644 --- a/python/dllib/src/bigdl/dllib/nn/keras/optimization.py +++ b/python/dllib/src/bigdl/dllib/nn/keras/optimization.py @@ -103,7 +103,8 @@ def to_bigdl_optim_method(koptim_method): decayrate=rho, epsilon=koptim_method.epsilon) elif isinstance(koptim_method, koptimizers.Adadelta): - warnings.warn("For Adadelta, we don't support learning rate and learning rate decay for now") + warnings.warn("For Adadelta, we don't support learning rate and learning rate decay for" + " now") return boptimizer.Adadelta(decayrate=koptim_method.rho, epsilon=koptim_method.epsilon) elif isinstance(koptim_method, koptimizers.Adamax): diff --git a/python/dllib/src/bigdl/dllib/nn/layer.py b/python/dllib/src/bigdl/dllib/nn/layer.py index 6e5d200d43f..322c940b2e9 100644 --- a/python/dllib/src/bigdl/dllib/nn/layer.py +++ b/python/dllib/src/bigdl/dllib/nn/layer.py @@ -39,10 +39,12 @@ long = int unicode = str + class Node(JavaValue): """ Represent a node in a graph. The connections between nodes are directed. 
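For reference, a minimal sketch of the compile() call documented in topology.py above, using the string shorthands for optimizer, loss and metrics; Dense is assumed to be exported from the same deprecated bigdl.dllib.nn.keras layers module as the classes in this patch (illustrative only):

from bigdl.dllib.nn.keras.layers.topology import Sequential
from bigdl.dllib.nn.keras.layers.layer import Dense

model = Sequential()
model.add(Dense(10, input_shape=(784,)))
model.compile(optimizer="sgd", loss="mse", metrics=["accuracy"])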
""" + def __init__(self, jvalue, bigdl_type, *args): self.value = jvalue if jvalue else callBigDlFunc( bigdl_type, self.jvm_class_constructor(), *args) @@ -75,7 +77,6 @@ def load(path, bigdl_type="float"): jmodel = callBigDlFunc(bigdl_type, "loadBigDL", path) return Layer.of(jmodel) - @staticmethod def of(jvalue, bigdl_type="float"): """ @@ -83,6 +84,7 @@ def of(jvalue, bigdl_type="float"): :param jvalue: Java object create by Py4j :return: A Python Layer """ + def get_py_name(jclass_name): if jclass_name == "StaticGraph" or jclass_name == "DynamicGraph": return "Model" @@ -92,21 +94,21 @@ def get_py_name(jclass_name): return jclass_name jname = callBigDlFunc(bigdl_type, - "getRealClassNameOfJValue", - jvalue) + "getRealClassNameOfJValue", + jvalue) jpackage_name = ".".join(jname.split(".")[:-1]) pclass_name = get_py_name(jname.split(".")[-1]) if "com.intel.analytics.bigdl.dllib.keras.Model" == jname or \ - "com.intel.analytics.bigdl.dllib.keras.Sequential" == jname: + "com.intel.analytics.bigdl.dllib.keras.Sequential" == jname: base_module = importlib.import_module('bigdl.dllib.keras.layers.topology') elif "com.intel.analytics.bigdl.dllib.keras" == jpackage_name: base_module = importlib.import_module('bigdl.dllib.keras.layers.layer') else: base_module = importlib.import_module('bigdl.dllib.nn.layer') - realClassName = "Layer" # The top base class + realClassName = "Layer" # The top base class if pclass_name in dir(base_module): realClassName = pclass_name module = getattr(base_module, realClassName) @@ -114,6 +116,7 @@ def get_py_name(jclass_name): model = jvalue_creator(jvalue, bigdl_type) return model + class Layer(JavaValue, SharedStaticUtils): """ Layer is the basic component of a neural network @@ -123,7 +126,7 @@ class Layer(JavaValue, SharedStaticUtils): def __init__(self, jvalue, bigdl_type, *args): if (jvalue): - assert(type(jvalue) == JavaObject) + assert (type(jvalue) == JavaObject) self.value = jvalue else: self.value = callBigDlFunc( @@ -219,6 +222,7 @@ def check_input(input): :param input: ndarray or list of ndarray or JTensor or list of JTensor. :return: (list of JTensor, isTable) """ + def to_jtensor(i): if isinstance(i, np.ndarray): return JTensor.from_ndarray(i) @@ -246,7 +250,7 @@ def check_list(input): def convert_output(output): if type(output) is JTensor: return output.to_ndarray() - elif(len(output) == 1): + elif (len(output) == 1): return output[0].to_ndarray() else: return [x.to_ndarray() for x in output] @@ -272,9 +276,9 @@ def backward(self, input, grad_output): """ NB: It's for debug only, please use optimizer.optimize() in production. Performs a back-propagation step through the module, with respect to the given input. In - general this method makes the assumption forward(input) has been called before, with the same - input. This is necessary for optimization reasons. If you do not respect this rule, backward() - will compute incorrect gradients. + general this method makes the assumption forward(input) has been called before, with the + same input. This is necessary for optimization reasons. If you do not respect this rule, + backward() will compute incorrect gradients. :param input: ndarray or list of ndarray or JTensor or list of JTensor. :param grad_output: ndarray or list of ndarray or JTensor or list of JTensor. 
@@ -332,7 +336,7 @@ def to_ndarray(params): params.items()) return dict((layer_name, to_ndarray(params)) for layer_name, params in - name_to_params.items()) + name_to_params.items()) def evaluate(self, *args): """ @@ -356,16 +360,18 @@ def evaluate(self, *args): dataset, batch_size, val_methods = args if (isinstance(dataset, ImageFrame)): return callBigDlFunc(self.bigdl_type, - "modelEvaluateImageFrame", - self.value, - dataset, batch_size, val_methods) + "modelEvaluateImageFrame", + self.value, + dataset, batch_size, val_methods) else: return callBigDlFunc(self.bigdl_type, "modelEvaluate", self.value, dataset, batch_size, val_methods) else: - raise Exception("Error when calling evaluate(): it takes no argument or exactly three arguments only") + raise Exception( + "Error when calling evaluate(): it takes no argument or exactly three arguments" + " only") def _to_jtensors(self, x): x = to_list(x) @@ -376,8 +382,7 @@ def _to_jtensors(self, x): else: raise Exception("Not supported type: %s" % type(x[0])) - - def predict_local(self, X, batch_size = -1): + def predict_local(self, X, batch_size=-1): """ :param X: X can be a ndarray or list of ndarray if the model has multiple inputs. The first dimension of X should be batch. @@ -386,12 +391,12 @@ def predict_local(self, X, batch_size = -1): """ jresults = callBigDlFunc(self.bigdl_type, - "predictLocal", - self.value, - self._to_jtensors(X), - batch_size) + "predictLocal", + self.value, + self._to_jtensors(X), + batch_size) - return np.stack([j.to_ndarray()for j in jresults]) + return np.stack([j.to_ndarray() for j in jresults]) def predict_class_local(self, X): """ @@ -401,12 +406,12 @@ def predict_class_local(self, X): :return: a ndarray as the prediction result. """ result = callBigDlFunc(self.bigdl_type, - "predictLocalClass", + "predictLocalClass", self.value, self._to_jtensors(X)) return np.stack(result) - def predict(self, features, batch_size = -1): + def predict(self, features, batch_size=-1): """ Model inference base on the given data. :param features: it can be a ndarray or list of ndarray for locally inference @@ -431,7 +436,7 @@ def predict_class(self, features): else: return self.predict_class_local(features) - def predict_distributed(self, data_rdd, batch_size = -1): + def predict_distributed(self, data_rdd, batch_size=-1): """ Model inference base on the given data. You need to invoke collect() to trigger those action \ @@ -469,11 +474,11 @@ def predict_image(self, image_frame, output_layer=None, share_buffer=False, """ image_frame = callBigDlFunc(self.bigdl_type, "modelPredictImage", self.value, - image_frame, - output_layer, - share_buffer, - batch_per_partition, - predict_key) + image_frame, + output_layer, + share_buffer, + batch_per_partition, + predict_key) return ImageFrame(image_frame) def set_weights(self, weights): @@ -507,9 +512,8 @@ def set_weights(self, weights): >>> try: ... add.set_weights([np.array([7,8]), np.array([1,2])]) ... except Py4JJavaError as err: - ... print(err.java_exception) + ... pass ... 
- java.lang.IllegalArgumentException: requirement failed: the number of input weight/bias is not consistant with number of weight/bias of this layer, number of input 1, number of output 2 >>> cAdd = CAdd([4, 1]) creating: createCAdd >>> cAdd.set_weights(np.ones([4, 1])) @@ -526,7 +530,7 @@ def get_weights(self): :return: list of numpy arrays which represent weight and bias """ tensorWeights = callBigDlFunc(self.bigdl_type, - "getWeights", self.value) + "getWeights", self.value) if tensorWeights is not None: return [tensor.to_ndarray() for tensor in tensorWeights] else: @@ -535,16 +539,17 @@ def get_weights(self): def is_with_weights(self): return callBigDlFunc(self.bigdl_type, - "isWithWeights", self.value) + "isWithWeights", self.value) - def save(self, path, over_write = False): + def save(self, path, over_write=False): callBigDlFunc(self.bigdl_type, "modelSave", self.value, path, over_write) - def saveModel(self, modelPath, weightPath = None, over_write = False): + + def saveModel(self, modelPath, weightPath=None, over_write=False): callBigDlFunc(self.bigdl_type, "saveBigDLModule", self.value, modelPath, weightPath, over_write) - def save_caffe(self, prototxt_path, model_path, use_v2 = True, overwrite = False): + def save_caffe(self, prototxt_path, model_path, use_v2=True, overwrite=False): callBigDlFunc(self.bigdl_type, "saveCaffe", self.value, prototxt_path, model_path, use_v2, overwrite) @@ -564,7 +569,6 @@ def save_tensorflow(self, inputs, path, byte_order="little_endian", data_format= """ callBigDlFunc(self.bigdl_type, "saveTF", self.value, inputs, path, byte_order, data_format) - def setWRegularizer(self, wRegularizer): ''' set weight regularizer @@ -648,11 +652,13 @@ def quantize(self): >>> conv.set_weights([np.ones((2, 1, 3, 3)), np.zeros((2,))]) >>> input = np.ones((2, 1, 4, 4)) >>> output = conv.forward(input) - >>> expected_output = np.array([[[[9., 9.], [9., 9.]], [[9., 9.], [9., 9.]]], [[[9., 9.], [9., 9.]], [[9., 9.], [9., 9.]]]]) + >>> expected_output = np.array([[[[9., 9.], [9., 9.]], [[9., 9.], [9., 9.]]], [[[9., 9.], + ... [9., 9.]], [[9., 9.], [9., 9.]]]]) >>> np.testing.assert_allclose(output, expected_output) >>> quantized_conv = conv.quantize() >>> quantized_output = quantized_conv.forward(input) - >>> expected_quantized_output = np.array([[[[9., 9.], [9., 9.]], [[9., 9.], [9., 9.]]], [[[9., 9.], [9., 9.]], [[9., 9.], [9., 9.]]]]) + >>> expected_quantized_output = np.array([[[[9., 9.], [9., 9.]], [[9., 9.], [9., 9.]]], + ... [[[9., 9.], [9., 9.]], [[9., 9.], [9., 9.]]]]) >>> np.testing.assert_allclose(quantized_output, expected_quantized_output) >>> assert("quantized.SpatialConvolution" in quantized_conv.__str__()) >>> seq = Sequential() @@ -663,11 +669,13 @@ def quantize(self): >>> seq = seq.add(fc) >>> input = np.ones([1, 1, 6, 6]) >>> output = seq.forward(input) - >>> expected_output = np.array([[37., 37.], [37., 37.], [37., 37.], [37., 37.], [37., 37.], [37., 37.], [37., 37.], [37., 37.]]) + >>> expected_output = np.array([[37., 37.], [37., 37.], [37., 37.], [37., 37.], [37., 37.], + ... [37., 37.], [37., 37.], [37., 37.]]) >>> np.testing.assert_allclose(output, expected_output) >>> quantized_seq = seq.quantize() >>> quantized_output = quantized_seq.forward(input) - >>> expected_quantized_output = np.array([[37., 37.], [37., 37.], [37., 37.], [37., 37.], [37., 37.], [37., 37.], [37., 37.], [37., 37.]]) + >>> expected_quantized_output = np.array([[37., 37.], [37., 37.], [37., 37.], [37., 37.], + ... 
[37., 37.], [37., 37.], [37., 37.], [37., 37.]]) >>> np.testing.assert_allclose(quantized_output, expected_quantized_output) >>> assert("quantized.Linear" in quantized_seq.__str__()) >>> assert("quantized.SpatialConvolution" in quantized_seq.__str__()) @@ -721,9 +729,10 @@ class Model(Container): All inputs should be able to connect to outputs through some paths in the graph. It is allowed that some successors of the inputs node are not connect to outputs. If so, these nodes will be excluded in the computation. - + We also support initializing a Graph directly from a tensorflow module. In this case, you should - pass your tensorflow nodes as inputs and outputs and also specify the byte_order parameter ("little_endian" + pass your tensorflow nodes as inputs and outputs and also specify the byte_order parameter + ("little_endian" or "big_endian") and node_type parameter ("bigdl" or "tensorflow") node_type parameter. """ @@ -749,7 +758,6 @@ def __init__(self, model = convert(to_list(inputs), to_list(outputs), byte_order, bigdl_type) super(Model, self).__init__(model.value, bigdl_type) - @staticmethod def from_jvalue(jvalue, bigdl_type="float"): """ @@ -764,9 +772,8 @@ def from_jvalue(jvalue, bigdl_type="float"): def __str__(self): return "->".join(self.layers()) - @staticmethod - def loadModel(modelPath, weightPath =None, bigdl_type="float"): + def loadModel(modelPath, weightPath=None, bigdl_type="float"): """ Load a pre-trained Bigdl model. @@ -793,7 +800,8 @@ def load_keras(json_path=None, hdf5_path=None, by_name=False): Load a pre-trained Keras model. :param json_path: The json path containing the keras model definition. - :param hdf5_path: The HDF5 path containing the pre-trained keras model weights with or without the model architecture. + :param hdf5_path: The HDF5 path containing the pre-trained keras model weights with or + without the model architecture. :return: A bigdl model. """ import os @@ -845,15 +853,16 @@ def load_caffe_model(defPath, modelPath, bigdl_type="float"): return Layer.of(jmodel) @staticmethod - def load_tensorflow(path, inputs, outputs, byte_order = "little_endian", - bin_file = None, generated_backward = True, bigdl_type = "float"): + def load_tensorflow(path, inputs, outputs, byte_order="little_endian", + bin_file=None, generated_backward=True, bigdl_type="float"): """ Load a pre-trained Tensorflow model. :param path: The path containing the pre-trained model. :param inputs: The input node of this graph :param outputs: The output node of this graph :param byte_order: byte_order of the file, `little_endian` or `big_endian` - :param bin_file: the optional bin file produced by bigdl dump_model util function to store the weights + :param bin_file: the optional bin file produced by bigdl dump_model util function to store + the weights :param generated_backward: if generate backward graph :return: A pre-trained model. 
""" @@ -862,7 +871,8 @@ def load_tensorflow(path, inputs, outputs, byte_order = "little_endian", return Model.of(jmodel) @staticmethod - def train(output, data, label, opt_method, criterion, batch_size, end_when, session=None, bigdl_type="float"): + def train(output, data, label, opt_method, criterion, batch_size, end_when, session=None, + bigdl_type="float"): from bigdl.dllib.utils.tf_utils import get_path from bigdl.dllib.utils.common import Sample output_name = output.name.split(":")[0] @@ -871,8 +881,10 @@ def train(output, data, label, opt_method, criterion, batch_size, end_when, sess rdd_train_images = sc.parallelize(data) rdd_train_labels = sc.parallelize(label) rdd_train_sample = rdd_train_images.zip(rdd_train_labels).map(lambda input: - Sample.from_ndarray(input[0], input[1])) - jmodel = callBigDlFunc(bigdl_type, "trainTF", path, output_name, rdd_train_sample, opt_method, criterion, batch_size, end_when) + Sample.from_ndarray(input[0], + input[1])) + jmodel = callBigDlFunc(bigdl_type, "trainTF", path, output_name, rdd_train_sample, + opt_method, criterion, batch_size, end_when) return Model.of(jmodel) def stop_gradient(self, stop_layers, bigdl_type="float"): @@ -893,8 +905,8 @@ def node(self, name, bigdl_type="float"): Return the corresponding node has the given name. If the given name doesn't match any node, an exception will be thrown :param name: node name - :param bigdl_type: - :return: + :param bigdl_type: + :return: """ jnode = callBigDlFunc(bigdl_type, "findGraphNode", self.value, name) return Node.of(jnode) @@ -920,7 +932,7 @@ def set_input_formats(self, input_formats, bigdl_type="float"): jname = callBigDlFunc(bigdl_type, "getRealClassNameOfJValue", self.value) - if jname.split(".")[-1] == "StaticGraph" : + if jname.split(".")[-1] == "StaticGraph": callBigDlFunc(bigdl_type, "setInputFormats", self.value, input_formats) return self @@ -938,8 +950,8 @@ def set_output_formats(self, output_formats, bigdl_type="float"): callBigDlFunc(bigdl_type, "setOutputFormats", self.value, output_formats) return self -class Attention(Layer): +class Attention(Layer): ''' Implementation of multiheaded attention and self-attention layers. @@ -951,8 +963,8 @@ def __init__(self, hidden_size, num_heads, attention_dropout, bigdl_type="float" super(Attention, self).__init__(None, bigdl_type, hidden_size, num_heads, attention_dropout) -class FeedForwardNetwork(Layer): +class FeedForwardNetwork(Layer): ''' Implementation FeedForwardNetwork constructed with fully connected network. Input with shape (batch_size, length, hidden_size) @@ -964,7 +976,9 @@ class FeedForwardNetwork(Layer): def __init__(self, hidden_size, filter_size, relu_dropout, bigdl_type="float"): super(FeedForwardNetwork, self).__init__(None, bigdl_type, - hidden_size, filter_size, relu_dropout) + hidden_size, filter_size, relu_dropout) + + class LayerNormalization(Layer): ''' Applies layer normalization. 
@@ -976,6 +990,7 @@ class LayerNormalization(Layer): def __init__(self, hidden_size, bigdl_type="float"): super(LayerNormalization, self).__init__(None, bigdl_type, hidden_size) + class TableOperation(Layer): ''' When two tensors have different size, firstly expand small size tensor to large size tensor, @@ -989,6 +1004,7 @@ class TableOperation(Layer): def __init__(self, operation_layer, bigdl_type="float"): super(TableOperation, self).__init__(None, bigdl_type, operation_layer) + class ExpandSize(Layer): ''' Expand tensor to configured size @@ -1000,22 +1016,24 @@ class ExpandSize(Layer): def __init__(self, sizes, bigdl_type="float"): super(ExpandSize, self).__init__(None, bigdl_type, sizes) -class Transformer(Layer): +class Transformer(Layer): ''' Implementation for Transformer >>> layer = Transformer(20, 4, 2, 3, 1, 0.1, 0.1, 0.1) creating: createTransformer ''' + def __init__(self, vocab_size, hidden_size, num_heads, filter_size, num_hidden_layers, postprocess_dropout, attention_dropout, relu_dropout, bigdl_type="float"): super(Transformer, self).__init__(None, bigdl_type, vocab_size, - hidden_size, num_heads, filter_size, - num_hidden_layers, postprocess_dropout, - attention_dropout, relu_dropout) -class Linear(Layer): + hidden_size, num_heads, filter_size, + num_hidden_layers, postprocess_dropout, + attention_dropout, relu_dropout) + +class Linear(Layer): ''' The [[Linear]] module applies a linear transformation to the input data, i.e. `y = Wx + b`. The input given in `forward(input)` must be either @@ -1026,7 +1044,8 @@ class Linear(Layer): :param input_size the size the each input sample :param output_size the size of the module output of each sample - :param wRegularizer: instance of [[Regularizer]](eg. L1 or L2 regularization), applied to the input weights matrices. + :param wRegularizer: instance of [[Regularizer]](eg. L1 or L2 regularization), applied to the + input weights matrices. :param bRegularizer: instance of [[Regularizer]]applied to the bias. :param init_weight: the optional initial value for the weight :param init_bias: the optional initial value for the bias @@ -1043,14 +1062,17 @@ class Linear(Layer): >>> init_bias = np.random.randn(10) >>> init_grad_weight = np.zeros([10, 100]) >>> init_grad_bias = np.zeros([10]) - >>> linear = Linear(100, 10, True, L1Regularizer(0.5), L1Regularizer(0.5), init_weight, init_bias, init_grad_weight, init_grad_bias) + >>> linear = Linear(100, 10, True, L1Regularizer(0.5), L1Regularizer(0.5), init_weight, + ... 
init_bias, init_grad_weight, init_grad_bias) creating: createL1Regularizer creating: createL1Regularizer creating: createLinear ''' - def __init__(self, input_size, output_size, with_bias=True, wRegularizer=None, bRegularizer=None, - init_weight=None, init_bias=None, init_grad_weight=None, init_grad_bias=None, bigdl_type="float"): + def __init__(self, input_size, output_size, with_bias=True, wRegularizer=None, + bRegularizer=None, + init_weight=None, init_bias=None, init_grad_weight=None, init_grad_bias=None, + bigdl_type="float"): super(Linear, self).__init__(None, bigdl_type, input_size, output_size, with_bias, wRegularizer, bRegularizer, JTensor.from_ndarray(init_weight), @@ -1058,13 +1080,13 @@ def __init__(self, input_size, output_size, with_bias=True, wRegularizer=None, b JTensor.from_ndarray(init_grad_weight), JTensor.from_ndarray(init_grad_bias)) - def set_init_method(self, weight_init_method = None, bias_init_method = None): + def set_init_method(self, weight_init_method=None, bias_init_method=None): callBigDlFunc(self.bigdl_type, "setInitMethod", self.value, - weight_init_method, bias_init_method) + weight_init_method, bias_init_method) return self -class SparseLinear(Layer): +class SparseLinear(Layer): ''' SparseLinear is the sparse version of module Linear. SparseLinear has two different from Linear: firstly, SparseLinear's input Tensor is a SparseTensor. Secondly, SparseLinear doesn't backward @@ -1079,7 +1101,8 @@ class SparseLinear(Layer): :param backwardStart backwardStart index, counting from 1 :param backwardLength backward length :param withBias if has bias - :param wRegularizer: instance of [[Regularizer]](eg. L1 or L2 regularization), applied to the input weights matrices. + :param wRegularizer: instance of [[Regularizer]](eg. L1 or L2 regularization), applied to the + input weights matrices. :param bRegularizer: instance of [[Regularizer]]applied to the bias. :param init_weight: the optional initial value for the weight :param init_bias: the optional initial value for the bias @@ -1087,7 +1110,8 @@ class SparseLinear(Layer): :param init_grad_bias: the optional initial value for the grad_bias - >>> sparselinear = SparseLinear(100, 10, True, wRegularizer=L1Regularizer(0.5), bRegularizer=L1Regularizer(0.5)) + >>> sparselinear = SparseLinear(100, 10, True, wRegularizer=L1Regularizer(0.5), + ... bRegularizer=L1Regularizer(0.5)) creating: createL1Regularizer creating: createL1Regularizer creating: createSparseLinear @@ -1096,7 +1120,8 @@ class SparseLinear(Layer): >>> init_bias = np.random.randn(10) >>> init_grad_weight = np.zeros([10, 100]) >>> init_grad_bias = np.zeros([10]) - >>> sparselinear = SparseLinear(100, 10, True, 1, 5, L1Regularizer(0.5), L1Regularizer(0.5), init_weight, init_bias, init_grad_weight, init_grad_bias) + >>> sparselinear = SparseLinear(100, 10, True, 1, 5, L1Regularizer(0.5), L1Regularizer(0.5), + ... init_weight, init_bias, init_grad_weight, init_grad_bias) creating: createL1Regularizer creating: createL1Regularizer creating: createSparseLinear @@ -1105,9 +1130,11 @@ class SparseLinear(Layer): >>> init_bias = np.random.randn(5) >>> sparselinear = SparseLinear(1000, 5, init_weight=init_weight, init_bias=init_bias) creating: createSparseLinear - >>> input = JTensor.sparse(np.array([1, 3, 5, 2, 4, 6]), np.array([0, 0, 0, 1, 1, 1, 1, 5, 300, 2, 100, 500]), np.array([2, 1000])) + >>> input = JTensor.sparse(np.array([1, 3, 5, 2, 4, 6]), np.array([0, 0, 0, 1, 1, 1, 1, 5, 300, + ... 
2, 100, 500]), np.array([2, 1000])) >>> output = sparselinear.forward(input) - >>> expected_output = np.array([[10.09569263, -10.94844246, -4.1086688, 1.02527523, 11.80737209], [7.9651413, 9.7131443, -10.22719955, 0.02345783, -3.74368906]]) + >>> expected_output = np.array([[10.09569263, -10.94844246, -4.1086688, 1.02527523, 11.80737209] + ... , [7.9651413, 9.7131443, -10.22719955, 0.02345783, -3.74368906]]) >>> np.testing.assert_allclose(output, expected_output, rtol=1e-6, atol=1e-6) ''' @@ -1115,20 +1142,20 @@ def __init__(self, input_size, output_size, with_bias=True, backwardStart=-1, ba wRegularizer=None, bRegularizer=None, init_weight=None, init_bias=None, init_grad_weight=None, init_grad_bias=None, bigdl_type="float"): super(SparseLinear, self).__init__(None, bigdl_type, input_size, output_size, - with_bias, backwardStart, backwardLength, - wRegularizer, bRegularizer, - JTensor.from_ndarray(init_weight), - JTensor.from_ndarray(init_bias), - JTensor.from_ndarray(init_grad_weight), - JTensor.from_ndarray(init_grad_bias)) + with_bias, backwardStart, backwardLength, + wRegularizer, bRegularizer, + JTensor.from_ndarray(init_weight), + JTensor.from_ndarray(init_bias), + JTensor.from_ndarray(init_grad_weight), + JTensor.from_ndarray(init_grad_bias)) - def set_init_method(self, weight_init_method = None, bias_init_method = None): + def set_init_method(self, weight_init_method=None, bias_init_method=None): callBigDlFunc(self.bigdl_type, "setInitMethod", self.value, weight_init_method, bias_init_method) return self -class DenseToSparse(Layer): +class DenseToSparse(Layer): ''' Convert DenseTensor to SparseTensor. @@ -1141,8 +1168,8 @@ def __init__(self, bigdl_type="float"): super(DenseToSparse, self).__init__(None, bigdl_type) -class ReLU(Layer): +class ReLU(Layer): ''' Applies the rectified linear unit (ReLU) function element-wise to the input Tensor, thus outputting a Tensor of the same dimension. @@ -1161,7 +1188,6 @@ def __init__(self, ip=False, bigdl_type="float"): class Tanh(Layer): - ''' Applies the Tanh function element-wise to the input Tensor, thus outputting a Tensor of the same dimension. Tanh is defined as f(x) = (exp(x)-exp(-x))/(exp(x)+exp(-x)). @@ -1176,7 +1202,6 @@ def __init__(self, bigdl_type="float"): class Sigmoid(Layer): - ''' Applies the Sigmoid function element-wise to the input Tensor, thus outputting a Tensor of the same dimension. @@ -1191,7 +1216,6 @@ def __init__(self, class Echo(Layer): - ''' This module is for debug purpose, which can print activation and gradient in your model topology @@ -1206,7 +1230,6 @@ def __init__(self, bigdl_type="float"): class LogSoftMax(Layer): - ''' Applies the LogSoftMax function to an n-dimensional input Tensor. LogSoftmax is defined as: f_i(x) = log(1 / a exp(x_i)) @@ -1222,7 +1245,6 @@ def __init__(self, bigdl_type="float"): class Sequential(Container): - ''' Sequential provides a means to plug layers together in a feed-forward fully connected manner. @@ -1263,9 +1285,7 @@ def to_graph(self): return model - class TemporalConvolution(Layer): - ''' Applies a 1D convolution over an input sequence composed of nInputFrame frames.. 
The input tensor in `forward(input)` is expected to be a 2D tensor @@ -1308,22 +1328,24 @@ def __init__(self, init_grad_bias=None, bigdl_type="float"): super(TemporalConvolution, self).__init__(None, bigdl_type, - input_frame_size, - output_frame_size, - kernel_w, - stride_w, - propagate_back, - weight_regularizer, - bias_regularizer, - JTensor.from_ndarray(init_weight), - JTensor.from_ndarray(init_bias), - JTensor.from_ndarray(init_grad_weight), - JTensor.from_ndarray(init_grad_bias)) - def set_init_method(self, weight_init_method = None, bias_init_method = None): + input_frame_size, + output_frame_size, + kernel_w, + stride_w, + propagate_back, + weight_regularizer, + bias_regularizer, + JTensor.from_ndarray(init_weight), + JTensor.from_ndarray(init_bias), + JTensor.from_ndarray(init_grad_weight), + JTensor.from_ndarray(init_grad_bias)) + + def set_init_method(self, weight_init_method=None, bias_init_method=None): callBigDlFunc(self.bigdl_type, "setInitMethod", self.value, weight_init_method, bias_init_method) return self + class LocallyConnected1D(Layer): ''' The `LocallyConnected1D` layer works similarly to @@ -1388,6 +1410,7 @@ def set_init_method(self, weight_init_method=None, bias_init_method=None): weight_init_method, bias_init_method) return self + class BinaryTreeLSTM(Layer): ''' This class is an implementation of Binary TreeLSTM (Constituency Tree LSTM). @@ -1412,8 +1435,8 @@ def __init__(self, gate_output, with_graph) -class LocallyConnected2D(Layer): +class LocallyConnected2D(Layer): ''' The LocallyConnected2D layer works similarly to the [[SpatialConvolution]] layer, except that weights are unshared, that is, a different set of filters @@ -1430,16 +1453,18 @@ class LocallyConnected2D(Layer): :param pad_w The additional zeros added per width to the input planes. :param pad_h The additional zeros added per height to the input planes. :param propagate_back Propagate gradient back - :param wRegularizer: instance of [[Regularizer]](eg. L1 or L2 regularization), applied to the input weights matrices. + :param wRegularizer: instance of [[Regularizer]](eg. L1 or L2 regularization), applied to the + input weights matrices. :param bRegularizer: instance of [[Regularizer]]applied to the bias. :param init_weight: the optional initial value for the weight :param init_bias: the optional initial value for the bias :param init_grad_weight: the optional initial value for the grad_weight :param init_grad_bias: the optional initial value for the grad_bias :param with_bias: the optional initial value for if need bias - :param data_format: a string value of "NHWC" or "NCHW" to specify the input data format of this layer. In "NHWC" format - data is stored in the order of [batch_size, height, width, channels], in "NCHW" format data is stored - in the order of [batch_size, channels, height, width]. + :param data_format: a string value of "NHWC" or "NCHW" to specify the input data format of this + layer. In "NHWC" format data is stored in the order of + [batch_size, height, width, channels], in "NCHW" format data is stored + in the order of [batch_size, channels, height, width]. 
>>> locallyConnected2D = LocallyConnected2D(6, 2, 4, 12, 5, 5) creating: createLocallyConnected2D @@ -1490,13 +1515,14 @@ def __init__(self, JTensor.from_ndarray(init_grad_bias), with_bias, data_format) - def set_init_method(self, weight_init_method = None, bias_init_method = None): + + def set_init_method(self, weight_init_method=None, bias_init_method=None): callBigDlFunc(self.bigdl_type, "setInitMethod", self.value, weight_init_method, bias_init_method) return self -class SpatialConvolution(Layer): +class SpatialConvolution(Layer): ''' Applies a 2D convolution over an input image composed of several input planes. The input tensor in forward(input) is expected to be @@ -1512,15 +1538,18 @@ class SpatialConvolution(Layer): :param pad_h The additional zeros added per height to the input planes. :param n_group Kernel group number :param propagate_back Propagate gradient back - :param wRegularizer: instance of [[Regularizer]](eg. L1 or L2 regularization), applied to the input weights matrices. + :param wRegularizer: instance of [[Regularizer]](eg. L1 or L2 regularization), applied to the + input weights matrices. :param bRegularizer: instance of [[Regularizer]]applied to the bias. :param init_weight: the optional initial value for the weight :param init_bias: the optional initial value for the bias :param init_grad_weight: the optional initial value for the grad_weight :param init_grad_bias: the optional initial value for the grad_bias :param with_bias: the optional initial value for if need bias - :param data_format: a string value of "NHWC" or "NCHW" to specify the input data format of this layer. In "NHWC" format - data is stored in the order of [batch_size, height, width, channels], in "NCHW" format data is stored + :param data_format: a string value of "NHWC" or "NCHW" to specify the input data format of this + layer. In "NHWC" format + data is stored in the order of [batch_size, height, width, channels], + in "NCHW" format data is stored in the order of [batch_size, channels, height, width]. >>> spatialConvolution = SpatialConvolution(6, 12, 5, 5) @@ -1534,7 +1563,9 @@ class SpatialConvolution(Layer): >>> init_bias = np.random.randn(12) >>> init_grad_weight = np.zeros([1, 12, 6, 5, 5]) >>> init_grad_bias = np.zeros([12]) - >>> spatialConvolution = SpatialConvolution(6, 12, 5, 5, 1, 1, 0, 0, 1, True, L1Regularizer(0.5), L1Regularizer(0.5), init_weight, init_bias, init_grad_weight, init_grad_bias, True, "NCHW") + >>> spatialConvolution = SpatialConvolution(6, 12, 5, 5, 1, 1, 0, 0, 1, True, + ... L1Regularizer(0.5), L1Regularizer(0.5), init_weight, init_bias, init_grad_weight, + ... init_grad_bias, True, "NCHW") creating: createL1Regularizer creating: createL1Regularizer creating: createSpatialConvolution @@ -1579,14 +1610,14 @@ def __init__(self, JTensor.from_ndarray(init_grad_bias), with_bias, data_format) - def set_init_method(self, weight_init_method = None, bias_init_method = None): + + def set_init_method(self, weight_init_method=None, bias_init_method=None): callBigDlFunc(self.bigdl_type, "setInitMethod", self.value, - weight_init_method, bias_init_method) + weight_init_method, bias_init_method) return self class TemporalMaxPooling(Layer): - ''' Applies 1D max-pooling operation in kW regions by step size dW steps. Input sequence composed of nInputFrame frames. 
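The 'SAME'-style padding rules quoted above for SpatialMaxPooling (padW = padH = -1) can be restated in a few lines of plain Python; this is only a transcription of those formulas, not code from the patch:

import math

def same_pool_padding(in_h, in_w, k_h, k_w, d_h, d_w):
    out_h = math.ceil(in_h / d_h)
    out_w = math.ceil(in_w / d_w)
    pad_along_h = max(0, (out_h - 1) * d_h + k_h - in_h)
    pad_along_w = max(0, (out_w - 1) * d_w + k_w - in_w)
    return out_h, out_w, pad_along_h // 2, pad_along_w // 2   # output sizes, padTop, padLeft

print(same_pool_padding(7, 7, 2, 2, 2, 2))   # (4, 4, 0, 0)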
@@ -1610,9 +1641,10 @@ def __init__(self, d_w=-1, bigdl_type="float"): super(TemporalMaxPooling, self).__init__(None, bigdl_type, k_w, - d_w) -class SpatialMaxPooling(Layer): + d_w) + +class SpatialMaxPooling(Layer): ''' Applies 2D max-pooling operation in kWxkH regions by step size dWxdH steps. The number of output features is equal to the number of input planes. @@ -1622,16 +1654,15 @@ class SpatialMaxPooling(Layer): oheight = op((height + 2*padH - kH) / dH + 1) op is a rounding operator. By default, it is floor. It can be changed by calling :ceil() or :floor() methods. - + When padW and padH are both -1, we use a padding algorithm similar to the "SAME" padding of tensorflow. That is - + outHeight = Math.ceil(inHeight.toFloat/strideH.toFloat) outWidth = Math.ceil(inWidth.toFloat/strideW.toFloat) - + padAlongHeight = Math.max(0, (outHeight - 1) * strideH + kernelH - inHeight) padAlongWidth = Math.max(0, (outWidth - 1) * strideW + kernelW - inWidth) - padTop = padAlongHeight / 2 padLeft = padAlongWidth / 2 @@ -1648,6 +1679,7 @@ class SpatialMaxPooling(Layer): >>> spatialMaxPooling = SpatialMaxPooling(2, 2, 2, 2, -1, -1, True, "NHWC") creating: createSpatialMaxPooling ''' + # to_ceil: call floor() when False; call ceil() when True def __init__(self, kw, @@ -1670,7 +1702,6 @@ def __init__(self, kw, class Select(Layer): - ''' A Simple layer selecting an index of the input tensor in the given dimension @@ -1686,6 +1717,7 @@ class Select(Layer): def __init__(self, dim, index, bigdl_type="float"): super(Select, self).__init__(None, bigdl_type, dim, index) + class Recurrent(Container): ''' Recurrent module is a container of rnn cells @@ -1702,12 +1734,13 @@ def __init__(self, bigdl_type="float"): def get_hidden_state(self): """ get hidden state and cell at last time step. - + :return: list of hidden state and cell """ states = callBigDlFunc(self.bigdl_type, "getHiddenState", self.value) return states + class RecurrentDecoder(Recurrent): ''' RecurrentDecoder module is a container of rnn cells which used to make @@ -1726,6 +1759,7 @@ class RecurrentDecoder(Recurrent): def __init__(self, output_length, bigdl_type="float"): super(Recurrent, self).__init__(None, bigdl_type, output_length) + class LSTM(Layer): ''' | Long Short Term Memory architecture. @@ -1739,17 +1773,24 @@ class LSTM(Layer): :param inputSize: the size of each input vector :param hiddenSize: Hidden unit size in the LSTM - :param p: is used for [[Dropout]] probability. For more details aboutRNN dropouts, please refer to[RnnDrop: A Novel Dropout for RNNs in ASR](http://www.stat.berkeley.edu/~tsmoon/files/Conference/asru2015.pdf)[A Theoretically Grounded Application of Dropout in Recurrent Neural Networks](https://arxiv.org/pdf/1512.05287.pdf) + :param p: is used for [[Dropout]] probability. For more details aboutRNN dropouts, please refer + to[RnnDrop: A Novel Dropout for RNNs in ASR](http://www.stat.berkeley.edu/~tsmoon/ + files/Conference/asru2015.pdf)[A Theoretically Grounded Application of Dropout in + Recurrent Neural Networks](https://arxiv.org/pdf/1512.05287.pdf) :param activation: activation function, by default to be Tanh if not specified. It can also be the name of an existing activation as a string. - :param inner_activation: activation function for the inner cells, by default to be Sigmoid if not specified. + :param inner_activation: activation function for the inner cells, by default to be Sigmoid if + not specified. It can also be the name of an existing activation as a string. 
- :param wRegularizer: instance of [[Regularizer]](eg. L1 or L2 regularization), applied to the input weights matrices. - :param uRegularizer: instance [[Regularizer]](eg. L1 or L2 regularization), applied to the recurrent weights matrices. + :param wRegularizer: instance of [[Regularizer]](eg. L1 or L2 regularization), applied to the + input weights matrices. + :param uRegularizer: instance [[Regularizer]](eg. L1 or L2 regularization), applied to the + recurrent weights matrices. :param bRegularizer: instance of [[Regularizer]]applied to the bias. - >>> lstm = LSTM(4, 3, 0.5, 'tanh', Sigmoid(), L1Regularizer(0.5), L1Regularizer(0.5), L1Regularizer(0.5)) + >>> lstm = LSTM(4, 3, 0.5, 'tanh', Sigmoid(), L1Regularizer(0.5), L1Regularizer(0.5), + ... L1Regularizer(0.5)) creating: createSigmoid creating: createL1Regularizer creating: createL1Regularizer @@ -1769,7 +1810,8 @@ def __init__(self, input_size, hidden_size, p=0.0, activation=None, inner_activa if isinstance(inner_activation, six.string_types): inner_activation = get_activation_by_name(inner_activation) super(LSTM, self).__init__(None, bigdl_type, input_size, hidden_size, p, - activation, inner_activation, wRegularizer, uRegularizer, bRegularizer) + activation, inner_activation, wRegularizer, uRegularizer, + bRegularizer) class LSTMPeephole(Layer): @@ -1784,9 +1826,14 @@ class LSTMPeephole(Layer): :param input_size: the size of each input vector :param hidden_size: Hidden unit size in the LSTM - :param p: is used for [[Dropout]] probability. For more details aboutRNN dropouts, please refer to[RnnDrop: A Novel Dropout for RNNs in ASR](http://www.stat.berkeley.edu/~tsmoon/files/Conference/asru2015.pdf)[A Theoretically Grounded Application of Dropout in Recurrent Neural Networks](https://arxiv.org/pdf/1512.05287.pdf) - :param wRegularizer: instance of [[Regularizer]](eg. L1 or L2 regularization), applied to the input weights matrices. - :param uRegularizer: instance [[Regularizer]](eg. L1 or L2 regularization), applied to the recurrent weights matrices. + :param p: is used for [[Dropout]] probability. For more details aboutRNN dropouts, please refer + to[RnnDrop: A Novel Dropout for RNNs in ASR](http://www.stat.berkeley.edu/~tsmoon/ + files/Conference/asru2015.pdf)[A Theoretically Grounded Application of Dropout in + Recurrent Neural Networks](https://arxiv.org/pdf/1512.05287.pdf) + :param wRegularizer: instance of [[Regularizer]](eg. L1 or L2 regularization), applied to the + input weights matrices. + :param uRegularizer: instance [[Regularizer]](eg. L1 or L2 regularization), applied to the + recurrent weights matrices. :param bRegularizer: instance of [[Regularizer]]applied to the bias. >>> lstm = LSTMPeephole(4, 3, 0.5, L1Regularizer(0.5), L1Regularizer(0.5), L1Regularizer(0.5)) @@ -1796,8 +1843,10 @@ class LSTMPeephole(Layer): creating: createLSTMPeephole ''' - def __init__(self, input_size=4, hidden_size=3, p=0.0, wRegularizer=None, uRegularizer=None, bRegularizer=None, bigdl_type="float"): - super(LSTMPeephole, self).__init__(None, bigdl_type, input_size, hidden_size, p, wRegularizer, uRegularizer, bRegularizer) + def __init__(self, input_size=4, hidden_size=3, p=0.0, wRegularizer=None, uRegularizer=None, + bRegularizer=None, bigdl_type="float"): + super(LSTMPeephole, self).__init__(None, bigdl_type, input_size, hidden_size, p, + wRegularizer, uRegularizer, bRegularizer) class Gemm(Layer): @@ -1812,24 +1861,32 @@ class GRU(Layer): | Ref. 
-| http://www.wildml.com/2015/10/recurrent-neural-network-tutorial-part-4-implementing-a-grulstm-rnn-with-python-and-theano/ +| http://www.wildml.com/2015/10/recurrent-neural-network-tutorial-part-4-implementing-a-grulstm- +| rnn-with-python-and-theano/ | https://github.com/Element-Research/rnn/blob/master/GRU.lua :param input_size: the size of each input vector :param hidden_size: Hidden unit size in GRU - :param p: is used for [[Dropout]] probability. For more details aboutRNN dropouts, please refer to[RnnDrop: A Novel Dropout for RNNs in ASR](http://www.stat.berkeley.edu/~tsmoon/files/Conference/asru2015.pdf)[A Theoretically Grounded Application of Dropout in Recurrent Neural Networks](https://arxiv.org/pdf/1512.05287.pdf) + :param p: is used for [[Dropout]] probability. For more details aboutRNN dropouts, please refer + to[RnnDrop: A Novel Dropout for RNNs in ASR](http://www.stat.berkeley.edu/~tsmoon/ + files/Conference/asru2015.pdf)[A Theoretically Grounded Application of Dropout in + Recurrent Neural Networks](https://arxiv.org/pdf/1512.05287.pdf) :param activation: activation function, by default to be Tanh if not specified. It can also be the name of an existing activation as a string. - :param inner_activation: activation function for the inner cells, by default to be Sigmoid if not specified. - It can also be the name of an existing activation as a string. - :param wRegularizer: instance of [[Regularizer]](eg. L1 or L2 regularization), applied to the input weights matrices. - :param uRegularizer: instance [[Regularizer]](eg. L1 or L2 regularization), applied to the recurrent weights matrices. + :param inner_activation: activation function for the inner cells, by default to be Sigmoid if + not specified. + It can also be the name of an existing activation as a string. + :param wRegularizer: instance of [[Regularizer]](eg. L1 or L2 regularization), applied to the + input weights matrices. + :param uRegularizer: instance [[Regularizer]](eg. L1 or L2 regularization), applied to the + recurrent weights matrices. :param bRegularizer: instance of [[Regularizer]]applied to the bias. - >>> gru = GRU(4, 3, 0.5, Tanh(), Sigmoid(), L1Regularizer(0.5), L1Regularizer(0.5), L1Regularizer(0.5)) + >>> gru = GRU(4, 3, 0.5, Tanh(), Sigmoid(), L1Regularizer(0.5), L1Regularizer(0.5), + ... L1Regularizer(0.5)) creating: createTanh creating: createSigmoid creating: createL1Regularizer @@ -1838,7 +1895,7 @@ class GRU(Layer): creating: createGRU ''' - def __init__(self, input_size, hidden_size, p=0.0, activation=None, inner_activation=None, + def __init__(self, input_size, hidden_size, p=0.0, activation=None, inner_activation=None, wRegularizer=None, uRegularizer=None, bRegularizer=None, bigdl_type="float"): if not activation: activation = Tanh() @@ -1848,7 +1905,8 @@ def __init__(self, input_size, hidden_size, p=0.0, activation=None, inner_activ activation = get_activation_by_name(activation) if isinstance(inner_activation, six.string_types): inner_activation = get_activation_by_name(inner_activation) - super(GRU, self).__init__(None, bigdl_type, input_size, hidden_size, p, activation, inner_activation, + super(GRU, self).__init__(None, bigdl_type, input_size, hidden_size, p, activation, + inner_activation, wRegularizer, uRegularizer, bRegularizer) @@ -1859,15 +1917,19 @@ class RnnCell(Layer): :param input_size: the size of each input vector :param hidden_size: Hidden unit size in simple RNN - :param activation: activation function. It can also be the name of an existing activation as a string. 
+ :param activation: activation function. It can also be the name of an existing activation as a + string. :param isInputWithBias: boolean :param isHiddenWithBias: boolean - :param wRegularizer: instance of [[Regularizer]](eg. L1 or L2 regularization), applied to the input weights matrices. - :param uRegularizer: instance [[Regularizer]](eg. L1 or L2 regularization), applied to the recurrent weights matrices. + :param wRegularizer: instance of [[Regularizer]](eg. L1 or L2 regularization), applied to the + input weights matrices. + :param uRegularizer: instance [[Regularizer]](eg. L1 or L2 regularization), applied to the + recurrent weights matrices. :param bRegularizer: instance of [[Regularizer]](../regularizers.md),applied to the bias. - >>> rnn = RnnCell(4, 3, Tanh(), True, True, L1Regularizer(0.5), L1Regularizer(0.5), L1Regularizer(0.5)) + >>> rnn = RnnCell(4, 3, Tanh(), True, True, L1Regularizer(0.5), L1Regularizer(0.5), + ... L1Regularizer(0.5)) creating: createTanh creating: createL1Regularizer creating: createL1Regularizer @@ -1887,7 +1949,9 @@ def __init__(self, bigdl_type="float"): if isinstance(activation, six.string_types): activation = get_activation_by_name(activation) - super(RnnCell, self).__init__(None, bigdl_type, input_size, hidden_size, activation, isInputWithBias, isHiddenWithBias, wRegularizer, uRegularizer, bRegularizer) + super(RnnCell, self).__init__(None, bigdl_type, input_size, hidden_size, activation, + isInputWithBias, isHiddenWithBias, wRegularizer, uRegularizer, + bRegularizer) class TimeDistributed(Layer): @@ -1898,7 +1962,7 @@ class TimeDistributed(Layer): For instance, The TimeDistributed Layer can feed each time slice of input tensor to the Linear layer. - + The input data format is [Batch, Time, Other dims]. For the contained layer, it must not change the Other dims length. @@ -1913,7 +1977,6 @@ def __init__(self, model, bigdl_type="float"): class Concat(Container): - ''' Concat concatenates the output of one layer of "parallel" modules along the provided {@code dimension}: they take the @@ -1943,20 +2006,19 @@ def __init__(self, class SpatialAveragePooling(Layer): - ''' Applies 2D average-pooling operation in kWxkH regions by step size dWxdH steps. The number of output features is equal to the number of input planes. - + When padW and padH are both -1, we use a padding algorithm similar to the "SAME" padding of tensorflow. 
That is - + outHeight = Math.ceil(inHeight.toFloat/strideH.toFloat) outWidth = Math.ceil(inWidth.toFloat/strideW.toFloat) - + padAlongHeight = Math.max(0, (outHeight - 1) * strideH + kernelH - inHeight) padAlongWidth = Math.max(0, (outWidth - 1) * strideW + kernelW - inWidth) - + padTop = padAlongHeight / 2 padLeft = padAlongWidth / 2 @@ -1969,7 +2031,8 @@ class SpatialAveragePooling(Layer): :param global_pooling: If globalPooling then it will pool over the size of the input by doing kH = input->height and kW = input->width :param ceilMode: whether the output size is to be ceiled or floored - :param countIncludePad: whether to include padding when dividing thenumber of elements in pooling region + :param countIncludePad: whether to include padding when dividing thenumber of elements in + pooling region :param divide: whether to do the averaging :param format: "NCHW" or "NHWC", indicating the input data format @@ -2011,7 +2074,6 @@ def set_weights(self, weights): class SpatialBatchNormalization(Layer): - ''' This file implements Batch Normalization as described in the paper: "Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift" @@ -2028,13 +2090,15 @@ class SpatialBatchNormalization(Layer): where gamma and beta are learnable parameters. The learning of gamma and beta is optional. - + :param n_output: output feature map number :param eps: avoid divide zero :param momentum: momentum for weight update :param affine: affine operation on output or not - :param data_format a string value (or DataFormat Object in Scala) of "NHWC" or "NCHW" to specify the input data format of this layer. In "NHWC" format - data is stored in the order of [batch_size, height, width, channels], in "NCHW" format data is stored + :param data_format a string value (or DataFormat Object in Scala) of "NHWC" or "NCHW" to specify + the input data format of this layer. In "NHWC" format + data is stored in the order of [batch_size, height, width, channels], + in "NCHW" format data is stored in the order of [batch_size, channels, height, width]. @@ -2045,9 +2109,11 @@ class SpatialBatchNormalization(Layer): >>> init_grad_weight = np.array([0.0]) >>> init_bias = np.array([0.0]) >>> init_grad_bias = np.array([0.0]) - >>> spatialBatchNormalization = SpatialBatchNormalization(1, 1e-5, 0.1, True, init_weight, init_bias, init_grad_weight, init_grad_bias) + >>> spatialBatchNormalization = SpatialBatchNormalization(1, 1e-5, 0.1, True, init_weight, + ... init_bias, init_grad_weight, init_grad_bias) creating: createSpatialBatchNormalization - >>> spatialBatchNormalization = SpatialBatchNormalization(1, 1e-5, 0.1, True, init_weight, init_bias, init_grad_weight, init_grad_bias, "NHWC") + >>> spatialBatchNormalization = SpatialBatchNormalization(1, 1e-5, 0.1, True, init_weight, + ... init_bias, init_grad_weight, init_grad_bias, "NHWC") creating: createSpatialBatchNormalization ''' @@ -2073,14 +2139,13 @@ def __init__(self, JTensor.from_ndarray(init_grad_bias), data_format) - def set_init_method(self, weight_init_method = None, bias_init_method = None): + def set_init_method(self, weight_init_method=None, bias_init_method=None): callBigDlFunc(self.bigdl_type, "setInitMethod", self.value, weight_init_method, bias_init_method) return self class SpatialCrossMapLRN(Layer): - ''' Applies Spatial Local Response Normalization between different feature maps. 
The operation implemented is: @@ -2098,8 +2163,10 @@ class SpatialCrossMapLRN(Layer): :param alpha: the scaling parameter :param beta: the exponent :param k: a constant - :param data_format a string value (or DataFormat Object in Scala) of "NHWC" or "NCHW" to specify the input data format of this layer. In "NHWC" format - data is stored in the order of [batch_size, height, width, channels], in "NCHW" format data is stored + :param data_format a string value (or DataFormat Object in Scala) of "NHWC" or "NCHW" to specify + the input data format of this layer. In "NHWC" format + data is stored in the order of [batch_size, height, width, channels], in + "NCHW" format data is stored in the order of [batch_size, channels, height, width] @@ -2121,6 +2188,8 @@ def __init__(self, alpha, beta, k, data_format) + + class SpatialDropout3D(Layer): ''' This version performs the same function as Dropout, however it drops @@ -2139,6 +2208,7 @@ class SpatialDropout3D(Layer): >>> dropout = SpatialDropout3D(0.5, "NHWC") creating: createSpatialDropout3D ''' + def __init__(self, init_p=0.5, data_format="NCHW", @@ -2146,6 +2216,7 @@ def __init__(self, super(SpatialDropout3D, self).__init__(None, bigdl_type, init_p, data_format) + class SpatialDropout2D(Layer): ''' This version performs the same function as Dropout, however it drops @@ -2164,6 +2235,7 @@ class SpatialDropout2D(Layer): >>> dropout = SpatialDropout2D(0.4, "NHWC") creating: createSpatialDropout2D ''' + def __init__(self, init_p=0.5, data_format="NCHW", @@ -2171,6 +2243,7 @@ def __init__(self, super(SpatialDropout2D, self).__init__(None, bigdl_type, init_p, data_format) + class SpatialDropout1D(Layer): ''' This version performs the same function as Dropout, however it drops @@ -2186,14 +2259,15 @@ class SpatialDropout1D(Layer): >>> dropout = SpatialDropout1D(0.4) creating: createSpatialDropout1D ''' + def __init__(self, init_p=0.5, bigdl_type="float"): super(SpatialDropout1D, self).__init__(None, bigdl_type, init_p) -class Dropout(Layer): +class Dropout(Layer): ''' Dropout masks(set to zero) parts of input using a bernoulli distribution. Each input element has a probability initP of being dropped. If scale is @@ -2222,7 +2296,6 @@ def __init__(self, class GaussianDropout(Layer): - ''' Apply multiplicative 1-centered Gaussian noise. The multiplicative noise will have standard deviation `sqrt(rate / (1 - rate)). @@ -2244,7 +2317,6 @@ def __init__(self, class GaussianNoise(Layer): - ''' Apply additive zero-centered Gaussian noise. This is useful to mitigate overfitting @@ -2258,14 +2330,15 @@ class GaussianNoise(Layer): >>> GaussianNoise = GaussianNoise(0.5) creating: createGaussianNoise ''' + def __init__(self, stddev, bigdl_type="float"): super(GaussianNoise, self).__init__(None, bigdl_type, stddev) -class View(Layer): +class View(Layer): ''' This module creates a new view of the input tensor using the sizes passed to the constructor. 
The method setNumInputDims() allows to specify the expected number of dimensions of the @@ -2290,7 +2363,6 @@ def __init__(self, class Abs(Layer): - ''' an element-wise abs operation @@ -2305,7 +2377,6 @@ def __init__(self, class Add(Layer): - ''' adds a bias term to input data ; @@ -2320,14 +2391,14 @@ def __init__(self, bigdl_type="float"): super(Add, self).__init__(None, bigdl_type, input_size) - def set_init_method(self, weight_init_method = None, bias_init_method = None): + + def set_init_method(self, weight_init_method=None, bias_init_method=None): callBigDlFunc(self.bigdl_type, "setInitMethod", self.value, weight_init_method, bias_init_method) return self class AddConstant(Layer): - ''' adding a constant @@ -2348,8 +2419,8 @@ def __init__(self, constant_scalar, inplace) -class BatchNormalization(Layer): +class BatchNormalization(Layer): ''' This layer implements Batch Normalization as described in the paper: "Batch Normalization: Accelerating Deep Network Training by Reducing Internal @@ -2383,9 +2454,11 @@ class BatchNormalization(Layer): >>> init_grad_weight = np.zeros([2]) >>> init_bias = np.zeros([2]) >>> init_grad_bias = np.zeros([2]) - >>> batchNormalization = BatchNormalization(2, 1e-5, 1e-5, True, init_weight, init_bias, init_grad_weight, init_grad_bias) + >>> batchNormalization = BatchNormalization(2, 1e-5, 1e-5, True, init_weight, init_bias, + ... init_grad_weight, init_grad_bias) creating: createBatchNormalization ''' + def __init__(self, n_output, eps=1e-5, @@ -2406,7 +2479,7 @@ def __init__(self, JTensor.from_ndarray(init_grad_weight), JTensor.from_ndarray(init_grad_bias)) - def set_init_method(self, weight_init_method = None, bias_init_method = None): + def set_init_method(self, weight_init_method=None, bias_init_method=None): callBigDlFunc(self.bigdl_type, "setInitMethod", self.value, weight_init_method, bias_init_method) return self @@ -2431,11 +2504,10 @@ def __init__(self, dimension, bigdl_type="float"): super(BifurcateSplitTable, self).__init__(None, bigdl_type, - dimension) + dimension) class Bilinear(Layer): - ''' a bilinear transformation with sparse inputs, The input tensor given in forward(input) is a table containing both inputs x_1 and x_2, @@ -2445,7 +2517,8 @@ class Bilinear(Layer): :param input_size2 input dimension of x_2 :param output_size output dimension :param bias_res whether use bias - :param wRegularizer: instance of [[Regularizer]](eg. L1 or L2 regularization), applied to the input weights matrices. + :param wRegularizer: instance of [[Regularizer]](eg. L1 or L2 regularization), applied to the + input weights matrices. :param bRegularizer: instance of [[Regularizer]]applied to the bias. >>> bilinear = Bilinear(1, 1, 1, True, L1Regularizer(0.5)) @@ -2468,14 +2541,14 @@ def __init__(self, bias_res, wRegularizer, bRegularizer) - def set_init_method(self, weight_init_method = None, bias_init_method = None): + + def set_init_method(self, weight_init_method=None, bias_init_method=None): callBigDlFunc(self.bigdl_type, "setInitMethod", self.value, weight_init_method, bias_init_method) return self class Bottle(Container): - ''' Bottle allows varying dimensionality input to be forwarded through any module that accepts input of nInputDim dimensions, and generates output of nOutputDim dimensions. @@ -2502,7 +2575,6 @@ def __init__(self, class CAdd(Layer): - ''' This layer has a bias tensor with given size. The bias will be added element wise to the input tensor. 
If the element number of the bias tensor match the input tensor, a simply element wise @@ -2526,14 +2598,13 @@ def __init__(self, super(CAdd, self).__init__(None, bigdl_type, size, bRegularizer) - def set_init_method(self, weight_init_method = None, bias_init_method = None): + def set_init_method(self, weight_init_method=None, bias_init_method=None): callBigDlFunc(self.bigdl_type, "setInitMethod", self.value, weight_init_method, bias_init_method) return self class CAddTable(Layer): - ''' Merge the input tensors in the input table by element wise adding them together. The input table is actually an array of tensor with same size. @@ -2552,8 +2623,8 @@ def __init__(self, super(CAddTable, self).__init__(None, bigdl_type, inplace) -class CAveTable(Layer): +class CAveTable(Layer): ''' Merge the input tensors in the input table by element wise taking the average. The input table is actually an array of tensor with same size. @@ -2574,7 +2645,6 @@ def __init__(self, class CDivTable(Layer): - ''' Takes a table with two Tensor and returns the component-wise division between them. @@ -2589,7 +2659,6 @@ def __init__(self, class CMaxTable(Layer): - ''' Takes a table of Tensors and outputs the max of all of them. @@ -2604,7 +2673,6 @@ def __init__(self, class CMinTable(Layer): - ''' Takes a table of Tensors and outputs the min of all of them. @@ -2618,7 +2686,6 @@ def __init__(self, class CMul(Layer): - ''' Applies a component-wise multiplication to the incoming data @@ -2637,14 +2704,13 @@ def __init__(self, super(CMul, self).__init__(None, bigdl_type, size, wRegularizer) - def set_init_method(self, weight_init_method = None, bias_init_method = None): + def set_init_method(self, weight_init_method=None, bias_init_method=None): callBigDlFunc(self.bigdl_type, "setInitMethod", self.value, weight_init_method, bias_init_method) return self class CMulTable(Layer): - ''' Takes a table of Tensors and outputs the multiplication of all of them. @@ -2659,7 +2725,6 @@ def __init__(self, class CSubTable(Layer): - ''' Takes a table with two Tensor and returns the component-wise subtraction between them. @@ -2674,7 +2739,6 @@ def __init__(self, class Clamp(Layer): - ''' Clamps all elements into the range [min_value, max_value]. Output is identical to input in the range, @@ -2700,7 +2764,6 @@ def __init__(self, class Contiguous(Layer): - ''' used to make input, grad_output both contiguous @@ -2715,7 +2778,6 @@ def __init__(self, class Cosine(Layer): - ''' Cosine calculates the cosine similarity of the input to k mean centers. The input given in forward(input) must be either a vector (1D tensor) or matrix (2D tensor). If the input is a @@ -2739,14 +2801,14 @@ def __init__(self, super(Cosine, self).__init__(None, bigdl_type, input_size, output_size) - def set_init_method(self, weight_init_method = None, bias_init_method = None): + + def set_init_method(self, weight_init_method=None, bias_init_method=None): callBigDlFunc(self.bigdl_type, "setInitMethod", self.value, weight_init_method, bias_init_method) return self class CosineDistance(Layer): - ''' Outputs the cosine distance between inputs @@ -2761,7 +2823,6 @@ def __init__(self, class CrossProduct(Layer): - """ A layer which takes a table of multiple tensors(n >= 2) as input and calculate to dot product for `all combinations of pairs` among input tensors. 
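The table-arithmetic layers above (CAddTable, CAveTable, CMulTable, CSubTable, and friends) all combine a table of same-sized tensors element-wise. A plain NumPy sketch of the CAddTable forward pass only, not the BigDL implementation:

    import numpy as np

    def cadd_table(tensors):
        # Element-wise sum over a "table" (Python list) of equally shaped arrays.
        out = np.zeros_like(tensors[0])
        for t in tensors:
            out = out + t
        return out

    x = np.ones((2, 3))
    y = np.full((2, 3), 2.0)
    print(cadd_table([x, y]))  # every element equals 3.0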
@@ -2802,6 +2863,7 @@ class UpSampling2D(Layer): >>> upsampled2d = UpSampling2D([2, 3]) creating: createUpSampling2D """ + def __init__(self, size, data_format="nchw", bigdl_type="float"): super(UpSampling2D, self).__init__(None, bigdl_type, size, data_format) @@ -2818,11 +2880,12 @@ class UpSampling1D(Layer): >>> upsampled1d = UpSampling1D(2) creating: createUpSampling1D """ + def __init__(self, length, bigdl_type="float"): super(UpSampling1D, self).__init__(None, bigdl_type, length) -class Input(Node): +class Input(Node): ''' Input layer do nothing to the input tensors, just passing them through. It is used as input to the Graph container (add a link) when the first layer of the graph container accepts multiple @@ -2844,7 +2907,6 @@ def __init__(self, class DotProduct(Layer): - ''' This is a simple table layer which takes a table of two tensors as input and calculate the dot product between them as outputs @@ -2860,7 +2922,6 @@ def __init__(self, class ELU(Layer): - ''' D-A Clevert, Thomas Unterthiner, Sepp Hochreiter Fast and Accurate Deep Network Learning by Exponential Linear Units (ELUs) @@ -2881,7 +2942,6 @@ def __init__(self, class Euclidean(Layer): - ''' Outputs the Euclidean distance of the input to outputSize centers @@ -2894,8 +2954,6 @@ class Euclidean(Layer): creating: createEuclidean ''' - - def __init__(self, input_size, output_size, @@ -2906,14 +2964,13 @@ def __init__(self, output_size, fast_backward) - def set_init_method(self, weight_init_method = None, bias_init_method = None): + def set_init_method(self, weight_init_method=None, bias_init_method=None): callBigDlFunc(self.bigdl_type, "setInitMethod", self.value, weight_init_method, bias_init_method) return self class Exp(Layer): - ''' Applies element-wise exp to input tensor. @@ -2927,7 +2984,6 @@ def __init__(self, class FlattenTable(Layer): - ''' This is a table layer which takes an arbitrarily deep table of Tensors (potentially nested) as input and a table of Tensors without any nested @@ -2944,7 +3000,6 @@ def __init__(self, class GradientReversal(Layer): - ''' It is a simple module preserves the input, but takes the gradient from the subsequent layer, multiplies it by -lambda @@ -2971,7 +3026,6 @@ def __init__(self, class HardShrink(Layer): - ''' This is a transfer layer which applies the hard shrinkage function element-wise to the input Tensor. The parameter lambda is set to 0.5 @@ -2997,7 +3051,6 @@ def __init__(self, class HardTanh(Layer): - ''' Applies HardTanh to each element of input, HardTanh is defined: ``` @@ -3028,7 +3081,6 @@ def __init__(self, class Index(Layer): - ''' Applies the Tensor index operation along the given dimension. @@ -3048,7 +3100,6 @@ def __init__(self, class InferReshape(Layer): - ''' Reshape the input tensor with automatic size inference support. Positive numbers in the `size` argument are used to reshape the input to the @@ -3087,7 +3138,6 @@ def __init__(self, class JoinTable(Layer): - ''' It is a table module which takes a table of Tensors as input and outputs a Tensor by joining them together along the dimension `dimension`. 
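JoinTable, described just above, is essentially concatenation along one dimension. A NumPy sketch of the forward behaviour, assuming the 1-based (Torch-style) dimension convention used throughout this file and ignoring the nInputDims batch handling:

    import numpy as np

    def join_table(tensors, dimension):
        # BigDL dimensions are 1-based, so dimension=2 maps to NumPy axis=1.
        return np.concatenate(tensors, axis=dimension - 1)

    a = np.zeros((2, 3))
    b = np.ones((2, 2))
    print(join_table([a, b], dimension=2).shape)  # (2, 5)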
@@ -3100,7 +3150,9 @@ class JoinTable(Layer): :param dimension: to be join in this dimension - :param nInputDims: specify the number of dimensions that this module will receiveIf it is more than the dimension of input tensors, the first dimensionwould be considered as batch size + :param nInputDims: specify the number of dimensions that this module will receiveIf it is more + than the dimension of input tensors, the first dimensionwould be considered + as batch size >>> joinTable = JoinTable(1, 1) @@ -3115,8 +3167,8 @@ def __init__(self, dimension, n_input_dims) -class SparseJoinTable(Layer): +class SparseJoinTable(Layer): ''' :: Experimental :: @@ -3135,11 +3187,10 @@ def __init__(self, dimension, bigdl_type="float"): super(SparseJoinTable, self).__init__(None, bigdl_type, - dimension) + dimension) class L1Penalty(Layer): - ''' adds an L1 penalty to an input (for sparsity). L1Penalty is an inline module that in its forward propagation copies the input Tensor @@ -3166,12 +3217,13 @@ def __init__(self, size_average, provide_output) + class NegativeEntropyPenalty(Layer): ''' Penalize the input multinomial distribution if it has low entropy. The input to this layer should be a batch of vector each representing a multinomial distribution. The input is typically the output of a softmax layer. - + For forward, the output is the same as input and a NegativeEntropy loss of the latent state will be calculated each time. For backward, gradInput = gradOutput + gradLoss @@ -3179,10 +3231,10 @@ class NegativeEntropyPenalty(Layer): This can be used in reinforcement learning to discourage the policy from collapsing to a single action for a given state, which improves exploration. See the A3C paper for more detail (https://arxiv.org/pdf/1602.01783.pdf). - + >>> ne = NegativeEntropyPenalty(0.01) creating: createNegativeEntropyPenalty - + :param beta penalty coefficient ''' @@ -3193,7 +3245,6 @@ def __init__(self, beta=0.01, bigdl_type="float"): class LeakyReLU(Layer): - ''' It is a transfer module that applies LeakyReLU, which parameter negval sets the slope of the negative part: LeakyReLU is defined as: f(x) = max(0, x) + negval * min(0, x) @@ -3217,7 +3268,6 @@ def __init__(self, class Log(Layer): - ''' Applies the log function element-wise to the input Tensor, thus outputting a Tensor of the same dimension. @@ -3233,7 +3283,6 @@ def __init__(self, class LogSigmoid(Layer): - ''' This class is a transform layer corresponding to the sigmoid function: f(x) = Log(1 / (1 + e ^^ (-x))) @@ -3249,11 +3298,11 @@ def __init__(self, class LookupTable(Layer): - ''' a convolution of width 1, commonly used for word embeddings - :param wRegularizer: instance of [[Regularizer]](eg. L1 or L2 regularization), applied to the input weights matrices. + :param wRegularizer: instance of [[Regularizer]](eg. L1 or L2 regularization), applied to the + input weights matrices. >>> lookupTable = LookupTable(1, 1, 1e-5, 1e-5, 1e-5, True, L1Regularizer(0.5)) creating: createL1Regularizer @@ -3277,14 +3326,14 @@ def __init__(self, norm_type, should_scale_grad_by_freq, wRegularizer) - def set_init_method(self, weight_init_method = None, bias_init_method = None): + + def set_init_method(self, weight_init_method=None, bias_init_method=None): callBigDlFunc(self.bigdl_type, "setInitMethod", self.value, weight_init_method, bias_init_method) return self class LookupTableSparse(Layer): - ''' LookupTable for multi-values. Also called embedding_lookup_sparse in TensorFlow. 
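LookupTable and LookupTableSparse above are embedding lookups: each integer id selects a row of a weight matrix. A rough NumPy picture of the dense forward pass, reusing the 10x4 weight layout from the LookupTableSparse doctest; the 1-based indexing is an assumption carried over from the Torch convention, not something the patch states:

    import numpy as np

    n_index, n_output = 10, 4
    weight = np.arange(1.0, 41.0).reshape(n_index, n_output)  # embedding table

    def lookup_forward(indices):
        # ids assumed 1-based (Torch convention), hence the -1
        return weight[np.asarray(indices, dtype=int) - 1]

    print(lookup_forward([1, 3]))  # rows 1 and 3 of the table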
@@ -3296,7 +3345,8 @@ class LookupTableSparse(Layer): like the SparseTensor input. And the second tensor is the corresponding weights of the integer ids. - :param wRegularizer: instance of [[Regularizer]](eg. L1 or L2 regularization), applied to the input weights matrices. + :param wRegularizer: instance of [[Regularizer]](eg. L1 or L2 regularization), applied to the + input weights matrices. >>> lookupTableSparse = LookupTableSparse(20, 5, "mean", 2, L1Regularizer(0.5)) creating: createL1Regularizer @@ -3310,7 +3360,8 @@ class LookupTableSparse(Layer): creating: createLookupTableSparse >>> layer1.set_weights(np.arange(1, 41, 1).reshape(10, 4)) # set weight to 1 to 40 >>> output = layer1.forward([input, weight]) - >>> expected_output = np.array([[6.5999999 , 7.60000038, 8.60000038, 9.60000038],[ 1., 2., 3., 4.], [5., 6., 7., 8.]]) + >>> expected_output = np.array([[6.5999999 , 7.60000038, 8.60000038, 9.60000038], + ... [ 1., 2., 3., 4.], [5., 6., 7., 8.]]) >>> np.testing.assert_allclose(output, expected_output, rtol=1e-6, atol=1e-6) ''' @@ -3322,18 +3373,19 @@ def __init__(self, wRegularizer=None, bigdl_type="float"): super(LookupTableSparse, self).__init__(None, bigdl_type, - n_index, - n_output, - combiner, - max_norm + 0.0, - wRegularizer) - def set_init_method(self, weight_init_method = None, bias_init_method = None): + n_index, + n_output, + combiner, + max_norm + 0.0, + wRegularizer) + + def set_init_method(self, weight_init_method=None, bias_init_method=None): callBigDlFunc(self.bigdl_type, "setInitMethod", self.value, weight_init_method, bias_init_method) return self -class MM(Layer): +class MM(Layer): ''' Module to perform matrix multiplication on two mini-batch inputs, producing a mini-batch. @@ -3345,6 +3397,7 @@ class MM(Layer): >>> mM = MM(True, True) creating: createMM ''' + def __init__(self, trans_a=False, trans_b=False, @@ -3355,7 +3408,6 @@ def __init__(self, class MV(Layer): - ''' It is a module to perform matrix vector multiplication on two mini-batch inputs, producing a mini-batch. @@ -3376,7 +3428,6 @@ def __init__(self, class MapTable(Container): - ''' This class is a container for a single module which will be applied to all input elements. The member module is cloned as necessary to @@ -3394,8 +3445,8 @@ def __init__(self, super(MapTable, self).__init__(None, bigdl_type, module) -class MaskedSelect(Layer): +class MaskedSelect(Layer): ''' Performs a torch.MaskedSelect on a Tensor. The mask is supplied as a tabular argument with the input on the forward and backward passes. @@ -3410,7 +3461,6 @@ def __init__(self, class Max(Layer): - ''' Applies a max operation over dimension `dim` @@ -3433,7 +3483,6 @@ def __init__(self, class Mean(Layer): - ''' It is a simple layer which applies a mean operation over the given dimension. When nInputDims is provided, the input will be considered as batches. 
Then the mean operation will be applied @@ -3444,8 +3493,10 @@ class Mean(Layer): :param dimension: the dimension to be applied mean operation - :param n_input_dims: specify the number of dimensions that this module will receiveIf it is more than the dimension of input tensors, the first dimension would be consideredas batch size - :param squeeze: default is true, which will squeeze the sum dimension; set it to false to keep the sum dimension + :param n_input_dims: specify the number of dimensions that this module will receiveIf it is more + than the dimension of input tensors, the first dimension would be consideredas batch size + :param squeeze: default is true, which will squeeze the sum dimension; set it to false to keep + the sum dimension >>> mean = Mean(1, 1, True) creating: createMean @@ -3463,7 +3514,6 @@ def __init__(self, class Min(Layer): - ''' Applies a min operation over dimension `dim`. @@ -3486,7 +3536,6 @@ def __init__(self, class MixtureTable(Layer): - ''' Creates a module that takes a table {gater, experts} as input and outputs the mixture of experts (a Tensor or table of Tensors) using a gater Tensor. When dim is provided, it specifies the @@ -3508,7 +3557,6 @@ def __init__(self, class Mul(Layer): - ''' Multiply a single scalar factor to the incoming data @@ -3521,14 +3569,13 @@ def __init__(self, bigdl_type="float"): super(Mul, self).__init__(None, bigdl_type) - def set_init_method(self, weight_init_method = None, bias_init_method = None): + def set_init_method(self, weight_init_method=None, bias_init_method=None): callBigDlFunc(self.bigdl_type, "setInitMethod", self.value, weight_init_method, bias_init_method) return self class MulConstant(Layer): - ''' Multiplies input Tensor by a (non-learnable) scalar constant. This module is sometimes useful for debugging purposes. @@ -3552,7 +3599,6 @@ def __init__(self, class Narrow(Layer): - ''' Narrow is application of narrow operation in a module. The module further supports a negative length in order to handle inputs with an unknown size. @@ -3573,7 +3619,6 @@ def __init__(self, class NarrowTable(Layer): - ''' Creates a module that takes a table as input and outputs the subtable starting at index offset having length elements (defaults to 1 element). The elements can be either @@ -3599,7 +3644,6 @@ def __init__(self, class Normalize(Layer): - ''' Normalizes the input Tensor to have unit L_p norm. The smoothing parameter eps prevents division by zero when the input contains all zero elements (default = 1e-10). @@ -3620,7 +3664,6 @@ def __init__(self, class PReLU(Layer): - ''' Applies parametric ReLU, which parameter varies the slope of the negative part. @@ -3648,14 +3691,13 @@ def __init__(self, super(PReLU, self).__init__(None, bigdl_type, n_output_plane) - def set_init_method(self, weight_init_method = None, bias_init_method = None): + def set_init_method(self, weight_init_method=None, bias_init_method=None): callBigDlFunc(self.bigdl_type, "setInitMethod", self.value, weight_init_method, bias_init_method) return self class Padding(Layer): - ''' This module adds pad units of padding to dimension dim of the input. If pad is negative, padding is added to the left, otherwise, it is added to the right of the dimension. 
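As a worked picture of the Mean layer documented in one of the hunks above (dimension, n_input_dims, squeeze), here is a NumPy sketch of the single-sample case; it ignores the n_input_dims batch logic and uses the 1-based dimension convention:

    import numpy as np

    def mean_layer(x, dimension=1, squeeze=True):
        # Mean over one 1-based dimension; squeeze=False keeps that dimension.
        return np.mean(x, axis=dimension - 1, keepdims=not squeeze)

    x = np.arange(6.0).reshape(2, 3)
    print(mean_layer(x, dimension=1).shape)                  # (3,)
    print(mean_layer(x, dimension=1, squeeze=False).shape)   # (1, 3)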
@@ -3669,7 +3711,8 @@ class Padding(Layer): :param dim: the dimension to be applied padding operation :param pad: num of the pad units - :param n_input_dim: specify the number of dimensions that this module will receiveIf it is more than the dimension of input tensors, the first dimensionwould be considered as batch size + :param n_input_dim: specify the number of dimensions that this module will receiveIf it is more + than the dimension of input tensors, the first dimensionwould be considered as batch size :param value: padding value @@ -3693,7 +3736,6 @@ def __init__(self, class PairwiseDistance(Layer): - ''' It is a module that takes a table of two vectors as input and outputs the distance between them using the p-norm. @@ -3718,7 +3760,6 @@ def __init__(self, class ParallelTable(Container): - ''' It is a container module that applies the i-th member module to the i-th input, and outputs an output in the form of Table @@ -3734,7 +3775,6 @@ def __init__(self, class Power(Layer): - ''' Apply an element-wise power operation with scale and shift. f(x) = (shift + scale * x)^power^ @@ -3760,7 +3800,6 @@ def __init__(self, class RReLU(Layer): - ''' Applies the randomized leaky rectified linear unit (RReLU) element-wise to the input Tensor, thus outputting a Tensor of the same dimension. Informally the RReLU is also known as @@ -3801,8 +3840,8 @@ class RReLU(Layer): ''' def __init__(self, - lower=1.0/8, - upper=1.0/3, + lower=1.0 / 8, + upper=1.0 / 3, inplace=False, bigdl_type="float"): super(RReLU, self).__init__(None, bigdl_type, @@ -3810,8 +3849,8 @@ def __init__(self, upper, inplace) -class SpatialSeparableConvolution(Layer): +class SpatialSeparableConvolution(Layer): ''' Separable convolutions consist in first performing a depthwise spatial convolution (which acts on each input channel separately) followed by a pointwise convolution which mixes together the @@ -3828,10 +3867,13 @@ class SpatialSeparableConvolution(Layer): :param pad_w The additional zeros added per width to the input planes. :param pad_h The additional zeros added per height to the input planes. :param with_bias: the optional initial value for if need bias - :param data_format: a string value of "NHWC" or "NCHW" to specify the input data format of this layer. In "NHWC" format - data is stored in the order of [batch_size, height, width, channels], in "NCHW" format data is stored + :param data_format: a string value of "NHWC" or "NCHW" to specify the input data format of this + layer. In "NHWC" format + data is stored in the order of [batch_size, height, width, channels], in + "NCHW" format data is stored in the order of [batch_size, channels, height, width]. - :param w_regularizer: instance of [[Regularizer]](eg. L1 or L2 regularization), applied to the depth weights matrices. + :param w_regularizer: instance of [[Regularizer]](eg. L1 or L2 regularization), applied to the + depth weights matrices. :param b_regularizer: instance of [[Regularizer]]applied to the pointwise bias. :param p_regularizer: instance of [[Regularizer]]applied to the pointwise weights. @@ -3841,7 +3883,8 @@ class SpatialSeparableConvolution(Layer): creating: createL1Regularizer >>> conv.setBRegularizer(L1Regularizer(0.5)) creating: createL1Regularizer - >>> conv = SpatialSeparableConvolution(6, 12, 1, 5, 5, 1, 1, 0, 0, True, "NCHW", L1Regularizer(0.5), L1Regularizer(0.5), L1Regularizer(0.5)) + >>> conv = SpatialSeparableConvolution(6, 12, 1, 5, 5, 1, 1, 0, 0, True, "NCHW", + ... 
L1Regularizer(0.5), L1Regularizer(0.5), L1Regularizer(0.5)) creating: createL1Regularizer creating: createL1Regularizer creating: createL1Regularizer @@ -3881,8 +3924,8 @@ def __init__(self, p_regularizer, ) -class ReLU6(Layer): +class ReLU6(Layer): ''' Same as ReLU except that the rectifying function f(x) saturates at x = 6 @@ -3900,8 +3943,8 @@ def __init__(self, super(ReLU6, self).__init__(None, bigdl_type, inplace) -class SReLU(Layer): +class SReLU(Layer): '''S-shaped Rectified Linear Unit. It follows: @@ -3910,7 +3953,8 @@ class SReLU(Layer): `f(x) = t^l + a^l(x - t^l) for x <= t^l`. # References - - [Deep Learning with S-shaped Rectified Linear Activation Units](http://arxiv.org/abs/1512.07030) + - [Deep Learning with S-shaped Rectified Linear Activation Units](http://arxiv.org/abs/ + 1512.07030) :param input_shape: shape for tleft, aleft, tright, aright. E.g. for a 4-D input, the shape is the last 3-D @@ -3930,7 +3974,8 @@ class SReLU(Layer): >>> from bigdl.dllib.nn.initialization_method import Xavier >>> init = Xavier() creating: createXavier - >>> srelu = srelu.set_init_method(tLeftInit=init, aLeftInit=init, tRightInit=init, aRightInit=init) + >>> srelu = srelu.set_init_method(tLeftInit=init, aLeftInit=init, tRightInit=init, + ... aRightInit=init) ''' def __init__(self, @@ -3946,8 +3991,8 @@ def set_init_method(self, tLeftInit=None, aLeftInit=None, [tLeftInit, aLeftInit, tRightInit, aRightInit]) return self -class ActivityRegularization(Layer): +class ActivityRegularization(Layer): ''' Layer that applies an update to the cost function based input activity. @@ -3965,8 +4010,8 @@ def __init__(self, bigdl_type="float"): super(ActivityRegularization, self).__init__(None, bigdl_type, l1, l2) -class Replicate(Layer): +class Replicate(Layer): ''' Replicate repeats input `nFeatures` times along its `dim` dimension. Notice: No memory copy, it set the stride along the `dim`-th dimension to zero. @@ -3980,6 +4025,7 @@ class Replicate(Layer): >>> replicate = Replicate(2) creating: createReplicate ''' + def __init__(self, n_features, dim=1, @@ -3992,7 +4038,6 @@ def __init__(self, class RoiPooling(Layer): - ''' Region of interest pooling The RoIPooling uses max pooling to convert the features inside any valid region of interest @@ -4013,7 +4058,8 @@ class RoiPooling(Layer): >>> import numpy as np >>> input_data = np.random.rand(2,2,6,8) - >>> input_rois = np.array([0, 0, 0, 7, 5, 1, 6, 2, 7, 5, 1, 3, 1, 6, 4, 0, 3, 3, 3, 3],dtype='float64').reshape(4,5) + >>> input_rois = np.array([0, 0, 0, 7, 5, 1, 6, 2, 7, 5, 1, 3, 1, 6, 4, 0, 3, 3, 3, 3], + ... dtype='float64').reshape(4,5) >>> m = RoiPooling(3,2,1.0) creating: createRoiPooling >>> out = m.forward([input_data,input_rois]) @@ -4031,7 +4077,6 @@ def __init__(self, class Scale(Layer): - ''' Scale is the combination of CMul and CAdd Computes the elementwise product of input and weight, with the shape of the weight "expand" to @@ -4054,7 +4099,6 @@ def __init__(self, class SelectTable(Layer): - ''' Creates a module that takes a table as input and outputs the element at index `index` (positive or negative). This can be either a table or a Tensor. @@ -4078,7 +4122,6 @@ def __init__(self, class SequenceBeamSearch(Layer): - ''' Find the translated sequence with the highest probability. 
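ReLU6 above is ordinary ReLU with an upper saturation at 6, so its forward pass reduces to a clamp. A one-line NumPy sketch:

    import numpy as np

    def relu6(x):
        # f(x) = min(max(x, 0), 6), per the ReLU6 docstring
        return np.minimum(np.maximum(x, 0.0), 6.0)

    print(relu6(np.array([-1.0, 3.0, 9.0])))  # [0. 3. 6.]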
@@ -4098,15 +4141,15 @@ class SequenceBeamSearch(Layer): ''' def __init__(self, - vocab_size, - beam_size, - alpha, - decode_length, - eos_id, - padding_value, - num_hidden_layers, - hidden_size, - bigdl_type="float"): + vocab_size, + beam_size, + alpha, + decode_length, + eos_id, + padding_value, + num_hidden_layers, + hidden_size, + bigdl_type="float"): super(SequenceBeamSearch, self).__init__(None, bigdl_type, vocab_size, beam_size, @@ -4119,7 +4162,6 @@ def __init__(self, class SoftMax(Layer): - ''' Applies the SoftMax function to an n-dimensional input Tensor, rescaling them so that the elements of the n-dimensional output Tensor lie in the range (0, 1) and sum to 1. @@ -4138,7 +4180,6 @@ def __init__(self, class SoftMin(Layer): - ''' Applies the SoftMin function to an n-dimensional input Tensor, rescaling them so that the elements of the n-dimensional output Tensor lie in the range (0,1) and sum to 1. @@ -4156,7 +4197,6 @@ def __init__(self, class SoftPlus(Layer): - ''' Apply the SoftPlus function to an n-dimensional input tensor. SoftPlus function: f_i(x) = 1/beta * log(1 + exp(beta * x_i)) @@ -4177,7 +4217,6 @@ def __init__(self, class SoftShrink(Layer): - ''' Apply the soft shrinkage function element-wise to the input Tensor @@ -4204,7 +4243,6 @@ def __init__(self, class SoftSign(Layer): - ''' Apply SoftSign function to an n-dimensional input Tensor. @@ -4222,7 +4260,6 @@ def __init__(self, class SpatialDilatedConvolution(Layer): - ''' Apply a 2D dilated convolution over an input image. @@ -4250,7 +4287,8 @@ class SpatialDilatedConvolution(Layer): :param dilation_w: The number of pixels to skip. Default is 1. :param dilation_h: The number of pixels to skip. Default is 1. :param init_method: Init method, Default, Xavier. - :param wRegularizer: instance of [[Regularizer]](eg. L1 or L2 regularization), applied to the input weights matrices. + :param wRegularizer: instance of [[Regularizer]](eg. L1 or L2 regularization), applied to the + input weights matrices. :param bRegularizer: instance of [[Regularizer]]applied to the bias. @@ -4285,8 +4323,8 @@ def __init__(self, dilation_h, wRegularizer, bRegularizer) - - def set_init_method(self, weight_init_method = None, bias_init_method = None): + + def set_init_method(self, weight_init_method=None, bias_init_method=None): callBigDlFunc(self.bigdl_type, "setInitMethod", self.value, weight_init_method, bias_init_method) return self @@ -4308,8 +4346,8 @@ class SpatialFullConvolution(Layer): oheight = (height - 1) * dH - 2*padH + kH + adjH - Other frameworks call this operation "In-network Upsampling", "Fractionally-strided convolution", - "Backwards Convolution," "Deconvolution", or "Upconvolution." + Other frameworks call this operation "In-network Upsampling", "Fractionally-strided + convolution", "Backwards Convolution," "Deconvolution", or "Upconvolution." Reference Paper: Long J, Shelhamer E, Darrell T. Fully convolutional networks for semantic @@ -4329,7 +4367,8 @@ class SpatialFullConvolution(Layer): :param nGroup Kernel group number. :param noBias If bias is needed. :param initMethod Init method, Default, Xavier, Bilinear. - :param wRegularizer: instance of [[Regularizer]](eg. L1 or L2 regularization), applied to the input weights matrices. + :param wRegularizer: instance of [[Regularizer]](eg. L1 or L2 regularization), applied to the + input weights matrices. :param bRegularizer: instance of [[Regularizer]]applied to the bias. 
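The SpatialFullConvolution docstring above gives the output-size rule oheight = (height - 1) * dH - 2*padH + kH + adjH. A small per-dimension helper in plain Python makes it easy to check shapes before building the layer:

    def full_conv_out_size(in_size, kernel, stride, pad, adj=0):
        # (in - 1) * stride - 2 * pad + kernel + adj, as quoted in the docstring
        return (in_size - 1) * stride - 2 * pad + kernel + adj

    # A 4x4 map upsampled with a 3x3 kernel, stride 2, no padding or adjustment:
    print(full_conv_out_size(4, 3, 2, 0))  # 9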
@@ -4368,11 +4407,13 @@ def __init__(self, no_bias, wRegularizer, bRegularizer) - def set_init_method(self, weight_init_method = None, bias_init_method = None): + + def set_init_method(self, weight_init_method=None, bias_init_method=None): callBigDlFunc(self.bigdl_type, "setInitMethod", self.value, weight_init_method, bias_init_method) return self + class VolumetricFullConvolution(Layer): ''' Apply a 3D full convolution over an 3D input image, a sequence of images, or a video etc. @@ -4390,8 +4431,8 @@ class VolumetricFullConvolution(Layer): oheight = (height - 1) * dH - 2*padH + kH + adjH - Other frameworks call this operation "In-network Upsampling", "Fractionally-strided convolution", - "Backwards Convolution," "Deconvolution", or "Upconvolution." + Other frameworks call this operation "In-network Upsampling", "Fractionally-strided + convolution", "Backwards Convolution," "Deconvolution", or "Upconvolution." Reference Paper: Long J, Shelhamer E, Darrell T. Fully convolutional networks for semantic @@ -4414,7 +4455,8 @@ class VolumetricFullConvolution(Layer): :param adjH Extra height to add to the output image. Default is 0. :param nGroup Kernel group number. :param noBias If bias is needed. - :param wRegularizer: instance of [[Regularizer]](eg. L1 or L2 regularization), applied to the input weights matrices. + :param wRegularizer: instance of [[Regularizer]](eg. L1 or L2 regularization), applied to the + input weights matrices. :param bRegularizer: instance of [[Regularizer]]applied to the bias. @@ -4443,31 +4485,32 @@ def __init__(self, bRegularizer=None, bigdl_type="float"): super(VolumetricFullConvolution, self).__init__(None, bigdl_type, - n_input_plane, - n_output_plane, - kt, - kw, - kh, - dt, - dw, - dh, - pad_t, - pad_w, - pad_h, - adj_t, - adj_w, - adj_h, - n_group, - no_bias, - wRegularizer, - bRegularizer) - def set_init_method(self, weight_init_method = None, bias_init_method = None): + n_input_plane, + n_output_plane, + kt, + kw, + kh, + dt, + dw, + dh, + pad_t, + pad_w, + pad_h, + adj_t, + adj_w, + adj_h, + n_group, + no_bias, + wRegularizer, + bRegularizer) + + def set_init_method(self, weight_init_method=None, bias_init_method=None): callBigDlFunc(self.bigdl_type, "setInitMethod", self.value, weight_init_method, bias_init_method) return self -class SpatialShareConvolution(Layer): +class SpatialShareConvolution(Layer): ''' >>> spatialShareConvolution = SpatialShareConvolution(1, 1, 1, 1) @@ -4477,7 +4520,8 @@ class SpatialShareConvolution(Layer): >>> init_bias = np.random.randn(12) >>> init_grad_weight = np.zeros([1, 12, 6, 5, 5]) >>> init_grad_bias = np.zeros([12]) - >>> conv = SpatialShareConvolution(6, 12, 5, 5, 1, 1, 0, 0, 1, True, L1Regularizer(0.5), L1Regularizer(0.5), init_weight, init_bias, init_grad_weight, init_grad_bias) + >>> conv = SpatialShareConvolution(6, 12, 5, 5, 1, 1, 0, 0, 1, True, L1Regularizer(0.5), + ... 
L1Regularizer(0.5), init_weight, init_bias, init_grad_weight, init_grad_bias) creating: createL1Regularizer creating: createL1Regularizer creating: createSpatialShareConvolution @@ -4520,14 +4564,14 @@ def __init__(self, JTensor.from_ndarray(init_grad_weight), JTensor.from_ndarray(init_grad_bias), with_bias) - def set_init_method(self, weight_init_method = None, bias_init_method = None): + + def set_init_method(self, weight_init_method=None, bias_init_method=None): callBigDlFunc(self.bigdl_type, "setInitMethod", self.value, weight_init_method, bias_init_method) return self class VolumetricConvolution(Layer): - ''' Applies a 3D convolution over an input image composed of several input planes. The input tensor in forward(input) is expected to be a 4D tensor (nInputPlane x time x height x width). @@ -4540,11 +4584,13 @@ class VolumetricConvolution(Layer): :param d_t: The step of the convolution in the time dimension. Default is 1 :param d_w: The step of the convolution in the width dimension. Default is 1 :param d_h: The step of the convolution in the height dimension. Default is 1 - :param pad_t: Additional zeros added to the input plane data on both sides of time axis.Default is 0. (kT-1)/2 is often used here. + :param pad_t: Additional zeros added to the input plane data on both sides of time axis.Default + is 0. (kT-1)/2 is often used here. :param pad_w: The additional zeros added per width to the input planes. :param pad_h: The additional zeros added per height to the input planes. :param with_bias: whether with bias - :param wRegularizer: instance of [[Regularizer]] (eg. L1 or L2 regularization), applied to the input weights matrices. + :param wRegularizer: instance of [[Regularizer]] (eg. L1 or L2 regularization), applied to the + input weights matrices. :param bRegularizer: instance of [[Regularizer]] applied to the bias. @@ -4584,14 +4630,13 @@ def __init__(self, wRegularizer, bRegularizer) - def set_init_method(self, weight_init_method = None, bias_init_method = None): + def set_init_method(self, weight_init_method=None, bias_init_method=None): callBigDlFunc(self.bigdl_type, "setInitMethod", self.value, weight_init_method, bias_init_method) return self class VolumetricMaxPooling(Layer): - ''' Applies 3D max-pooling operation in kTxkWxkH regions by step size dTxdWxdH. The number of output features is equal to the number of input planes / dT. @@ -4625,19 +4670,18 @@ def __init__(self, pad_h=0, bigdl_type="float"): super(VolumetricMaxPooling, self).__init__(None, bigdl_type, - k_t, - k_w, - k_h, - d_t, - d_w, - d_h, - pad_t, - pad_w, - pad_h) + k_t, + k_w, + k_h, + d_t, + d_w, + d_h, + pad_t, + pad_w, + pad_h) class VolumetricAveragePooling(Layer): - ''' Applies 3D average-pooling operation in kTxkWxkH regions by step size dTxdWxdH. The number of output features is equal to the number of input planes / dT. 
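For the volumetric pooling layers above, the patch does not restate the output-size rule; assuming the same floor/ceil formula the 2-D pooling docstrings in this file use, a per-axis helper looks like this (an assumption by analogy, not taken from the patch):

    import math

    def pool_out_size(in_size, kernel, stride, pad=0, ceil_mode=False):
        # o = op((in + 2*pad - kernel) / stride + 1), op = floor or ceil
        op = math.ceil if ceil_mode else math.floor
        return int(op((in_size + 2 * pad - kernel) / stride + 1))

    # VolumetricMaxPooling(2, 2, 2, 2, 2, 2) over a 16x16x16 volume:
    print(pool_out_size(16, 2, 2))  # 8 along each of t, h, w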
@@ -4653,7 +4697,8 @@ class VolumetricAveragePooling(Layer): :param pad_t: The padding in the time dimension :param pad_w: The padding in the width dimension :param pad_h: The padding in the height dimension - :param count_include_pad: whether to include padding when dividing the number of elements in pooling region + :param count_include_pad: whether to include padding when dividing the number of elements in + pooling region :param ceil_mode: whether the output size is to be ceiled or floored @@ -4675,20 +4720,20 @@ def __init__(self, ceil_mode=False, bigdl_type="float"): super(VolumetricAveragePooling, self).__init__(None, bigdl_type, - k_t, - k_w, - k_h, - d_t, - d_w, - d_h, - pad_t, - pad_w, - pad_h, - count_include_pad, - ceil_mode) + k_t, + k_w, + k_h, + d_t, + d_w, + d_h, + pad_t, + pad_w, + pad_h, + count_include_pad, + ceil_mode) -class SpatialZeroPadding(Layer): +class SpatialZeroPadding(Layer): ''' Each feature map of a given input is padded with specified number of zeros. If padding values are negative, then input is cropped. @@ -4717,7 +4762,6 @@ def __init__(self, class SplitTable(Layer): - ''' Creates a module that takes a Tensor as input and outputs several tables, splitting the Tensor along @@ -4731,7 +4775,8 @@ class SplitTable(Layer): :param dimension: to be split along this dimension - :param n_input_dims: specify the number of dimensions that this module will receiveIf it is more than the dimension of input tensors, the first dimensionwould be considered as batch size + :param n_input_dims: specify the number of dimensions that this module will receiveIf it is more + than the dimension of input tensors, the first dimensionwould be considered as batch size >>> splitTable = SplitTable(1, 1) @@ -4748,7 +4793,6 @@ def __init__(self, class Sqrt(Layer): - ''' Apply an element-wise sqrt operation. @@ -4763,7 +4807,6 @@ def __init__(self, class Square(Layer): - ''' Apply an element-wise square operation. @@ -4777,7 +4820,6 @@ def __init__(self, class Squeeze(Layer): - ''' Delete singleton all dimensions or a specific dim. @@ -4802,7 +4844,6 @@ def __init__(self, class Sum(Layer): - ''' It is a simple layer which applies a sum operation over the given dimension. When nInputDims is provided, the input will be considered as a batches. @@ -4814,9 +4855,11 @@ class Sum(Layer): :param dimension: the dimension to be applied sum operation - :param n_input_dims: specify the number of dimensions that this module will receiveIf it is more than the dimension of input tensors, the first dimensionwould be considered as batch size + :param n_input_dims: specify the number of dimensions that this module will receiveIf it is more + than the dimension of input tensors, the first dimensionwould be considered as batch size :param size_average: default is false, if it is true, it will return the mean instead - :param squeeze: default is true, which will squeeze the sum dimension; set it to false to keep the sum dimension + :param squeeze: default is true, which will squeeze the sum dimension; set it to false to keep + the sum dimension >>> sum = Sum(1, 1, True, True) @@ -4837,7 +4880,6 @@ def __init__(self, class TanhShrink(Layer): - ''' A simple layer for each element of the input tensor, do the following operation during the forward process: @@ -4854,7 +4896,6 @@ def __init__(self, class Threshold(Layer): - ''' Threshold input Tensor. 
If values in the Tensor smaller than th, then replace it with v @@ -4879,8 +4920,8 @@ def __init__(self, v, ip) -class Negative(Layer): +class Negative(Layer): ''' Create an Negative layer. Computing negative value of each element of input tensor @@ -4892,13 +4933,12 @@ class Negative(Layer): ''' def __init__(self, - inplace = False, + inplace=False, bigdl_type="float"): super(Negative, self).__init__(None, bigdl_type, inplace) class Unsqueeze(Layer): - ''' Create an Unsqueeze layer. Insert singleton dim (i.e., dimension 1) at position pos. For an input with dim = input.dim(), @@ -4915,13 +4955,14 @@ class Unsqueeze(Layer): def __init__(self, pos, num_input_dims=INTMIN, bigdl_type="float"): if isinstance(pos, int): - posList=[pos] + posList = [pos] super(Unsqueeze, self).__init__(None, bigdl_type, to_list(posList), num_input_dims) elif isinstance(pos, list): super(Unsqueeze, self).__init__(None, bigdl_type, to_list(pos), num_input_dims) else: raise Exception("Error invalid input") + class Reshape(Layer): ''' The forward(input) reshape the input tensor into a size(0) * size(1) * ... tensor, taking the @@ -5075,20 +5116,21 @@ class SpatialConvolutionMap(Layer): This class is a generalization of SpatialConvolution. It uses a generic connection table between input and output features. The SpatialConvolution is equivalent to using a full connection table. - + When padW and padH are both -1, we use a padding algorithm similar to the "SAME" padding of tensorflow. That is - + outHeight = Math.ceil(inHeight.toFloat/strideH.toFloat) outWidth = Math.ceil(inWidth.toFloat/strideW.toFloat) - + padAlongHeight = Math.max(0, (outHeight - 1) * strideH + kernelH - inHeight) padAlongWidth = Math.max(0, (outWidth - 1) * strideW + kernelW - inWidth) - + padTop = padAlongHeight / 2 padLeft = padAlongWidth / 2 - :param wRegularizer: instance of [[Regularizer]](eg. L1 or L2 regularization), applied to the input weights matrices. + :param wRegularizer: instance of [[Regularizer]](eg. L1 or L2 regularization), applied to the + input weights matrices. :param bRegularizer: instance of [[Regularizer]]applied to the bias. >>> ct = np.ones([9,9]).astype("float32") @@ -5228,10 +5270,11 @@ def __init__(self, alpha, beta) + class Pack(Layer): ''' Stacks a list of n-dimensional tensors into one (n+1)-dimensional tensor. - + >>> layer = Pack(1) creating: createPack ''' @@ -5239,9 +5282,10 @@ class Pack(Layer): def __init__(self, dimension, bigdl_type="float"): super(Pack, self).__init__(None, bigdl_type, dimension) + class ConvLSTMPeephole(Layer): ''' - + | Convolution Long Short Term Memory architecture with peephole. | Ref. A.: https://arxiv.org/abs/1506.04214 (blueprint for this module) | B. https://github.com/viorik/ConvLSTM @@ -5254,15 +5298,19 @@ class ConvLSTMPeephole(Layer): :param padding: The additional zeros added, default is -1 :param activation: activation function, by default to be Tanh if not specified. It can also be the name of an existing activation as a string. - :param inner_activation: activation function for the inner cells, by default to be Sigmoid if not specified. - It can also be the name of an existing activation as a string. - :param wRegularizer: instance of [[Regularizer]](eg. L1 or L2 regularization), applied to the input weights matrices - :param uRegularizer: instance [[Regularizer]](eg. L1 or L2 regularization), applied to the recurrent weights matrices + :param inner_activation: activation function for the inner cells, by default to be Sigmoid if + not specified. 
+ It can also be the name of an existing activation as a string. + :param wRegularizer: instance of [[Regularizer]](eg. L1 or L2 regularization), applied to the + input weights matrices + :param uRegularizer: instance [[Regularizer]](eg. L1 or L2 regularization), applied to the + recurrent weights matrices :param bRegularizer: instance of [[Regularizer]]applied to the bias. :param cRegularizer: instance of [[Regularizer]]applied to peephole. :param with_peephole: whether use last cell status control a gate. - >>> convlstm = ConvLSTMPeephole(4, 3, 3, 3, 1, -1, Tanh(), HardSigmoid(), L1Regularizer(0.5), L1Regularizer(0.5), L1Regularizer(0.5), L1Regularizer(0.5)) + >>> convlstm = ConvLSTMPeephole(4, 3, 3, 3, 1, -1, Tanh(), HardSigmoid(), L1Regularizer(0.5), + ... L1Regularizer(0.5), L1Regularizer(0.5), L1Regularizer(0.5)) creating: createTanh creating: createHardSigmoid creating: createL1Regularizer @@ -5284,9 +5332,12 @@ def __init__(self, input_size, output_size, kernel_i, kernel_c, stride=1, paddin activation = get_activation_by_name(activation) if isinstance(inner_activation, six.string_types): inner_activation = get_activation_by_name(inner_activation) - super(ConvLSTMPeephole, self).__init__(None, bigdl_type, input_size, output_size, kernel_i, kernel_c, + super(ConvLSTMPeephole, self).__init__(None, bigdl_type, input_size, output_size, kernel_i, + kernel_c, stride, padding, activation, inner_activation, - wRegularizer, uRegularizer, bRegularizer, cRegularizer, with_peephole) + wRegularizer, uRegularizer, bRegularizer, + cRegularizer, with_peephole) + class Tile(Layer): ''' @@ -5295,18 +5346,22 @@ class Tile(Layer): >>> layer = Tile(1, 2) creating: createTile ''' - def __init__(self, dim = 1, copies = 2, bigdl_type="float"): + + def __init__(self, dim=1, copies=2, bigdl_type="float"): super(Tile, self).__init__(None, bigdl_type, dim, copies) + class BinaryThreshold(Layer): ''' Binary threshold, 1 if value > th, 0 otherwise >>> layer = BinaryThreshold(0.1, False) creating: createBinaryThreshold ''' - def __init__(self, th=1e-6, ip = False, bigdl_type="float"): + + def __init__(self, th=1e-6, ip=False, bigdl_type="float"): super(BinaryThreshold, self).__init__(None, bigdl_type, th, ip) + class ConvLSTMPeephole3D(Layer): ''' @@ -5316,13 +5371,16 @@ class ConvLSTMPeephole3D(Layer): :param kernel_c Convolutional filter size to convolve cell :param stride The step of the convolution :param padding The additional zeros added - :param wRegularizer: instance of [[Regularizer]](eg. L1 or L2 regularization), applied to the input weights matrices - :param uRegularizer: instance [[Regularizer]](eg. L1 or L2 regularization), applied to the recurrent weights matrices + :param wRegularizer: instance of [[Regularizer]](eg. L1 or L2 regularization), applied to the + input weights matrices + :param uRegularizer: instance [[Regularizer]](eg. L1 or L2 regularization), applied to the + recurrent weights matrices :param bRegularizer: instance of [[Regularizer]]applied to the bias. :param cRegularizer: instance of [[Regularizer]]applied to peephole. :param with_peephole: whether use last cell status control a gate. - >>> convlstm = ConvLSTMPeephole3D(4, 3, 3, 3, 1, -1, L1Regularizer(0.5), L1Regularizer(0.5), L1Regularizer(0.5), L1Regularizer(0.5)) + >>> convlstm = ConvLSTMPeephole3D(4, 3, 3, 3, 1, -1, L1Regularizer(0.5), L1Regularizer(0.5), + ... 
L1Regularizer(0.5), L1Regularizer(0.5)) creating: createL1Regularizer creating: createL1Regularizer creating: createL1Regularizer @@ -5330,10 +5388,13 @@ class ConvLSTMPeephole3D(Layer): creating: createConvLSTMPeephole3D ''' - def __init__(self, input_size, output_size, kernel_i, kernel_c, stride=1, padding=-1, wRegularizer=None, uRegularizer=None, + def __init__(self, input_size, output_size, kernel_i, kernel_c, stride=1, padding=-1, + wRegularizer=None, uRegularizer=None, bRegularizer=None, cRegularizer=None, with_peephole=True, bigdl_type="float"): - super(ConvLSTMPeephole3D, self).__init__(None, bigdl_type, input_size, output_size, kernel_i, kernel_c, stride, - padding, wRegularizer, uRegularizer, bRegularizer, cRegularizer, with_peephole) + super(ConvLSTMPeephole3D, self).__init__(None, bigdl_type, input_size, output_size, + kernel_i, kernel_c, stride, + padding, wRegularizer, uRegularizer, bRegularizer, + cRegularizer, with_peephole) class MultiRNNCell(Layer): @@ -5352,6 +5413,7 @@ class MultiRNNCell(Layer): def __init__(self, cells, bigdl_type="float"): super(MultiRNNCell, self).__init__(None, bigdl_type, cells) + class ResizeBilinear(Layer): """ Resize the input image with bilinear interpolation. The input image must be a float tensor with @@ -5365,21 +5427,25 @@ class ResizeBilinear(Layer): >>> resizeBilinear = ResizeBilinear(10, 20, False, "NCHW") creating: createResizeBilinear """ - def __init__(self, output_height, output_width, align_corner=False, data_format="NCHW", bigdl_type="float"): + + def __init__(self, output_height, output_width, align_corner=False, data_format="NCHW", + bigdl_type="float"): super(ResizeBilinear, self).__init__(None, bigdl_type, output_height, output_width, align_corner, data_format) + class GaussianSampler(Layer): """ Takes {mean, log_variance} as input and samples from the Gaussian distribution >>> sampler = GaussianSampler() creating: createGaussianSampler """ + def __init__(self, bigdl_type="float"): super(GaussianSampler, self).__init__(None, bigdl_type) -class Masking(Layer): +class Masking(Layer): ''' Use a mask value to skip timesteps for a sequence ``` @@ -5393,14 +5459,14 @@ def __init__(self, mask_value, bigdl_type="float"): super(Masking, self).__init__(None, bigdl_type, - mask_value) + mask_value) + class Maxout(Layer): - ''' A linear maxout layer Maxout layer select the element-wise maximum value of maxoutNumber Linear(inputSize, outputSize) layers - ``` + ``` :param input_size: the size the each input sample :param output_size: the size of the module output of each sample :param maxout_number: number of Linear layers to use @@ -5411,10 +5477,11 @@ class Maxout(Layer): applied to the bias. 
:param init_weight: initial weight :param init_bias: initial bias - + >>> maxout = Maxout(2, 5, 3) creating: createMaxout - ''' + ''' + def __init__(self, input_size, output_size, @@ -5426,13 +5493,14 @@ def __init__(self, init_bias=None, bigdl_type="float"): super(Maxout, self).__init__(None, bigdl_type, - input_size, output_size, maxout_number, with_bias, - w_regularizer, b_regularizer, init_weight, init_bias) + input_size, output_size, maxout_number, with_bias, + w_regularizer, b_regularizer, init_weight, init_bias) + class HardSigmoid(Layer): """ Apply Hard-sigmoid function -``` +``` | 0, if x < -2.5 f(x) = | 1, if x > 2.5 | 0.2 * x + 0.5, otherwise @@ -5440,9 +5508,11 @@ class HardSigmoid(Layer): >>> hardSigmoid = HardSigmoid() creating: createHardSigmoid """ + def __init__(self, bigdl_type="float"): super(HardSigmoid, self).__init__(None, bigdl_type) + class Highway(Layer): """ Densely connected highway network. @@ -5450,17 +5520,23 @@ class Highway(Layer): :param size input size :param with_bias whether to include a bias - :param activation activation function. It can also be the name of an existing activation as a string. - :param wRegularizer: instance of [[Regularizer]](eg. L1 or L2 regularization), applied to the input weights matrices. + :param activation activation function. It can also be the name of an existing activation as a + string. + :param wRegularizer: instance of [[Regularizer]](eg. L1 or L2 regularization), applied to the + input weights matrices. :param bRegularizer: instance of [[Regularizer]], applied to the bias. >>> highway = Highway(2) creating: createHighway """ - def __init__(self, size, with_bias=True, activation=None, wRegularizer=None, bRegularizer=None, bigdl_type="float"): + + def __init__(self, size, with_bias=True, activation=None, wRegularizer=None, bRegularizer=None, + bigdl_type="float"): if isinstance(activation, six.string_types): activation = get_activation_by_name(activation) - super(Highway, self).__init__(None, bigdl_type, size, with_bias, activation, wRegularizer, bRegularizer) + super(Highway, self).__init__(None, bigdl_type, size, with_bias, activation, wRegularizer, + bRegularizer) + class UpSampling3D(Layer): """ @@ -5473,9 +5549,11 @@ class UpSampling3D(Layer): >>> upsample3d = UpSampling3D([1, 2, 3]) creating: createUpSampling3D """ + def __init__(self, size, bigdl_type="float"): super(UpSampling3D, self).__init__(None, bigdl_type, size) + class PriorBox(Layer): """ Generate the prior boxes of designated sizes and aspect ratios across @@ -5489,13 +5567,14 @@ class PriorBox(Layer): >>> layer = PriorBox([0.1]) creating: createPriorBox """ + def __init__(self, min_sizes, max_sizes=None, aspect_ratios=None, is_flip=True, is_clip=False, variances=None, - offset = 0.5, + offset=0.5, img_h=0, img_w=0, img_size=0, @@ -5518,6 +5597,7 @@ def __init__(self, min_sizes, step_w, step) + class NormalizeScale(Layer): """ NormalizeScale is conposed of normalize and scale, this is equal to caffe Normalize layer @@ -5529,10 +5609,12 @@ class NormalizeScale(Layer): >>> layer = NormalizeScale(2.0, scale = 20.0, size = [1, 5, 1, 1]) creating: createNormalizeScale """ + def __init__(self, p, scale, size, w_regularizer=None, eps=1e-10, bigdl_type="float"): super(NormalizeScale, self).__init__(None, bigdl_type, p, eps, scale, size, w_regularizer) + class Proposal(Layer): """ Outputs object detection proposals by applying estimated bounding-box @@ -5543,6 +5625,7 @@ class Proposal(Layer): >>> layer = Proposal(1000, 200, [0.1, 0.2], [2.0, 3.0]) creating: 
createProposal """ + def __init__(self, pre_nms_topn, post_nms_topn, ratios, scales, rpn_pre_nms_topn_train=12000, rpn_post_nms_topn_train=2000, bigdl_type="float"): @@ -5554,6 +5637,7 @@ def __init__(self, pre_nms_topn, post_nms_topn, ratios, scales, rpn_pre_nms_topn_train, rpn_post_nms_topn_train) + class DetectionOutputSSD(Layer): """ Layer to Post-process SSD output @@ -5595,6 +5679,7 @@ def __init__(self, n_classes=21, variance_encoded_in_target, conf_post_process) + class DetectionOutputFrcnn(Layer): """ Post process Faster-RCNN models @@ -5607,7 +5692,7 @@ class DetectionOutputFrcnn(Layer): creating: createDetectionOutputFrcnn """ - def __init__(self, n_classes, bbox_vote, nms_thresh = 0.3, + def __init__(self, n_classes, bbox_vote, nms_thresh=0.3, max_per_image=100, thresh=0.05, bigdl_type="float"): super(DetectionOutputFrcnn, self).__init__(None, bigdl_type, nms_thresh, @@ -5616,6 +5701,7 @@ def __init__(self, n_classes, bbox_vote, nms_thresh = 0.3, max_per_image, thresh) + class Cropping2D(Layer): """ Cropping layer for 2D input (e.g. picture). @@ -5633,15 +5719,19 @@ class Cropping2D(Layer): and end of the height dimension. :param widthCrop Array of length 2. How many units should be trimmed off at the beginning and end of the width dimension. - :param data_format a string value (or DataFormat Object in Scala) of "NHWC" or "NCHW" to specify the input data format of this layer. In "NHWC" format - data is stored in the order of [batch_size, height, width, channels], in "NCHW" format data is stored + :param data_format a string value (or DataFormat Object in Scala) of "NHWC" or "NCHW" to specify + the input data format of this layer. In "NHWC" format + data is stored in the order of [batch_size, height, width, channels], in + "NCHW" format data is stored in the order of [batch_size, channels, height, width]. >>> cropping2D = Cropping2D([1, 1], [2, 2]) creating: createCropping2D """ + def __init__(self, heightCrop, widthCrop, data_format="NCHW", bigdl_type="float"): super(Cropping2D, self).__init__(None, bigdl_type, heightCrop, widthCrop, data_format) + class Cropping3D(Layer): """ Cropping layer for 3D data (e.g. spatial or spatio-temporal). @@ -5664,10 +5754,13 @@ class Cropping3D(Layer): >>> cropping3D = Cropping3D([1, 1], [2, 2], [1, 1]) creating: createCropping3D """ - def __init__(self, dim1Crop, dim2Crop, dim3Crop, data_format="channel_first", bigdl_type="float"): - super(Cropping3D, self).__init__(None, bigdl_type, dim1Crop, dim2Crop, dim3Crop, data_format) - + def __init__(self, dim1Crop, dim2Crop, dim3Crop, data_format="channel_first", + bigdl_type="float"): + super(Cropping3D, self).__init__(None, bigdl_type, dim1Crop, dim2Crop, dim3Crop, + data_format) + + class RoiAlign(Layer): """ Region of interest aligning (RoIAlign) for Mask-RCNN @@ -5694,7 +5787,8 @@ class RoiAlign(Layer): >>> import numpy as np >>> input_data = np.random.rand(1,2,6,8) - >>> input_rois = np.array([0, 0, 7, 5, 6, 2, 7, 5, 3, 1, 6, 4, 3, 3, 3, 3],dtype='float').reshape(4,4) + >>> input_rois = np.array([0, 0, 7, 5, 6, 2, 7, 5, 3, 1, 6, 4, 3, 3, 3, 3], + ... 
dtype='float').reshape(4,4) >>> m = RoiAlign(1.0,3,2,2) creating: createRoiAlign >>> out = m.forward([input_data,input_rois]) @@ -5707,10 +5801,11 @@ def __init__(self, pooled_w, bigdl_type="float"): super(RoiAlign, self).__init__(None, bigdl_type, - spatial_scale, - sampling_ratio, - pooled_h, - pooled_w) + spatial_scale, + sampling_ratio, + pooled_h, + pooled_w) + class Pooler(Layer): """ @@ -5725,7 +5820,8 @@ class Pooler(Layer): >>> feature1 = np.random.rand(1,2,4,4) >>> feature2 = np.random.rand(1,2,8,8) >>> features = [feature0, feature1, feature2] - >>> input_rois = np.array([0, 0, 3, 3, 2, 2, 50, 50, 50, 50, 500, 500],dtype='float').reshape(3,4) + >>> input_rois = np.array([0, 0, 3, 3, 2, 2, 50, 50, 50, 50, 500, 500], + ... dtype='float').reshape(3,4) >>> m = Pooler(2,[1.0, 0.5, 0.25],2) creating: createPooler >>> out = m.forward([features,input_rois]) @@ -5741,6 +5837,7 @@ def __init__(self, scales, sampling_ratio) + class FPN(Layer): """ Feature Pyramid Network (FPN) for Mask-RCNN @@ -5774,11 +5871,12 @@ def __init__(self, out_channels_of_p6p7=0, bigdl_type="float"): super(FPN, self).__init__(None, bigdl_type, - in_channels_list, - out_channels, - top_blocks, - in_channels_of_p6p7, - out_channels_of_p6p7) + in_channels_list, + out_channels, + top_blocks, + in_channels_of_p6p7, + out_channels_of_p6p7) + def _test(): import doctest @@ -5797,5 +5895,6 @@ def _test(): if failure_count: exit(-1) + if __name__ == "__main__": _test() diff --git a/python/dllib/src/bigdl/dllib/nn/onnx/layer.py b/python/dllib/src/bigdl/dllib/nn/onnx/layer.py index a907f5501c9..9007a6550a6 100644 --- a/python/dllib/src/bigdl/dllib/nn/onnx/layer.py +++ b/python/dllib/src/bigdl/dllib/nn/onnx/layer.py @@ -45,7 +45,8 @@ def __init__(self, bigdl_type="float"): class Gemm(Layer): """ - General Matrix multiplication: https://en.wikipedia.org/wiki/Basic_Linear_Algebra_Subprograms#Level_3 + General Matrix multiplication: https://en.wikipedia.org/wiki/Basic_Linear_Algebra_Subprograms# + Level_3 A' = transpose(A) if transA else A B' = transpose(B) if transB else B @@ -68,7 +69,8 @@ def __init__(self, matrix_b, matrix_c, alpha=float(1.0), beta=float(1.0), trans_ class Reshape(Layer): """ - A layer which takes a tensor as input and outputs an 1D tensor containing the shape of the input. + A layer which takes a tensor as input and outputs an 1D tensor containing the shape of the + input. >>> shape = (2, 2) >>> reshape = Reshape(shape) creating: createReshape @@ -79,11 +81,11 @@ def __init__(self, shape=None, bigdl_type="float"): class Shape(Layer): """ - A layer which takes a tensor as input and outputs an 1D tensor containing the shape of the input. + A layer which takes a tensor as input and outputs an 1D tensor containing the shape of the + input. 
>>> shape = Shape() creating: createShape """ def __init__(self, bigdl_type="float"): super(Shape, self).__init__(None, bigdl_type) - diff --git a/python/dllib/src/bigdl/dllib/nncontext.py b/python/dllib/src/bigdl/dllib/nncontext.py index 1d279c155c5..fb331eca60b 100644 --- a/python/dllib/src/bigdl/dllib/nncontext.py +++ b/python/dllib/src/bigdl/dllib/nncontext.py @@ -127,22 +127,22 @@ def init_spark_on_yarn(hadoop_conf, def init_spark_on_yarn_cluster(hadoop_conf, - conda_name, - num_executors, - executor_cores, - executor_memory="2g", - driver_cores=4, - driver_memory="2g", - extra_executor_memory_for_ray=None, - extra_python_lib=None, - penv_archive=None, - additional_archive=None, - hadoop_user_name="root", - spark_yarn_archive=None, - spark_log_level="WARN", - redirect_spark_log=True, - jars=None, - conf=None): + conda_name, + num_executors, + executor_cores, + executor_memory="2g", + driver_cores=4, + driver_memory="2g", + extra_executor_memory_for_ray=None, + extra_python_lib=None, + penv_archive=None, + additional_archive=None, + hadoop_user_name="root", + spark_yarn_archive=None, + spark_log_level="WARN", + redirect_spark_log=True, + jars=None, + conf=None): """ Create a SparkContext with Analytics Zoo configurations on Yarn cluster for yarn-cluster mode. You only need to create a conda environment and install the python dependencies in that @@ -185,25 +185,24 @@ def init_spark_on_yarn_cluster(hadoop_conf, runner = SparkRunner(spark_log_level=spark_log_level, redirect_spark_log=redirect_spark_log) return_value = runner.init_spark_on_yarn_cluster( - hadoop_conf=hadoop_conf, - conda_name=conda_name, - num_executors=num_executors, - executor_cores=executor_cores, - executor_memory=executor_memory, - driver_cores=driver_cores, - driver_memory=driver_memory, - extra_executor_memory_for_ray=extra_executor_memory_for_ray, - extra_python_lib=extra_python_lib, - penv_archive=penv_archive, - additional_archive=additional_archive, - hadoop_user_name=hadoop_user_name, - spark_yarn_archive=spark_yarn_archive, - jars=jars, - conf=conf) + hadoop_conf=hadoop_conf, + conda_name=conda_name, + num_executors=num_executors, + executor_cores=executor_cores, + executor_memory=executor_memory, + driver_cores=driver_cores, + driver_memory=driver_memory, + extra_executor_memory_for_ray=extra_executor_memory_for_ray, + extra_python_lib=extra_python_lib, + penv_archive=penv_archive, + additional_archive=additional_archive, + hadoop_user_name=hadoop_user_name, + spark_yarn_archive=spark_yarn_archive, + jars=jars, + conf=conf) sys.exit(return_value) - def init_spark_standalone(num_executors, executor_cores, executor_memory="2g", @@ -345,7 +344,6 @@ def stop_spark_standalone(): class ZooContextMeta(type): - _log_output = False _barrier_mode = True @@ -410,7 +408,8 @@ def _read_stream(fd, fn): fn(buff.decode('utf-8')) -def init_nncontext(conf=None, cluster_mode="spark-submit", spark_log_level="WARN", redirect_spark_log=True, **kwargs): +def init_nncontext(conf=None, cluster_mode="spark-submit", spark_log_level="WARN", + redirect_spark_log=True, **kwargs): """ Creates or gets a SparkContext with optimized configurations for BigDL performance. This method will also initialize the BigDL engine. 
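For orientation, a minimal usage sketch of the init_nncontext entry point whose signature is re-wrapped in the hunk above; this is illustration only, not part of the patch, and the chosen cluster_mode and log level are assumptions:

    from bigdl.dllib.nncontext import init_nncontext

    # A hypothetical local run. The accepted cluster_mode values (local,
    # yarn-client, yarn-cluster, standalone, spark-submit) are the ones listed
    # in the ValueError branch in the following hunk.
    sc = init_nncontext(cluster_mode="local", spark_log_level="WARN")
    print(sc.version)
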
@@ -522,7 +521,8 @@ def init_nncontext(conf=None, cluster_mode="spark-submit", spark_log_level="WARN sc = init_spark_standalone(num_executors=num_nodes, executor_cores=cores, executor_memory=memory, **spark_args) else: - raise ValueError("cluster_mode can only be local, yarn-client, yarn-cluster, standalone or spark-submit, " + raise ValueError("cluster_mode can only be local, yarn-client, yarn-cluster, standalone or" + " spark-submit, " "but got: %s".format(cluster_mode)) return sc diff --git a/python/dllib/src/bigdl/dllib/optim/optimizer.py b/python/dllib/src/bigdl/dllib/optim/optimizer.py index af0ddfa3bb6..e1a9e757125 100644 --- a/python/dllib/src/bigdl/dllib/optim/optimizer.py +++ b/python/dllib/src/bigdl/dllib/optim/optimizer.py @@ -46,9 +46,11 @@ class Top1Accuracy(JavaValue): >>> top1 = Top1Accuracy() creating: createTop1Accuracy """ + def __init__(self, bigdl_type="float"): JavaValue.__init__(self, None, bigdl_type) + class TreeNNAccuracy(JavaValue): """ Caculate the percentage that output's max probability index equals target. @@ -56,9 +58,11 @@ class TreeNNAccuracy(JavaValue): >>> top1 = TreeNNAccuracy() creating: createTreeNNAccuracy """ + def __init__(self, bigdl_type="float"): JavaValue.__init__(self, None, bigdl_type) + class Top5Accuracy(JavaValue): """ Caculate the percentage that output's max probability index equals target. @@ -66,9 +70,11 @@ class Top5Accuracy(JavaValue): >>> top5 = Top5Accuracy() creating: createTop5Accuracy """ + def __init__(self, bigdl_type="float"): JavaValue.__init__(self, None, bigdl_type) + class MeanAveragePrecision(JavaValue): """ Calculate the Mean Average Precision for top-k confident predictions. @@ -77,9 +83,11 @@ class MeanAveragePrecision(JavaValue): >>> MAP = MeanAveragePrecision(10, 20) creating: createMeanAveragePrecision """ + def __init__(self, k, classes, bigdl_type="float"): JavaValue.__init__(self, None, bigdl_type, k, classes) + class MeanAveragePrecisionObjectDetection(JavaValue): """ Calculate the Mean Average Precision for Object Detection. @@ -87,6 +95,7 @@ class MeanAveragePrecisionObjectDetection(JavaValue): >>> MAPObj = MeanAveragePrecisionObjectDetection(20) creating: createMeanAveragePrecisionObjectDetection """ + def __init__(self, classes, iou=0.5, use_voc2007=False, skip_class=-1, bigdl_type="float"): """ :param classes: the number of classes @@ -96,8 +105,8 @@ def __init__(self, classes, iou=0.5, use_voc2007=False, skip_class=-1, bigdl_typ """ JavaValue.__init__(self, None, bigdl_type, classes, iou, use_voc2007, skip_class) -class Loss(JavaValue): +class Loss(JavaValue): """ This evaluation method is calculate loss of output with respect to target >>> from bigdl.dllib.nn.criterion import ClassNLLCriterion @@ -109,12 +118,14 @@ class Loss(JavaValue): creating: createClassNLLCriterion creating: createLoss """ + def __init__(self, cri=None, bigdl_type="float"): from bigdl.dllib.nn.criterion import ClassNLLCriterion if cri is None: cri = ClassNLLCriterion() JavaValue.__init__(self, None, bigdl_type, cri) + class HitRatio(JavaValue): """ Hit Ratio(HR) used in recommandation application. @@ -123,7 +134,8 @@ class HitRatio(JavaValue): >>> hr10 = HitRatio(k = 10) creating: createHitRatio """ - def __init__(self, k = 10, neg_num = 100, bigdl_type="float"): + + def __init__(self, k=10, neg_num=100, bigdl_type="float"): """ Create hit ratio validation method. 
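A brief sketch of how the validation metrics above are typically consumed; illustration only, not part of this patch. trained_model and val_rdd are hypothetical placeholders (an already-trained Layer and an RDD of Sample):

    from bigdl.dllib.optim.optimizer import HitRatio, Top1Accuracy

    # Layer.evaluate(dataset, batch_size, val_methods) runs each metric over the
    # dataset; the batch size of 128 is an arbitrary example value.
    results = trained_model.evaluate(val_rdd, 128, [Top1Accuracy(), HitRatio(k=10)])
    for metric in results:
        print(metric)
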
@@ -132,6 +144,7 @@ def __init__(self, k = 10, neg_num = 100, bigdl_type="float"): """ JavaValue.__init__(self, None, bigdl_type, k, neg_num) + class NDCG(JavaValue): """ Normalized Discounted Cumulative Gain(NDCG). @@ -140,7 +153,8 @@ class NDCG(JavaValue): >>> ndcg = NDCG(k = 10) creating: createNDCG """ - def __init__(self, k = 10, neg_num = 100, bigdl_type="float"): + + def __init__(self, k=10, neg_num=100, bigdl_type="float"): """ Create NDCG validation method. @@ -149,6 +163,7 @@ def __init__(self, k = 10, neg_num = 100, bigdl_type="float"): """ JavaValue.__init__(self, None, bigdl_type, k, neg_num) + class MAE(JavaValue): """ This evaluation method calculates the mean absolute error of output with respect to target. @@ -156,9 +171,11 @@ class MAE(JavaValue): >>> mae = MAE() creating: createMAE """ + def __init__(self, bigdl_type="float"): JavaValue.__init__(self, None, bigdl_type) + class MaxIteration(JavaValue): """ A trigger specifies a timespot or several timespots during training, @@ -171,6 +188,7 @@ class MaxIteration(JavaValue): >>> maxIteration = MaxIteration(20) creating: createMaxIteration """ + def __init__(self, max, bigdl_type="float"): """ Create a MaxIteration trigger. @@ -193,6 +211,7 @@ class MaxEpoch(JavaValue): >>> maxEpoch = MaxEpoch(2) creating: createMaxEpoch """ + def __init__(self, max_epoch, bigdl_type="float"): """ Create a MaxEpoch trigger. @@ -215,6 +234,7 @@ class EveryEpoch(JavaValue): >>> everyEpoch = EveryEpoch() creating: createEveryEpoch """ + def __init__(self, bigdl_type="float"): """ Create a EveryEpoch trigger. @@ -235,6 +255,7 @@ class SeveralIteration(JavaValue): >>> serveralIteration = SeveralIteration(2) creating: createSeveralIteration """ + def __init__(self, interval, bigdl_type="float"): """ Create a SeveralIteration trigger. @@ -253,6 +274,7 @@ class MaxScore(JavaValue): >>> maxScore = MaxScore(0.4) creating: createMaxScore """ + def __init__(self, max, bigdl_type="float"): """ Create a MaxScore trigger. @@ -271,6 +293,7 @@ class MinLoss(JavaValue): >>> minLoss = MinLoss(0.1) creating: createMinLoss """ + def __init__(self, min, bigdl_type="float"): """ Create a MinLoss trigger. @@ -280,6 +303,7 @@ def __init__(self, min, bigdl_type="float"): """ JavaValue.__init__(self, None, bigdl_type, min) + class TriggerAnd(JavaValue): """ A trigger contains other triggers and triggers when all of them trigger (logical AND) @@ -290,6 +314,7 @@ class TriggerAnd(JavaValue): creating: createMaxEpoch creating: createTriggerAnd """ + def __init__(self, first, *other): """ Create a And trigger. @@ -300,6 +325,7 @@ def __init__(self, first, *other): """ JavaValue.__init__(self, None, "float", first, list(other)) + class TriggerOr(JavaValue): """ A trigger contains other triggers and triggers when any of them trigger (logical OR) @@ -310,6 +336,7 @@ class TriggerOr(JavaValue): creating: createMaxEpoch creating: createTriggerOr """ + def __init__(self, first, *other): """ Create a Or trigger. 
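A sketch of where the composed triggers plug in; illustration only, not part of this patch. optimizer and val_rdd are hypothetical placeholders, and set_validation is the existing Optimizer setter rather than something changed here:

    from bigdl.dllib.optim.optimizer import (EveryEpoch, SeveralIteration,
                                             Top1Accuracy, TriggerOr)

    # Validate whenever either condition fires: at the end of every epoch, or
    # every 1000 iterations, whichever comes first.
    optimizer.set_validation(batch_size=128,
                             val_rdd=val_rdd,
                             trigger=TriggerOr(EveryEpoch(), SeveralIteration(1000)),
                             val_method=[Top1Accuracy()])
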
@@ -320,6 +347,7 @@ def __init__(self, first, *other): """ JavaValue.__init__(self, None, "float", first, list(other)) + class Poly(JavaValue): """ A learning rate decay policy, where the effective learning rate @@ -333,8 +361,9 @@ class Poly(JavaValue): >>> poly = Poly(0.5, 2) creating: createPoly """ + def __init__(self, power, max_iteration, bigdl_type="float"): - JavaValue.__init__(self, None, bigdl_type, power, max_iteration) + JavaValue.__init__(self, None, bigdl_type, power, max_iteration) class Exponential(JavaValue): @@ -349,6 +378,7 @@ class Exponential(JavaValue): >>> exponential = Exponential(100, 0.1) creating: createExponential """ + def __init__(self, decay_step, decay_rate, stair_case=False, bigdl_type="float"): JavaValue.__init__(self, None, bigdl_type, decay_step, decay_rate, stair_case) @@ -366,8 +396,10 @@ class Step(JavaValue): >>> step = Step(2, 0.3) creating: createStep """ + def __init__(self, step_size, gamma, bigdl_type="float"): - JavaValue.__init__(self, None, bigdl_type, step_size, gamma) + JavaValue.__init__(self, None, bigdl_type, step_size, gamma) + class Default(JavaValue): """ @@ -380,6 +412,7 @@ class Default(JavaValue): >>> step = Default() creating: createDefault """ + def __init__(self, bigdl_type="float"): JavaValue.__init__(self, None, bigdl_type) @@ -405,6 +438,7 @@ class Plateau(JavaValue): >>> plateau = Plateau("score") creating: createPlateau """ + def __init__(self, monitor, factor=0.1, @@ -417,6 +451,7 @@ def __init__(self, JavaValue.__init__(self, None, bigdl_type, monitor, factor, patience, mode, epsilon, cooldown, min_lr) + class Warmup(JavaValue): """ A learning rate gradual increase policy, where the effective learning rate @@ -428,9 +463,11 @@ class Warmup(JavaValue): >>> warmup = Warmup(0.05) creating: createWarmup """ + def __init__(self, delta, bigdl_type="float"): JavaValue.__init__(self, None, bigdl_type, delta) + class SequentialSchedule(JavaValue): """ Stack several learning rate schedulers. 
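A sketch of how SequentialSchedule stacks the schedules above, used together with the add(scheduler, max_iteration) method shown in the following hunk; illustration only, not part of this patch, and the iteration counts are arbitrary example values:

    from bigdl.dllib.optim.optimizer import Poly, SGD, SequentialSchedule, Warmup

    # Warm up the learning rate for 200 iterations, then decay with a Poly
    # policy for the remaining iterations (100 iterations per epoch is an
    # arbitrary example value).
    schedule = SequentialSchedule(100)
    schedule.add(Warmup(delta=0.05), 200)
    schedule.add(Poly(power=0.5, max_iteration=10000), 10000)
    # Note the SGD constructor argument really is spelled leaningrate_schedule.
    optim_method = SGD(learningrate=0.01, leaningrate_schedule=schedule)
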
@@ -446,6 +483,7 @@ class SequentialSchedule(JavaValue): """ + def __init__(self, iteration_per_epoch, bigdl_type="float"): JavaValue.__init__(self, None, bigdl_type, iteration_per_epoch) @@ -458,11 +496,12 @@ def add(self, scheduler, max_iteration, bigdl_type="float"): """ return callBigDlFunc(bigdl_type, "addScheduler", self.value, scheduler, max_iteration) + class OptimMethod(JavaValue): def __init__(self, jvalue, bigdl_type, *args): if (jvalue): - assert(type(jvalue) == JavaObject) + assert (type(jvalue) == JavaObject) self.value = jvalue else: self.value = callBigDlFunc( @@ -483,9 +522,10 @@ def save(self, path, overWrite): :param path path :param overWrite whether to overwrite """ - method=self.value + method = self.value return callBigDlFunc(self.bigdl_type, "saveOptimMethod", method, path, overWrite) + class SGD(OptimMethod): """ A plain implementation of SGD @@ -502,6 +542,7 @@ class SGD(OptimMethod): creating: createDefault creating: createSGD """ + def __init__(self, learningrate=1e-3, learningrate_decay=0.0, @@ -514,9 +555,11 @@ def __init__(self, weightdecays=None, bigdl_type="float"): super(SGD, self).__init__(None, bigdl_type, learningrate, learningrate_decay, weightdecay, - momentum, dampening, nesterov, - leaningrate_schedule if (leaningrate_schedule) else Default(), - JTensor.from_ndarray(learningrates), JTensor.from_ndarray(weightdecays)) + momentum, dampening, nesterov, + leaningrate_schedule if (leaningrate_schedule) else Default(), + JTensor.from_ndarray(learningrates), + JTensor.from_ndarray(weightdecays)) + class Adagrad(OptimMethod): """ @@ -529,12 +572,15 @@ class Adagrad(OptimMethod): >>> adagrad = Adagrad() creating: createAdagrad """ + def __init__(self, learningrate=1e-3, learningrate_decay=0.0, weightdecay=0.0, bigdl_type="float"): - super(Adagrad, self).__init__(None, bigdl_type, learningrate, learningrate_decay, weightdecay) + super(Adagrad, self).__init__(None, bigdl_type, learningrate, learningrate_decay, + weightdecay) + class LBFGS(OptimMethod): """ @@ -560,6 +606,7 @@ class LBFGS(OptimMethod): >>> lbfgs = LBFGS() creating: createLBFGS """ + def __init__(self, max_iter=20, max_eval=DOUBLEMAX, @@ -574,7 +621,9 @@ def __init__(self, if linesearch or linesearch_options: raise ValueError('linesearch and linesearch_options must be None in LBFGS') super(LBFGS, self).__init__(None, bigdl_type, max_iter, max_eval, tolfun, tolx, - ncorrection, learningrate, verbose, linesearch, linesearch_options) + ncorrection, learningrate, verbose, linesearch, + linesearch_options) + class Adadelta(OptimMethod): """ @@ -585,12 +634,14 @@ class Adadelta(OptimMethod): >>> adagrad = Adadelta() creating: createAdadelta """ + def __init__(self, - decayrate = 0.9, - epsilon = 1e-10, + decayrate=0.9, + epsilon=1e-10, bigdl_type="float"): super(Adadelta, self).__init__(None, bigdl_type, decayrate, epsilon) + class Adam(OptimMethod): """ An implementation of Adam http://arxiv.org/pdf/1412.6980.pdf @@ -602,15 +653,17 @@ class Adam(OptimMethod): >>> adam = Adam() creating: createAdam """ + def __init__(self, - learningrate = 1e-3, - learningrate_decay = 0.0, - beta1 = 0.9, - beta2 = 0.999, - epsilon = 1e-8, + learningrate=1e-3, + learningrate_decay=0.0, + beta1=0.9, + beta2=0.999, + epsilon=1e-8, bigdl_type="float"): super(Adam, self).__init__(None, bigdl_type, learningrate, learningrate_decay, - beta1, beta2, epsilon) + beta1, beta2, epsilon) + class ParallelAdam(OptimMethod): """ @@ -624,18 +677,20 @@ class ParallelAdam(OptimMethod): >>> pAdam = ParallelAdam() creating: 
createParallelAdam """ + def __init__(self, - learningrate = 1e-3, - learningrate_decay = 0.0, - beta1 = 0.9, - beta2 = 0.999, - epsilon = 1e-8, - parallel_num = -1, + learningrate=1e-3, + learningrate_decay=0.0, + beta1=0.9, + beta2=0.999, + epsilon=1e-8, + parallel_num=-1, bigdl_type="float"): if parallel_num == -1: parallel_num = get_node_and_core_number()[1] super(ParallelAdam, self).__init__(None, bigdl_type, learningrate, learningrate_decay, - beta1, beta2, epsilon, parallel_num) + beta1, beta2, epsilon, parallel_num) + class Ftrl(OptimMethod): """ @@ -656,13 +711,14 @@ class Ftrl(OptimMethod): >>> ftrl2 = Ftrl(1e-2, -0.1, 0.2, 0.3, 0.4, 0.5) creating: createFtrl """ + def __init__(self, - learningrate = 1e-3, - learningrate_power = -0.5, - initial_accumulator_value = 0.1, - l1_regularization_strength = 0.0, - l2_regularization_strength = 0.0, - l2_shrinkage_regularization_strength = 0.0, + learningrate=1e-3, + learningrate_power=-0.5, + initial_accumulator_value=0.1, + l1_regularization_strength=0.0, + l2_regularization_strength=0.0, + l2_shrinkage_regularization_strength=0.0, bigdl_type="float"): super(Ftrl, self).__init__(None, bigdl_type, learningrate, learningrate_power, initial_accumulator_value, @@ -670,6 +726,7 @@ def __init__(self, l2_regularization_strength, l2_shrinkage_regularization_strength) + class Adamax(OptimMethod): """ An implementation of Adamax http://arxiv.org/pdf/1412.6980.pdf @@ -680,14 +737,16 @@ class Adamax(OptimMethod): >>> adagrad = Adamax() creating: createAdamax """ + def __init__(self, - learningrate = 0.002, - beta1 = 0.9, - beta2 = 0.999, - epsilon = 1e-38, + learningrate=0.002, + beta1=0.9, + beta2=0.999, + epsilon=1e-38, bigdl_type="float"): super(Adamax, self).__init__(None, bigdl_type, learningrate, beta1, beta2, epsilon) + class RMSprop(OptimMethod): """ An implementation of RMSprop @@ -698,13 +757,16 @@ class RMSprop(OptimMethod): >>> adagrad = RMSprop() creating: createRMSprop """ + def __init__(self, - learningrate = 1e-2, - learningrate_decay = 0.0, - decayrate = 0.99, - epsilon = 1e-8, + learningrate=1e-2, + learningrate_decay=0.0, + decayrate=0.99, + epsilon=1e-8, bigdl_type="float"): - super(RMSprop, self).__init__(None, bigdl_type, learningrate, learningrate_decay, decayrate, epsilon) + super(RMSprop, self).__init__(None, bigdl_type, learningrate, learningrate_decay, decayrate, + epsilon) + class MultiStep(JavaValue): """ @@ -718,6 +780,7 @@ class MultiStep(JavaValue): >>> step = MultiStep([2, 5], 0.3) creating: createMultiStep """ + def __init__(self, step_sizes, gamma, bigdl_type="float"): JavaValue.__init__(self, None, bigdl_type, step_sizes, gamma) @@ -894,7 +957,8 @@ def create(model, e.g. SGD, Adagrad, etc. If optim_method is None, the default algorithm is SGD. :param end_trigger: when to end the optimization. default value is MapEpoch(1) :param batch_size: training batch size - :param cores: This is for local optimizer only and use total physical cores as the default value + :param cores: This is for local optimizer only and use total physical cores as the default + value """ if not end_trigger: end_trigger = MaxEpoch(1) @@ -949,8 +1013,7 @@ def set_traindata(self, training_rdd, batch_size): :return: """ callBigDlFunc(self.bigdl_type, "setTrainData", self.value, - training_rdd, batch_size) - + training_rdd, batch_size) class DistriOptimizer(Optimizer): @@ -1008,6 +1071,7 @@ class LocalOptimizer(BaseOptimizer): :param batch_size: training batch size :param cores: by default is the total physical cores. 
""" + def __init__(self, X, Y, @@ -1067,6 +1131,7 @@ class TrainSummary(JavaValue, ): Use optimizer.setTrainSummary to enable train logger. """ + def __init__(self, log_dir, app_name, bigdl_type="float"): """ Create a TrainSummary. Logs will be saved to log_dir/app_name/train. @@ -1093,7 +1158,13 @@ def set_summary_trigger(self, name, trigger): Set the interval of recording for each indicator. - :param tag: tag name. Supported tag names are "LearningRate", "Loss","Throughput", "Parameters". "Parameters" is an umbrella tag thatincludes weight, bias, gradWeight, gradBias, and some running status(eg. runningMean and runningVar in BatchNormalization). If youdidn't set any triggers, we will by default record Loss and Throughputin each iteration, while *NOT* recording LearningRate and Parameters,as recording parameters may introduce substantial overhead when themodel is very big, LearningRate is not a public attribute for allOptimMethod. + :param tag: tag name. Supported tag names are "LearningRate", "Loss","Throughput", + "Parameters". "Parameters" is an umbrella tag thatincludes weight, bias, gradWeight, + gradBias, and some running status(eg. runningMean and runningVar in BatchNormalization). + If youdidn't set any triggers, we will by default record Loss and Throughputin each + iteration, while *NOT* recording LearningRate and Parameters,as recording parameters may + introduce substantial overhead when themodel is very big, LearningRate is not a public + attribute for allOptimMethod. :param trigger: trigger """ return callBigDlFunc(self.bigdl_type, "summarySetTrigger", self.value, @@ -1116,6 +1187,7 @@ class ValidationSummary(JavaValue): Use optimizer.setValidationSummary to enable validation logger. """ + def __init__(self, log_dir, app_name, bigdl_type="float"): """ Create a ValidationSummary. Logs will be saved to @@ -1136,7 +1208,8 @@ def read_scalar(self, tag): by default. - :param tag: the type of the logs. The tag should match the name ofthe ValidationMethod set into the optimizer. e.g."Top1AccuracyLoss","Top1Accuracy" or "Top5Accuracy". + :param tag: the type of the logs. The tag should match the name ofthe ValidationMethod set + into the optimizer. e.g."Top1AccuracyLoss","Top1Accuracy" or "Top5Accuracy". 
""" return callBigDlFunc(self.bigdl_type, "summaryReadScalar", self.value, tag) @@ -1150,9 +1223,11 @@ class L1L2Regularizer(JavaValue): :param l2 l2 regularization rate """ + def __init__(self, l1, l2, bigdl_type="float"): JavaValue.__init__(self, None, bigdl_type, l1, l2) + class ActivityRegularization(JavaValue): """ Apply both L1 and L2 regularization @@ -1161,9 +1236,11 @@ class ActivityRegularization(JavaValue): :param l2 l2 regularization rate """ + def __init__(self, l1, l2, bigdl_type="float"): JavaValue.__init__(self, None, bigdl_type, l1, l2) + class L1Regularizer(JavaValue): """ Apply L1 regularization @@ -1171,6 +1248,7 @@ class L1Regularizer(JavaValue): :param l1 l1 regularization rate """ + def __init__(self, l1, bigdl_type="float"): JavaValue.__init__(self, None, bigdl_type, l1) @@ -1182,6 +1260,7 @@ class L2Regularizer(JavaValue): :param l2 l2 regularization rate """ + def __init__(self, l2, bigdl_type="float"): JavaValue.__init__(self, None, bigdl_type, l2) @@ -1202,5 +1281,6 @@ def _test(): if failure_count: exit(-1) + if __name__ == "__main__": _test() diff --git a/python/dllib/src/bigdl/dllib/utils/bigdl_export.py b/python/dllib/src/bigdl/dllib/utils/bigdl_export.py index bbef00754f0..55932c72681 100644 --- a/python/dllib/src/bigdl/dllib/utils/bigdl_export.py +++ b/python/dllib/src/bigdl/dllib/utils/bigdl_export.py @@ -23,7 +23,6 @@ from tensorflow.python.util import tf_decorator from tensorflow.python.util import tf_inspect - Keras_API_NAME = 'keras' _Attributes = collections.namedtuple( @@ -37,8 +36,8 @@ class api_export(object): # pylint: disable=invalid-name - def __init__(self, *args, **kwargs): # pylint: disable=g-doc-args - """Export under the names *args (first one is considered canonical). + def __init__(self, *args, **kwargs): # pylint: disable=g-doc-args + """Export under the names *args (first one is considered canonical). Args: *args: API names in dot delimited format. @@ -46,23 +45,22 @@ def __init__(self, *args, **kwargs): # pylint: disable=g-doc-args api_name: Name of the API you want to generate (e.g. `tensorflow` or `estimator`). Default is `keras`. 
""" - self._names = args - self._api_name = kwargs.get('api_name', Keras_API_NAME) - + self._names = args + self._api_name = kwargs.get('api_name', Keras_API_NAME) - def __call__(self, func): - api_names_attr = API_ATTRS[self._api_name].names + def __call__(self, func): + api_names_attr = API_ATTRS[self._api_name].names - _, undecorated_func = tf_decorator.unwrap(func) - self.set_attr(undecorated_func, api_names_attr, self._names) + _, undecorated_func = tf_decorator.unwrap(func) + self.set_attr(undecorated_func, api_names_attr, self._names) - for name in self._names: - _NAME_TO_SYMBOL_MAPPING[name] = func - sys.modules[name] = func - return func + for name in self._names: + _NAME_TO_SYMBOL_MAPPING[name] = func + sys.modules[name] = func + return func - def set_attr(self, func, api_names_attr, names): - setattr(func, api_names_attr, names) + def set_attr(self, func, api_names_attr, names): + setattr(func, api_names_attr, names) keras_export = functools.partial(api_export, api_name=Keras_API_NAME) diff --git a/python/dllib/src/bigdl/dllib/utils/common.py b/python/dllib/src/bigdl/dllib/utils/common.py index 837b0571a9e..0e0e25567ac 100644 --- a/python/dllib/src/bigdl/dllib/utils/common.py +++ b/python/dllib/src/bigdl/dllib/utils/common.py @@ -64,7 +64,7 @@ def __init__(self, bigdl_type, port=25333): class JavaCreator(SingletonMixin): - __creator_class=[ + __creator_class = [ "com.intel.analytics.bigdl.dllib.utils.python.api.PythonBigDLKeras", "com.intel.analytics.bigdl.dllib.utils.python.api.PythonBigDLOnnx", "com.intel.analytics.bigdl.dllib.common.PythonZoo", @@ -128,6 +128,7 @@ class EvaluatedResult(): """ A testing result used to benchmark the model quality. """ + def __init__(self, result, total_num, method): """ @@ -167,6 +168,7 @@ class JTensor(object): >>> np.random.seed(123) >>> """ + def __init__(self, storage, shape, bigdl_type="float", indices=None): """ @@ -188,7 +190,7 @@ def __init__(self, storage, shape, bigdl_type="float", indices=None): self.indices = np.frombuffer(indices, dtype=np.int32) else: assert isinstance(indices, np.ndarray), \ - "indices should be a np.ndarray, not %s, %s" % (type(a_ndarray), str(indices)) + "indices should be a np.ndarray, not %s, %s" % (type(a_ndarray), str(indices)) self.indices = np.array(indices, dtype=np.int32) self.bigdl_type = bigdl_type @@ -203,7 +205,8 @@ def from_ndarray(cls, a_ndarray, bigdl_type="float"): >>> np.random.seed(123) >>> data = np.random.uniform(0, 1, (2, 3)).astype("float32") >>> result = JTensor.from_ndarray(data) - >>> expected_storage = np.array([[0.69646919, 0.28613934, 0.22685145], [0.55131477, 0.71946895, 0.42310646]]) + >>> expected_storage = np.array([[0.69646919, 0.28613934, 0.22685145], [0.55131477, + ... 0.71946895, 0.42310646]]) >>> expected_shape = np.array([2, 3]) >>> np.testing.assert_allclose(result.storage, expected_storage, rtol=1e-6, atol=1e-6) >>> np.testing.assert_allclose(result.shape, expected_shape) @@ -280,7 +283,8 @@ def sparse(cls, a_ndarray, i_ndarray, shape, bigdl_type="float"): def to_ndarray(self): """ Transfer JTensor to ndarray. - As SparseTensor may generate an very big ndarray, so we don't support this function for SparseTensor. + As SparseTensor may generate an very big ndarray, so we don't support this function for + SparseTensor. 
:return: a ndarray """ assert self.indices is None, "sparseTensor to ndarray is not supported" @@ -290,14 +294,16 @@ def __reduce__(self): if self.indices is None: return JTensor, (self.storage.tostring(), self.shape.tostring(), self.bigdl_type) else: - return JTensor, (self.storage.tostring(), self.shape.tostring(), self.bigdl_type, self.indices.tostring()) + return JTensor, (self.storage.tostring(), self.shape.tostring(), self.bigdl_type, + self.indices.tostring()) def __str__(self): return self.__repr__() def __repr__(self): indices = "" if self.indices is None else " ,indices %s" % str(self.indices) - return "JTensor: storage: %s, shape: %s%s, %s" % (str(self.storage), str(self.shape), indices, self.bigdl_type) + return "JTensor: storage: %s, shape: %s%s, %s" % ( + str(self.storage), str(self.shape), indices, self.bigdl_type) class Sample(object): @@ -326,12 +332,15 @@ def from_ndarray(cls, features, labels, bigdl_type="float"): >>> from bigdl.dllib.utils.common import callBigDlFunc >>> from numpy.testing import assert_allclose >>> np.random.seed(123) - >>> sample = Sample.from_ndarray(np.random.random((2,3)), np.random.random((2,3))) - >>> expected_feature_storage = np.array(([[0.69646919, 0.28613934, 0.22685145], [0.55131477, 0.71946895, 0.42310646]])) + >>> sample = Sample.from_ndarray(np.random.random((2,3)), np.random.random((2,3))) + >>> expected_feature_storage = np.array(([[0.69646919, 0.28613934, 0.22685145], [0.55131477, + ... 0.71946895, 0.42310646]])) >>> expected_feature_shape = np.array([2, 3]) - >>> expected_label_storage = np.array(([[0.98076421, 0.68482971, 0.48093191], [0.39211753, 0.343178, 0.72904968]])) + >>> expected_label_storage = np.array(([[0.98076421, 0.68482971, 0.48093191], [0.39211753, + ... 0.343178, 0.72904968]])) >>> expected_label_shape = np.array([2, 3]) - >>> assert_allclose(sample.features[0].storage, expected_feature_storage, rtol=1e-6, atol=1e-6) + >>> assert_allclose(sample.features[0].storage, expected_feature_storage, rtol=1e-6, + ... 
atol=1e-6) >>> assert_allclose(sample.features[0].shape, expected_feature_shape) >>> assert_allclose(sample.labels[0].storage, expected_label_storage, rtol=1e-6, atol=1e-6) >>> assert_allclose(sample.labels[0].shape, expected_label_shape) @@ -395,10 +404,12 @@ def __str__(self): def __repr__(self): return "Sample: features: %s, labels: %s" % (self.features, self.labels) + class RNG(): """ generate tensor data with seed """ + def __init__(self, bigdl_type="float"): self.bigdl_type = bigdl_type @@ -406,7 +417,7 @@ def set_seed(self, seed): callBigDlFunc(self.bigdl_type, "setModelSeed", seed) def uniform(self, a, b, size): - return callBigDlFunc(self.bigdl_type, "uniform", a, b, size).to_ndarray() # noqa + return callBigDlFunc(self.bigdl_type, "uniform", a, b, size).to_ndarray() # noqa _picklable_classes = [ @@ -428,15 +439,19 @@ def init_engine(bigdl_type="float"): # Spark context is supposed to have been created when init_engine is called get_spark_context()._jvm.org.apache.spark.bigdl.api.python.BigDLSerDe.initialize() + def get_bigdl_engine_type(bigdl_type="float"): return callBigDlFunc(bigdl_type, "getEngineType") + def set_optimizer_version(optimizerVersion, bigdl_type="float"): return callBigDlFunc(bigdl_type, "setOptimizerVersion", optimizerVersion) + def get_optimizer_version(bigdl_type="float"): return callBigDlFunc(bigdl_type, "getOptimizerVersion") + def init_executor_gateway(sc, bigdl_type="float"): callBigDlFunc(bigdl_type, "initExecutorGateway", sc, sc._gateway._gateway_client.port) @@ -446,11 +461,12 @@ def get_node_and_core_number(bigdl_type="float"): return result[0], result[1] -def redire_spark_logs(bigdl_type="float", log_path=os.getcwd()+"/bigdl.log"): +def redire_spark_logs(bigdl_type="float", log_path=os.getcwd() + "/bigdl.log"): """ Redirect spark logs to the specified path. :param bigdl_type: "double" or "float" - :param log_path: the file path to be redirected to; the default file is under the current workspace named `bigdl.log`. + :param log_path: the file path to be redirected to; the default file is under the current + workspace named `bigdl.log`. 
""" callBigDlFunc(bigdl_type, "redirectSparkLogs", log_path) @@ -473,12 +489,14 @@ def load_conf(conf_str): for p in sys.path: if bigdl_conf_file in p and os.path.isfile(p): - with open(p) if sys.version_info < (3,) else open(p, encoding='latin-1') as conf_file: # noqa + with open(p) if sys.version_info < (3,) else open(p, + encoding='latin-1') as conf_file: + # noqa return load_conf(conf_file.read()) if bigdl_python_wrapper in p and os.path.isfile(p): import zipfile with zipfile.ZipFile(p, 'r') as zip_conf: - if bigdl_conf_file in zip_conf.namelist(): + if bigdl_conf_file in zip_conf.namelist(): content = zip_conf.read(bigdl_conf_file) if sys.version_info >= (3,): content = str(content, 'latin-1') @@ -527,7 +545,8 @@ def create_spark_conf(): if python_lib: existing_py_files = sparkConf.get("spark.submit.pyFiles") if existing_py_files: - sparkConf.set(key="spark.submit.pyFiles", value="%s,%s" % (python_lib, existing_py_files)) + sparkConf.set(key="spark.submit.pyFiles", + value="%s,%s" % (python_lib, existing_py_files)) else: sparkConf.set(key="spark.submit.pyFiles", value=python_lib) diff --git a/python/dllib/src/bigdl/dllib/utils/engine.py b/python/dllib/src/bigdl/dllib/utils/engine.py index 4107f29e517..a8f745b2dfb 100644 --- a/python/dllib/src/bigdl/dllib/utils/engine.py +++ b/python/dllib/src/bigdl/dllib/utils/engine.py @@ -37,7 +37,8 @@ def check_spark_source_conflict(spark_home, pyspark_path): # trigger a warning if two spark sources don't match if spark_home and not pyspark_path.startswith(spark_home): warning_msg = "Find both SPARK_HOME and pyspark. You may need to check whether they " + \ - "match with each other. SPARK_HOME environment variable is set to: " + spark_home + \ + "match with each other. SPARK_HOME environment variable is set to: " +\ + spark_home + \ ", and pyspark is found in: " + pyspark_path + ". If they are unmatched, " + \ "please use one source only to avoid conflict. " + \ "For example, you can unset SPARK_HOME and use pyspark only." @@ -50,7 +51,6 @@ def __sys_path_insert(file_path): sys.path.insert(0, file_path) - def __prepare_spark_env(): spark_home = os.environ.get('SPARK_HOME', None) if exist_pyspark(): @@ -115,6 +115,7 @@ def get_bigdl_classpath(): jar_paths = get_bigdl_jars() return ":".join(jar_paths) + def get_bigdl_jars(): """ Get and return the jar path for bigdl if exists. @@ -123,22 +124,24 @@ def get_bigdl_jars(): jar_paths = glob.glob(os.path.join(jar_dir, "share/*/lib/*.jar")) return jar_paths + def get_bigdl_conf(): jar_dir = os.path.abspath(__file__ + "/../../../") conf_paths = glob.glob(os.path.join(jar_dir, "share/*/conf/*.conf")) return conf_paths[0] + def is_spark_below_2_2(): """ Check if spark version is below 2.2 """ import pyspark - if(hasattr(pyspark,"version")): + if (hasattr(pyspark, "version")): full_version = pyspark.version.__version__ # We only need the general spark version (eg, 1.6, 2.2). parts = full_version.split(".") spark_version = parts[0] + "." + parts[1] - if(compare_version(spark_version, "2.2")>=0): + if (compare_version(spark_version, "2.2") >= 0): return False return True @@ -148,7 +151,8 @@ def compare_version(version1, version2): Compare version strings. :param version1; :param version2; - :return: 1 if version1 is after version2; -1 if version1 is before version2; 0 if two versions are the same. + :return: 1 if version1 is after version2; -1 if version1 is before version2; 0 if two versions + are the same. 
""" v1Arr = version1.split(".") v2Arr = version2.split(".") diff --git a/python/dllib/src/bigdl/dllib/utils/spark.py b/python/dllib/src/bigdl/dllib/utils/spark.py index 7298d45d919..690172b52eb 100644 --- a/python/dllib/src/bigdl/dllib/utils/spark.py +++ b/python/dllib/src/bigdl/dllib/utils/spark.py @@ -22,7 +22,8 @@ from pyspark import SparkContext from bigdl.dllib.nncontext import init_internal_nncontext, init_spark_conf from bigdl.dllib.utils.utils import detect_python_location, pack_penv, get_node_ip -from bigdl.dllib.utils.utils import get_executor_conda_zoo_classpath, get_zoo_bigdl_classpath_on_driver +from bigdl.dllib.utils.utils import get_executor_conda_zoo_classpath +from bigdl.dllib.utils.utils import get_zoo_bigdl_classpath_on_driver from bigdl.dllib.utils.engine import get_bigdl_jars @@ -46,7 +47,7 @@ def create_sc(self, submit_args, conf): os.environ["PYSPARK_SUBMIT_ARGS"] = submit_args spark_conf = init_spark_conf(conf) sc = init_internal_nncontext(conf=spark_conf, spark_log_level=self.spark_log_level, - redirect_spark_log=self.redirect_spark_log) + redirect_spark_log=self.redirect_spark_log) return sc def init_spark_on_local(self, cores, conf=None, python_location=None): @@ -57,7 +58,7 @@ def init_spark_on_local(self, cores, conf=None, python_location=None): master = "local[{}]".format(cores) zoo_conf = init_spark_conf(conf).setMaster(master) sc = init_internal_nncontext(conf=zoo_conf, spark_log_level=self.spark_log_level, - redirect_spark_log=self.redirect_spark_log) + redirect_spark_log=self.redirect_spark_log) print("Successfully got a SparkContext") return sc @@ -131,21 +132,21 @@ def init_spark_on_yarn(self, return sc def init_spark_on_yarn_cluster(self, - hadoop_conf, - conda_name, - num_executors, - executor_cores, - executor_memory="2g", - driver_cores=4, - driver_memory="1g", - extra_executor_memory_for_ray=None, - extra_python_lib=None, - penv_archive=None, - additional_archive=None, - hadoop_user_name="root", - spark_yarn_archive=None, - conf=None, - jars=None): + hadoop_conf, + conda_name, + num_executors, + executor_cores, + executor_memory="2g", + driver_cores=4, + driver_memory="1g", + extra_executor_memory_for_ray=None, + extra_python_lib=None, + penv_archive=None, + additional_archive=None, + hadoop_user_name="root", + spark_yarn_archive=None, + conf=None, + jars=None): print("Initializing job for yarn-cluster mode") executor_python_env = "python_env" os.environ["HADOOP_CONF_DIR"] = hadoop_conf @@ -173,7 +174,8 @@ def init_spark_on_yarn_cluster(self, conf = enrich_conf_for_spark(conf, driver_cores, driver_memory, num_executors, executor_cores, executor_memory, extra_executor_memory_for_ray) - conf["spark.yarn.appMasterEnv.PYSPARK_PYTHON"] = "{}/bin/python".format(executor_python_env) + conf["spark.yarn.appMasterEnv.PYSPARK_PYTHON"] = "{}/bin/python".format( + executor_python_env) conf["spark.yarn.appMasterEnv.OnAppMaster"] = "True" conf["spark.yarn.appMasterEnv.PYTHONHOME"] = executor_python_env conf["spark.executorEnv.PYSPARK_PYTHON"] = "{}/bin/python".format(executor_python_env) @@ -336,8 +338,7 @@ def init_spark_on_k8s(self, executor_cores, executor_memory, extra_executor_memory_for_ray) py_version = ".".join(platform.python_version().split(".")[0:2]) preload_so = python_env + "/lib/libpython" + py_version + "m.so" - ld_path = python_env + "/lib:" + python_env + "/lib/python" +\ - py_version + "/lib-dynload" + ld_path = python_env + "/lib:" + python_env + "/lib/python" + py_version + "/lib-dynload" if "spark.executor.extraLibraryPath" in conf: ld_path = 
"{}:{}".format(ld_path, conf["spark.executor.extraLibraryPath"]) conf.update({"spark.cores.max": num_executors * executor_cores, diff --git a/python/dllib/src/bigdl/dllib/utils/tf.py b/python/dllib/src/bigdl/dllib/utils/tf.py index e838829c36d..2ad827abb1d 100644 --- a/python/dllib/src/bigdl/dllib/utils/tf.py +++ b/python/dllib/src/bigdl/dllib/utils/tf.py @@ -21,8 +21,8 @@ import tempfile import re import shutil -from bigdl.dllib.utils.file_utils import put_local_file_to_remote, get_remote_file_to_local, get_file_list,\ - is_local_path +from bigdl.dllib.utils.file_utils import put_local_file_to_remote, get_remote_file_to_local,\ + get_file_list, is_local_path def process_grad(grad): diff --git a/python/dllib/src/bigdl/dllib/utils/tf_utils.py b/python/dllib/src/bigdl/dllib/utils/tf_utils.py index 72b348c2338..ae41d9fb795 100644 --- a/python/dllib/src/bigdl/dllib/utils/tf_utils.py +++ b/python/dllib/src/bigdl/dllib/utils/tf_utils.py @@ -31,8 +31,8 @@ from bigdl.dllib.utils.common import callBigDlFunc import os -def get_path(output_name, sess=None): +def get_path(output_name, sess=None): if sess is None: sess = tf.Session() init = tf.global_variables_initializer() @@ -51,7 +51,6 @@ def get_path(output_name, sess=None): return temp + '/model.pb' - def convert(input_ops, output_ops, byte_order, bigdl_type): """ Convert tensorflow model to bigdl model @@ -90,7 +89,7 @@ def export_checkpoint(checkpoint_path): reader = tf.train.NewCheckpointReader(checkpoint_path) # Get tensor name list - tensor_names = filter(lambda n: n!='global_step', + tensor_names = filter(lambda n: n != 'global_step', reader.get_variable_to_shape_map().keys()) # Prepare key-value dictionary tensors = {} @@ -117,15 +116,15 @@ def save_variable_bigdl(tensors, target_path, bigdl_type="float"): else: value = tensors[tn] jtensors[tn] = JTensor.from_ndarray(value) - + callBigDlFunc(bigdl_type, "saveTensorDictionary", jtensors, target_path) def dump_model(path, graph=None, sess=None, ckpt_file=None, bigdl_type="float"): """ - Dump a tensorflow model to files. The graph will be dumped to path/model.pb, and the checkpoint will - be dumped to path/model.bin - + Dump a tensorflow model to files. 
The graph will be dumped to path/model.pb, and the checkpoint + will be dumped to path/model.bin + :param path: dump folder path :param sess: if user pass in session, we assume that the variable of the graph in the session has been inited diff --git a/python/dllib/src/bigdl/dllib/utils/utils.py b/python/dllib/src/bigdl/dllib/utils/utils.py index 61953e289dc..013f6600dc7 100644 --- a/python/dllib/src/bigdl/dllib/utils/utils.py +++ b/python/dllib/src/bigdl/dllib/utils/utils.py @@ -25,6 +25,7 @@ long = int unicode = str + def to_sample_rdd(x, y, sc, num_slices=None): """ Convert x and y into RDD[Sample] @@ -109,7 +110,7 @@ def pack_conda_main(conda_name, tmp_path): pack_env = os.environ.copy() if "PYTHONHOME" in pack_env: pack_env.pop("PYTHONHOME") - pack_cmd = "conda pack --format tar.gz --n-threads 8 -f -n {} -o {}"\ + pack_cmd = "conda pack --format tar.gz --n-threads 8 -f -n {} -o {}" \ .format(conda_name, tmp_path) pro = subprocess.Popen(pack_cmd, shell=True, env=pack_env) if pro.wait() != 0: @@ -138,14 +139,15 @@ def get_executor_conda_zoo_classpath(conda_path): from bigdl.dllib.utils.engine import get_bigdl_jars bigdl_jars = get_bigdl_jars() python_interpreter_name = get_conda_python_path().split("/")[-1] # Python version - prefix = "{}/lib/{}/site-packages/"\ + prefix = "{}/lib/{}/site-packages/" \ .format(conda_path, python_interpreter_name) - executor_classpath=[] + executor_classpath = [] for jar_path in list(bigdl_jars): postfix = "/".join(jar_path.split("/")[-5:]) executor_classpath.append("{}/{}".format(prefix, postfix)) return executor_classpath + def get_zoo_bigdl_classpath_on_driver(): from bigdl.dllib.utils.engine import get_bigdl_classpath bigdl_classpath = get_bigdl_classpath() diff --git a/python/dllib/src/bigdl/dllib/utils/zoo_engine.py b/python/dllib/src/bigdl/dllib/utils/zoo_engine.py index 9823b65dfd9..715d30968d8 100644 --- a/python/dllib/src/bigdl/dllib/utils/zoo_engine.py +++ b/python/dllib/src/bigdl/dllib/utils/zoo_engine.py @@ -114,7 +114,7 @@ def get_analytics_zoo_classpath(): # check jar path or jars dir path that is ended with "jars/*" if not os.path.exists(path) and not os.path.exists(path.split("*")[0]): raise ValueError("Path {} specified BIGDL_CLASSPATH does not exist." 
- .format(path)) + .format(path)) return os.environ["BIGDL_CLASSPATH"] jar_dir = os.path.abspath(__file__ + "/../../../") jar_paths = glob.glob(os.path.join(jar_dir, "share/orca/lib/*.jar")) diff --git a/python/dllib/src/setup.py b/python/dllib/src/setup.py index 94134a7213b..42673b4d1f1 100755 --- a/python/dllib/src/setup.py +++ b/python/dllib/src/setup.py @@ -38,6 +38,7 @@ python setup.py sdist pip install dist/*.tar.gz""" + def build_from_source(): code_path = bigdl_home + "/python/dllib/src/bigdl/dllib/utils/common.py" print("Checking: %s to see if build from source" % code_path) @@ -57,10 +58,12 @@ def init_env(): if os.path.exists(TEMP_PATH): rmtree(TEMP_PATH) copytree(dist_source, TEMP_PATH) - copyfile(bigdl_home + "/python/dllib/src/bigdl/dllib/nn/__init__.py", TEMP_PATH + "/__init__.py") + copyfile(bigdl_home + "/python/dllib/src/bigdl/dllib/nn/__init__.py", + TEMP_PATH + "/__init__.py") else: print("Do nothing for release installation") + def get_bigdl_packages(): bigdl_python_home = os.path.abspath(__file__)[:-8] bigdl_packages = ['bigdl.share.dllib'] @@ -73,11 +76,12 @@ def get_bigdl_packages(): print("================================================================") return bigdl_packages + def setup_package(): SCRIPTS_TARGET = os.path.join("../../../", "scripts/") script_names = ["pyspark-with-dllib", "spark-submit-with-dllib"] scripts = list(map(lambda script: os.path.join( - SCRIPTS_TARGET, script), script_names)) + SCRIPTS_TARGET, script), script_names)) metadata = dict( name='bigdl-dllib', version=VERSION, @@ -112,5 +116,4 @@ def setup_package(): raise e finally: if build_from_source() and os.path.exists(TEMP_PATH): - rmtree(TEMP_PATH) - + rmtree(TEMP_PATH)