dmlc
diff --git a/‎.github/workflows/unittests.yml
+1-1 b/‎.github/workflows/unittests.yml
+1-1
diff --git a/‎README.md
+10-13 b/‎README.md
+10-13
diff --git a/‎docs/_static/custom.css
+7-5 b/‎docs/_static/custom.css
+7-5
diff --git a/‎scripts/benchmarks/README.md
+45 b/‎scripts/benchmarks/README.md
+45
diff --git a/‎scripts/benchmarks/benchmark_gluonnlp.py
+130 b/‎scripts/benchmarks/benchmark_gluonnlp.py
+130
diff --git a/‎scripts/benchmarks/benchmark_gluonnlp.sh
+14 b/‎scripts/benchmarks/benchmark_gluonnlp.sh
+14
@@ -35,7 +35,7 @@ jobs:
           python -m pip install --user --upgrade pip
           python -m pip install --user setuptools pytest pytest-cov contextvars
           python -m pip install --upgrade cython
-          python -m pip install --pre --user "mxnet>=2.0.0b20200716" -f https://dist.mxnet.io/python
+          python -m pip install --pre --user "mxnet>=2.0.0b20200802" -f https://dist.mxnet.io/python
           python -m pip install --user -e .[extras]
       - name: Test project
         run: |
 
@@ -20,35 +20,32 @@ First of all, install the latest MXNet. You may use the following commands:
 
 ```bash
 # Install the version with CUDA 10.0
-pip install -U --pre "mxnet-cu100>=2.0.0b20200802" -f https://dist.mxnet.io/python
+python3 -m pip install -U --pre "mxnet-cu100>=2.0.0b20200802" -f https://dist.mxnet.io/python
 
 # Install the version with CUDA 10.1
-pip install -U --pre "mxnet-cu101>=2.0.0b20200802" -f https://dist.mxnet.io/python
+python3 -m pip install -U --pre "mxnet-cu101>=2.0.0b20200802" -f https://dist.mxnet.io/python
 
 # Install the version with CUDA 10.2
-pip install -U --pre "mxnet-cu102>=2.0.0b20200802" -f https://dist.mxnet.io/python
+python3 -m pip install -U --pre "mxnet-cu102>=2.0.0b20200802" -f https://dist.mxnet.io/python
 
 # Install the cpu-only version
-pip install -U --pre "mxnet>=2.0.0b20200802" -f https://dist.mxnet.io/python
+python3 -m pip install -U --pre "mxnet>=2.0.0b20200802" -f https://dist.mxnet.io/python
 ```
 
 
-To install, use
+To install GluonNLP, use
 
 ```bash
-pip install -U -e .
+python3 -m pip install -U -e .
 
 # Also, you may install all the extra requirements via
-pip install -U -e .[extras]
-
-# In case you are using zsh, try to use the following command for installing
-pip install -U -e ."[extras]"
+python3 -m pip install -U -e ."[extras]"
 ```
 
 If you find that you do not have the permission, you can also install to the user folder:
 
 ```bash
-pip install -U -e . --user
+python3 -m pip install -U -e . --user
 ```
 
 For Windows users, we recommend using the [Windows Subsystem for Linux](https://docs.microsoft.com/en-us/windows/wsl/about).
@@ -68,8 +65,8 @@ nlp_data help
 nlp_preprocess help
 
 # Also, you can use `python -m` to access the toolkits
-python -m gluonnlp.cli.data help
-python -m gluonnlp.cli.preprocess help
+python3 -m gluonnlp.cli.data help
+python3 -m gluonnlp.cli.preprocess help
 
 ```
 
 
@@ -20,9 +20,11 @@
 }
 
 @media (max-width: 650px) {
-.install .option, .install .title {
-    width: 90%;
-}
-.install .title {
-    margin-top: 1em;
+    .install .option, .install .title {
+        width: 90%;
+    }
+
+    .install .title {
+        margin-top: 1em;
+    }
 }
@@ -0,0 +1,45 @@
+# Benchmarking the Performance of NLP Backbones
+
+We benchmark the latency and peak memory usage of a single training (forward + backward) and inference (forward-only) step 
+of the NLP backbones.
+For comparison, we also provide the corresponding numbers for the same models in HuggingFace Transformers.
+
+## Backbones in HuggingFace
+
+We use the [huggingface benchmark](https://github.com/huggingface/transformers/tree/master/examples/benchmarking) 
+to benchmark the training + inference speed of common workloads in NLP. 
+
+```bash
+python3 -m pip install -U -r requirements.txt --user
+python3 benchmark_hf.py
+```
+
+It will generate a list of csv files:
+
+```
+├── pytorch_train_fp32.csv
+├── pytorch_train_fp16.csv
+├── pytorch_infer_fp32.csv
+├── pytorch_infer_fp16.csv
+├── pytorch_infer_fp32_ts.csv
+```
+
+## GluonNLP Backbones based on MXNet-2.0
+
+We profile three options: `NT` layout, `NT` layout with `TN` layout as the compute layout,
+and `TN` layout.
+
+```bash
+python3 -m pip install -U -r requirements.txt --user
+bash benchmark_gluonnlp.sh
+```
+
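+It will generate csv files with `gluonnlp_` as the prefix; the per-workload results are also kept in
+directories named after the run, such as `train_fp32_NT_NT/`: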
+```
+├── gluonnlp_train_fp32_NT_NT.csv
+├── gluonnlp_train_fp32_NT_TN.csv
+├── gluonnlp_train_fp32_TN_TN.csv
+├── gluonnlp_infer_fp32_NT_NT.csv
+├── gluonnlp_infer_fp32_NT_TN.csv
+├── gluonnlp_infer_fp32_TN_TN.csv
+```
@@ -0,0 +1,130 @@
+import mxnet as mx
+import argparse
+import os
+import pandas as pd
+from benchmark_utils import GluonNLPBackboneBenchmark
+import multiprocessing as mp
+from multiprocessing import Process
+mx.npx.set_np()
+
+
+# Model names that are handed to the benchmark, one run per name.
+MODELS = [
+    'google_en_uncased_bert_base',
+    'google_en_uncased_bert_large',
+    'google_albert_base_v2',
+    'google_albert_large_v2',
+    'google_albert_xlarge_v2',
+    'google_albert_xxlarge_v2',
+    'google_electra_small',
+    'google_electra_base',
+    'google_electra_large',
+    'google_uncased_mobilebert',
+    'fairseq_bart_base',
+    'fairseq_bart_large',
+]
+
+# Every workload below is a (batch_size, seq_length) pair.
+train_workloads = [
+    (4, 128),
+    (8, 128),
+    (16, 128),
+    (32, 128),
+    (1, 512),
+    (2, 512),
+    (4, 512),
+    (8, 512),
+]
+
+
+inference_workloads = [
+    (1, 128),
+    (1, 384),
+    (1, 512),
+    (8, 32),
+    (8, 128),
+    (8, 512),
+    (32, 512),
+    (256, 128),
+    (400, 100),
+]
+
+
+def get_parser():
+    """Build the command-line parser of the benchmark script.
+
+    Returns
+    -------
+    parser : argparse.ArgumentParser
+        Parser that accepts ``--layout`` (default ``'NT'``),
+        ``--compute_layout`` (default ``None``) and ``--mode``
+        (``'train'`` or ``'inference'``, default ``'train'``).
+    """
+    # The previous description was the placeholder text from the argparse
+    # tutorial; it is what users see in `--help`, so describe the real tool.
+    parser = argparse.ArgumentParser(
+        description='Benchmark the latency and memory usage of GluonNLP backbones.')
+    parser.add_argument('--layout', type=str, default='NT',
+                        help='The layout of the computation')
+    parser.add_argument('--compute_layout', type=str, default=None,
+                        help='The compute layout of the computation')
+    parser.add_argument('--mode', type=str, default='train',
+                        choices=['train', 'inference'])
+    return parser
+
+
+def run_benchmark(workload, model_name, out_file_name, is_train):
+    """Configure a ``GluonNLPBackboneBenchmark`` and run it.
+
+    Parameters
+    ----------
+    workload
+        Forwarded as ``workloads``; callers in this file pass one
+        ``(batch_size, seq_length)`` tuple.
+    model_name
+        Forwarded as ``model_names``.
+    out_file_name
+        CSV path, forwarded as ``train_out_csv_file`` when ``is_train`` is
+        truthy and as ``inference_out_csv_file`` otherwise.
+    is_train
+        Truthy to enable training profiling only, falsy to enable inference
+        profiling only.
+    """
+    # The two modes differ only in the profiling switches and in the name of
+    # the keyword that receives the CSV path.
+    csv_keyword = 'train_out_csv_file' if is_train else 'inference_out_csv_file'
+    options = {
+        'workloads': workload,
+        'model_names': model_name,
+        'profile_inference': not is_train,
+        'profile_train': bool(is_train),
+        'to_csv': True,
+        csv_keyword: out_file_name,
+    }
+    GluonNLPBackboneBenchmark(**options).run()
+
+
+if __name__ == '__main__':
+    # Script entry point: run every (model, workload) pair of the selected
+    # mode in its own spawned child process, then merge the per-workload CSV
+    # written by the child into one summary CSV.
+    mp.set_start_method('spawn')
+    args = get_parser().parse_args()
+    layout = args.layout
+    # --compute_layout falls back to --layout when it is not given.
+    compute_layout = layout if args.compute_layout is None else args.compute_layout
+    # NOTE(review): layout / compute_layout are not forwarded to run_benchmark;
+    # in this file they only affect the model selection and the output names.
+    # Confirm that the benchmark itself picks up the intended layouts.
+    if compute_layout != layout:
+        # Models with 'bart' in the name are skipped when the layouts differ.
+        profile_models = [ele for ele in MODELS if 'bart' not in ele]
+    else:
+        profile_models = list(MODELS)
+    if args.mode == 'inference':
+        prefix, workloads, is_train = 'infer', inference_workloads, False
+    elif args.mode == 'train':
+        prefix, workloads, is_train = 'train', train_workloads, True
+    else:
+        raise NotImplementedError
+    out_dir = '{}_fp32_{}_{}'.format(prefix, layout, compute_layout)
+    summary_path = 'gluonnlp_{}.csv'.format(out_dir)
+    df = pd.DataFrame(columns=['model', 'batch_size', 'sequence_length',
+                               'latency', 'memory'])
+    os.makedirs(out_dir, exist_ok=True)
+    for model_name in profile_models:
+        for workload in workloads:
+            out_path = os.path.join(out_dir, '{}_{}_{}.csv'.format(model_name, workload[0],
+                                                                   workload[1]))
+            process = Process(
+                target=run_benchmark,
+                args=(workload, model_name, out_path, is_train))
+            process.start()
+            process.join()
+            # Without this check a crashed child either surfaces as an opaque
+            # FileNotFoundError below or, because out_dir may already exist
+            # from an earlier run, silently merges a stale CSV.
+            if process.exitcode != 0:
+                raise RuntimeError(
+                    'Benchmark of model "{}" with workload {} failed, exit code = {}'
+                    .format(model_name, workload, process.exitcode))
+            new_df = pd.read_csv(out_path)
+            # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
+            df = pd.concat([df, new_df], ignore_index=True)
+            # Rewritten after every workload so partial results are kept on disk.
+            df.to_csv(summary_path)
@@ -0,0 +1,14 @@
+# Run the benchmark for each "<layout> <compute_layout>" pair, first in
+# train mode and then in inference mode.
+for layouts in "NT NT" "NT TN" "TN TN"
+do
+  layout=${layouts% *}
+  compute_layout=${layouts#* }
+  for mode in train inference
+  do
+    python3 benchmark_gluonnlp.py --layout "$layout" --compute_layout "$compute_layout" --mode "$mode"
+  done
+done