diff --git a/benchmarking.ipynb b/benchmarking.ipynb new file mode 100644 index 0000000000..ecea9eab7d --- /dev/null +++ b/benchmarking.ipynb @@ -0,0 +1,532 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "6e128361", + "metadata": {}, + "source": [ + "In this notebook, we pick up a model trained using `train_and_save_models_for_benchmarking.ipynb` and stored on google drive to perform inference and benchmark performance." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "ce1d7155", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: wandb in /usr/local/lib/python3.8/dist-packages (from -r requirements.txt (line 1)) (0.13.7)\n", + "Requirement already satisfied: pandas in /usr/local/lib/python3.8/dist-packages (from -r requirements.txt (line 2)) (1.3.5)\n", + "Requirement already satisfied: nvidia-pyindex in /usr/local/lib/python3.8/dist-packages (from -r requirements.txt (line 3)) (1.0.9)\n", + "Requirement already satisfied: dllogger from git+https://github.com/NVIDIA/dllogger#egg=dllogger in /usr/local/lib/python3.8/dist-packages (from -r requirements.txt (line 4)) (1.0.0)\n", + "Requirement already satisfied: pathtools in /usr/local/lib/python3.8/dist-packages (from wandb->-r requirements.txt (line 1)) (0.1.2)\n", + "Requirement already satisfied: promise<3,>=2.0 in /usr/local/lib/python3.8/dist-packages (from wandb->-r requirements.txt (line 1)) (2.3)\n", + "Requirement already satisfied: docker-pycreds>=0.4.0 in /usr/local/lib/python3.8/dist-packages (from wandb->-r requirements.txt (line 1)) (0.4.0)\n", + "Requirement already satisfied: shortuuid>=0.5.0 in /usr/local/lib/python3.8/dist-packages (from wandb->-r requirements.txt (line 1)) (1.0.11)\n", + "Requirement already satisfied: psutil>=5.0.0 in /usr/local/lib/python3.8/dist-packages (from wandb->-r requirements.txt (line 1)) (5.9.4)\n", + "Requirement already satisfied: protobuf!=4.21.0,<5,>=3.12.0; 
python_version < \"3.9\" and sys_platform == \"linux\" in /usr/local/lib/python3.8/dist-packages (from wandb->-r requirements.txt (line 1)) (3.20.3)\n", + "Requirement already satisfied: PyYAML in /usr/local/lib/python3.8/dist-packages (from wandb->-r requirements.txt (line 1)) (6.0)\n", + "Requirement already satisfied: requests<3,>=2.0.0 in /usr/local/lib/python3.8/dist-packages (from wandb->-r requirements.txt (line 1)) (2.28.1)\n", + "Requirement already satisfied: setproctitle in /usr/local/lib/python3.8/dist-packages (from wandb->-r requirements.txt (line 1)) (1.3.2)\n", + "Requirement already satisfied: GitPython>=1.0.0 in /usr/local/lib/python3.8/dist-packages (from wandb->-r requirements.txt (line 1)) (3.1.30)\n", + "Requirement already satisfied: sentry-sdk>=1.0.0 in /usr/local/lib/python3.8/dist-packages (from wandb->-r requirements.txt (line 1)) (1.12.1)\n", + "Requirement already satisfied: setuptools in /usr/lib/python3/dist-packages (from wandb->-r requirements.txt (line 1)) (45.2.0)\n", + "Requirement already satisfied: Click!=8.0.0,>=7.0 in /usr/local/lib/python3.8/dist-packages (from wandb->-r requirements.txt (line 1)) (8.1.3)\n", + "Requirement already satisfied: numpy>=1.17.3; platform_machine != \"aarch64\" and platform_machine != \"arm64\" and python_version < \"3.10\" in /usr/local/lib/python3.8/dist-packages (from pandas->-r requirements.txt (line 2)) (1.22.4)\n", + "Requirement already satisfied: pytz>=2017.3 in /usr/local/lib/python3.8/dist-packages (from pandas->-r requirements.txt (line 2)) (2022.7)\n", + "Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.8/dist-packages (from pandas->-r requirements.txt (line 2)) (2.8.2)\n", + "Requirement already satisfied: six in /usr/lib/python3/dist-packages (from promise<3,>=2.0->wandb->-r requirements.txt (line 1)) (1.14.0)\n", + "Requirement already satisfied: charset-normalizer<3,>=2 in /usr/local/lib/python3.8/dist-packages (from requests<3,>=2.0.0->wandb->-r 
requirements.txt (line 1)) (2.1.1)\n", + "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.8/dist-packages (from requests<3,>=2.0.0->wandb->-r requirements.txt (line 1)) (1.26.13)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/lib/python3/dist-packages (from requests<3,>=2.0.0->wandb->-r requirements.txt (line 1)) (2.8)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/lib/python3/dist-packages (from requests<3,>=2.0.0->wandb->-r requirements.txt (line 1)) (2019.11.28)\n", + "Requirement already satisfied: gitdb<5,>=4.0.1 in /usr/local/lib/python3.8/dist-packages (from GitPython>=1.0.0->wandb->-r requirements.txt (line 1)) (4.0.10)\n", + "Requirement already satisfied: smmap<6,>=3.0.1 in /usr/local/lib/python3.8/dist-packages (from gitdb<5,>=4.0.1->GitPython>=1.0.0->wandb->-r requirements.txt (line 1)) (5.0.0)\n", + "Requirement already satisfied: gdown in /usr/local/lib/python3.8/dist-packages (4.6.3)\n", + "Requirement already satisfied: tqdm in /usr/local/lib/python3.8/dist-packages (from gdown) (4.64.1)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.8/dist-packages (from gdown) (3.9.0)\n", + "Requirement already satisfied: beautifulsoup4 in /usr/local/lib/python3.8/dist-packages (from gdown) (4.11.1)\n", + "Requirement already satisfied: six in /usr/lib/python3/dist-packages (from gdown) (1.14.0)\n", + "Requirement already satisfied: requests[socks] in /usr/local/lib/python3.8/dist-packages (from gdown) (2.28.1)\n", + "Requirement already satisfied: soupsieve>1.2 in /usr/local/lib/python3.8/dist-packages (from beautifulsoup4->gdown) (2.3.2.post1)\n", + "Requirement already satisfied: charset-normalizer<3,>=2 in /usr/local/lib/python3.8/dist-packages (from requests[socks]->gdown) (2.1.1)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/lib/python3/dist-packages (from requests[socks]->gdown) (2.8)\n", + "Requirement already satisfied: certifi>=2017.4.17 in 
/usr/lib/python3/dist-packages (from requests[socks]->gdown) (2019.11.28)\n", + "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.8/dist-packages (from requests[socks]->gdown) (1.26.13)\n", + "Requirement already satisfied: PySocks!=1.5.7,>=1.5.6; extra == \"socks\" in /usr/local/lib/python3.8/dist-packages (from requests[socks]->gdown) (1.7.1)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Downloading...\n", + "From: https://drive.google.com/uc?id=1NCFZ5ya3zyxPsrmupEoc9UEm4sslAddV\n", + "To: /workspace/examples/t4rec_paper_experiments/t4r_paper_repro/rees46_ecom_dataset_small_for_ci.zip\n", + "100%|██████████| 43.4M/43.4M [00:06<00:00, 6.42MB/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Hit:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64 InRelease\n", + "Hit:2 http://archive.ubuntu.com/ubuntu focal InRelease\n", + "Hit:3 http://security.ubuntu.com/ubuntu focal-security InRelease\n", + "Hit:4 http://archive.ubuntu.com/ubuntu focal-updates InRelease\n", + "Hit:5 http://archive.ubuntu.com/ubuntu focal-backports InRelease\n", + "Reading package lists...\n", + "Reading package lists...\n", + "Building dependency tree...\n", + "Reading state information...\n", + "unzip is already the newest version (6.0-25ubuntu1.1).\n", + "0 upgraded, 0 newly installed, 0 to remove and 74 not upgraded.\n", + "Archive: rees46_ecom_dataset_small_for_ci.zip\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "replace /transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/0001/valid.parquet? [y]es, [n]o, [A]ll, [N]one, [r]ename: error: invalid response [# gdown h]\n", + "replace /transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/0001/valid.parquet? 
[y]es, [n]o, [A]ll, [N]one, [r]ename: error: invalid response [ttps://dr]\n", + "replace /transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/0001/valid.parquet? [y]es, [n]o, [A]ll, [N]one, [r]ename: error: invalid response [ive.googl]\n", + "replace /transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/0001/valid.parquet? [y]es, [n]o, [A]ll, [N]one, [r]ename: error: invalid response [e.com/uc?]\n", + "replace /transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/0001/valid.parquet? [y]es, [n]o, [A]ll, [N]one, [r]ename: error: invalid response [id=18Ella]\n", + "replace /transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/0001/valid.parquet? [y]es, [n]o, [A]ll, [N]one, [r]ename: error: invalid response [Kaodqaesr]\n", + "replace /transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/0001/valid.parquet? [y]es, [n]o, [A]ll, [N]one, [r]ename: " + ] + } + ], + "source": [ + "%%bash\n", + "set -e\n", + "\n", + "#### Install requirements\n", + "cd examples/t4rec_paper_experiments\n", + "pip install -r requirements.txt\n", + "\n", + "### Get data\n", + "cd t4r_paper_repro\n", + "\n", + "FEATURE_SCHEMA_PATH=../datasets_configs/ecom_rees46/rees46_schema.pbtxt\n", + "pip install gdown\n", + "gdown https://drive.google.com/uc?id=1NCFZ5ya3zyxPsrmupEoc9UEm4sslAddV\n", + "apt-get update -y\n", + "apt-get install unzip -y\n", + "DATA_PATH=/transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/\n", + "unzip -d $DATA_PATH \"rees46_ecom_dataset_small_for_ci.zip\"\n", + "# gdown https://drive.google.com/uc?id=18EllaKaodqaesrNJ3YGEmv0YUD3NX0vK\n", + "# mkdir -p /transformers4rec/TF4Rec/models/\n", + "# MODEL_PATH=/transformers4rec/TF4Rec/models/\n", + "# unzip -d $MODEL_PATH \"model.zip\"\n", + "exit 0" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "513f52fc", + "metadata": {}, + "outputs": [], + "source": [ + "import glob\n", + "import logging\n", + "import os\n", + "from functools 
import partial\n", + "import pandas as pd\n", + "import cudf\n", + "import numpy as np\n", + "import nvtabular.inference.triton as nvt_triton\n", + "import tritonclient.grpc as grpcclient\n", + "import subprocess\n", + "import time" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "0abc674e", + "metadata": {}, + "outputs": [], + "source": [ + "!mkdir -p /workspace/examples/t4rec_paper_experiments/t4r_paper_repro" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "163eef53", + "metadata": {}, + "outputs": [], + "source": [ + "os.chdir('/workspace/examples/t4rec_paper_experiments/t4r_paper_repro')" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "f4071799", + "metadata": {}, + "outputs": [], + "source": [ + "eval_path = os.path.join(\n", + " '/transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/',\n", + " str(2,).zfill(4),\n", + " \"valid.parquet\",\n", + ")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "f2775430", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "I0220 02:43:34.847979 18298 pinned_memory_manager.cc:240] Pinned memory pool is created at '0x7f03d6000000' with size 268435456\n", + "I0220 02:43:34.848302 18298 cuda_memory_manager.cc:105] CUDA memory pool is created on device 0 with size 67108864\n", + "I0220 02:43:34.850169 18298 model_lifecycle.cc:459] loading: t4r_pytorch_pt:1\n", + "I0220 02:43:38.522804 18298 python_be.cc:1856] TRITONBACKEND_ModelInstanceInitialize: t4r_pytorch_pt (GPU device 0)\n" + ] + } + ], + "source": [ + "# load model trained locally using train_and_save_models_for_benchmarking.ipynb\n", + "\n", + "my_env = os.environ.copy()\n", + "\n", + "# # run on the CPU\n", + "# my_env[\"CUDA_VISIBLE_DEVICES\"] = ''\n", + "# my_env[\"HAS_GPU\"] = '0'\n", + "\n", + "# 
run on the GPU\n", + "my_env[\"HAS_GPU\"] = '1'\n", + "\n", + "subprocess.Popen(['tritonserver', '--model-repository=/workspace/models_for_benchmarking/'], env=my_env)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "39dfa60b", + "metadata": {}, + "outputs": [], + "source": [ + "# # load model downloaded from google drive\n", + "# subprocess.Popen(['tritonserver', '--model-repository=/transformers4rec/TF4Rec/models/'])" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "6b8f3a54", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "I0220 02:43:42.878213 18298 model_lifecycle.cc:694] successfully loaded 't4r_pytorch_pt' version 1\n", + "I0220 02:43:42.878340 18298 server.cc:563] \n", + "+------------------+------+\n", + "| Repository Agent | Path |\n", + "+------------------+------+\n", + "+------------------+------+\n", + "\n", + "I0220 02:43:42.878405 18298 server.cc:590] \n", + "+---------+-------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "| Backend | Path | Config |\n", + "+---------+-------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "| python | /opt/tritonserver/backends/python/libtriton_python.so | {\"cmdline\":{\"auto-complete-config\":\"true\",\"min-compute-capability\":\"6.000000\",\"backend-directory\":\"/opt/tritonserver/backends\",\"default-max-batch-size\":\"4\"}} |\n", + "+---------+-------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "\n", + "I0220 
02:43:42.878442 18298 server.cc:633] \n", + "+----------------+---------+--------+\n", + "| Model | Version | Status |\n", + "+----------------+---------+--------+\n", + "| t4r_pytorch_pt | 1 | READY |\n", + "+----------------+---------+--------+\n", + "\n", + "I0220 02:43:42.903695 18298 metrics.cc:864] Collecting metrics for GPU 0: Quadro RTX 8000\n", + "I0220 02:43:42.903932 18298 metrics.cc:757] Collecting CPU metrics\n", + "I0220 02:43:42.904063 18298 tritonserver.cc:2264] \n", + "+----------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "| Option | Value |\n", + "+----------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "| server_id | triton |\n", + "| server_version | 2.28.0 |\n", + "| server_extensions | classification sequence model_repository model_repository(unload_dependents) schedule_policy model_configuration system_shared_memory cuda_shared_memory binary_tensor_data statistics trace logging |\n", + "| model_repository_path[0] | /workspace/models_for_benchmarking/ |\n", + "| model_control_mode | MODE_NONE |\n", + "| strict_model_config | 0 |\n", + "| rate_limit | OFF |\n", + "| pinned_memory_pool_byte_size | 268435456 |\n", + "| cuda_memory_pool_byte_size{0} | 67108864 |\n", + "| response_cache_byte_size | 0 |\n", + "| min_supported_compute_capability | 6.0 |\n", + "| strict_readiness | 1 |\n", + "| exit_timeout | 30 |\n", + "+----------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "\n", + "I0220 
02:43:42.904966 18298 grpc_server.cc:4819] Started GRPCInferenceService at 0.0.0.0:8001\n", + "I0220 02:43:42.905121 18298 http_server.cc:3477] Started HTTPService at 0.0.0.0:8000\n", + "I0220 02:43:42.945837 18298 http_server.cc:184] Started Metrics Service at 0.0.0.0:8002\n" + ] + } + ], + "source": [ + "time.sleep(15)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "f2413171", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "client created.\n", + "GET /v2/health/live, headers None\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.8/dist-packages/tritonhttpclient/__init__.py:31: DeprecationWarning: The package `tritonhttpclient` is deprecated and will be removed in a future version. Please use instead `tritonclient.http`\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import tritonhttpclient\n", + "try:\n", + " triton_client = tritonhttpclient.InferenceServerClient(url=\"localhost:8000\", verbose=True)\n", + " print(\"client created.\")\n", + "except Exception as e:\n", + " print(\"channel creation failed: \" + str(e))\n", + "triton_client.is_server_live()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "1726cce3", + "metadata": {}, + "outputs": [], + "source": [ + "prediction_data = cudf.read_parquet(eval_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "7a21bdff", + "metadata": {}, + "outputs": [], + "source": [ + "col_names = ['sess_pid_seq']\n", + "inputs = nvt_triton.convert_df_to_triton_input(col_names, prediction_data.loc[6, col_names], grpcclient.InferInput)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "7cd5d6f9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.058879852294921875" + ] + }, + 
"execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import time\n", + "MODEL_NAME_PT = \"t4r_pytorch_pt\"\n", + "\n", + "N_TRIALS = 1000\n", + "\n", + "# WarmUp\n", + "for _ in range(N_TRIALS):\n", + " payload = cudf.DataFrame(data={'sess_pid_seq': np.random.randint(0, 390001, 20), 'id': 0}).groupby('id').agg({'sess_pid_seq': list})\n", + " with grpcclient.InferenceServerClient(\"localhost:8001\") as client:\n", + " col_names = ['sess_pid_seq']\n", + " inputs = nvt_triton.convert_df_to_triton_input(col_names, payload, grpcclient.InferInput)\n", + " response = client.infer(MODEL_NAME_PT, inputs)\n", + " end_time = time.time()\n", + "\n", + "\n", + "# Collecting\n", + "\n", + "out = []\n", + "for _ in range(N_TRIALS):\n", + " payload = cudf.DataFrame(data={'sess_pid_seq': np.random.randint(0, 390001, 20), 'id': 0}).groupby('id').agg({'sess_pid_seq': list})\n", + " start_time = time.time()\n", + " with grpcclient.InferenceServerClient(\"localhost:8001\") as client:\n", + " col_names = ['sess_pid_seq']\n", + " inputs = nvt_triton.convert_df_to_triton_input(col_names, payload, grpcclient.InferInput)\n", + " response = client.infer(MODEL_NAME_PT, inputs)\n", + " end_time = time.time()\n", + " out.append(end_time-start_time)\n", + " \n", + "# P95\n", + "np.sort(out)[int(0.95 * N_TRIALS)]" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "876aaf7a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.008340835571289062" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import time\n", + "MODEL_NAME_PT = \"t4r_pytorch_pt\"\n", + "\n", + "N_TRIALS = 1000\n", + "\n", + "# WarmUp\n", + "for _ in range(N_TRIALS):\n", + " payload = cudf.DataFrame(data={'sess_pid_seq': np.random.randint(0, 390001, 20), 'id': 0}).groupby('id').agg({'sess_pid_seq': list})\n", + " with grpcclient.InferenceServerClient(\"localhost:8001\") as 
client:\n", + " col_names = ['sess_pid_seq']\n", + " inputs = nvt_triton.convert_df_to_triton_input(col_names, payload, grpcclient.InferInput)\n", + " response = client.infer(MODEL_NAME_PT, inputs)\n", + " end_time = time.time()\n", + "\n", + "\n", + "# Collecting\n", + "\n", + "out = []\n", + "for _ in range(N_TRIALS):\n", + " payload = cudf.DataFrame(data={'sess_pid_seq': np.random.randint(0, 390001, 20), 'id': 0}).groupby('id').agg({'sess_pid_seq': list})\n", + " start_time = time.time()\n", + " with grpcclient.InferenceServerClient(\"localhost:8001\") as client:\n", + " col_names = ['sess_pid_seq']\n", + " inputs = nvt_triton.convert_df_to_triton_input(col_names, payload, grpcclient.InferInput)\n", + " response = client.infer(MODEL_NAME_PT, inputs)\n", + " end_time = time.time()\n", + " out.append(end_time-start_time)\n", + " \n", + "# P95\n", + "np.sort(out)[int(0.95 * N_TRIALS)]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e8b537ca", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "212fd93a", + "metadata": {}, + "outputs": [], + "source": [ + "# %%timeit\n", + "\n", + "# output_names = [\"output\"]\n", + "\n", + "# outputs = []\n", + "# for col in output_names:\n", + "# outputs.append(grpcclient.InferRequestedOutput(col))\n", + " \n", + "# MODEL_NAME_PT = \"t4r_pytorch_pt\"\n", + "# payload = cudf.DataFrame(data={'sess_pid_seq': np.random.randint(0, 390001, 20), 'id': 0}).groupby('id').agg({'sess_pid_seq': list})\n", + "\n", + "# with grpcclient.InferenceServerClient(\"localhost:8001\") as client:\n", + "# col_names = ['sess_pid_seq']\n", + "# inputs = nvt_triton.convert_df_to_triton_input(col_names, payload, grpcclient.InferInput)\n", + "# response = client.infer(MODEL_NAME_PT, inputs)" + ] + }, + { + "cell_type": "markdown", + "id": "dc821e3b", + "metadata": {}, + "source": [ + "Some additional information on how the benchmarking was run:\n", + "\n", + "I train 
and save the model using (modified) script for the T4Rec paper, all this is documented in the following notebook: https://github.com/NVIDIA-Merlin/Transformers4Rec/blob/add_benchmarking_scripts/train_and_save_models_for_benchmarking.ipynb\n", + "I generate data for benchmarking here: https://github.com/NVIDIA-Merlin/Transformers4Rec/blob/add_benchmarking_scripts/generate_randomized_input_for_benchmarking.ipynb\n", + "And I then start triton and load the model using code from this notebook: https://github.com/NVIDIA-Merlin/Transformers4Rec/blob/add_benchmarking_scripts/benchmarking.ipynb\n", + "When inferring on the CPU, you need to make sure you output the correct config.pbtxt, the modified model.py should handle the rest.\n", + "Also, from the model repository, I delete all the other models that are being output, I am only loading the T4Rec model.\n", + "I run perf_analyzer using this command: perf_analyzer -m t4r_pytorch_pt --shape sess_pid_seq__nnzs:2,1 --shape sess_pid_seq__values:20,1 --input-data input.json --concurrency-range 1:4:1" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/generate_randomized_input_for_benchmarking.ipynb b/generate_randomized_input_for_benchmarking.ipynb new file mode 100644 index 0000000000..163e58f991 --- /dev/null +++ b/generate_randomized_input_for_benchmarking.ipynb @@ -0,0 +1,164 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "486c0d66", + "metadata": {}, + "outputs": [], + "source": [ + "j = \"\"\"\n", + " { \n", + " \"data\" :\n", + " [\n", + " {\n", + " \"sess_pid_seq__values\" : 
[0,1,2,3,4,5,6,7,8,9,10,11,12,13,15,16,17,18,19],\n", + " \"sess_pid_seq__nnzs\" : [0,20]\n", + " }\n", + " ]\n", + " }\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "8d629daf", + "metadata": {}, + "outputs": [], + "source": [ + "import json" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "e95ca248", + "metadata": {}, + "outputs": [], + "source": [ + "d = json.loads(j)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "1f7836da", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'data': [{'sess_pid_seq__values': [0,\n", + " 1,\n", + " 2,\n", + " 3,\n", + " 4,\n", + " 5,\n", + " 6,\n", + " 7,\n", + " 8,\n", + " 9,\n", + " 10,\n", + " 11,\n", + " 12,\n", + " 13,\n", + " 15,\n", + " 16,\n", + " 17,\n", + " 18,\n", + " 19],\n", + " 'sess_pid_seq__nnzs': [0, 20]}]}" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "d" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "f4cbfe0d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'{\"data\": [{\"sess_pid_seq__values\": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18, 19], \"sess_pid_seq__nnzs\": [0, 20]}]}'" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "json.dumps(d)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "18cfd44c", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "353b82cb", + "metadata": {}, + "outputs": [], + "source": [ + "data_as_dict = {'data': []}\n", + "\n", + "for i in range(10_000):\n", + " payload = {}\n", + " payload['sess_pid_seq__values'] = np.random.randint(0, 390001, 20).tolist()\n", + " payload['sess_pid_seq__nnzs'] = [0, 20]\n", + " \n", + " data_as_dict['data'].append(payload)" + ] + }, + { + "cell_type": 
"code", + "execution_count": 22, + "id": "ef9c5f38", + "metadata": {}, + "outputs": [], + "source": [ + "with open('input.json', 'w') as f:\n", + " json.dump(data_as_dict, f)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/train_and_save_models_for_benchmarking.ipynb b/train_and_save_models_for_benchmarking.ipynb new file mode 100644 index 0000000000..73244ccba4 --- /dev/null +++ b/train_and_save_models_for_benchmarking.ipynb @@ -0,0 +1,1476 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "1571e258", + "metadata": {}, + "source": [ + "This notebook trains and exports a model for benchmarking using scripts provided by Gabriel (the ones that has been used for the T4Rec paper)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "9eccd5e8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: wandb in /usr/local/lib/python3.8/dist-packages (from -r requirements.txt (line 1)) (0.13.7)\n", + "Requirement already satisfied: pandas in /usr/local/lib/python3.8/dist-packages (from -r requirements.txt (line 2)) (1.3.5)\n", + "Requirement already satisfied: nvidia-pyindex in /usr/local/lib/python3.8/dist-packages (from -r requirements.txt (line 3)) (1.0.9)\n", + "Collecting dllogger\n", + " Cloning https://github.com/NVIDIA/dllogger to /tmp/pip-install-qkhp_f4u/dllogger\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " Running command git clone -q https://github.com/NVIDIA/dllogger /tmp/pip-install-qkhp_f4u/dllogger\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: shortuuid>=0.5.0 in /usr/local/lib/python3.8/dist-packages (from wandb->-r requirements.txt (line 1)) (1.0.11)\n", + "Requirement already satisfied: PyYAML in /usr/local/lib/python3.8/dist-packages (from wandb->-r requirements.txt (line 1)) (6.0)\n", + "Requirement already satisfied: setuptools in /usr/lib/python3/dist-packages (from wandb->-r requirements.txt (line 1)) (45.2.0)\n", + "Requirement already satisfied: protobuf!=4.21.0,<5,>=3.12.0; python_version < \"3.9\" and sys_platform == \"linux\" in /usr/local/lib/python3.8/dist-packages (from wandb->-r requirements.txt (line 1)) (3.20.3)\n", + "Requirement already satisfied: docker-pycreds>=0.4.0 in /usr/local/lib/python3.8/dist-packages (from wandb->-r requirements.txt (line 1)) (0.4.0)\n", + "Requirement already satisfied: Click!=8.0.0,>=7.0 in /usr/local/lib/python3.8/dist-packages (from wandb->-r requirements.txt (line 1)) (8.1.3)\n", + "Requirement already satisfied: requests<3,>=2.0.0 in /usr/local/lib/python3.8/dist-packages (from wandb->-r 
requirements.txt (line 1)) (2.28.1)\n", + "Requirement already satisfied: pathtools in /usr/local/lib/python3.8/dist-packages (from wandb->-r requirements.txt (line 1)) (0.1.2)\n", + "Requirement already satisfied: psutil>=5.0.0 in /usr/local/lib/python3.8/dist-packages (from wandb->-r requirements.txt (line 1)) (5.9.4)\n", + "Requirement already satisfied: setproctitle in /usr/local/lib/python3.8/dist-packages (from wandb->-r requirements.txt (line 1)) (1.3.2)\n", + "Requirement already satisfied: GitPython>=1.0.0 in /usr/local/lib/python3.8/dist-packages (from wandb->-r requirements.txt (line 1)) (3.1.30)\n", + "Requirement already satisfied: promise<3,>=2.0 in /usr/local/lib/python3.8/dist-packages (from wandb->-r requirements.txt (line 1)) (2.3)\n", + "Requirement already satisfied: sentry-sdk>=1.0.0 in /usr/local/lib/python3.8/dist-packages (from wandb->-r requirements.txt (line 1)) (1.12.1)\n", + "Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.8/dist-packages (from pandas->-r requirements.txt (line 2)) (2.8.2)\n", + "Requirement already satisfied: numpy>=1.17.3; platform_machine != \"aarch64\" and platform_machine != \"arm64\" and python_version < \"3.10\" in /usr/local/lib/python3.8/dist-packages (from pandas->-r requirements.txt (line 2)) (1.22.4)\n", + "Requirement already satisfied: pytz>=2017.3 in /usr/local/lib/python3.8/dist-packages (from pandas->-r requirements.txt (line 2)) (2022.7)\n", + "Requirement already satisfied: six>=1.4.0 in /usr/lib/python3/dist-packages (from docker-pycreds>=0.4.0->wandb->-r requirements.txt (line 1)) (1.14.0)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/lib/python3/dist-packages (from requests<3,>=2.0.0->wandb->-r requirements.txt (line 1)) (2.8)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/lib/python3/dist-packages (from requests<3,>=2.0.0->wandb->-r requirements.txt (line 1)) (2019.11.28)\n", + "Requirement already satisfied: charset-normalizer<3,>=2 in 
/usr/local/lib/python3.8/dist-packages (from requests<3,>=2.0.0->wandb->-r requirements.txt (line 1)) (2.1.1)\n", + "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.8/dist-packages (from requests<3,>=2.0.0->wandb->-r requirements.txt (line 1)) (1.26.13)\n", + "Requirement already satisfied: gitdb<5,>=4.0.1 in /usr/local/lib/python3.8/dist-packages (from GitPython>=1.0.0->wandb->-r requirements.txt (line 1)) (4.0.10)\n", + "Requirement already satisfied: smmap<6,>=3.0.1 in /usr/local/lib/python3.8/dist-packages (from gitdb<5,>=4.0.1->GitPython>=1.0.0->wandb->-r requirements.txt (line 1)) (5.0.0)\n", + "Building wheels for collected packages: dllogger\n", + " Building wheel for dllogger (setup.py): started\n", + " Building wheel for dllogger (setup.py): finished with status 'done'\n", + " Created wheel for dllogger: filename=DLLogger-1.0.0-py3-none-any.whl size=5656 sha256=571a7a3df2e72b3d0c50aa54bda34ce6ded9c2d77053a1ad5336ff9a7dd3dfea\n", + " Stored in directory: /tmp/pip-ephem-wheel-cache-0f1jgvf3/wheels/ad/94/cf/8f3396cb8d62d532695ec557e193fada55cd366e14fd9a02be\n", + "Successfully built dllogger\n", + "Installing collected packages: dllogger\n", + "Successfully installed dllogger-1.0.0\n", + "Collecting gdown\n", + " Downloading gdown-4.6.3-py3-none-any.whl (14 kB)\n", + "Requirement already satisfied: beautifulsoup4 in /usr/local/lib/python3.8/dist-packages (from gdown) (4.11.1)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.8/dist-packages (from gdown) (3.9.0)\n", + "Requirement already satisfied: six in /usr/lib/python3/dist-packages (from gdown) (1.14.0)\n", + "Requirement already satisfied: tqdm in /usr/local/lib/python3.8/dist-packages (from gdown) (4.64.1)\n", + "Requirement already satisfied: requests[socks] in /usr/local/lib/python3.8/dist-packages (from gdown) (2.28.1)\n", + "Requirement already satisfied: soupsieve>1.2 in /usr/local/lib/python3.8/dist-packages (from beautifulsoup4->gdown) 
(2.3.2.post1)\n", + "Requirement already satisfied: charset-normalizer<3,>=2 in /usr/local/lib/python3.8/dist-packages (from requests[socks]->gdown) (2.1.1)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/lib/python3/dist-packages (from requests[socks]->gdown) (2.8)\n", + "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.8/dist-packages (from requests[socks]->gdown) (1.26.13)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/lib/python3/dist-packages (from requests[socks]->gdown) (2019.11.28)\n", + "Collecting PySocks!=1.5.7,>=1.5.6; extra == \"socks\"\n", + " Downloading PySocks-1.7.1-py3-none-any.whl (16 kB)\n", + "Installing collected packages: gdown, PySocks\n", + "Successfully installed PySocks-1.7.1 gdown-4.6.3\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Downloading...\n", + "From: https://drive.google.com/uc?id=1NCFZ5ya3zyxPsrmupEoc9UEm4sslAddV\n", + "To: /workspace/examples/t4rec_paper_experiments/t4r_paper_repro/rees46_ecom_dataset_small_for_ci.zip\n", + "100%|██████████| 43.4M/43.4M [00:07<00:00, 6.18MB/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Get:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64 InRelease [1581 B]\n", + "Get:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64 Packages [871 kB]\n", + "Get:3 http://archive.ubuntu.com/ubuntu focal InRelease [265 kB]\n", + "Get:4 http://security.ubuntu.com/ubuntu focal-security InRelease [114 kB]\n", + "Get:5 http://security.ubuntu.com/ubuntu focal-security/main amd64 Packages [2496 kB]\n", + "Get:6 http://archive.ubuntu.com/ubuntu focal-updates InRelease [114 kB]\n", + "Get:7 http://archive.ubuntu.com/ubuntu focal-backports InRelease [108 kB]\n", + "Get:8 http://archive.ubuntu.com/ubuntu focal/restricted amd64 Packages [33.4 kB]\n", + "Get:9 http://archive.ubuntu.com/ubuntu focal/main amd64 Packages [1275 kB]\n", + "Get:10 
http://security.ubuntu.com/ubuntu focal-security/universe amd64 Packages [995 kB]\n", + "Get:11 http://security.ubuntu.com/ubuntu focal-security/multiverse amd64 Packages [28.5 kB]\n", + "Get:12 http://security.ubuntu.com/ubuntu focal-security/restricted amd64 Packages [1937 kB]\n", + "Get:13 http://archive.ubuntu.com/ubuntu focal/multiverse amd64 Packages [177 kB]\n", + "Get:14 http://archive.ubuntu.com/ubuntu focal/universe amd64 Packages [11.3 MB]\n", + "Get:15 http://archive.ubuntu.com/ubuntu focal-updates/restricted amd64 Packages [2066 kB]\n", + "Get:16 http://archive.ubuntu.com/ubuntu focal-updates/main amd64 Packages [2970 kB]\n", + "Get:17 http://archive.ubuntu.com/ubuntu focal-updates/universe amd64 Packages [1296 kB]\n", + "Get:18 http://archive.ubuntu.com/ubuntu focal-updates/multiverse amd64 Packages [31.2 kB]\n", + "Get:19 http://archive.ubuntu.com/ubuntu focal-backports/universe amd64 Packages [28.6 kB]\n", + "Get:20 http://archive.ubuntu.com/ubuntu focal-backports/main amd64 Packages [55.2 kB]\n", + "Fetched 26.2 MB in 10s (2536 kB/s)\n", + "Reading package lists...\n", + "Reading package lists...\n", + "Building dependency tree...\n", + "Reading state information...\n", + "unzip is already the newest version (6.0-25ubuntu1.1).\n", + "0 upgraded, 0 newly installed, 0 to remove and 74 not upgraded.\n", + "Archive: rees46_ecom_dataset_small_for_ci.zip\n", + " creating: /transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/0001/\n", + " inflating: /transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/0001/valid.parquet \n", + " extracting: /transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/0001/.zip \n", + " inflating: /transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/0001/train.parquet \n", + " inflating: /transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/0001/test.parquet \n", + " creating: /transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/0002/\n", + " 
inflating: /transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/0002/valid.parquet \n", + " inflating: /transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/0002/train.parquet \n", + " inflating: /transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/0002/test.parquet \n" + ] + } + ], + "source": [ + "%%bash\n", + "set -e\n", + "\n", + "#### Install requirements\n", + "cd examples/t4rec_paper_experiments\n", + "pip install -r requirements.txt\n", + "\n", + "### Get data\n", + "cd t4r_paper_repro\n", + "\n", + "FEATURE_SCHEMA_PATH=../datasets_configs/ecom_rees46/rees46_schema.pbtxt\n", + "pip install gdown\n", + "gdown https://drive.google.com/uc?id=1NCFZ5ya3zyxPsrmupEoc9UEm4sslAddV\n", + "apt-get update -y\n", + "apt-get install unzip -y\n", + "DATA_PATH=/transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/\n", + "unzip -d $DATA_PATH \"rees46_ecom_dataset_small_for_ci.zip\"\n", + "exit 0" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "f114837f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Overwriting /workspace/examples/t4rec_paper_experiments/datasets_configs/ecom_rees46/rees46_schema.pbtxt\n" + ] + } + ], + "source": [ + "%%writefile /workspace/examples/t4rec_paper_experiments/datasets_configs/ecom_rees46/rees46_schema.pbtxt\n", + "\n", + "feature {\n", + " name: \"sess_pid_seq\"\n", + " value_count {\n", + " min: 2\n", + " max: 20\n", + " }\n", + " type: INT\n", + " int_domain {\n", + " name: \"sess_pid_seq\"\n", + " min: 1\n", + " max: 390000\n", + " is_categorical: true\n", + " }\n", + " annotation {\n", + " tag: \"item_id\"\n", + " tag: \"list\"\n", + " tag: \"categorical\"\n", + " tag: \"item\"\n", + " }\n", + "}\n", + "\n", + "feature {\n", + " name: \"sess_ccid_seq\"\n", + " value_count {\n", + " min: 2\n", + " max: 20\n", + " }\n", + " type: INT\n", + " int_domain {\n", + " name: \"sess_ccid_seq\"\n", + " min: 1\n", + " max: 
150\n", + " is_categorical: true\n", + " }\n", + " annotation {\n", + " tag: \"list\"\n", + " tag: \"categorical\"\n", + " tag: \"item\"\n", + " }\n", + "}\n", + "\n", + "feature {\n", + " name: \"sess_csid_seq\"\n", + " value_count {\n", + " min: 2\n", + " max: 20\n", + " }\n", + " type: INT\n", + " int_domain {\n", + " name: \"sess_csid_seq\"\n", + " min: 1\n", + " max: 1400\n", + " is_categorical: true\n", + " }\n", + " annotation {\n", + " tag: \"list\"\n", + " tag: \"categorical\"\n", + " tag: \"item\"\n", + " }\n", + "}\n", + "\n", + "\n", + "feature {\n", + " name: \"sess_bid_seq\"\n", + " value_count {\n", + " min: 2\n", + " max: 20\n", + " }\n", + " type: INT\n", + " int_domain {\n", + " name: \"sess_bid_seq\"\n", + " min: 1\n", + " max: 7000\n", + " is_categorical: true\n", + " }\n", + " annotation {\n", + " tag: \"list\"\n", + " tag: \"categorical\"\n", + " tag: \"item\"\n", + " }\n", + "}\n", + "\n", + "feature {\n", + " name: \"sess_price_log_norm_seq\"\n", + " value_count {\n", + " min: 2\n", + " max: 20\n", + " }\n", + " type: FLOAT\n", + " float_domain {\n", + " name: \"sess_price_log_norm_seq\"\n", + " min: 0.0\n", + " max: 10000.0\n", + " }\n", + " annotation {\n", + " tag: \"item\"\n", + " tag: \"list\"\n", + " tag: \"continuous\"\n", + " }\n", + "}\n", + "\n", + "feature {\n", + " name: \"sess_relative_price_to_avg_category_seq\"\n", + " value_count {\n", + " min: 2\n", + " max: 20\n", + " }\n", + " type: FLOAT\n", + " float_domain {\n", + " name: \"sess_relative_price_to_avg_category_seq\"\n", + " min: -10000.0\n", + " max: 10000.0\n", + " }\n", + " annotation {\n", + " tag: \"item\"\n", + " tag: \"list\"\n", + " tag: \"continuous\"\n", + " }\n", + "}\n", + "\n", + "feature {\n", + " name: \"sess_prod_recency_days_log_norm_seq\"\n", + " value_count {\n", + " min: 2\n", + " max: 20\n", + " }\n", + " type: FLOAT\n", + " float_domain {\n", + " name: \"sess_prod_recency_days_log_norm_seq\"\n", + " min: -10000.0\n", + " max: 10000.0\n", + " }\n", + 
" annotation {\n", + " tag: \"item\"\n", + " tag: \"list\"\n", + " tag: \"continuous\"\n", + " }\n", + "}\n", + "\n", + "feature {\n", + " name: \"sess_et_hour_sin_seq\"\n", + " value_count {\n", + " min: 2\n", + " max: 20\n", + " }\n", + " type: FLOAT\n", + " float_domain {\n", + " name: \"sess_et_hour_sin_seq\"\n", + " min: -1.0\n", + " max: 1.0\n", + " }\n", + " annotation {\n", + " tag: \"list\"\n", + " tag: \"continuous\"\n", + " }\n", + "}\n", + "\n", + "feature {\n", + " name: \"sess_et_hour_cos_seq\"\n", + " value_count {\n", + " min: 2\n", + " max: 20\n", + " }\n", + " type: FLOAT\n", + " float_domain {\n", + " name: \"sess_et_hour_cos_seq\"\n", + " min: -1.0\n", + " max: 1.0\n", + " }\n", + " annotation {\n", + " tag: \"list\"\n", + " tag: \"continuous\"\n", + " }\n", + "}\n", + "\n", + "feature {\n", + " name: \"sess_et_dayofweek_sin_seq\"\n", + " value_count {\n", + " min: 2\n", + " max: 20\n", + " }\n", + " type: FLOAT\n", + " float_domain {\n", + " name: \"sess_et_dayofweek_sin_seq\"\n", + " min: -1.0\n", + " max: 1.0\n", + " }\n", + " annotation {\n", + " tag: \"list\"\n", + " tag: \"continuous\"\n", + " }\n", + "}\n", + "\n", + "feature {\n", + " name: \"sess_et_dayofweek_cos_seq\"\n", + " value_count {\n", + " min: 2\n", + " max: 20\n", + " }\n", + " type: FLOAT\n", + " float_domain {\n", + " name: \"sess_et_dayofweek_cos_seq\"\n", + " min: -1.0\n", + " max: 1.0\n", + " }\n", + " annotation {\n", + " tag: \"list\"\n", + " tag: \"continuous\"\n", + " }\n", + "}\n", + "\n", + "feature {\n", + " name: \"sess_etime_seq\"\n", + " value_count {\n", + " min: 2\n", + " max: 20\n", + " }\n", + " type: FLOAT\n", + " float_domain {\n", + " name: \"sess_etime_seq\"\n", + " min: 0\n", + " max: 0\n", + " }\n", + " annotation {\n", + " tag: \"time\"\n", + " tag: \"list\"\n", + " }\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "6089f14c", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": 
[ + "04/05/2023 04:08:01 - WARNING - __main__ - Process rank: -1, device: cuda:0, n_gpu: 1, distributed training: False, 16-bits training: True\n", + "04/05/2023 04:08:03 - WARNING - transformers4rec - Projecting inputs of NextItemPredictionTask to'448' As weight tying requires the input dimension '192' to be equal to the item-id embedding dimension '448'\n", + "[INFO|trainer.py:434] 2023-04-05 04:08:03,174 >> Using amp fp16 backend\n", + "04/05/2023 04:08:03 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - Training, Model and Data parameters {'data_path': '/transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/', 'features_schema_path': '/workspace/examples/t4rec_paper_experiments/datasets_configs/ecom_rees46/rees46_schema.pbtxt', 'start_time_window_index': 1, 'final_time_window_index': 2, 'time_window_folder_pad_digits': 4, 'no_incremental_training': False, 'training_time_window_size': 0, 'use_side_information_features': False, 'input_features_aggregation': 'concat', 'model_type': 'xlnet', 'tf_out_activation': 'tanh', 'mlm': True, 'mlm_probability': 0.30000000000000004, 'plm': False, 'plm_probability': 0.25, 'plm_max_span_length': 5, 'plm_mask_input': False, 'plm_permute_all': False, 'rtd': False, 'rtd_sample_from_batch': False, 'rtd_use_batch_interaction': False, 'rtd_discriminator_loss_weight': 50, 'rtd_generator_loss_weight': 1, 'rtd_tied_generator': False, 'd_model': 192, 'n_layer': 3, 'n_head': 16, 'layer_norm_eps': 1e-12, 'initializer_range': 0.02, 'hidden_act': 'gelu', 'dropout': 0.0, 'summary_type': 'last', 'num_hidden_groups': 1, 'inner_group_num': 1, 'eval_on_last_item_seq_only': True, 'train_on_last_item_seq_only': False, 'mf_constrained_embeddings': True, 'item_embedding_dim': 448, 'numeric_features_project_to_embedding_dim': 0, 'numeric_features_soft_one_hot_encoding_num_embeddings': 0, 'stochastic_shared_embeddings_replacement_prob': 0.1, 'softmax_temperature': 1.0, 'label_smoothing': 0.0, 
'embedding_dim_from_cardinality_multiplier': 2.0, 'item_id_embeddings_init_std': 0.11, 'other_embeddings_init_std': 0.02, 'layer_norm_featurewise': True, 'attn_type': 'bi', 'input_dropout': 0.1, 'loss_type': 'cross_entropy', 'similarity_type': 'concat_mlp', 'inp_merge': 'mlp', 'learning_rate_warmup_steps': 0, 'avg_session_length': None, 'output_dir': './tmp/', 'overwrite_output_dir': True, 'do_train': True, 'do_eval': True, 'do_predict': False, 'prediction_loss_only': False, 'per_device_train_batch_size': 128, 'per_device_eval_batch_size': 128, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'learning_rate': 0.0006667377132554976, 'weight_decay': 3.910060265627374e-05, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 1.0, 'max_steps': -1, 'lr_scheduler_type': 'linear', 'warmup_ratio': 0.0, 'warmup_steps': 0, 'log_level': -1, 'log_level_replica': -1, 'log_on_each_node': True, 'logging_dir': './tmp/runs/Apr05_04-08-00_da24a92e0a20', 'logging_first_step': False, 'logging_steps': 20, 'logging_nan_inf_filter': True, 'save_steps': 0, 'save_total_limit': None, 'save_on_each_node': False, 'no_cuda': False, 'seed': 100, 'fp16': True, 'fp16_opt_level': 'O1', 'fp16_backend': 'auto', 'fp16_full_eval': False, 'local_rank': -1, 'xpu_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': True, 'eval_steps': None, 'dataloader_num_workers': 0, 'past_index': -1, 'run_name': None, 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'sharded_ddp': [], 'deepspeed': None, 'label_smoothing_factor': 0.0, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': [], 'ddp_find_unused_parameters': None, 'dataloader_pin_memory': True, 
'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_token': None, 'gradient_checkpointing': False, 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': None, '_n_gpu': 1, 'mp_parameters': '', 'max_sequence_length': 20, 'shuffle_buffer_size': 0, 'data_loader_engine': 'merlin', 'eval_on_test_set': True, 'eval_steps_on_train_set': 20, 'predict_top_k': 0, 'learning_rate_num_cosine_cycles_by_epoch': 1.25, 'log_predictions': False, 'compute_metrics_each_n_steps': 1, 'experiments_group': 'default', 'session_seq_length_max': 20, 'learning_rate_schedule': 'linear_with_warmup', 'validate_every': 10}\n", + "[INFO|trainer.py:1196] 2023-04-05 04:08:03,669 >> ***** Running training *****\n", + "[INFO|trainer.py:1197] 2023-04-05 04:08:03,669 >> Num examples = 86528\n", + "[INFO|trainer.py:1198] 2023-04-05 04:08:03,669 >> Num Epochs = 1\n", + "[INFO|trainer.py:1199] 2023-04-05 04:08:03,669 >> Instantaneous batch size per device = 128\n", + "[INFO|trainer.py:1200] 2023-04-05 04:08:03,669 >> Total train batch size (w. 
parallel, distributed & accumulation) = 128\n", + "[INFO|trainer.py:1201] 2023-04-05 04:08:03,669 >> Gradient Accumulation steps = 1\n", + "[INFO|trainer.py:1202] 2023-04-05 04:08:03,669 >> Total optimization steps = 676\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "DLL 2023-04-05 04:08:03.174988 - PARAMETER data_path : /transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/ features_schema_path : /workspace/examples/t4rec_paper_experiments/datasets_configs/ecom_rees46/rees46_schema.pbtxt start_time_window_index : 1 final_time_window_index : 2 time_window_folder_pad_digits : 4 no_incremental_training : False training_time_window_size : 0 use_side_information_features : False input_features_aggregation : concat model_type : xlnet tf_out_activation : tanh mlm : True mlm_probability : 0.30000000000000004 plm : False plm_probability : 0.25 plm_max_span_length : 5 plm_mask_input : False plm_permute_all : False rtd : False rtd_sample_from_batch : False rtd_use_batch_interaction : False rtd_discriminator_loss_weight : 50 rtd_generator_loss_weight : 1 rtd_tied_generator : False d_model : 192 n_layer : 3 n_head : 16 layer_norm_eps : 1e-12 initializer_range : 0.02 hidden_act : gelu dropout : 0.0 summary_type : last num_hidden_groups : 1 inner_group_num : 1 eval_on_last_item_seq_only : True train_on_last_item_seq_only : False mf_constrained_embeddings : True item_embedding_dim : 448 numeric_features_project_to_embedding_dim : 0 numeric_features_soft_one_hot_encoding_num_embeddings : 0 stochastic_shared_embeddings_replacement_prob : 0.1 softmax_temperature : 1.0 label_smoothing : 0.0 embedding_dim_from_cardinality_multiplier : 2.0 item_id_embeddings_init_std : 0.11 other_embeddings_init_std : 0.02 layer_norm_featurewise : True attn_type : bi input_dropout : 0.1 loss_type : cross_entropy similarity_type : concat_mlp inp_merge : mlp learning_rate_warmup_steps : 0 avg_session_length : None output_dir : ./tmp/ overwrite_output_dir : True 
do_train : True do_eval : True do_predict : False prediction_loss_only : False per_device_train_batch_size : 128 per_device_eval_batch_size : 128 per_gpu_train_batch_size : None per_gpu_eval_batch_size : None gradient_accumulation_steps : 1 eval_accumulation_steps : None learning_rate : 0.0006667377132554976 weight_decay : 3.910060265627374e-05 adam_beta1 : 0.9 adam_beta2 : 0.999 adam_epsilon : 1e-08 max_grad_norm : 1.0 num_train_epochs : 1.0 max_steps : -1 lr_scheduler_type : linear warmup_ratio : 0.0 warmup_steps : 0 log_level : -1 log_level_replica : -1 log_on_each_node : True logging_dir : ./tmp/runs/Apr05_04-08-00_da24a92e0a20 logging_first_step : False logging_steps : 20 logging_nan_inf_filter : True save_steps : 0 save_total_limit : None save_on_each_node : False no_cuda : False seed : 100 fp16 : True fp16_opt_level : O1 fp16_backend : auto fp16_full_eval : False local_rank : -1 xpu_backend : None tpu_num_cores : None tpu_metrics_debug : False debug : [] dataloader_drop_last : True eval_steps : None dataloader_num_workers : 0 past_index : -1 run_name : None disable_tqdm : False remove_unused_columns : True label_names : None load_best_model_at_end : False metric_for_best_model : None greater_is_better : None ignore_data_skip : False sharded_ddp : [] deepspeed : None label_smoothing_factor : 0.0 adafactor : False group_by_length : False length_column_name : length report_to : [] ddp_find_unused_parameters : None dataloader_pin_memory : True skip_memory_metrics : True use_legacy_prediction_loop : False push_to_hub : False resume_from_checkpoint : None hub_model_id : None hub_token : None gradient_checkpointing : False push_to_hub_model_id : None push_to_hub_organization : None push_to_hub_token : None _n_gpu : 1 mp_parameters : max_sequence_length : 20 shuffle_buffer_size : 0 data_loader_engine : merlin eval_on_test_set : True eval_steps_on_train_set : 20 predict_top_k : 0 learning_rate_num_cosine_cycles_by_epoch : 1.25 log_predictions : False 
compute_metrics_each_n_steps : 1 experiments_group : default session_seq_length_max : 20 learning_rate_schedule : linear_with_warmup validate_every : 10 \n", + "\n", + "***** Launch training for day 1: *****\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\r", + " 0%| | 0/676 [00:00> \n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Training completed. Do not forget to share your model on huggingface.co/models =)\n", + "\n", + "\n", + "100%|██████████| 676/676 [00:53<00:00, 12.56it/s]\n", + "04/05/2023 04:08:57 - INFO - transformers4rec.torch.trainer - ***** Running Evaluation *****\n", + "04/05/2023 04:08:57 - INFO - transformers4rec.torch.trainer - Batch size = 128\n", + "04/05/2023 04:08:57 - INFO - transformers4rec.torch.trainer - Num sessions (examples) = 2560\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'loss': 12.9419, 'learning_rate': 0.0006479980438000916, 'epoch': 0.03}\n", + "{'loss': 12.5627, 'learning_rate': 0.0006282720759522958, 'epoch': 0.06}\n", + "{'loss': 11.7988, 'learning_rate': 0.0006085461081045, 'epoch': 0.09}\n", + "{'loss': 10.9136, 'learning_rate': 0.0005888201402567042, 'epoch': 0.12}\n", + "{'loss': 10.5825, 'learning_rate': 0.0005690941724089084, 'epoch': 0.15}\n", + "{'loss': 10.4064, 'learning_rate': 0.0005493682045611127, 'epoch': 0.18}\n", + "{'loss': 10.1992, 'learning_rate': 0.0005296422367133169, 'epoch': 0.21}\n", + "{'loss': 10.2454, 'learning_rate': 0.000509916268865521, 'epoch': 0.24}\n", + "{'loss': 10.1139, 'learning_rate': 0.0004901903010177253, 'epoch': 0.27}\n", + "{'loss': 10.0165, 'learning_rate': 0.0004704643331699295, 'epoch': 0.3}\n", + "{'loss': 10.0056, 'learning_rate': 0.00045073836532213366, 'epoch': 0.33}\n", + "{'loss': 9.8803, 'learning_rate': 0.00043101239747433793, 'epoch': 0.36}\n", + "{'loss': 9.7873, 'learning_rate': 0.00041128642962654215, 'epoch': 0.38}\n", + "{'loss': 9.9475, 'learning_rate': 
0.0003915604617787463, 'epoch': 0.41}\n", + "{'loss': 9.8843, 'learning_rate': 0.0003718344939309506, 'epoch': 0.44}\n", + "{'loss': 9.7393, 'learning_rate': 0.00035210852608315475, 'epoch': 0.47}\n", + "{'loss': 9.5825, 'learning_rate': 0.00033238255823535897, 'epoch': 0.5}\n", + "{'loss': 9.8305, 'learning_rate': 0.0003126565903875632, 'epoch': 0.53}\n", + "{'loss': 9.7408, 'learning_rate': 0.00029293062253976746, 'epoch': 0.56}\n", + "{'loss': 9.7161, 'learning_rate': 0.0002732046546919716, 'epoch': 0.59}\n", + "{'loss': 9.5964, 'learning_rate': 0.00025347868684417584, 'epoch': 0.62}\n", + "{'loss': 9.5593, 'learning_rate': 0.00023375271899638006, 'epoch': 0.65}\n", + "{'loss': 9.4851, 'learning_rate': 0.00021402675114858425, 'epoch': 0.68}\n", + "{'loss': 9.7007, 'learning_rate': 0.0001943007833007885, 'epoch': 0.71}\n", + "{'loss': 9.5252, 'learning_rate': 0.00017457481545299271, 'epoch': 0.74}\n", + "{'loss': 9.6155, 'learning_rate': 0.0001548488476051969, 'epoch': 0.77}\n", + "{'loss': 9.6275, 'learning_rate': 0.00013512287975740115, 'epoch': 0.8}\n", + "{'loss': 9.571, 'learning_rate': 0.00011539691190960534, 'epoch': 0.83}\n", + "{'loss': 9.4793, 'learning_rate': 9.567094406180957e-05, 'epoch': 0.86}\n", + "{'loss': 9.4686, 'learning_rate': 7.594497621401378e-05, 'epoch': 0.89}\n", + "{'loss': 9.524, 'learning_rate': 5.621900836621799e-05, 'epoch': 0.92}\n", + "{'loss': 9.5921, 'learning_rate': 3.6493040518422206e-05, 'epoch': 0.95}\n", + "{'loss': 9.5054, 'learning_rate': 1.6767072670626417e-05, 'epoch': 0.98}\n", + "{'train_runtime': 53.8312, 'train_samples_per_second': 0.019, 'train_steps_per_second': 12.558, 'train_loss': 10.050675363935662, 'epoch': 1.0}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 3%|▎ | 20/676 [00:00<00:16, 40.02it/s]\n", + "04/05/2023 04:08:58 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - ***** train results (time index): 2)*****\n", + "04/05/2023 04:08:58 - INFO - 
examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - train_/loss = 9.21147346496582\n", + "04/05/2023 04:08:58 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - train_/next-item/ndcg_at_10 = 0.03348138555884361\n", + "04/05/2023 04:08:58 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - train_/next-item/ndcg_at_20 = 0.04145380109548569\n", + "04/05/2023 04:08:58 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - train_/next-item/recall_at_10 = 0.064453125\n", + "04/05/2023 04:08:58 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - train_/next-item/recall_at_20 = 0.09609375149011612\n", + "04/05/2023 04:08:58 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - train_runtime = 0.6467\n", + "04/05/2023 04:08:58 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - train_samples_per_second = 3958.501\n", + "04/05/2023 04:08:58 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - train_steps_per_second = 30.926\n", + "04/05/2023 04:08:58 - INFO - transformers4rec.torch.trainer - ***** Running Evaluation *****\n", + "04/05/2023 04:08:58 - INFO - transformers4rec.torch.trainer - Batch size = 128\n", + "04/05/2023 04:08:58 - INFO - transformers4rec.torch.trainer - Num sessions (examples) = 10624\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "***** Evaluation results for day 2 (train set):*****\n", + "\n", + "{'train_/next-item/ndcg_at_10': 0.03348138555884361, 'train_/next-item/ndcg_at_20': 0.04145380109548569, 'train_/next-item/recall_at_10': 0.064453125, 'train_/next-item/recall_at_20': 0.09609375149011612, 'train_/loss': 9.21147346496582, 'train_runtime': 0.6467, 'train_samples_per_second': 3958.501, 'train_steps_per_second': 30.926}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 83/83 [00:02<00:00, 38.64it/s]\n", + "04/05/2023 
04:09:00 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - ***** eval results (time index): 2)*****\n", + "04/05/2023 04:09:00 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - eval_/loss = 9.303844451904297\n", + "04/05/2023 04:09:00 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - eval_/next-item/ndcg_at_10 = 0.03641688451170921\n", + "04/05/2023 04:09:00 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - eval_/next-item/ndcg_at_20 = 0.04447970166802406\n", + "04/05/2023 04:09:00 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - eval_/next-item/recall_at_10 = 0.07172439247369766\n", + "04/05/2023 04:09:00 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - eval_/next-item/recall_at_20 = 0.10363327711820602\n", + "04/05/2023 04:09:00 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - eval_runtime = 2.2272\n", + "04/05/2023 04:09:00 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - eval_samples_per_second = 4770.055\n", + "04/05/2023 04:09:00 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - eval_steps_per_second = 37.266\n", + "04/05/2023 04:09:01 - INFO - __main__ - Computing and logging AOT (Average Over Time) metrics\n", + "04/05/2023 04:09:01 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - ***** Eval results (avg over time) *****\n", + "04/05/2023 04:09:01 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - eval_/loss_AOT = 9.303844451904297\n", + "04/05/2023 04:09:01 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - eval_/next-item/ndcg_at_10_AOT = 0.03641688451170921\n", + "04/05/2023 04:09:01 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - eval_/next-item/ndcg_at_20_AOT = 0.04447970166802406\n", + "04/05/2023 04:09:01 - INFO - 
examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - eval_/next-item/recall_at_10_AOT = 0.07172439247369766\n", + "04/05/2023 04:09:01 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - eval_/next-item/recall_at_20_AOT = 0.10363327711820602\n", + "04/05/2023 04:09:01 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - eval_runtime_AOT = 2.2272\n", + "04/05/2023 04:09:01 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - eval_samples_per_second_AOT = 4770.055\n", + "04/05/2023 04:09:01 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - eval_steps_per_second_AOT = 37.266\n", + "04/05/2023 04:09:01 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - train_/loss_AOT = 9.21147346496582\n", + "04/05/2023 04:09:01 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - train_/next-item/ndcg_at_10_AOT = 0.03348138555884361\n", + "04/05/2023 04:09:01 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - train_/next-item/ndcg_at_20_AOT = 0.04145380109548569\n", + "04/05/2023 04:09:01 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - train_/next-item/recall_at_10_AOT = 0.064453125\n", + "04/05/2023 04:09:01 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - train_/next-item/recall_at_20_AOT = 0.09609375149011612\n", + "04/05/2023 04:09:01 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - train_runtime_AOT = 0.6467\n", + "04/05/2023 04:09:01 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - train_samples_per_second_AOT = 3958.501\n", + "04/05/2023 04:09:01 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - train_steps_per_second_AOT = 30.926\n", + "04/05/2023 04:09:01 - INFO - transformers4rec.torch.trainer - ***** Running Prediction *****\n", + "04/05/2023 04:09:01 - INFO - transformers4rec.torch.trainer - Batch size = 
128\n", + "04/05/2023 04:09:01 - INFO - transformers4rec.torch.trainer - Num sessions (examples) = 10752\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "***** Evaluation results for day 2 (eval set):*****\n", + "\n", + "{'eval_/next-item/ndcg_at_10': 0.03641688451170921, 'eval_/next-item/ndcg_at_20': 0.04447970166802406, 'eval_/next-item/recall_at_10': 0.07172439247369766, 'eval_/next-item/recall_at_20': 0.10363327711820602, 'eval_/loss': 9.303844451904297, 'eval_runtime': 2.2272, 'eval_samples_per_second': 4770.055, 'eval_steps_per_second': 37.266}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 99%|█████████▉| 83/84 [00:00<00:00, 84.22it/s]04/05/2023 04:09:02 - INFO - __main__ - Recall@10 of manually masked test data = 0.07175098739890916\n", + "100%|██████████| 84/84 [00:03<00:00, 22.48it/s]" + ] + } + ], + "source": [ + "%%bash\n", + "\n", + "FEATURE_SCHEMA_PATH=/workspace/examples/t4rec_paper_experiments/datasets_configs/ecom_rees46/rees46_schema.pbtxt\n", + "DATA_PATH=/transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/\n", + "NUM_EPOCHS=1 # all the models in the papers were trained for 5 epochs\n", + "\n", + "# UNCOMMENT THE MODEL YOU'D LIKE TO TRAIN AND EXPORT\n", + "\n", + "### GPT-2 (CLM) - Item Id feature\n", + "# python3 transf_exp_main_modified.py --output_dir ./tmp/ --overwrite_output_dir --do_train --do_eval --validate_every 10 --logging_steps 20 --save_steps 0 --data_path $DATA_PATH --features_schema_path $FEATURE_SCHEMA_PATH --fp16 --data_loader_engine merlin --start_time_window_index 1 --final_time_window_index 2 --time_window_folder_pad_digits 4 --model_type gpt2 --loss_type cross_entropy --per_device_eval_batch_size 128 --similarity_type concat_mlp --tf_out_activation tanh --inp_merge mlp --learning_rate_warmup_steps 0 --learning_rate_schedule linear_with_warmup --hidden_act gelu --num_train_epochs $NUM_EPOCHS --dataloader_drop_last --compute_metrics_each_n_steps 
1 --session_seq_length_max 20 --eval_on_last_item_seq_only --mf_constrained_embeddings --layer_norm_featurewise --per_device_train_batch_size 128 --learning_rate 0.0008781937894379981 --dropout 0.2 --input_dropout 0.4 --weight_decay 1.4901138106122045e-05 --d_model 128 --item_embedding_dim 448 --n_layer 1 --n_head 1 --label_smoothing 0.9 --stochastic_shared_embeddings_replacement_prob 0.0 --item_id_embeddings_init_std 0.03 --other_embeddings_init_std 0.034999999999999996 --eval_on_test_set --seed 100 --report_to none\n", + "\n", + "### Transformer-XL (CLM) - Item Id feature\n", + "# python3 transf_exp_main_modified.py --output_dir ./tmp/ --overwrite_output_dir --do_train --do_eval --validate_every 10 --logging_steps 20 --save_steps 0 --data_path $DATA_PATH --features_schema_path $FEATURE_SCHEMA_PATH --fp16 --data_loader_engine merlin --start_time_window_index 1 --final_time_window_index 2 --time_window_folder_pad_digits 4 --model_type transfoxl --loss_type cross_entropy --per_device_eval_batch_size 128 --similarity_type concat_mlp --tf_out_activation tanh --inp_merge mlp --learning_rate_warmup_steps 0 --learning_rate_schedule linear_with_warmup --hidden_act gelu --num_train_epochs $NUM_EPOCHS --dataloader_drop_last --compute_metrics_each_n_steps 1 --session_seq_length_max 20 --eval_on_last_item_seq_only --mf_constrained_embeddings --layer_norm_featurewise --per_device_train_batch_size 128 --learning_rate 0.001007765821083962 --dropout 0.1 --input_dropout 0.30000000000000004 --weight_decay 1.0673054163921092e-06 --d_model 448 --item_embedding_dim 320 --n_layer 1 --n_head 1 --label_smoothing 0.2 --stochastic_shared_embeddings_replacement_prob 0.02 --item_id_embeddings_init_std 0.15 --other_embeddings_init_std 0.01 --eval_on_test_set --seed 100 --report_to none\n", + "\n", + "### BERT (MLM) - Item Id feature\n", + "# python3 transf_exp_main_modified.py --output_dir ./tmp/ --overwrite_output_dir --do_train --do_eval --validate_every 10 --logging_steps 20 --save_steps 0 
--data_path $DATA_PATH --features_schema_path $FEATURE_SCHEMA_PATH --fp16 --data_loader_engine merlin --start_time_window_index 1 --final_time_window_index 2 --time_window_folder_pad_digits 4 --model_type albert --loss_type cross_entropy --per_device_eval_batch_size 128 --similarity_type concat_mlp --tf_out_activation tanh --inp_merge mlp --learning_rate_warmup_steps 0 --learning_rate_schedule linear_with_warmup --hidden_act gelu --num_train_epochs $NUM_EPOCHS --dataloader_drop_last --compute_metrics_each_n_steps 1 --session_seq_length_max 20 --eval_on_last_item_seq_only --mf_constrained_embeddings --layer_norm_featurewise --mlm --num_hidden_groups -1 --inner_group_num 1 --per_device_train_batch_size 128 --learning_rate 0.0004904752786458524 --dropout 0.0 --input_dropout 0.1 --weight_decay 9.565968888623912e-05 --d_model 320 --item_embedding_dim 320 --n_layer 2 --n_head 8 --label_smoothing 0.2 --stochastic_shared_embeddings_replacement_prob 0.06 --item_id_embeddings_init_std 0.11 --other_embeddings_init_std 0.025 --mlm_probability 0.6000000000000001 --eval_on_test_set --seed 100 --report_to none\n", + "\n", + "### XLNet (PLM) - Item Id feature\n", + "# python3 transf_exp_main_modified.py --output_dir ./tmp/ --overwrite_output_dir --do_train --do_eval --validate_every 10 --logging_steps 20 --save_steps 0 --data_path $DATA_PATH --features_schema_path $FEATURE_SCHEMA_PATH --fp16 --data_loader_engine merlin --start_time_window_index 1 --final_time_window_index 2 --time_window_folder_pad_digits 4 --model_type xlnet --loss_type cross_entropy --per_device_eval_batch_size 128 --similarity_type concat_mlp --tf_out_activation tanh --inp_merge mlp --learning_rate_warmup_steps 0 --learning_rate_schedule linear_with_warmup --hidden_act gelu --num_train_epochs $NUM_EPOCHS --dataloader_drop_last --compute_metrics_each_n_steps 1 --session_seq_length_max 20 --eval_on_last_item_seq_only --mf_constrained_embeddings --layer_norm_featurewise --attn_type bi --plm 
--per_device_train_batch_size 128 --learning_rate 0.0003387925502203725 --dropout 0.0 --input_dropout 0.2 --weight_decay 2.1769664191492473e-05 --d_model 384 --item_embedding_dim 384 --n_layer 4 --n_head 16 --label_smoothing 0.7000000000000001 --stochastic_shared_embeddings_replacement_prob 0.02 --item_id_embeddings_init_std 0.13 --other_embeddings_init_std 0.005 --plm_probability 0.5 --plm_max_span_length 3 --eval_on_test_set --seed 100 --report_to none\n", + "\n", + "### XLNet (MLM) - Item Id feature\n", + "python3 transf_exp_main_modified.py --output_dir ./tmp/ --overwrite_output_dir --do_train --do_eval --validate_every 10 --logging_steps 20 --save_steps 0 --data_path $DATA_PATH --features_schema_path $FEATURE_SCHEMA_PATH --fp16 --data_loader_engine merlin --start_time_window_index 1 --final_time_window_index 2 --time_window_folder_pad_digits 4 --model_type xlnet --loss_type cross_entropy --per_device_eval_batch_size 128 --similarity_type concat_mlp --tf_out_activation tanh --inp_merge mlp --learning_rate_warmup_steps 0 --learning_rate_schedule linear_with_warmup --hidden_act gelu --num_train_epochs $NUM_EPOCHS --dataloader_drop_last --compute_metrics_each_n_steps 1 --session_seq_length_max 20 --eval_on_last_item_seq_only --mf_constrained_embeddings --layer_norm_featurewise --attn_type bi --mlm --per_device_train_batch_size 128 --learning_rate 0.0006667377132554976 --dropout 0.0 --input_dropout 0.1 --weight_decay 3.910060265627374e-05 --d_model 192 --item_embedding_dim 448 --n_layer 3 --n_head 16 --label_smoothing 0.0 --stochastic_shared_embeddings_replacement_prob 0.1 --item_id_embeddings_init_std 0.11 --other_embeddings_init_std 0.02 --mlm_probability 0.30000000000000004 --eval_on_test_set --seed 100 --report_to none\n", + "\n", + "### XLNET (MLM) - CONCAT + SOFT ONE-HOT ENCODING - All features\n", + "# python3 transf_exp_main_modified.py --output_dir ./tmp/ --overwrite_output_dir --do_train --do_eval --validate_every 10 --logging_steps 20 --save_steps 0 
--data_path $DATA_PATH --features_schema_path $FEATURE_SCHEMA_PATH --fp16 --data_loader_engine merlin --start_time_window_index 1 --final_time_window_index 2 --time_window_folder_pad_digits 4 --model_type xlnet --loss_type cross_entropy --per_device_eval_batch_size 128 --similarity_type concat_mlp --tf_out_activation tanh --inp_merge mlp --learning_rate_warmup_steps 0 --learning_rate_schedule linear_with_warmup --hidden_act gelu --num_train_epochs $NUM_EPOCHS --dataloader_drop_last --compute_metrics_each_n_steps 1 --session_seq_length_max 20 --eval_on_last_item_seq_only --mf_constrained_embeddings --layer_norm_featurewise --attn_type bi --mlm --input_features_aggregation concat --per_device_train_batch_size 128 --learning_rate 0.00034029107417129616 --dropout 0.0 --input_dropout 0.1 --weight_decay 3.168336235732841e-05 --d_model 448 --item_embedding_dim 384 --n_layer 2 --n_head 8 --label_smoothing 0.6000000000000001 --stochastic_shared_embeddings_replacement_prob 0.0 --item_id_embeddings_init_std 0.06999999999999999 --other_embeddings_init_std 0.085 --mlm_probability 0.30000000000000004 --embedding_dim_from_cardinality_multiplier 1.0 --numeric_features_project_to_embedding_dim 20 --numeric_features_soft_one_hot_encoding_num_embeddings 5 --eval_on_test_set --seed 100 --use_side_information_features --report_to none" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4d9b7394", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "f0af515e", + "metadata": {}, + "outputs": [], + "source": [ + "rm -rf /workspace/models_for_benchmarking/t4r_pytorch /workspace/models_for_benchmarking/t4r_pytorch_nvt" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "c65f9a1f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Overwriting /workspace/models_for_benchmarking/t4r_pytorch_pt/config.pbtxt\n" + ] + } + ], + "source": [ + 
"%%writefile /workspace/models_for_benchmarking/t4r_pytorch_pt/config.pbtxt\n", + "\n", + "name: \"t4r_pytorch_pt\"\n", + "input {\n", + " name: \"sess_pid_seq__values\"\n", + " data_type: TYPE_INT64\n", + " dims: -1\n", + " dims: 1\n", + "}\n", + "input {\n", + " name: \"sess_pid_seq__nnzs\"\n", + " data_type: TYPE_INT64\n", + " dims: -1\n", + " dims: 1\n", + "}\n", + "output {\n", + " name: \"output\"\n", + " data_type: TYPE_FP32\n", + " dims: -1\n", + " dims: 20\n", + "}\n", + "backend: \"python\"" + ] + }, + { + "cell_type": "markdown", + "id": "9964a2cc", + "metadata": {}, + "source": [ + "For running on the CPU" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "d61fe61b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Overwriting /workspace/models_for_benchmarking/t4r_pytorch_pt/config.pbtxt\n" + ] + } + ], + "source": [ + "%%writefile /workspace/models_for_benchmarking/t4r_pytorch_pt/config.pbtxt\n", + "\n", + "name: \"t4r_pytorch_pt\"\n", + "instance_group [\n", + " {\n", + " count: 1\n", + " kind: KIND_CPU\n", + " }\n", + "]\n", + "input {\n", + " name: \"sess_pid_seq__values\"\n", + " data_type: TYPE_INT64\n", + " dims: -1\n", + " dims: 1\n", + "}\n", + "input {\n", + " name: \"sess_pid_seq__nnzs\"\n", + " data_type: TYPE_INT64\n", + " dims: -1\n", + " dims: 1\n", + "}\n", + "output {\n", + " name: \"output\"\n", + " data_type: TYPE_FP32\n", + " dims: -1\n", + " dims: 20\n", + "}\n", + "backend: \"python\"" + ] + }, + { + "cell_type": "markdown", + "id": "63ba822c", + "metadata": {}, + "source": [ + "You can control whether you would like to run on the GPU or the CPU by setting the environment variable `HAS_GPU` to either 0 or 1." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "b3a346b9", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Overwriting /workspace/models_for_benchmarking/t4r_pytorch_pt/1/model.py\n" + ] + } + ], + "source": [ + "%%writefile /workspace/models_for_benchmarking/t4r_pytorch_pt/1/model.py\n", + "\n", + "# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.\n", + "#\n", + "# Redistribution and use in source and binary forms, with or without\n", + "# modification, are permitted provided that the following conditions\n", + "# are met:\n", + "# * Redistributions of source code must retain the above copyright\n", + "# notice, this list of conditions and the following disclaimer.\n", + "# * Redistributions in binary form must reproduce the above copyright\n", + "# notice, this list of conditions and the following disclaimer in the\n", + "# documentation and/or other materials provided with the distribution.\n", + "# * Neither the name of NVIDIA CORPORATION nor the names of its\n", + "# contributors may be used to endorse or promote products derived\n", + "# from this software without specific prior written permission.\n", + "#\n", + "# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY\n", + "# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n", + "# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR\n", + "# PURPOSE ARE DISCLAIMED. 
import json
import logging
import pathlib

import cloudpickle
import pickle
import io
import os
import torch
import triton_python_backend_utils as pb_utils

from nvtabular.inference.triton import _convert_string2pytorch_dtype, _convert_tensor

LOG = logging.getLogger("nvtabular")

# NVTabular appends these markers to the names of the two tensors that make up
# a sparse (ragged) input: "<name>__values" and "<name>__nnzs".
sparse_value_marker = "__values"
sparse_nnzs_marker = "__nnzs"

# Run on the GPU or the CPU depending on the HAS_GPU environment variable
# ("1" selects the GPU). Default to GPU when unset instead of raising KeyError.
# NOTE(review): the previous `from merlin.core.dispatch import HAS_GPU` was
# dropped because this assignment always shadowed it anyway.
HAS_GPU = os.environ.get("HAS_GPU", "1") == "1"


class CPU_Unpickler(pickle.Unpickler):
    """Unpickler that remaps CUDA-saved torch storages onto the CPU.

    torch serializes tensor storages through ``torch.storage._load_from_bytes``;
    intercepting that one symbol lets us reload a GPU-pickled model with
    ``map_location="cpu"``. Every other symbol resolves normally.
    """

    def find_class(self, module, name):
        # (debug print removed: it invoked super().find_class twice per lookup)
        if module == 'torch.storage' and name == '_load_from_bytes':
            return lambda b: torch.load(io.BytesIO(b), map_location=torch.device('cpu'))
        return super().find_class(module, name)


class TritonPythonModel:
    """Generic TritonPythonModel for nvtabular workflows"""

    def initialize(self, args):
        """Load the pickled PyTorch model, its weights, its config and the
        optional Transformer4Rec side info, then index the declared inputs
        as dense or sparse."""
        # Arg parsing
        repository_path = pathlib.Path(args["model_repository"])
        model_version = str(args["model_version"])

        # Handle bug in Tritonserver 22.06:
        # model_repository argument became path to model.py
        if str(repository_path).endswith(".py"):
            repository_path = repository_path.parent.parent

        model_path = repository_path / model_version / "model.pkl"

        # Load the pickled PyTorch model. In CPU mode the unpickler remaps
        # CUDA storages to the CPU; the trained weights then come from model.pth.
        if HAS_GPU:
            self.model = cloudpickle.load(
                open(str(model_path), "rb")  # pylint: disable=consider-using-with
            )
            model_path = repository_path / model_version / "model.pth"
            self.model.load_state_dict(torch.load(str(model_path)))
        else:
            self.model = CPU_Unpickler(open(str(model_path), "rb")).load()
            model_path = repository_path / model_version / "model.pth"
            self.model.load_state_dict(torch.load(str(model_path), map_location='cpu'))

        self.model.eval()

        # Load model config file
        self.model_config = json.loads(args["model_config"])

        # Load extra info needed for the Transformer4Rec (if exists)
        model_info_path = repository_path / model_version / "model_info.json"
        self.model_info = None
        if pathlib.Path(model_info_path).exists():
            with open(str(model_info_path), encoding="utf-8") as json_file:
                self.model_info = json.load(json_file)

        # Classify the config's inputs as dense or sparse based on the
        # NVTabular "__values" / "__nnzs" name markers. "__nnzs" entries are
        # the implicit companion of their "__values" tensor and are therefore
        # not tracked on their own.
        self.inputs = {}
        self.sparse_inputs = {}
        self.outputs = {}
        len_svm = len(sparse_value_marker)
        len_snm = len(sparse_nnzs_marker)

        for val in self.model_config["input"]:
            name = val["name"]
            dtype = _convert_string2pytorch_dtype(val["data_type"])
            if len(name) > len_svm and name.endswith(sparse_value_marker):
                self.sparse_inputs[name[: len(name) - len_svm]] = dtype
            elif len(name) > len_snm and name.endswith(sparse_nnzs_marker):
                pass  # handled together with its "__values" counterpart
            else:
                self.inputs[name] = dtype

        for val in self.model_config["output"]:
            self.outputs[val["name"]] = _convert_string2pytorch_dtype(val["data_type"])

    def execute(self, requests):
        """Predicts the input batches by running through a PyTorch predict function.

        The PyTorch model must accept a dict input and return the prediction
        tensor; a None result is treated as an error.
        """
        with torch.no_grad():
            responses = []
            for request in requests:
                # Convert the input data to dict to pass it into the PyTorch model
                input_dict = {}
                for name, dtype in self.inputs.items():
                    # Convert to fixed dtypes if requested (guarded: model_info
                    # may be absent, and the key may be missing from it).
                    if self.model_info and self.model_info.get("use_fix_dtypes"):
                        dtype = _convert_dtype(dtype)
                    tensor = torch.tensor(
                        _convert_tensor(pb_utils.get_input_tensor_by_name(request, name)),
                        dtype=dtype,
                    )
                    # Move to the GPU only when one is in use; the previous
                    # unconditional .cuda() crashed in CPU mode.
                    input_dict[name] = tensor.cuda() if HAS_GPU else tensor

                # Sparse inputs have a special format: a __values/__nnzs pair
                for name, dtype in self.sparse_inputs.items():
                    input_val = _convert_tensor(
                        pb_utils.get_input_tensor_by_name(request, name + sparse_value_marker)
                    )
                    input_nnzs = _convert_tensor(
                        pb_utils.get_input_tensor_by_name(request, name + sparse_nnzs_marker)
                    )
                    input_nnzs = torch.tensor(input_nnzs, dtype=torch.int64)
                    input_values = torch.tensor(input_val, dtype=dtype)

                    # Densify only when the model was exported with a fixed
                    # sequence length for this feature (model_info["sparse_max"]).
                    sparse_to_dense = False
                    seq_limit = 0
                    if self.model_info is not None:
                        if self.model_info["sparse_max"].get(name) is not None:
                            sparse_to_dense = True
                            seq_limit = self.model_info["sparse_max"][name]

                    if seq_limit == 0:
                        seq_limit = int(input_nnzs.max())

                    input_dict[name] = _build_sparse_tensor(
                        input_values, input_nnzs, seq_limit, sparse_to_dense
                    )

                # Call forward function to get the predictions
                pred = self.model(input_dict, training=False)
                if pred is None:
                    raise KeyError(
                        "output of the forward function should have a bucket named as predictions"
                    )

                # Placeholder for benchmarking: return only the top-20 item
                # indices, matching the "dims: 20" output in config.pbtxt.
                pred_numpy = (torch.topk(pred.detach(), 20).indices).cpu().numpy()
                # There is one output in the config file
                # since the PyTorch models generate a tensor as an output
                output_info = self.model_config["output"][0]
                output_tensor = pb_utils.Tensor(output_info["name"], pred_numpy)
                responses.append(pb_utils.InferenceResponse([output_tensor]))

        return responses
torch.tensor(input_val, dtype=dtype)\n", + "\n", + " # Get the PyTorch sparse_coo_tensor\n", + " sparse_to_dense = False\n", + " seq_limit = 0\n", + " if self.model_info is not None:\n", + " if self.model_info[\"sparse_max\"].get(name) is not None:\n", + " sparse_to_dense = True\n", + " seq_limit = self.model_info[\"sparse_max\"][name]\n", + "\n", + " if seq_limit == 0:\n", + " seq_limit = int(input_nnzs.max())\n", + "\n", + " input_dict[name] = _build_sparse_tensor(\n", + " input_values, input_nnzs, seq_limit, sparse_to_dense\n", + " )\n", + "\n", + " # Call forward function to get the predictions\n", + " # Forward function should return a dict with the \"predictions\" bucket\n", + " pred = self.model(input_dict, training=False)\n", + " if pred is None:\n", + " raise KeyError(\n", + " \"output of the forward function should have a bucket named as predictions\"\n", + " )\n", + "\n", + "\t\t\t\t#place holder for testing. \n", + " pred_numpy = (torch.topk(pred.detach(),20).indices).cpu().numpy()\n", + " # There is one output in the config file\n", + " # since the PyTorch models generate a tensor as an output\n", + " output_info = self.model_config[\"output\"][0]\n", + " output_tensor = pb_utils.Tensor(output_info[\"name\"], pred_numpy)\n", + " responses.append(pb_utils.InferenceResponse([output_tensor]))\n", + " \n", + " # pred_numpy = pred.cpu().detach().numpy()\n", + "\n", + " # There is one output in the config file\n", + " # since the PyTorch models generate a tensor as an output\n", + " # output_info = self.model_config[\"output\"][0]\n", + " # output_tensor = pb_utils.Tensor(output_info[\"name\"], pred_numpy)\n", + " # responses.append(pb_utils.InferenceResponse([output_tensor]))\n", + "\n", + " return responses\n", + "\n", + "\n", + "def _get_indices(nnzs, device=\"cuda\"):\n", + " \"\"\"Calculate indices for the PyTorch sparse_coo_tensor\"\"\"\n", + " nnzs = nnzs[:, 0]\n", + " row_ids = torch.arange(len(nnzs)-1)\n", + " offsets = nnzs[1:]\n", + " offsets[1:] 
= offsets[1:] - offsets[:-1]\n", + " row_ids_repeated = torch.repeat_interleave(row_ids, offsets)\n", + " offsets_cols = nnzs[:-1]\n", + " offsets_cols = torch.repeat_interleave(offsets_cols.cumsum(0), offsets)\n", + " col_ids = torch.arange(len(row_ids_repeated)) - offsets_cols\n", + " indices = torch.cat([row_ids_repeated.unsqueeze(-1), col_ids.unsqueeze(-1)], axis=1)\n", + " return indices.T\n", + "\n", + " offsets = torch.cat((torch.tensor([1]), nnzs), 0)\n", + " offsets = offsets.cumsum(0)\n", + " row_ids = torch.arange(len(offsets) - 1)\n", + " row_ids_repeated = torch.repeat_interleave(row_ids, nnzs)\n", + " row_offset_repeated = torch.repeat_interleave(offsets[:-1], nnzs)\n", + " col_ids = torch.arange(len(row_offset_repeated)) - row_offset_repeated + 1\n", + " indices = torch.cat([row_ids_repeated.unsqueeze(-1), col_ids.unsqueeze(-1)], axis=1)\n", + " return indices.T\n", + "\n", + "\n", + "def _get_sparse_tensor(values, indices, num_rows, seq_limit, sparse_as_dense, device=\"cuda\"):\n", + " \"\"\"Creates the PyTorch sparse_coo_tensor\"\"\"\n", + " \n", + " if HAS_GPU:\n", + " device='cuda'\n", + " else:\n", + " device='cpu'\n", + " \n", + " sparse_tensor = torch.sparse_coo_tensor(\n", + " indices, values.squeeze(), torch.Size([num_rows-1, seq_limit]), device=device\n", + " )\n", + " if sparse_as_dense:\n", + " sparse_tensor = sparse_tensor.to_dense()\n", + " return sparse_tensor\n", + "\n", + "\n", + "def _build_sparse_tensor(values, nnzs, seq_limit, sparse_as_dense, device=\"cuda\"):\n", + " \"\"\"Builds PyTorch sparse_coo_tensor by converting the __values and __nnzs inputs\"\"\"\n", + " indices = _get_indices(nnzs, device)\n", + " num_rows = len(nnzs)\n", + " return _get_sparse_tensor(values, indices, num_rows, seq_limit, sparse_as_dense, device)\n", + "\n", + "\n", + "def _convert_dtype(dtype):\n", + " \"\"\"Transformer4Rec uses these fixed dtypes and this function converts the original dtype\n", + " to this fixed dtypes\"\"\"\n", + " if dtype in 
[torch.float64, torch.float32, torch.float16]:\n", + " return torch.float32\n", + " if dtype in [\n", + " torch.int64,\n", + " torch.int32,\n", + " torch.int16,\n", + " torch.int8,\n", + " torch.uint8,\n", + " ]:\n", + " return torch.long\n", + "\n", + " raise ValueError(f\"Can't convert dtype {dtype})\")\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/train_runs/xlnet_clm_item_id.ipynb b/train_runs/xlnet_clm_item_id.ipynb new file mode 100644 index 0000000000..d3ae60679d --- /dev/null +++ b/train_runs/xlnet_clm_item_id.ipynb @@ -0,0 +1,439 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 3, + "id": "2ce2e001", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "03/10/2023 07:14:19 - WARNING - __main__ - Process rank: -1, device: cuda:0, n_gpu: 1, distributed training: False, 16-bits training: True\n", + "03/10/2023 07:14:21 - WARNING - transformers4rec - Projecting inputs of NextItemPredictionTask to'448' As weight tying requires the input dimension '192' to be equal to the item-id embedding dimension '448'\n", + "[INFO|trainer.py:434] 2023-03-10 07:14:21,395 >> Using amp fp16 backend\n", + "03/10/2023 07:14:21 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - Training, Model and Data parameters {'data_path': '/transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/', 'features_schema_path': '/workspace/examples/t4rec_paper_experiments/datasets_configs/ecom_rees46/rees46_schema.pbtxt', 'start_time_window_index': 1, 'final_time_window_index': 2, 
'time_window_folder_pad_digits': 4, 'no_incremental_training': False, 'training_time_window_size': 0, 'use_side_information_features': False, 'input_features_aggregation': 'concat', 'model_type': 'xlnet', 'tf_out_activation': 'tanh', 'mlm': False, 'mlm_probability': 0.15, 'plm': False, 'plm_probability': 0.25, 'plm_max_span_length': 5, 'plm_mask_input': False, 'plm_permute_all': False, 'rtd': False, 'rtd_sample_from_batch': False, 'rtd_use_batch_interaction': False, 'rtd_discriminator_loss_weight': 50, 'rtd_generator_loss_weight': 1, 'rtd_tied_generator': False, 'd_model': 192, 'n_layer': 3, 'n_head': 16, 'layer_norm_eps': 1e-12, 'initializer_range': 0.02, 'hidden_act': 'gelu', 'dropout': 0.0, 'summary_type': 'last', 'num_hidden_groups': 1, 'inner_group_num': 1, 'eval_on_last_item_seq_only': True, 'train_on_last_item_seq_only': False, 'mf_constrained_embeddings': True, 'item_embedding_dim': 448, 'numeric_features_project_to_embedding_dim': 0, 'numeric_features_soft_one_hot_encoding_num_embeddings': 0, 'stochastic_shared_embeddings_replacement_prob': 0.1, 'softmax_temperature': 1.0, 'label_smoothing': 0.0, 'embedding_dim_from_cardinality_multiplier': 2.0, 'item_id_embeddings_init_std': 0.11, 'other_embeddings_init_std': 0.02, 'layer_norm_featurewise': True, 'attn_type': 'bi', 'input_dropout': 0.1, 'loss_type': 'cross_entropy', 'similarity_type': 'concat_mlp', 'inp_merge': 'mlp', 'learning_rate_warmup_steps': 0, 'avg_session_length': None, 'output_dir': './tmp/', 'overwrite_output_dir': True, 'do_train': True, 'do_eval': True, 'do_predict': False, 'prediction_loss_only': False, 'per_device_train_batch_size': 128, 'per_device_eval_batch_size': 128, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'learning_rate': 0.0006667377132554976, 'weight_decay': 3.910060265627374e-05, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 5.0, 
'max_steps': -1, 'lr_scheduler_type': 'linear', 'warmup_ratio': 0.0, 'warmup_steps': 0, 'log_level': -1, 'log_level_replica': -1, 'log_on_each_node': True, 'logging_dir': './tmp/runs/Mar10_07-14-18_7dfa224f788e', 'logging_first_step': False, 'logging_steps': 500, 'logging_nan_inf_filter': True, 'save_steps': 0, 'save_total_limit': None, 'save_on_each_node': False, 'no_cuda': False, 'seed': 100, 'fp16': True, 'fp16_opt_level': 'O1', 'fp16_backend': 'auto', 'fp16_full_eval': False, 'local_rank': -1, 'xpu_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': True, 'eval_steps': None, 'dataloader_num_workers': 0, 'past_index': -1, 'run_name': None, 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'sharded_ddp': [], 'deepspeed': None, 'label_smoothing_factor': 0.0, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': [], 'ddp_find_unused_parameters': None, 'dataloader_pin_memory': True, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_token': None, 'gradient_checkpointing': False, 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': None, '_n_gpu': 1, 'mp_parameters': '', 'max_sequence_length': 20, 'shuffle_buffer_size': 0, 'data_loader_engine': 'merlin', 'eval_on_test_set': True, 'eval_steps_on_train_set': 20, 'predict_top_k': 0, 'learning_rate_num_cosine_cycles_by_epoch': 1.25, 'log_predictions': False, 'compute_metrics_each_n_steps': 1, 'experiments_group': 'default', 'session_seq_length_max': 20, 'learning_rate_schedule': 'linear_with_warmup', 'validate_every': -1}\n", + "[INFO|trainer.py:1196] 2023-03-10 07:14:21,846 >> ***** Running training *****\n", + "[INFO|trainer.py:1197] 2023-03-10 07:14:21,846 >> Num 
examples = 86528\n", + "[INFO|trainer.py:1198] 2023-03-10 07:14:21,846 >> Num Epochs = 5\n", + "[INFO|trainer.py:1199] 2023-03-10 07:14:21,846 >> Instantaneous batch size per device = 128\n", + "[INFO|trainer.py:1200] 2023-03-10 07:14:21,846 >> Total train batch size (w. parallel, distributed & accumulation) = 128\n", + "[INFO|trainer.py:1201] 2023-03-10 07:14:21,846 >> Gradient Accumulation steps = 1\n", + "[INFO|trainer.py:1202] 2023-03-10 07:14:21,846 >> Total optimization steps = 3380\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "DLL 2023-03-10 07:14:21.396153 - PARAMETER data_path : /transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/ features_schema_path : /workspace/examples/t4rec_paper_experiments/datasets_configs/ecom_rees46/rees46_schema.pbtxt start_time_window_index : 1 final_time_window_index : 2 time_window_folder_pad_digits : 4 no_incremental_training : False training_time_window_size : 0 use_side_information_features : False input_features_aggregation : concat model_type : xlnet tf_out_activation : tanh mlm : False mlm_probability : 0.15 plm : False plm_probability : 0.25 plm_max_span_length : 5 plm_mask_input : False plm_permute_all : False rtd : False rtd_sample_from_batch : False rtd_use_batch_interaction : False rtd_discriminator_loss_weight : 50 rtd_generator_loss_weight : 1 rtd_tied_generator : False d_model : 192 n_layer : 3 n_head : 16 layer_norm_eps : 1e-12 initializer_range : 0.02 hidden_act : gelu dropout : 0.0 summary_type : last num_hidden_groups : 1 inner_group_num : 1 eval_on_last_item_seq_only : True train_on_last_item_seq_only : False mf_constrained_embeddings : True item_embedding_dim : 448 numeric_features_project_to_embedding_dim : 0 numeric_features_soft_one_hot_encoding_num_embeddings : 0 stochastic_shared_embeddings_replacement_prob : 0.1 softmax_temperature : 1.0 label_smoothing : 0.0 embedding_dim_from_cardinality_multiplier : 2.0 item_id_embeddings_init_std : 0.11 
other_embeddings_init_std : 0.02 layer_norm_featurewise : True attn_type : bi input_dropout : 0.1 loss_type : cross_entropy similarity_type : concat_mlp inp_merge : mlp learning_rate_warmup_steps : 0 avg_session_length : None output_dir : ./tmp/ overwrite_output_dir : True do_train : True do_eval : True do_predict : False prediction_loss_only : False per_device_train_batch_size : 128 per_device_eval_batch_size : 128 per_gpu_train_batch_size : None per_gpu_eval_batch_size : None gradient_accumulation_steps : 1 eval_accumulation_steps : None learning_rate : 0.0006667377132554976 weight_decay : 3.910060265627374e-05 adam_beta1 : 0.9 adam_beta2 : 0.999 adam_epsilon : 1e-08 max_grad_norm : 1.0 num_train_epochs : 5.0 max_steps : -1 lr_scheduler_type : linear warmup_ratio : 0.0 warmup_steps : 0 log_level : -1 log_level_replica : -1 log_on_each_node : True logging_dir : ./tmp/runs/Mar10_07-14-18_7dfa224f788e logging_first_step : False logging_steps : 500 logging_nan_inf_filter : True save_steps : 0 save_total_limit : None save_on_each_node : False no_cuda : False seed : 100 fp16 : True fp16_opt_level : O1 fp16_backend : auto fp16_full_eval : False local_rank : -1 xpu_backend : None tpu_num_cores : None tpu_metrics_debug : False debug : [] dataloader_drop_last : True eval_steps : None dataloader_num_workers : 0 past_index : -1 run_name : None disable_tqdm : False remove_unused_columns : True label_names : None load_best_model_at_end : False metric_for_best_model : None greater_is_better : None ignore_data_skip : False sharded_ddp : [] deepspeed : None label_smoothing_factor : 0.0 adafactor : False group_by_length : False length_column_name : length report_to : [] ddp_find_unused_parameters : None dataloader_pin_memory : True skip_memory_metrics : True use_legacy_prediction_loop : False push_to_hub : False resume_from_checkpoint : None hub_model_id : None hub_token : None gradient_checkpointing : False push_to_hub_model_id : None push_to_hub_organization : None 
push_to_hub_token : None _n_gpu : 1 mp_parameters : max_sequence_length : 20 shuffle_buffer_size : 0 data_loader_engine : merlin eval_on_test_set : True eval_steps_on_train_set : 20 predict_top_k : 0 learning_rate_num_cosine_cycles_by_epoch : 1.25 log_predictions : False compute_metrics_each_n_steps : 1 experiments_group : default session_seq_length_max : 20 learning_rate_schedule : linear_with_warmup validate_every : -1 \n", + "\n", + "***** Launch training for day 1: *****\n", + "{'loss': 6.6475, 'learning_rate': 0.0005681078740165187, 'epoch': 0.74}\n", + "{'loss': 2.4218, 'learning_rate': 0.00046947803477753974, 'epoch': 1.48}\n", + "{'loss': 1.9739, 'learning_rate': 0.0003708481955385607, 'epoch': 2.22}\n", + "{'loss': 1.858, 'learning_rate': 0.0002722183562995818, 'epoch': 2.96}\n", + "{'loss': 1.7681, 'learning_rate': 0.0001735885170606029, 'epoch': 3.7}\n", + "{'loss': 1.7082, 'learning_rate': 7.4958677821624e-05, 'epoch': 4.44}\n", + "{'train_runtime': 337.7129, 'train_samples_per_second': 0.015, 'train_steps_per_second': 10.009, 'train_loss': 2.6110303890070266, 'epoch': 5.0}\n", + "\n", + "***** Evaluation results for day 2 (train set):*****\n", + "\n", + "{'train_/next-item/ndcg_at_10': 0.11942513287067413, 'train_/next-item/ndcg_at_20': 0.132638081908226, 'train_/next-item/recall_at_10': 0.19570313394069672, 'train_/next-item/recall_at_20': 0.24843750894069672, 'train_/loss': 7.65301513671875, 'train_runtime': 0.6484, 'train_samples_per_second': 3948.127, 'train_steps_per_second': 30.845}\n", + "\n", + "***** Evaluation results for day 2 (eval set):*****\n", + "\n", + "{'eval_/next-item/ndcg_at_10': 0.08762501925230026, 'eval_/next-item/ndcg_at_20': 0.09899389743804932, 'eval_/next-item/recall_at_10': 0.14928463101387024, 'eval_/next-item/recall_at_20': 0.1945594847202301, 'eval_/loss': 8.972926139831543, 'eval_runtime': 2.286, 'eval_samples_per_second': 4647.323, 'eval_steps_per_second': 36.307}\n" + ] + }, + { + "name": "stderr", + "output_type": 
"stream", + "text": [ + "Task exception was never retrieved\n", + "future: ._handle_stream() done, defined at /usr/local/lib/python3.8/dist-packages/IPython/core/magics/script.py:211> exception=ValueError('Separator is not found, and chunk exceed the limit')>\n", + "Traceback (most recent call last):\n", + " File \"/usr/lib/python3.8/asyncio/streams.py\", line 540, in readline\n", + " line = await self.readuntil(sep)\n", + " File \"/usr/lib/python3.8/asyncio/streams.py\", line 618, in readuntil\n", + " raise exceptions.LimitOverrunError(\n", + "asyncio.exceptions.LimitOverrunError: Separator is not found, and chunk exceed the limit\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/usr/local/lib/python3.8/dist-packages/IPython/core/magics/script.py\", line 213, in _handle_stream\n", + " line = (await stream.readline()).decode(\"utf8\")\n", + " File \"/usr/lib/python3.8/asyncio/streams.py\", line 549, in readline\n", + " raise ValueError(e.args[0])\n", + "ValueError: Separator is not found, and chunk exceed the limit\n" + ] + } + ], + "source": [ + "%%bash\n", + "\n", + "FEATURE_SCHEMA_PATH=/workspace/examples/t4rec_paper_experiments/datasets_configs/ecom_rees46/rees46_schema.pbtxt\n", + "DATA_PATH=/transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/\n", + "NUM_EPOCHS=5\n", + "\n", + "python3 ../transf_exp_main_modified.py --output_dir ./tmp/ --overwrite_output_dir --do_train --do_eval --save_steps 0 --data_path $DATA_PATH --features_schema_path $FEATURE_SCHEMA_PATH --fp16 --data_loader_engine merlin --start_time_window_index 1 --final_time_window_index 2 --time_window_folder_pad_digits 4 --model_type xlnet --loss_type cross_entropy --per_device_eval_batch_size 128 --similarity_type concat_mlp --tf_out_activation tanh --inp_merge mlp --learning_rate_warmup_steps 0 --learning_rate_schedule linear_with_warmup --hidden_act gelu --num_train_epochs $NUM_EPOCHS 
--dataloader_drop_last --compute_metrics_each_n_steps 1 --session_seq_length_max 20 --eval_on_last_item_seq_only --mf_constrained_embeddings --layer_norm_featurewise --attn_type bi --per_device_train_batch_size 128 --learning_rate 0.0006667377132554976 --dropout 0.0 --input_dropout 0.1 --weight_decay 3.910060265627374e-05 --d_model 192 --item_embedding_dim 448 --n_layer 3 --n_head 16 --label_smoothing 0.0 --stochastic_shared_embeddings_replacement_prob 0.1 --item_id_embeddings_init_std 0.11 --other_embeddings_init_std 0.02 --eval_on_test_set --seed 100 --report_to none\n", + "exit 0" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "770b3d58", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "03/10/2023 07:20:26 - WARNING - __main__ - Process rank: -1, device: cuda:0, n_gpu: 1, distributed training: False, 16-bits training: True\n", + "03/10/2023 07:20:27 - WARNING - transformers4rec - Projecting inputs of NextItemPredictionTask to'448' As weight tying requires the input dimension '192' to be equal to the item-id embedding dimension '448'\n", + "[INFO|trainer.py:434] 2023-03-10 07:20:28,053 >> Using amp fp16 backend\n", + "03/10/2023 07:20:28 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - Training, Model and Data parameters {'data_path': '/transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/', 'features_schema_path': '/workspace/examples/t4rec_paper_experiments/datasets_configs/ecom_rees46/rees46_schema.pbtxt', 'start_time_window_index': 1, 'final_time_window_index': 2, 'time_window_folder_pad_digits': 4, 'no_incremental_training': False, 'training_time_window_size': 0, 'use_side_information_features': False, 'input_features_aggregation': 'concat', 'model_type': 'xlnet', 'tf_out_activation': 'tanh', 'mlm': False, 'mlm_probability': 0.15, 'plm': False, 'plm_probability': 0.25, 'plm_max_span_length': 5, 'plm_mask_input': False, 'plm_permute_all': False, 'rtd': 
False, 'rtd_sample_from_batch': False, 'rtd_use_batch_interaction': False, 'rtd_discriminator_loss_weight': 50, 'rtd_generator_loss_weight': 1, 'rtd_tied_generator': False, 'd_model': 192, 'n_layer': 3, 'n_head': 16, 'layer_norm_eps': 1e-12, 'initializer_range': 0.02, 'hidden_act': 'gelu', 'dropout': 0.0, 'summary_type': 'last', 'num_hidden_groups': 1, 'inner_group_num': 1, 'eval_on_last_item_seq_only': True, 'train_on_last_item_seq_only': False, 'mf_constrained_embeddings': True, 'item_embedding_dim': 448, 'numeric_features_project_to_embedding_dim': 0, 'numeric_features_soft_one_hot_encoding_num_embeddings': 0, 'stochastic_shared_embeddings_replacement_prob': 0.1, 'softmax_temperature': 1.0, 'label_smoothing': 0.0, 'embedding_dim_from_cardinality_multiplier': 2.0, 'item_id_embeddings_init_std': 0.11, 'other_embeddings_init_std': 0.02, 'layer_norm_featurewise': True, 'attn_type': 'bi', 'input_dropout': 0.1, 'loss_type': 'cross_entropy', 'similarity_type': 'concat_mlp', 'inp_merge': 'mlp', 'learning_rate_warmup_steps': 0, 'avg_session_length': None, 'output_dir': './tmp/', 'overwrite_output_dir': True, 'do_train': True, 'do_eval': True, 'do_predict': False, 'prediction_loss_only': False, 'per_device_train_batch_size': 128, 'per_device_eval_batch_size': 128, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'learning_rate': 0.0006667377132554976, 'weight_decay': 3.910060265627374e-05, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 5.0, 'max_steps': -1, 'lr_scheduler_type': 'linear', 'warmup_ratio': 0.0, 'warmup_steps': 0, 'log_level': -1, 'log_level_replica': -1, 'log_on_each_node': True, 'logging_dir': './tmp/runs/Mar10_07-20-25_7dfa224f788e', 'logging_first_step': False, 'logging_steps': 500, 'logging_nan_inf_filter': True, 'save_steps': 0, 'save_total_limit': None, 'save_on_each_node': False, 'no_cuda': False, 'seed': 0, 
'fp16': True, 'fp16_opt_level': 'O1', 'fp16_backend': 'auto', 'fp16_full_eval': False, 'local_rank': -1, 'xpu_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': True, 'eval_steps': None, 'dataloader_num_workers': 0, 'past_index': -1, 'run_name': None, 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'sharded_ddp': [], 'deepspeed': None, 'label_smoothing_factor': 0.0, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': [], 'ddp_find_unused_parameters': None, 'dataloader_pin_memory': True, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_token': None, 'gradient_checkpointing': False, 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': None, '_n_gpu': 1, 'mp_parameters': '', 'max_sequence_length': 20, 'shuffle_buffer_size': 0, 'data_loader_engine': 'merlin', 'eval_on_test_set': True, 'eval_steps_on_train_set': 20, 'predict_top_k': 0, 'learning_rate_num_cosine_cycles_by_epoch': 1.25, 'log_predictions': False, 'compute_metrics_each_n_steps': 1, 'experiments_group': 'default', 'session_seq_length_max': 20, 'learning_rate_schedule': 'linear_with_warmup', 'validate_every': -1}\n", + "[INFO|trainer.py:1196] 2023-03-10 07:20:28,547 >> ***** Running training *****\n", + "[INFO|trainer.py:1197] 2023-03-10 07:20:28,547 >> Num examples = 86528\n", + "[INFO|trainer.py:1198] 2023-03-10 07:20:28,547 >> Num Epochs = 5\n", + "[INFO|trainer.py:1199] 2023-03-10 07:20:28,547 >> Instantaneous batch size per device = 128\n", + "[INFO|trainer.py:1200] 2023-03-10 07:20:28,547 >> Total train batch size (w. 
parallel, distributed & accumulation) = 128\n", + "[INFO|trainer.py:1201] 2023-03-10 07:20:28,547 >> Gradient Accumulation steps = 1\n", + "[INFO|trainer.py:1202] 2023-03-10 07:20:28,547 >> Total optimization steps = 3380\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "DLL 2023-03-10 07:20:28.053983 - PARAMETER data_path : /transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/ features_schema_path : /workspace/examples/t4rec_paper_experiments/datasets_configs/ecom_rees46/rees46_schema.pbtxt start_time_window_index : 1 final_time_window_index : 2 time_window_folder_pad_digits : 4 no_incremental_training : False training_time_window_size : 0 use_side_information_features : False input_features_aggregation : concat model_type : xlnet tf_out_activation : tanh mlm : False mlm_probability : 0.15 plm : False plm_probability : 0.25 plm_max_span_length : 5 plm_mask_input : False plm_permute_all : False rtd : False rtd_sample_from_batch : False rtd_use_batch_interaction : False rtd_discriminator_loss_weight : 50 rtd_generator_loss_weight : 1 rtd_tied_generator : False d_model : 192 n_layer : 3 n_head : 16 layer_norm_eps : 1e-12 initializer_range : 0.02 hidden_act : gelu dropout : 0.0 summary_type : last num_hidden_groups : 1 inner_group_num : 1 eval_on_last_item_seq_only : True train_on_last_item_seq_only : False mf_constrained_embeddings : True item_embedding_dim : 448 numeric_features_project_to_embedding_dim : 0 numeric_features_soft_one_hot_encoding_num_embeddings : 0 stochastic_shared_embeddings_replacement_prob : 0.1 softmax_temperature : 1.0 label_smoothing : 0.0 embedding_dim_from_cardinality_multiplier : 2.0 item_id_embeddings_init_std : 0.11 other_embeddings_init_std : 0.02 layer_norm_featurewise : True attn_type : bi input_dropout : 0.1 loss_type : cross_entropy similarity_type : concat_mlp inp_merge : mlp learning_rate_warmup_steps : 0 avg_session_length : None output_dir : ./tmp/ overwrite_output_dir : True do_train : 
True do_eval : True do_predict : False prediction_loss_only : False per_device_train_batch_size : 128 per_device_eval_batch_size : 128 per_gpu_train_batch_size : None per_gpu_eval_batch_size : None gradient_accumulation_steps : 1 eval_accumulation_steps : None learning_rate : 0.0006667377132554976 weight_decay : 3.910060265627374e-05 adam_beta1 : 0.9 adam_beta2 : 0.999 adam_epsilon : 1e-08 max_grad_norm : 1.0 num_train_epochs : 5.0 max_steps : -1 lr_scheduler_type : linear warmup_ratio : 0.0 warmup_steps : 0 log_level : -1 log_level_replica : -1 log_on_each_node : True logging_dir : ./tmp/runs/Mar10_07-20-25_7dfa224f788e logging_first_step : False logging_steps : 500 logging_nan_inf_filter : True save_steps : 0 save_total_limit : None save_on_each_node : False no_cuda : False seed : 0 fp16 : True fp16_opt_level : O1 fp16_backend : auto fp16_full_eval : False local_rank : -1 xpu_backend : None tpu_num_cores : None tpu_metrics_debug : False debug : [] dataloader_drop_last : True eval_steps : None dataloader_num_workers : 0 past_index : -1 run_name : None disable_tqdm : False remove_unused_columns : True label_names : None load_best_model_at_end : False metric_for_best_model : None greater_is_better : None ignore_data_skip : False sharded_ddp : [] deepspeed : None label_smoothing_factor : 0.0 adafactor : False group_by_length : False length_column_name : length report_to : [] ddp_find_unused_parameters : None dataloader_pin_memory : True skip_memory_metrics : True use_legacy_prediction_loop : False push_to_hub : False resume_from_checkpoint : None hub_model_id : None hub_token : None gradient_checkpointing : False push_to_hub_model_id : None push_to_hub_organization : None push_to_hub_token : None _n_gpu : 1 mp_parameters : max_sequence_length : 20 shuffle_buffer_size : 0 data_loader_engine : merlin eval_on_test_set : True eval_steps_on_train_set : 20 predict_top_k : 0 learning_rate_num_cosine_cycles_by_epoch : 1.25 log_predictions : False compute_metrics_each_n_steps 
: 1 experiments_group : default session_seq_length_max : 20 learning_rate_schedule : linear_with_warmup validate_every : -1 \n", + "\n", + "***** Launch training for day 1: *****\n", + "{'loss': 6.6784, 'learning_rate': 0.0005681078740165187, 'epoch': 0.74}\n", + "{'loss': 2.4454, 'learning_rate': 0.00046947803477753974, 'epoch': 1.48}\n", + "{'loss': 1.9754, 'learning_rate': 0.0003708481955385607, 'epoch': 2.22}\n", + "{'loss': 1.8708, 'learning_rate': 0.0002722183562995818, 'epoch': 2.96}\n", + "{'loss': 1.7958, 'learning_rate': 0.0001735885170606029, 'epoch': 3.7}\n", + "{'loss': 1.7419, 'learning_rate': 7.4958677821624e-05, 'epoch': 4.44}\n", + "{'train_runtime': 337.8916, 'train_samples_per_second': 0.015, 'train_steps_per_second': 10.003, 'train_loss': 2.6348056228908563, 'epoch': 5.0}\n", + "\n", + "***** Evaluation results for day 2 (train set):*****\n", + "\n", + "{'train_/next-item/ndcg_at_10': 0.12325046211481094, 'train_/next-item/ndcg_at_20': 0.137022003531456, 'train_/next-item/recall_at_10': 0.19960938394069672, 'train_/next-item/recall_at_20': 0.25390625, 'train_/loss': 7.624682426452637, 'train_runtime': 0.6451, 'train_samples_per_second': 3968.659, 'train_steps_per_second': 31.005}\n", + "\n", + "***** Evaluation results for day 2 (eval set):*****\n", + "\n", + "{'eval_/next-item/ndcg_at_10': 0.08722994476556778, 'eval_/next-item/ndcg_at_20': 0.0991109311580658, 'eval_/next-item/recall_at_10': 0.145143061876297, 'eval_/next-item/recall_at_20': 0.1927710771560669, 'eval_/loss': 9.022594451904297, 'eval_runtime': 2.2602, 'eval_samples_per_second': 4700.388, 'eval_steps_per_second': 36.722}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Task exception was never retrieved\n", + "future: ._handle_stream() done, defined at /usr/local/lib/python3.8/dist-packages/IPython/core/magics/script.py:211> exception=ValueError('Separator is not found, and chunk exceed the limit')>\n", + "Traceback (most recent call last):\n", + " 
File \"/usr/lib/python3.8/asyncio/streams.py\", line 540, in readline\n", + " line = await self.readuntil(sep)\n", + " File \"/usr/lib/python3.8/asyncio/streams.py\", line 618, in readuntil\n", + " raise exceptions.LimitOverrunError(\n", + "asyncio.exceptions.LimitOverrunError: Separator is not found, and chunk exceed the limit\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/usr/local/lib/python3.8/dist-packages/IPython/core/magics/script.py\", line 213, in _handle_stream\n", + " line = (await stream.readline()).decode(\"utf8\")\n", + " File \"/usr/lib/python3.8/asyncio/streams.py\", line 549, in readline\n", + " raise ValueError(e.args[0])\n", + "ValueError: Separator is not found, and chunk exceed the limit\n" + ] + } + ], + "source": [ + "%%bash\n", + "\n", + "FEATURE_SCHEMA_PATH=/workspace/examples/t4rec_paper_experiments/datasets_configs/ecom_rees46/rees46_schema.pbtxt\n", + "DATA_PATH=/transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/\n", + "NUM_EPOCHS=5\n", + "SEED=0\n", + "\n", + "python3 ../transf_exp_main_modified.py --output_dir ./tmp/ --overwrite_output_dir --do_train --do_eval --save_steps 0 --data_path $DATA_PATH --features_schema_path $FEATURE_SCHEMA_PATH --fp16 --data_loader_engine merlin --start_time_window_index 1 --final_time_window_index 2 --time_window_folder_pad_digits 4 --model_type xlnet --loss_type cross_entropy --per_device_eval_batch_size 128 --similarity_type concat_mlp --tf_out_activation tanh --inp_merge mlp --learning_rate_warmup_steps 0 --learning_rate_schedule linear_with_warmup --hidden_act gelu --num_train_epochs $NUM_EPOCHS --dataloader_drop_last --compute_metrics_each_n_steps 1 --session_seq_length_max 20 --eval_on_last_item_seq_only --mf_constrained_embeddings --layer_norm_featurewise --attn_type bi --per_device_train_batch_size 128 --learning_rate 0.0006667377132554976 --dropout 0.0 --input_dropout 0.1 
--weight_decay 3.910060265627374e-05 --d_model 192 --item_embedding_dim 448 --n_layer 3 --n_head 16 --label_smoothing 0.0 --stochastic_shared_embeddings_replacement_prob 0.1 --item_id_embeddings_init_std 0.11 --other_embeddings_init_std 0.02 --eval_on_test_set --seed $SEED --report_to none\n", + "exit 0" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "32e29315", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "03/10/2023 07:26:15 - WARNING - __main__ - Process rank: -1, device: cuda:0, n_gpu: 1, distributed training: False, 16-bits training: True\n", + "03/10/2023 07:26:17 - WARNING - transformers4rec - Projecting inputs of NextItemPredictionTask to'448' As weight tying requires the input dimension '192' to be equal to the item-id embedding dimension '448'\n", + "[INFO|trainer.py:434] 2023-03-10 07:26:17,384 >> Using amp fp16 backend\n", + "03/10/2023 07:26:17 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - Training, Model and Data parameters {'data_path': '/transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/', 'features_schema_path': '/workspace/examples/t4rec_paper_experiments/datasets_configs/ecom_rees46/rees46_schema.pbtxt', 'start_time_window_index': 1, 'final_time_window_index': 2, 'time_window_folder_pad_digits': 4, 'no_incremental_training': False, 'training_time_window_size': 0, 'use_side_information_features': False, 'input_features_aggregation': 'concat', 'model_type': 'xlnet', 'tf_out_activation': 'tanh', 'mlm': False, 'mlm_probability': 0.15, 'plm': False, 'plm_probability': 0.25, 'plm_max_span_length': 5, 'plm_mask_input': False, 'plm_permute_all': False, 'rtd': False, 'rtd_sample_from_batch': False, 'rtd_use_batch_interaction': False, 'rtd_discriminator_loss_weight': 50, 'rtd_generator_loss_weight': 1, 'rtd_tied_generator': False, 'd_model': 192, 'n_layer': 3, 'n_head': 16, 'layer_norm_eps': 1e-12, 'initializer_range': 0.02, 'hidden_act': 
'gelu', 'dropout': 0.0, 'summary_type': 'last', 'num_hidden_groups': 1, 'inner_group_num': 1, 'eval_on_last_item_seq_only': True, 'train_on_last_item_seq_only': False, 'mf_constrained_embeddings': True, 'item_embedding_dim': 448, 'numeric_features_project_to_embedding_dim': 0, 'numeric_features_soft_one_hot_encoding_num_embeddings': 0, 'stochastic_shared_embeddings_replacement_prob': 0.1, 'softmax_temperature': 1.0, 'label_smoothing': 0.0, 'embedding_dim_from_cardinality_multiplier': 2.0, 'item_id_embeddings_init_std': 0.11, 'other_embeddings_init_std': 0.02, 'layer_norm_featurewise': True, 'attn_type': 'bi', 'input_dropout': 0.1, 'loss_type': 'cross_entropy', 'similarity_type': 'concat_mlp', 'inp_merge': 'mlp', 'learning_rate_warmup_steps': 0, 'avg_session_length': None, 'output_dir': './tmp/', 'overwrite_output_dir': True, 'do_train': True, 'do_eval': True, 'do_predict': False, 'prediction_loss_only': False, 'per_device_train_batch_size': 128, 'per_device_eval_batch_size': 128, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'learning_rate': 0.0006667377132554976, 'weight_decay': 3.910060265627374e-05, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 5.0, 'max_steps': -1, 'lr_scheduler_type': 'linear', 'warmup_ratio': 0.0, 'warmup_steps': 0, 'log_level': -1, 'log_level_replica': -1, 'log_on_each_node': True, 'logging_dir': './tmp/runs/Mar10_07-26-14_7dfa224f788e', 'logging_first_step': False, 'logging_steps': 500, 'logging_nan_inf_filter': True, 'save_steps': 0, 'save_total_limit': None, 'save_on_each_node': False, 'no_cuda': False, 'seed': 1, 'fp16': True, 'fp16_opt_level': 'O1', 'fp16_backend': 'auto', 'fp16_full_eval': False, 'local_rank': -1, 'xpu_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': True, 'eval_steps': None, 'dataloader_num_workers': 0, 'past_index': 
-1, 'run_name': None, 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'sharded_ddp': [], 'deepspeed': None, 'label_smoothing_factor': 0.0, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': [], 'ddp_find_unused_parameters': None, 'dataloader_pin_memory': True, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_token': None, 'gradient_checkpointing': False, 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': None, '_n_gpu': 1, 'mp_parameters': '', 'max_sequence_length': 20, 'shuffle_buffer_size': 0, 'data_loader_engine': 'merlin', 'eval_on_test_set': True, 'eval_steps_on_train_set': 20, 'predict_top_k': 0, 'learning_rate_num_cosine_cycles_by_epoch': 1.25, 'log_predictions': False, 'compute_metrics_each_n_steps': 1, 'experiments_group': 'default', 'session_seq_length_max': 20, 'learning_rate_schedule': 'linear_with_warmup', 'validate_every': -1}\n", + "[INFO|trainer.py:1196] 2023-03-10 07:26:17,872 >> ***** Running training *****\n", + "[INFO|trainer.py:1197] 2023-03-10 07:26:17,872 >> Num examples = 86528\n", + "[INFO|trainer.py:1198] 2023-03-10 07:26:17,872 >> Num Epochs = 5\n", + "[INFO|trainer.py:1199] 2023-03-10 07:26:17,872 >> Instantaneous batch size per device = 128\n", + "[INFO|trainer.py:1200] 2023-03-10 07:26:17,872 >> Total train batch size (w. 
parallel, distributed & accumulation) = 128\n", + "[INFO|trainer.py:1201] 2023-03-10 07:26:17,872 >> Gradient Accumulation steps = 1\n", + "[INFO|trainer.py:1202] 2023-03-10 07:26:17,872 >> Total optimization steps = 3380\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "DLL 2023-03-10 07:26:17.385530 - PARAMETER data_path : /transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/ features_schema_path : /workspace/examples/t4rec_paper_experiments/datasets_configs/ecom_rees46/rees46_schema.pbtxt start_time_window_index : 1 final_time_window_index : 2 time_window_folder_pad_digits : 4 no_incremental_training : False training_time_window_size : 0 use_side_information_features : False input_features_aggregation : concat model_type : xlnet tf_out_activation : tanh mlm : False mlm_probability : 0.15 plm : False plm_probability : 0.25 plm_max_span_length : 5 plm_mask_input : False plm_permute_all : False rtd : False rtd_sample_from_batch : False rtd_use_batch_interaction : False rtd_discriminator_loss_weight : 50 rtd_generator_loss_weight : 1 rtd_tied_generator : False d_model : 192 n_layer : 3 n_head : 16 layer_norm_eps : 1e-12 initializer_range : 0.02 hidden_act : gelu dropout : 0.0 summary_type : last num_hidden_groups : 1 inner_group_num : 1 eval_on_last_item_seq_only : True train_on_last_item_seq_only : False mf_constrained_embeddings : True item_embedding_dim : 448 numeric_features_project_to_embedding_dim : 0 numeric_features_soft_one_hot_encoding_num_embeddings : 0 stochastic_shared_embeddings_replacement_prob : 0.1 softmax_temperature : 1.0 label_smoothing : 0.0 embedding_dim_from_cardinality_multiplier : 2.0 item_id_embeddings_init_std : 0.11 other_embeddings_init_std : 0.02 layer_norm_featurewise : True attn_type : bi input_dropout : 0.1 loss_type : cross_entropy similarity_type : concat_mlp inp_merge : mlp learning_rate_warmup_steps : 0 avg_session_length : None output_dir : ./tmp/ overwrite_output_dir : True do_train : 
True do_eval : True do_predict : False prediction_loss_only : False per_device_train_batch_size : 128 per_device_eval_batch_size : 128 per_gpu_train_batch_size : None per_gpu_eval_batch_size : None gradient_accumulation_steps : 1 eval_accumulation_steps : None learning_rate : 0.0006667377132554976 weight_decay : 3.910060265627374e-05 adam_beta1 : 0.9 adam_beta2 : 0.999 adam_epsilon : 1e-08 max_grad_norm : 1.0 num_train_epochs : 5.0 max_steps : -1 lr_scheduler_type : linear warmup_ratio : 0.0 warmup_steps : 0 log_level : -1 log_level_replica : -1 log_on_each_node : True logging_dir : ./tmp/runs/Mar10_07-26-14_7dfa224f788e logging_first_step : False logging_steps : 500 logging_nan_inf_filter : True save_steps : 0 save_total_limit : None save_on_each_node : False no_cuda : False seed : 1 fp16 : True fp16_opt_level : O1 fp16_backend : auto fp16_full_eval : False local_rank : -1 xpu_backend : None tpu_num_cores : None tpu_metrics_debug : False debug : [] dataloader_drop_last : True eval_steps : None dataloader_num_workers : 0 past_index : -1 run_name : None disable_tqdm : False remove_unused_columns : True label_names : None load_best_model_at_end : False metric_for_best_model : None greater_is_better : None ignore_data_skip : False sharded_ddp : [] deepspeed : None label_smoothing_factor : 0.0 adafactor : False group_by_length : False length_column_name : length report_to : [] ddp_find_unused_parameters : None dataloader_pin_memory : True skip_memory_metrics : True use_legacy_prediction_loop : False push_to_hub : False resume_from_checkpoint : None hub_model_id : None hub_token : None gradient_checkpointing : False push_to_hub_model_id : None push_to_hub_organization : None push_to_hub_token : None _n_gpu : 1 mp_parameters : max_sequence_length : 20 shuffle_buffer_size : 0 data_loader_engine : merlin eval_on_test_set : True eval_steps_on_train_set : 20 predict_top_k : 0 learning_rate_num_cosine_cycles_by_epoch : 1.25 log_predictions : False compute_metrics_each_n_steps 
: 1 experiments_group : default session_seq_length_max : 20 learning_rate_schedule : linear_with_warmup validate_every : -1 \n", + "\n", + "***** Launch training for day 1: *****\n", + "{'loss': 6.6644, 'learning_rate': 0.0005681078740165187, 'epoch': 0.74}\n", + "{'loss': 2.3778, 'learning_rate': 0.00046947803477753974, 'epoch': 1.48}\n", + "{'loss': 1.9486, 'learning_rate': 0.0003708481955385607, 'epoch': 2.22}\n", + "{'loss': 1.8619, 'learning_rate': 0.0002722183562995818, 'epoch': 2.96}\n", + "{'loss': 1.7841, 'learning_rate': 0.0001735885170606029, 'epoch': 3.7}\n", + "{'loss': 1.7368, 'learning_rate': 7.4958677821624e-05, 'epoch': 4.44}\n", + "{'train_runtime': 337.8499, 'train_samples_per_second': 0.015, 'train_steps_per_second': 10.004, 'train_loss': 2.613156272391596, 'epoch': 5.0}\n", + "\n", + "***** Evaluation results for day 2 (train set):*****\n", + "\n", + "{'train_/next-item/ndcg_at_10': 0.12271016836166382, 'train_/next-item/ndcg_at_20': 0.1380811482667923, 'train_/next-item/recall_at_10': 0.19843749701976776, 'train_/next-item/recall_at_20': 0.25859376788139343, 'train_/loss': 7.584864616394043, 'train_runtime': 0.6478, 'train_samples_per_second': 3951.826, 'train_steps_per_second': 30.874}\n", + "\n", + "***** Evaluation results for day 2 (eval set):*****\n", + "\n", + "{'eval_/next-item/ndcg_at_10': 0.08986585587263107, 'eval_/next-item/ndcg_at_20': 0.10167953372001648, 'eval_/next-item/recall_at_10': 0.14956700801849365, 'eval_/next-item/recall_at_20': 0.19663026928901672, 'eval_/loss': 9.000876426696777, 'eval_runtime': 2.2765, 'eval_samples_per_second': 4666.864, 'eval_steps_per_second': 36.46}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Task exception was never retrieved\n", + "future: ._handle_stream() done, defined at /usr/local/lib/python3.8/dist-packages/IPython/core/magics/script.py:211> exception=ValueError('Separator is not found, and chunk exceed the limit')>\n", + "Traceback (most recent call 
last):\n", + " File \"/usr/lib/python3.8/asyncio/streams.py\", line 540, in readline\n", + " line = await self.readuntil(sep)\n", + " File \"/usr/lib/python3.8/asyncio/streams.py\", line 618, in readuntil\n", + " raise exceptions.LimitOverrunError(\n", + "asyncio.exceptions.LimitOverrunError: Separator is not found, and chunk exceed the limit\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/usr/local/lib/python3.8/dist-packages/IPython/core/magics/script.py\", line 213, in _handle_stream\n", + " line = (await stream.readline()).decode(\"utf8\")\n", + " File \"/usr/lib/python3.8/asyncio/streams.py\", line 549, in readline\n", + " raise ValueError(e.args[0])\n", + "ValueError: Separator is not found, and chunk exceed the limit\n" + ] + } + ], + "source": [ + "%%bash\n", + "\n", + "FEATURE_SCHEMA_PATH=/workspace/examples/t4rec_paper_experiments/datasets_configs/ecom_rees46/rees46_schema.pbtxt\n", + "DATA_PATH=/transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/\n", + "NUM_EPOCHS=5\n", + "SEED=1\n", + "\n", + "python3 ../transf_exp_main_modified.py --output_dir ./tmp/ --overwrite_output_dir --do_train --do_eval --save_steps 0 --data_path $DATA_PATH --features_schema_path $FEATURE_SCHEMA_PATH --fp16 --data_loader_engine merlin --start_time_window_index 1 --final_time_window_index 2 --time_window_folder_pad_digits 4 --model_type xlnet --loss_type cross_entropy --per_device_eval_batch_size 128 --similarity_type concat_mlp --tf_out_activation tanh --inp_merge mlp --learning_rate_warmup_steps 0 --learning_rate_schedule linear_with_warmup --hidden_act gelu --num_train_epochs $NUM_EPOCHS --dataloader_drop_last --compute_metrics_each_n_steps 1 --session_seq_length_max 20 --eval_on_last_item_seq_only --mf_constrained_embeddings --layer_norm_featurewise --attn_type bi --per_device_train_batch_size 128 --learning_rate 0.0006667377132554976 --dropout 0.0 
--input_dropout 0.1 --weight_decay 3.910060265627374e-05 --d_model 192 --item_embedding_dim 448 --n_layer 3 --n_head 16 --label_smoothing 0.0 --stochastic_shared_embeddings_replacement_prob 0.1 --item_id_embeddings_init_std 0.11 --other_embeddings_init_std 0.02 --eval_on_test_set --seed $SEED --report_to none\n", + "exit 0" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "4b87e4f5", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "03/10/2023 07:32:05 - WARNING - __main__ - Process rank: -1, device: cuda:0, n_gpu: 1, distributed training: False, 16-bits training: True\n", + "03/10/2023 07:32:06 - WARNING - transformers4rec - Projecting inputs of NextItemPredictionTask to'448' As weight tying requires the input dimension '192' to be equal to the item-id embedding dimension '448'\n", + "[INFO|trainer.py:434] 2023-03-10 07:32:06,807 >> Using amp fp16 backend\n", + "03/10/2023 07:32:06 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - Training, Model and Data parameters {'data_path': '/transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/', 'features_schema_path': '/workspace/examples/t4rec_paper_experiments/datasets_configs/ecom_rees46/rees46_schema.pbtxt', 'start_time_window_index': 1, 'final_time_window_index': 2, 'time_window_folder_pad_digits': 4, 'no_incremental_training': False, 'training_time_window_size': 0, 'use_side_information_features': False, 'input_features_aggregation': 'concat', 'model_type': 'xlnet', 'tf_out_activation': 'tanh', 'mlm': False, 'mlm_probability': 0.15, 'plm': False, 'plm_probability': 0.25, 'plm_max_span_length': 5, 'plm_mask_input': False, 'plm_permute_all': False, 'rtd': False, 'rtd_sample_from_batch': False, 'rtd_use_batch_interaction': False, 'rtd_discriminator_loss_weight': 50, 'rtd_generator_loss_weight': 1, 'rtd_tied_generator': False, 'd_model': 192, 'n_layer': 3, 'n_head': 16, 'layer_norm_eps': 1e-12, 'initializer_range': 
0.02, 'hidden_act': 'gelu', 'dropout': 0.0, 'summary_type': 'last', 'num_hidden_groups': 1, 'inner_group_num': 1, 'eval_on_last_item_seq_only': True, 'train_on_last_item_seq_only': False, 'mf_constrained_embeddings': True, 'item_embedding_dim': 448, 'numeric_features_project_to_embedding_dim': 0, 'numeric_features_soft_one_hot_encoding_num_embeddings': 0, 'stochastic_shared_embeddings_replacement_prob': 0.1, 'softmax_temperature': 1.0, 'label_smoothing': 0.0, 'embedding_dim_from_cardinality_multiplier': 2.0, 'item_id_embeddings_init_std': 0.11, 'other_embeddings_init_std': 0.02, 'layer_norm_featurewise': True, 'attn_type': 'bi', 'input_dropout': 0.1, 'loss_type': 'cross_entropy', 'similarity_type': 'concat_mlp', 'inp_merge': 'mlp', 'learning_rate_warmup_steps': 0, 'avg_session_length': None, 'output_dir': './tmp/', 'overwrite_output_dir': True, 'do_train': True, 'do_eval': True, 'do_predict': False, 'prediction_loss_only': False, 'per_device_train_batch_size': 128, 'per_device_eval_batch_size': 128, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'learning_rate': 0.0006667377132554976, 'weight_decay': 3.910060265627374e-05, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 5.0, 'max_steps': -1, 'lr_scheduler_type': 'linear', 'warmup_ratio': 0.0, 'warmup_steps': 0, 'log_level': -1, 'log_level_replica': -1, 'log_on_each_node': True, 'logging_dir': './tmp/runs/Mar10_07-32-04_7dfa224f788e', 'logging_first_step': False, 'logging_steps': 500, 'logging_nan_inf_filter': True, 'save_steps': 0, 'save_total_limit': None, 'save_on_each_node': False, 'no_cuda': False, 'seed': 2, 'fp16': True, 'fp16_opt_level': 'O1', 'fp16_backend': 'auto', 'fp16_full_eval': False, 'local_rank': -1, 'xpu_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': True, 'eval_steps': None, 'dataloader_num_workers': 
0, 'past_index': -1, 'run_name': None, 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'sharded_ddp': [], 'deepspeed': None, 'label_smoothing_factor': 0.0, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': [], 'ddp_find_unused_parameters': None, 'dataloader_pin_memory': True, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_token': None, 'gradient_checkpointing': False, 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': None, '_n_gpu': 1, 'mp_parameters': '', 'max_sequence_length': 20, 'shuffle_buffer_size': 0, 'data_loader_engine': 'merlin', 'eval_on_test_set': True, 'eval_steps_on_train_set': 20, 'predict_top_k': 0, 'learning_rate_num_cosine_cycles_by_epoch': 1.25, 'log_predictions': False, 'compute_metrics_each_n_steps': 1, 'experiments_group': 'default', 'session_seq_length_max': 20, 'learning_rate_schedule': 'linear_with_warmup', 'validate_every': -1}\n", + "[INFO|trainer.py:1196] 2023-03-10 07:32:07,286 >> ***** Running training *****\n", + "[INFO|trainer.py:1197] 2023-03-10 07:32:07,286 >> Num examples = 86528\n", + "[INFO|trainer.py:1198] 2023-03-10 07:32:07,286 >> Num Epochs = 5\n", + "[INFO|trainer.py:1199] 2023-03-10 07:32:07,286 >> Instantaneous batch size per device = 128\n", + "[INFO|trainer.py:1200] 2023-03-10 07:32:07,286 >> Total train batch size (w. 
parallel, distributed & accumulation) = 128\n", + "[INFO|trainer.py:1201] 2023-03-10 07:32:07,286 >> Gradient Accumulation steps = 1\n", + "[INFO|trainer.py:1202] 2023-03-10 07:32:07,286 >> Total optimization steps = 3380\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "DLL 2023-03-10 07:32:06.807964 - PARAMETER data_path : /transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/ features_schema_path : /workspace/examples/t4rec_paper_experiments/datasets_configs/ecom_rees46/rees46_schema.pbtxt start_time_window_index : 1 final_time_window_index : 2 time_window_folder_pad_digits : 4 no_incremental_training : False training_time_window_size : 0 use_side_information_features : False input_features_aggregation : concat model_type : xlnet tf_out_activation : tanh mlm : False mlm_probability : 0.15 plm : False plm_probability : 0.25 plm_max_span_length : 5 plm_mask_input : False plm_permute_all : False rtd : False rtd_sample_from_batch : False rtd_use_batch_interaction : False rtd_discriminator_loss_weight : 50 rtd_generator_loss_weight : 1 rtd_tied_generator : False d_model : 192 n_layer : 3 n_head : 16 layer_norm_eps : 1e-12 initializer_range : 0.02 hidden_act : gelu dropout : 0.0 summary_type : last num_hidden_groups : 1 inner_group_num : 1 eval_on_last_item_seq_only : True train_on_last_item_seq_only : False mf_constrained_embeddings : True item_embedding_dim : 448 numeric_features_project_to_embedding_dim : 0 numeric_features_soft_one_hot_encoding_num_embeddings : 0 stochastic_shared_embeddings_replacement_prob : 0.1 softmax_temperature : 1.0 label_smoothing : 0.0 embedding_dim_from_cardinality_multiplier : 2.0 item_id_embeddings_init_std : 0.11 other_embeddings_init_std : 0.02 layer_norm_featurewise : True attn_type : bi input_dropout : 0.1 loss_type : cross_entropy similarity_type : concat_mlp inp_merge : mlp learning_rate_warmup_steps : 0 avg_session_length : None output_dir : ./tmp/ overwrite_output_dir : True do_train : 
True do_eval : True do_predict : False prediction_loss_only : False per_device_train_batch_size : 128 per_device_eval_batch_size : 128 per_gpu_train_batch_size : None per_gpu_eval_batch_size : None gradient_accumulation_steps : 1 eval_accumulation_steps : None learning_rate : 0.0006667377132554976 weight_decay : 3.910060265627374e-05 adam_beta1 : 0.9 adam_beta2 : 0.999 adam_epsilon : 1e-08 max_grad_norm : 1.0 num_train_epochs : 5.0 max_steps : -1 lr_scheduler_type : linear warmup_ratio : 0.0 warmup_steps : 0 log_level : -1 log_level_replica : -1 log_on_each_node : True logging_dir : ./tmp/runs/Mar10_07-32-04_7dfa224f788e logging_first_step : False logging_steps : 500 logging_nan_inf_filter : True save_steps : 0 save_total_limit : None save_on_each_node : False no_cuda : False seed : 2 fp16 : True fp16_opt_level : O1 fp16_backend : auto fp16_full_eval : False local_rank : -1 xpu_backend : None tpu_num_cores : None tpu_metrics_debug : False debug : [] dataloader_drop_last : True eval_steps : None dataloader_num_workers : 0 past_index : -1 run_name : None disable_tqdm : False remove_unused_columns : True label_names : None load_best_model_at_end : False metric_for_best_model : None greater_is_better : None ignore_data_skip : False sharded_ddp : [] deepspeed : None label_smoothing_factor : 0.0 adafactor : False group_by_length : False length_column_name : length report_to : [] ddp_find_unused_parameters : None dataloader_pin_memory : True skip_memory_metrics : True use_legacy_prediction_loop : False push_to_hub : False resume_from_checkpoint : None hub_model_id : None hub_token : None gradient_checkpointing : False push_to_hub_model_id : None push_to_hub_organization : None push_to_hub_token : None _n_gpu : 1 mp_parameters : max_sequence_length : 20 shuffle_buffer_size : 0 data_loader_engine : merlin eval_on_test_set : True eval_steps_on_train_set : 20 predict_top_k : 0 learning_rate_num_cosine_cycles_by_epoch : 1.25 log_predictions : False compute_metrics_each_n_steps 
: 1 experiments_group : default session_seq_length_max : 20 learning_rate_schedule : linear_with_warmup validate_every : -1 \n", + "\n", + "***** Launch training for day 1: *****\n", + "{'loss': 6.624, 'learning_rate': 0.0005681078740165187, 'epoch': 0.74}\n", + "{'loss': 2.3857, 'learning_rate': 0.00046947803477753974, 'epoch': 1.48}\n", + "{'loss': 1.962, 'learning_rate': 0.0003708481955385607, 'epoch': 2.22}\n", + "{'loss': 1.8662, 'learning_rate': 0.0002722183562995818, 'epoch': 2.96}\n", + "{'loss': 1.784, 'learning_rate': 0.0001735885170606029, 'epoch': 3.7}\n", + "{'loss': 1.7199, 'learning_rate': 7.4958677821624e-05, 'epoch': 4.44}\n", + "{'train_runtime': 337.7988, 'train_samples_per_second': 0.015, 'train_steps_per_second': 10.006, 'train_loss': 2.6095561732907266, 'epoch': 5.0}\n", + "\n", + "***** Evaluation results for day 2 (train set):*****\n", + "\n", + "{'train_/next-item/ndcg_at_10': 0.12347330152988434, 'train_/next-item/ndcg_at_20': 0.137996107339859, 'train_/next-item/recall_at_10': 0.20078125596046448, 'train_/next-item/recall_at_20': 0.25859376788139343, 'train_/loss': 7.582167148590088, 'train_runtime': 0.6507, 'train_samples_per_second': 3934.492, 'train_steps_per_second': 30.738}\n", + "\n", + "***** Evaluation results for day 2 (eval set):*****\n", + "\n", + "{'eval_/next-item/ndcg_at_10': 0.08896206319332123, 'eval_/next-item/ndcg_at_20': 0.10121969878673553, 'eval_/next-item/recall_at_10': 0.1499435156583786, 'eval_/next-item/recall_at_20': 0.1987951695919037, 'eval_/loss': 8.977458953857422, 'eval_runtime': 2.2602, 'eval_samples_per_second': 4700.428, 'eval_steps_per_second': 36.722}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Task exception was never retrieved\n", + "future: ._handle_stream() done, defined at /usr/local/lib/python3.8/dist-packages/IPython/core/magics/script.py:211> exception=ValueError('Separator is not found, and chunk exceed the limit')>\n", + "Traceback (most recent call last):\n", 
+ " File \"/usr/lib/python3.8/asyncio/streams.py\", line 540, in readline\n", + " line = await self.readuntil(sep)\n", + " File \"/usr/lib/python3.8/asyncio/streams.py\", line 618, in readuntil\n", + " raise exceptions.LimitOverrunError(\n", + "asyncio.exceptions.LimitOverrunError: Separator is not found, and chunk exceed the limit\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/usr/local/lib/python3.8/dist-packages/IPython/core/magics/script.py\", line 213, in _handle_stream\n", + " line = (await stream.readline()).decode(\"utf8\")\n", + " File \"/usr/lib/python3.8/asyncio/streams.py\", line 549, in readline\n", + " raise ValueError(e.args[0])\n", + "ValueError: Separator is not found, and chunk exceed the limit\n" + ] + } + ], + "source": [ + "%%bash\n", + "\n", + "FEATURE_SCHEMA_PATH=/workspace/examples/t4rec_paper_experiments/datasets_configs/ecom_rees46/rees46_schema.pbtxt\n", + "DATA_PATH=/transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/\n", + "NUM_EPOCHS=5\n", + "SEED=2\n", + "\n", + "python3 ../transf_exp_main_modified.py --output_dir ./tmp/ --overwrite_output_dir --do_train --do_eval --save_steps 0 --data_path $DATA_PATH --features_schema_path $FEATURE_SCHEMA_PATH --fp16 --data_loader_engine merlin --start_time_window_index 1 --final_time_window_index 2 --time_window_folder_pad_digits 4 --model_type xlnet --loss_type cross_entropy --per_device_eval_batch_size 128 --similarity_type concat_mlp --tf_out_activation tanh --inp_merge mlp --learning_rate_warmup_steps 0 --learning_rate_schedule linear_with_warmup --hidden_act gelu --num_train_epochs $NUM_EPOCHS --dataloader_drop_last --compute_metrics_each_n_steps 1 --session_seq_length_max 20 --eval_on_last_item_seq_only --mf_constrained_embeddings --layer_norm_featurewise --attn_type bi --per_device_train_batch_size 128 --learning_rate 0.0006667377132554976 --dropout 0.0 --input_dropout 0.1 
--weight_decay 3.910060265627374e-05 --d_model 192 --item_embedding_dim 448 --n_layer 3 --n_head 16 --label_smoothing 0.0 --stochastic_shared_embeddings_replacement_prob 0.1 --item_id_embeddings_init_std 0.11 --other_embeddings_init_std 0.02 --eval_on_test_set --seed $SEED --report_to none\n", + "exit 0" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "8c0d111b", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "03/10/2023 07:37:54 - WARNING - __main__ - Process rank: -1, device: cuda:0, n_gpu: 1, distributed training: False, 16-bits training: True\n", + "03/10/2023 07:37:56 - WARNING - transformers4rec - Projecting inputs of NextItemPredictionTask to'448' As weight tying requires the input dimension '192' to be equal to the item-id embedding dimension '448'\n", + "[INFO|trainer.py:434] 2023-03-10 07:37:56,124 >> Using amp fp16 backend\n", + "03/10/2023 07:37:56 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - Training, Model and Data parameters {'data_path': '/transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/', 'features_schema_path': '/workspace/examples/t4rec_paper_experiments/datasets_configs/ecom_rees46/rees46_schema.pbtxt', 'start_time_window_index': 1, 'final_time_window_index': 2, 'time_window_folder_pad_digits': 4, 'no_incremental_training': False, 'training_time_window_size': 0, 'use_side_information_features': False, 'input_features_aggregation': 'concat', 'model_type': 'xlnet', 'tf_out_activation': 'tanh', 'mlm': False, 'mlm_probability': 0.15, 'plm': False, 'plm_probability': 0.25, 'plm_max_span_length': 5, 'plm_mask_input': False, 'plm_permute_all': False, 'rtd': False, 'rtd_sample_from_batch': False, 'rtd_use_batch_interaction': False, 'rtd_discriminator_loss_weight': 50, 'rtd_generator_loss_weight': 1, 'rtd_tied_generator': False, 'd_model': 192, 'n_layer': 3, 'n_head': 16, 'layer_norm_eps': 1e-12, 'initializer_range': 0.02, 'hidden_act': 
'gelu', 'dropout': 0.0, 'summary_type': 'last', 'num_hidden_groups': 1, 'inner_group_num': 1, 'eval_on_last_item_seq_only': True, 'train_on_last_item_seq_only': False, 'mf_constrained_embeddings': True, 'item_embedding_dim': 448, 'numeric_features_project_to_embedding_dim': 0, 'numeric_features_soft_one_hot_encoding_num_embeddings': 0, 'stochastic_shared_embeddings_replacement_prob': 0.1, 'softmax_temperature': 1.0, 'label_smoothing': 0.0, 'embedding_dim_from_cardinality_multiplier': 2.0, 'item_id_embeddings_init_std': 0.11, 'other_embeddings_init_std': 0.02, 'layer_norm_featurewise': True, 'attn_type': 'bi', 'input_dropout': 0.1, 'loss_type': 'cross_entropy', 'similarity_type': 'concat_mlp', 'inp_merge': 'mlp', 'learning_rate_warmup_steps': 0, 'avg_session_length': None, 'output_dir': './tmp/', 'overwrite_output_dir': True, 'do_train': True, 'do_eval': True, 'do_predict': False, 'prediction_loss_only': False, 'per_device_train_batch_size': 128, 'per_device_eval_batch_size': 128, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'learning_rate': 0.0006667377132554976, 'weight_decay': 3.910060265627374e-05, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 5.0, 'max_steps': -1, 'lr_scheduler_type': 'linear', 'warmup_ratio': 0.0, 'warmup_steps': 0, 'log_level': -1, 'log_level_replica': -1, 'log_on_each_node': True, 'logging_dir': './tmp/runs/Mar10_07-37-53_7dfa224f788e', 'logging_first_step': False, 'logging_steps': 500, 'logging_nan_inf_filter': True, 'save_steps': 0, 'save_total_limit': None, 'save_on_each_node': False, 'no_cuda': False, 'seed': 3, 'fp16': True, 'fp16_opt_level': 'O1', 'fp16_backend': 'auto', 'fp16_full_eval': False, 'local_rank': -1, 'xpu_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': True, 'eval_steps': None, 'dataloader_num_workers': 0, 'past_index': 
-1, 'run_name': None, 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'sharded_ddp': [], 'deepspeed': None, 'label_smoothing_factor': 0.0, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': [], 'ddp_find_unused_parameters': None, 'dataloader_pin_memory': True, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_token': None, 'gradient_checkpointing': False, 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': None, '_n_gpu': 1, 'mp_parameters': '', 'max_sequence_length': 20, 'shuffle_buffer_size': 0, 'data_loader_engine': 'merlin', 'eval_on_test_set': True, 'eval_steps_on_train_set': 20, 'predict_top_k': 0, 'learning_rate_num_cosine_cycles_by_epoch': 1.25, 'log_predictions': False, 'compute_metrics_each_n_steps': 1, 'experiments_group': 'default', 'session_seq_length_max': 20, 'learning_rate_schedule': 'linear_with_warmup', 'validate_every': -1}\n", + "[INFO|trainer.py:1196] 2023-03-10 07:37:56,606 >> ***** Running training *****\n", + "[INFO|trainer.py:1197] 2023-03-10 07:37:56,606 >> Num examples = 86528\n", + "[INFO|trainer.py:1198] 2023-03-10 07:37:56,606 >> Num Epochs = 5\n", + "[INFO|trainer.py:1199] 2023-03-10 07:37:56,606 >> Instantaneous batch size per device = 128\n", + "[INFO|trainer.py:1200] 2023-03-10 07:37:56,606 >> Total train batch size (w. 
parallel, distributed & accumulation) = 128\n", + "[INFO|trainer.py:1201] 2023-03-10 07:37:56,606 >> Gradient Accumulation steps = 1\n", + "[INFO|trainer.py:1202] 2023-03-10 07:37:56,606 >> Total optimization steps = 3380\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "DLL 2023-03-10 07:37:56.124693 - PARAMETER data_path : /transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/ features_schema_path : /workspace/examples/t4rec_paper_experiments/datasets_configs/ecom_rees46/rees46_schema.pbtxt start_time_window_index : 1 final_time_window_index : 2 time_window_folder_pad_digits : 4 no_incremental_training : False training_time_window_size : 0 use_side_information_features : False input_features_aggregation : concat model_type : xlnet tf_out_activation : tanh mlm : False mlm_probability : 0.15 plm : False plm_probability : 0.25 plm_max_span_length : 5 plm_mask_input : False plm_permute_all : False rtd : False rtd_sample_from_batch : False rtd_use_batch_interaction : False rtd_discriminator_loss_weight : 50 rtd_generator_loss_weight : 1 rtd_tied_generator : False d_model : 192 n_layer : 3 n_head : 16 layer_norm_eps : 1e-12 initializer_range : 0.02 hidden_act : gelu dropout : 0.0 summary_type : last num_hidden_groups : 1 inner_group_num : 1 eval_on_last_item_seq_only : True train_on_last_item_seq_only : False mf_constrained_embeddings : True item_embedding_dim : 448 numeric_features_project_to_embedding_dim : 0 numeric_features_soft_one_hot_encoding_num_embeddings : 0 stochastic_shared_embeddings_replacement_prob : 0.1 softmax_temperature : 1.0 label_smoothing : 0.0 embedding_dim_from_cardinality_multiplier : 2.0 item_id_embeddings_init_std : 0.11 other_embeddings_init_std : 0.02 layer_norm_featurewise : True attn_type : bi input_dropout : 0.1 loss_type : cross_entropy similarity_type : concat_mlp inp_merge : mlp learning_rate_warmup_steps : 0 avg_session_length : None output_dir : ./tmp/ overwrite_output_dir : True do_train : 
True do_eval : True do_predict : False prediction_loss_only : False per_device_train_batch_size : 128 per_device_eval_batch_size : 128 per_gpu_train_batch_size : None per_gpu_eval_batch_size : None gradient_accumulation_steps : 1 eval_accumulation_steps : None learning_rate : 0.0006667377132554976 weight_decay : 3.910060265627374e-05 adam_beta1 : 0.9 adam_beta2 : 0.999 adam_epsilon : 1e-08 max_grad_norm : 1.0 num_train_epochs : 5.0 max_steps : -1 lr_scheduler_type : linear warmup_ratio : 0.0 warmup_steps : 0 log_level : -1 log_level_replica : -1 log_on_each_node : True logging_dir : ./tmp/runs/Mar10_07-37-53_7dfa224f788e logging_first_step : False logging_steps : 500 logging_nan_inf_filter : True save_steps : 0 save_total_limit : None save_on_each_node : False no_cuda : False seed : 3 fp16 : True fp16_opt_level : O1 fp16_backend : auto fp16_full_eval : False local_rank : -1 xpu_backend : None tpu_num_cores : None tpu_metrics_debug : False debug : [] dataloader_drop_last : True eval_steps : None dataloader_num_workers : 0 past_index : -1 run_name : None disable_tqdm : False remove_unused_columns : True label_names : None load_best_model_at_end : False metric_for_best_model : None greater_is_better : None ignore_data_skip : False sharded_ddp : [] deepspeed : None label_smoothing_factor : 0.0 adafactor : False group_by_length : False length_column_name : length report_to : [] ddp_find_unused_parameters : None dataloader_pin_memory : True skip_memory_metrics : True use_legacy_prediction_loop : False push_to_hub : False resume_from_checkpoint : None hub_model_id : None hub_token : None gradient_checkpointing : False push_to_hub_model_id : None push_to_hub_organization : None push_to_hub_token : None _n_gpu : 1 mp_parameters : max_sequence_length : 20 shuffle_buffer_size : 0 data_loader_engine : merlin eval_on_test_set : True eval_steps_on_train_set : 20 predict_top_k : 0 learning_rate_num_cosine_cycles_by_epoch : 1.25 log_predictions : False compute_metrics_each_n_steps 
: 1 experiments_group : default session_seq_length_max : 20 learning_rate_schedule : linear_with_warmup validate_every : -1 \n", + "\n", + "***** Launch training for day 1: *****\n", + "{'loss': 6.7871, 'learning_rate': 0.0005681078740165187, 'epoch': 0.74}\n", + "{'loss': 2.4843, 'learning_rate': 0.00046947803477753974, 'epoch': 1.48}\n", + "{'loss': 1.9926, 'learning_rate': 0.0003708481955385607, 'epoch': 2.22}\n", + "{'loss': 1.872, 'learning_rate': 0.0002722183562995818, 'epoch': 2.96}\n", + "{'loss': 1.7884, 'learning_rate': 0.0001735885170606029, 'epoch': 3.7}\n", + "{'loss': 1.7512, 'learning_rate': 7.4958677821624e-05, 'epoch': 4.44}\n", + "{'train_runtime': 337.9889, 'train_samples_per_second': 0.015, 'train_steps_per_second': 10.0, 'train_loss': 2.6576913867476426, 'epoch': 5.0}\n", + "\n", + "***** Evaluation results for day 2 (train set):*****\n", + "\n", + "{'train_/next-item/ndcg_at_10': 0.12272482365369797, 'train_/next-item/ndcg_at_20': 0.13889344036579132, 'train_/next-item/recall_at_10': 0.1953125, 'train_/next-item/recall_at_20': 0.25859376788139343, 'train_/loss': 7.600445747375488, 'train_runtime': 0.6514, 'train_samples_per_second': 3930.07, 'train_steps_per_second': 30.704}\n", + "\n", + "***** Evaluation results for day 2 (eval set):*****\n", + "\n", + "{'eval_/next-item/ndcg_at_10': 0.08895686268806458, 'eval_/next-item/ndcg_at_20': 0.09970034658908844, 'eval_/next-item/recall_at_10': 0.14909638464450836, 'eval_/next-item/recall_at_20': 0.19145330786705017, 'eval_/loss': 9.008366584777832, 'eval_runtime': 2.2737, 'eval_samples_per_second': 4672.54, 'eval_steps_per_second': 36.504}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Task exception was never retrieved\n", + "future: ._handle_stream() done, defined at /usr/local/lib/python3.8/dist-packages/IPython/core/magics/script.py:211> exception=ValueError('Separator is not found, and chunk exceed the limit')>\n", + "Traceback (most recent call last):\n", + " 
File \"/usr/lib/python3.8/asyncio/streams.py\", line 540, in readline\n", + " line = await self.readuntil(sep)\n", + " File \"/usr/lib/python3.8/asyncio/streams.py\", line 618, in readuntil\n", + " raise exceptions.LimitOverrunError(\n", + "asyncio.exceptions.LimitOverrunError: Separator is not found, and chunk exceed the limit\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/usr/local/lib/python3.8/dist-packages/IPython/core/magics/script.py\", line 213, in _handle_stream\n", + " line = (await stream.readline()).decode(\"utf8\")\n", + " File \"/usr/lib/python3.8/asyncio/streams.py\", line 549, in readline\n", + " raise ValueError(e.args[0])\n", + "ValueError: Separator is not found, and chunk exceed the limit\n" + ] + } + ], + "source": [ + "%%bash\n", + "\n", + "FEATURE_SCHEMA_PATH=/workspace/examples/t4rec_paper_experiments/datasets_configs/ecom_rees46/rees46_schema.pbtxt\n", + "DATA_PATH=/transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/\n", + "NUM_EPOCHS=5\n", + "SEED=3\n", + "\n", + "python3 ../transf_exp_main_modified.py --output_dir ./tmp/ --overwrite_output_dir --do_train --do_eval --save_steps 0 --data_path $DATA_PATH --features_schema_path $FEATURE_SCHEMA_PATH --fp16 --data_loader_engine merlin --start_time_window_index 1 --final_time_window_index 2 --time_window_folder_pad_digits 4 --model_type xlnet --loss_type cross_entropy --per_device_eval_batch_size 128 --similarity_type concat_mlp --tf_out_activation tanh --inp_merge mlp --learning_rate_warmup_steps 0 --learning_rate_schedule linear_with_warmup --hidden_act gelu --num_train_epochs $NUM_EPOCHS --dataloader_drop_last --compute_metrics_each_n_steps 1 --session_seq_length_max 20 --eval_on_last_item_seq_only --mf_constrained_embeddings --layer_norm_featurewise --attn_type bi --per_device_train_batch_size 128 --learning_rate 0.0006667377132554976 --dropout 0.0 --input_dropout 0.1 
--weight_decay 3.910060265627374e-05 --d_model 192 --item_embedding_dim 448 --n_layer 3 --n_head 16 --label_smoothing 0.0 --stochastic_shared_embeddings_replacement_prob 0.1 --item_id_embeddings_init_std 0.11 --other_embeddings_init_std 0.02 --eval_on_test_set --seed $SEED --report_to none\n", + "exit 0" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/train_runs/xlnet_mlm_item_id.ipynb b/train_runs/xlnet_mlm_item_id.ipynb new file mode 100644 index 0000000000..8603aca12a --- /dev/null +++ b/train_runs/xlnet_mlm_item_id.ipynb @@ -0,0 +1,515 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "d48f073c", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "03/10/2023 06:22:30 - WARNING - __main__ - Process rank: -1, device: cuda:0, n_gpu: 1, distributed training: False, 16-bits training: True\n", + "03/10/2023 06:22:31 - WARNING - transformers4rec - Projecting inputs of NextItemPredictionTask to'448' As weight tying requires the input dimension '192' to be equal to the item-id embedding dimension '448'\n", + "[INFO|trainer.py:434] 2023-03-10 06:22:31,662 >> Using amp fp16 backend\n", + "03/10/2023 06:22:31 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - Training, Model and Data parameters {'data_path': '/transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/', 'features_schema_path': '/workspace/examples/t4rec_paper_experiments/datasets_configs/ecom_rees46/rees46_schema.pbtxt', 'start_time_window_index': 1, 'final_time_window_index': 2, 
'time_window_folder_pad_digits': 4, 'no_incremental_training': False, 'training_time_window_size': 0, 'use_side_information_features': False, 'input_features_aggregation': 'concat', 'model_type': 'xlnet', 'tf_out_activation': 'tanh', 'mlm': True, 'mlm_probability': 0.30000000000000004, 'plm': False, 'plm_probability': 0.25, 'plm_max_span_length': 5, 'plm_mask_input': False, 'plm_permute_all': False, 'rtd': False, 'rtd_sample_from_batch': False, 'rtd_use_batch_interaction': False, 'rtd_discriminator_loss_weight': 50, 'rtd_generator_loss_weight': 1, 'rtd_tied_generator': False, 'd_model': 192, 'n_layer': 3, 'n_head': 16, 'layer_norm_eps': 1e-12, 'initializer_range': 0.02, 'hidden_act': 'gelu', 'dropout': 0.0, 'summary_type': 'last', 'num_hidden_groups': 1, 'inner_group_num': 1, 'eval_on_last_item_seq_only': True, 'train_on_last_item_seq_only': False, 'mf_constrained_embeddings': True, 'item_embedding_dim': 448, 'numeric_features_project_to_embedding_dim': 0, 'numeric_features_soft_one_hot_encoding_num_embeddings': 0, 'stochastic_shared_embeddings_replacement_prob': 0.1, 'softmax_temperature': 1.0, 'label_smoothing': 0.0, 'embedding_dim_from_cardinality_multiplier': 2.0, 'item_id_embeddings_init_std': 0.11, 'other_embeddings_init_std': 0.02, 'layer_norm_featurewise': True, 'attn_type': 'bi', 'input_dropout': 0.1, 'loss_type': 'cross_entropy', 'similarity_type': 'concat_mlp', 'inp_merge': 'mlp', 'learning_rate_warmup_steps': 0, 'avg_session_length': None, 'output_dir': './tmp/', 'overwrite_output_dir': True, 'do_train': True, 'do_eval': True, 'do_predict': False, 'prediction_loss_only': False, 'per_device_train_batch_size': 128, 'per_device_eval_batch_size': 128, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'learning_rate': 0.0006667377132554976, 'weight_decay': 3.910060265627374e-05, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 
'num_train_epochs': 5.0, 'max_steps': -1, 'lr_scheduler_type': 'linear', 'warmup_ratio': 0.0, 'warmup_steps': 0, 'log_level': -1, 'log_level_replica': -1, 'log_on_each_node': True, 'logging_dir': './tmp/runs/Mar10_06-22-28_7dfa224f788e', 'logging_first_step': False, 'logging_steps': 500, 'logging_nan_inf_filter': True, 'save_steps': 0, 'save_total_limit': None, 'save_on_each_node': False, 'no_cuda': False, 'seed': 100, 'fp16': True, 'fp16_opt_level': 'O1', 'fp16_backend': 'auto', 'fp16_full_eval': False, 'local_rank': -1, 'xpu_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': True, 'eval_steps': None, 'dataloader_num_workers': 0, 'past_index': -1, 'run_name': None, 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'sharded_ddp': [], 'deepspeed': None, 'label_smoothing_factor': 0.0, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': [], 'ddp_find_unused_parameters': None, 'dataloader_pin_memory': True, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_token': None, 'gradient_checkpointing': False, 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': None, '_n_gpu': 1, 'mp_parameters': '', 'max_sequence_length': 20, 'shuffle_buffer_size': 0, 'data_loader_engine': 'merlin', 'eval_on_test_set': True, 'eval_steps_on_train_set': 20, 'predict_top_k': 0, 'learning_rate_num_cosine_cycles_by_epoch': 1.25, 'log_predictions': False, 'compute_metrics_each_n_steps': 1, 'experiments_group': 'default', 'session_seq_length_max': 20, 'learning_rate_schedule': 'linear_with_warmup', 'validate_every': -1}\n", + "[INFO|trainer.py:1196] 2023-03-10 06:22:32,106 >> ***** Running training *****\n", + "[INFO|trainer.py:1197] 
2023-03-10 06:22:32,106 >> Num examples = 86528\n", + "[INFO|trainer.py:1198] 2023-03-10 06:22:32,106 >> Num Epochs = 5\n", + "[INFO|trainer.py:1199] 2023-03-10 06:22:32,106 >> Instantaneous batch size per device = 128\n", + "[INFO|trainer.py:1200] 2023-03-10 06:22:32,106 >> Total train batch size (w. parallel, distributed & accumulation) = 128\n", + "[INFO|trainer.py:1201] 2023-03-10 06:22:32,106 >> Gradient Accumulation steps = 1\n", + "[INFO|trainer.py:1202] 2023-03-10 06:22:32,106 >> Total optimization steps = 3380\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "DLL 2023-03-10 06:22:31.662852 - PARAMETER data_path : /transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/ features_schema_path : /workspace/examples/t4rec_paper_experiments/datasets_configs/ecom_rees46/rees46_schema.pbtxt start_time_window_index : 1 final_time_window_index : 2 time_window_folder_pad_digits : 4 no_incremental_training : False training_time_window_size : 0 use_side_information_features : False input_features_aggregation : concat model_type : xlnet tf_out_activation : tanh mlm : True mlm_probability : 0.30000000000000004 plm : False plm_probability : 0.25 plm_max_span_length : 5 plm_mask_input : False plm_permute_all : False rtd : False rtd_sample_from_batch : False rtd_use_batch_interaction : False rtd_discriminator_loss_weight : 50 rtd_generator_loss_weight : 1 rtd_tied_generator : False d_model : 192 n_layer : 3 n_head : 16 layer_norm_eps : 1e-12 initializer_range : 0.02 hidden_act : gelu dropout : 0.0 summary_type : last num_hidden_groups : 1 inner_group_num : 1 eval_on_last_item_seq_only : True train_on_last_item_seq_only : False mf_constrained_embeddings : True item_embedding_dim : 448 numeric_features_project_to_embedding_dim : 0 numeric_features_soft_one_hot_encoding_num_embeddings : 0 stochastic_shared_embeddings_replacement_prob : 0.1 softmax_temperature : 1.0 label_smoothing : 0.0 embedding_dim_from_cardinality_multiplier : 2.0 
item_id_embeddings_init_std : 0.11 other_embeddings_init_std : 0.02 layer_norm_featurewise : True attn_type : bi input_dropout : 0.1 loss_type : cross_entropy similarity_type : concat_mlp inp_merge : mlp learning_rate_warmup_steps : 0 avg_session_length : None output_dir : ./tmp/ overwrite_output_dir : True do_train : True do_eval : True do_predict : False prediction_loss_only : False per_device_train_batch_size : 128 per_device_eval_batch_size : 128 per_gpu_train_batch_size : None per_gpu_eval_batch_size : None gradient_accumulation_steps : 1 eval_accumulation_steps : None learning_rate : 0.0006667377132554976 weight_decay : 3.910060265627374e-05 adam_beta1 : 0.9 adam_beta2 : 0.999 adam_epsilon : 1e-08 max_grad_norm : 1.0 num_train_epochs : 5.0 max_steps : -1 lr_scheduler_type : linear warmup_ratio : 0.0 warmup_steps : 0 log_level : -1 log_level_replica : -1 log_on_each_node : True logging_dir : ./tmp/runs/Mar10_06-22-28_7dfa224f788e logging_first_step : False logging_steps : 500 logging_nan_inf_filter : True save_steps : 0 save_total_limit : None save_on_each_node : False no_cuda : False seed : 100 fp16 : True fp16_opt_level : O1 fp16_backend : auto fp16_full_eval : False local_rank : -1 xpu_backend : None tpu_num_cores : None tpu_metrics_debug : False debug : [] dataloader_drop_last : True eval_steps : None dataloader_num_workers : 0 past_index : -1 run_name : None disable_tqdm : False remove_unused_columns : True label_names : None load_best_model_at_end : False metric_for_best_model : None greater_is_better : None ignore_data_skip : False sharded_ddp : [] deepspeed : None label_smoothing_factor : 0.0 adafactor : False group_by_length : False length_column_name : length report_to : [] ddp_find_unused_parameters : None dataloader_pin_memory : True skip_memory_metrics : True use_legacy_prediction_loop : False push_to_hub : False resume_from_checkpoint : None hub_model_id : None hub_token : None gradient_checkpointing : False push_to_hub_model_id : None 
push_to_hub_organization : None push_to_hub_token : None _n_gpu : 1 mp_parameters : max_sequence_length : 20 shuffle_buffer_size : 0 data_loader_engine : merlin eval_on_test_set : True eval_steps_on_train_set : 20 predict_top_k : 0 learning_rate_num_cosine_cycles_by_epoch : 1.25 log_predictions : False compute_metrics_each_n_steps : 1 experiments_group : default session_seq_length_max : 20 learning_rate_schedule : linear_with_warmup validate_every : -1 \n", + "\n", + "***** Launch training for day 1: *****\n", + "{'loss': 10.1481, 'learning_rate': 0.0005681078740165187, 'epoch': 0.74}\n", + "{'loss': 9.1622, 'learning_rate': 0.00046947803477753974, 'epoch': 1.48}\n", + "{'loss': 8.8482, 'learning_rate': 0.0003708481955385607, 'epoch': 2.22}\n", + "{'loss': 8.648, 'learning_rate': 0.0002722183562995818, 'epoch': 2.96}\n", + "{'loss': 8.4446, 'learning_rate': 0.0001735885170606029, 'epoch': 3.7}\n", + "{'loss': 8.3279, 'learning_rate': 7.4958677821624e-05, 'epoch': 4.44}\n", + "{'train_runtime': 267.3924, 'train_samples_per_second': 0.019, 'train_steps_per_second': 12.641, 'train_loss': 8.856075417783838, 'epoch': 5.0}\n", + "\n", + "***** Evaluation results for day 2 (train set):*****\n", + "\n", + "{'train_/next-item/ndcg_at_10': 0.0946362242102623, 'train_/next-item/ndcg_at_20': 0.11049988120794296, 'train_/next-item/recall_at_10': 0.17226563394069672, 'train_/next-item/recall_at_20': 0.23593750596046448, 'train_/loss': 7.873326301574707, 'train_runtime': 0.6495, 'train_samples_per_second': 3941.348, 'train_steps_per_second': 30.792}\n", + "\n", + "***** Evaluation results for day 2 (eval set):*****\n", + "\n", + "{'eval_/next-item/ndcg_at_10': 0.08858750760555267, 'eval_/next-item/ndcg_at_20': 0.10536891222000122, 'eval_/next-item/recall_at_10': 0.16274471580982208, 'eval_/next-item/recall_at_20': 0.2292921543121338, 'eval_/loss': 8.278496742248535, 'eval_runtime': 2.2227, 'eval_samples_per_second': 4779.666, 'eval_steps_per_second': 37.341}\n" + ] + }, + { + 
"name": "stderr", + "output_type": "stream", + "text": [ + "Task exception was never retrieved\n", + "future: ._handle_stream() done, defined at /usr/local/lib/python3.8/dist-packages/IPython/core/magics/script.py:211> exception=ValueError('Separator is not found, and chunk exceed the limit')>\n", + "Traceback (most recent call last):\n", + " File \"/usr/lib/python3.8/asyncio/streams.py\", line 540, in readline\n", + " line = await self.readuntil(sep)\n", + " File \"/usr/lib/python3.8/asyncio/streams.py\", line 618, in readuntil\n", + " raise exceptions.LimitOverrunError(\n", + "asyncio.exceptions.LimitOverrunError: Separator is not found, and chunk exceed the limit\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/usr/local/lib/python3.8/dist-packages/IPython/core/magics/script.py\", line 213, in _handle_stream\n", + " line = (await stream.readline()).decode(\"utf8\")\n", + " File \"/usr/lib/python3.8/asyncio/streams.py\", line 549, in readline\n", + " raise ValueError(e.args[0])\n", + "ValueError: Separator is not found, and chunk exceed the limit\n" + ] + } + ], + "source": [ + "%%bash\n", + "\n", + "FEATURE_SCHEMA_PATH=/workspace/examples/t4rec_paper_experiments/datasets_configs/ecom_rees46/rees46_schema.pbtxt\n", + "DATA_PATH=/transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/\n", + "NUM_EPOCHS=5\n", + "\n", + "python3 ../transf_exp_main_modified.py --output_dir ./tmp/ --overwrite_output_dir --do_train --do_eval --save_steps 0 --data_path $DATA_PATH --features_schema_path $FEATURE_SCHEMA_PATH --fp16 --data_loader_engine merlin --start_time_window_index 1 --final_time_window_index 2 --time_window_folder_pad_digits 4 --model_type xlnet --loss_type cross_entropy --per_device_eval_batch_size 128 --similarity_type concat_mlp --tf_out_activation tanh --inp_merge mlp --learning_rate_warmup_steps 0 --learning_rate_schedule linear_with_warmup --hidden_act 
gelu --num_train_epochs $NUM_EPOCHS --dataloader_drop_last --compute_metrics_each_n_steps 1 --session_seq_length_max 20 --eval_on_last_item_seq_only --mf_constrained_embeddings --layer_norm_featurewise --attn_type bi --mlm --per_device_train_batch_size 128 --learning_rate 0.0006667377132554976 --dropout 0.0 --input_dropout 0.1 --weight_decay 3.910060265627374e-05 --d_model 192 --item_embedding_dim 448 --n_layer 3 --n_head 16 --label_smoothing 0.0 --stochastic_shared_embeddings_replacement_prob 0.1 --item_id_embeddings_init_std 0.11 --other_embeddings_init_std 0.02 --mlm_probability 0.30000000000000004 --eval_on_test_set --seed 100 --report_to none" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "a2b2ec29", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "03/10/2023 06:27:11 - WARNING - __main__ - Process rank: -1, device: cuda:0, n_gpu: 1, distributed training: False, 16-bits training: True\n", + "03/10/2023 06:27:12 - WARNING - transformers4rec - Projecting inputs of NextItemPredictionTask to'448' As weight tying requires the input dimension '192' to be equal to the item-id embedding dimension '448'\n", + "[INFO|trainer.py:434] 2023-03-10 06:27:12,976 >> Using amp fp16 backend\n", + "03/10/2023 06:27:12 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - Training, Model and Data parameters {'data_path': '/transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/', 'features_schema_path': '/workspace/examples/t4rec_paper_experiments/datasets_configs/ecom_rees46/rees46_schema.pbtxt', 'start_time_window_index': 1, 'final_time_window_index': 2, 'time_window_folder_pad_digits': 4, 'no_incremental_training': False, 'training_time_window_size': 0, 'use_side_information_features': False, 'input_features_aggregation': 'concat', 'model_type': 'xlnet', 'tf_out_activation': 'tanh', 'mlm': True, 'mlm_probability': 0.30000000000000004, 'plm': False, 'plm_probability': 0.25, 
'plm_max_span_length': 5, 'plm_mask_input': False, 'plm_permute_all': False, 'rtd': False, 'rtd_sample_from_batch': False, 'rtd_use_batch_interaction': False, 'rtd_discriminator_loss_weight': 50, 'rtd_generator_loss_weight': 1, 'rtd_tied_generator': False, 'd_model': 192, 'n_layer': 3, 'n_head': 16, 'layer_norm_eps': 1e-12, 'initializer_range': 0.02, 'hidden_act': 'gelu', 'dropout': 0.0, 'summary_type': 'last', 'num_hidden_groups': 1, 'inner_group_num': 1, 'eval_on_last_item_seq_only': True, 'train_on_last_item_seq_only': False, 'mf_constrained_embeddings': True, 'item_embedding_dim': 448, 'numeric_features_project_to_embedding_dim': 0, 'numeric_features_soft_one_hot_encoding_num_embeddings': 0, 'stochastic_shared_embeddings_replacement_prob': 0.1, 'softmax_temperature': 1.0, 'label_smoothing': 0.0, 'embedding_dim_from_cardinality_multiplier': 2.0, 'item_id_embeddings_init_std': 0.11, 'other_embeddings_init_std': 0.02, 'layer_norm_featurewise': True, 'attn_type': 'bi', 'input_dropout': 0.1, 'loss_type': 'cross_entropy', 'similarity_type': 'concat_mlp', 'inp_merge': 'mlp', 'learning_rate_warmup_steps': 0, 'avg_session_length': None, 'output_dir': './tmp/', 'overwrite_output_dir': True, 'do_train': True, 'do_eval': True, 'do_predict': False, 'prediction_loss_only': False, 'per_device_train_batch_size': 128, 'per_device_eval_batch_size': 128, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'learning_rate': 0.0006667377132554976, 'weight_decay': 3.910060265627374e-05, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 5.0, 'max_steps': -1, 'lr_scheduler_type': 'linear', 'warmup_ratio': 0.0, 'warmup_steps': 0, 'log_level': -1, 'log_level_replica': -1, 'log_on_each_node': True, 'logging_dir': './tmp/runs/Mar10_06-27-10_7dfa224f788e', 'logging_first_step': False, 'logging_steps': 500, 'logging_nan_inf_filter': True, 'save_steps': 0, 
'save_total_limit': None, 'save_on_each_node': False, 'no_cuda': False, 'seed': 100, 'fp16': True, 'fp16_opt_level': 'O1', 'fp16_backend': 'auto', 'fp16_full_eval': False, 'local_rank': -1, 'xpu_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': True, 'eval_steps': None, 'dataloader_num_workers': 0, 'past_index': -1, 'run_name': None, 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'sharded_ddp': [], 'deepspeed': None, 'label_smoothing_factor': 0.0, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': [], 'ddp_find_unused_parameters': None, 'dataloader_pin_memory': True, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_token': None, 'gradient_checkpointing': False, 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': None, '_n_gpu': 1, 'mp_parameters': '', 'max_sequence_length': 20, 'shuffle_buffer_size': 0, 'data_loader_engine': 'merlin', 'eval_on_test_set': True, 'eval_steps_on_train_set': 20, 'predict_top_k': 0, 'learning_rate_num_cosine_cycles_by_epoch': 1.25, 'log_predictions': False, 'compute_metrics_each_n_steps': 1, 'experiments_group': 'default', 'session_seq_length_max': 20, 'learning_rate_schedule': 'linear_with_warmup', 'validate_every': -1}\n", + "[INFO|trainer.py:1196] 2023-03-10 06:27:13,448 >> ***** Running training *****\n", + "[INFO|trainer.py:1197] 2023-03-10 06:27:13,448 >> Num examples = 86528\n", + "[INFO|trainer.py:1198] 2023-03-10 06:27:13,448 >> Num Epochs = 5\n", + "[INFO|trainer.py:1199] 2023-03-10 06:27:13,448 >> Instantaneous batch size per device = 128\n", + "[INFO|trainer.py:1200] 2023-03-10 06:27:13,448 >> Total train batch size (w. 
parallel, distributed & accumulation) = 128\n", + "[INFO|trainer.py:1201] 2023-03-10 06:27:13,448 >> Gradient Accumulation steps = 1\n", + "[INFO|trainer.py:1202] 2023-03-10 06:27:13,448 >> Total optimization steps = 3380\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "DLL 2023-03-10 06:27:12.976882 - PARAMETER data_path : /transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/ features_schema_path : /workspace/examples/t4rec_paper_experiments/datasets_configs/ecom_rees46/rees46_schema.pbtxt start_time_window_index : 1 final_time_window_index : 2 time_window_folder_pad_digits : 4 no_incremental_training : False training_time_window_size : 0 use_side_information_features : False input_features_aggregation : concat model_type : xlnet tf_out_activation : tanh mlm : True mlm_probability : 0.30000000000000004 plm : False plm_probability : 0.25 plm_max_span_length : 5 plm_mask_input : False plm_permute_all : False rtd : False rtd_sample_from_batch : False rtd_use_batch_interaction : False rtd_discriminator_loss_weight : 50 rtd_generator_loss_weight : 1 rtd_tied_generator : False d_model : 192 n_layer : 3 n_head : 16 layer_norm_eps : 1e-12 initializer_range : 0.02 hidden_act : gelu dropout : 0.0 summary_type : last num_hidden_groups : 1 inner_group_num : 1 eval_on_last_item_seq_only : True train_on_last_item_seq_only : False mf_constrained_embeddings : True item_embedding_dim : 448 numeric_features_project_to_embedding_dim : 0 numeric_features_soft_one_hot_encoding_num_embeddings : 0 stochastic_shared_embeddings_replacement_prob : 0.1 softmax_temperature : 1.0 label_smoothing : 0.0 embedding_dim_from_cardinality_multiplier : 2.0 item_id_embeddings_init_std : 0.11 other_embeddings_init_std : 0.02 layer_norm_featurewise : True attn_type : bi input_dropout : 0.1 loss_type : cross_entropy similarity_type : concat_mlp inp_merge : mlp learning_rate_warmup_steps : 0 avg_session_length : None output_dir : ./tmp/ overwrite_output_dir : 
True do_train : True do_eval : True do_predict : False prediction_loss_only : False per_device_train_batch_size : 128 per_device_eval_batch_size : 128 per_gpu_train_batch_size : None per_gpu_eval_batch_size : None gradient_accumulation_steps : 1 eval_accumulation_steps : None learning_rate : 0.0006667377132554976 weight_decay : 3.910060265627374e-05 adam_beta1 : 0.9 adam_beta2 : 0.999 adam_epsilon : 1e-08 max_grad_norm : 1.0 num_train_epochs : 5.0 max_steps : -1 lr_scheduler_type : linear warmup_ratio : 0.0 warmup_steps : 0 log_level : -1 log_level_replica : -1 log_on_each_node : True logging_dir : ./tmp/runs/Mar10_06-27-10_7dfa224f788e logging_first_step : False logging_steps : 500 logging_nan_inf_filter : True save_steps : 0 save_total_limit : None save_on_each_node : False no_cuda : False seed : 100 fp16 : True fp16_opt_level : O1 fp16_backend : auto fp16_full_eval : False local_rank : -1 xpu_backend : None tpu_num_cores : None tpu_metrics_debug : False debug : [] dataloader_drop_last : True eval_steps : None dataloader_num_workers : 0 past_index : -1 run_name : None disable_tqdm : False remove_unused_columns : True label_names : None load_best_model_at_end : False metric_for_best_model : None greater_is_better : None ignore_data_skip : False sharded_ddp : [] deepspeed : None label_smoothing_factor : 0.0 adafactor : False group_by_length : False length_column_name : length report_to : [] ddp_find_unused_parameters : None dataloader_pin_memory : True skip_memory_metrics : True use_legacy_prediction_loop : False push_to_hub : False resume_from_checkpoint : None hub_model_id : None hub_token : None gradient_checkpointing : False push_to_hub_model_id : None push_to_hub_organization : None push_to_hub_token : None _n_gpu : 1 mp_parameters : max_sequence_length : 20 shuffle_buffer_size : 0 data_loader_engine : merlin eval_on_test_set : True eval_steps_on_train_set : 20 predict_top_k : 0 learning_rate_num_cosine_cycles_by_epoch : 1.25 log_predictions : False 
compute_metrics_each_n_steps : 1 experiments_group : default session_seq_length_max : 20 learning_rate_schedule : linear_with_warmup validate_every : -1 \n", + "\n", + "***** Launch training for day 1: *****\n", + "{'loss': 10.1707, 'learning_rate': 0.0005683051336949965, 'epoch': 0.74}\n", + "{'loss': 9.1904, 'learning_rate': 0.0004696752944560177, 'epoch': 1.48}\n", + "{'loss': 8.9014, 'learning_rate': 0.0003710454552170388, 'epoch': 2.22}\n", + "{'loss': 8.7112, 'learning_rate': 0.0002724156159780598, 'epoch': 2.96}\n", + "{'loss': 8.5372, 'learning_rate': 0.00017378577673908085, 'epoch': 3.7}\n", + "{'loss': 8.4297, 'learning_rate': 7.515593750010194e-05, 'epoch': 4.44}\n", + "{'train_runtime': 268.1719, 'train_samples_per_second': 0.019, 'train_steps_per_second': 12.604, 'train_loss': 8.918887816660503, 'epoch': 5.0}\n", + "\n", + "***** Evaluation results for day 2 (train set):*****\n", + "\n", + "{'train_/next-item/ndcg_at_10': 0.08740683645009995, 'train_/next-item/ndcg_at_20': 0.10400686413049698, 'train_/next-item/recall_at_10': 0.15468750894069672, 'train_/next-item/recall_at_20': 0.220703125, 'train_/loss': 7.998610019683838, 'train_runtime': 0.6448, 'train_samples_per_second': 3970.378, 'train_steps_per_second': 31.019}\n", + "\n", + "***** Evaluation results for day 2 (eval set):*****\n", + "\n", + "{'eval_/next-item/ndcg_at_10': 0.08547135442495346, 'eval_/next-item/ndcg_at_20': 0.10144450515508652, 'eval_/next-item/recall_at_10': 0.1554969847202301, 'eval_/next-item/recall_at_20': 0.21865586936473846, 'eval_/loss': 8.368797302246094, 'eval_runtime': 2.2149, 'eval_samples_per_second': 4796.548, 'eval_steps_per_second': 37.473}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Task exception was never retrieved\n", + "future: ._handle_stream() done, defined at /usr/local/lib/python3.8/dist-packages/IPython/core/magics/script.py:211> exception=ValueError('Separator is not found, and chunk exceed the limit')>\n", + "Traceback 
(most recent call last):\n", + " File \"/usr/lib/python3.8/asyncio/streams.py\", line 540, in readline\n", + " line = await self.readuntil(sep)\n", + " File \"/usr/lib/python3.8/asyncio/streams.py\", line 618, in readuntil\n", + " raise exceptions.LimitOverrunError(\n", + "asyncio.exceptions.LimitOverrunError: Separator is not found, and chunk exceed the limit\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/usr/local/lib/python3.8/dist-packages/IPython/core/magics/script.py\", line 213, in _handle_stream\n", + " line = (await stream.readline()).decode(\"utf8\")\n", + " File \"/usr/lib/python3.8/asyncio/streams.py\", line 549, in readline\n", + " raise ValueError(e.args[0])\n", + "ValueError: Separator is not found, and chunk exceed the limit\n" + ] + } + ], + "source": [ + "%%bash\n", + "\n", + "FEATURE_SCHEMA_PATH=/workspace/examples/t4rec_paper_experiments/datasets_configs/ecom_rees46/rees46_schema.pbtxt\n", + "DATA_PATH=/transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/\n", + "NUM_EPOCHS=5\n", + "\n", + "python3 ../transf_exp_main_modified.py --output_dir ./tmp/ --overwrite_output_dir --do_train --do_eval --save_steps 0 --data_path $DATA_PATH --features_schema_path $FEATURE_SCHEMA_PATH --fp16 --data_loader_engine merlin --start_time_window_index 1 --final_time_window_index 2 --time_window_folder_pad_digits 4 --model_type xlnet --loss_type cross_entropy --per_device_eval_batch_size 128 --similarity_type concat_mlp --tf_out_activation tanh --inp_merge mlp --learning_rate_warmup_steps 0 --learning_rate_schedule linear_with_warmup --hidden_act gelu --num_train_epochs $NUM_EPOCHS --dataloader_drop_last --compute_metrics_each_n_steps 1 --session_seq_length_max 20 --eval_on_last_item_seq_only --mf_constrained_embeddings --layer_norm_featurewise --attn_type bi --mlm --per_device_train_batch_size 128 --learning_rate 0.0006667377132554976 --dropout 0.0 
--input_dropout 0.1 --weight_decay 3.910060265627374e-05 --d_model 192 --item_embedding_dim 448 --n_layer 3 --n_head 16 --label_smoothing 0.0 --stochastic_shared_embeddings_replacement_prob 0.1 --item_id_embeddings_init_std 0.11 --other_embeddings_init_std 0.02 --mlm_probability 0.30000000000000004 --eval_on_test_set --seed 100 --report_to none" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "9ba3e539", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "03/10/2023 06:31:53 - WARNING - __main__ - Process rank: -1, device: cuda:0, n_gpu: 1, distributed training: False, 16-bits training: True\n", + "03/10/2023 06:31:54 - WARNING - transformers4rec - Projecting inputs of NextItemPredictionTask to'448' As weight tying requires the input dimension '192' to be equal to the item-id embedding dimension '448'\n", + "[INFO|trainer.py:434] 2023-03-10 06:31:54,883 >> Using amp fp16 backend\n", + "03/10/2023 06:31:54 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - Training, Model and Data parameters {'data_path': '/transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/', 'features_schema_path': '/workspace/examples/t4rec_paper_experiments/datasets_configs/ecom_rees46/rees46_schema.pbtxt', 'start_time_window_index': 1, 'final_time_window_index': 2, 'time_window_folder_pad_digits': 4, 'no_incremental_training': False, 'training_time_window_size': 0, 'use_side_information_features': False, 'input_features_aggregation': 'concat', 'model_type': 'xlnet', 'tf_out_activation': 'tanh', 'mlm': True, 'mlm_probability': 0.30000000000000004, 'plm': False, 'plm_probability': 0.25, 'plm_max_span_length': 5, 'plm_mask_input': False, 'plm_permute_all': False, 'rtd': False, 'rtd_sample_from_batch': False, 'rtd_use_batch_interaction': False, 'rtd_discriminator_loss_weight': 50, 'rtd_generator_loss_weight': 1, 'rtd_tied_generator': False, 'd_model': 192, 'n_layer': 3, 'n_head': 16, 
'layer_norm_eps': 1e-12, 'initializer_range': 0.02, 'hidden_act': 'gelu', 'dropout': 0.0, 'summary_type': 'last', 'num_hidden_groups': 1, 'inner_group_num': 1, 'eval_on_last_item_seq_only': True, 'train_on_last_item_seq_only': False, 'mf_constrained_embeddings': True, 'item_embedding_dim': 448, 'numeric_features_project_to_embedding_dim': 0, 'numeric_features_soft_one_hot_encoding_num_embeddings': 0, 'stochastic_shared_embeddings_replacement_prob': 0.1, 'softmax_temperature': 1.0, 'label_smoothing': 0.0, 'embedding_dim_from_cardinality_multiplier': 2.0, 'item_id_embeddings_init_std': 0.11, 'other_embeddings_init_std': 0.02, 'layer_norm_featurewise': True, 'attn_type': 'bi', 'input_dropout': 0.1, 'loss_type': 'cross_entropy', 'similarity_type': 'concat_mlp', 'inp_merge': 'mlp', 'learning_rate_warmup_steps': 0, 'avg_session_length': None, 'output_dir': './tmp/', 'overwrite_output_dir': True, 'do_train': True, 'do_eval': True, 'do_predict': False, 'prediction_loss_only': False, 'per_device_train_batch_size': 128, 'per_device_eval_batch_size': 128, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'learning_rate': 0.0006667377132554976, 'weight_decay': 3.910060265627374e-05, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 5.0, 'max_steps': -1, 'lr_scheduler_type': 'linear', 'warmup_ratio': 0.0, 'warmup_steps': 0, 'log_level': -1, 'log_level_replica': -1, 'log_on_each_node': True, 'logging_dir': './tmp/runs/Mar10_06-31-52_7dfa224f788e', 'logging_first_step': False, 'logging_steps': 500, 'logging_nan_inf_filter': True, 'save_steps': 0, 'save_total_limit': None, 'save_on_each_node': False, 'no_cuda': False, 'seed': 0, 'fp16': True, 'fp16_opt_level': 'O1', 'fp16_backend': 'auto', 'fp16_full_eval': False, 'local_rank': -1, 'xpu_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': True, 
'eval_steps': None, 'dataloader_num_workers': 0, 'past_index': -1, 'run_name': None, 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'sharded_ddp': [], 'deepspeed': None, 'label_smoothing_factor': 0.0, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': [], 'ddp_find_unused_parameters': None, 'dataloader_pin_memory': True, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_token': None, 'gradient_checkpointing': False, 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': None, '_n_gpu': 1, 'mp_parameters': '', 'max_sequence_length': 20, 'shuffle_buffer_size': 0, 'data_loader_engine': 'merlin', 'eval_on_test_set': True, 'eval_steps_on_train_set': 20, 'predict_top_k': 0, 'learning_rate_num_cosine_cycles_by_epoch': 1.25, 'log_predictions': False, 'compute_metrics_each_n_steps': 1, 'experiments_group': 'default', 'session_seq_length_max': 20, 'learning_rate_schedule': 'linear_with_warmup', 'validate_every': -1}\n", + "[INFO|trainer.py:1196] 2023-03-10 06:31:55,376 >> ***** Running training *****\n", + "[INFO|trainer.py:1197] 2023-03-10 06:31:55,376 >> Num examples = 86528\n", + "[INFO|trainer.py:1198] 2023-03-10 06:31:55,376 >> Num Epochs = 5\n", + "[INFO|trainer.py:1199] 2023-03-10 06:31:55,376 >> Instantaneous batch size per device = 128\n", + "[INFO|trainer.py:1200] 2023-03-10 06:31:55,376 >> Total train batch size (w. 
parallel, distributed & accumulation) = 128\n", + "[INFO|trainer.py:1201] 2023-03-10 06:31:55,376 >> Gradient Accumulation steps = 1\n", + "[INFO|trainer.py:1202] 2023-03-10 06:31:55,376 >> Total optimization steps = 3380\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "DLL 2023-03-10 06:31:54.884501 - PARAMETER data_path : /transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/ features_schema_path : /workspace/examples/t4rec_paper_experiments/datasets_configs/ecom_rees46/rees46_schema.pbtxt start_time_window_index : 1 final_time_window_index : 2 time_window_folder_pad_digits : 4 no_incremental_training : False training_time_window_size : 0 use_side_information_features : False input_features_aggregation : concat model_type : xlnet tf_out_activation : tanh mlm : True mlm_probability : 0.30000000000000004 plm : False plm_probability : 0.25 plm_max_span_length : 5 plm_mask_input : False plm_permute_all : False rtd : False rtd_sample_from_batch : False rtd_use_batch_interaction : False rtd_discriminator_loss_weight : 50 rtd_generator_loss_weight : 1 rtd_tied_generator : False d_model : 192 n_layer : 3 n_head : 16 layer_norm_eps : 1e-12 initializer_range : 0.02 hidden_act : gelu dropout : 0.0 summary_type : last num_hidden_groups : 1 inner_group_num : 1 eval_on_last_item_seq_only : True train_on_last_item_seq_only : False mf_constrained_embeddings : True item_embedding_dim : 448 numeric_features_project_to_embedding_dim : 0 numeric_features_soft_one_hot_encoding_num_embeddings : 0 stochastic_shared_embeddings_replacement_prob : 0.1 softmax_temperature : 1.0 label_smoothing : 0.0 embedding_dim_from_cardinality_multiplier : 2.0 item_id_embeddings_init_std : 0.11 other_embeddings_init_std : 0.02 layer_norm_featurewise : True attn_type : bi input_dropout : 0.1 loss_type : cross_entropy similarity_type : concat_mlp inp_merge : mlp learning_rate_warmup_steps : 0 avg_session_length : None output_dir : ./tmp/ overwrite_output_dir : 
True do_train : True do_eval : True do_predict : False prediction_loss_only : False per_device_train_batch_size : 128 per_device_eval_batch_size : 128 per_gpu_train_batch_size : None per_gpu_eval_batch_size : None gradient_accumulation_steps : 1 eval_accumulation_steps : None learning_rate : 0.0006667377132554976 weight_decay : 3.910060265627374e-05 adam_beta1 : 0.9 adam_beta2 : 0.999 adam_epsilon : 1e-08 max_grad_norm : 1.0 num_train_epochs : 5.0 max_steps : -1 lr_scheduler_type : linear warmup_ratio : 0.0 warmup_steps : 0 log_level : -1 log_level_replica : -1 log_on_each_node : True logging_dir : ./tmp/runs/Mar10_06-31-52_7dfa224f788e logging_first_step : False logging_steps : 500 logging_nan_inf_filter : True save_steps : 0 save_total_limit : None save_on_each_node : False no_cuda : False seed : 0 fp16 : True fp16_opt_level : O1 fp16_backend : auto fp16_full_eval : False local_rank : -1 xpu_backend : None tpu_num_cores : None tpu_metrics_debug : False debug : [] dataloader_drop_last : True eval_steps : None dataloader_num_workers : 0 past_index : -1 run_name : None disable_tqdm : False remove_unused_columns : True label_names : None load_best_model_at_end : False metric_for_best_model : None greater_is_better : None ignore_data_skip : False sharded_ddp : [] deepspeed : None label_smoothing_factor : 0.0 adafactor : False group_by_length : False length_column_name : length report_to : [] ddp_find_unused_parameters : None dataloader_pin_memory : True skip_memory_metrics : True use_legacy_prediction_loop : False push_to_hub : False resume_from_checkpoint : None hub_model_id : None hub_token : None gradient_checkpointing : False push_to_hub_model_id : None push_to_hub_organization : None push_to_hub_token : None _n_gpu : 1 mp_parameters : max_sequence_length : 20 shuffle_buffer_size : 0 data_loader_engine : merlin eval_on_test_set : True eval_steps_on_train_set : 20 predict_top_k : 0 learning_rate_num_cosine_cycles_by_epoch : 1.25 log_predictions : False 
compute_metrics_each_n_steps : 1 experiments_group : default session_seq_length_max : 20 learning_rate_schedule : linear_with_warmup validate_every : -1 \n", + "\n", + "***** Launch training for day 1: *****\n", + "{'loss': 10.1421, 'learning_rate': 0.0005681078740165187, 'epoch': 0.74}\n", + "{'loss': 9.1819, 'learning_rate': 0.00046947803477753974, 'epoch': 1.48}\n", + "{'loss': 8.9005, 'learning_rate': 0.0003708481955385607, 'epoch': 2.22}\n", + "{'loss': 8.6487, 'learning_rate': 0.0002722183562995818, 'epoch': 2.96}\n", + "{'loss': 8.4605, 'learning_rate': 0.0001735885170606029, 'epoch': 3.7}\n", + "{'loss': 8.2935, 'learning_rate': 7.4958677821624e-05, 'epoch': 4.44}\n", + "{'train_runtime': 268.4247, 'train_samples_per_second': 0.019, 'train_steps_per_second': 12.592, 'train_loss': 8.861479521079882, 'epoch': 5.0}\n", + "\n", + "***** Evaluation results for day 2 (train set):*****\n", + "\n", + "{'train_/next-item/ndcg_at_10': 0.09560234844684601, 'train_/next-item/ndcg_at_20': 0.1121138259768486, 'train_/next-item/recall_at_10': 0.17148438096046448, 'train_/next-item/recall_at_20': 0.23710937798023224, 'train_/loss': 7.855612754821777, 'train_runtime': 0.6468, 'train_samples_per_second': 3957.894, 'train_steps_per_second': 30.921}\n", + "\n", + "***** Evaluation results for day 2 (eval set):*****\n", + "\n", + "{'eval_/next-item/ndcg_at_10': 0.09119625389575958, 'eval_/next-item/ndcg_at_20': 0.10950089246034622, 'eval_/next-item/recall_at_10': 0.1640625, 'eval_/next-item/recall_at_20': 0.23644576966762543, 'eval_/loss': 8.2479248046875, 'eval_runtime': 2.2235, 'eval_samples_per_second': 4778.131, 'eval_steps_per_second': 37.329}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Task exception was never retrieved\n", + "future: ._handle_stream() done, defined at /usr/local/lib/python3.8/dist-packages/IPython/core/magics/script.py:211> exception=ValueError('Separator is not found, and chunk exceed the limit')>\n", + "Traceback (most 
recent call last):\n", + " File \"/usr/lib/python3.8/asyncio/streams.py\", line 540, in readline\n", + " line = await self.readuntil(sep)\n", + " File \"/usr/lib/python3.8/asyncio/streams.py\", line 618, in readuntil\n", + " raise exceptions.LimitOverrunError(\n", + "asyncio.exceptions.LimitOverrunError: Separator is not found, and chunk exceed the limit\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/usr/local/lib/python3.8/dist-packages/IPython/core/magics/script.py\", line 213, in _handle_stream\n", + " line = (await stream.readline()).decode(\"utf8\")\n", + " File \"/usr/lib/python3.8/asyncio/streams.py\", line 549, in readline\n", + " raise ValueError(e.args[0])\n", + "ValueError: Separator is not found, and chunk exceed the limit\n" + ] + } + ], + "source": [ + "%%bash\n", + "\n", + "FEATURE_SCHEMA_PATH=/workspace/examples/t4rec_paper_experiments/datasets_configs/ecom_rees46/rees46_schema.pbtxt\n", + "DATA_PATH=/transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/\n", + "NUM_EPOCHS=5\n", + "SEED=0\n", + "\n", + "python3 ../transf_exp_main_modified.py --output_dir ./tmp/ --overwrite_output_dir --do_train --do_eval --save_steps 0 --data_path $DATA_PATH --features_schema_path $FEATURE_SCHEMA_PATH --fp16 --data_loader_engine merlin --start_time_window_index 1 --final_time_window_index 2 --time_window_folder_pad_digits 4 --model_type xlnet --loss_type cross_entropy --per_device_eval_batch_size 128 --similarity_type concat_mlp --tf_out_activation tanh --inp_merge mlp --learning_rate_warmup_steps 0 --learning_rate_schedule linear_with_warmup --hidden_act gelu --num_train_epochs $NUM_EPOCHS --dataloader_drop_last --compute_metrics_each_n_steps 1 --session_seq_length_max 20 --eval_on_last_item_seq_only --mf_constrained_embeddings --layer_norm_featurewise --attn_type bi --mlm --per_device_train_batch_size 128 --learning_rate 0.0006667377132554976 --dropout 
0.0 --input_dropout 0.1 --weight_decay 3.910060265627374e-05 --d_model 192 --item_embedding_dim 448 --n_layer 3 --n_head 16 --label_smoothing 0.0 --stochastic_shared_embeddings_replacement_prob 0.1 --item_id_embeddings_init_std 0.11 --other_embeddings_init_std 0.02 --mlm_probability 0.30000000000000004 --eval_on_test_set --seed $SEED --report_to none" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "39c50e18", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "03/10/2023 06:36:35 - WARNING - __main__ - Process rank: -1, device: cuda:0, n_gpu: 1, distributed training: False, 16-bits training: True\n", + "03/10/2023 06:36:37 - WARNING - transformers4rec - Projecting inputs of NextItemPredictionTask to'448' As weight tying requires the input dimension '192' to be equal to the item-id embedding dimension '448'\n", + "[INFO|trainer.py:434] 2023-03-10 06:36:37,141 >> Using amp fp16 backend\n", + "03/10/2023 06:36:37 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - Training, Model and Data parameters {'data_path': '/transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/', 'features_schema_path': '/workspace/examples/t4rec_paper_experiments/datasets_configs/ecom_rees46/rees46_schema.pbtxt', 'start_time_window_index': 1, 'final_time_window_index': 2, 'time_window_folder_pad_digits': 4, 'no_incremental_training': False, 'training_time_window_size': 0, 'use_side_information_features': False, 'input_features_aggregation': 'concat', 'model_type': 'xlnet', 'tf_out_activation': 'tanh', 'mlm': True, 'mlm_probability': 0.30000000000000004, 'plm': False, 'plm_probability': 0.25, 'plm_max_span_length': 5, 'plm_mask_input': False, 'plm_permute_all': False, 'rtd': False, 'rtd_sample_from_batch': False, 'rtd_use_batch_interaction': False, 'rtd_discriminator_loss_weight': 50, 'rtd_generator_loss_weight': 1, 'rtd_tied_generator': False, 'd_model': 192, 'n_layer': 3, 'n_head': 16, 
'layer_norm_eps': 1e-12, 'initializer_range': 0.02, 'hidden_act': 'gelu', 'dropout': 0.0, 'summary_type': 'last', 'num_hidden_groups': 1, 'inner_group_num': 1, 'eval_on_last_item_seq_only': True, 'train_on_last_item_seq_only': False, 'mf_constrained_embeddings': True, 'item_embedding_dim': 448, 'numeric_features_project_to_embedding_dim': 0, 'numeric_features_soft_one_hot_encoding_num_embeddings': 0, 'stochastic_shared_embeddings_replacement_prob': 0.1, 'softmax_temperature': 1.0, 'label_smoothing': 0.0, 'embedding_dim_from_cardinality_multiplier': 2.0, 'item_id_embeddings_init_std': 0.11, 'other_embeddings_init_std': 0.02, 'layer_norm_featurewise': True, 'attn_type': 'bi', 'input_dropout': 0.1, 'loss_type': 'cross_entropy', 'similarity_type': 'concat_mlp', 'inp_merge': 'mlp', 'learning_rate_warmup_steps': 0, 'avg_session_length': None, 'output_dir': './tmp/', 'overwrite_output_dir': True, 'do_train': True, 'do_eval': True, 'do_predict': False, 'prediction_loss_only': False, 'per_device_train_batch_size': 128, 'per_device_eval_batch_size': 128, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'learning_rate': 0.0006667377132554976, 'weight_decay': 3.910060265627374e-05, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 5.0, 'max_steps': -1, 'lr_scheduler_type': 'linear', 'warmup_ratio': 0.0, 'warmup_steps': 0, 'log_level': -1, 'log_level_replica': -1, 'log_on_each_node': True, 'logging_dir': './tmp/runs/Mar10_06-36-34_7dfa224f788e', 'logging_first_step': False, 'logging_steps': 500, 'logging_nan_inf_filter': True, 'save_steps': 0, 'save_total_limit': None, 'save_on_each_node': False, 'no_cuda': False, 'seed': 1, 'fp16': True, 'fp16_opt_level': 'O1', 'fp16_backend': 'auto', 'fp16_full_eval': False, 'local_rank': -1, 'xpu_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': True, 
'eval_steps': None, 'dataloader_num_workers': 0, 'past_index': -1, 'run_name': None, 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'sharded_ddp': [], 'deepspeed': None, 'label_smoothing_factor': 0.0, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': [], 'ddp_find_unused_parameters': None, 'dataloader_pin_memory': True, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_token': None, 'gradient_checkpointing': False, 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': None, '_n_gpu': 1, 'mp_parameters': '', 'max_sequence_length': 20, 'shuffle_buffer_size': 0, 'data_loader_engine': 'merlin', 'eval_on_test_set': True, 'eval_steps_on_train_set': 20, 'predict_top_k': 0, 'learning_rate_num_cosine_cycles_by_epoch': 1.25, 'log_predictions': False, 'compute_metrics_each_n_steps': 1, 'experiments_group': 'default', 'session_seq_length_max': 20, 'learning_rate_schedule': 'linear_with_warmup', 'validate_every': -1}\n", + "[INFO|trainer.py:1196] 2023-03-10 06:36:37,612 >> ***** Running training *****\n", + "[INFO|trainer.py:1197] 2023-03-10 06:36:37,612 >> Num examples = 86528\n", + "[INFO|trainer.py:1198] 2023-03-10 06:36:37,612 >> Num Epochs = 5\n", + "[INFO|trainer.py:1199] 2023-03-10 06:36:37,612 >> Instantaneous batch size per device = 128\n", + "[INFO|trainer.py:1200] 2023-03-10 06:36:37,612 >> Total train batch size (w. 
parallel, distributed & accumulation) = 128\n", + "[INFO|trainer.py:1201] 2023-03-10 06:36:37,612 >> Gradient Accumulation steps = 1\n", + "[INFO|trainer.py:1202] 2023-03-10 06:36:37,612 >> Total optimization steps = 3380\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "DLL 2023-03-10 06:36:37.142319 - PARAMETER data_path : /transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/ features_schema_path : /workspace/examples/t4rec_paper_experiments/datasets_configs/ecom_rees46/rees46_schema.pbtxt start_time_window_index : 1 final_time_window_index : 2 time_window_folder_pad_digits : 4 no_incremental_training : False training_time_window_size : 0 use_side_information_features : False input_features_aggregation : concat model_type : xlnet tf_out_activation : tanh mlm : True mlm_probability : 0.30000000000000004 plm : False plm_probability : 0.25 plm_max_span_length : 5 plm_mask_input : False plm_permute_all : False rtd : False rtd_sample_from_batch : False rtd_use_batch_interaction : False rtd_discriminator_loss_weight : 50 rtd_generator_loss_weight : 1 rtd_tied_generator : False d_model : 192 n_layer : 3 n_head : 16 layer_norm_eps : 1e-12 initializer_range : 0.02 hidden_act : gelu dropout : 0.0 summary_type : last num_hidden_groups : 1 inner_group_num : 1 eval_on_last_item_seq_only : True train_on_last_item_seq_only : False mf_constrained_embeddings : True item_embedding_dim : 448 numeric_features_project_to_embedding_dim : 0 numeric_features_soft_one_hot_encoding_num_embeddings : 0 stochastic_shared_embeddings_replacement_prob : 0.1 softmax_temperature : 1.0 label_smoothing : 0.0 embedding_dim_from_cardinality_multiplier : 2.0 item_id_embeddings_init_std : 0.11 other_embeddings_init_std : 0.02 layer_norm_featurewise : True attn_type : bi input_dropout : 0.1 loss_type : cross_entropy similarity_type : concat_mlp inp_merge : mlp learning_rate_warmup_steps : 0 avg_session_length : None output_dir : ./tmp/ overwrite_output_dir : 
True do_train : True do_eval : True do_predict : False prediction_loss_only : False per_device_train_batch_size : 128 per_device_eval_batch_size : 128 per_gpu_train_batch_size : None per_gpu_eval_batch_size : None gradient_accumulation_steps : 1 eval_accumulation_steps : None learning_rate : 0.0006667377132554976 weight_decay : 3.910060265627374e-05 adam_beta1 : 0.9 adam_beta2 : 0.999 adam_epsilon : 1e-08 max_grad_norm : 1.0 num_train_epochs : 5.0 max_steps : -1 lr_scheduler_type : linear warmup_ratio : 0.0 warmup_steps : 0 log_level : -1 log_level_replica : -1 log_on_each_node : True logging_dir : ./tmp/runs/Mar10_06-36-34_7dfa224f788e logging_first_step : False logging_steps : 500 logging_nan_inf_filter : True save_steps : 0 save_total_limit : None save_on_each_node : False no_cuda : False seed : 1 fp16 : True fp16_opt_level : O1 fp16_backend : auto fp16_full_eval : False local_rank : -1 xpu_backend : None tpu_num_cores : None tpu_metrics_debug : False debug : [] dataloader_drop_last : True eval_steps : None dataloader_num_workers : 0 past_index : -1 run_name : None disable_tqdm : False remove_unused_columns : True label_names : None load_best_model_at_end : False metric_for_best_model : None greater_is_better : None ignore_data_skip : False sharded_ddp : [] deepspeed : None label_smoothing_factor : 0.0 adafactor : False group_by_length : False length_column_name : length report_to : [] ddp_find_unused_parameters : None dataloader_pin_memory : True skip_memory_metrics : True use_legacy_prediction_loop : False push_to_hub : False resume_from_checkpoint : None hub_model_id : None hub_token : None gradient_checkpointing : False push_to_hub_model_id : None push_to_hub_organization : None push_to_hub_token : None _n_gpu : 1 mp_parameters : max_sequence_length : 20 shuffle_buffer_size : 0 data_loader_engine : merlin eval_on_test_set : True eval_steps_on_train_set : 20 predict_top_k : 0 learning_rate_num_cosine_cycles_by_epoch : 1.25 log_predictions : False 
compute_metrics_each_n_steps : 1 experiments_group : default session_seq_length_max : 20 learning_rate_schedule : linear_with_warmup validate_every : -1 \n", + "\n", + "***** Launch training for day 1: *****\n", + "{'loss': 10.1835, 'learning_rate': 0.0005681078740165187, 'epoch': 0.74}\n", + "{'loss': 9.1997, 'learning_rate': 0.00046947803477753974, 'epoch': 1.48}\n", + "{'loss': 8.8927, 'learning_rate': 0.0003708481955385607, 'epoch': 2.22}\n", + "{'loss': 8.6939, 'learning_rate': 0.0002722183562995818, 'epoch': 2.96}\n", + "{'loss': 8.4857, 'learning_rate': 0.0001735885170606029, 'epoch': 3.7}\n", + "{'loss': 8.4036, 'learning_rate': 7.4958677821624e-05, 'epoch': 4.44}\n", + "{'train_runtime': 268.0277, 'train_samples_per_second': 0.019, 'train_steps_per_second': 12.611, 'train_loss': 8.904246891179733, 'epoch': 5.0}\n", + "\n", + "***** Evaluation results for day 2 (train set):*****\n", + "\n", + "{'train_/next-item/ndcg_at_10': 0.08619051426649094, 'train_/next-item/ndcg_at_20': 0.10135936737060547, 'train_/next-item/recall_at_10': 0.16093750298023224, 'train_/next-item/recall_at_20': 0.22109375894069672, 'train_/loss': 7.952397346496582, 'train_runtime': 0.6468, 'train_samples_per_second': 3958.17, 'train_steps_per_second': 30.923}\n", + "\n", + "***** Evaluation results for day 2 (eval set):*****\n", + "\n", + "{'eval_/next-item/ndcg_at_10': 0.08869394659996033, 'eval_/next-item/ndcg_at_20': 0.10502538084983826, 'eval_/next-item/recall_at_10': 0.16217996180057526, 'eval_/next-item/recall_at_20': 0.22693899273872375, 'eval_/loss': 8.337870597839355, 'eval_runtime': 2.2357, 'eval_samples_per_second': 4751.905, 'eval_steps_per_second': 37.124}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Task exception was never retrieved\n", + "future: ._handle_stream() done, defined at /usr/local/lib/python3.8/dist-packages/IPython/core/magics/script.py:211> exception=ValueError('Separator is not found, and chunk exceed the limit')>\n", + 
"Traceback (most recent call last):\n", + " File \"/usr/lib/python3.8/asyncio/streams.py\", line 540, in readline\n", + " line = await self.readuntil(sep)\n", + " File \"/usr/lib/python3.8/asyncio/streams.py\", line 618, in readuntil\n", + " raise exceptions.LimitOverrunError(\n", + "asyncio.exceptions.LimitOverrunError: Separator is not found, and chunk exceed the limit\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/usr/local/lib/python3.8/dist-packages/IPython/core/magics/script.py\", line 213, in _handle_stream\n", + " line = (await stream.readline()).decode(\"utf8\")\n", + " File \"/usr/lib/python3.8/asyncio/streams.py\", line 549, in readline\n", + " raise ValueError(e.args[0])\n", + "ValueError: Separator is not found, and chunk exceed the limit\n" + ] + } + ], + "source": [ + "%%bash\n", + "\n", + "FEATURE_SCHEMA_PATH=/workspace/examples/t4rec_paper_experiments/datasets_configs/ecom_rees46/rees46_schema.pbtxt\n", + "DATA_PATH=/transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/\n", + "NUM_EPOCHS=5\n", + "SEED=1\n", + "\n", + "python3 ../transf_exp_main_modified.py --output_dir ./tmp/ --overwrite_output_dir --do_train --do_eval --save_steps 0 --data_path $DATA_PATH --features_schema_path $FEATURE_SCHEMA_PATH --fp16 --data_loader_engine merlin --start_time_window_index 1 --final_time_window_index 2 --time_window_folder_pad_digits 4 --model_type xlnet --loss_type cross_entropy --per_device_eval_batch_size 128 --similarity_type concat_mlp --tf_out_activation tanh --inp_merge mlp --learning_rate_warmup_steps 0 --learning_rate_schedule linear_with_warmup --hidden_act gelu --num_train_epochs $NUM_EPOCHS --dataloader_drop_last --compute_metrics_each_n_steps 1 --session_seq_length_max 20 --eval_on_last_item_seq_only --mf_constrained_embeddings --layer_norm_featurewise --attn_type bi --mlm --per_device_train_batch_size 128 --learning_rate 
0.0006667377132554976 --dropout 0.0 --input_dropout 0.1 --weight_decay 3.910060265627374e-05 --d_model 192 --item_embedding_dim 448 --n_layer 3 --n_head 16 --label_smoothing 0.0 --stochastic_shared_embeddings_replacement_prob 0.1 --item_id_embeddings_init_std 0.11 --other_embeddings_init_std 0.02 --mlm_probability 0.30000000000000004 --eval_on_test_set --seed $SEED --report_to none" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "7012acaf", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "03/10/2023 06:41:17 - WARNING - __main__ - Process rank: -1, device: cuda:0, n_gpu: 1, distributed training: False, 16-bits training: True\n", + "03/10/2023 06:41:18 - WARNING - transformers4rec - Projecting inputs of NextItemPredictionTask to'448' As weight tying requires the input dimension '192' to be equal to the item-id embedding dimension '448'\n", + "[INFO|trainer.py:434] 2023-03-10 06:41:18,925 >> Using amp fp16 backend\n", + "03/10/2023 06:41:18 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - Training, Model and Data parameters {'data_path': '/transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/', 'features_schema_path': '/workspace/examples/t4rec_paper_experiments/datasets_configs/ecom_rees46/rees46_schema.pbtxt', 'start_time_window_index': 1, 'final_time_window_index': 2, 'time_window_folder_pad_digits': 4, 'no_incremental_training': False, 'training_time_window_size': 0, 'use_side_information_features': False, 'input_features_aggregation': 'concat', 'model_type': 'xlnet', 'tf_out_activation': 'tanh', 'mlm': True, 'mlm_probability': 0.30000000000000004, 'plm': False, 'plm_probability': 0.25, 'plm_max_span_length': 5, 'plm_mask_input': False, 'plm_permute_all': False, 'rtd': False, 'rtd_sample_from_batch': False, 'rtd_use_batch_interaction': False, 'rtd_discriminator_loss_weight': 50, 'rtd_generator_loss_weight': 1, 'rtd_tied_generator': False, 'd_model': 192, 
'n_layer': 3, 'n_head': 16, 'layer_norm_eps': 1e-12, 'initializer_range': 0.02, 'hidden_act': 'gelu', 'dropout': 0.0, 'summary_type': 'last', 'num_hidden_groups': 1, 'inner_group_num': 1, 'eval_on_last_item_seq_only': True, 'train_on_last_item_seq_only': False, 'mf_constrained_embeddings': True, 'item_embedding_dim': 448, 'numeric_features_project_to_embedding_dim': 0, 'numeric_features_soft_one_hot_encoding_num_embeddings': 0, 'stochastic_shared_embeddings_replacement_prob': 0.1, 'softmax_temperature': 1.0, 'label_smoothing': 0.0, 'embedding_dim_from_cardinality_multiplier': 2.0, 'item_id_embeddings_init_std': 0.11, 'other_embeddings_init_std': 0.02, 'layer_norm_featurewise': True, 'attn_type': 'bi', 'input_dropout': 0.1, 'loss_type': 'cross_entropy', 'similarity_type': 'concat_mlp', 'inp_merge': 'mlp', 'learning_rate_warmup_steps': 0, 'avg_session_length': None, 'output_dir': './tmp/', 'overwrite_output_dir': True, 'do_train': True, 'do_eval': True, 'do_predict': False, 'prediction_loss_only': False, 'per_device_train_batch_size': 128, 'per_device_eval_batch_size': 128, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'learning_rate': 0.0006667377132554976, 'weight_decay': 3.910060265627374e-05, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 5.0, 'max_steps': -1, 'lr_scheduler_type': 'linear', 'warmup_ratio': 0.0, 'warmup_steps': 0, 'log_level': -1, 'log_level_replica': -1, 'log_on_each_node': True, 'logging_dir': './tmp/runs/Mar10_06-41-16_7dfa224f788e', 'logging_first_step': False, 'logging_steps': 500, 'logging_nan_inf_filter': True, 'save_steps': 0, 'save_total_limit': None, 'save_on_each_node': False, 'no_cuda': False, 'seed': 2, 'fp16': True, 'fp16_opt_level': 'O1', 'fp16_backend': 'auto', 'fp16_full_eval': False, 'local_rank': -1, 'xpu_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 
'dataloader_drop_last': True, 'eval_steps': None, 'dataloader_num_workers': 0, 'past_index': -1, 'run_name': None, 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'sharded_ddp': [], 'deepspeed': None, 'label_smoothing_factor': 0.0, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': [], 'ddp_find_unused_parameters': None, 'dataloader_pin_memory': True, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_token': None, 'gradient_checkpointing': False, 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': None, '_n_gpu': 1, 'mp_parameters': '', 'max_sequence_length': 20, 'shuffle_buffer_size': 0, 'data_loader_engine': 'merlin', 'eval_on_test_set': True, 'eval_steps_on_train_set': 20, 'predict_top_k': 0, 'learning_rate_num_cosine_cycles_by_epoch': 1.25, 'log_predictions': False, 'compute_metrics_each_n_steps': 1, 'experiments_group': 'default', 'session_seq_length_max': 20, 'learning_rate_schedule': 'linear_with_warmup', 'validate_every': -1}\n", + "[INFO|trainer.py:1196] 2023-03-10 06:41:19,383 >> ***** Running training *****\n", + "[INFO|trainer.py:1197] 2023-03-10 06:41:19,383 >> Num examples = 86528\n", + "[INFO|trainer.py:1198] 2023-03-10 06:41:19,383 >> Num Epochs = 5\n", + "[INFO|trainer.py:1199] 2023-03-10 06:41:19,384 >> Instantaneous batch size per device = 128\n", + "[INFO|trainer.py:1200] 2023-03-10 06:41:19,384 >> Total train batch size (w. 
parallel, distributed & accumulation) = 128\n", + "[INFO|trainer.py:1201] 2023-03-10 06:41:19,384 >> Gradient Accumulation steps = 1\n", + "[INFO|trainer.py:1202] 2023-03-10 06:41:19,384 >> Total optimization steps = 3380\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "DLL 2023-03-10 06:41:18.926096 - PARAMETER data_path : /transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/ features_schema_path : /workspace/examples/t4rec_paper_experiments/datasets_configs/ecom_rees46/rees46_schema.pbtxt start_time_window_index : 1 final_time_window_index : 2 time_window_folder_pad_digits : 4 no_incremental_training : False training_time_window_size : 0 use_side_information_features : False input_features_aggregation : concat model_type : xlnet tf_out_activation : tanh mlm : True mlm_probability : 0.30000000000000004 plm : False plm_probability : 0.25 plm_max_span_length : 5 plm_mask_input : False plm_permute_all : False rtd : False rtd_sample_from_batch : False rtd_use_batch_interaction : False rtd_discriminator_loss_weight : 50 rtd_generator_loss_weight : 1 rtd_tied_generator : False d_model : 192 n_layer : 3 n_head : 16 layer_norm_eps : 1e-12 initializer_range : 0.02 hidden_act : gelu dropout : 0.0 summary_type : last num_hidden_groups : 1 inner_group_num : 1 eval_on_last_item_seq_only : True train_on_last_item_seq_only : False mf_constrained_embeddings : True item_embedding_dim : 448 numeric_features_project_to_embedding_dim : 0 numeric_features_soft_one_hot_encoding_num_embeddings : 0 stochastic_shared_embeddings_replacement_prob : 0.1 softmax_temperature : 1.0 label_smoothing : 0.0 embedding_dim_from_cardinality_multiplier : 2.0 item_id_embeddings_init_std : 0.11 other_embeddings_init_std : 0.02 layer_norm_featurewise : True attn_type : bi input_dropout : 0.1 loss_type : cross_entropy similarity_type : concat_mlp inp_merge : mlp learning_rate_warmup_steps : 0 avg_session_length : None output_dir : ./tmp/ overwrite_output_dir : 
True do_train : True do_eval : True do_predict : False prediction_loss_only : False per_device_train_batch_size : 128 per_device_eval_batch_size : 128 per_gpu_train_batch_size : None per_gpu_eval_batch_size : None gradient_accumulation_steps : 1 eval_accumulation_steps : None learning_rate : 0.0006667377132554976 weight_decay : 3.910060265627374e-05 adam_beta1 : 0.9 adam_beta2 : 0.999 adam_epsilon : 1e-08 max_grad_norm : 1.0 num_train_epochs : 5.0 max_steps : -1 lr_scheduler_type : linear warmup_ratio : 0.0 warmup_steps : 0 log_level : -1 log_level_replica : -1 log_on_each_node : True logging_dir : ./tmp/runs/Mar10_06-41-16_7dfa224f788e logging_first_step : False logging_steps : 500 logging_nan_inf_filter : True save_steps : 0 save_total_limit : None save_on_each_node : False no_cuda : False seed : 2 fp16 : True fp16_opt_level : O1 fp16_backend : auto fp16_full_eval : False local_rank : -1 xpu_backend : None tpu_num_cores : None tpu_metrics_debug : False debug : [] dataloader_drop_last : True eval_steps : None dataloader_num_workers : 0 past_index : -1 run_name : None disable_tqdm : False remove_unused_columns : True label_names : None load_best_model_at_end : False metric_for_best_model : None greater_is_better : None ignore_data_skip : False sharded_ddp : [] deepspeed : None label_smoothing_factor : 0.0 adafactor : False group_by_length : False length_column_name : length report_to : [] ddp_find_unused_parameters : None dataloader_pin_memory : True skip_memory_metrics : True use_legacy_prediction_loop : False push_to_hub : False resume_from_checkpoint : None hub_model_id : None hub_token : None gradient_checkpointing : False push_to_hub_model_id : None push_to_hub_organization : None push_to_hub_token : None _n_gpu : 1 mp_parameters : max_sequence_length : 20 shuffle_buffer_size : 0 data_loader_engine : merlin eval_on_test_set : True eval_steps_on_train_set : 20 predict_top_k : 0 learning_rate_num_cosine_cycles_by_epoch : 1.25 log_predictions : False 
compute_metrics_each_n_steps : 1 experiments_group : default session_seq_length_max : 20 learning_rate_schedule : linear_with_warmup validate_every : -1 \n", + "\n", + "***** Launch training for day 1: *****\n", + "{'loss': 10.1984, 'learning_rate': 0.0005683051336949965, 'epoch': 0.74}\n", + "{'loss': 9.1995, 'learning_rate': 0.0004696752944560177, 'epoch': 1.48}\n", + "{'loss': 8.9484, 'learning_rate': 0.0003710454552170388, 'epoch': 2.22}\n", + "{'loss': 8.7082, 'learning_rate': 0.0002724156159780598, 'epoch': 2.96}\n", + "{'loss': 8.5479, 'learning_rate': 0.00017378577673908085, 'epoch': 3.7}\n", + "{'loss': 8.4013, 'learning_rate': 7.515593750010194e-05, 'epoch': 4.44}\n", + "{'train_runtime': 267.6305, 'train_samples_per_second': 0.019, 'train_steps_per_second': 12.629, 'train_loss': 8.929472757373336, 'epoch': 5.0}\n", + "\n", + "***** Evaluation results for day 2 (train set):*****\n", + "\n", + "{'train_/next-item/ndcg_at_10': 0.0865924283862114, 'train_/next-item/ndcg_at_20': 0.10158973932266235, 'train_/next-item/recall_at_10': 0.15976563096046448, 'train_/next-item/recall_at_20': 0.21914063394069672, 'train_/loss': 7.99446964263916, 'train_runtime': 0.6446, 'train_samples_per_second': 3971.675, 'train_steps_per_second': 31.029}\n", + "\n", + "***** Evaluation results for day 2 (eval set):*****\n", + "\n", + "{'eval_/next-item/ndcg_at_10': 0.08677017688751221, 'eval_/next-item/ndcg_at_20': 0.10235893726348877, 'eval_/next-item/recall_at_10': 0.16114456951618195, 'eval_/next-item/recall_at_20': 0.223268061876297, 'eval_/loss': 8.346117973327637, 'eval_runtime': 2.2174, 'eval_samples_per_second': 4791.094, 'eval_steps_per_second': 37.43}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Task exception was never retrieved\n", + "future: ._handle_stream() done, defined at /usr/local/lib/python3.8/dist-packages/IPython/core/magics/script.py:211> exception=ValueError('Separator is not found, and chunk exceed the limit')>\n", + 
"Traceback (most recent call last):\n", + " File \"/usr/lib/python3.8/asyncio/streams.py\", line 540, in readline\n", + " line = await self.readuntil(sep)\n", + " File \"/usr/lib/python3.8/asyncio/streams.py\", line 618, in readuntil\n", + " raise exceptions.LimitOverrunError(\n", + "asyncio.exceptions.LimitOverrunError: Separator is not found, and chunk exceed the limit\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/usr/local/lib/python3.8/dist-packages/IPython/core/magics/script.py\", line 213, in _handle_stream\n", + " line = (await stream.readline()).decode(\"utf8\")\n", + " File \"/usr/lib/python3.8/asyncio/streams.py\", line 549, in readline\n", + " raise ValueError(e.args[0])\n", + "ValueError: Separator is not found, and chunk exceed the limit\n" + ] + } + ], + "source": [ + "%%bash\n", + "\n", + "FEATURE_SCHEMA_PATH=/workspace/examples/t4rec_paper_experiments/datasets_configs/ecom_rees46/rees46_schema.pbtxt\n", + "DATA_PATH=/transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/\n", + "NUM_EPOCHS=5\n", + "SEED=2\n", + "\n", + "python3 ../transf_exp_main_modified.py --output_dir ./tmp/ --overwrite_output_dir --do_train --do_eval --save_steps 0 --data_path $DATA_PATH --features_schema_path $FEATURE_SCHEMA_PATH --fp16 --data_loader_engine merlin --start_time_window_index 1 --final_time_window_index 2 --time_window_folder_pad_digits 4 --model_type xlnet --loss_type cross_entropy --per_device_eval_batch_size 128 --similarity_type concat_mlp --tf_out_activation tanh --inp_merge mlp --learning_rate_warmup_steps 0 --learning_rate_schedule linear_with_warmup --hidden_act gelu --num_train_epochs $NUM_EPOCHS --dataloader_drop_last --compute_metrics_each_n_steps 1 --session_seq_length_max 20 --eval_on_last_item_seq_only --mf_constrained_embeddings --layer_norm_featurewise --attn_type bi --mlm --per_device_train_batch_size 128 --learning_rate 
0.0006667377132554976 --dropout 0.0 --input_dropout 0.1 --weight_decay 3.910060265627374e-05 --d_model 192 --item_embedding_dim 448 --n_layer 3 --n_head 16 --label_smoothing 0.0 --stochastic_shared_embeddings_replacement_prob 0.1 --item_id_embeddings_init_std 0.11 --other_embeddings_init_std 0.02 --mlm_probability 0.30000000000000004 --eval_on_test_set --seed $SEED --report_to none" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "c8abb213", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "03/10/2023 06:50:42 - WARNING - __main__ - Process rank: -1, device: cuda:0, n_gpu: 1, distributed training: False, 16-bits training: True\n", + "03/10/2023 06:50:44 - WARNING - transformers4rec - Projecting inputs of NextItemPredictionTask to'448' As weight tying requires the input dimension '192' to be equal to the item-id embedding dimension '448'\n", + "[INFO|trainer.py:434] 2023-03-10 06:50:44,307 >> Using amp fp16 backend\n", + "03/10/2023 06:50:44 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - Training, Model and Data parameters {'data_path': '/transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/', 'features_schema_path': '/workspace/examples/t4rec_paper_experiments/datasets_configs/ecom_rees46/rees46_schema.pbtxt', 'start_time_window_index': 1, 'final_time_window_index': 2, 'time_window_folder_pad_digits': 4, 'no_incremental_training': False, 'training_time_window_size': 0, 'use_side_information_features': False, 'input_features_aggregation': 'concat', 'model_type': 'xlnet', 'tf_out_activation': 'tanh', 'mlm': True, 'mlm_probability': 0.30000000000000004, 'plm': False, 'plm_probability': 0.25, 'plm_max_span_length': 5, 'plm_mask_input': False, 'plm_permute_all': False, 'rtd': False, 'rtd_sample_from_batch': False, 'rtd_use_batch_interaction': False, 'rtd_discriminator_loss_weight': 50, 'rtd_generator_loss_weight': 1, 'rtd_tied_generator': False, 'd_model': 192, 
'n_layer': 3, 'n_head': 16, 'layer_norm_eps': 1e-12, 'initializer_range': 0.02, 'hidden_act': 'gelu', 'dropout': 0.0, 'summary_type': 'last', 'num_hidden_groups': 1, 'inner_group_num': 1, 'eval_on_last_item_seq_only': True, 'train_on_last_item_seq_only': False, 'mf_constrained_embeddings': True, 'item_embedding_dim': 448, 'numeric_features_project_to_embedding_dim': 0, 'numeric_features_soft_one_hot_encoding_num_embeddings': 0, 'stochastic_shared_embeddings_replacement_prob': 0.1, 'softmax_temperature': 1.0, 'label_smoothing': 0.0, 'embedding_dim_from_cardinality_multiplier': 2.0, 'item_id_embeddings_init_std': 0.11, 'other_embeddings_init_std': 0.02, 'layer_norm_featurewise': True, 'attn_type': 'bi', 'input_dropout': 0.1, 'loss_type': 'cross_entropy', 'similarity_type': 'concat_mlp', 'inp_merge': 'mlp', 'learning_rate_warmup_steps': 0, 'avg_session_length': None, 'output_dir': './tmp/', 'overwrite_output_dir': True, 'do_train': True, 'do_eval': True, 'do_predict': False, 'prediction_loss_only': False, 'per_device_train_batch_size': 128, 'per_device_eval_batch_size': 128, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'learning_rate': 0.0006667377132554976, 'weight_decay': 3.910060265627374e-05, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 5.0, 'max_steps': -1, 'lr_scheduler_type': 'linear', 'warmup_ratio': 0.0, 'warmup_steps': 0, 'log_level': -1, 'log_level_replica': -1, 'log_on_each_node': True, 'logging_dir': './tmp/runs/Mar10_06-50-41_7dfa224f788e', 'logging_first_step': False, 'logging_steps': 500, 'logging_nan_inf_filter': True, 'save_steps': 0, 'save_total_limit': None, 'save_on_each_node': False, 'no_cuda': False, 'seed': 3, 'fp16': True, 'fp16_opt_level': 'O1', 'fp16_backend': 'auto', 'fp16_full_eval': False, 'local_rank': -1, 'xpu_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 
'dataloader_drop_last': True, 'eval_steps': None, 'dataloader_num_workers': 0, 'past_index': -1, 'run_name': None, 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'sharded_ddp': [], 'deepspeed': None, 'label_smoothing_factor': 0.0, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': [], 'ddp_find_unused_parameters': None, 'dataloader_pin_memory': True, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_token': None, 'gradient_checkpointing': False, 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': None, '_n_gpu': 1, 'mp_parameters': '', 'max_sequence_length': 20, 'shuffle_buffer_size': 0, 'data_loader_engine': 'merlin', 'eval_on_test_set': True, 'eval_steps_on_train_set': 20, 'predict_top_k': 0, 'learning_rate_num_cosine_cycles_by_epoch': 1.25, 'log_predictions': False, 'compute_metrics_each_n_steps': 1, 'experiments_group': 'default', 'session_seq_length_max': 20, 'learning_rate_schedule': 'linear_with_warmup', 'validate_every': -1}\n", + "[INFO|trainer.py:1196] 2023-03-10 06:50:44,786 >> ***** Running training *****\n", + "[INFO|trainer.py:1197] 2023-03-10 06:50:44,786 >> Num examples = 86528\n", + "[INFO|trainer.py:1198] 2023-03-10 06:50:44,786 >> Num Epochs = 5\n", + "[INFO|trainer.py:1199] 2023-03-10 06:50:44,786 >> Instantaneous batch size per device = 128\n", + "[INFO|trainer.py:1200] 2023-03-10 06:50:44,786 >> Total train batch size (w. 
parallel, distributed & accumulation) = 128\n", + "[INFO|trainer.py:1201] 2023-03-10 06:50:44,786 >> Gradient Accumulation steps = 1\n", + "[INFO|trainer.py:1202] 2023-03-10 06:50:44,786 >> Total optimization steps = 3380\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "DLL 2023-03-10 06:50:44.307714 - PARAMETER data_path : /transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/ features_schema_path : /workspace/examples/t4rec_paper_experiments/datasets_configs/ecom_rees46/rees46_schema.pbtxt start_time_window_index : 1 final_time_window_index : 2 time_window_folder_pad_digits : 4 no_incremental_training : False training_time_window_size : 0 use_side_information_features : False input_features_aggregation : concat model_type : xlnet tf_out_activation : tanh mlm : True mlm_probability : 0.30000000000000004 plm : False plm_probability : 0.25 plm_max_span_length : 5 plm_mask_input : False plm_permute_all : False rtd : False rtd_sample_from_batch : False rtd_use_batch_interaction : False rtd_discriminator_loss_weight : 50 rtd_generator_loss_weight : 1 rtd_tied_generator : False d_model : 192 n_layer : 3 n_head : 16 layer_norm_eps : 1e-12 initializer_range : 0.02 hidden_act : gelu dropout : 0.0 summary_type : last num_hidden_groups : 1 inner_group_num : 1 eval_on_last_item_seq_only : True train_on_last_item_seq_only : False mf_constrained_embeddings : True item_embedding_dim : 448 numeric_features_project_to_embedding_dim : 0 numeric_features_soft_one_hot_encoding_num_embeddings : 0 stochastic_shared_embeddings_replacement_prob : 0.1 softmax_temperature : 1.0 label_smoothing : 0.0 embedding_dim_from_cardinality_multiplier : 2.0 item_id_embeddings_init_std : 0.11 other_embeddings_init_std : 0.02 layer_norm_featurewise : True attn_type : bi input_dropout : 0.1 loss_type : cross_entropy similarity_type : concat_mlp inp_merge : mlp learning_rate_warmup_steps : 0 avg_session_length : None output_dir : ./tmp/ overwrite_output_dir : 
True do_train : True do_eval : True do_predict : False prediction_loss_only : False per_device_train_batch_size : 128 per_device_eval_batch_size : 128 per_gpu_train_batch_size : None per_gpu_eval_batch_size : None gradient_accumulation_steps : 1 eval_accumulation_steps : None learning_rate : 0.0006667377132554976 weight_decay : 3.910060265627374e-05 adam_beta1 : 0.9 adam_beta2 : 0.999 adam_epsilon : 1e-08 max_grad_norm : 1.0 num_train_epochs : 5.0 max_steps : -1 lr_scheduler_type : linear warmup_ratio : 0.0 warmup_steps : 0 log_level : -1 log_level_replica : -1 log_on_each_node : True logging_dir : ./tmp/runs/Mar10_06-50-41_7dfa224f788e logging_first_step : False logging_steps : 500 logging_nan_inf_filter : True save_steps : 0 save_total_limit : None save_on_each_node : False no_cuda : False seed : 3 fp16 : True fp16_opt_level : O1 fp16_backend : auto fp16_full_eval : False local_rank : -1 xpu_backend : None tpu_num_cores : None tpu_metrics_debug : False debug : [] dataloader_drop_last : True eval_steps : None dataloader_num_workers : 0 past_index : -1 run_name : None disable_tqdm : False remove_unused_columns : True label_names : None load_best_model_at_end : False metric_for_best_model : None greater_is_better : None ignore_data_skip : False sharded_ddp : [] deepspeed : None label_smoothing_factor : 0.0 adafactor : False group_by_length : False length_column_name : length report_to : [] ddp_find_unused_parameters : None dataloader_pin_memory : True skip_memory_metrics : True use_legacy_prediction_loop : False push_to_hub : False resume_from_checkpoint : None hub_model_id : None hub_token : None gradient_checkpointing : False push_to_hub_model_id : None push_to_hub_organization : None push_to_hub_token : None _n_gpu : 1 mp_parameters : max_sequence_length : 20 shuffle_buffer_size : 0 data_loader_engine : merlin eval_on_test_set : True eval_steps_on_train_set : 20 predict_top_k : 0 learning_rate_num_cosine_cycles_by_epoch : 1.25 log_predictions : False 
compute_metrics_each_n_steps : 1 experiments_group : default session_seq_length_max : 20 learning_rate_schedule : linear_with_warmup validate_every : -1 \n", + "\n", + "***** Launch training for day 1: *****\n", + "{'loss': 10.1517, 'learning_rate': 0.0005681078740165187, 'epoch': 0.74}\n", + "{'loss': 9.1378, 'learning_rate': 0.00046947803477753974, 'epoch': 1.48}\n", + "{'loss': 8.8854, 'learning_rate': 0.0003708481955385607, 'epoch': 2.22}\n", + "{'loss': 8.668, 'learning_rate': 0.0002722183562995818, 'epoch': 2.96}\n", + "{'loss': 8.4887, 'learning_rate': 0.0001735885170606029, 'epoch': 3.7}\n", + "{'loss': 8.3778, 'learning_rate': 7.4958677821624e-05, 'epoch': 4.44}\n", + "{'train_runtime': 267.4244, 'train_samples_per_second': 0.019, 'train_steps_per_second': 12.639, 'train_loss': 8.881638241378512, 'epoch': 5.0}\n", + "\n", + "***** Evaluation results for day 2 (train set):*****\n", + "\n", + "{'train_/next-item/ndcg_at_10': 0.08869679272174835, 'train_/next-item/ndcg_at_20': 0.10546129196882248, 'train_/next-item/recall_at_10': 0.15507812798023224, 'train_/next-item/recall_at_20': 0.22148437798023224, 'train_/loss': 7.946734428405762, 'train_runtime': 0.6445, 'train_samples_per_second': 3972.268, 'train_steps_per_second': 31.033}\n", + "\n", + "***** Evaluation results for day 2 (eval set):*****\n", + "\n", + "{'eval_/next-item/ndcg_at_10': 0.08494117110967636, 'eval_/next-item/ndcg_at_20': 0.10106566548347473, 'eval_/next-item/recall_at_10': 0.15568523108959198, 'eval_/next-item/recall_at_20': 0.21940888464450836, 'eval_/loss': 8.320558547973633, 'eval_runtime': 2.2146, 'eval_samples_per_second': 4797.31, 'eval_steps_per_second': 37.479}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Task exception was never retrieved\n", + "future: ._handle_stream() done, defined at /usr/local/lib/python3.8/dist-packages/IPython/core/magics/script.py:211> exception=ValueError('Separator is not found, and chunk exceed the limit')>\n", + 
"Traceback (most recent call last):\n", + " File \"/usr/lib/python3.8/asyncio/streams.py\", line 540, in readline\n", + " line = await self.readuntil(sep)\n", + " File \"/usr/lib/python3.8/asyncio/streams.py\", line 618, in readuntil\n", + " raise exceptions.LimitOverrunError(\n", + "asyncio.exceptions.LimitOverrunError: Separator is not found, and chunk exceed the limit\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/usr/local/lib/python3.8/dist-packages/IPython/core/magics/script.py\", line 213, in _handle_stream\n", + " line = (await stream.readline()).decode(\"utf8\")\n", + " File \"/usr/lib/python3.8/asyncio/streams.py\", line 549, in readline\n", + " raise ValueError(e.args[0])\n", + "ValueError: Separator is not found, and chunk exceed the limit\n" + ] + } + ], + "source": [ + "%%bash\n", + "\n", + "FEATURE_SCHEMA_PATH=/workspace/examples/t4rec_paper_experiments/datasets_configs/ecom_rees46/rees46_schema.pbtxt\n", + "DATA_PATH=/transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/\n", + "NUM_EPOCHS=5\n", + "SEED=3\n", + "\n", + "python3 ../transf_exp_main_modified.py --output_dir ./tmp/ --overwrite_output_dir --do_train --do_eval --save_steps 0 --data_path $DATA_PATH --features_schema_path $FEATURE_SCHEMA_PATH --fp16 --data_loader_engine merlin --start_time_window_index 1 --final_time_window_index 2 --time_window_folder_pad_digits 4 --model_type xlnet --loss_type cross_entropy --per_device_eval_batch_size 128 --similarity_type concat_mlp --tf_out_activation tanh --inp_merge mlp --learning_rate_warmup_steps 0 --learning_rate_schedule linear_with_warmup --hidden_act gelu --num_train_epochs $NUM_EPOCHS --dataloader_drop_last --compute_metrics_each_n_steps 1 --session_seq_length_max 20 --eval_on_last_item_seq_only --mf_constrained_embeddings --layer_norm_featurewise --attn_type bi --mlm --per_device_train_batch_size 128 --learning_rate 
0.0006667377132554976 --dropout 0.0 --input_dropout 0.1 --weight_decay 3.910060265627374e-05 --d_model 192 --item_embedding_dim 448 --n_layer 3 --n_head 16 --label_smoothing 0.0 --stochastic_shared_embeddings_replacement_prob 0.1 --item_id_embeddings_init_std 0.11 --other_embeddings_init_std 0.02 --mlm_probability 0.30000000000000004 --eval_on_test_set --seed $SEED --report_to none" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/transf_exp_main_modified.py b/transf_exp_main_modified.py new file mode 100644 index 0000000000..08b1859457 --- /dev/null +++ b/transf_exp_main_modified.py @@ -0,0 +1,480 @@ +# +# Copyright (c) 2021, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
#

import glob
import logging
import os
from functools import partial

import numpy as np
import pandas as pd
import torch
import transformers
from examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs import (
    config_dllogger,
    creates_output_dir,
    log_aot_metric_results,
    log_metric_results,
    log_parameters,
)
from merlin.io import Dataset
from examples.t4rec_paper_experiments.t4r_paper_repro.transf_exp_args import (
    DataArguments,
    ModelArguments,
    TrainingArguments,
)
from transformers import HfArgumentParser, set_seed
from transformers.trainer_utils import is_main_process

import transformers4rec.torch as t4r
from merlin_standard_lib import Schema, Tag
from transformers4rec.torch import Trainer
from transformers4rec.torch.utils.data_utils import MerlinDataLoader
from transformers4rec.torch.utils.examples_utils import wipe_memory

logger = logging.getLogger(__name__)


def main():
    """Train a Transformers4Rec next-item prediction model incrementally over
    time windows, evaluate it, simulate inference on the last window, and
    export the model as a Triton PyTorch ensemble for benchmarking.

    Side effects:
    - Pickles the parsed argument dataclasses to the current working directory
      (used later by the benchmarking notebook to rebuild the model).
    - Writes evaluation CSV/TXT results under ``training_args.output_dir``.
    - Exports the model + NVTabular workflow to ``/workspace/models_for_benchmarking/``.
    """
    # Parse command line arguments into the three dataclasses.
    (data_args, model_args, training_args) = parse_command_line_args()
    # Persist args so the benchmarking notebook can reconstruct the exact
    # same model configuration later.
    # NOTE(review): filenames say "with_side_features" regardless of whether
    # --use_side_information_features was set — confirm this is intended.
    pd.to_pickle(data_args, 'data_args_with_side_features.pkl')
    pd.to_pickle(model_args, 'model_args_with_side_features.pkl')
    pd.to_pickle(training_args, 'training_args_with_side_features.pkl')

    # Ensuring to set W&B run name to null, so that a nice run name is generated
    training_args.run_name = None

    # Loading the schema of the dataset; optionally restrict it to the
    # item-id column only when side features are disabled.
    schema = Schema().from_proto_text(data_args.features_schema_path)
    if not data_args.use_side_information_features:
        schema = schema.select_by_tag(Tag.ITEM_ID)

    item_id_col = schema.select_by_tag(Tag.ITEM_ID).column_names[0]
    col_names = schema.column_names
    logger.info("Column names: {}".format(col_names))

    creates_output_dir(training_args)
    config_logging(training_args)
    set_seed(training_args.seed)

    # Masking config (mlm/plm/rtd/clm) derived from model args.
    masking_kwargs = get_masking_kwargs(model_args)

    # Pre-aggregation transforms: Stochastic Shared Embeddings (swap noise).
    pre_transforms = []
    if model_args.stochastic_shared_embeddings_replacement_prob > 0:
        pre_transforms.append(
            t4r.StochasticSwapNoise(
                pad_token=0,
                replacement_prob=model_args.stochastic_shared_embeddings_replacement_prob,
                schema=schema,
            )
        )

    # Post-aggregation transforms: input dropout and feature-wise layer norm.
    post_transforms = []
    if model_args.input_dropout > 0:
        input_dropout = t4r.TabularDropout(dropout_rate=model_args.input_dropout)
        post_transforms.append(input_dropout)
    if model_args.layer_norm_featurewise:
        post_transforms.append("layer-norm")

    # Categorical embedding sizes: item id gets an explicit dim; other
    # columns are inferred from cardinality unless an element-wise
    # aggregation forces all embeddings to the same (item) dim.
    embedding_dims = {item_id_col: model_args.item_embedding_dim}
    embedding_dim_default = model_args.item_embedding_dim
    infer_embedding_sizes = not model_args.input_features_aggregation.startswith("element-wise")

    # Per-column normal initializers: item id uses a dedicated std.
    embeddings_initializers = {}
    for col in col_names:
        if col == item_id_col:
            std = model_args.item_id_embeddings_init_std
        else:
            std = model_args.other_embeddings_init_std
        embeddings_initializers[col] = partial(torch.nn.init.normal_, mean=0.0, std=std)

    # Input module: processes tabular sequence features and prepares masked
    # inputs for the chosen language-modeling task.
    input_module = t4r.TabularSequenceFeatures.from_schema(
        schema,
        max_sequence_length=training_args.max_sequence_length,
        aggregation=model_args.input_features_aggregation,
        d_output=model_args.d_model,
        pre=pre_transforms,
        post=post_transforms,
        # Embedding Features args
        embedding_dims=embedding_dims,
        embedding_dim_default=embedding_dim_default,
        infer_embedding_sizes=infer_embedding_sizes,
        infer_embedding_sizes_multiplier=model_args.embedding_dim_from_cardinality_multiplier,
        embeddings_initializers=embeddings_initializers,
        continuous_soft_embeddings=(
            model_args.numeric_features_soft_one_hot_encoding_num_embeddings > 0
        ),
        soft_embedding_cardinality_default=(
            model_args.numeric_features_soft_one_hot_encoding_num_embeddings
        ),
        soft_embedding_dim_default=model_args.numeric_features_project_to_embedding_dim,
        **masking_kwargs,
    )

    # Loss function: Cross-entropy with label smoothing
    label_smoothing_xe_loss = t4r.LabelSmoothCrossEntropyLoss(
        reduction="mean", smoothing=model_args.label_smoothing
    )

    # Metrics: NDCG@10, NDCG@20, Recall@10, Recall@20
    metrics = [
        t4r.ranking_metric.NDCGAt(top_ks=[10, 20], labels_onehot=True),
        t4r.ranking_metric.RecallAt(top_ks=[10, 20], labels_onehot=True),
    ]

    # Next-item prediction task (optionally with tied weights).
    prediction_task = t4r.NextItemPredictionTask(
        weight_tying=model_args.mf_constrained_embeddings,
        softmax_temperature=model_args.softmax_temperature,
        metrics=metrics,
        loss=label_smoothing_xe_loss,
    )

    model_config = get_model_config(training_args, model_args)

    # Generates the final PyTorch model
    model = model_config.to_torch_model(input_module, prediction_task)

    trainer = Trainer(
        model=model,
        args=training_args,
        schema=schema,
        compute_metrics=True,
        incremental_logging=True,
    )

    log_parameters(trainer, data_args, model_args, training_args)

    results_over_time = incremental_train_eval(
        trainer,
        start_time_index=data_args.start_time_window_index,
        end_time_index=data_args.final_time_window_index,
        input_dir=data_args.data_path,
        training_args=training_args,
        data_args=data_args,
    )

    if training_args.do_eval:
        logger.info("Computing and logging AOT (Average Over Time) metrics")
        results_df = pd.DataFrame.from_dict(results_over_time, orient="index")
        results_df.reset_index().to_csv(
            os.path.join(training_args.output_dir, "eval_train_results.csv"),
            index=False,
        )

        # Average each metric over all time windows.
        results_avg_time = dict(results_df.mean())
        results_avg_time = {f"{k}_AOT": v for k, v in results_avg_time.items()}
        # Logging to W&B / Tensorboard
        trainer.log(results_avg_time)

        log_aot_metric_results(training_args.output_dir, results_avg_time)

        # Mimic the inference by manually computing recall@10 using the
        # evaluation data of the last time-index.
        eval_path = os.path.join(
            data_args.data_path,
            str(
                data_args.final_time_window_index,
            ).zfill(data_args.time_window_folder_pad_digits),
            "test.parquet" if training_args.eval_on_test_set else "valid.parquet",
        )
        prediction_data = pd.read_parquet(eval_path)
        # Label: the last item id of each session sequence.
        labels = prediction_data["sess_pid_seq"].apply(lambda x: x[-1]).values

        # Truncate input sequences up to last item - 1 to mimic the inference
        def mask_last_interaction(x):
            return list(x[:-1])

        list_columns = schema.select_by_tag("list").column_names
        for col in list_columns:
            prediction_data[col] = prediction_data[col].apply(mask_last_interaction)
        # Get top-10 predictions
        test_loader = MerlinDataLoader.from_schema(
            schema,
            Dataset(prediction_data),
            training_args.per_device_eval_batch_size,
            max_sequence_length=training_args.max_sequence_length,
            shuffle=False,
        )
        trainer.test_dataloader = test_loader
        trainer.args.predict_top_k = 10
        topk_preds = trainer.predict(test_loader).predictions[0]
        # Compute recall@10
        recall_10 = recall(topk_preds, labels)

        logger.info(f"Recall@10 of manually masked test data = {str(recall_10)}")
        output_file = os.path.join(training_args.output_dir, "eval_results_over_time.txt")
        with open(output_file, "a") as writer:
            writer.write(f"\n***** Recall@10 of simulated inference = {recall_10} *****\n")
        # Verify that the recall@10 from trainer.evaluate() matches the
        # recall@10 calculated manually (within 10% relative tolerance).
        if not isinstance(input_module.masking, t4r.masking.PermutationLanguageModeling):
            # TODO fix inference discrepancy for permutation language modeling
            assert np.isclose(
                recall_10, results_over_time[2]["eval_/next-item/recall_at_10"], rtol=0.1
            )

        # Export model + preprocessing workflow as a Triton ensemble for
        # benchmarking. Imported lazily because nvtabular is only needed here.
        import nvtabular as nvt
        from merlin.schema.tags import Tags
        from nvtabular.inference.triton import export_pytorch_ensemble

        os.makedirs('/workspace/models_for_benchmarking/', exist_ok=True)
        ds = nvt.Dataset(prediction_data)
        sess_pid_seq = ['sess_pid_seq'] >> nvt.ops.AddMetadata(tags=[Tags.CATEGORICAL])
        wf = nvt.Workflow(sess_pid_seq)
        wf.fit(ds)
        # Property access presumably forces dtype inference on the fitted
        # workflow before export — TODO confirm it is required.
        wf.output_dtypes

        export_pytorch_ensemble(
            model,
            wf,
            sparse_max=trainer.get_train_dataloader().dataset.sparse_max,
            name="t4r_pytorch",
            model_path="/workspace/models_for_benchmarking/",
            label_columns=[],
        )


def recall(predicted_items: np.ndarray, real_items: np.ndarray) -> float:
    """Compute recall@k between top-k predicted item ids and label item ids.

    Parameters
    ----------
    predicted_items : np.ndarray
        Array of shape ``(batch_size, top_k)`` with predicted item ids.
    real_items : np.ndarray
        Array of label item ids; id 0 is treated as padding and those rows
        are excluded from the average.

    Returns
    -------
    float
        Mean recall over the valid (non-padded) rows.
    """
    bs, top_k = predicted_items.shape
    valid_rows = real_items != 0

    # reshape predictions and labels to compare
    # the top-k predicted item-ids with the label id.
    real_items = real_items.reshape(bs, 1, -1)
    predicted_items = predicted_items.reshape(bs, 1, top_k)

    num_relevant = real_items.shape[-1]
    predicted_correct_sum = (predicted_items == real_items).sum(-1)
    predicted_correct_sum = predicted_correct_sum[valid_rows]
    recall_per_row = predicted_correct_sum / num_relevant
    return np.mean(recall_per_row)


def incremental_train_eval(
    trainer, start_time_index, end_time_index, input_dir, training_args, data_args
):
    """
    Performs incremental training and evaluation.
    Iteratively train using data of a given window index and evaluate on the
    validation data of the following index.
    Parameters
    ----------
    start_time_index: int
        The start index for training, it should match the partitions of the data directory
    end_time_index: int
        The end index for training, it should match the partitions of the data directory
    input_dir: str
        The input directory where the parquet files were saved based on partition column
    Returns
    -------
    results_over_time: dict
        The average over time of ranking metrics, keyed by evaluation time index.
    """
    results_over_time = {}
    for time_index in range(start_time_index, end_time_index):
        # 1. Set data: train on window `time_index`, evaluate on `time_index + 1`.
        time_index_train = time_index
        time_index_eval = time_index + 1
        train_paths = glob.glob(
            os.path.join(
                input_dir,
                str(time_index_train).zfill(data_args.time_window_folder_pad_digits),
                "train.parquet",
            )
        )
        eval_paths = glob.glob(
            os.path.join(
                input_dir,
                str(time_index_eval).zfill(data_args.time_window_folder_pad_digits),
                "test.parquet" if training_args.eval_on_test_set else "valid.parquet",
            )
        )

        # 2. Train on train data of time_index
        if training_args.do_train:
            print("\n***** Launch training for day %s: *****" % time_index)
            trainer.train_dataset_or_path = train_paths
            # Restart the LR schedule for each incremental window.
            trainer.reset_lr_scheduler()
            trainer.train()

        if training_args.do_eval:

            # 3. Evaluate on train data of time_index
            trainer.eval_dataset_or_path = train_paths
            train_metrics = trainer.evaluate(metric_key_prefix="train")
            print("\n***** Evaluation results for day %s (train set):*****\n" % time_index_eval)
            print(train_metrics)

            log_metric_results(
                training_args.output_dir,
                train_metrics,
                prefix="train",
                time_index=time_index_eval,
            )

            # free GPU for next day training
            wipe_memory()

            # 4. Evaluate on valid/test data of time_index+1
            trainer.eval_dataset_or_path = eval_paths
            eval_metrics = trainer.evaluate(metric_key_prefix="eval")
            print("\n***** Evaluation results for day %s (eval set):*****\n" % time_index_eval)
            print(eval_metrics)

            log_metric_results(
                training_args.output_dir,
                eval_metrics,
                prefix="eval",
                time_index=time_index_eval,
            )

            # free GPU for next day training
            wipe_memory()

            results_over_time[time_index_eval] = {
                **eval_metrics,
                **train_metrics,
            }

    return results_over_time


def get_masking_kwargs(model_args):
    """Return the masking kwargs for TabularSequenceFeatures.from_schema().

    Precedence: plm > rtd > mlm > clm (the first enabled flag wins).
    """
    kwargs = {}
    if model_args.plm:
        kwargs = {
            "masking": "plm",
            "plm_probability": model_args.plm_probability,
            "max_span_length": model_args.plm_max_span_length,
            "permute_all": model_args.plm_permute_all,
        }
    elif model_args.rtd:
        kwargs = {
            "masking": "rtd",
            "sample_from_batch": model_args.rtd_sample_from_batch,
            # rtd_use_batch_interaction=?
            # rtd_discriminator_loss_weight=?
            # rtd_generator_loss_weight=?
            # rtd_tied_generator=?
        }
    elif model_args.mlm:
        kwargs = {"masking": "mlm", "mlm_probability": model_args.mlm_probability}
    else:
        kwargs = {"masking": "clm"}

    return kwargs


def get_model_config(training_args, model_args):
    """Build the transformer config matching ``model_args.model_type``.

    Raises
    ------
    ValueError
        If ``model_args.model_type`` is not one of the supported types.
        (Previously an unknown type surfaced as an opaque UnboundLocalError.)
    """
    kwargs = {}

    if model_args.model_type == "gpt2":
        model_build_fn = t4r.GPT2Config.build
    elif model_args.model_type == "xlnet":
        model_build_fn = t4r.XLNetConfig.build
        kwargs = {
            "summary_type": model_args.summary_type,
            "attn_type": model_args.attn_type,
        }
    elif model_args.model_type == "electra":
        model_build_fn = t4r.ElectraConfig.build
    elif model_args.model_type == "albert":
        model_build_fn = t4r.AlbertConfig.build
        # -1 means "one group per layer" (no parameter sharing across groups).
        num_hidden_groups = model_args.num_hidden_groups
        if model_args.num_hidden_groups == -1:
            num_hidden_groups = model_args.n_layer
        kwargs = {
            "num_hidden_groups": num_hidden_groups,
            "inner_group_num": model_args.inner_group_num,
        }
    elif model_args.model_type == "transfoxl":
        model_build_fn = t4r.TransfoXLConfig.build
    else:
        raise ValueError(
            "Unsupported model_type: {!r}. Expected one of: "
            "gpt2, xlnet, electra, albert, transfoxl".format(model_args.model_type)
        )

    model_config = model_build_fn(
        total_seq_length=training_args.max_sequence_length,
        d_model=model_args.d_model,
        n_head=model_args.n_head,
        n_layer=model_args.n_layer,
        hidden_act=model_args.hidden_act,
        initializer_range=model_args.initializer_range,
        layer_norm_eps=model_args.layer_norm_eps,
        dropout=model_args.dropout,
        pad_token=0,
        **kwargs,
    )

    return model_config


def parse_command_line_args():
    """Parse CLI args into (DataArguments, ModelArguments, TrainingArguments),
    adapting legacy paper-reproducibility argument names to the new ones.
    """
    parser = HfArgumentParser((DataArguments, ModelArguments, TrainingArguments))
    (
        data_args,
        model_args,
        training_args,
    ) = parser.parse_args_into_dataclasses()

    # Adapting arguments used in the original paper reproducibility script to the new ones
    if training_args.session_seq_length_max:
        training_args.max_sequence_length = training_args.session_seq_length_max

    if training_args.learning_rate_schedule:
        training_args.lr_scheduler_type = training_args.learning_rate_schedule.replace(
            "_with_warmup", ""
        )

    if model_args.input_features_aggregation == "elementwise_sum_multiply_item_embedding":
        model_args.input_features_aggregation = "element-wise-sum-item-multi"

    return data_args, model_args, training_args


def config_logging(training_args):
    """Configure python/transformers logging and DLLogger for this run.

    Only the main process (local_rank in {-1, 0}) logs at INFO level; replicas
    log at WARN to avoid duplicated output in distributed runs.
    """
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )

    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()

    config_dllogger(training_args.output_dir)


if __name__ == "__main__":
    main()