diff --git a/benchmarking.ipynb b/benchmarking.ipynb new file mode 100644 index 0000000000..ecea9eab7d --- /dev/null +++ b/benchmarking.ipynb @@ -0,0 +1,532 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "6e128361", + "metadata": {}, + "source": [ + "In this notebook, we pick up a model trained using `train_and_save_models_for_benchmarking.ipynb` and stored on google drive to perform inference and benchmark performance." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "ce1d7155", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: wandb in /usr/local/lib/python3.8/dist-packages (from -r requirements.txt (line 1)) (0.13.7)\n", + "Requirement already satisfied: pandas in /usr/local/lib/python3.8/dist-packages (from -r requirements.txt (line 2)) (1.3.5)\n", + "Requirement already satisfied: nvidia-pyindex in /usr/local/lib/python3.8/dist-packages (from -r requirements.txt (line 3)) (1.0.9)\n", + "Requirement already satisfied: dllogger from git+https://github.com/NVIDIA/dllogger#egg=dllogger in /usr/local/lib/python3.8/dist-packages (from -r requirements.txt (line 4)) (1.0.0)\n", + "Requirement already satisfied: pathtools in /usr/local/lib/python3.8/dist-packages (from wandb->-r requirements.txt (line 1)) (0.1.2)\n", + "Requirement already satisfied: promise<3,>=2.0 in /usr/local/lib/python3.8/dist-packages (from wandb->-r requirements.txt (line 1)) (2.3)\n", + "Requirement already satisfied: docker-pycreds>=0.4.0 in /usr/local/lib/python3.8/dist-packages (from wandb->-r requirements.txt (line 1)) (0.4.0)\n", + "Requirement already satisfied: shortuuid>=0.5.0 in /usr/local/lib/python3.8/dist-packages (from wandb->-r requirements.txt (line 1)) (1.0.11)\n", + "Requirement already satisfied: psutil>=5.0.0 in /usr/local/lib/python3.8/dist-packages (from wandb->-r requirements.txt (line 1)) (5.9.4)\n", + "Requirement already satisfied: protobuf!=4.21.0,<5,>=3.12.0; 
python_version < \"3.9\" and sys_platform == \"linux\" in /usr/local/lib/python3.8/dist-packages (from wandb->-r requirements.txt (line 1)) (3.20.3)\n", + "Requirement already satisfied: PyYAML in /usr/local/lib/python3.8/dist-packages (from wandb->-r requirements.txt (line 1)) (6.0)\n", + "Requirement already satisfied: requests<3,>=2.0.0 in /usr/local/lib/python3.8/dist-packages (from wandb->-r requirements.txt (line 1)) (2.28.1)\n", + "Requirement already satisfied: setproctitle in /usr/local/lib/python3.8/dist-packages (from wandb->-r requirements.txt (line 1)) (1.3.2)\n", + "Requirement already satisfied: GitPython>=1.0.0 in /usr/local/lib/python3.8/dist-packages (from wandb->-r requirements.txt (line 1)) (3.1.30)\n", + "Requirement already satisfied: sentry-sdk>=1.0.0 in /usr/local/lib/python3.8/dist-packages (from wandb->-r requirements.txt (line 1)) (1.12.1)\n", + "Requirement already satisfied: setuptools in /usr/lib/python3/dist-packages (from wandb->-r requirements.txt (line 1)) (45.2.0)\n", + "Requirement already satisfied: Click!=8.0.0,>=7.0 in /usr/local/lib/python3.8/dist-packages (from wandb->-r requirements.txt (line 1)) (8.1.3)\n", + "Requirement already satisfied: numpy>=1.17.3; platform_machine != \"aarch64\" and platform_machine != \"arm64\" and python_version < \"3.10\" in /usr/local/lib/python3.8/dist-packages (from pandas->-r requirements.txt (line 2)) (1.22.4)\n", + "Requirement already satisfied: pytz>=2017.3 in /usr/local/lib/python3.8/dist-packages (from pandas->-r requirements.txt (line 2)) (2022.7)\n", + "Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.8/dist-packages (from pandas->-r requirements.txt (line 2)) (2.8.2)\n", + "Requirement already satisfied: six in /usr/lib/python3/dist-packages (from promise<3,>=2.0->wandb->-r requirements.txt (line 1)) (1.14.0)\n", + "Requirement already satisfied: charset-normalizer<3,>=2 in /usr/local/lib/python3.8/dist-packages (from requests<3,>=2.0.0->wandb->-r 
requirements.txt (line 1)) (2.1.1)\n", + "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.8/dist-packages (from requests<3,>=2.0.0->wandb->-r requirements.txt (line 1)) (1.26.13)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/lib/python3/dist-packages (from requests<3,>=2.0.0->wandb->-r requirements.txt (line 1)) (2.8)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/lib/python3/dist-packages (from requests<3,>=2.0.0->wandb->-r requirements.txt (line 1)) (2019.11.28)\n", + "Requirement already satisfied: gitdb<5,>=4.0.1 in /usr/local/lib/python3.8/dist-packages (from GitPython>=1.0.0->wandb->-r requirements.txt (line 1)) (4.0.10)\n", + "Requirement already satisfied: smmap<6,>=3.0.1 in /usr/local/lib/python3.8/dist-packages (from gitdb<5,>=4.0.1->GitPython>=1.0.0->wandb->-r requirements.txt (line 1)) (5.0.0)\n", + "Requirement already satisfied: gdown in /usr/local/lib/python3.8/dist-packages (4.6.3)\n", + "Requirement already satisfied: tqdm in /usr/local/lib/python3.8/dist-packages (from gdown) (4.64.1)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.8/dist-packages (from gdown) (3.9.0)\n", + "Requirement already satisfied: beautifulsoup4 in /usr/local/lib/python3.8/dist-packages (from gdown) (4.11.1)\n", + "Requirement already satisfied: six in /usr/lib/python3/dist-packages (from gdown) (1.14.0)\n", + "Requirement already satisfied: requests[socks] in /usr/local/lib/python3.8/dist-packages (from gdown) (2.28.1)\n", + "Requirement already satisfied: soupsieve>1.2 in /usr/local/lib/python3.8/dist-packages (from beautifulsoup4->gdown) (2.3.2.post1)\n", + "Requirement already satisfied: charset-normalizer<3,>=2 in /usr/local/lib/python3.8/dist-packages (from requests[socks]->gdown) (2.1.1)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/lib/python3/dist-packages (from requests[socks]->gdown) (2.8)\n", + "Requirement already satisfied: certifi>=2017.4.17 in 
/usr/lib/python3/dist-packages (from requests[socks]->gdown) (2019.11.28)\n", + "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.8/dist-packages (from requests[socks]->gdown) (1.26.13)\n", + "Requirement already satisfied: PySocks!=1.5.7,>=1.5.6; extra == \"socks\" in /usr/local/lib/python3.8/dist-packages (from requests[socks]->gdown) (1.7.1)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Downloading...\n", + "From: https://drive.google.com/uc?id=1NCFZ5ya3zyxPsrmupEoc9UEm4sslAddV\n", + "To: /workspace/examples/t4rec_paper_experiments/t4r_paper_repro/rees46_ecom_dataset_small_for_ci.zip\n", + "100%|██████████| 43.4M/43.4M [00:06<00:00, 6.42MB/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Hit:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64 InRelease\n", + "Hit:2 http://archive.ubuntu.com/ubuntu focal InRelease\n", + "Hit:3 http://security.ubuntu.com/ubuntu focal-security InRelease\n", + "Hit:4 http://archive.ubuntu.com/ubuntu focal-updates InRelease\n", + "Hit:5 http://archive.ubuntu.com/ubuntu focal-backports InRelease\n", + "Reading package lists...\n", + "Reading package lists...\n", + "Building dependency tree...\n", + "Reading state information...\n", + "unzip is already the newest version (6.0-25ubuntu1.1).\n", + "0 upgraded, 0 newly installed, 0 to remove and 74 not upgraded.\n", + "Archive: rees46_ecom_dataset_small_for_ci.zip\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "replace /transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/0001/valid.parquet? [y]es, [n]o, [A]ll, [N]one, [r]ename: error: invalid response [# gdown h]\n", + "replace /transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/0001/valid.parquet? 
[y]es, [n]o, [A]ll, [N]one, [r]ename: error: invalid response [ttps://dr]\n", + "replace /transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/0001/valid.parquet? [y]es, [n]o, [A]ll, [N]one, [r]ename: error: invalid response [ive.googl]\n", + "replace /transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/0001/valid.parquet? [y]es, [n]o, [A]ll, [N]one, [r]ename: error: invalid response [e.com/uc?]\n", + "replace /transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/0001/valid.parquet? [y]es, [n]o, [A]ll, [N]one, [r]ename: error: invalid response [id=18Ella]\n", + "replace /transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/0001/valid.parquet? [y]es, [n]o, [A]ll, [N]one, [r]ename: error: invalid response [Kaodqaesr]\n", + "replace /transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/0001/valid.parquet? [y]es, [n]o, [A]ll, [N]one, [r]ename: " + ] + } + ], + "source": [ + "%%bash\n", + "set -e\n", + "\n", + "#### Install requirements\n", + "cd examples/t4rec_paper_experiments\n", + "pip install -r requirements.txt\n", + "\n", + "### Get data\n", + "cd t4r_paper_repro\n", + "\n", + "FEATURE_SCHEMA_PATH=../datasets_configs/ecom_rees46/rees46_schema.pbtxt\n", + "pip install gdown\n", + "gdown https://drive.google.com/uc?id=1NCFZ5ya3zyxPsrmupEoc9UEm4sslAddV\n", + "apt-get update -y\n", + "apt-get install unzip -y\n", + "DATA_PATH=/transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/\n", + "unzip -d $DATA_PATH \"rees46_ecom_dataset_small_for_ci.zip\"\n", + "# gdown https://drive.google.com/uc?id=18EllaKaodqaesrNJ3YGEmv0YUD3NX0vK\n", + "# mkdir -p /transformers4rec/TF4Rec/models/\n", + "# MODEL_PATH=/transformers4rec/TF4Rec/models/\n", + "# unzip -d $MODEL_PATH \"model.zip\"\n", + "exit 0" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "513f52fc", + "metadata": {}, + "outputs": [], + "source": [ + "import glob\n", + "import logging\n", + "import os\n", + "from functools 
import partial\n", + "import pandas as pd\n", + "import cudf\n", + "import numpy as np\n", + "import nvtabular.inference.triton as nvt_triton\n", + "import tritonclient.grpc as grpcclient\n", + "import subprocess\n", + "import time" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "0abc674e", + "metadata": {}, + "outputs": [], + "source": [ + "!mkdir -p /workspace/examples/t4rec_paper_experiments/t4r_paper_repro" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "163eef53", + "metadata": {}, + "outputs": [], + "source": [ + "os.chdir('/workspace/examples/t4rec_paper_experiments/t4r_paper_repro')" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "f4071799", + "metadata": {}, + "outputs": [], + "source": [ + "eval_path = os.path.join(\n", + " '/transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/',\n", + " str(2,).zfill(4),\n", + " \"valid.parquet\",\n", + ")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "f2775430", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "I0220 02:43:34.847979 18298 pinned_memory_manager.cc:240] Pinned memory pool is created at '0x7f03d6000000' with size 268435456\n", + "I0220 02:43:34.848302 18298 cuda_memory_manager.cc:105] CUDA memory pool is created on device 0 with size 67108864\n", + "I0220 02:43:34.850169 18298 model_lifecycle.cc:459] loading: t4r_pytorch_pt:1\n", + "I0220 02:43:38.522804 18298 python_be.cc:1856] TRITONBACKEND_ModelInstanceInitialize: t4r_pytorch_pt (GPU device 0)\n" + ] + } + ], + "source": [ + "# load model trained locally using train_and_save_models_for_benchmarking.ipynb\n", + "\n", + "my_env = os.environ.copy()\n", + "\n", + "# # run on the CPU\n", + "# my_env[\"CUDA_VISIBLE_DEVICES\"] = ''\n", + "# my_env[\"HAS_GPU\"] = '0'\n", + "\n", + "# 
run on the GPU\n", + "my_env[\"HAS_GPU\"] = '1'\n", + "\n", + "subprocess.Popen(['tritonserver', '--model-repository=/workspace/models_for_benchmarking/'], env=my_env)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "39dfa60b", + "metadata": {}, + "outputs": [], + "source": [ + "# # load model downloaded from google drive\n", + "# subprocess.Popen(['tritonserver', '--model-repository=/transformers4rec/TF4Rec/models/'])" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "6b8f3a54", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "I0220 02:43:42.878213 18298 model_lifecycle.cc:694] successfully loaded 't4r_pytorch_pt' version 1\n", + "I0220 02:43:42.878340 18298 server.cc:563] \n", + "+------------------+------+\n", + "| Repository Agent | Path |\n", + "+------------------+------+\n", + "+------------------+------+\n", + "\n", + "I0220 02:43:42.878405 18298 server.cc:590] \n", + "+---------+-------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "| Backend | Path | Config |\n", + "+---------+-------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "| python | /opt/tritonserver/backends/python/libtriton_python.so | {\"cmdline\":{\"auto-complete-config\":\"true\",\"min-compute-capability\":\"6.000000\",\"backend-directory\":\"/opt/tritonserver/backends\",\"default-max-batch-size\":\"4\"}} |\n", + "+---------+-------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "\n", + "I0220 
02:43:42.878442 18298 server.cc:633] \n", + "+----------------+---------+--------+\n", + "| Model | Version | Status |\n", + "+----------------+---------+--------+\n", + "| t4r_pytorch_pt | 1 | READY |\n", + "+----------------+---------+--------+\n", + "\n", + "I0220 02:43:42.903695 18298 metrics.cc:864] Collecting metrics for GPU 0: Quadro RTX 8000\n", + "I0220 02:43:42.903932 18298 metrics.cc:757] Collecting CPU metrics\n", + "I0220 02:43:42.904063 18298 tritonserver.cc:2264] \n", + "+----------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "| Option | Value |\n", + "+----------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "| server_id | triton |\n", + "| server_version | 2.28.0 |\n", + "| server_extensions | classification sequence model_repository model_repository(unload_dependents) schedule_policy model_configuration system_shared_memory cuda_shared_memory binary_tensor_data statistics trace logging |\n", + "| model_repository_path[0] | /workspace/models_for_benchmarking/ |\n", + "| model_control_mode | MODE_NONE |\n", + "| strict_model_config | 0 |\n", + "| rate_limit | OFF |\n", + "| pinned_memory_pool_byte_size | 268435456 |\n", + "| cuda_memory_pool_byte_size{0} | 67108864 |\n", + "| response_cache_byte_size | 0 |\n", + "| min_supported_compute_capability | 6.0 |\n", + "| strict_readiness | 1 |\n", + "| exit_timeout | 30 |\n", + "+----------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "\n", + "I0220 
02:43:42.904966 18298 grpc_server.cc:4819] Started GRPCInferenceService at 0.0.0.0:8001\n", + "I0220 02:43:42.905121 18298 http_server.cc:3477] Started HTTPService at 0.0.0.0:8000\n", + "I0220 02:43:42.945837 18298 http_server.cc:184] Started Metrics Service at 0.0.0.0:8002\n" + ] + } + ], + "source": [ + "time.sleep(15)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "f2413171", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "client created.\n", + "GET /v2/health/live, headers None\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python3.8/dist-packages/tritonhttpclient/__init__.py:31: DeprecationWarning: The package `tritonhttpclient` is deprecated and will be removed in a future version. Please use instead `tritonclient.http`\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import tritonhttpclient\n", + "try:\n", + " triton_client = tritonhttpclient.InferenceServerClient(url=\"localhost:8000\", verbose=True)\n", + " print(\"client created.\")\n", + "except Exception as e:\n", + " print(\"channel creation failed: \" + str(e))\n", + "triton_client.is_server_live()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "1726cce3", + "metadata": {}, + "outputs": [], + "source": [ + "prediction_data = cudf.read_parquet(eval_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "7a21bdff", + "metadata": {}, + "outputs": [], + "source": [ + "col_names = ['sess_pid_seq']\n", + "inputs = nvt_triton.convert_df_to_triton_input(col_names, prediction_data.loc[6, col_names], grpcclient.InferInput)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "7cd5d6f9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.058879852294921875" + ] + }, + 
"execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import time\n", + "MODEL_NAME_PT = \"t4r_pytorch_pt\"\n", + "\n", + "N_TRIALS = 1000\n", + "\n", + "# WarmUp\n", + "for _ in range(N_TRIALS):\n", + " payload = cudf.DataFrame(data={'sess_pid_seq': np.random.randint(0, 390001, 20), 'id': 0}).groupby('id').agg({'sess_pid_seq': list})\n", + " with grpcclient.InferenceServerClient(\"localhost:8001\") as client:\n", + " col_names = ['sess_pid_seq']\n", + " inputs = nvt_triton.convert_df_to_triton_input(col_names, payload, grpcclient.InferInput)\n", + " response = client.infer(MODEL_NAME_PT, inputs)\n", + " end_time = time.time()\n", + "\n", + "\n", + "# Collecting\n", + "\n", + "out = []\n", + "for _ in range(N_TRIALS):\n", + " payload = cudf.DataFrame(data={'sess_pid_seq': np.random.randint(0, 390001, 20), 'id': 0}).groupby('id').agg({'sess_pid_seq': list})\n", + " start_time = time.time()\n", + " with grpcclient.InferenceServerClient(\"localhost:8001\") as client:\n", + " col_names = ['sess_pid_seq']\n", + " inputs = nvt_triton.convert_df_to_triton_input(col_names, payload, grpcclient.InferInput)\n", + " response = client.infer(MODEL_NAME_PT, inputs)\n", + " end_time = time.time()\n", + " out.append(end_time-start_time)\n", + " \n", + "# P95\n", + "np.sort(out)[int(0.95 * N_TRIALS)]" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "876aaf7a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.008340835571289062" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import time\n", + "MODEL_NAME_PT = \"t4r_pytorch_pt\"\n", + "\n", + "N_TRIALS = 1000\n", + "\n", + "# WarmUp\n", + "for _ in range(N_TRIALS):\n", + " payload = cudf.DataFrame(data={'sess_pid_seq': np.random.randint(0, 390001, 20), 'id': 0}).groupby('id').agg({'sess_pid_seq': list})\n", + " with grpcclient.InferenceServerClient(\"localhost:8001\") as 
client:\n", + " col_names = ['sess_pid_seq']\n", + " inputs = nvt_triton.convert_df_to_triton_input(col_names, payload, grpcclient.InferInput)\n", + " response = client.infer(MODEL_NAME_PT, inputs)\n", + " end_time = time.time()\n", + "\n", + "\n", + "# Collecting\n", + "\n", + "out = []\n", + "for _ in range(N_TRIALS):\n", + " payload = cudf.DataFrame(data={'sess_pid_seq': np.random.randint(0, 390001, 20), 'id': 0}).groupby('id').agg({'sess_pid_seq': list})\n", + " start_time = time.time()\n", + " with grpcclient.InferenceServerClient(\"localhost:8001\") as client:\n", + " col_names = ['sess_pid_seq']\n", + " inputs = nvt_triton.convert_df_to_triton_input(col_names, payload, grpcclient.InferInput)\n", + " response = client.infer(MODEL_NAME_PT, inputs)\n", + " end_time = time.time()\n", + " out.append(end_time-start_time)\n", + " \n", + "# P95\n", + "np.sort(out)[int(0.95 * N_TRIALS)]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e8b537ca", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "212fd93a", + "metadata": {}, + "outputs": [], + "source": [ + "# %%timeit\n", + "\n", + "# output_names = [\"output\"]\n", + "\n", + "# outputs = []\n", + "# for col in output_names:\n", + "# outputs.append(grpcclient.InferRequestedOutput(col))\n", + " \n", + "# MODEL_NAME_PT = \"t4r_pytorch_pt\"\n", + "# payload = cudf.DataFrame(data={'sess_pid_seq': np.random.randint(0, 390001, 20), 'id': 0}).groupby('id').agg({'sess_pid_seq': list})\n", + "\n", + "# with grpcclient.InferenceServerClient(\"localhost:8001\") as client:\n", + "# col_names = ['sess_pid_seq']\n", + "# inputs = nvt_triton.convert_df_to_triton_input(col_names, payload, grpcclient.InferInput)\n", + "# response = client.infer(MODEL_NAME_PT, inputs)" + ] + }, + { + "cell_type": "markdown", + "id": "dc821e3b", + "metadata": {}, + "source": [ + "Some additional information on how the benchmarking was run:\n", + "\n", + "I train 
and save the model using (modified) script for the T4Rec paper, all this is documented in the following notebook: https://github.com/NVIDIA-Merlin/Transformers4Rec/blob/add_benchmarking_scripts/train_and_save_models_for_benchmarking.ipynb\n", + "I generate data for benchmarking here: https://github.com/NVIDIA-Merlin/Transformers4Rec/blob/add_benchmarking_scripts/generate_randomized_input_for_benchmarking.ipynb\n", + "And I then start triton and load the model using code from this notebook: https://github.com/NVIDIA-Merlin/Transformers4Rec/blob/add_benchmarking_scripts/benchmarking.ipynb\n", + "When inferring on the CPU, you need to make sure you output the correct config.pbtxt, the modified model.py should handle the rest.\n", + "Also, from the model repository, I delete all the other models that are being output, I am only loading the T4Rec model.\n", + "I run perf_analyzer using this command: perf_analyzer -m t4r_pytorch_pt --shape sess_pid_seq__nnzs:2,1 --shape sess_pid_seq__values:20,1 --input-data input.json --concurrency-range 1:4:1" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/generate_randomized_input_for_benchmarking.ipynb b/generate_randomized_input_for_benchmarking.ipynb new file mode 100644 index 0000000000..163e58f991 --- /dev/null +++ b/generate_randomized_input_for_benchmarking.ipynb @@ -0,0 +1,164 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "486c0d66", + "metadata": {}, + "outputs": [], + "source": [ + "j = \"\"\"\n", + " { \n", + " \"data\" :\n", + " [\n", + " {\n", + " \"sess_pid_seq__values\" : 
[0,1,2,3,4,5,6,7,8,9,10,11,12,13,15,16,17,18,19],\n", + " \"sess_pid_seq__nnzs\" : [0,20]\n", + " }\n", + " ]\n", + " }\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "8d629daf", + "metadata": {}, + "outputs": [], + "source": [ + "import json" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "e95ca248", + "metadata": {}, + "outputs": [], + "source": [ + "d = json.loads(j)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "1f7836da", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'data': [{'sess_pid_seq__values': [0,\n", + " 1,\n", + " 2,\n", + " 3,\n", + " 4,\n", + " 5,\n", + " 6,\n", + " 7,\n", + " 8,\n", + " 9,\n", + " 10,\n", + " 11,\n", + " 12,\n", + " 13,\n", + " 15,\n", + " 16,\n", + " 17,\n", + " 18,\n", + " 19],\n", + " 'sess_pid_seq__nnzs': [0, 20]}]}" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "d" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "f4cbfe0d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'{\"data\": [{\"sess_pid_seq__values\": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18, 19], \"sess_pid_seq__nnzs\": [0, 20]}]}'" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "json.dumps(d)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "18cfd44c", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "353b82cb", + "metadata": {}, + "outputs": [], + "source": [ + "data_as_dict = {'data': []}\n", + "\n", + "for i in range(10_000):\n", + " payload = {}\n", + " payload['sess_pid_seq__values'] = np.random.randint(0, 390001, 20).tolist()\n", + " payload['sess_pid_seq__nnzs'] = [0, 20]\n", + " \n", + " data_as_dict['data'].append(payload)" + ] + }, + { + "cell_type": 
"code", + "execution_count": 22, + "id": "ef9c5f38", + "metadata": {}, + "outputs": [], + "source": [ + "with open('input.json', 'w') as f:\n", + " json.dump(data_as_dict, f)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/train_and_save_models_for_benchmarking.ipynb b/train_and_save_models_for_benchmarking.ipynb new file mode 100644 index 0000000000..73244ccba4 --- /dev/null +++ b/train_and_save_models_for_benchmarking.ipynb @@ -0,0 +1,1476 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "1571e258", + "metadata": {}, + "source": [ + "This notebook trains and exports a model for benchmarking using scripts provided by Gabriel (the ones that has been used for the T4Rec paper)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "9eccd5e8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: wandb in /usr/local/lib/python3.8/dist-packages (from -r requirements.txt (line 1)) (0.13.7)\n", + "Requirement already satisfied: pandas in /usr/local/lib/python3.8/dist-packages (from -r requirements.txt (line 2)) (1.3.5)\n", + "Requirement already satisfied: nvidia-pyindex in /usr/local/lib/python3.8/dist-packages (from -r requirements.txt (line 3)) (1.0.9)\n", + "Collecting dllogger\n", + " Cloning https://github.com/NVIDIA/dllogger to /tmp/pip-install-qkhp_f4u/dllogger\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " Running command git clone -q https://github.com/NVIDIA/dllogger /tmp/pip-install-qkhp_f4u/dllogger\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: shortuuid>=0.5.0 in /usr/local/lib/python3.8/dist-packages (from wandb->-r requirements.txt (line 1)) (1.0.11)\n", + "Requirement already satisfied: PyYAML in /usr/local/lib/python3.8/dist-packages (from wandb->-r requirements.txt (line 1)) (6.0)\n", + "Requirement already satisfied: setuptools in /usr/lib/python3/dist-packages (from wandb->-r requirements.txt (line 1)) (45.2.0)\n", + "Requirement already satisfied: protobuf!=4.21.0,<5,>=3.12.0; python_version < \"3.9\" and sys_platform == \"linux\" in /usr/local/lib/python3.8/dist-packages (from wandb->-r requirements.txt (line 1)) (3.20.3)\n", + "Requirement already satisfied: docker-pycreds>=0.4.0 in /usr/local/lib/python3.8/dist-packages (from wandb->-r requirements.txt (line 1)) (0.4.0)\n", + "Requirement already satisfied: Click!=8.0.0,>=7.0 in /usr/local/lib/python3.8/dist-packages (from wandb->-r requirements.txt (line 1)) (8.1.3)\n", + "Requirement already satisfied: requests<3,>=2.0.0 in /usr/local/lib/python3.8/dist-packages (from wandb->-r 
requirements.txt (line 1)) (2.28.1)\n", + "Requirement already satisfied: pathtools in /usr/local/lib/python3.8/dist-packages (from wandb->-r requirements.txt (line 1)) (0.1.2)\n", + "Requirement already satisfied: psutil>=5.0.0 in /usr/local/lib/python3.8/dist-packages (from wandb->-r requirements.txt (line 1)) (5.9.4)\n", + "Requirement already satisfied: setproctitle in /usr/local/lib/python3.8/dist-packages (from wandb->-r requirements.txt (line 1)) (1.3.2)\n", + "Requirement already satisfied: GitPython>=1.0.0 in /usr/local/lib/python3.8/dist-packages (from wandb->-r requirements.txt (line 1)) (3.1.30)\n", + "Requirement already satisfied: promise<3,>=2.0 in /usr/local/lib/python3.8/dist-packages (from wandb->-r requirements.txt (line 1)) (2.3)\n", + "Requirement already satisfied: sentry-sdk>=1.0.0 in /usr/local/lib/python3.8/dist-packages (from wandb->-r requirements.txt (line 1)) (1.12.1)\n", + "Requirement already satisfied: python-dateutil>=2.7.3 in /usr/local/lib/python3.8/dist-packages (from pandas->-r requirements.txt (line 2)) (2.8.2)\n", + "Requirement already satisfied: numpy>=1.17.3; platform_machine != \"aarch64\" and platform_machine != \"arm64\" and python_version < \"3.10\" in /usr/local/lib/python3.8/dist-packages (from pandas->-r requirements.txt (line 2)) (1.22.4)\n", + "Requirement already satisfied: pytz>=2017.3 in /usr/local/lib/python3.8/dist-packages (from pandas->-r requirements.txt (line 2)) (2022.7)\n", + "Requirement already satisfied: six>=1.4.0 in /usr/lib/python3/dist-packages (from docker-pycreds>=0.4.0->wandb->-r requirements.txt (line 1)) (1.14.0)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/lib/python3/dist-packages (from requests<3,>=2.0.0->wandb->-r requirements.txt (line 1)) (2.8)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/lib/python3/dist-packages (from requests<3,>=2.0.0->wandb->-r requirements.txt (line 1)) (2019.11.28)\n", + "Requirement already satisfied: charset-normalizer<3,>=2 in 
/usr/local/lib/python3.8/dist-packages (from requests<3,>=2.0.0->wandb->-r requirements.txt (line 1)) (2.1.1)\n", + "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.8/dist-packages (from requests<3,>=2.0.0->wandb->-r requirements.txt (line 1)) (1.26.13)\n", + "Requirement already satisfied: gitdb<5,>=4.0.1 in /usr/local/lib/python3.8/dist-packages (from GitPython>=1.0.0->wandb->-r requirements.txt (line 1)) (4.0.10)\n", + "Requirement already satisfied: smmap<6,>=3.0.1 in /usr/local/lib/python3.8/dist-packages (from gitdb<5,>=4.0.1->GitPython>=1.0.0->wandb->-r requirements.txt (line 1)) (5.0.0)\n", + "Building wheels for collected packages: dllogger\n", + " Building wheel for dllogger (setup.py): started\n", + " Building wheel for dllogger (setup.py): finished with status 'done'\n", + " Created wheel for dllogger: filename=DLLogger-1.0.0-py3-none-any.whl size=5656 sha256=571a7a3df2e72b3d0c50aa54bda34ce6ded9c2d77053a1ad5336ff9a7dd3dfea\n", + " Stored in directory: /tmp/pip-ephem-wheel-cache-0f1jgvf3/wheels/ad/94/cf/8f3396cb8d62d532695ec557e193fada55cd366e14fd9a02be\n", + "Successfully built dllogger\n", + "Installing collected packages: dllogger\n", + "Successfully installed dllogger-1.0.0\n", + "Collecting gdown\n", + " Downloading gdown-4.6.3-py3-none-any.whl (14 kB)\n", + "Requirement already satisfied: beautifulsoup4 in /usr/local/lib/python3.8/dist-packages (from gdown) (4.11.1)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.8/dist-packages (from gdown) (3.9.0)\n", + "Requirement already satisfied: six in /usr/lib/python3/dist-packages (from gdown) (1.14.0)\n", + "Requirement already satisfied: tqdm in /usr/local/lib/python3.8/dist-packages (from gdown) (4.64.1)\n", + "Requirement already satisfied: requests[socks] in /usr/local/lib/python3.8/dist-packages (from gdown) (2.28.1)\n", + "Requirement already satisfied: soupsieve>1.2 in /usr/local/lib/python3.8/dist-packages (from beautifulsoup4->gdown) 
(2.3.2.post1)\n", + "Requirement already satisfied: charset-normalizer<3,>=2 in /usr/local/lib/python3.8/dist-packages (from requests[socks]->gdown) (2.1.1)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/lib/python3/dist-packages (from requests[socks]->gdown) (2.8)\n", + "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.8/dist-packages (from requests[socks]->gdown) (1.26.13)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/lib/python3/dist-packages (from requests[socks]->gdown) (2019.11.28)\n", + "Collecting PySocks!=1.5.7,>=1.5.6; extra == \"socks\"\n", + " Downloading PySocks-1.7.1-py3-none-any.whl (16 kB)\n", + "Installing collected packages: gdown, PySocks\n", + "Successfully installed PySocks-1.7.1 gdown-4.6.3\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Downloading...\n", + "From: https://drive.google.com/uc?id=1NCFZ5ya3zyxPsrmupEoc9UEm4sslAddV\n", + "To: /workspace/examples/t4rec_paper_experiments/t4r_paper_repro/rees46_ecom_dataset_small_for_ci.zip\n", + "100%|██████████| 43.4M/43.4M [00:07<00:00, 6.18MB/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Get:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64 InRelease [1581 B]\n", + "Get:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64 Packages [871 kB]\n", + "Get:3 http://archive.ubuntu.com/ubuntu focal InRelease [265 kB]\n", + "Get:4 http://security.ubuntu.com/ubuntu focal-security InRelease [114 kB]\n", + "Get:5 http://security.ubuntu.com/ubuntu focal-security/main amd64 Packages [2496 kB]\n", + "Get:6 http://archive.ubuntu.com/ubuntu focal-updates InRelease [114 kB]\n", + "Get:7 http://archive.ubuntu.com/ubuntu focal-backports InRelease [108 kB]\n", + "Get:8 http://archive.ubuntu.com/ubuntu focal/restricted amd64 Packages [33.4 kB]\n", + "Get:9 http://archive.ubuntu.com/ubuntu focal/main amd64 Packages [1275 kB]\n", + "Get:10 
http://security.ubuntu.com/ubuntu focal-security/universe amd64 Packages [995 kB]\n", + "Get:11 http://security.ubuntu.com/ubuntu focal-security/multiverse amd64 Packages [28.5 kB]\n", + "Get:12 http://security.ubuntu.com/ubuntu focal-security/restricted amd64 Packages [1937 kB]\n", + "Get:13 http://archive.ubuntu.com/ubuntu focal/multiverse amd64 Packages [177 kB]\n", + "Get:14 http://archive.ubuntu.com/ubuntu focal/universe amd64 Packages [11.3 MB]\n", + "Get:15 http://archive.ubuntu.com/ubuntu focal-updates/restricted amd64 Packages [2066 kB]\n", + "Get:16 http://archive.ubuntu.com/ubuntu focal-updates/main amd64 Packages [2970 kB]\n", + "Get:17 http://archive.ubuntu.com/ubuntu focal-updates/universe amd64 Packages [1296 kB]\n", + "Get:18 http://archive.ubuntu.com/ubuntu focal-updates/multiverse amd64 Packages [31.2 kB]\n", + "Get:19 http://archive.ubuntu.com/ubuntu focal-backports/universe amd64 Packages [28.6 kB]\n", + "Get:20 http://archive.ubuntu.com/ubuntu focal-backports/main amd64 Packages [55.2 kB]\n", + "Fetched 26.2 MB in 10s (2536 kB/s)\n", + "Reading package lists...\n", + "Reading package lists...\n", + "Building dependency tree...\n", + "Reading state information...\n", + "unzip is already the newest version (6.0-25ubuntu1.1).\n", + "0 upgraded, 0 newly installed, 0 to remove and 74 not upgraded.\n", + "Archive: rees46_ecom_dataset_small_for_ci.zip\n", + " creating: /transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/0001/\n", + " inflating: /transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/0001/valid.parquet \n", + " extracting: /transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/0001/.zip \n", + " inflating: /transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/0001/train.parquet \n", + " inflating: /transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/0001/test.parquet \n", + " creating: /transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/0002/\n", + " 
inflating: /transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/0002/valid.parquet \n", + " inflating: /transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/0002/train.parquet \n", + " inflating: /transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/0002/test.parquet \n" + ] + } + ], + "source": [ + "%%bash\n", + "set -e\n", + "\n", + "#### Install requirements\n", + "cd examples/t4rec_paper_experiments\n", + "pip install -r requirements.txt\n", + "\n", + "### Get data\n", + "cd t4r_paper_repro\n", + "\n", + "FEATURE_SCHEMA_PATH=../datasets_configs/ecom_rees46/rees46_schema.pbtxt\n", + "pip install gdown\n", + "gdown https://drive.google.com/uc?id=1NCFZ5ya3zyxPsrmupEoc9UEm4sslAddV\n", + "apt-get update -y\n", + "apt-get install unzip -y\n", + "DATA_PATH=/transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/\n", + "unzip -d $DATA_PATH \"rees46_ecom_dataset_small_for_ci.zip\"\n", + "exit 0" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "f114837f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Overwriting /workspace/examples/t4rec_paper_experiments/datasets_configs/ecom_rees46/rees46_schema.pbtxt\n" + ] + } + ], + "source": [ + "%%writefile /workspace/examples/t4rec_paper_experiments/datasets_configs/ecom_rees46/rees46_schema.pbtxt\n", + "\n", + "feature {\n", + " name: \"sess_pid_seq\"\n", + " value_count {\n", + " min: 2\n", + " max: 20\n", + " }\n", + " type: INT\n", + " int_domain {\n", + " name: \"sess_pid_seq\"\n", + " min: 1\n", + " max: 390000\n", + " is_categorical: true\n", + " }\n", + " annotation {\n", + " tag: \"item_id\"\n", + " tag: \"list\"\n", + " tag: \"categorical\"\n", + " tag: \"item\"\n", + " }\n", + "}\n", + "\n", + "feature {\n", + " name: \"sess_ccid_seq\"\n", + " value_count {\n", + " min: 2\n", + " max: 20\n", + " }\n", + " type: INT\n", + " int_domain {\n", + " name: \"sess_ccid_seq\"\n", + " min: 1\n", + " max: 
150\n", + " is_categorical: true\n", + " }\n", + " annotation {\n", + " tag: \"list\"\n", + " tag: \"categorical\"\n", + " tag: \"item\"\n", + " }\n", + "}\n", + "\n", + "feature {\n", + " name: \"sess_csid_seq\"\n", + " value_count {\n", + " min: 2\n", + " max: 20\n", + " }\n", + " type: INT\n", + " int_domain {\n", + " name: \"sess_csid_seq\"\n", + " min: 1\n", + " max: 1400\n", + " is_categorical: true\n", + " }\n", + " annotation {\n", + " tag: \"list\"\n", + " tag: \"categorical\"\n", + " tag: \"item\"\n", + " }\n", + "}\n", + "\n", + "\n", + "feature {\n", + " name: \"sess_bid_seq\"\n", + " value_count {\n", + " min: 2\n", + " max: 20\n", + " }\n", + " type: INT\n", + " int_domain {\n", + " name: \"sess_bid_seq\"\n", + " min: 1\n", + " max: 7000\n", + " is_categorical: true\n", + " }\n", + " annotation {\n", + " tag: \"list\"\n", + " tag: \"categorical\"\n", + " tag: \"item\"\n", + " }\n", + "}\n", + "\n", + "feature {\n", + " name: \"sess_price_log_norm_seq\"\n", + " value_count {\n", + " min: 2\n", + " max: 20\n", + " }\n", + " type: FLOAT\n", + " float_domain {\n", + " name: \"sess_price_log_norm_seq\"\n", + " min: 0.0\n", + " max: 10000.0\n", + " }\n", + " annotation {\n", + " tag: \"item\"\n", + " tag: \"list\"\n", + " tag: \"continuous\"\n", + " }\n", + "}\n", + "\n", + "feature {\n", + " name: \"sess_relative_price_to_avg_category_seq\"\n", + " value_count {\n", + " min: 2\n", + " max: 20\n", + " }\n", + " type: FLOAT\n", + " float_domain {\n", + " name: \"sess_relative_price_to_avg_category_seq\"\n", + " min: -10000.0\n", + " max: 10000.0\n", + " }\n", + " annotation {\n", + " tag: \"item\"\n", + " tag: \"list\"\n", + " tag: \"continuous\"\n", + " }\n", + "}\n", + "\n", + "feature {\n", + " name: \"sess_prod_recency_days_log_norm_seq\"\n", + " value_count {\n", + " min: 2\n", + " max: 20\n", + " }\n", + " type: FLOAT\n", + " float_domain {\n", + " name: \"sess_prod_recency_days_log_norm_seq\"\n", + " min: -10000.0\n", + " max: 10000.0\n", + " }\n", + 
" annotation {\n", + " tag: \"item\"\n", + " tag: \"list\"\n", + " tag: \"continuous\"\n", + " }\n", + "}\n", + "\n", + "feature {\n", + " name: \"sess_et_hour_sin_seq\"\n", + " value_count {\n", + " min: 2\n", + " max: 20\n", + " }\n", + " type: FLOAT\n", + " float_domain {\n", + " name: \"sess_et_hour_sin_seq\"\n", + " min: -1.0\n", + " max: 1.0\n", + " }\n", + " annotation {\n", + " tag: \"list\"\n", + " tag: \"continuous\"\n", + " }\n", + "}\n", + "\n", + "feature {\n", + " name: \"sess_et_hour_cos_seq\"\n", + " value_count {\n", + " min: 2\n", + " max: 20\n", + " }\n", + " type: FLOAT\n", + " float_domain {\n", + " name: \"sess_et_hour_cos_seq\"\n", + " min: -1.0\n", + " max: 1.0\n", + " }\n", + " annotation {\n", + " tag: \"list\"\n", + " tag: \"continuous\"\n", + " }\n", + "}\n", + "\n", + "feature {\n", + " name: \"sess_et_dayofweek_sin_seq\"\n", + " value_count {\n", + " min: 2\n", + " max: 20\n", + " }\n", + " type: FLOAT\n", + " float_domain {\n", + " name: \"sess_et_dayofweek_sin_seq\"\n", + " min: -1.0\n", + " max: 1.0\n", + " }\n", + " annotation {\n", + " tag: \"list\"\n", + " tag: \"continuous\"\n", + " }\n", + "}\n", + "\n", + "feature {\n", + " name: \"sess_et_dayofweek_cos_seq\"\n", + " value_count {\n", + " min: 2\n", + " max: 20\n", + " }\n", + " type: FLOAT\n", + " float_domain {\n", + " name: \"sess_et_dayofweek_cos_seq\"\n", + " min: -1.0\n", + " max: 1.0\n", + " }\n", + " annotation {\n", + " tag: \"list\"\n", + " tag: \"continuous\"\n", + " }\n", + "}\n", + "\n", + "feature {\n", + " name: \"sess_etime_seq\"\n", + " value_count {\n", + " min: 2\n", + " max: 20\n", + " }\n", + " type: FLOAT\n", + " float_domain {\n", + " name: \"sess_etime_seq\"\n", + " min: 0\n", + " max: 0\n", + " }\n", + " annotation {\n", + " tag: \"time\"\n", + " tag: \"list\"\n", + " }\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "6089f14c", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": 
[ + "04/05/2023 04:08:01 - WARNING - __main__ - Process rank: -1, device: cuda:0, n_gpu: 1, distributed training: False, 16-bits training: True\n", + "04/05/2023 04:08:03 - WARNING - transformers4rec - Projecting inputs of NextItemPredictionTask to'448' As weight tying requires the input dimension '192' to be equal to the item-id embedding dimension '448'\n", + "[INFO|trainer.py:434] 2023-04-05 04:08:03,174 >> Using amp fp16 backend\n", + "04/05/2023 04:08:03 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - Training, Model and Data parameters {'data_path': '/transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/', 'features_schema_path': '/workspace/examples/t4rec_paper_experiments/datasets_configs/ecom_rees46/rees46_schema.pbtxt', 'start_time_window_index': 1, 'final_time_window_index': 2, 'time_window_folder_pad_digits': 4, 'no_incremental_training': False, 'training_time_window_size': 0, 'use_side_information_features': False, 'input_features_aggregation': 'concat', 'model_type': 'xlnet', 'tf_out_activation': 'tanh', 'mlm': True, 'mlm_probability': 0.30000000000000004, 'plm': False, 'plm_probability': 0.25, 'plm_max_span_length': 5, 'plm_mask_input': False, 'plm_permute_all': False, 'rtd': False, 'rtd_sample_from_batch': False, 'rtd_use_batch_interaction': False, 'rtd_discriminator_loss_weight': 50, 'rtd_generator_loss_weight': 1, 'rtd_tied_generator': False, 'd_model': 192, 'n_layer': 3, 'n_head': 16, 'layer_norm_eps': 1e-12, 'initializer_range': 0.02, 'hidden_act': 'gelu', 'dropout': 0.0, 'summary_type': 'last', 'num_hidden_groups': 1, 'inner_group_num': 1, 'eval_on_last_item_seq_only': True, 'train_on_last_item_seq_only': False, 'mf_constrained_embeddings': True, 'item_embedding_dim': 448, 'numeric_features_project_to_embedding_dim': 0, 'numeric_features_soft_one_hot_encoding_num_embeddings': 0, 'stochastic_shared_embeddings_replacement_prob': 0.1, 'softmax_temperature': 1.0, 'label_smoothing': 0.0, 
'embedding_dim_from_cardinality_multiplier': 2.0, 'item_id_embeddings_init_std': 0.11, 'other_embeddings_init_std': 0.02, 'layer_norm_featurewise': True, 'attn_type': 'bi', 'input_dropout': 0.1, 'loss_type': 'cross_entropy', 'similarity_type': 'concat_mlp', 'inp_merge': 'mlp', 'learning_rate_warmup_steps': 0, 'avg_session_length': None, 'output_dir': './tmp/', 'overwrite_output_dir': True, 'do_train': True, 'do_eval': True, 'do_predict': False, 'prediction_loss_only': False, 'per_device_train_batch_size': 128, 'per_device_eval_batch_size': 128, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'learning_rate': 0.0006667377132554976, 'weight_decay': 3.910060265627374e-05, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 1.0, 'max_steps': -1, 'lr_scheduler_type': 'linear', 'warmup_ratio': 0.0, 'warmup_steps': 0, 'log_level': -1, 'log_level_replica': -1, 'log_on_each_node': True, 'logging_dir': './tmp/runs/Apr05_04-08-00_da24a92e0a20', 'logging_first_step': False, 'logging_steps': 20, 'logging_nan_inf_filter': True, 'save_steps': 0, 'save_total_limit': None, 'save_on_each_node': False, 'no_cuda': False, 'seed': 100, 'fp16': True, 'fp16_opt_level': 'O1', 'fp16_backend': 'auto', 'fp16_full_eval': False, 'local_rank': -1, 'xpu_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': True, 'eval_steps': None, 'dataloader_num_workers': 0, 'past_index': -1, 'run_name': None, 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'sharded_ddp': [], 'deepspeed': None, 'label_smoothing_factor': 0.0, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': [], 'ddp_find_unused_parameters': None, 'dataloader_pin_memory': True, 
'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_token': None, 'gradient_checkpointing': False, 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': None, '_n_gpu': 1, 'mp_parameters': '', 'max_sequence_length': 20, 'shuffle_buffer_size': 0, 'data_loader_engine': 'merlin', 'eval_on_test_set': True, 'eval_steps_on_train_set': 20, 'predict_top_k': 0, 'learning_rate_num_cosine_cycles_by_epoch': 1.25, 'log_predictions': False, 'compute_metrics_each_n_steps': 1, 'experiments_group': 'default', 'session_seq_length_max': 20, 'learning_rate_schedule': 'linear_with_warmup', 'validate_every': 10}\n", + "[INFO|trainer.py:1196] 2023-04-05 04:08:03,669 >> ***** Running training *****\n", + "[INFO|trainer.py:1197] 2023-04-05 04:08:03,669 >> Num examples = 86528\n", + "[INFO|trainer.py:1198] 2023-04-05 04:08:03,669 >> Num Epochs = 1\n", + "[INFO|trainer.py:1199] 2023-04-05 04:08:03,669 >> Instantaneous batch size per device = 128\n", + "[INFO|trainer.py:1200] 2023-04-05 04:08:03,669 >> Total train batch size (w. 
parallel, distributed & accumulation) = 128\n", + "[INFO|trainer.py:1201] 2023-04-05 04:08:03,669 >> Gradient Accumulation steps = 1\n", + "[INFO|trainer.py:1202] 2023-04-05 04:08:03,669 >> Total optimization steps = 676\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "DLL 2023-04-05 04:08:03.174988 - PARAMETER data_path : /transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/ features_schema_path : /workspace/examples/t4rec_paper_experiments/datasets_configs/ecom_rees46/rees46_schema.pbtxt start_time_window_index : 1 final_time_window_index : 2 time_window_folder_pad_digits : 4 no_incremental_training : False training_time_window_size : 0 use_side_information_features : False input_features_aggregation : concat model_type : xlnet tf_out_activation : tanh mlm : True mlm_probability : 0.30000000000000004 plm : False plm_probability : 0.25 plm_max_span_length : 5 plm_mask_input : False plm_permute_all : False rtd : False rtd_sample_from_batch : False rtd_use_batch_interaction : False rtd_discriminator_loss_weight : 50 rtd_generator_loss_weight : 1 rtd_tied_generator : False d_model : 192 n_layer : 3 n_head : 16 layer_norm_eps : 1e-12 initializer_range : 0.02 hidden_act : gelu dropout : 0.0 summary_type : last num_hidden_groups : 1 inner_group_num : 1 eval_on_last_item_seq_only : True train_on_last_item_seq_only : False mf_constrained_embeddings : True item_embedding_dim : 448 numeric_features_project_to_embedding_dim : 0 numeric_features_soft_one_hot_encoding_num_embeddings : 0 stochastic_shared_embeddings_replacement_prob : 0.1 softmax_temperature : 1.0 label_smoothing : 0.0 embedding_dim_from_cardinality_multiplier : 2.0 item_id_embeddings_init_std : 0.11 other_embeddings_init_std : 0.02 layer_norm_featurewise : True attn_type : bi input_dropout : 0.1 loss_type : cross_entropy similarity_type : concat_mlp inp_merge : mlp learning_rate_warmup_steps : 0 avg_session_length : None output_dir : ./tmp/ overwrite_output_dir : True 
do_train : True do_eval : True do_predict : False prediction_loss_only : False per_device_train_batch_size : 128 per_device_eval_batch_size : 128 per_gpu_train_batch_size : None per_gpu_eval_batch_size : None gradient_accumulation_steps : 1 eval_accumulation_steps : None learning_rate : 0.0006667377132554976 weight_decay : 3.910060265627374e-05 adam_beta1 : 0.9 adam_beta2 : 0.999 adam_epsilon : 1e-08 max_grad_norm : 1.0 num_train_epochs : 1.0 max_steps : -1 lr_scheduler_type : linear warmup_ratio : 0.0 warmup_steps : 0 log_level : -1 log_level_replica : -1 log_on_each_node : True logging_dir : ./tmp/runs/Apr05_04-08-00_da24a92e0a20 logging_first_step : False logging_steps : 20 logging_nan_inf_filter : True save_steps : 0 save_total_limit : None save_on_each_node : False no_cuda : False seed : 100 fp16 : True fp16_opt_level : O1 fp16_backend : auto fp16_full_eval : False local_rank : -1 xpu_backend : None tpu_num_cores : None tpu_metrics_debug : False debug : [] dataloader_drop_last : True eval_steps : None dataloader_num_workers : 0 past_index : -1 run_name : None disable_tqdm : False remove_unused_columns : True label_names : None load_best_model_at_end : False metric_for_best_model : None greater_is_better : None ignore_data_skip : False sharded_ddp : [] deepspeed : None label_smoothing_factor : 0.0 adafactor : False group_by_length : False length_column_name : length report_to : [] ddp_find_unused_parameters : None dataloader_pin_memory : True skip_memory_metrics : True use_legacy_prediction_loop : False push_to_hub : False resume_from_checkpoint : None hub_model_id : None hub_token : None gradient_checkpointing : False push_to_hub_model_id : None push_to_hub_organization : None push_to_hub_token : None _n_gpu : 1 mp_parameters : max_sequence_length : 20 shuffle_buffer_size : 0 data_loader_engine : merlin eval_on_test_set : True eval_steps_on_train_set : 20 predict_top_k : 0 learning_rate_num_cosine_cycles_by_epoch : 1.25 log_predictions : False 
compute_metrics_each_n_steps : 1 experiments_group : default session_seq_length_max : 20 learning_rate_schedule : linear_with_warmup validate_every : 10 \n", + "\n", + "***** Launch training for day 1: *****\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\r", + " 0%| | 0/676 [00:00> \n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Training completed. Do not forget to share your model on huggingface.co/models =)\n", + "\n", + "\n", + "100%|██████████| 676/676 [00:53<00:00, 12.56it/s]\n", + "04/05/2023 04:08:57 - INFO - transformers4rec.torch.trainer - ***** Running Evaluation *****\n", + "04/05/2023 04:08:57 - INFO - transformers4rec.torch.trainer - Batch size = 128\n", + "04/05/2023 04:08:57 - INFO - transformers4rec.torch.trainer - Num sessions (examples) = 2560\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'loss': 12.9419, 'learning_rate': 0.0006479980438000916, 'epoch': 0.03}\n", + "{'loss': 12.5627, 'learning_rate': 0.0006282720759522958, 'epoch': 0.06}\n", + "{'loss': 11.7988, 'learning_rate': 0.0006085461081045, 'epoch': 0.09}\n", + "{'loss': 10.9136, 'learning_rate': 0.0005888201402567042, 'epoch': 0.12}\n", + "{'loss': 10.5825, 'learning_rate': 0.0005690941724089084, 'epoch': 0.15}\n", + "{'loss': 10.4064, 'learning_rate': 0.0005493682045611127, 'epoch': 0.18}\n", + "{'loss': 10.1992, 'learning_rate': 0.0005296422367133169, 'epoch': 0.21}\n", + "{'loss': 10.2454, 'learning_rate': 0.000509916268865521, 'epoch': 0.24}\n", + "{'loss': 10.1139, 'learning_rate': 0.0004901903010177253, 'epoch': 0.27}\n", + "{'loss': 10.0165, 'learning_rate': 0.0004704643331699295, 'epoch': 0.3}\n", + "{'loss': 10.0056, 'learning_rate': 0.00045073836532213366, 'epoch': 0.33}\n", + "{'loss': 9.8803, 'learning_rate': 0.00043101239747433793, 'epoch': 0.36}\n", + "{'loss': 9.7873, 'learning_rate': 0.00041128642962654215, 'epoch': 0.38}\n", + "{'loss': 9.9475, 'learning_rate': 
0.0003915604617787463, 'epoch': 0.41}\n", + "{'loss': 9.8843, 'learning_rate': 0.0003718344939309506, 'epoch': 0.44}\n", + "{'loss': 9.7393, 'learning_rate': 0.00035210852608315475, 'epoch': 0.47}\n", + "{'loss': 9.5825, 'learning_rate': 0.00033238255823535897, 'epoch': 0.5}\n", + "{'loss': 9.8305, 'learning_rate': 0.0003126565903875632, 'epoch': 0.53}\n", + "{'loss': 9.7408, 'learning_rate': 0.00029293062253976746, 'epoch': 0.56}\n", + "{'loss': 9.7161, 'learning_rate': 0.0002732046546919716, 'epoch': 0.59}\n", + "{'loss': 9.5964, 'learning_rate': 0.00025347868684417584, 'epoch': 0.62}\n", + "{'loss': 9.5593, 'learning_rate': 0.00023375271899638006, 'epoch': 0.65}\n", + "{'loss': 9.4851, 'learning_rate': 0.00021402675114858425, 'epoch': 0.68}\n", + "{'loss': 9.7007, 'learning_rate': 0.0001943007833007885, 'epoch': 0.71}\n", + "{'loss': 9.5252, 'learning_rate': 0.00017457481545299271, 'epoch': 0.74}\n", + "{'loss': 9.6155, 'learning_rate': 0.0001548488476051969, 'epoch': 0.77}\n", + "{'loss': 9.6275, 'learning_rate': 0.00013512287975740115, 'epoch': 0.8}\n", + "{'loss': 9.571, 'learning_rate': 0.00011539691190960534, 'epoch': 0.83}\n", + "{'loss': 9.4793, 'learning_rate': 9.567094406180957e-05, 'epoch': 0.86}\n", + "{'loss': 9.4686, 'learning_rate': 7.594497621401378e-05, 'epoch': 0.89}\n", + "{'loss': 9.524, 'learning_rate': 5.621900836621799e-05, 'epoch': 0.92}\n", + "{'loss': 9.5921, 'learning_rate': 3.6493040518422206e-05, 'epoch': 0.95}\n", + "{'loss': 9.5054, 'learning_rate': 1.6767072670626417e-05, 'epoch': 0.98}\n", + "{'train_runtime': 53.8312, 'train_samples_per_second': 0.019, 'train_steps_per_second': 12.558, 'train_loss': 10.050675363935662, 'epoch': 1.0}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 3%|▎ | 20/676 [00:00<00:16, 40.02it/s]\n", + "04/05/2023 04:08:58 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - ***** train results (time index): 2)*****\n", + "04/05/2023 04:08:58 - INFO - 
examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - train_/loss = 9.21147346496582\n", + "04/05/2023 04:08:58 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - train_/next-item/ndcg_at_10 = 0.03348138555884361\n", + "04/05/2023 04:08:58 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - train_/next-item/ndcg_at_20 = 0.04145380109548569\n", + "04/05/2023 04:08:58 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - train_/next-item/recall_at_10 = 0.064453125\n", + "04/05/2023 04:08:58 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - train_/next-item/recall_at_20 = 0.09609375149011612\n", + "04/05/2023 04:08:58 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - train_runtime = 0.6467\n", + "04/05/2023 04:08:58 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - train_samples_per_second = 3958.501\n", + "04/05/2023 04:08:58 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - train_steps_per_second = 30.926\n", + "04/05/2023 04:08:58 - INFO - transformers4rec.torch.trainer - ***** Running Evaluation *****\n", + "04/05/2023 04:08:58 - INFO - transformers4rec.torch.trainer - Batch size = 128\n", + "04/05/2023 04:08:58 - INFO - transformers4rec.torch.trainer - Num sessions (examples) = 10624\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "***** Evaluation results for day 2 (train set):*****\n", + "\n", + "{'train_/next-item/ndcg_at_10': 0.03348138555884361, 'train_/next-item/ndcg_at_20': 0.04145380109548569, 'train_/next-item/recall_at_10': 0.064453125, 'train_/next-item/recall_at_20': 0.09609375149011612, 'train_/loss': 9.21147346496582, 'train_runtime': 0.6467, 'train_samples_per_second': 3958.501, 'train_steps_per_second': 30.926}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 83/83 [00:02<00:00, 38.64it/s]\n", + "04/05/2023 
04:09:00 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - ***** eval results (time index): 2)*****\n", + "04/05/2023 04:09:00 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - eval_/loss = 9.303844451904297\n", + "04/05/2023 04:09:00 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - eval_/next-item/ndcg_at_10 = 0.03641688451170921\n", + "04/05/2023 04:09:00 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - eval_/next-item/ndcg_at_20 = 0.04447970166802406\n", + "04/05/2023 04:09:00 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - eval_/next-item/recall_at_10 = 0.07172439247369766\n", + "04/05/2023 04:09:00 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - eval_/next-item/recall_at_20 = 0.10363327711820602\n", + "04/05/2023 04:09:00 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - eval_runtime = 2.2272\n", + "04/05/2023 04:09:00 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - eval_samples_per_second = 4770.055\n", + "04/05/2023 04:09:00 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - eval_steps_per_second = 37.266\n", + "04/05/2023 04:09:01 - INFO - __main__ - Computing and logging AOT (Average Over Time) metrics\n", + "04/05/2023 04:09:01 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - ***** Eval results (avg over time) *****\n", + "04/05/2023 04:09:01 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - eval_/loss_AOT = 9.303844451904297\n", + "04/05/2023 04:09:01 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - eval_/next-item/ndcg_at_10_AOT = 0.03641688451170921\n", + "04/05/2023 04:09:01 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - eval_/next-item/ndcg_at_20_AOT = 0.04447970166802406\n", + "04/05/2023 04:09:01 - INFO - 
examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - eval_/next-item/recall_at_10_AOT = 0.07172439247369766\n", + "04/05/2023 04:09:01 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - eval_/next-item/recall_at_20_AOT = 0.10363327711820602\n", + "04/05/2023 04:09:01 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - eval_runtime_AOT = 2.2272\n", + "04/05/2023 04:09:01 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - eval_samples_per_second_AOT = 4770.055\n", + "04/05/2023 04:09:01 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - eval_steps_per_second_AOT = 37.266\n", + "04/05/2023 04:09:01 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - train_/loss_AOT = 9.21147346496582\n", + "04/05/2023 04:09:01 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - train_/next-item/ndcg_at_10_AOT = 0.03348138555884361\n", + "04/05/2023 04:09:01 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - train_/next-item/ndcg_at_20_AOT = 0.04145380109548569\n", + "04/05/2023 04:09:01 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - train_/next-item/recall_at_10_AOT = 0.064453125\n", + "04/05/2023 04:09:01 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - train_/next-item/recall_at_20_AOT = 0.09609375149011612\n", + "04/05/2023 04:09:01 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - train_runtime_AOT = 0.6467\n", + "04/05/2023 04:09:01 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - train_samples_per_second_AOT = 3958.501\n", + "04/05/2023 04:09:01 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - train_steps_per_second_AOT = 30.926\n", + "04/05/2023 04:09:01 - INFO - transformers4rec.torch.trainer - ***** Running Prediction *****\n", + "04/05/2023 04:09:01 - INFO - transformers4rec.torch.trainer - Batch size = 
128\n", + "04/05/2023 04:09:01 - INFO - transformers4rec.torch.trainer - Num sessions (examples) = 10752\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "***** Evaluation results for day 2 (eval set):*****\n", + "\n", + "{'eval_/next-item/ndcg_at_10': 0.03641688451170921, 'eval_/next-item/ndcg_at_20': 0.04447970166802406, 'eval_/next-item/recall_at_10': 0.07172439247369766, 'eval_/next-item/recall_at_20': 0.10363327711820602, 'eval_/loss': 9.303844451904297, 'eval_runtime': 2.2272, 'eval_samples_per_second': 4770.055, 'eval_steps_per_second': 37.266}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " 99%|█████████▉| 83/84 [00:00<00:00, 84.22it/s]04/05/2023 04:09:02 - INFO - __main__ - Recall@10 of manually masked test data = 0.07175098739890916\n", + "100%|██████████| 84/84 [00:03<00:00, 22.48it/s]" + ] + } + ], + "source": [ + "%%bash\n", + "\n", + "FEATURE_SCHEMA_PATH=/workspace/examples/t4rec_paper_experiments/datasets_configs/ecom_rees46/rees46_schema.pbtxt\n", + "DATA_PATH=/transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/\n", + "NUM_EPOCHS=1 # all the models in the papers were trained for 5 epochs\n", + "\n", + "# UNCOMMENT THE MODEL YOU'D LIKE TO TRAIN AND EXPORT\n", + "\n", + "### GPT-2 (CLM) - Item Id feature\n", + "# python3 transf_exp_main_modified.py --output_dir ./tmp/ --overwrite_output_dir --do_train --do_eval --validate_every 10 --logging_steps 20 --save_steps 0 --data_path $DATA_PATH --features_schema_path $FEATURE_SCHEMA_PATH --fp16 --data_loader_engine merlin --start_time_window_index 1 --final_time_window_index 2 --time_window_folder_pad_digits 4 --model_type gpt2 --loss_type cross_entropy --per_device_eval_batch_size 128 --similarity_type concat_mlp --tf_out_activation tanh --inp_merge mlp --learning_rate_warmup_steps 0 --learning_rate_schedule linear_with_warmup --hidden_act gelu --num_train_epochs $NUM_EPOCHS --dataloader_drop_last --compute_metrics_each_n_steps 
1 --session_seq_length_max 20 --eval_on_last_item_seq_only --mf_constrained_embeddings --layer_norm_featurewise --per_device_train_batch_size 128 --learning_rate 0.0008781937894379981 --dropout 0.2 --input_dropout 0.4 --weight_decay 1.4901138106122045e-05 --d_model 128 --item_embedding_dim 448 --n_layer 1 --n_head 1 --label_smoothing 0.9 --stochastic_shared_embeddings_replacement_prob 0.0 --item_id_embeddings_init_std 0.03 --other_embeddings_init_std 0.034999999999999996 --eval_on_test_set --seed 100 --report_to none\n", + "\n", + "### Transformer-XL (CLM) - Item Id feature\n", + "# python3 transf_exp_main_modified.py --output_dir ./tmp/ --overwrite_output_dir --do_train --do_eval --validate_every 10 --logging_steps 20 --save_steps 0 --data_path $DATA_PATH --features_schema_path $FEATURE_SCHEMA_PATH --fp16 --data_loader_engine merlin --start_time_window_index 1 --final_time_window_index 2 --time_window_folder_pad_digits 4 --model_type transfoxl --loss_type cross_entropy --per_device_eval_batch_size 128 --similarity_type concat_mlp --tf_out_activation tanh --inp_merge mlp --learning_rate_warmup_steps 0 --learning_rate_schedule linear_with_warmup --hidden_act gelu --num_train_epochs $NUM_EPOCHS --dataloader_drop_last --compute_metrics_each_n_steps 1 --session_seq_length_max 20 --eval_on_last_item_seq_only --mf_constrained_embeddings --layer_norm_featurewise --per_device_train_batch_size 128 --learning_rate 0.001007765821083962 --dropout 0.1 --input_dropout 0.30000000000000004 --weight_decay 1.0673054163921092e-06 --d_model 448 --item_embedding_dim 320 --n_layer 1 --n_head 1 --label_smoothing 0.2 --stochastic_shared_embeddings_replacement_prob 0.02 --item_id_embeddings_init_std 0.15 --other_embeddings_init_std 0.01 --eval_on_test_set --seed 100 --report_to none\n", + "\n", + "### BERT (MLM) - Item Id feature\n", + "# python3 transf_exp_main_modified.py --output_dir ./tmp/ --overwrite_output_dir --do_train --do_eval --validate_every 10 --logging_steps 20 --save_steps 0 
--data_path $DATA_PATH --features_schema_path $FEATURE_SCHEMA_PATH --fp16 --data_loader_engine merlin --start_time_window_index 1 --final_time_window_index 2 --time_window_folder_pad_digits 4 --model_type albert --loss_type cross_entropy --per_device_eval_batch_size 128 --similarity_type concat_mlp --tf_out_activation tanh --inp_merge mlp --learning_rate_warmup_steps 0 --learning_rate_schedule linear_with_warmup --hidden_act gelu --num_train_epochs $NUM_EPOCHS --dataloader_drop_last --compute_metrics_each_n_steps 1 --session_seq_length_max 20 --eval_on_last_item_seq_only --mf_constrained_embeddings --layer_norm_featurewise --mlm --num_hidden_groups -1 --inner_group_num 1 --per_device_train_batch_size 128 --learning_rate 0.0004904752786458524 --dropout 0.0 --input_dropout 0.1 --weight_decay 9.565968888623912e-05 --d_model 320 --item_embedding_dim 320 --n_layer 2 --n_head 8 --label_smoothing 0.2 --stochastic_shared_embeddings_replacement_prob 0.06 --item_id_embeddings_init_std 0.11 --other_embeddings_init_std 0.025 --mlm_probability 0.6000000000000001 --eval_on_test_set --seed 100 --report_to none\n", + "\n", + "### XLNet (PLM) - Item Id feature\n", + "# python3 transf_exp_main_modified.py --output_dir ./tmp/ --overwrite_output_dir --do_train --do_eval --validate_every 10 --logging_steps 20 --save_steps 0 --data_path $DATA_PATH --features_schema_path $FEATURE_SCHEMA_PATH --fp16 --data_loader_engine merlin --start_time_window_index 1 --final_time_window_index 2 --time_window_folder_pad_digits 4 --model_type xlnet --loss_type cross_entropy --per_device_eval_batch_size 128 --similarity_type concat_mlp --tf_out_activation tanh --inp_merge mlp --learning_rate_warmup_steps 0 --learning_rate_schedule linear_with_warmup --hidden_act gelu --num_train_epochs $NUM_EPOCHS --dataloader_drop_last --compute_metrics_each_n_steps 1 --session_seq_length_max 20 --eval_on_last_item_seq_only --mf_constrained_embeddings --layer_norm_featurewise --attn_type bi --plm 
--per_device_train_batch_size 128 --learning_rate 0.0003387925502203725 --dropout 0.0 --input_dropout 0.2 --weight_decay 2.1769664191492473e-05 --d_model 384 --item_embedding_dim 384 --n_layer 4 --n_head 16 --label_smoothing 0.7000000000000001 --stochastic_shared_embeddings_replacement_prob 0.02 --item_id_embeddings_init_std 0.13 --other_embeddings_init_std 0.005 --plm_probability 0.5 --plm_max_span_length 3 --eval_on_test_set --seed 100 --report_to none\n", + "\n", + "### XLNet (MLM) - Item Id feature\n", + "python3 transf_exp_main_modified.py --output_dir ./tmp/ --overwrite_output_dir --do_train --do_eval --validate_every 10 --logging_steps 20 --save_steps 0 --data_path $DATA_PATH --features_schema_path $FEATURE_SCHEMA_PATH --fp16 --data_loader_engine merlin --start_time_window_index 1 --final_time_window_index 2 --time_window_folder_pad_digits 4 --model_type xlnet --loss_type cross_entropy --per_device_eval_batch_size 128 --similarity_type concat_mlp --tf_out_activation tanh --inp_merge mlp --learning_rate_warmup_steps 0 --learning_rate_schedule linear_with_warmup --hidden_act gelu --num_train_epochs $NUM_EPOCHS --dataloader_drop_last --compute_metrics_each_n_steps 1 --session_seq_length_max 20 --eval_on_last_item_seq_only --mf_constrained_embeddings --layer_norm_featurewise --attn_type bi --mlm --per_device_train_batch_size 128 --learning_rate 0.0006667377132554976 --dropout 0.0 --input_dropout 0.1 --weight_decay 3.910060265627374e-05 --d_model 192 --item_embedding_dim 448 --n_layer 3 --n_head 16 --label_smoothing 0.0 --stochastic_shared_embeddings_replacement_prob 0.1 --item_id_embeddings_init_std 0.11 --other_embeddings_init_std 0.02 --mlm_probability 0.30000000000000004 --eval_on_test_set --seed 100 --report_to none\n", + "\n", + "### XLNET (MLM) - CONCAT + SOFT ONE-HOT ENCODING - All features\n", + "# python3 transf_exp_main_modified.py --output_dir ./tmp/ --overwrite_output_dir --do_train --do_eval --validate_every 10 --logging_steps 20 --save_steps 0 
--data_path $DATA_PATH --features_schema_path $FEATURE_SCHEMA_PATH --fp16 --data_loader_engine merlin --start_time_window_index 1 --final_time_window_index 2 --time_window_folder_pad_digits 4 --model_type xlnet --loss_type cross_entropy --per_device_eval_batch_size 128 --similarity_type concat_mlp --tf_out_activation tanh --inp_merge mlp --learning_rate_warmup_steps 0 --learning_rate_schedule linear_with_warmup --hidden_act gelu --num_train_epochs $NUM_EPOCHS --dataloader_drop_last --compute_metrics_each_n_steps 1 --session_seq_length_max 20 --eval_on_last_item_seq_only --mf_constrained_embeddings --layer_norm_featurewise --attn_type bi --mlm --input_features_aggregation concat --per_device_train_batch_size 128 --learning_rate 0.00034029107417129616 --dropout 0.0 --input_dropout 0.1 --weight_decay 3.168336235732841e-05 --d_model 448 --item_embedding_dim 384 --n_layer 2 --n_head 8 --label_smoothing 0.6000000000000001 --stochastic_shared_embeddings_replacement_prob 0.0 --item_id_embeddings_init_std 0.06999999999999999 --other_embeddings_init_std 0.085 --mlm_probability 0.30000000000000004 --embedding_dim_from_cardinality_multiplier 1.0 --numeric_features_project_to_embedding_dim 20 --numeric_features_soft_one_hot_encoding_num_embeddings 5 --eval_on_test_set --seed 100 --use_side_information_features --report_to none" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4d9b7394", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "f0af515e", + "metadata": {}, + "outputs": [], + "source": [ + "rm -rf /workspace/models_for_benchmarking/t4r_pytorch /workspace/models_for_benchmarking/t4r_pytorch_nvt" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "c65f9a1f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Overwriting /workspace/models_for_benchmarking/t4r_pytorch_pt/config.pbtxt\n" + ] + } + ], + "source": [ + 
"%%writefile /workspace/models_for_benchmarking/t4r_pytorch_pt/config.pbtxt\n", + "\n", + "name: \"t4r_pytorch_pt\"\n", + "input {\n", + " name: \"sess_pid_seq__values\"\n", + " data_type: TYPE_INT64\n", + " dims: -1\n", + " dims: 1\n", + "}\n", + "input {\n", + " name: \"sess_pid_seq__nnzs\"\n", + " data_type: TYPE_INT64\n", + " dims: -1\n", + " dims: 1\n", + "}\n", + "output {\n", + " name: \"output\"\n", + " data_type: TYPE_FP32\n", + " dims: -1\n", + " dims: 20\n", + "}\n", + "backend: \"python\"" + ] + }, + { + "cell_type": "markdown", + "id": "9964a2cc", + "metadata": {}, + "source": [ + "For running on the CPU" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "d61fe61b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Overwriting /workspace/models_for_benchmarking/t4r_pytorch_pt/config.pbtxt\n" + ] + } + ], + "source": [ + "%%writefile /workspace/models_for_benchmarking/t4r_pytorch_pt/config.pbtxt\n", + "\n", + "name: \"t4r_pytorch_pt\"\n", + "instance_group [\n", + " {\n", + " count: 1\n", + " kind: KIND_CPU\n", + " }\n", + "]\n", + "input {\n", + " name: \"sess_pid_seq__values\"\n", + " data_type: TYPE_INT64\n", + " dims: -1\n", + " dims: 1\n", + "}\n", + "input {\n", + " name: \"sess_pid_seq__nnzs\"\n", + " data_type: TYPE_INT64\n", + " dims: -1\n", + " dims: 1\n", + "}\n", + "output {\n", + " name: \"output\"\n", + " data_type: TYPE_FP32\n", + " dims: -1\n", + " dims: 20\n", + "}\n", + "backend: \"python\"" + ] + }, + { + "cell_type": "markdown", + "id": "63ba822c", + "metadata": {}, + "source": [ + "You can control whether you would like to run on the GPU or the CPU by setting the environment variable `HAS_GPU` to either 0 or 1." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "b3a346b9", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Overwriting /workspace/models_for_benchmarking/t4r_pytorch_pt/1/model.py\n" + ] + } + ], + "source": [ + "%%writefile /workspace/models_for_benchmarking/t4r_pytorch_pt/1/model.py\n", + "\n", + "# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.\n", + "#\n", + "# Redistribution and use in source and binary forms, with or without\n", + "# modification, are permitted provided that the following conditions\n", + "# are met:\n", + "# * Redistributions of source code must retain the above copyright\n", + "# notice, this list of conditions and the following disclaimer.\n", + "# * Redistributions in binary form must reproduce the above copyright\n", + "# notice, this list of conditions and the following disclaimer in the\n", + "# documentation and/or other materials provided with the distribution.\n", + "# * Neither the name of NVIDIA CORPORATION nor the names of its\n", + "# contributors may be used to endorse or promote products derived\n", + "# from this software without specific prior written permission.\n", + "#\n", + "# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY\n", + "# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\n", + "# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR\n", + "# PURPOSE ARE DISCLAIMED. 
import json
import logging
import pathlib

import cloudpickle
import pickle
import io
import os
import torch
import triton_python_backend_utils as pb_utils

from nvtabular.inference.triton import _convert_string2pytorch_dtype, _convert_tensor

LOG = logging.getLogger("nvtabular")

# NVTabular appends these markers to the names of the two tensors that make up
# a sparse (ragged) input: "<name>__values" and "<name>__nnzs".
sparse_value_marker = "__values"
sparse_nnzs_marker = "__nnzs"

# Run on the GPU or the CPU depending on the HAS_GPU environment variable
# ("1" selects the GPU). Default to GPU when unset instead of raising KeyError.
# NOTE(review): the previous `from merlin.core.dispatch import HAS_GPU` was
# dropped because this assignment always shadowed it anyway.
HAS_GPU = os.environ.get("HAS_GPU", "1") == "1"


class CPU_Unpickler(pickle.Unpickler):
    """Unpickler that remaps CUDA-saved torch storages onto the CPU.

    torch serializes tensor storages through ``torch.storage._load_from_bytes``;
    intercepting that one symbol lets us reload a GPU-pickled model with
    ``map_location="cpu"``. Every other symbol resolves normally.
    """

    def find_class(self, module, name):
        # (debug print removed: it invoked super().find_class twice per lookup)
        if module == 'torch.storage' and name == '_load_from_bytes':
            return lambda b: torch.load(io.BytesIO(b), map_location=torch.device('cpu'))
        return super().find_class(module, name)


class TritonPythonModel:
    """Generic TritonPythonModel for nvtabular workflows"""

    def initialize(self, args):
        """Load the pickled PyTorch model, its weights, its config and the
        optional Transformer4Rec side info, then index the declared inputs
        as dense or sparse."""
        # Arg parsing
        repository_path = pathlib.Path(args["model_repository"])
        model_version = str(args["model_version"])

        # Handle bug in Tritonserver 22.06:
        # model_repository argument became path to model.py
        if str(repository_path).endswith(".py"):
            repository_path = repository_path.parent.parent

        model_path = repository_path / model_version / "model.pkl"

        # Load the pickled PyTorch model. In CPU mode the unpickler remaps
        # CUDA storages to the CPU; the trained weights then come from model.pth.
        if HAS_GPU:
            self.model = cloudpickle.load(
                open(str(model_path), "rb")  # pylint: disable=consider-using-with
            )
            model_path = repository_path / model_version / "model.pth"
            self.model.load_state_dict(torch.load(str(model_path)))
        else:
            self.model = CPU_Unpickler(open(str(model_path), "rb")).load()
            model_path = repository_path / model_version / "model.pth"
            self.model.load_state_dict(torch.load(str(model_path), map_location='cpu'))

        self.model.eval()

        # Load model config file
        self.model_config = json.loads(args["model_config"])

        # Load extra info needed for the Transformer4Rec (if exists)
        model_info_path = repository_path / model_version / "model_info.json"
        self.model_info = None
        if pathlib.Path(model_info_path).exists():
            with open(str(model_info_path), encoding="utf-8") as json_file:
                self.model_info = json.load(json_file)

        # Classify the config's inputs as dense or sparse based on the
        # NVTabular "__values" / "__nnzs" name markers. "__nnzs" entries are
        # the implicit companion of their "__values" tensor and are therefore
        # not tracked on their own.
        self.inputs = {}
        self.sparse_inputs = {}
        self.outputs = {}
        len_svm = len(sparse_value_marker)
        len_snm = len(sparse_nnzs_marker)

        for val in self.model_config["input"]:
            name = val["name"]
            dtype = _convert_string2pytorch_dtype(val["data_type"])
            if len(name) > len_svm and name.endswith(sparse_value_marker):
                self.sparse_inputs[name[: len(name) - len_svm]] = dtype
            elif len(name) > len_snm and name.endswith(sparse_nnzs_marker):
                pass  # handled together with its "__values" counterpart
            else:
                self.inputs[name] = dtype

        for val in self.model_config["output"]:
            self.outputs[val["name"]] = _convert_string2pytorch_dtype(val["data_type"])

    def execute(self, requests):
        """Predicts the input batches by running through a PyTorch predict function.

        The PyTorch model must accept a dict input and return the prediction
        tensor; a None result is treated as an error.
        """
        with torch.no_grad():
            responses = []
            for request in requests:
                # Convert the input data to dict to pass it into the PyTorch model
                input_dict = {}
                for name, dtype in self.inputs.items():
                    # Convert to fixed dtypes if requested (guarded: model_info
                    # may be absent, and the key may be missing from it).
                    if self.model_info and self.model_info.get("use_fix_dtypes"):
                        dtype = _convert_dtype(dtype)
                    tensor = torch.tensor(
                        _convert_tensor(pb_utils.get_input_tensor_by_name(request, name)),
                        dtype=dtype,
                    )
                    # Move to the GPU only when one is in use; the previous
                    # unconditional .cuda() crashed in CPU mode.
                    input_dict[name] = tensor.cuda() if HAS_GPU else tensor

                # Sparse inputs have a special format: a __values/__nnzs pair
                for name, dtype in self.sparse_inputs.items():
                    input_val = _convert_tensor(
                        pb_utils.get_input_tensor_by_name(request, name + sparse_value_marker)
                    )
                    input_nnzs = _convert_tensor(
                        pb_utils.get_input_tensor_by_name(request, name + sparse_nnzs_marker)
                    )
                    input_nnzs = torch.tensor(input_nnzs, dtype=torch.int64)
                    input_values = torch.tensor(input_val, dtype=dtype)

                    # Densify only when the model was exported with a fixed
                    # sequence length for this feature (model_info["sparse_max"]).
                    sparse_to_dense = False
                    seq_limit = 0
                    if self.model_info is not None:
                        if self.model_info["sparse_max"].get(name) is not None:
                            sparse_to_dense = True
                            seq_limit = self.model_info["sparse_max"][name]

                    if seq_limit == 0:
                        seq_limit = int(input_nnzs.max())

                    input_dict[name] = _build_sparse_tensor(
                        input_values, input_nnzs, seq_limit, sparse_to_dense
                    )

                # Call forward function to get the predictions
                pred = self.model(input_dict, training=False)
                if pred is None:
                    raise KeyError(
                        "output of the forward function should have a bucket named as predictions"
                    )

                # Placeholder for benchmarking: return only the top-20 item
                # indices, matching the "dims: 20" output in config.pbtxt.
                pred_numpy = (torch.topk(pred.detach(), 20).indices).cpu().numpy()
                # There is one output in the config file
                # since the PyTorch models generate a tensor as an output
                output_info = self.model_config["output"][0]
                output_tensor = pb_utils.Tensor(output_info["name"], pred_numpy)
                responses.append(pb_utils.InferenceResponse([output_tensor]))

        return responses
torch.tensor(input_val, dtype=dtype)\n", + "\n", + " # Get the PyTorch sparse_coo_tensor\n", + " sparse_to_dense = False\n", + " seq_limit = 0\n", + " if self.model_info is not None:\n", + " if self.model_info[\"sparse_max\"].get(name) is not None:\n", + " sparse_to_dense = True\n", + " seq_limit = self.model_info[\"sparse_max\"][name]\n", + "\n", + " if seq_limit == 0:\n", + " seq_limit = int(input_nnzs.max())\n", + "\n", + " input_dict[name] = _build_sparse_tensor(\n", + " input_values, input_nnzs, seq_limit, sparse_to_dense\n", + " )\n", + "\n", + " # Call forward function to get the predictions\n", + " # Forward function should return a dict with the \"predictions\" bucket\n", + " pred = self.model(input_dict, training=False)\n", + " if pred is None:\n", + " raise KeyError(\n", + " \"output of the forward function should have a bucket named as predictions\"\n", + " )\n", + "\n", + "\t\t\t\t#place holder for testing. \n", + " pred_numpy = (torch.topk(pred.detach(),20).indices).cpu().numpy()\n", + " # There is one output in the config file\n", + " # since the PyTorch models generate a tensor as an output\n", + " output_info = self.model_config[\"output\"][0]\n", + " output_tensor = pb_utils.Tensor(output_info[\"name\"], pred_numpy)\n", + " responses.append(pb_utils.InferenceResponse([output_tensor]))\n", + " \n", + " # pred_numpy = pred.cpu().detach().numpy()\n", + "\n", + " # There is one output in the config file\n", + " # since the PyTorch models generate a tensor as an output\n", + " # output_info = self.model_config[\"output\"][0]\n", + " # output_tensor = pb_utils.Tensor(output_info[\"name\"], pred_numpy)\n", + " # responses.append(pb_utils.InferenceResponse([output_tensor]))\n", + "\n", + " return responses\n", + "\n", + "\n", + "def _get_indices(nnzs, device=\"cuda\"):\n", + " \"\"\"Calculate indices for the PyTorch sparse_coo_tensor\"\"\"\n", + " nnzs = nnzs[:, 0]\n", + " row_ids = torch.arange(len(nnzs)-1)\n", + " offsets = nnzs[1:]\n", + " offsets[1:] 
= offsets[1:] - offsets[:-1]\n", + " row_ids_repeated = torch.repeat_interleave(row_ids, offsets)\n", + " offsets_cols = nnzs[:-1]\n", + " offsets_cols = torch.repeat_interleave(offsets_cols.cumsum(0), offsets)\n", + " col_ids = torch.arange(len(row_ids_repeated)) - offsets_cols\n", + " indices = torch.cat([row_ids_repeated.unsqueeze(-1), col_ids.unsqueeze(-1)], axis=1)\n", + " return indices.T\n", + "\n", + " offsets = torch.cat((torch.tensor([1]), nnzs), 0)\n", + " offsets = offsets.cumsum(0)\n", + " row_ids = torch.arange(len(offsets) - 1)\n", + " row_ids_repeated = torch.repeat_interleave(row_ids, nnzs)\n", + " row_offset_repeated = torch.repeat_interleave(offsets[:-1], nnzs)\n", + " col_ids = torch.arange(len(row_offset_repeated)) - row_offset_repeated + 1\n", + " indices = torch.cat([row_ids_repeated.unsqueeze(-1), col_ids.unsqueeze(-1)], axis=1)\n", + " return indices.T\n", + "\n", + "\n", + "def _get_sparse_tensor(values, indices, num_rows, seq_limit, sparse_as_dense, device=\"cuda\"):\n", + " \"\"\"Creates the PyTorch sparse_coo_tensor\"\"\"\n", + " \n", + " if HAS_GPU:\n", + " device='cuda'\n", + " else:\n", + " device='cpu'\n", + " \n", + " sparse_tensor = torch.sparse_coo_tensor(\n", + " indices, values.squeeze(), torch.Size([num_rows-1, seq_limit]), device=device\n", + " )\n", + " if sparse_as_dense:\n", + " sparse_tensor = sparse_tensor.to_dense()\n", + " return sparse_tensor\n", + "\n", + "\n", + "def _build_sparse_tensor(values, nnzs, seq_limit, sparse_as_dense, device=\"cuda\"):\n", + " \"\"\"Builds PyTorch sparse_coo_tensor by converting the __values and __nnzs inputs\"\"\"\n", + " indices = _get_indices(nnzs, device)\n", + " num_rows = len(nnzs)\n", + " return _get_sparse_tensor(values, indices, num_rows, seq_limit, sparse_as_dense, device)\n", + "\n", + "\n", + "def _convert_dtype(dtype):\n", + " \"\"\"Transformer4Rec uses these fixed dtypes and this function converts the original dtype\n", + " to this fixed dtypes\"\"\"\n", + " if dtype in 
[torch.float64, torch.float32, torch.float16]:\n", + " return torch.float32\n", + " if dtype in [\n", + " torch.int64,\n", + " torch.int32,\n", + " torch.int16,\n", + " torch.int8,\n", + " torch.uint8,\n", + " ]:\n", + " return torch.long\n", + "\n", + " raise ValueError(f\"Can't convert dtype {dtype})\")\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/train_runs/xlnet_clm_item_id.ipynb b/train_runs/xlnet_clm_item_id.ipynb new file mode 100644 index 0000000000..d3ae60679d --- /dev/null +++ b/train_runs/xlnet_clm_item_id.ipynb @@ -0,0 +1,439 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 3, + "id": "2ce2e001", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "03/10/2023 07:14:19 - WARNING - __main__ - Process rank: -1, device: cuda:0, n_gpu: 1, distributed training: False, 16-bits training: True\n", + "03/10/2023 07:14:21 - WARNING - transformers4rec - Projecting inputs of NextItemPredictionTask to'448' As weight tying requires the input dimension '192' to be equal to the item-id embedding dimension '448'\n", + "[INFO|trainer.py:434] 2023-03-10 07:14:21,395 >> Using amp fp16 backend\n", + "03/10/2023 07:14:21 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - Training, Model and Data parameters {'data_path': '/transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/', 'features_schema_path': '/workspace/examples/t4rec_paper_experiments/datasets_configs/ecom_rees46/rees46_schema.pbtxt', 'start_time_window_index': 1, 'final_time_window_index': 2, 
'time_window_folder_pad_digits': 4, 'no_incremental_training': False, 'training_time_window_size': 0, 'use_side_information_features': False, 'input_features_aggregation': 'concat', 'model_type': 'xlnet', 'tf_out_activation': 'tanh', 'mlm': False, 'mlm_probability': 0.15, 'plm': False, 'plm_probability': 0.25, 'plm_max_span_length': 5, 'plm_mask_input': False, 'plm_permute_all': False, 'rtd': False, 'rtd_sample_from_batch': False, 'rtd_use_batch_interaction': False, 'rtd_discriminator_loss_weight': 50, 'rtd_generator_loss_weight': 1, 'rtd_tied_generator': False, 'd_model': 192, 'n_layer': 3, 'n_head': 16, 'layer_norm_eps': 1e-12, 'initializer_range': 0.02, 'hidden_act': 'gelu', 'dropout': 0.0, 'summary_type': 'last', 'num_hidden_groups': 1, 'inner_group_num': 1, 'eval_on_last_item_seq_only': True, 'train_on_last_item_seq_only': False, 'mf_constrained_embeddings': True, 'item_embedding_dim': 448, 'numeric_features_project_to_embedding_dim': 0, 'numeric_features_soft_one_hot_encoding_num_embeddings': 0, 'stochastic_shared_embeddings_replacement_prob': 0.1, 'softmax_temperature': 1.0, 'label_smoothing': 0.0, 'embedding_dim_from_cardinality_multiplier': 2.0, 'item_id_embeddings_init_std': 0.11, 'other_embeddings_init_std': 0.02, 'layer_norm_featurewise': True, 'attn_type': 'bi', 'input_dropout': 0.1, 'loss_type': 'cross_entropy', 'similarity_type': 'concat_mlp', 'inp_merge': 'mlp', 'learning_rate_warmup_steps': 0, 'avg_session_length': None, 'output_dir': './tmp/', 'overwrite_output_dir': True, 'do_train': True, 'do_eval': True, 'do_predict': False, 'prediction_loss_only': False, 'per_device_train_batch_size': 128, 'per_device_eval_batch_size': 128, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'learning_rate': 0.0006667377132554976, 'weight_decay': 3.910060265627374e-05, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 5.0, 
'max_steps': -1, 'lr_scheduler_type': 'linear', 'warmup_ratio': 0.0, 'warmup_steps': 0, 'log_level': -1, 'log_level_replica': -1, 'log_on_each_node': True, 'logging_dir': './tmp/runs/Mar10_07-14-18_7dfa224f788e', 'logging_first_step': False, 'logging_steps': 500, 'logging_nan_inf_filter': True, 'save_steps': 0, 'save_total_limit': None, 'save_on_each_node': False, 'no_cuda': False, 'seed': 100, 'fp16': True, 'fp16_opt_level': 'O1', 'fp16_backend': 'auto', 'fp16_full_eval': False, 'local_rank': -1, 'xpu_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': True, 'eval_steps': None, 'dataloader_num_workers': 0, 'past_index': -1, 'run_name': None, 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'sharded_ddp': [], 'deepspeed': None, 'label_smoothing_factor': 0.0, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': [], 'ddp_find_unused_parameters': None, 'dataloader_pin_memory': True, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_token': None, 'gradient_checkpointing': False, 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': None, '_n_gpu': 1, 'mp_parameters': '', 'max_sequence_length': 20, 'shuffle_buffer_size': 0, 'data_loader_engine': 'merlin', 'eval_on_test_set': True, 'eval_steps_on_train_set': 20, 'predict_top_k': 0, 'learning_rate_num_cosine_cycles_by_epoch': 1.25, 'log_predictions': False, 'compute_metrics_each_n_steps': 1, 'experiments_group': 'default', 'session_seq_length_max': 20, 'learning_rate_schedule': 'linear_with_warmup', 'validate_every': -1}\n", + "[INFO|trainer.py:1196] 2023-03-10 07:14:21,846 >> ***** Running training *****\n", + "[INFO|trainer.py:1197] 2023-03-10 07:14:21,846 >> Num 
examples = 86528\n", + "[INFO|trainer.py:1198] 2023-03-10 07:14:21,846 >> Num Epochs = 5\n", + "[INFO|trainer.py:1199] 2023-03-10 07:14:21,846 >> Instantaneous batch size per device = 128\n", + "[INFO|trainer.py:1200] 2023-03-10 07:14:21,846 >> Total train batch size (w. parallel, distributed & accumulation) = 128\n", + "[INFO|trainer.py:1201] 2023-03-10 07:14:21,846 >> Gradient Accumulation steps = 1\n", + "[INFO|trainer.py:1202] 2023-03-10 07:14:21,846 >> Total optimization steps = 3380\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "DLL 2023-03-10 07:14:21.396153 - PARAMETER data_path : /transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/ features_schema_path : /workspace/examples/t4rec_paper_experiments/datasets_configs/ecom_rees46/rees46_schema.pbtxt start_time_window_index : 1 final_time_window_index : 2 time_window_folder_pad_digits : 4 no_incremental_training : False training_time_window_size : 0 use_side_information_features : False input_features_aggregation : concat model_type : xlnet tf_out_activation : tanh mlm : False mlm_probability : 0.15 plm : False plm_probability : 0.25 plm_max_span_length : 5 plm_mask_input : False plm_permute_all : False rtd : False rtd_sample_from_batch : False rtd_use_batch_interaction : False rtd_discriminator_loss_weight : 50 rtd_generator_loss_weight : 1 rtd_tied_generator : False d_model : 192 n_layer : 3 n_head : 16 layer_norm_eps : 1e-12 initializer_range : 0.02 hidden_act : gelu dropout : 0.0 summary_type : last num_hidden_groups : 1 inner_group_num : 1 eval_on_last_item_seq_only : True train_on_last_item_seq_only : False mf_constrained_embeddings : True item_embedding_dim : 448 numeric_features_project_to_embedding_dim : 0 numeric_features_soft_one_hot_encoding_num_embeddings : 0 stochastic_shared_embeddings_replacement_prob : 0.1 softmax_temperature : 1.0 label_smoothing : 0.0 embedding_dim_from_cardinality_multiplier : 2.0 item_id_embeddings_init_std : 0.11 
other_embeddings_init_std : 0.02 layer_norm_featurewise : True attn_type : bi input_dropout : 0.1 loss_type : cross_entropy similarity_type : concat_mlp inp_merge : mlp learning_rate_warmup_steps : 0 avg_session_length : None output_dir : ./tmp/ overwrite_output_dir : True do_train : True do_eval : True do_predict : False prediction_loss_only : False per_device_train_batch_size : 128 per_device_eval_batch_size : 128 per_gpu_train_batch_size : None per_gpu_eval_batch_size : None gradient_accumulation_steps : 1 eval_accumulation_steps : None learning_rate : 0.0006667377132554976 weight_decay : 3.910060265627374e-05 adam_beta1 : 0.9 adam_beta2 : 0.999 adam_epsilon : 1e-08 max_grad_norm : 1.0 num_train_epochs : 5.0 max_steps : -1 lr_scheduler_type : linear warmup_ratio : 0.0 warmup_steps : 0 log_level : -1 log_level_replica : -1 log_on_each_node : True logging_dir : ./tmp/runs/Mar10_07-14-18_7dfa224f788e logging_first_step : False logging_steps : 500 logging_nan_inf_filter : True save_steps : 0 save_total_limit : None save_on_each_node : False no_cuda : False seed : 100 fp16 : True fp16_opt_level : O1 fp16_backend : auto fp16_full_eval : False local_rank : -1 xpu_backend : None tpu_num_cores : None tpu_metrics_debug : False debug : [] dataloader_drop_last : True eval_steps : None dataloader_num_workers : 0 past_index : -1 run_name : None disable_tqdm : False remove_unused_columns : True label_names : None load_best_model_at_end : False metric_for_best_model : None greater_is_better : None ignore_data_skip : False sharded_ddp : [] deepspeed : None label_smoothing_factor : 0.0 adafactor : False group_by_length : False length_column_name : length report_to : [] ddp_find_unused_parameters : None dataloader_pin_memory : True skip_memory_metrics : True use_legacy_prediction_loop : False push_to_hub : False resume_from_checkpoint : None hub_model_id : None hub_token : None gradient_checkpointing : False push_to_hub_model_id : None push_to_hub_organization : None 
push_to_hub_token : None _n_gpu : 1 mp_parameters : max_sequence_length : 20 shuffle_buffer_size : 0 data_loader_engine : merlin eval_on_test_set : True eval_steps_on_train_set : 20 predict_top_k : 0 learning_rate_num_cosine_cycles_by_epoch : 1.25 log_predictions : False compute_metrics_each_n_steps : 1 experiments_group : default session_seq_length_max : 20 learning_rate_schedule : linear_with_warmup validate_every : -1 \n", + "\n", + "***** Launch training for day 1: *****\n", + "{'loss': 6.6475, 'learning_rate': 0.0005681078740165187, 'epoch': 0.74}\n", + "{'loss': 2.4218, 'learning_rate': 0.00046947803477753974, 'epoch': 1.48}\n", + "{'loss': 1.9739, 'learning_rate': 0.0003708481955385607, 'epoch': 2.22}\n", + "{'loss': 1.858, 'learning_rate': 0.0002722183562995818, 'epoch': 2.96}\n", + "{'loss': 1.7681, 'learning_rate': 0.0001735885170606029, 'epoch': 3.7}\n", + "{'loss': 1.7082, 'learning_rate': 7.4958677821624e-05, 'epoch': 4.44}\n", + "{'train_runtime': 337.7129, 'train_samples_per_second': 0.015, 'train_steps_per_second': 10.009, 'train_loss': 2.6110303890070266, 'epoch': 5.0}\n", + "\n", + "***** Evaluation results for day 2 (train set):*****\n", + "\n", + "{'train_/next-item/ndcg_at_10': 0.11942513287067413, 'train_/next-item/ndcg_at_20': 0.132638081908226, 'train_/next-item/recall_at_10': 0.19570313394069672, 'train_/next-item/recall_at_20': 0.24843750894069672, 'train_/loss': 7.65301513671875, 'train_runtime': 0.6484, 'train_samples_per_second': 3948.127, 'train_steps_per_second': 30.845}\n", + "\n", + "***** Evaluation results for day 2 (eval set):*****\n", + "\n", + "{'eval_/next-item/ndcg_at_10': 0.08762501925230026, 'eval_/next-item/ndcg_at_20': 0.09899389743804932, 'eval_/next-item/recall_at_10': 0.14928463101387024, 'eval_/next-item/recall_at_20': 0.1945594847202301, 'eval_/loss': 8.972926139831543, 'eval_runtime': 2.286, 'eval_samples_per_second': 4647.323, 'eval_steps_per_second': 36.307}\n" + ] + }, + { + "name": "stderr", + "output_type": 
"stream", + "text": [ + "Task exception was never retrieved\n", + "future: ._handle_stream() done, defined at /usr/local/lib/python3.8/dist-packages/IPython/core/magics/script.py:211> exception=ValueError('Separator is not found, and chunk exceed the limit')>\n", + "Traceback (most recent call last):\n", + " File \"/usr/lib/python3.8/asyncio/streams.py\", line 540, in readline\n", + " line = await self.readuntil(sep)\n", + " File \"/usr/lib/python3.8/asyncio/streams.py\", line 618, in readuntil\n", + " raise exceptions.LimitOverrunError(\n", + "asyncio.exceptions.LimitOverrunError: Separator is not found, and chunk exceed the limit\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/usr/local/lib/python3.8/dist-packages/IPython/core/magics/script.py\", line 213, in _handle_stream\n", + " line = (await stream.readline()).decode(\"utf8\")\n", + " File \"/usr/lib/python3.8/asyncio/streams.py\", line 549, in readline\n", + " raise ValueError(e.args[0])\n", + "ValueError: Separator is not found, and chunk exceed the limit\n" + ] + } + ], + "source": [ + "%%bash\n", + "\n", + "FEATURE_SCHEMA_PATH=/workspace/examples/t4rec_paper_experiments/datasets_configs/ecom_rees46/rees46_schema.pbtxt\n", + "DATA_PATH=/transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/\n", + "NUM_EPOCHS=5\n", + "\n", + "python3 ../transf_exp_main_modified.py --output_dir ./tmp/ --overwrite_output_dir --do_train --do_eval --save_steps 0 --data_path $DATA_PATH --features_schema_path $FEATURE_SCHEMA_PATH --fp16 --data_loader_engine merlin --start_time_window_index 1 --final_time_window_index 2 --time_window_folder_pad_digits 4 --model_type xlnet --loss_type cross_entropy --per_device_eval_batch_size 128 --similarity_type concat_mlp --tf_out_activation tanh --inp_merge mlp --learning_rate_warmup_steps 0 --learning_rate_schedule linear_with_warmup --hidden_act gelu --num_train_epochs $NUM_EPOCHS 
--dataloader_drop_last --compute_metrics_each_n_steps 1 --session_seq_length_max 20 --eval_on_last_item_seq_only --mf_constrained_embeddings --layer_norm_featurewise --attn_type bi --per_device_train_batch_size 128 --learning_rate 0.0006667377132554976 --dropout 0.0 --input_dropout 0.1 --weight_decay 3.910060265627374e-05 --d_model 192 --item_embedding_dim 448 --n_layer 3 --n_head 16 --label_smoothing 0.0 --stochastic_shared_embeddings_replacement_prob 0.1 --item_id_embeddings_init_std 0.11 --other_embeddings_init_std 0.02 --eval_on_test_set --seed 100 --report_to none\n", + "exit 0" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "770b3d58", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "03/10/2023 07:20:26 - WARNING - __main__ - Process rank: -1, device: cuda:0, n_gpu: 1, distributed training: False, 16-bits training: True\n", + "03/10/2023 07:20:27 - WARNING - transformers4rec - Projecting inputs of NextItemPredictionTask to'448' As weight tying requires the input dimension '192' to be equal to the item-id embedding dimension '448'\n", + "[INFO|trainer.py:434] 2023-03-10 07:20:28,053 >> Using amp fp16 backend\n", + "03/10/2023 07:20:28 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - Training, Model and Data parameters {'data_path': '/transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/', 'features_schema_path': '/workspace/examples/t4rec_paper_experiments/datasets_configs/ecom_rees46/rees46_schema.pbtxt', 'start_time_window_index': 1, 'final_time_window_index': 2, 'time_window_folder_pad_digits': 4, 'no_incremental_training': False, 'training_time_window_size': 0, 'use_side_information_features': False, 'input_features_aggregation': 'concat', 'model_type': 'xlnet', 'tf_out_activation': 'tanh', 'mlm': False, 'mlm_probability': 0.15, 'plm': False, 'plm_probability': 0.25, 'plm_max_span_length': 5, 'plm_mask_input': False, 'plm_permute_all': False, 'rtd': 
False, 'rtd_sample_from_batch': False, 'rtd_use_batch_interaction': False, 'rtd_discriminator_loss_weight': 50, 'rtd_generator_loss_weight': 1, 'rtd_tied_generator': False, 'd_model': 192, 'n_layer': 3, 'n_head': 16, 'layer_norm_eps': 1e-12, 'initializer_range': 0.02, 'hidden_act': 'gelu', 'dropout': 0.0, 'summary_type': 'last', 'num_hidden_groups': 1, 'inner_group_num': 1, 'eval_on_last_item_seq_only': True, 'train_on_last_item_seq_only': False, 'mf_constrained_embeddings': True, 'item_embedding_dim': 448, 'numeric_features_project_to_embedding_dim': 0, 'numeric_features_soft_one_hot_encoding_num_embeddings': 0, 'stochastic_shared_embeddings_replacement_prob': 0.1, 'softmax_temperature': 1.0, 'label_smoothing': 0.0, 'embedding_dim_from_cardinality_multiplier': 2.0, 'item_id_embeddings_init_std': 0.11, 'other_embeddings_init_std': 0.02, 'layer_norm_featurewise': True, 'attn_type': 'bi', 'input_dropout': 0.1, 'loss_type': 'cross_entropy', 'similarity_type': 'concat_mlp', 'inp_merge': 'mlp', 'learning_rate_warmup_steps': 0, 'avg_session_length': None, 'output_dir': './tmp/', 'overwrite_output_dir': True, 'do_train': True, 'do_eval': True, 'do_predict': False, 'prediction_loss_only': False, 'per_device_train_batch_size': 128, 'per_device_eval_batch_size': 128, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'learning_rate': 0.0006667377132554976, 'weight_decay': 3.910060265627374e-05, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 5.0, 'max_steps': -1, 'lr_scheduler_type': 'linear', 'warmup_ratio': 0.0, 'warmup_steps': 0, 'log_level': -1, 'log_level_replica': -1, 'log_on_each_node': True, 'logging_dir': './tmp/runs/Mar10_07-20-25_7dfa224f788e', 'logging_first_step': False, 'logging_steps': 500, 'logging_nan_inf_filter': True, 'save_steps': 0, 'save_total_limit': None, 'save_on_each_node': False, 'no_cuda': False, 'seed': 0, 
'fp16': True, 'fp16_opt_level': 'O1', 'fp16_backend': 'auto', 'fp16_full_eval': False, 'local_rank': -1, 'xpu_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': True, 'eval_steps': None, 'dataloader_num_workers': 0, 'past_index': -1, 'run_name': None, 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'sharded_ddp': [], 'deepspeed': None, 'label_smoothing_factor': 0.0, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': [], 'ddp_find_unused_parameters': None, 'dataloader_pin_memory': True, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_token': None, 'gradient_checkpointing': False, 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': None, '_n_gpu': 1, 'mp_parameters': '', 'max_sequence_length': 20, 'shuffle_buffer_size': 0, 'data_loader_engine': 'merlin', 'eval_on_test_set': True, 'eval_steps_on_train_set': 20, 'predict_top_k': 0, 'learning_rate_num_cosine_cycles_by_epoch': 1.25, 'log_predictions': False, 'compute_metrics_each_n_steps': 1, 'experiments_group': 'default', 'session_seq_length_max': 20, 'learning_rate_schedule': 'linear_with_warmup', 'validate_every': -1}\n", + "[INFO|trainer.py:1196] 2023-03-10 07:20:28,547 >> ***** Running training *****\n", + "[INFO|trainer.py:1197] 2023-03-10 07:20:28,547 >> Num examples = 86528\n", + "[INFO|trainer.py:1198] 2023-03-10 07:20:28,547 >> Num Epochs = 5\n", + "[INFO|trainer.py:1199] 2023-03-10 07:20:28,547 >> Instantaneous batch size per device = 128\n", + "[INFO|trainer.py:1200] 2023-03-10 07:20:28,547 >> Total train batch size (w. 
parallel, distributed & accumulation) = 128\n", + "[INFO|trainer.py:1201] 2023-03-10 07:20:28,547 >> Gradient Accumulation steps = 1\n", + "[INFO|trainer.py:1202] 2023-03-10 07:20:28,547 >> Total optimization steps = 3380\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "DLL 2023-03-10 07:20:28.053983 - PARAMETER data_path : /transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/ features_schema_path : /workspace/examples/t4rec_paper_experiments/datasets_configs/ecom_rees46/rees46_schema.pbtxt start_time_window_index : 1 final_time_window_index : 2 time_window_folder_pad_digits : 4 no_incremental_training : False training_time_window_size : 0 use_side_information_features : False input_features_aggregation : concat model_type : xlnet tf_out_activation : tanh mlm : False mlm_probability : 0.15 plm : False plm_probability : 0.25 plm_max_span_length : 5 plm_mask_input : False plm_permute_all : False rtd : False rtd_sample_from_batch : False rtd_use_batch_interaction : False rtd_discriminator_loss_weight : 50 rtd_generator_loss_weight : 1 rtd_tied_generator : False d_model : 192 n_layer : 3 n_head : 16 layer_norm_eps : 1e-12 initializer_range : 0.02 hidden_act : gelu dropout : 0.0 summary_type : last num_hidden_groups : 1 inner_group_num : 1 eval_on_last_item_seq_only : True train_on_last_item_seq_only : False mf_constrained_embeddings : True item_embedding_dim : 448 numeric_features_project_to_embedding_dim : 0 numeric_features_soft_one_hot_encoding_num_embeddings : 0 stochastic_shared_embeddings_replacement_prob : 0.1 softmax_temperature : 1.0 label_smoothing : 0.0 embedding_dim_from_cardinality_multiplier : 2.0 item_id_embeddings_init_std : 0.11 other_embeddings_init_std : 0.02 layer_norm_featurewise : True attn_type : bi input_dropout : 0.1 loss_type : cross_entropy similarity_type : concat_mlp inp_merge : mlp learning_rate_warmup_steps : 0 avg_session_length : None output_dir : ./tmp/ overwrite_output_dir : True do_train : 
True do_eval : True do_predict : False prediction_loss_only : False per_device_train_batch_size : 128 per_device_eval_batch_size : 128 per_gpu_train_batch_size : None per_gpu_eval_batch_size : None gradient_accumulation_steps : 1 eval_accumulation_steps : None learning_rate : 0.0006667377132554976 weight_decay : 3.910060265627374e-05 adam_beta1 : 0.9 adam_beta2 : 0.999 adam_epsilon : 1e-08 max_grad_norm : 1.0 num_train_epochs : 5.0 max_steps : -1 lr_scheduler_type : linear warmup_ratio : 0.0 warmup_steps : 0 log_level : -1 log_level_replica : -1 log_on_each_node : True logging_dir : ./tmp/runs/Mar10_07-20-25_7dfa224f788e logging_first_step : False logging_steps : 500 logging_nan_inf_filter : True save_steps : 0 save_total_limit : None save_on_each_node : False no_cuda : False seed : 0 fp16 : True fp16_opt_level : O1 fp16_backend : auto fp16_full_eval : False local_rank : -1 xpu_backend : None tpu_num_cores : None tpu_metrics_debug : False debug : [] dataloader_drop_last : True eval_steps : None dataloader_num_workers : 0 past_index : -1 run_name : None disable_tqdm : False remove_unused_columns : True label_names : None load_best_model_at_end : False metric_for_best_model : None greater_is_better : None ignore_data_skip : False sharded_ddp : [] deepspeed : None label_smoothing_factor : 0.0 adafactor : False group_by_length : False length_column_name : length report_to : [] ddp_find_unused_parameters : None dataloader_pin_memory : True skip_memory_metrics : True use_legacy_prediction_loop : False push_to_hub : False resume_from_checkpoint : None hub_model_id : None hub_token : None gradient_checkpointing : False push_to_hub_model_id : None push_to_hub_organization : None push_to_hub_token : None _n_gpu : 1 mp_parameters : max_sequence_length : 20 shuffle_buffer_size : 0 data_loader_engine : merlin eval_on_test_set : True eval_steps_on_train_set : 20 predict_top_k : 0 learning_rate_num_cosine_cycles_by_epoch : 1.25 log_predictions : False compute_metrics_each_n_steps 
: 1 experiments_group : default session_seq_length_max : 20 learning_rate_schedule : linear_with_warmup validate_every : -1 \n", + "\n", + "***** Launch training for day 1: *****\n", + "{'loss': 6.6784, 'learning_rate': 0.0005681078740165187, 'epoch': 0.74}\n", + "{'loss': 2.4454, 'learning_rate': 0.00046947803477753974, 'epoch': 1.48}\n", + "{'loss': 1.9754, 'learning_rate': 0.0003708481955385607, 'epoch': 2.22}\n", + "{'loss': 1.8708, 'learning_rate': 0.0002722183562995818, 'epoch': 2.96}\n", + "{'loss': 1.7958, 'learning_rate': 0.0001735885170606029, 'epoch': 3.7}\n", + "{'loss': 1.7419, 'learning_rate': 7.4958677821624e-05, 'epoch': 4.44}\n", + "{'train_runtime': 337.8916, 'train_samples_per_second': 0.015, 'train_steps_per_second': 10.003, 'train_loss': 2.6348056228908563, 'epoch': 5.0}\n", + "\n", + "***** Evaluation results for day 2 (train set):*****\n", + "\n", + "{'train_/next-item/ndcg_at_10': 0.12325046211481094, 'train_/next-item/ndcg_at_20': 0.137022003531456, 'train_/next-item/recall_at_10': 0.19960938394069672, 'train_/next-item/recall_at_20': 0.25390625, 'train_/loss': 7.624682426452637, 'train_runtime': 0.6451, 'train_samples_per_second': 3968.659, 'train_steps_per_second': 31.005}\n", + "\n", + "***** Evaluation results for day 2 (eval set):*****\n", + "\n", + "{'eval_/next-item/ndcg_at_10': 0.08722994476556778, 'eval_/next-item/ndcg_at_20': 0.0991109311580658, 'eval_/next-item/recall_at_10': 0.145143061876297, 'eval_/next-item/recall_at_20': 0.1927710771560669, 'eval_/loss': 9.022594451904297, 'eval_runtime': 2.2602, 'eval_samples_per_second': 4700.388, 'eval_steps_per_second': 36.722}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Task exception was never retrieved\n", + "future: ._handle_stream() done, defined at /usr/local/lib/python3.8/dist-packages/IPython/core/magics/script.py:211> exception=ValueError('Separator is not found, and chunk exceed the limit')>\n", + "Traceback (most recent call last):\n", + " 
File \"/usr/lib/python3.8/asyncio/streams.py\", line 540, in readline\n", + " line = await self.readuntil(sep)\n", + " File \"/usr/lib/python3.8/asyncio/streams.py\", line 618, in readuntil\n", + " raise exceptions.LimitOverrunError(\n", + "asyncio.exceptions.LimitOverrunError: Separator is not found, and chunk exceed the limit\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/usr/local/lib/python3.8/dist-packages/IPython/core/magics/script.py\", line 213, in _handle_stream\n", + " line = (await stream.readline()).decode(\"utf8\")\n", + " File \"/usr/lib/python3.8/asyncio/streams.py\", line 549, in readline\n", + " raise ValueError(e.args[0])\n", + "ValueError: Separator is not found, and chunk exceed the limit\n" + ] + } + ], + "source": [ + "%%bash\n", + "\n", + "FEATURE_SCHEMA_PATH=/workspace/examples/t4rec_paper_experiments/datasets_configs/ecom_rees46/rees46_schema.pbtxt\n", + "DATA_PATH=/transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/\n", + "NUM_EPOCHS=5\n", + "SEED=0\n", + "\n", + "python3 ../transf_exp_main_modified.py --output_dir ./tmp/ --overwrite_output_dir --do_train --do_eval --save_steps 0 --data_path $DATA_PATH --features_schema_path $FEATURE_SCHEMA_PATH --fp16 --data_loader_engine merlin --start_time_window_index 1 --final_time_window_index 2 --time_window_folder_pad_digits 4 --model_type xlnet --loss_type cross_entropy --per_device_eval_batch_size 128 --similarity_type concat_mlp --tf_out_activation tanh --inp_merge mlp --learning_rate_warmup_steps 0 --learning_rate_schedule linear_with_warmup --hidden_act gelu --num_train_epochs $NUM_EPOCHS --dataloader_drop_last --compute_metrics_each_n_steps 1 --session_seq_length_max 20 --eval_on_last_item_seq_only --mf_constrained_embeddings --layer_norm_featurewise --attn_type bi --per_device_train_batch_size 128 --learning_rate 0.0006667377132554976 --dropout 0.0 --input_dropout 0.1 
--weight_decay 3.910060265627374e-05 --d_model 192 --item_embedding_dim 448 --n_layer 3 --n_head 16 --label_smoothing 0.0 --stochastic_shared_embeddings_replacement_prob 0.1 --item_id_embeddings_init_std 0.11 --other_embeddings_init_std 0.02 --eval_on_test_set --seed $SEED --report_to none\n", + "exit 0" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "32e29315", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "03/10/2023 07:26:15 - WARNING - __main__ - Process rank: -1, device: cuda:0, n_gpu: 1, distributed training: False, 16-bits training: True\n", + "03/10/2023 07:26:17 - WARNING - transformers4rec - Projecting inputs of NextItemPredictionTask to'448' As weight tying requires the input dimension '192' to be equal to the item-id embedding dimension '448'\n", + "[INFO|trainer.py:434] 2023-03-10 07:26:17,384 >> Using amp fp16 backend\n", + "03/10/2023 07:26:17 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - Training, Model and Data parameters {'data_path': '/transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/', 'features_schema_path': '/workspace/examples/t4rec_paper_experiments/datasets_configs/ecom_rees46/rees46_schema.pbtxt', 'start_time_window_index': 1, 'final_time_window_index': 2, 'time_window_folder_pad_digits': 4, 'no_incremental_training': False, 'training_time_window_size': 0, 'use_side_information_features': False, 'input_features_aggregation': 'concat', 'model_type': 'xlnet', 'tf_out_activation': 'tanh', 'mlm': False, 'mlm_probability': 0.15, 'plm': False, 'plm_probability': 0.25, 'plm_max_span_length': 5, 'plm_mask_input': False, 'plm_permute_all': False, 'rtd': False, 'rtd_sample_from_batch': False, 'rtd_use_batch_interaction': False, 'rtd_discriminator_loss_weight': 50, 'rtd_generator_loss_weight': 1, 'rtd_tied_generator': False, 'd_model': 192, 'n_layer': 3, 'n_head': 16, 'layer_norm_eps': 1e-12, 'initializer_range': 0.02, 'hidden_act': 
'gelu', 'dropout': 0.0, 'summary_type': 'last', 'num_hidden_groups': 1, 'inner_group_num': 1, 'eval_on_last_item_seq_only': True, 'train_on_last_item_seq_only': False, 'mf_constrained_embeddings': True, 'item_embedding_dim': 448, 'numeric_features_project_to_embedding_dim': 0, 'numeric_features_soft_one_hot_encoding_num_embeddings': 0, 'stochastic_shared_embeddings_replacement_prob': 0.1, 'softmax_temperature': 1.0, 'label_smoothing': 0.0, 'embedding_dim_from_cardinality_multiplier': 2.0, 'item_id_embeddings_init_std': 0.11, 'other_embeddings_init_std': 0.02, 'layer_norm_featurewise': True, 'attn_type': 'bi', 'input_dropout': 0.1, 'loss_type': 'cross_entropy', 'similarity_type': 'concat_mlp', 'inp_merge': 'mlp', 'learning_rate_warmup_steps': 0, 'avg_session_length': None, 'output_dir': './tmp/', 'overwrite_output_dir': True, 'do_train': True, 'do_eval': True, 'do_predict': False, 'prediction_loss_only': False, 'per_device_train_batch_size': 128, 'per_device_eval_batch_size': 128, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'learning_rate': 0.0006667377132554976, 'weight_decay': 3.910060265627374e-05, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 5.0, 'max_steps': -1, 'lr_scheduler_type': 'linear', 'warmup_ratio': 0.0, 'warmup_steps': 0, 'log_level': -1, 'log_level_replica': -1, 'log_on_each_node': True, 'logging_dir': './tmp/runs/Mar10_07-26-14_7dfa224f788e', 'logging_first_step': False, 'logging_steps': 500, 'logging_nan_inf_filter': True, 'save_steps': 0, 'save_total_limit': None, 'save_on_each_node': False, 'no_cuda': False, 'seed': 1, 'fp16': True, 'fp16_opt_level': 'O1', 'fp16_backend': 'auto', 'fp16_full_eval': False, 'local_rank': -1, 'xpu_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': True, 'eval_steps': None, 'dataloader_num_workers': 0, 'past_index': 
-1, 'run_name': None, 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'sharded_ddp': [], 'deepspeed': None, 'label_smoothing_factor': 0.0, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': [], 'ddp_find_unused_parameters': None, 'dataloader_pin_memory': True, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_token': None, 'gradient_checkpointing': False, 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': None, '_n_gpu': 1, 'mp_parameters': '', 'max_sequence_length': 20, 'shuffle_buffer_size': 0, 'data_loader_engine': 'merlin', 'eval_on_test_set': True, 'eval_steps_on_train_set': 20, 'predict_top_k': 0, 'learning_rate_num_cosine_cycles_by_epoch': 1.25, 'log_predictions': False, 'compute_metrics_each_n_steps': 1, 'experiments_group': 'default', 'session_seq_length_max': 20, 'learning_rate_schedule': 'linear_with_warmup', 'validate_every': -1}\n", + "[INFO|trainer.py:1196] 2023-03-10 07:26:17,872 >> ***** Running training *****\n", + "[INFO|trainer.py:1197] 2023-03-10 07:26:17,872 >> Num examples = 86528\n", + "[INFO|trainer.py:1198] 2023-03-10 07:26:17,872 >> Num Epochs = 5\n", + "[INFO|trainer.py:1199] 2023-03-10 07:26:17,872 >> Instantaneous batch size per device = 128\n", + "[INFO|trainer.py:1200] 2023-03-10 07:26:17,872 >> Total train batch size (w. 
parallel, distributed & accumulation) = 128\n", + "[INFO|trainer.py:1201] 2023-03-10 07:26:17,872 >> Gradient Accumulation steps = 1\n", + "[INFO|trainer.py:1202] 2023-03-10 07:26:17,872 >> Total optimization steps = 3380\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "DLL 2023-03-10 07:26:17.385530 - PARAMETER data_path : /transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/ features_schema_path : /workspace/examples/t4rec_paper_experiments/datasets_configs/ecom_rees46/rees46_schema.pbtxt start_time_window_index : 1 final_time_window_index : 2 time_window_folder_pad_digits : 4 no_incremental_training : False training_time_window_size : 0 use_side_information_features : False input_features_aggregation : concat model_type : xlnet tf_out_activation : tanh mlm : False mlm_probability : 0.15 plm : False plm_probability : 0.25 plm_max_span_length : 5 plm_mask_input : False plm_permute_all : False rtd : False rtd_sample_from_batch : False rtd_use_batch_interaction : False rtd_discriminator_loss_weight : 50 rtd_generator_loss_weight : 1 rtd_tied_generator : False d_model : 192 n_layer : 3 n_head : 16 layer_norm_eps : 1e-12 initializer_range : 0.02 hidden_act : gelu dropout : 0.0 summary_type : last num_hidden_groups : 1 inner_group_num : 1 eval_on_last_item_seq_only : True train_on_last_item_seq_only : False mf_constrained_embeddings : True item_embedding_dim : 448 numeric_features_project_to_embedding_dim : 0 numeric_features_soft_one_hot_encoding_num_embeddings : 0 stochastic_shared_embeddings_replacement_prob : 0.1 softmax_temperature : 1.0 label_smoothing : 0.0 embedding_dim_from_cardinality_multiplier : 2.0 item_id_embeddings_init_std : 0.11 other_embeddings_init_std : 0.02 layer_norm_featurewise : True attn_type : bi input_dropout : 0.1 loss_type : cross_entropy similarity_type : concat_mlp inp_merge : mlp learning_rate_warmup_steps : 0 avg_session_length : None output_dir : ./tmp/ overwrite_output_dir : True do_train : 
True do_eval : True do_predict : False prediction_loss_only : False per_device_train_batch_size : 128 per_device_eval_batch_size : 128 per_gpu_train_batch_size : None per_gpu_eval_batch_size : None gradient_accumulation_steps : 1 eval_accumulation_steps : None learning_rate : 0.0006667377132554976 weight_decay : 3.910060265627374e-05 adam_beta1 : 0.9 adam_beta2 : 0.999 adam_epsilon : 1e-08 max_grad_norm : 1.0 num_train_epochs : 5.0 max_steps : -1 lr_scheduler_type : linear warmup_ratio : 0.0 warmup_steps : 0 log_level : -1 log_level_replica : -1 log_on_each_node : True logging_dir : ./tmp/runs/Mar10_07-26-14_7dfa224f788e logging_first_step : False logging_steps : 500 logging_nan_inf_filter : True save_steps : 0 save_total_limit : None save_on_each_node : False no_cuda : False seed : 1 fp16 : True fp16_opt_level : O1 fp16_backend : auto fp16_full_eval : False local_rank : -1 xpu_backend : None tpu_num_cores : None tpu_metrics_debug : False debug : [] dataloader_drop_last : True eval_steps : None dataloader_num_workers : 0 past_index : -1 run_name : None disable_tqdm : False remove_unused_columns : True label_names : None load_best_model_at_end : False metric_for_best_model : None greater_is_better : None ignore_data_skip : False sharded_ddp : [] deepspeed : None label_smoothing_factor : 0.0 adafactor : False group_by_length : False length_column_name : length report_to : [] ddp_find_unused_parameters : None dataloader_pin_memory : True skip_memory_metrics : True use_legacy_prediction_loop : False push_to_hub : False resume_from_checkpoint : None hub_model_id : None hub_token : None gradient_checkpointing : False push_to_hub_model_id : None push_to_hub_organization : None push_to_hub_token : None _n_gpu : 1 mp_parameters : max_sequence_length : 20 shuffle_buffer_size : 0 data_loader_engine : merlin eval_on_test_set : True eval_steps_on_train_set : 20 predict_top_k : 0 learning_rate_num_cosine_cycles_by_epoch : 1.25 log_predictions : False compute_metrics_each_n_steps 
: 1 experiments_group : default session_seq_length_max : 20 learning_rate_schedule : linear_with_warmup validate_every : -1 \n", + "\n", + "***** Launch training for day 1: *****\n", + "{'loss': 6.6644, 'learning_rate': 0.0005681078740165187, 'epoch': 0.74}\n", + "{'loss': 2.3778, 'learning_rate': 0.00046947803477753974, 'epoch': 1.48}\n", + "{'loss': 1.9486, 'learning_rate': 0.0003708481955385607, 'epoch': 2.22}\n", + "{'loss': 1.8619, 'learning_rate': 0.0002722183562995818, 'epoch': 2.96}\n", + "{'loss': 1.7841, 'learning_rate': 0.0001735885170606029, 'epoch': 3.7}\n", + "{'loss': 1.7368, 'learning_rate': 7.4958677821624e-05, 'epoch': 4.44}\n", + "{'train_runtime': 337.8499, 'train_samples_per_second': 0.015, 'train_steps_per_second': 10.004, 'train_loss': 2.613156272391596, 'epoch': 5.0}\n", + "\n", + "***** Evaluation results for day 2 (train set):*****\n", + "\n", + "{'train_/next-item/ndcg_at_10': 0.12271016836166382, 'train_/next-item/ndcg_at_20': 0.1380811482667923, 'train_/next-item/recall_at_10': 0.19843749701976776, 'train_/next-item/recall_at_20': 0.25859376788139343, 'train_/loss': 7.584864616394043, 'train_runtime': 0.6478, 'train_samples_per_second': 3951.826, 'train_steps_per_second': 30.874}\n", + "\n", + "***** Evaluation results for day 2 (eval set):*****\n", + "\n", + "{'eval_/next-item/ndcg_at_10': 0.08986585587263107, 'eval_/next-item/ndcg_at_20': 0.10167953372001648, 'eval_/next-item/recall_at_10': 0.14956700801849365, 'eval_/next-item/recall_at_20': 0.19663026928901672, 'eval_/loss': 9.000876426696777, 'eval_runtime': 2.2765, 'eval_samples_per_second': 4666.864, 'eval_steps_per_second': 36.46}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Task exception was never retrieved\n", + "future: ._handle_stream() done, defined at /usr/local/lib/python3.8/dist-packages/IPython/core/magics/script.py:211> exception=ValueError('Separator is not found, and chunk exceed the limit')>\n", + "Traceback (most recent call 
last):\n", + " File \"/usr/lib/python3.8/asyncio/streams.py\", line 540, in readline\n", + " line = await self.readuntil(sep)\n", + " File \"/usr/lib/python3.8/asyncio/streams.py\", line 618, in readuntil\n", + " raise exceptions.LimitOverrunError(\n", + "asyncio.exceptions.LimitOverrunError: Separator is not found, and chunk exceed the limit\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/usr/local/lib/python3.8/dist-packages/IPython/core/magics/script.py\", line 213, in _handle_stream\n", + " line = (await stream.readline()).decode(\"utf8\")\n", + " File \"/usr/lib/python3.8/asyncio/streams.py\", line 549, in readline\n", + " raise ValueError(e.args[0])\n", + "ValueError: Separator is not found, and chunk exceed the limit\n" + ] + } + ], + "source": [ + "%%bash\n", + "\n", + "FEATURE_SCHEMA_PATH=/workspace/examples/t4rec_paper_experiments/datasets_configs/ecom_rees46/rees46_schema.pbtxt\n", + "DATA_PATH=/transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/\n", + "NUM_EPOCHS=5\n", + "SEED=1\n", + "\n", + "python3 ../transf_exp_main_modified.py --output_dir ./tmp/ --overwrite_output_dir --do_train --do_eval --save_steps 0 --data_path $DATA_PATH --features_schema_path $FEATURE_SCHEMA_PATH --fp16 --data_loader_engine merlin --start_time_window_index 1 --final_time_window_index 2 --time_window_folder_pad_digits 4 --model_type xlnet --loss_type cross_entropy --per_device_eval_batch_size 128 --similarity_type concat_mlp --tf_out_activation tanh --inp_merge mlp --learning_rate_warmup_steps 0 --learning_rate_schedule linear_with_warmup --hidden_act gelu --num_train_epochs $NUM_EPOCHS --dataloader_drop_last --compute_metrics_each_n_steps 1 --session_seq_length_max 20 --eval_on_last_item_seq_only --mf_constrained_embeddings --layer_norm_featurewise --attn_type bi --per_device_train_batch_size 128 --learning_rate 0.0006667377132554976 --dropout 0.0 
--input_dropout 0.1 --weight_decay 3.910060265627374e-05 --d_model 192 --item_embedding_dim 448 --n_layer 3 --n_head 16 --label_smoothing 0.0 --stochastic_shared_embeddings_replacement_prob 0.1 --item_id_embeddings_init_std 0.11 --other_embeddings_init_std 0.02 --eval_on_test_set --seed $SEED --report_to none\n", + "exit 0" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "4b87e4f5", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "03/10/2023 07:32:05 - WARNING - __main__ - Process rank: -1, device: cuda:0, n_gpu: 1, distributed training: False, 16-bits training: True\n", + "03/10/2023 07:32:06 - WARNING - transformers4rec - Projecting inputs of NextItemPredictionTask to'448' As weight tying requires the input dimension '192' to be equal to the item-id embedding dimension '448'\n", + "[INFO|trainer.py:434] 2023-03-10 07:32:06,807 >> Using amp fp16 backend\n", + "03/10/2023 07:32:06 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - Training, Model and Data parameters {'data_path': '/transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/', 'features_schema_path': '/workspace/examples/t4rec_paper_experiments/datasets_configs/ecom_rees46/rees46_schema.pbtxt', 'start_time_window_index': 1, 'final_time_window_index': 2, 'time_window_folder_pad_digits': 4, 'no_incremental_training': False, 'training_time_window_size': 0, 'use_side_information_features': False, 'input_features_aggregation': 'concat', 'model_type': 'xlnet', 'tf_out_activation': 'tanh', 'mlm': False, 'mlm_probability': 0.15, 'plm': False, 'plm_probability': 0.25, 'plm_max_span_length': 5, 'plm_mask_input': False, 'plm_permute_all': False, 'rtd': False, 'rtd_sample_from_batch': False, 'rtd_use_batch_interaction': False, 'rtd_discriminator_loss_weight': 50, 'rtd_generator_loss_weight': 1, 'rtd_tied_generator': False, 'd_model': 192, 'n_layer': 3, 'n_head': 16, 'layer_norm_eps': 1e-12, 'initializer_range': 
0.02, 'hidden_act': 'gelu', 'dropout': 0.0, 'summary_type': 'last', 'num_hidden_groups': 1, 'inner_group_num': 1, 'eval_on_last_item_seq_only': True, 'train_on_last_item_seq_only': False, 'mf_constrained_embeddings': True, 'item_embedding_dim': 448, 'numeric_features_project_to_embedding_dim': 0, 'numeric_features_soft_one_hot_encoding_num_embeddings': 0, 'stochastic_shared_embeddings_replacement_prob': 0.1, 'softmax_temperature': 1.0, 'label_smoothing': 0.0, 'embedding_dim_from_cardinality_multiplier': 2.0, 'item_id_embeddings_init_std': 0.11, 'other_embeddings_init_std': 0.02, 'layer_norm_featurewise': True, 'attn_type': 'bi', 'input_dropout': 0.1, 'loss_type': 'cross_entropy', 'similarity_type': 'concat_mlp', 'inp_merge': 'mlp', 'learning_rate_warmup_steps': 0, 'avg_session_length': None, 'output_dir': './tmp/', 'overwrite_output_dir': True, 'do_train': True, 'do_eval': True, 'do_predict': False, 'prediction_loss_only': False, 'per_device_train_batch_size': 128, 'per_device_eval_batch_size': 128, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'learning_rate': 0.0006667377132554976, 'weight_decay': 3.910060265627374e-05, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 5.0, 'max_steps': -1, 'lr_scheduler_type': 'linear', 'warmup_ratio': 0.0, 'warmup_steps': 0, 'log_level': -1, 'log_level_replica': -1, 'log_on_each_node': True, 'logging_dir': './tmp/runs/Mar10_07-32-04_7dfa224f788e', 'logging_first_step': False, 'logging_steps': 500, 'logging_nan_inf_filter': True, 'save_steps': 0, 'save_total_limit': None, 'save_on_each_node': False, 'no_cuda': False, 'seed': 2, 'fp16': True, 'fp16_opt_level': 'O1', 'fp16_backend': 'auto', 'fp16_full_eval': False, 'local_rank': -1, 'xpu_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': True, 'eval_steps': None, 'dataloader_num_workers': 
0, 'past_index': -1, 'run_name': None, 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'sharded_ddp': [], 'deepspeed': None, 'label_smoothing_factor': 0.0, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': [], 'ddp_find_unused_parameters': None, 'dataloader_pin_memory': True, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_token': None, 'gradient_checkpointing': False, 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': None, '_n_gpu': 1, 'mp_parameters': '', 'max_sequence_length': 20, 'shuffle_buffer_size': 0, 'data_loader_engine': 'merlin', 'eval_on_test_set': True, 'eval_steps_on_train_set': 20, 'predict_top_k': 0, 'learning_rate_num_cosine_cycles_by_epoch': 1.25, 'log_predictions': False, 'compute_metrics_each_n_steps': 1, 'experiments_group': 'default', 'session_seq_length_max': 20, 'learning_rate_schedule': 'linear_with_warmup', 'validate_every': -1}\n", + "[INFO|trainer.py:1196] 2023-03-10 07:32:07,286 >> ***** Running training *****\n", + "[INFO|trainer.py:1197] 2023-03-10 07:32:07,286 >> Num examples = 86528\n", + "[INFO|trainer.py:1198] 2023-03-10 07:32:07,286 >> Num Epochs = 5\n", + "[INFO|trainer.py:1199] 2023-03-10 07:32:07,286 >> Instantaneous batch size per device = 128\n", + "[INFO|trainer.py:1200] 2023-03-10 07:32:07,286 >> Total train batch size (w. 
parallel, distributed & accumulation) = 128\n", + "[INFO|trainer.py:1201] 2023-03-10 07:32:07,286 >> Gradient Accumulation steps = 1\n", + "[INFO|trainer.py:1202] 2023-03-10 07:32:07,286 >> Total optimization steps = 3380\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "DLL 2023-03-10 07:32:06.807964 - PARAMETER data_path : /transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/ features_schema_path : /workspace/examples/t4rec_paper_experiments/datasets_configs/ecom_rees46/rees46_schema.pbtxt start_time_window_index : 1 final_time_window_index : 2 time_window_folder_pad_digits : 4 no_incremental_training : False training_time_window_size : 0 use_side_information_features : False input_features_aggregation : concat model_type : xlnet tf_out_activation : tanh mlm : False mlm_probability : 0.15 plm : False plm_probability : 0.25 plm_max_span_length : 5 plm_mask_input : False plm_permute_all : False rtd : False rtd_sample_from_batch : False rtd_use_batch_interaction : False rtd_discriminator_loss_weight : 50 rtd_generator_loss_weight : 1 rtd_tied_generator : False d_model : 192 n_layer : 3 n_head : 16 layer_norm_eps : 1e-12 initializer_range : 0.02 hidden_act : gelu dropout : 0.0 summary_type : last num_hidden_groups : 1 inner_group_num : 1 eval_on_last_item_seq_only : True train_on_last_item_seq_only : False mf_constrained_embeddings : True item_embedding_dim : 448 numeric_features_project_to_embedding_dim : 0 numeric_features_soft_one_hot_encoding_num_embeddings : 0 stochastic_shared_embeddings_replacement_prob : 0.1 softmax_temperature : 1.0 label_smoothing : 0.0 embedding_dim_from_cardinality_multiplier : 2.0 item_id_embeddings_init_std : 0.11 other_embeddings_init_std : 0.02 layer_norm_featurewise : True attn_type : bi input_dropout : 0.1 loss_type : cross_entropy similarity_type : concat_mlp inp_merge : mlp learning_rate_warmup_steps : 0 avg_session_length : None output_dir : ./tmp/ overwrite_output_dir : True do_train : 
True do_eval : True do_predict : False prediction_loss_only : False per_device_train_batch_size : 128 per_device_eval_batch_size : 128 per_gpu_train_batch_size : None per_gpu_eval_batch_size : None gradient_accumulation_steps : 1 eval_accumulation_steps : None learning_rate : 0.0006667377132554976 weight_decay : 3.910060265627374e-05 adam_beta1 : 0.9 adam_beta2 : 0.999 adam_epsilon : 1e-08 max_grad_norm : 1.0 num_train_epochs : 5.0 max_steps : -1 lr_scheduler_type : linear warmup_ratio : 0.0 warmup_steps : 0 log_level : -1 log_level_replica : -1 log_on_each_node : True logging_dir : ./tmp/runs/Mar10_07-32-04_7dfa224f788e logging_first_step : False logging_steps : 500 logging_nan_inf_filter : True save_steps : 0 save_total_limit : None save_on_each_node : False no_cuda : False seed : 2 fp16 : True fp16_opt_level : O1 fp16_backend : auto fp16_full_eval : False local_rank : -1 xpu_backend : None tpu_num_cores : None tpu_metrics_debug : False debug : [] dataloader_drop_last : True eval_steps : None dataloader_num_workers : 0 past_index : -1 run_name : None disable_tqdm : False remove_unused_columns : True label_names : None load_best_model_at_end : False metric_for_best_model : None greater_is_better : None ignore_data_skip : False sharded_ddp : [] deepspeed : None label_smoothing_factor : 0.0 adafactor : False group_by_length : False length_column_name : length report_to : [] ddp_find_unused_parameters : None dataloader_pin_memory : True skip_memory_metrics : True use_legacy_prediction_loop : False push_to_hub : False resume_from_checkpoint : None hub_model_id : None hub_token : None gradient_checkpointing : False push_to_hub_model_id : None push_to_hub_organization : None push_to_hub_token : None _n_gpu : 1 mp_parameters : max_sequence_length : 20 shuffle_buffer_size : 0 data_loader_engine : merlin eval_on_test_set : True eval_steps_on_train_set : 20 predict_top_k : 0 learning_rate_num_cosine_cycles_by_epoch : 1.25 log_predictions : False compute_metrics_each_n_steps 
: 1 experiments_group : default session_seq_length_max : 20 learning_rate_schedule : linear_with_warmup validate_every : -1 \n", + "\n", + "***** Launch training for day 1: *****\n", + "{'loss': 6.624, 'learning_rate': 0.0005681078740165187, 'epoch': 0.74}\n", + "{'loss': 2.3857, 'learning_rate': 0.00046947803477753974, 'epoch': 1.48}\n", + "{'loss': 1.962, 'learning_rate': 0.0003708481955385607, 'epoch': 2.22}\n", + "{'loss': 1.8662, 'learning_rate': 0.0002722183562995818, 'epoch': 2.96}\n", + "{'loss': 1.784, 'learning_rate': 0.0001735885170606029, 'epoch': 3.7}\n", + "{'loss': 1.7199, 'learning_rate': 7.4958677821624e-05, 'epoch': 4.44}\n", + "{'train_runtime': 337.7988, 'train_samples_per_second': 0.015, 'train_steps_per_second': 10.006, 'train_loss': 2.6095561732907266, 'epoch': 5.0}\n", + "\n", + "***** Evaluation results for day 2 (train set):*****\n", + "\n", + "{'train_/next-item/ndcg_at_10': 0.12347330152988434, 'train_/next-item/ndcg_at_20': 0.137996107339859, 'train_/next-item/recall_at_10': 0.20078125596046448, 'train_/next-item/recall_at_20': 0.25859376788139343, 'train_/loss': 7.582167148590088, 'train_runtime': 0.6507, 'train_samples_per_second': 3934.492, 'train_steps_per_second': 30.738}\n", + "\n", + "***** Evaluation results for day 2 (eval set):*****\n", + "\n", + "{'eval_/next-item/ndcg_at_10': 0.08896206319332123, 'eval_/next-item/ndcg_at_20': 0.10121969878673553, 'eval_/next-item/recall_at_10': 0.1499435156583786, 'eval_/next-item/recall_at_20': 0.1987951695919037, 'eval_/loss': 8.977458953857422, 'eval_runtime': 2.2602, 'eval_samples_per_second': 4700.428, 'eval_steps_per_second': 36.722}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Task exception was never retrieved\n", + "future: ._handle_stream() done, defined at /usr/local/lib/python3.8/dist-packages/IPython/core/magics/script.py:211> exception=ValueError('Separator is not found, and chunk exceed the limit')>\n", + "Traceback (most recent call last):\n", 
+ " File \"/usr/lib/python3.8/asyncio/streams.py\", line 540, in readline\n", + " line = await self.readuntil(sep)\n", + " File \"/usr/lib/python3.8/asyncio/streams.py\", line 618, in readuntil\n", + " raise exceptions.LimitOverrunError(\n", + "asyncio.exceptions.LimitOverrunError: Separator is not found, and chunk exceed the limit\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/usr/local/lib/python3.8/dist-packages/IPython/core/magics/script.py\", line 213, in _handle_stream\n", + " line = (await stream.readline()).decode(\"utf8\")\n", + " File \"/usr/lib/python3.8/asyncio/streams.py\", line 549, in readline\n", + " raise ValueError(e.args[0])\n", + "ValueError: Separator is not found, and chunk exceed the limit\n" + ] + } + ], + "source": [ + "%%bash\n", + "\n", + "FEATURE_SCHEMA_PATH=/workspace/examples/t4rec_paper_experiments/datasets_configs/ecom_rees46/rees46_schema.pbtxt\n", + "DATA_PATH=/transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/\n", + "NUM_EPOCHS=5\n", + "SEED=2\n", + "\n", + "python3 ../transf_exp_main_modified.py --output_dir ./tmp/ --overwrite_output_dir --do_train --do_eval --save_steps 0 --data_path $DATA_PATH --features_schema_path $FEATURE_SCHEMA_PATH --fp16 --data_loader_engine merlin --start_time_window_index 1 --final_time_window_index 2 --time_window_folder_pad_digits 4 --model_type xlnet --loss_type cross_entropy --per_device_eval_batch_size 128 --similarity_type concat_mlp --tf_out_activation tanh --inp_merge mlp --learning_rate_warmup_steps 0 --learning_rate_schedule linear_with_warmup --hidden_act gelu --num_train_epochs $NUM_EPOCHS --dataloader_drop_last --compute_metrics_each_n_steps 1 --session_seq_length_max 20 --eval_on_last_item_seq_only --mf_constrained_embeddings --layer_norm_featurewise --attn_type bi --per_device_train_batch_size 128 --learning_rate 0.0006667377132554976 --dropout 0.0 --input_dropout 0.1 
--weight_decay 3.910060265627374e-05 --d_model 192 --item_embedding_dim 448 --n_layer 3 --n_head 16 --label_smoothing 0.0 --stochastic_shared_embeddings_replacement_prob 0.1 --item_id_embeddings_init_std 0.11 --other_embeddings_init_std 0.02 --eval_on_test_set --seed $SEED --report_to none\n", + "exit 0" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "8c0d111b", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "03/10/2023 07:37:54 - WARNING - __main__ - Process rank: -1, device: cuda:0, n_gpu: 1, distributed training: False, 16-bits training: True\n", + "03/10/2023 07:37:56 - WARNING - transformers4rec - Projecting inputs of NextItemPredictionTask to'448' As weight tying requires the input dimension '192' to be equal to the item-id embedding dimension '448'\n", + "[INFO|trainer.py:434] 2023-03-10 07:37:56,124 >> Using amp fp16 backend\n", + "03/10/2023 07:37:56 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - Training, Model and Data parameters {'data_path': '/transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/', 'features_schema_path': '/workspace/examples/t4rec_paper_experiments/datasets_configs/ecom_rees46/rees46_schema.pbtxt', 'start_time_window_index': 1, 'final_time_window_index': 2, 'time_window_folder_pad_digits': 4, 'no_incremental_training': False, 'training_time_window_size': 0, 'use_side_information_features': False, 'input_features_aggregation': 'concat', 'model_type': 'xlnet', 'tf_out_activation': 'tanh', 'mlm': False, 'mlm_probability': 0.15, 'plm': False, 'plm_probability': 0.25, 'plm_max_span_length': 5, 'plm_mask_input': False, 'plm_permute_all': False, 'rtd': False, 'rtd_sample_from_batch': False, 'rtd_use_batch_interaction': False, 'rtd_discriminator_loss_weight': 50, 'rtd_generator_loss_weight': 1, 'rtd_tied_generator': False, 'd_model': 192, 'n_layer': 3, 'n_head': 16, 'layer_norm_eps': 1e-12, 'initializer_range': 0.02, 'hidden_act': 
'gelu', 'dropout': 0.0, 'summary_type': 'last', 'num_hidden_groups': 1, 'inner_group_num': 1, 'eval_on_last_item_seq_only': True, 'train_on_last_item_seq_only': False, 'mf_constrained_embeddings': True, 'item_embedding_dim': 448, 'numeric_features_project_to_embedding_dim': 0, 'numeric_features_soft_one_hot_encoding_num_embeddings': 0, 'stochastic_shared_embeddings_replacement_prob': 0.1, 'softmax_temperature': 1.0, 'label_smoothing': 0.0, 'embedding_dim_from_cardinality_multiplier': 2.0, 'item_id_embeddings_init_std': 0.11, 'other_embeddings_init_std': 0.02, 'layer_norm_featurewise': True, 'attn_type': 'bi', 'input_dropout': 0.1, 'loss_type': 'cross_entropy', 'similarity_type': 'concat_mlp', 'inp_merge': 'mlp', 'learning_rate_warmup_steps': 0, 'avg_session_length': None, 'output_dir': './tmp/', 'overwrite_output_dir': True, 'do_train': True, 'do_eval': True, 'do_predict': False, 'prediction_loss_only': False, 'per_device_train_batch_size': 128, 'per_device_eval_batch_size': 128, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'learning_rate': 0.0006667377132554976, 'weight_decay': 3.910060265627374e-05, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 5.0, 'max_steps': -1, 'lr_scheduler_type': 'linear', 'warmup_ratio': 0.0, 'warmup_steps': 0, 'log_level': -1, 'log_level_replica': -1, 'log_on_each_node': True, 'logging_dir': './tmp/runs/Mar10_07-37-53_7dfa224f788e', 'logging_first_step': False, 'logging_steps': 500, 'logging_nan_inf_filter': True, 'save_steps': 0, 'save_total_limit': None, 'save_on_each_node': False, 'no_cuda': False, 'seed': 3, 'fp16': True, 'fp16_opt_level': 'O1', 'fp16_backend': 'auto', 'fp16_full_eval': False, 'local_rank': -1, 'xpu_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': True, 'eval_steps': None, 'dataloader_num_workers': 0, 'past_index': 
-1, 'run_name': None, 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'sharded_ddp': [], 'deepspeed': None, 'label_smoothing_factor': 0.0, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': [], 'ddp_find_unused_parameters': None, 'dataloader_pin_memory': True, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_token': None, 'gradient_checkpointing': False, 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': None, '_n_gpu': 1, 'mp_parameters': '', 'max_sequence_length': 20, 'shuffle_buffer_size': 0, 'data_loader_engine': 'merlin', 'eval_on_test_set': True, 'eval_steps_on_train_set': 20, 'predict_top_k': 0, 'learning_rate_num_cosine_cycles_by_epoch': 1.25, 'log_predictions': False, 'compute_metrics_each_n_steps': 1, 'experiments_group': 'default', 'session_seq_length_max': 20, 'learning_rate_schedule': 'linear_with_warmup', 'validate_every': -1}\n", + "[INFO|trainer.py:1196] 2023-03-10 07:37:56,606 >> ***** Running training *****\n", + "[INFO|trainer.py:1197] 2023-03-10 07:37:56,606 >> Num examples = 86528\n", + "[INFO|trainer.py:1198] 2023-03-10 07:37:56,606 >> Num Epochs = 5\n", + "[INFO|trainer.py:1199] 2023-03-10 07:37:56,606 >> Instantaneous batch size per device = 128\n", + "[INFO|trainer.py:1200] 2023-03-10 07:37:56,606 >> Total train batch size (w. 
parallel, distributed & accumulation) = 128\n", + "[INFO|trainer.py:1201] 2023-03-10 07:37:56,606 >> Gradient Accumulation steps = 1\n", + "[INFO|trainer.py:1202] 2023-03-10 07:37:56,606 >> Total optimization steps = 3380\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "DLL 2023-03-10 07:37:56.124693 - PARAMETER data_path : /transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/ features_schema_path : /workspace/examples/t4rec_paper_experiments/datasets_configs/ecom_rees46/rees46_schema.pbtxt start_time_window_index : 1 final_time_window_index : 2 time_window_folder_pad_digits : 4 no_incremental_training : False training_time_window_size : 0 use_side_information_features : False input_features_aggregation : concat model_type : xlnet tf_out_activation : tanh mlm : False mlm_probability : 0.15 plm : False plm_probability : 0.25 plm_max_span_length : 5 plm_mask_input : False plm_permute_all : False rtd : False rtd_sample_from_batch : False rtd_use_batch_interaction : False rtd_discriminator_loss_weight : 50 rtd_generator_loss_weight : 1 rtd_tied_generator : False d_model : 192 n_layer : 3 n_head : 16 layer_norm_eps : 1e-12 initializer_range : 0.02 hidden_act : gelu dropout : 0.0 summary_type : last num_hidden_groups : 1 inner_group_num : 1 eval_on_last_item_seq_only : True train_on_last_item_seq_only : False mf_constrained_embeddings : True item_embedding_dim : 448 numeric_features_project_to_embedding_dim : 0 numeric_features_soft_one_hot_encoding_num_embeddings : 0 stochastic_shared_embeddings_replacement_prob : 0.1 softmax_temperature : 1.0 label_smoothing : 0.0 embedding_dim_from_cardinality_multiplier : 2.0 item_id_embeddings_init_std : 0.11 other_embeddings_init_std : 0.02 layer_norm_featurewise : True attn_type : bi input_dropout : 0.1 loss_type : cross_entropy similarity_type : concat_mlp inp_merge : mlp learning_rate_warmup_steps : 0 avg_session_length : None output_dir : ./tmp/ overwrite_output_dir : True do_train : 
True do_eval : True do_predict : False prediction_loss_only : False per_device_train_batch_size : 128 per_device_eval_batch_size : 128 per_gpu_train_batch_size : None per_gpu_eval_batch_size : None gradient_accumulation_steps : 1 eval_accumulation_steps : None learning_rate : 0.0006667377132554976 weight_decay : 3.910060265627374e-05 adam_beta1 : 0.9 adam_beta2 : 0.999 adam_epsilon : 1e-08 max_grad_norm : 1.0 num_train_epochs : 5.0 max_steps : -1 lr_scheduler_type : linear warmup_ratio : 0.0 warmup_steps : 0 log_level : -1 log_level_replica : -1 log_on_each_node : True logging_dir : ./tmp/runs/Mar10_07-37-53_7dfa224f788e logging_first_step : False logging_steps : 500 logging_nan_inf_filter : True save_steps : 0 save_total_limit : None save_on_each_node : False no_cuda : False seed : 3 fp16 : True fp16_opt_level : O1 fp16_backend : auto fp16_full_eval : False local_rank : -1 xpu_backend : None tpu_num_cores : None tpu_metrics_debug : False debug : [] dataloader_drop_last : True eval_steps : None dataloader_num_workers : 0 past_index : -1 run_name : None disable_tqdm : False remove_unused_columns : True label_names : None load_best_model_at_end : False metric_for_best_model : None greater_is_better : None ignore_data_skip : False sharded_ddp : [] deepspeed : None label_smoothing_factor : 0.0 adafactor : False group_by_length : False length_column_name : length report_to : [] ddp_find_unused_parameters : None dataloader_pin_memory : True skip_memory_metrics : True use_legacy_prediction_loop : False push_to_hub : False resume_from_checkpoint : None hub_model_id : None hub_token : None gradient_checkpointing : False push_to_hub_model_id : None push_to_hub_organization : None push_to_hub_token : None _n_gpu : 1 mp_parameters : max_sequence_length : 20 shuffle_buffer_size : 0 data_loader_engine : merlin eval_on_test_set : True eval_steps_on_train_set : 20 predict_top_k : 0 learning_rate_num_cosine_cycles_by_epoch : 1.25 log_predictions : False compute_metrics_each_n_steps 
: 1 experiments_group : default session_seq_length_max : 20 learning_rate_schedule : linear_with_warmup validate_every : -1 \n", + "\n", + "***** Launch training for day 1: *****\n", + "{'loss': 6.7871, 'learning_rate': 0.0005681078740165187, 'epoch': 0.74}\n", + "{'loss': 2.4843, 'learning_rate': 0.00046947803477753974, 'epoch': 1.48}\n", + "{'loss': 1.9926, 'learning_rate': 0.0003708481955385607, 'epoch': 2.22}\n", + "{'loss': 1.872, 'learning_rate': 0.0002722183562995818, 'epoch': 2.96}\n", + "{'loss': 1.7884, 'learning_rate': 0.0001735885170606029, 'epoch': 3.7}\n", + "{'loss': 1.7512, 'learning_rate': 7.4958677821624e-05, 'epoch': 4.44}\n", + "{'train_runtime': 337.9889, 'train_samples_per_second': 0.015, 'train_steps_per_second': 10.0, 'train_loss': 2.6576913867476426, 'epoch': 5.0}\n", + "\n", + "***** Evaluation results for day 2 (train set):*****\n", + "\n", + "{'train_/next-item/ndcg_at_10': 0.12272482365369797, 'train_/next-item/ndcg_at_20': 0.13889344036579132, 'train_/next-item/recall_at_10': 0.1953125, 'train_/next-item/recall_at_20': 0.25859376788139343, 'train_/loss': 7.600445747375488, 'train_runtime': 0.6514, 'train_samples_per_second': 3930.07, 'train_steps_per_second': 30.704}\n", + "\n", + "***** Evaluation results for day 2 (eval set):*****\n", + "\n", + "{'eval_/next-item/ndcg_at_10': 0.08895686268806458, 'eval_/next-item/ndcg_at_20': 0.09970034658908844, 'eval_/next-item/recall_at_10': 0.14909638464450836, 'eval_/next-item/recall_at_20': 0.19145330786705017, 'eval_/loss': 9.008366584777832, 'eval_runtime': 2.2737, 'eval_samples_per_second': 4672.54, 'eval_steps_per_second': 36.504}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Task exception was never retrieved\n", + "future: ._handle_stream() done, defined at /usr/local/lib/python3.8/dist-packages/IPython/core/magics/script.py:211> exception=ValueError('Separator is not found, and chunk exceed the limit')>\n", + "Traceback (most recent call last):\n", + " 
File \"/usr/lib/python3.8/asyncio/streams.py\", line 540, in readline\n", + " line = await self.readuntil(sep)\n", + " File \"/usr/lib/python3.8/asyncio/streams.py\", line 618, in readuntil\n", + " raise exceptions.LimitOverrunError(\n", + "asyncio.exceptions.LimitOverrunError: Separator is not found, and chunk exceed the limit\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/usr/local/lib/python3.8/dist-packages/IPython/core/magics/script.py\", line 213, in _handle_stream\n", + " line = (await stream.readline()).decode(\"utf8\")\n", + " File \"/usr/lib/python3.8/asyncio/streams.py\", line 549, in readline\n", + " raise ValueError(e.args[0])\n", + "ValueError: Separator is not found, and chunk exceed the limit\n" + ] + } + ], + "source": [ + "%%bash\n", + "\n", + "FEATURE_SCHEMA_PATH=/workspace/examples/t4rec_paper_experiments/datasets_configs/ecom_rees46/rees46_schema.pbtxt\n", + "DATA_PATH=/transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/\n", + "NUM_EPOCHS=5\n", + "SEED=3\n", + "\n", + "python3 ../transf_exp_main_modified.py --output_dir ./tmp/ --overwrite_output_dir --do_train --do_eval --save_steps 0 --data_path $DATA_PATH --features_schema_path $FEATURE_SCHEMA_PATH --fp16 --data_loader_engine merlin --start_time_window_index 1 --final_time_window_index 2 --time_window_folder_pad_digits 4 --model_type xlnet --loss_type cross_entropy --per_device_eval_batch_size 128 --similarity_type concat_mlp --tf_out_activation tanh --inp_merge mlp --learning_rate_warmup_steps 0 --learning_rate_schedule linear_with_warmup --hidden_act gelu --num_train_epochs $NUM_EPOCHS --dataloader_drop_last --compute_metrics_each_n_steps 1 --session_seq_length_max 20 --eval_on_last_item_seq_only --mf_constrained_embeddings --layer_norm_featurewise --attn_type bi --per_device_train_batch_size 128 --learning_rate 0.0006667377132554976 --dropout 0.0 --input_dropout 0.1 
--weight_decay 3.910060265627374e-05 --d_model 192 --item_embedding_dim 448 --n_layer 3 --n_head 16 --label_smoothing 0.0 --stochastic_shared_embeddings_replacement_prob 0.1 --item_id_embeddings_init_std 0.11 --other_embeddings_init_std 0.02 --eval_on_test_set --seed $SEED --report_to none\n", + "exit 0" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/train_runs/xlnet_mlm_item_id.ipynb b/train_runs/xlnet_mlm_item_id.ipynb new file mode 100644 index 0000000000..8603aca12a --- /dev/null +++ b/train_runs/xlnet_mlm_item_id.ipynb @@ -0,0 +1,515 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "d48f073c", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "03/10/2023 06:22:30 - WARNING - __main__ - Process rank: -1, device: cuda:0, n_gpu: 1, distributed training: False, 16-bits training: True\n", + "03/10/2023 06:22:31 - WARNING - transformers4rec - Projecting inputs of NextItemPredictionTask to'448' As weight tying requires the input dimension '192' to be equal to the item-id embedding dimension '448'\n", + "[INFO|trainer.py:434] 2023-03-10 06:22:31,662 >> Using amp fp16 backend\n", + "03/10/2023 06:22:31 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - Training, Model and Data parameters {'data_path': '/transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/', 'features_schema_path': '/workspace/examples/t4rec_paper_experiments/datasets_configs/ecom_rees46/rees46_schema.pbtxt', 'start_time_window_index': 1, 'final_time_window_index': 2, 
'time_window_folder_pad_digits': 4, 'no_incremental_training': False, 'training_time_window_size': 0, 'use_side_information_features': False, 'input_features_aggregation': 'concat', 'model_type': 'xlnet', 'tf_out_activation': 'tanh', 'mlm': True, 'mlm_probability': 0.30000000000000004, 'plm': False, 'plm_probability': 0.25, 'plm_max_span_length': 5, 'plm_mask_input': False, 'plm_permute_all': False, 'rtd': False, 'rtd_sample_from_batch': False, 'rtd_use_batch_interaction': False, 'rtd_discriminator_loss_weight': 50, 'rtd_generator_loss_weight': 1, 'rtd_tied_generator': False, 'd_model': 192, 'n_layer': 3, 'n_head': 16, 'layer_norm_eps': 1e-12, 'initializer_range': 0.02, 'hidden_act': 'gelu', 'dropout': 0.0, 'summary_type': 'last', 'num_hidden_groups': 1, 'inner_group_num': 1, 'eval_on_last_item_seq_only': True, 'train_on_last_item_seq_only': False, 'mf_constrained_embeddings': True, 'item_embedding_dim': 448, 'numeric_features_project_to_embedding_dim': 0, 'numeric_features_soft_one_hot_encoding_num_embeddings': 0, 'stochastic_shared_embeddings_replacement_prob': 0.1, 'softmax_temperature': 1.0, 'label_smoothing': 0.0, 'embedding_dim_from_cardinality_multiplier': 2.0, 'item_id_embeddings_init_std': 0.11, 'other_embeddings_init_std': 0.02, 'layer_norm_featurewise': True, 'attn_type': 'bi', 'input_dropout': 0.1, 'loss_type': 'cross_entropy', 'similarity_type': 'concat_mlp', 'inp_merge': 'mlp', 'learning_rate_warmup_steps': 0, 'avg_session_length': None, 'output_dir': './tmp/', 'overwrite_output_dir': True, 'do_train': True, 'do_eval': True, 'do_predict': False, 'prediction_loss_only': False, 'per_device_train_batch_size': 128, 'per_device_eval_batch_size': 128, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'learning_rate': 0.0006667377132554976, 'weight_decay': 3.910060265627374e-05, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 
'num_train_epochs': 5.0, 'max_steps': -1, 'lr_scheduler_type': 'linear', 'warmup_ratio': 0.0, 'warmup_steps': 0, 'log_level': -1, 'log_level_replica': -1, 'log_on_each_node': True, 'logging_dir': './tmp/runs/Mar10_06-22-28_7dfa224f788e', 'logging_first_step': False, 'logging_steps': 500, 'logging_nan_inf_filter': True, 'save_steps': 0, 'save_total_limit': None, 'save_on_each_node': False, 'no_cuda': False, 'seed': 100, 'fp16': True, 'fp16_opt_level': 'O1', 'fp16_backend': 'auto', 'fp16_full_eval': False, 'local_rank': -1, 'xpu_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': True, 'eval_steps': None, 'dataloader_num_workers': 0, 'past_index': -1, 'run_name': None, 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'sharded_ddp': [], 'deepspeed': None, 'label_smoothing_factor': 0.0, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': [], 'ddp_find_unused_parameters': None, 'dataloader_pin_memory': True, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_token': None, 'gradient_checkpointing': False, 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': None, '_n_gpu': 1, 'mp_parameters': '', 'max_sequence_length': 20, 'shuffle_buffer_size': 0, 'data_loader_engine': 'merlin', 'eval_on_test_set': True, 'eval_steps_on_train_set': 20, 'predict_top_k': 0, 'learning_rate_num_cosine_cycles_by_epoch': 1.25, 'log_predictions': False, 'compute_metrics_each_n_steps': 1, 'experiments_group': 'default', 'session_seq_length_max': 20, 'learning_rate_schedule': 'linear_with_warmup', 'validate_every': -1}\n", + "[INFO|trainer.py:1196] 2023-03-10 06:22:32,106 >> ***** Running training *****\n", + "[INFO|trainer.py:1197] 
2023-03-10 06:22:32,106 >> Num examples = 86528\n", + "[INFO|trainer.py:1198] 2023-03-10 06:22:32,106 >> Num Epochs = 5\n", + "[INFO|trainer.py:1199] 2023-03-10 06:22:32,106 >> Instantaneous batch size per device = 128\n", + "[INFO|trainer.py:1200] 2023-03-10 06:22:32,106 >> Total train batch size (w. parallel, distributed & accumulation) = 128\n", + "[INFO|trainer.py:1201] 2023-03-10 06:22:32,106 >> Gradient Accumulation steps = 1\n", + "[INFO|trainer.py:1202] 2023-03-10 06:22:32,106 >> Total optimization steps = 3380\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "DLL 2023-03-10 06:22:31.662852 - PARAMETER data_path : /transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/ features_schema_path : /workspace/examples/t4rec_paper_experiments/datasets_configs/ecom_rees46/rees46_schema.pbtxt start_time_window_index : 1 final_time_window_index : 2 time_window_folder_pad_digits : 4 no_incremental_training : False training_time_window_size : 0 use_side_information_features : False input_features_aggregation : concat model_type : xlnet tf_out_activation : tanh mlm : True mlm_probability : 0.30000000000000004 plm : False plm_probability : 0.25 plm_max_span_length : 5 plm_mask_input : False plm_permute_all : False rtd : False rtd_sample_from_batch : False rtd_use_batch_interaction : False rtd_discriminator_loss_weight : 50 rtd_generator_loss_weight : 1 rtd_tied_generator : False d_model : 192 n_layer : 3 n_head : 16 layer_norm_eps : 1e-12 initializer_range : 0.02 hidden_act : gelu dropout : 0.0 summary_type : last num_hidden_groups : 1 inner_group_num : 1 eval_on_last_item_seq_only : True train_on_last_item_seq_only : False mf_constrained_embeddings : True item_embedding_dim : 448 numeric_features_project_to_embedding_dim : 0 numeric_features_soft_one_hot_encoding_num_embeddings : 0 stochastic_shared_embeddings_replacement_prob : 0.1 softmax_temperature : 1.0 label_smoothing : 0.0 embedding_dim_from_cardinality_multiplier : 2.0 
item_id_embeddings_init_std : 0.11 other_embeddings_init_std : 0.02 layer_norm_featurewise : True attn_type : bi input_dropout : 0.1 loss_type : cross_entropy similarity_type : concat_mlp inp_merge : mlp learning_rate_warmup_steps : 0 avg_session_length : None output_dir : ./tmp/ overwrite_output_dir : True do_train : True do_eval : True do_predict : False prediction_loss_only : False per_device_train_batch_size : 128 per_device_eval_batch_size : 128 per_gpu_train_batch_size : None per_gpu_eval_batch_size : None gradient_accumulation_steps : 1 eval_accumulation_steps : None learning_rate : 0.0006667377132554976 weight_decay : 3.910060265627374e-05 adam_beta1 : 0.9 adam_beta2 : 0.999 adam_epsilon : 1e-08 max_grad_norm : 1.0 num_train_epochs : 5.0 max_steps : -1 lr_scheduler_type : linear warmup_ratio : 0.0 warmup_steps : 0 log_level : -1 log_level_replica : -1 log_on_each_node : True logging_dir : ./tmp/runs/Mar10_06-22-28_7dfa224f788e logging_first_step : False logging_steps : 500 logging_nan_inf_filter : True save_steps : 0 save_total_limit : None save_on_each_node : False no_cuda : False seed : 100 fp16 : True fp16_opt_level : O1 fp16_backend : auto fp16_full_eval : False local_rank : -1 xpu_backend : None tpu_num_cores : None tpu_metrics_debug : False debug : [] dataloader_drop_last : True eval_steps : None dataloader_num_workers : 0 past_index : -1 run_name : None disable_tqdm : False remove_unused_columns : True label_names : None load_best_model_at_end : False metric_for_best_model : None greater_is_better : None ignore_data_skip : False sharded_ddp : [] deepspeed : None label_smoothing_factor : 0.0 adafactor : False group_by_length : False length_column_name : length report_to : [] ddp_find_unused_parameters : None dataloader_pin_memory : True skip_memory_metrics : True use_legacy_prediction_loop : False push_to_hub : False resume_from_checkpoint : None hub_model_id : None hub_token : None gradient_checkpointing : False push_to_hub_model_id : None 
push_to_hub_organization : None push_to_hub_token : None _n_gpu : 1 mp_parameters : max_sequence_length : 20 shuffle_buffer_size : 0 data_loader_engine : merlin eval_on_test_set : True eval_steps_on_train_set : 20 predict_top_k : 0 learning_rate_num_cosine_cycles_by_epoch : 1.25 log_predictions : False compute_metrics_each_n_steps : 1 experiments_group : default session_seq_length_max : 20 learning_rate_schedule : linear_with_warmup validate_every : -1 \n", + "\n", + "***** Launch training for day 1: *****\n", + "{'loss': 10.1481, 'learning_rate': 0.0005681078740165187, 'epoch': 0.74}\n", + "{'loss': 9.1622, 'learning_rate': 0.00046947803477753974, 'epoch': 1.48}\n", + "{'loss': 8.8482, 'learning_rate': 0.0003708481955385607, 'epoch': 2.22}\n", + "{'loss': 8.648, 'learning_rate': 0.0002722183562995818, 'epoch': 2.96}\n", + "{'loss': 8.4446, 'learning_rate': 0.0001735885170606029, 'epoch': 3.7}\n", + "{'loss': 8.3279, 'learning_rate': 7.4958677821624e-05, 'epoch': 4.44}\n", + "{'train_runtime': 267.3924, 'train_samples_per_second': 0.019, 'train_steps_per_second': 12.641, 'train_loss': 8.856075417783838, 'epoch': 5.0}\n", + "\n", + "***** Evaluation results for day 2 (train set):*****\n", + "\n", + "{'train_/next-item/ndcg_at_10': 0.0946362242102623, 'train_/next-item/ndcg_at_20': 0.11049988120794296, 'train_/next-item/recall_at_10': 0.17226563394069672, 'train_/next-item/recall_at_20': 0.23593750596046448, 'train_/loss': 7.873326301574707, 'train_runtime': 0.6495, 'train_samples_per_second': 3941.348, 'train_steps_per_second': 30.792}\n", + "\n", + "***** Evaluation results for day 2 (eval set):*****\n", + "\n", + "{'eval_/next-item/ndcg_at_10': 0.08858750760555267, 'eval_/next-item/ndcg_at_20': 0.10536891222000122, 'eval_/next-item/recall_at_10': 0.16274471580982208, 'eval_/next-item/recall_at_20': 0.2292921543121338, 'eval_/loss': 8.278496742248535, 'eval_runtime': 2.2227, 'eval_samples_per_second': 4779.666, 'eval_steps_per_second': 37.341}\n" + ] + }, + { + 
"name": "stderr", + "output_type": "stream", + "text": [ + "Task exception was never retrieved\n", + "future: ._handle_stream() done, defined at /usr/local/lib/python3.8/dist-packages/IPython/core/magics/script.py:211> exception=ValueError('Separator is not found, and chunk exceed the limit')>\n", + "Traceback (most recent call last):\n", + " File \"/usr/lib/python3.8/asyncio/streams.py\", line 540, in readline\n", + " line = await self.readuntil(sep)\n", + " File \"/usr/lib/python3.8/asyncio/streams.py\", line 618, in readuntil\n", + " raise exceptions.LimitOverrunError(\n", + "asyncio.exceptions.LimitOverrunError: Separator is not found, and chunk exceed the limit\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/usr/local/lib/python3.8/dist-packages/IPython/core/magics/script.py\", line 213, in _handle_stream\n", + " line = (await stream.readline()).decode(\"utf8\")\n", + " File \"/usr/lib/python3.8/asyncio/streams.py\", line 549, in readline\n", + " raise ValueError(e.args[0])\n", + "ValueError: Separator is not found, and chunk exceed the limit\n" + ] + } + ], + "source": [ + "%%bash\n", + "\n", + "FEATURE_SCHEMA_PATH=/workspace/examples/t4rec_paper_experiments/datasets_configs/ecom_rees46/rees46_schema.pbtxt\n", + "DATA_PATH=/transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/\n", + "NUM_EPOCHS=5\n", + "\n", + "python3 ../transf_exp_main_modified.py --output_dir ./tmp/ --overwrite_output_dir --do_train --do_eval --save_steps 0 --data_path $DATA_PATH --features_schema_path $FEATURE_SCHEMA_PATH --fp16 --data_loader_engine merlin --start_time_window_index 1 --final_time_window_index 2 --time_window_folder_pad_digits 4 --model_type xlnet --loss_type cross_entropy --per_device_eval_batch_size 128 --similarity_type concat_mlp --tf_out_activation tanh --inp_merge mlp --learning_rate_warmup_steps 0 --learning_rate_schedule linear_with_warmup --hidden_act 
gelu --num_train_epochs $NUM_EPOCHS --dataloader_drop_last --compute_metrics_each_n_steps 1 --session_seq_length_max 20 --eval_on_last_item_seq_only --mf_constrained_embeddings --layer_norm_featurewise --attn_type bi --mlm --per_device_train_batch_size 128 --learning_rate 0.0006667377132554976 --dropout 0.0 --input_dropout 0.1 --weight_decay 3.910060265627374e-05 --d_model 192 --item_embedding_dim 448 --n_layer 3 --n_head 16 --label_smoothing 0.0 --stochastic_shared_embeddings_replacement_prob 0.1 --item_id_embeddings_init_std 0.11 --other_embeddings_init_std 0.02 --mlm_probability 0.30000000000000004 --eval_on_test_set --seed 100 --report_to none" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "a2b2ec29", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "03/10/2023 06:27:11 - WARNING - __main__ - Process rank: -1, device: cuda:0, n_gpu: 1, distributed training: False, 16-bits training: True\n", + "03/10/2023 06:27:12 - WARNING - transformers4rec - Projecting inputs of NextItemPredictionTask to'448' As weight tying requires the input dimension '192' to be equal to the item-id embedding dimension '448'\n", + "[INFO|trainer.py:434] 2023-03-10 06:27:12,976 >> Using amp fp16 backend\n", + "03/10/2023 06:27:12 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - Training, Model and Data parameters {'data_path': '/transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/', 'features_schema_path': '/workspace/examples/t4rec_paper_experiments/datasets_configs/ecom_rees46/rees46_schema.pbtxt', 'start_time_window_index': 1, 'final_time_window_index': 2, 'time_window_folder_pad_digits': 4, 'no_incremental_training': False, 'training_time_window_size': 0, 'use_side_information_features': False, 'input_features_aggregation': 'concat', 'model_type': 'xlnet', 'tf_out_activation': 'tanh', 'mlm': True, 'mlm_probability': 0.30000000000000004, 'plm': False, 'plm_probability': 0.25, 
'plm_max_span_length': 5, 'plm_mask_input': False, 'plm_permute_all': False, 'rtd': False, 'rtd_sample_from_batch': False, 'rtd_use_batch_interaction': False, 'rtd_discriminator_loss_weight': 50, 'rtd_generator_loss_weight': 1, 'rtd_tied_generator': False, 'd_model': 192, 'n_layer': 3, 'n_head': 16, 'layer_norm_eps': 1e-12, 'initializer_range': 0.02, 'hidden_act': 'gelu', 'dropout': 0.0, 'summary_type': 'last', 'num_hidden_groups': 1, 'inner_group_num': 1, 'eval_on_last_item_seq_only': True, 'train_on_last_item_seq_only': False, 'mf_constrained_embeddings': True, 'item_embedding_dim': 448, 'numeric_features_project_to_embedding_dim': 0, 'numeric_features_soft_one_hot_encoding_num_embeddings': 0, 'stochastic_shared_embeddings_replacement_prob': 0.1, 'softmax_temperature': 1.0, 'label_smoothing': 0.0, 'embedding_dim_from_cardinality_multiplier': 2.0, 'item_id_embeddings_init_std': 0.11, 'other_embeddings_init_std': 0.02, 'layer_norm_featurewise': True, 'attn_type': 'bi', 'input_dropout': 0.1, 'loss_type': 'cross_entropy', 'similarity_type': 'concat_mlp', 'inp_merge': 'mlp', 'learning_rate_warmup_steps': 0, 'avg_session_length': None, 'output_dir': './tmp/', 'overwrite_output_dir': True, 'do_train': True, 'do_eval': True, 'do_predict': False, 'prediction_loss_only': False, 'per_device_train_batch_size': 128, 'per_device_eval_batch_size': 128, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'learning_rate': 0.0006667377132554976, 'weight_decay': 3.910060265627374e-05, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 5.0, 'max_steps': -1, 'lr_scheduler_type': 'linear', 'warmup_ratio': 0.0, 'warmup_steps': 0, 'log_level': -1, 'log_level_replica': -1, 'log_on_each_node': True, 'logging_dir': './tmp/runs/Mar10_06-27-10_7dfa224f788e', 'logging_first_step': False, 'logging_steps': 500, 'logging_nan_inf_filter': True, 'save_steps': 0, 
'save_total_limit': None, 'save_on_each_node': False, 'no_cuda': False, 'seed': 100, 'fp16': True, 'fp16_opt_level': 'O1', 'fp16_backend': 'auto', 'fp16_full_eval': False, 'local_rank': -1, 'xpu_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': True, 'eval_steps': None, 'dataloader_num_workers': 0, 'past_index': -1, 'run_name': None, 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'sharded_ddp': [], 'deepspeed': None, 'label_smoothing_factor': 0.0, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': [], 'ddp_find_unused_parameters': None, 'dataloader_pin_memory': True, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_token': None, 'gradient_checkpointing': False, 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': None, '_n_gpu': 1, 'mp_parameters': '', 'max_sequence_length': 20, 'shuffle_buffer_size': 0, 'data_loader_engine': 'merlin', 'eval_on_test_set': True, 'eval_steps_on_train_set': 20, 'predict_top_k': 0, 'learning_rate_num_cosine_cycles_by_epoch': 1.25, 'log_predictions': False, 'compute_metrics_each_n_steps': 1, 'experiments_group': 'default', 'session_seq_length_max': 20, 'learning_rate_schedule': 'linear_with_warmup', 'validate_every': -1}\n", + "[INFO|trainer.py:1196] 2023-03-10 06:27:13,448 >> ***** Running training *****\n", + "[INFO|trainer.py:1197] 2023-03-10 06:27:13,448 >> Num examples = 86528\n", + "[INFO|trainer.py:1198] 2023-03-10 06:27:13,448 >> Num Epochs = 5\n", + "[INFO|trainer.py:1199] 2023-03-10 06:27:13,448 >> Instantaneous batch size per device = 128\n", + "[INFO|trainer.py:1200] 2023-03-10 06:27:13,448 >> Total train batch size (w. 
parallel, distributed & accumulation) = 128\n", + "[INFO|trainer.py:1201] 2023-03-10 06:27:13,448 >> Gradient Accumulation steps = 1\n", + "[INFO|trainer.py:1202] 2023-03-10 06:27:13,448 >> Total optimization steps = 3380\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "DLL 2023-03-10 06:27:12.976882 - PARAMETER data_path : /transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/ features_schema_path : /workspace/examples/t4rec_paper_experiments/datasets_configs/ecom_rees46/rees46_schema.pbtxt start_time_window_index : 1 final_time_window_index : 2 time_window_folder_pad_digits : 4 no_incremental_training : False training_time_window_size : 0 use_side_information_features : False input_features_aggregation : concat model_type : xlnet tf_out_activation : tanh mlm : True mlm_probability : 0.30000000000000004 plm : False plm_probability : 0.25 plm_max_span_length : 5 plm_mask_input : False plm_permute_all : False rtd : False rtd_sample_from_batch : False rtd_use_batch_interaction : False rtd_discriminator_loss_weight : 50 rtd_generator_loss_weight : 1 rtd_tied_generator : False d_model : 192 n_layer : 3 n_head : 16 layer_norm_eps : 1e-12 initializer_range : 0.02 hidden_act : gelu dropout : 0.0 summary_type : last num_hidden_groups : 1 inner_group_num : 1 eval_on_last_item_seq_only : True train_on_last_item_seq_only : False mf_constrained_embeddings : True item_embedding_dim : 448 numeric_features_project_to_embedding_dim : 0 numeric_features_soft_one_hot_encoding_num_embeddings : 0 stochastic_shared_embeddings_replacement_prob : 0.1 softmax_temperature : 1.0 label_smoothing : 0.0 embedding_dim_from_cardinality_multiplier : 2.0 item_id_embeddings_init_std : 0.11 other_embeddings_init_std : 0.02 layer_norm_featurewise : True attn_type : bi input_dropout : 0.1 loss_type : cross_entropy similarity_type : concat_mlp inp_merge : mlp learning_rate_warmup_steps : 0 avg_session_length : None output_dir : ./tmp/ overwrite_output_dir : 
True do_train : True do_eval : True do_predict : False prediction_loss_only : False per_device_train_batch_size : 128 per_device_eval_batch_size : 128 per_gpu_train_batch_size : None per_gpu_eval_batch_size : None gradient_accumulation_steps : 1 eval_accumulation_steps : None learning_rate : 0.0006667377132554976 weight_decay : 3.910060265627374e-05 adam_beta1 : 0.9 adam_beta2 : 0.999 adam_epsilon : 1e-08 max_grad_norm : 1.0 num_train_epochs : 5.0 max_steps : -1 lr_scheduler_type : linear warmup_ratio : 0.0 warmup_steps : 0 log_level : -1 log_level_replica : -1 log_on_each_node : True logging_dir : ./tmp/runs/Mar10_06-27-10_7dfa224f788e logging_first_step : False logging_steps : 500 logging_nan_inf_filter : True save_steps : 0 save_total_limit : None save_on_each_node : False no_cuda : False seed : 100 fp16 : True fp16_opt_level : O1 fp16_backend : auto fp16_full_eval : False local_rank : -1 xpu_backend : None tpu_num_cores : None tpu_metrics_debug : False debug : [] dataloader_drop_last : True eval_steps : None dataloader_num_workers : 0 past_index : -1 run_name : None disable_tqdm : False remove_unused_columns : True label_names : None load_best_model_at_end : False metric_for_best_model : None greater_is_better : None ignore_data_skip : False sharded_ddp : [] deepspeed : None label_smoothing_factor : 0.0 adafactor : False group_by_length : False length_column_name : length report_to : [] ddp_find_unused_parameters : None dataloader_pin_memory : True skip_memory_metrics : True use_legacy_prediction_loop : False push_to_hub : False resume_from_checkpoint : None hub_model_id : None hub_token : None gradient_checkpointing : False push_to_hub_model_id : None push_to_hub_organization : None push_to_hub_token : None _n_gpu : 1 mp_parameters : max_sequence_length : 20 shuffle_buffer_size : 0 data_loader_engine : merlin eval_on_test_set : True eval_steps_on_train_set : 20 predict_top_k : 0 learning_rate_num_cosine_cycles_by_epoch : 1.25 log_predictions : False 
compute_metrics_each_n_steps : 1 experiments_group : default session_seq_length_max : 20 learning_rate_schedule : linear_with_warmup validate_every : -1 \n", + "\n", + "***** Launch training for day 1: *****\n", + "{'loss': 10.1707, 'learning_rate': 0.0005683051336949965, 'epoch': 0.74}\n", + "{'loss': 9.1904, 'learning_rate': 0.0004696752944560177, 'epoch': 1.48}\n", + "{'loss': 8.9014, 'learning_rate': 0.0003710454552170388, 'epoch': 2.22}\n", + "{'loss': 8.7112, 'learning_rate': 0.0002724156159780598, 'epoch': 2.96}\n", + "{'loss': 8.5372, 'learning_rate': 0.00017378577673908085, 'epoch': 3.7}\n", + "{'loss': 8.4297, 'learning_rate': 7.515593750010194e-05, 'epoch': 4.44}\n", + "{'train_runtime': 268.1719, 'train_samples_per_second': 0.019, 'train_steps_per_second': 12.604, 'train_loss': 8.918887816660503, 'epoch': 5.0}\n", + "\n", + "***** Evaluation results for day 2 (train set):*****\n", + "\n", + "{'train_/next-item/ndcg_at_10': 0.08740683645009995, 'train_/next-item/ndcg_at_20': 0.10400686413049698, 'train_/next-item/recall_at_10': 0.15468750894069672, 'train_/next-item/recall_at_20': 0.220703125, 'train_/loss': 7.998610019683838, 'train_runtime': 0.6448, 'train_samples_per_second': 3970.378, 'train_steps_per_second': 31.019}\n", + "\n", + "***** Evaluation results for day 2 (eval set):*****\n", + "\n", + "{'eval_/next-item/ndcg_at_10': 0.08547135442495346, 'eval_/next-item/ndcg_at_20': 0.10144450515508652, 'eval_/next-item/recall_at_10': 0.1554969847202301, 'eval_/next-item/recall_at_20': 0.21865586936473846, 'eval_/loss': 8.368797302246094, 'eval_runtime': 2.2149, 'eval_samples_per_second': 4796.548, 'eval_steps_per_second': 37.473}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Task exception was never retrieved\n", + "future: ._handle_stream() done, defined at /usr/local/lib/python3.8/dist-packages/IPython/core/magics/script.py:211> exception=ValueError('Separator is not found, and chunk exceed the limit')>\n", + "Traceback 
(most recent call last):\n", + " File \"/usr/lib/python3.8/asyncio/streams.py\", line 540, in readline\n", + " line = await self.readuntil(sep)\n", + " File \"/usr/lib/python3.8/asyncio/streams.py\", line 618, in readuntil\n", + " raise exceptions.LimitOverrunError(\n", + "asyncio.exceptions.LimitOverrunError: Separator is not found, and chunk exceed the limit\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/usr/local/lib/python3.8/dist-packages/IPython/core/magics/script.py\", line 213, in _handle_stream\n", + " line = (await stream.readline()).decode(\"utf8\")\n", + " File \"/usr/lib/python3.8/asyncio/streams.py\", line 549, in readline\n", + " raise ValueError(e.args[0])\n", + "ValueError: Separator is not found, and chunk exceed the limit\n" + ] + } + ], + "source": [ + "%%bash\n", + "\n", + "FEATURE_SCHEMA_PATH=/workspace/examples/t4rec_paper_experiments/datasets_configs/ecom_rees46/rees46_schema.pbtxt\n", + "DATA_PATH=/transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/\n", + "NUM_EPOCHS=5\n", + "\n", + "python3 ../transf_exp_main_modified.py --output_dir ./tmp/ --overwrite_output_dir --do_train --do_eval --save_steps 0 --data_path $DATA_PATH --features_schema_path $FEATURE_SCHEMA_PATH --fp16 --data_loader_engine merlin --start_time_window_index 1 --final_time_window_index 2 --time_window_folder_pad_digits 4 --model_type xlnet --loss_type cross_entropy --per_device_eval_batch_size 128 --similarity_type concat_mlp --tf_out_activation tanh --inp_merge mlp --learning_rate_warmup_steps 0 --learning_rate_schedule linear_with_warmup --hidden_act gelu --num_train_epochs $NUM_EPOCHS --dataloader_drop_last --compute_metrics_each_n_steps 1 --session_seq_length_max 20 --eval_on_last_item_seq_only --mf_constrained_embeddings --layer_norm_featurewise --attn_type bi --mlm --per_device_train_batch_size 128 --learning_rate 0.0006667377132554976 --dropout 0.0 
--input_dropout 0.1 --weight_decay 3.910060265627374e-05 --d_model 192 --item_embedding_dim 448 --n_layer 3 --n_head 16 --label_smoothing 0.0 --stochastic_shared_embeddings_replacement_prob 0.1 --item_id_embeddings_init_std 0.11 --other_embeddings_init_std 0.02 --mlm_probability 0.30000000000000004 --eval_on_test_set --seed 100 --report_to none" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "9ba3e539", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "03/10/2023 06:31:53 - WARNING - __main__ - Process rank: -1, device: cuda:0, n_gpu: 1, distributed training: False, 16-bits training: True\n", + "03/10/2023 06:31:54 - WARNING - transformers4rec - Projecting inputs of NextItemPredictionTask to'448' As weight tying requires the input dimension '192' to be equal to the item-id embedding dimension '448'\n", + "[INFO|trainer.py:434] 2023-03-10 06:31:54,883 >> Using amp fp16 backend\n", + "03/10/2023 06:31:54 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - Training, Model and Data parameters {'data_path': '/transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/', 'features_schema_path': '/workspace/examples/t4rec_paper_experiments/datasets_configs/ecom_rees46/rees46_schema.pbtxt', 'start_time_window_index': 1, 'final_time_window_index': 2, 'time_window_folder_pad_digits': 4, 'no_incremental_training': False, 'training_time_window_size': 0, 'use_side_information_features': False, 'input_features_aggregation': 'concat', 'model_type': 'xlnet', 'tf_out_activation': 'tanh', 'mlm': True, 'mlm_probability': 0.30000000000000004, 'plm': False, 'plm_probability': 0.25, 'plm_max_span_length': 5, 'plm_mask_input': False, 'plm_permute_all': False, 'rtd': False, 'rtd_sample_from_batch': False, 'rtd_use_batch_interaction': False, 'rtd_discriminator_loss_weight': 50, 'rtd_generator_loss_weight': 1, 'rtd_tied_generator': False, 'd_model': 192, 'n_layer': 3, 'n_head': 16, 
'layer_norm_eps': 1e-12, 'initializer_range': 0.02, 'hidden_act': 'gelu', 'dropout': 0.0, 'summary_type': 'last', 'num_hidden_groups': 1, 'inner_group_num': 1, 'eval_on_last_item_seq_only': True, 'train_on_last_item_seq_only': False, 'mf_constrained_embeddings': True, 'item_embedding_dim': 448, 'numeric_features_project_to_embedding_dim': 0, 'numeric_features_soft_one_hot_encoding_num_embeddings': 0, 'stochastic_shared_embeddings_replacement_prob': 0.1, 'softmax_temperature': 1.0, 'label_smoothing': 0.0, 'embedding_dim_from_cardinality_multiplier': 2.0, 'item_id_embeddings_init_std': 0.11, 'other_embeddings_init_std': 0.02, 'layer_norm_featurewise': True, 'attn_type': 'bi', 'input_dropout': 0.1, 'loss_type': 'cross_entropy', 'similarity_type': 'concat_mlp', 'inp_merge': 'mlp', 'learning_rate_warmup_steps': 0, 'avg_session_length': None, 'output_dir': './tmp/', 'overwrite_output_dir': True, 'do_train': True, 'do_eval': True, 'do_predict': False, 'prediction_loss_only': False, 'per_device_train_batch_size': 128, 'per_device_eval_batch_size': 128, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'learning_rate': 0.0006667377132554976, 'weight_decay': 3.910060265627374e-05, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 5.0, 'max_steps': -1, 'lr_scheduler_type': 'linear', 'warmup_ratio': 0.0, 'warmup_steps': 0, 'log_level': -1, 'log_level_replica': -1, 'log_on_each_node': True, 'logging_dir': './tmp/runs/Mar10_06-31-52_7dfa224f788e', 'logging_first_step': False, 'logging_steps': 500, 'logging_nan_inf_filter': True, 'save_steps': 0, 'save_total_limit': None, 'save_on_each_node': False, 'no_cuda': False, 'seed': 0, 'fp16': True, 'fp16_opt_level': 'O1', 'fp16_backend': 'auto', 'fp16_full_eval': False, 'local_rank': -1, 'xpu_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': True, 
'eval_steps': None, 'dataloader_num_workers': 0, 'past_index': -1, 'run_name': None, 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'sharded_ddp': [], 'deepspeed': None, 'label_smoothing_factor': 0.0, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': [], 'ddp_find_unused_parameters': None, 'dataloader_pin_memory': True, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_token': None, 'gradient_checkpointing': False, 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': None, '_n_gpu': 1, 'mp_parameters': '', 'max_sequence_length': 20, 'shuffle_buffer_size': 0, 'data_loader_engine': 'merlin', 'eval_on_test_set': True, 'eval_steps_on_train_set': 20, 'predict_top_k': 0, 'learning_rate_num_cosine_cycles_by_epoch': 1.25, 'log_predictions': False, 'compute_metrics_each_n_steps': 1, 'experiments_group': 'default', 'session_seq_length_max': 20, 'learning_rate_schedule': 'linear_with_warmup', 'validate_every': -1}\n", + "[INFO|trainer.py:1196] 2023-03-10 06:31:55,376 >> ***** Running training *****\n", + "[INFO|trainer.py:1197] 2023-03-10 06:31:55,376 >> Num examples = 86528\n", + "[INFO|trainer.py:1198] 2023-03-10 06:31:55,376 >> Num Epochs = 5\n", + "[INFO|trainer.py:1199] 2023-03-10 06:31:55,376 >> Instantaneous batch size per device = 128\n", + "[INFO|trainer.py:1200] 2023-03-10 06:31:55,376 >> Total train batch size (w. 
parallel, distributed & accumulation) = 128\n", + "[INFO|trainer.py:1201] 2023-03-10 06:31:55,376 >> Gradient Accumulation steps = 1\n", + "[INFO|trainer.py:1202] 2023-03-10 06:31:55,376 >> Total optimization steps = 3380\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "DLL 2023-03-10 06:31:54.884501 - PARAMETER data_path : /transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/ features_schema_path : /workspace/examples/t4rec_paper_experiments/datasets_configs/ecom_rees46/rees46_schema.pbtxt start_time_window_index : 1 final_time_window_index : 2 time_window_folder_pad_digits : 4 no_incremental_training : False training_time_window_size : 0 use_side_information_features : False input_features_aggregation : concat model_type : xlnet tf_out_activation : tanh mlm : True mlm_probability : 0.30000000000000004 plm : False plm_probability : 0.25 plm_max_span_length : 5 plm_mask_input : False plm_permute_all : False rtd : False rtd_sample_from_batch : False rtd_use_batch_interaction : False rtd_discriminator_loss_weight : 50 rtd_generator_loss_weight : 1 rtd_tied_generator : False d_model : 192 n_layer : 3 n_head : 16 layer_norm_eps : 1e-12 initializer_range : 0.02 hidden_act : gelu dropout : 0.0 summary_type : last num_hidden_groups : 1 inner_group_num : 1 eval_on_last_item_seq_only : True train_on_last_item_seq_only : False mf_constrained_embeddings : True item_embedding_dim : 448 numeric_features_project_to_embedding_dim : 0 numeric_features_soft_one_hot_encoding_num_embeddings : 0 stochastic_shared_embeddings_replacement_prob : 0.1 softmax_temperature : 1.0 label_smoothing : 0.0 embedding_dim_from_cardinality_multiplier : 2.0 item_id_embeddings_init_std : 0.11 other_embeddings_init_std : 0.02 layer_norm_featurewise : True attn_type : bi input_dropout : 0.1 loss_type : cross_entropy similarity_type : concat_mlp inp_merge : mlp learning_rate_warmup_steps : 0 avg_session_length : None output_dir : ./tmp/ overwrite_output_dir : 
True do_train : True do_eval : True do_predict : False prediction_loss_only : False per_device_train_batch_size : 128 per_device_eval_batch_size : 128 per_gpu_train_batch_size : None per_gpu_eval_batch_size : None gradient_accumulation_steps : 1 eval_accumulation_steps : None learning_rate : 0.0006667377132554976 weight_decay : 3.910060265627374e-05 adam_beta1 : 0.9 adam_beta2 : 0.999 adam_epsilon : 1e-08 max_grad_norm : 1.0 num_train_epochs : 5.0 max_steps : -1 lr_scheduler_type : linear warmup_ratio : 0.0 warmup_steps : 0 log_level : -1 log_level_replica : -1 log_on_each_node : True logging_dir : ./tmp/runs/Mar10_06-31-52_7dfa224f788e logging_first_step : False logging_steps : 500 logging_nan_inf_filter : True save_steps : 0 save_total_limit : None save_on_each_node : False no_cuda : False seed : 0 fp16 : True fp16_opt_level : O1 fp16_backend : auto fp16_full_eval : False local_rank : -1 xpu_backend : None tpu_num_cores : None tpu_metrics_debug : False debug : [] dataloader_drop_last : True eval_steps : None dataloader_num_workers : 0 past_index : -1 run_name : None disable_tqdm : False remove_unused_columns : True label_names : None load_best_model_at_end : False metric_for_best_model : None greater_is_better : None ignore_data_skip : False sharded_ddp : [] deepspeed : None label_smoothing_factor : 0.0 adafactor : False group_by_length : False length_column_name : length report_to : [] ddp_find_unused_parameters : None dataloader_pin_memory : True skip_memory_metrics : True use_legacy_prediction_loop : False push_to_hub : False resume_from_checkpoint : None hub_model_id : None hub_token : None gradient_checkpointing : False push_to_hub_model_id : None push_to_hub_organization : None push_to_hub_token : None _n_gpu : 1 mp_parameters : max_sequence_length : 20 shuffle_buffer_size : 0 data_loader_engine : merlin eval_on_test_set : True eval_steps_on_train_set : 20 predict_top_k : 0 learning_rate_num_cosine_cycles_by_epoch : 1.25 log_predictions : False 
compute_metrics_each_n_steps : 1 experiments_group : default session_seq_length_max : 20 learning_rate_schedule : linear_with_warmup validate_every : -1 \n", + "\n", + "***** Launch training for day 1: *****\n", + "{'loss': 10.1421, 'learning_rate': 0.0005681078740165187, 'epoch': 0.74}\n", + "{'loss': 9.1819, 'learning_rate': 0.00046947803477753974, 'epoch': 1.48}\n", + "{'loss': 8.9005, 'learning_rate': 0.0003708481955385607, 'epoch': 2.22}\n", + "{'loss': 8.6487, 'learning_rate': 0.0002722183562995818, 'epoch': 2.96}\n", + "{'loss': 8.4605, 'learning_rate': 0.0001735885170606029, 'epoch': 3.7}\n", + "{'loss': 8.2935, 'learning_rate': 7.4958677821624e-05, 'epoch': 4.44}\n", + "{'train_runtime': 268.4247, 'train_samples_per_second': 0.019, 'train_steps_per_second': 12.592, 'train_loss': 8.861479521079882, 'epoch': 5.0}\n", + "\n", + "***** Evaluation results for day 2 (train set):*****\n", + "\n", + "{'train_/next-item/ndcg_at_10': 0.09560234844684601, 'train_/next-item/ndcg_at_20': 0.1121138259768486, 'train_/next-item/recall_at_10': 0.17148438096046448, 'train_/next-item/recall_at_20': 0.23710937798023224, 'train_/loss': 7.855612754821777, 'train_runtime': 0.6468, 'train_samples_per_second': 3957.894, 'train_steps_per_second': 30.921}\n", + "\n", + "***** Evaluation results for day 2 (eval set):*****\n", + "\n", + "{'eval_/next-item/ndcg_at_10': 0.09119625389575958, 'eval_/next-item/ndcg_at_20': 0.10950089246034622, 'eval_/next-item/recall_at_10': 0.1640625, 'eval_/next-item/recall_at_20': 0.23644576966762543, 'eval_/loss': 8.2479248046875, 'eval_runtime': 2.2235, 'eval_samples_per_second': 4778.131, 'eval_steps_per_second': 37.329}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Task exception was never retrieved\n", + "future: ._handle_stream() done, defined at /usr/local/lib/python3.8/dist-packages/IPython/core/magics/script.py:211> exception=ValueError('Separator is not found, and chunk exceed the limit')>\n", + "Traceback (most 
recent call last):\n", + " File \"/usr/lib/python3.8/asyncio/streams.py\", line 540, in readline\n", + " line = await self.readuntil(sep)\n", + " File \"/usr/lib/python3.8/asyncio/streams.py\", line 618, in readuntil\n", + " raise exceptions.LimitOverrunError(\n", + "asyncio.exceptions.LimitOverrunError: Separator is not found, and chunk exceed the limit\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/usr/local/lib/python3.8/dist-packages/IPython/core/magics/script.py\", line 213, in _handle_stream\n", + " line = (await stream.readline()).decode(\"utf8\")\n", + " File \"/usr/lib/python3.8/asyncio/streams.py\", line 549, in readline\n", + " raise ValueError(e.args[0])\n", + "ValueError: Separator is not found, and chunk exceed the limit\n" + ] + } + ], + "source": [ + "%%bash\n", + "\n", + "FEATURE_SCHEMA_PATH=/workspace/examples/t4rec_paper_experiments/datasets_configs/ecom_rees46/rees46_schema.pbtxt\n", + "DATA_PATH=/transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/\n", + "NUM_EPOCHS=5\n", + "SEED=0\n", + "\n", + "python3 ../transf_exp_main_modified.py --output_dir ./tmp/ --overwrite_output_dir --do_train --do_eval --save_steps 0 --data_path $DATA_PATH --features_schema_path $FEATURE_SCHEMA_PATH --fp16 --data_loader_engine merlin --start_time_window_index 1 --final_time_window_index 2 --time_window_folder_pad_digits 4 --model_type xlnet --loss_type cross_entropy --per_device_eval_batch_size 128 --similarity_type concat_mlp --tf_out_activation tanh --inp_merge mlp --learning_rate_warmup_steps 0 --learning_rate_schedule linear_with_warmup --hidden_act gelu --num_train_epochs $NUM_EPOCHS --dataloader_drop_last --compute_metrics_each_n_steps 1 --session_seq_length_max 20 --eval_on_last_item_seq_only --mf_constrained_embeddings --layer_norm_featurewise --attn_type bi --mlm --per_device_train_batch_size 128 --learning_rate 0.0006667377132554976 --dropout 
0.0 --input_dropout 0.1 --weight_decay 3.910060265627374e-05 --d_model 192 --item_embedding_dim 448 --n_layer 3 --n_head 16 --label_smoothing 0.0 --stochastic_shared_embeddings_replacement_prob 0.1 --item_id_embeddings_init_std 0.11 --other_embeddings_init_std 0.02 --mlm_probability 0.30000000000000004 --eval_on_test_set --seed $SEED --report_to none" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "39c50e18", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "03/10/2023 06:36:35 - WARNING - __main__ - Process rank: -1, device: cuda:0, n_gpu: 1, distributed training: False, 16-bits training: True\n", + "03/10/2023 06:36:37 - WARNING - transformers4rec - Projecting inputs of NextItemPredictionTask to'448' As weight tying requires the input dimension '192' to be equal to the item-id embedding dimension '448'\n", + "[INFO|trainer.py:434] 2023-03-10 06:36:37,141 >> Using amp fp16 backend\n", + "03/10/2023 06:36:37 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - Training, Model and Data parameters {'data_path': '/transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/', 'features_schema_path': '/workspace/examples/t4rec_paper_experiments/datasets_configs/ecom_rees46/rees46_schema.pbtxt', 'start_time_window_index': 1, 'final_time_window_index': 2, 'time_window_folder_pad_digits': 4, 'no_incremental_training': False, 'training_time_window_size': 0, 'use_side_information_features': False, 'input_features_aggregation': 'concat', 'model_type': 'xlnet', 'tf_out_activation': 'tanh', 'mlm': True, 'mlm_probability': 0.30000000000000004, 'plm': False, 'plm_probability': 0.25, 'plm_max_span_length': 5, 'plm_mask_input': False, 'plm_permute_all': False, 'rtd': False, 'rtd_sample_from_batch': False, 'rtd_use_batch_interaction': False, 'rtd_discriminator_loss_weight': 50, 'rtd_generator_loss_weight': 1, 'rtd_tied_generator': False, 'd_model': 192, 'n_layer': 3, 'n_head': 16, 
'layer_norm_eps': 1e-12, 'initializer_range': 0.02, 'hidden_act': 'gelu', 'dropout': 0.0, 'summary_type': 'last', 'num_hidden_groups': 1, 'inner_group_num': 1, 'eval_on_last_item_seq_only': True, 'train_on_last_item_seq_only': False, 'mf_constrained_embeddings': True, 'item_embedding_dim': 448, 'numeric_features_project_to_embedding_dim': 0, 'numeric_features_soft_one_hot_encoding_num_embeddings': 0, 'stochastic_shared_embeddings_replacement_prob': 0.1, 'softmax_temperature': 1.0, 'label_smoothing': 0.0, 'embedding_dim_from_cardinality_multiplier': 2.0, 'item_id_embeddings_init_std': 0.11, 'other_embeddings_init_std': 0.02, 'layer_norm_featurewise': True, 'attn_type': 'bi', 'input_dropout': 0.1, 'loss_type': 'cross_entropy', 'similarity_type': 'concat_mlp', 'inp_merge': 'mlp', 'learning_rate_warmup_steps': 0, 'avg_session_length': None, 'output_dir': './tmp/', 'overwrite_output_dir': True, 'do_train': True, 'do_eval': True, 'do_predict': False, 'prediction_loss_only': False, 'per_device_train_batch_size': 128, 'per_device_eval_batch_size': 128, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'learning_rate': 0.0006667377132554976, 'weight_decay': 3.910060265627374e-05, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 5.0, 'max_steps': -1, 'lr_scheduler_type': 'linear', 'warmup_ratio': 0.0, 'warmup_steps': 0, 'log_level': -1, 'log_level_replica': -1, 'log_on_each_node': True, 'logging_dir': './tmp/runs/Mar10_06-36-34_7dfa224f788e', 'logging_first_step': False, 'logging_steps': 500, 'logging_nan_inf_filter': True, 'save_steps': 0, 'save_total_limit': None, 'save_on_each_node': False, 'no_cuda': False, 'seed': 1, 'fp16': True, 'fp16_opt_level': 'O1', 'fp16_backend': 'auto', 'fp16_full_eval': False, 'local_rank': -1, 'xpu_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 'dataloader_drop_last': True, 
'eval_steps': None, 'dataloader_num_workers': 0, 'past_index': -1, 'run_name': None, 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'sharded_ddp': [], 'deepspeed': None, 'label_smoothing_factor': 0.0, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': [], 'ddp_find_unused_parameters': None, 'dataloader_pin_memory': True, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_token': None, 'gradient_checkpointing': False, 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': None, '_n_gpu': 1, 'mp_parameters': '', 'max_sequence_length': 20, 'shuffle_buffer_size': 0, 'data_loader_engine': 'merlin', 'eval_on_test_set': True, 'eval_steps_on_train_set': 20, 'predict_top_k': 0, 'learning_rate_num_cosine_cycles_by_epoch': 1.25, 'log_predictions': False, 'compute_metrics_each_n_steps': 1, 'experiments_group': 'default', 'session_seq_length_max': 20, 'learning_rate_schedule': 'linear_with_warmup', 'validate_every': -1}\n", + "[INFO|trainer.py:1196] 2023-03-10 06:36:37,612 >> ***** Running training *****\n", + "[INFO|trainer.py:1197] 2023-03-10 06:36:37,612 >> Num examples = 86528\n", + "[INFO|trainer.py:1198] 2023-03-10 06:36:37,612 >> Num Epochs = 5\n", + "[INFO|trainer.py:1199] 2023-03-10 06:36:37,612 >> Instantaneous batch size per device = 128\n", + "[INFO|trainer.py:1200] 2023-03-10 06:36:37,612 >> Total train batch size (w. 
parallel, distributed & accumulation) = 128\n", + "[INFO|trainer.py:1201] 2023-03-10 06:36:37,612 >> Gradient Accumulation steps = 1\n", + "[INFO|trainer.py:1202] 2023-03-10 06:36:37,612 >> Total optimization steps = 3380\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "DLL 2023-03-10 06:36:37.142319 - PARAMETER data_path : /transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/ features_schema_path : /workspace/examples/t4rec_paper_experiments/datasets_configs/ecom_rees46/rees46_schema.pbtxt start_time_window_index : 1 final_time_window_index : 2 time_window_folder_pad_digits : 4 no_incremental_training : False training_time_window_size : 0 use_side_information_features : False input_features_aggregation : concat model_type : xlnet tf_out_activation : tanh mlm : True mlm_probability : 0.30000000000000004 plm : False plm_probability : 0.25 plm_max_span_length : 5 plm_mask_input : False plm_permute_all : False rtd : False rtd_sample_from_batch : False rtd_use_batch_interaction : False rtd_discriminator_loss_weight : 50 rtd_generator_loss_weight : 1 rtd_tied_generator : False d_model : 192 n_layer : 3 n_head : 16 layer_norm_eps : 1e-12 initializer_range : 0.02 hidden_act : gelu dropout : 0.0 summary_type : last num_hidden_groups : 1 inner_group_num : 1 eval_on_last_item_seq_only : True train_on_last_item_seq_only : False mf_constrained_embeddings : True item_embedding_dim : 448 numeric_features_project_to_embedding_dim : 0 numeric_features_soft_one_hot_encoding_num_embeddings : 0 stochastic_shared_embeddings_replacement_prob : 0.1 softmax_temperature : 1.0 label_smoothing : 0.0 embedding_dim_from_cardinality_multiplier : 2.0 item_id_embeddings_init_std : 0.11 other_embeddings_init_std : 0.02 layer_norm_featurewise : True attn_type : bi input_dropout : 0.1 loss_type : cross_entropy similarity_type : concat_mlp inp_merge : mlp learning_rate_warmup_steps : 0 avg_session_length : None output_dir : ./tmp/ overwrite_output_dir : 
True do_train : True do_eval : True do_predict : False prediction_loss_only : False per_device_train_batch_size : 128 per_device_eval_batch_size : 128 per_gpu_train_batch_size : None per_gpu_eval_batch_size : None gradient_accumulation_steps : 1 eval_accumulation_steps : None learning_rate : 0.0006667377132554976 weight_decay : 3.910060265627374e-05 adam_beta1 : 0.9 adam_beta2 : 0.999 adam_epsilon : 1e-08 max_grad_norm : 1.0 num_train_epochs : 5.0 max_steps : -1 lr_scheduler_type : linear warmup_ratio : 0.0 warmup_steps : 0 log_level : -1 log_level_replica : -1 log_on_each_node : True logging_dir : ./tmp/runs/Mar10_06-36-34_7dfa224f788e logging_first_step : False logging_steps : 500 logging_nan_inf_filter : True save_steps : 0 save_total_limit : None save_on_each_node : False no_cuda : False seed : 1 fp16 : True fp16_opt_level : O1 fp16_backend : auto fp16_full_eval : False local_rank : -1 xpu_backend : None tpu_num_cores : None tpu_metrics_debug : False debug : [] dataloader_drop_last : True eval_steps : None dataloader_num_workers : 0 past_index : -1 run_name : None disable_tqdm : False remove_unused_columns : True label_names : None load_best_model_at_end : False metric_for_best_model : None greater_is_better : None ignore_data_skip : False sharded_ddp : [] deepspeed : None label_smoothing_factor : 0.0 adafactor : False group_by_length : False length_column_name : length report_to : [] ddp_find_unused_parameters : None dataloader_pin_memory : True skip_memory_metrics : True use_legacy_prediction_loop : False push_to_hub : False resume_from_checkpoint : None hub_model_id : None hub_token : None gradient_checkpointing : False push_to_hub_model_id : None push_to_hub_organization : None push_to_hub_token : None _n_gpu : 1 mp_parameters : max_sequence_length : 20 shuffle_buffer_size : 0 data_loader_engine : merlin eval_on_test_set : True eval_steps_on_train_set : 20 predict_top_k : 0 learning_rate_num_cosine_cycles_by_epoch : 1.25 log_predictions : False 
compute_metrics_each_n_steps : 1 experiments_group : default session_seq_length_max : 20 learning_rate_schedule : linear_with_warmup validate_every : -1 \n", + "\n", + "***** Launch training for day 1: *****\n", + "{'loss': 10.1835, 'learning_rate': 0.0005681078740165187, 'epoch': 0.74}\n", + "{'loss': 9.1997, 'learning_rate': 0.00046947803477753974, 'epoch': 1.48}\n", + "{'loss': 8.8927, 'learning_rate': 0.0003708481955385607, 'epoch': 2.22}\n", + "{'loss': 8.6939, 'learning_rate': 0.0002722183562995818, 'epoch': 2.96}\n", + "{'loss': 8.4857, 'learning_rate': 0.0001735885170606029, 'epoch': 3.7}\n", + "{'loss': 8.4036, 'learning_rate': 7.4958677821624e-05, 'epoch': 4.44}\n", + "{'train_runtime': 268.0277, 'train_samples_per_second': 0.019, 'train_steps_per_second': 12.611, 'train_loss': 8.904246891179733, 'epoch': 5.0}\n", + "\n", + "***** Evaluation results for day 2 (train set):*****\n", + "\n", + "{'train_/next-item/ndcg_at_10': 0.08619051426649094, 'train_/next-item/ndcg_at_20': 0.10135936737060547, 'train_/next-item/recall_at_10': 0.16093750298023224, 'train_/next-item/recall_at_20': 0.22109375894069672, 'train_/loss': 7.952397346496582, 'train_runtime': 0.6468, 'train_samples_per_second': 3958.17, 'train_steps_per_second': 30.923}\n", + "\n", + "***** Evaluation results for day 2 (eval set):*****\n", + "\n", + "{'eval_/next-item/ndcg_at_10': 0.08869394659996033, 'eval_/next-item/ndcg_at_20': 0.10502538084983826, 'eval_/next-item/recall_at_10': 0.16217996180057526, 'eval_/next-item/recall_at_20': 0.22693899273872375, 'eval_/loss': 8.337870597839355, 'eval_runtime': 2.2357, 'eval_samples_per_second': 4751.905, 'eval_steps_per_second': 37.124}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Task exception was never retrieved\n", + "future: ._handle_stream() done, defined at /usr/local/lib/python3.8/dist-packages/IPython/core/magics/script.py:211> exception=ValueError('Separator is not found, and chunk exceed the limit')>\n", + 
"Traceback (most recent call last):\n", + " File \"/usr/lib/python3.8/asyncio/streams.py\", line 540, in readline\n", + " line = await self.readuntil(sep)\n", + " File \"/usr/lib/python3.8/asyncio/streams.py\", line 618, in readuntil\n", + " raise exceptions.LimitOverrunError(\n", + "asyncio.exceptions.LimitOverrunError: Separator is not found, and chunk exceed the limit\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/usr/local/lib/python3.8/dist-packages/IPython/core/magics/script.py\", line 213, in _handle_stream\n", + " line = (await stream.readline()).decode(\"utf8\")\n", + " File \"/usr/lib/python3.8/asyncio/streams.py\", line 549, in readline\n", + " raise ValueError(e.args[0])\n", + "ValueError: Separator is not found, and chunk exceed the limit\n" + ] + } + ], + "source": [ + "%%bash\n", + "\n", + "FEATURE_SCHEMA_PATH=/workspace/examples/t4rec_paper_experiments/datasets_configs/ecom_rees46/rees46_schema.pbtxt\n", + "DATA_PATH=/transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/\n", + "NUM_EPOCHS=5\n", + "SEED=1\n", + "\n", + "python3 ../transf_exp_main_modified.py --output_dir ./tmp/ --overwrite_output_dir --do_train --do_eval --save_steps 0 --data_path $DATA_PATH --features_schema_path $FEATURE_SCHEMA_PATH --fp16 --data_loader_engine merlin --start_time_window_index 1 --final_time_window_index 2 --time_window_folder_pad_digits 4 --model_type xlnet --loss_type cross_entropy --per_device_eval_batch_size 128 --similarity_type concat_mlp --tf_out_activation tanh --inp_merge mlp --learning_rate_warmup_steps 0 --learning_rate_schedule linear_with_warmup --hidden_act gelu --num_train_epochs $NUM_EPOCHS --dataloader_drop_last --compute_metrics_each_n_steps 1 --session_seq_length_max 20 --eval_on_last_item_seq_only --mf_constrained_embeddings --layer_norm_featurewise --attn_type bi --mlm --per_device_train_batch_size 128 --learning_rate 
0.0006667377132554976 --dropout 0.0 --input_dropout 0.1 --weight_decay 3.910060265627374e-05 --d_model 192 --item_embedding_dim 448 --n_layer 3 --n_head 16 --label_smoothing 0.0 --stochastic_shared_embeddings_replacement_prob 0.1 --item_id_embeddings_init_std 0.11 --other_embeddings_init_std 0.02 --mlm_probability 0.30000000000000004 --eval_on_test_set --seed $SEED --report_to none" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "7012acaf", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "03/10/2023 06:41:17 - WARNING - __main__ - Process rank: -1, device: cuda:0, n_gpu: 1, distributed training: False, 16-bits training: True\n", + "03/10/2023 06:41:18 - WARNING - transformers4rec - Projecting inputs of NextItemPredictionTask to'448' As weight tying requires the input dimension '192' to be equal to the item-id embedding dimension '448'\n", + "[INFO|trainer.py:434] 2023-03-10 06:41:18,925 >> Using amp fp16 backend\n", + "03/10/2023 06:41:18 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - Training, Model and Data parameters {'data_path': '/transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/', 'features_schema_path': '/workspace/examples/t4rec_paper_experiments/datasets_configs/ecom_rees46/rees46_schema.pbtxt', 'start_time_window_index': 1, 'final_time_window_index': 2, 'time_window_folder_pad_digits': 4, 'no_incremental_training': False, 'training_time_window_size': 0, 'use_side_information_features': False, 'input_features_aggregation': 'concat', 'model_type': 'xlnet', 'tf_out_activation': 'tanh', 'mlm': True, 'mlm_probability': 0.30000000000000004, 'plm': False, 'plm_probability': 0.25, 'plm_max_span_length': 5, 'plm_mask_input': False, 'plm_permute_all': False, 'rtd': False, 'rtd_sample_from_batch': False, 'rtd_use_batch_interaction': False, 'rtd_discriminator_loss_weight': 50, 'rtd_generator_loss_weight': 1, 'rtd_tied_generator': False, 'd_model': 192, 
'n_layer': 3, 'n_head': 16, 'layer_norm_eps': 1e-12, 'initializer_range': 0.02, 'hidden_act': 'gelu', 'dropout': 0.0, 'summary_type': 'last', 'num_hidden_groups': 1, 'inner_group_num': 1, 'eval_on_last_item_seq_only': True, 'train_on_last_item_seq_only': False, 'mf_constrained_embeddings': True, 'item_embedding_dim': 448, 'numeric_features_project_to_embedding_dim': 0, 'numeric_features_soft_one_hot_encoding_num_embeddings': 0, 'stochastic_shared_embeddings_replacement_prob': 0.1, 'softmax_temperature': 1.0, 'label_smoothing': 0.0, 'embedding_dim_from_cardinality_multiplier': 2.0, 'item_id_embeddings_init_std': 0.11, 'other_embeddings_init_std': 0.02, 'layer_norm_featurewise': True, 'attn_type': 'bi', 'input_dropout': 0.1, 'loss_type': 'cross_entropy', 'similarity_type': 'concat_mlp', 'inp_merge': 'mlp', 'learning_rate_warmup_steps': 0, 'avg_session_length': None, 'output_dir': './tmp/', 'overwrite_output_dir': True, 'do_train': True, 'do_eval': True, 'do_predict': False, 'prediction_loss_only': False, 'per_device_train_batch_size': 128, 'per_device_eval_batch_size': 128, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'learning_rate': 0.0006667377132554976, 'weight_decay': 3.910060265627374e-05, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 5.0, 'max_steps': -1, 'lr_scheduler_type': 'linear', 'warmup_ratio': 0.0, 'warmup_steps': 0, 'log_level': -1, 'log_level_replica': -1, 'log_on_each_node': True, 'logging_dir': './tmp/runs/Mar10_06-41-16_7dfa224f788e', 'logging_first_step': False, 'logging_steps': 500, 'logging_nan_inf_filter': True, 'save_steps': 0, 'save_total_limit': None, 'save_on_each_node': False, 'no_cuda': False, 'seed': 2, 'fp16': True, 'fp16_opt_level': 'O1', 'fp16_backend': 'auto', 'fp16_full_eval': False, 'local_rank': -1, 'xpu_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 
'dataloader_drop_last': True, 'eval_steps': None, 'dataloader_num_workers': 0, 'past_index': -1, 'run_name': None, 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'sharded_ddp': [], 'deepspeed': None, 'label_smoothing_factor': 0.0, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': [], 'ddp_find_unused_parameters': None, 'dataloader_pin_memory': True, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_token': None, 'gradient_checkpointing': False, 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': None, '_n_gpu': 1, 'mp_parameters': '', 'max_sequence_length': 20, 'shuffle_buffer_size': 0, 'data_loader_engine': 'merlin', 'eval_on_test_set': True, 'eval_steps_on_train_set': 20, 'predict_top_k': 0, 'learning_rate_num_cosine_cycles_by_epoch': 1.25, 'log_predictions': False, 'compute_metrics_each_n_steps': 1, 'experiments_group': 'default', 'session_seq_length_max': 20, 'learning_rate_schedule': 'linear_with_warmup', 'validate_every': -1}\n", + "[INFO|trainer.py:1196] 2023-03-10 06:41:19,383 >> ***** Running training *****\n", + "[INFO|trainer.py:1197] 2023-03-10 06:41:19,383 >> Num examples = 86528\n", + "[INFO|trainer.py:1198] 2023-03-10 06:41:19,383 >> Num Epochs = 5\n", + "[INFO|trainer.py:1199] 2023-03-10 06:41:19,384 >> Instantaneous batch size per device = 128\n", + "[INFO|trainer.py:1200] 2023-03-10 06:41:19,384 >> Total train batch size (w. 
parallel, distributed & accumulation) = 128\n", + "[INFO|trainer.py:1201] 2023-03-10 06:41:19,384 >> Gradient Accumulation steps = 1\n", + "[INFO|trainer.py:1202] 2023-03-10 06:41:19,384 >> Total optimization steps = 3380\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "DLL 2023-03-10 06:41:18.926096 - PARAMETER data_path : /transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/ features_schema_path : /workspace/examples/t4rec_paper_experiments/datasets_configs/ecom_rees46/rees46_schema.pbtxt start_time_window_index : 1 final_time_window_index : 2 time_window_folder_pad_digits : 4 no_incremental_training : False training_time_window_size : 0 use_side_information_features : False input_features_aggregation : concat model_type : xlnet tf_out_activation : tanh mlm : True mlm_probability : 0.30000000000000004 plm : False plm_probability : 0.25 plm_max_span_length : 5 plm_mask_input : False plm_permute_all : False rtd : False rtd_sample_from_batch : False rtd_use_batch_interaction : False rtd_discriminator_loss_weight : 50 rtd_generator_loss_weight : 1 rtd_tied_generator : False d_model : 192 n_layer : 3 n_head : 16 layer_norm_eps : 1e-12 initializer_range : 0.02 hidden_act : gelu dropout : 0.0 summary_type : last num_hidden_groups : 1 inner_group_num : 1 eval_on_last_item_seq_only : True train_on_last_item_seq_only : False mf_constrained_embeddings : True item_embedding_dim : 448 numeric_features_project_to_embedding_dim : 0 numeric_features_soft_one_hot_encoding_num_embeddings : 0 stochastic_shared_embeddings_replacement_prob : 0.1 softmax_temperature : 1.0 label_smoothing : 0.0 embedding_dim_from_cardinality_multiplier : 2.0 item_id_embeddings_init_std : 0.11 other_embeddings_init_std : 0.02 layer_norm_featurewise : True attn_type : bi input_dropout : 0.1 loss_type : cross_entropy similarity_type : concat_mlp inp_merge : mlp learning_rate_warmup_steps : 0 avg_session_length : None output_dir : ./tmp/ overwrite_output_dir : 
True do_train : True do_eval : True do_predict : False prediction_loss_only : False per_device_train_batch_size : 128 per_device_eval_batch_size : 128 per_gpu_train_batch_size : None per_gpu_eval_batch_size : None gradient_accumulation_steps : 1 eval_accumulation_steps : None learning_rate : 0.0006667377132554976 weight_decay : 3.910060265627374e-05 adam_beta1 : 0.9 adam_beta2 : 0.999 adam_epsilon : 1e-08 max_grad_norm : 1.0 num_train_epochs : 5.0 max_steps : -1 lr_scheduler_type : linear warmup_ratio : 0.0 warmup_steps : 0 log_level : -1 log_level_replica : -1 log_on_each_node : True logging_dir : ./tmp/runs/Mar10_06-41-16_7dfa224f788e logging_first_step : False logging_steps : 500 logging_nan_inf_filter : True save_steps : 0 save_total_limit : None save_on_each_node : False no_cuda : False seed : 2 fp16 : True fp16_opt_level : O1 fp16_backend : auto fp16_full_eval : False local_rank : -1 xpu_backend : None tpu_num_cores : None tpu_metrics_debug : False debug : [] dataloader_drop_last : True eval_steps : None dataloader_num_workers : 0 past_index : -1 run_name : None disable_tqdm : False remove_unused_columns : True label_names : None load_best_model_at_end : False metric_for_best_model : None greater_is_better : None ignore_data_skip : False sharded_ddp : [] deepspeed : None label_smoothing_factor : 0.0 adafactor : False group_by_length : False length_column_name : length report_to : [] ddp_find_unused_parameters : None dataloader_pin_memory : True skip_memory_metrics : True use_legacy_prediction_loop : False push_to_hub : False resume_from_checkpoint : None hub_model_id : None hub_token : None gradient_checkpointing : False push_to_hub_model_id : None push_to_hub_organization : None push_to_hub_token : None _n_gpu : 1 mp_parameters : max_sequence_length : 20 shuffle_buffer_size : 0 data_loader_engine : merlin eval_on_test_set : True eval_steps_on_train_set : 20 predict_top_k : 0 learning_rate_num_cosine_cycles_by_epoch : 1.25 log_predictions : False 
compute_metrics_each_n_steps : 1 experiments_group : default session_seq_length_max : 20 learning_rate_schedule : linear_with_warmup validate_every : -1 \n", + "\n", + "***** Launch training for day 1: *****\n", + "{'loss': 10.1984, 'learning_rate': 0.0005683051336949965, 'epoch': 0.74}\n", + "{'loss': 9.1995, 'learning_rate': 0.0004696752944560177, 'epoch': 1.48}\n", + "{'loss': 8.9484, 'learning_rate': 0.0003710454552170388, 'epoch': 2.22}\n", + "{'loss': 8.7082, 'learning_rate': 0.0002724156159780598, 'epoch': 2.96}\n", + "{'loss': 8.5479, 'learning_rate': 0.00017378577673908085, 'epoch': 3.7}\n", + "{'loss': 8.4013, 'learning_rate': 7.515593750010194e-05, 'epoch': 4.44}\n", + "{'train_runtime': 267.6305, 'train_samples_per_second': 0.019, 'train_steps_per_second': 12.629, 'train_loss': 8.929472757373336, 'epoch': 5.0}\n", + "\n", + "***** Evaluation results for day 2 (train set):*****\n", + "\n", + "{'train_/next-item/ndcg_at_10': 0.0865924283862114, 'train_/next-item/ndcg_at_20': 0.10158973932266235, 'train_/next-item/recall_at_10': 0.15976563096046448, 'train_/next-item/recall_at_20': 0.21914063394069672, 'train_/loss': 7.99446964263916, 'train_runtime': 0.6446, 'train_samples_per_second': 3971.675, 'train_steps_per_second': 31.029}\n", + "\n", + "***** Evaluation results for day 2 (eval set):*****\n", + "\n", + "{'eval_/next-item/ndcg_at_10': 0.08677017688751221, 'eval_/next-item/ndcg_at_20': 0.10235893726348877, 'eval_/next-item/recall_at_10': 0.16114456951618195, 'eval_/next-item/recall_at_20': 0.223268061876297, 'eval_/loss': 8.346117973327637, 'eval_runtime': 2.2174, 'eval_samples_per_second': 4791.094, 'eval_steps_per_second': 37.43}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Task exception was never retrieved\n", + "future: ._handle_stream() done, defined at /usr/local/lib/python3.8/dist-packages/IPython/core/magics/script.py:211> exception=ValueError('Separator is not found, and chunk exceed the limit')>\n", + 
"Traceback (most recent call last):\n", + " File \"/usr/lib/python3.8/asyncio/streams.py\", line 540, in readline\n", + " line = await self.readuntil(sep)\n", + " File \"/usr/lib/python3.8/asyncio/streams.py\", line 618, in readuntil\n", + " raise exceptions.LimitOverrunError(\n", + "asyncio.exceptions.LimitOverrunError: Separator is not found, and chunk exceed the limit\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/usr/local/lib/python3.8/dist-packages/IPython/core/magics/script.py\", line 213, in _handle_stream\n", + " line = (await stream.readline()).decode(\"utf8\")\n", + " File \"/usr/lib/python3.8/asyncio/streams.py\", line 549, in readline\n", + " raise ValueError(e.args[0])\n", + "ValueError: Separator is not found, and chunk exceed the limit\n" + ] + } + ], + "source": [ + "%%bash\n", + "\n", + "FEATURE_SCHEMA_PATH=/workspace/examples/t4rec_paper_experiments/datasets_configs/ecom_rees46/rees46_schema.pbtxt\n", + "DATA_PATH=/transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/\n", + "NUM_EPOCHS=5\n", + "SEED=2\n", + "\n", + "python3 ../transf_exp_main_modified.py --output_dir ./tmp/ --overwrite_output_dir --do_train --do_eval --save_steps 0 --data_path $DATA_PATH --features_schema_path $FEATURE_SCHEMA_PATH --fp16 --data_loader_engine merlin --start_time_window_index 1 --final_time_window_index 2 --time_window_folder_pad_digits 4 --model_type xlnet --loss_type cross_entropy --per_device_eval_batch_size 128 --similarity_type concat_mlp --tf_out_activation tanh --inp_merge mlp --learning_rate_warmup_steps 0 --learning_rate_schedule linear_with_warmup --hidden_act gelu --num_train_epochs $NUM_EPOCHS --dataloader_drop_last --compute_metrics_each_n_steps 1 --session_seq_length_max 20 --eval_on_last_item_seq_only --mf_constrained_embeddings --layer_norm_featurewise --attn_type bi --mlm --per_device_train_batch_size 128 --learning_rate 
0.0006667377132554976 --dropout 0.0 --input_dropout 0.1 --weight_decay 3.910060265627374e-05 --d_model 192 --item_embedding_dim 448 --n_layer 3 --n_head 16 --label_smoothing 0.0 --stochastic_shared_embeddings_replacement_prob 0.1 --item_id_embeddings_init_std 0.11 --other_embeddings_init_std 0.02 --mlm_probability 0.30000000000000004 --eval_on_test_set --seed $SEED --report_to none" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "c8abb213", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "03/10/2023 06:50:42 - WARNING - __main__ - Process rank: -1, device: cuda:0, n_gpu: 1, distributed training: False, 16-bits training: True\n", + "03/10/2023 06:50:44 - WARNING - transformers4rec - Projecting inputs of NextItemPredictionTask to'448' As weight tying requires the input dimension '192' to be equal to the item-id embedding dimension '448'\n", + "[INFO|trainer.py:434] 2023-03-10 06:50:44,307 >> Using amp fp16 backend\n", + "03/10/2023 06:50:44 - INFO - examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs - Training, Model and Data parameters {'data_path': '/transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/', 'features_schema_path': '/workspace/examples/t4rec_paper_experiments/datasets_configs/ecom_rees46/rees46_schema.pbtxt', 'start_time_window_index': 1, 'final_time_window_index': 2, 'time_window_folder_pad_digits': 4, 'no_incremental_training': False, 'training_time_window_size': 0, 'use_side_information_features': False, 'input_features_aggregation': 'concat', 'model_type': 'xlnet', 'tf_out_activation': 'tanh', 'mlm': True, 'mlm_probability': 0.30000000000000004, 'plm': False, 'plm_probability': 0.25, 'plm_max_span_length': 5, 'plm_mask_input': False, 'plm_permute_all': False, 'rtd': False, 'rtd_sample_from_batch': False, 'rtd_use_batch_interaction': False, 'rtd_discriminator_loss_weight': 50, 'rtd_generator_loss_weight': 1, 'rtd_tied_generator': False, 'd_model': 192, 
'n_layer': 3, 'n_head': 16, 'layer_norm_eps': 1e-12, 'initializer_range': 0.02, 'hidden_act': 'gelu', 'dropout': 0.0, 'summary_type': 'last', 'num_hidden_groups': 1, 'inner_group_num': 1, 'eval_on_last_item_seq_only': True, 'train_on_last_item_seq_only': False, 'mf_constrained_embeddings': True, 'item_embedding_dim': 448, 'numeric_features_project_to_embedding_dim': 0, 'numeric_features_soft_one_hot_encoding_num_embeddings': 0, 'stochastic_shared_embeddings_replacement_prob': 0.1, 'softmax_temperature': 1.0, 'label_smoothing': 0.0, 'embedding_dim_from_cardinality_multiplier': 2.0, 'item_id_embeddings_init_std': 0.11, 'other_embeddings_init_std': 0.02, 'layer_norm_featurewise': True, 'attn_type': 'bi', 'input_dropout': 0.1, 'loss_type': 'cross_entropy', 'similarity_type': 'concat_mlp', 'inp_merge': 'mlp', 'learning_rate_warmup_steps': 0, 'avg_session_length': None, 'output_dir': './tmp/', 'overwrite_output_dir': True, 'do_train': True, 'do_eval': True, 'do_predict': False, 'prediction_loss_only': False, 'per_device_train_batch_size': 128, 'per_device_eval_batch_size': 128, 'per_gpu_train_batch_size': None, 'per_gpu_eval_batch_size': None, 'gradient_accumulation_steps': 1, 'eval_accumulation_steps': None, 'learning_rate': 0.0006667377132554976, 'weight_decay': 3.910060265627374e-05, 'adam_beta1': 0.9, 'adam_beta2': 0.999, 'adam_epsilon': 1e-08, 'max_grad_norm': 1.0, 'num_train_epochs': 5.0, 'max_steps': -1, 'lr_scheduler_type': 'linear', 'warmup_ratio': 0.0, 'warmup_steps': 0, 'log_level': -1, 'log_level_replica': -1, 'log_on_each_node': True, 'logging_dir': './tmp/runs/Mar10_06-50-41_7dfa224f788e', 'logging_first_step': False, 'logging_steps': 500, 'logging_nan_inf_filter': True, 'save_steps': 0, 'save_total_limit': None, 'save_on_each_node': False, 'no_cuda': False, 'seed': 3, 'fp16': True, 'fp16_opt_level': 'O1', 'fp16_backend': 'auto', 'fp16_full_eval': False, 'local_rank': -1, 'xpu_backend': None, 'tpu_num_cores': None, 'tpu_metrics_debug': False, 'debug': [], 
'dataloader_drop_last': True, 'eval_steps': None, 'dataloader_num_workers': 0, 'past_index': -1, 'run_name': None, 'disable_tqdm': False, 'remove_unused_columns': True, 'label_names': None, 'load_best_model_at_end': False, 'metric_for_best_model': None, 'greater_is_better': None, 'ignore_data_skip': False, 'sharded_ddp': [], 'deepspeed': None, 'label_smoothing_factor': 0.0, 'adafactor': False, 'group_by_length': False, 'length_column_name': 'length', 'report_to': [], 'ddp_find_unused_parameters': None, 'dataloader_pin_memory': True, 'skip_memory_metrics': True, 'use_legacy_prediction_loop': False, 'push_to_hub': False, 'resume_from_checkpoint': None, 'hub_model_id': None, 'hub_token': None, 'gradient_checkpointing': False, 'push_to_hub_model_id': None, 'push_to_hub_organization': None, 'push_to_hub_token': None, '_n_gpu': 1, 'mp_parameters': '', 'max_sequence_length': 20, 'shuffle_buffer_size': 0, 'data_loader_engine': 'merlin', 'eval_on_test_set': True, 'eval_steps_on_train_set': 20, 'predict_top_k': 0, 'learning_rate_num_cosine_cycles_by_epoch': 1.25, 'log_predictions': False, 'compute_metrics_each_n_steps': 1, 'experiments_group': 'default', 'session_seq_length_max': 20, 'learning_rate_schedule': 'linear_with_warmup', 'validate_every': -1}\n", + "[INFO|trainer.py:1196] 2023-03-10 06:50:44,786 >> ***** Running training *****\n", + "[INFO|trainer.py:1197] 2023-03-10 06:50:44,786 >> Num examples = 86528\n", + "[INFO|trainer.py:1198] 2023-03-10 06:50:44,786 >> Num Epochs = 5\n", + "[INFO|trainer.py:1199] 2023-03-10 06:50:44,786 >> Instantaneous batch size per device = 128\n", + "[INFO|trainer.py:1200] 2023-03-10 06:50:44,786 >> Total train batch size (w. 
parallel, distributed & accumulation) = 128\n", + "[INFO|trainer.py:1201] 2023-03-10 06:50:44,786 >> Gradient Accumulation steps = 1\n", + "[INFO|trainer.py:1202] 2023-03-10 06:50:44,786 >> Total optimization steps = 3380\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "DLL 2023-03-10 06:50:44.307714 - PARAMETER data_path : /transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/ features_schema_path : /workspace/examples/t4rec_paper_experiments/datasets_configs/ecom_rees46/rees46_schema.pbtxt start_time_window_index : 1 final_time_window_index : 2 time_window_folder_pad_digits : 4 no_incremental_training : False training_time_window_size : 0 use_side_information_features : False input_features_aggregation : concat model_type : xlnet tf_out_activation : tanh mlm : True mlm_probability : 0.30000000000000004 plm : False plm_probability : 0.25 plm_max_span_length : 5 plm_mask_input : False plm_permute_all : False rtd : False rtd_sample_from_batch : False rtd_use_batch_interaction : False rtd_discriminator_loss_weight : 50 rtd_generator_loss_weight : 1 rtd_tied_generator : False d_model : 192 n_layer : 3 n_head : 16 layer_norm_eps : 1e-12 initializer_range : 0.02 hidden_act : gelu dropout : 0.0 summary_type : last num_hidden_groups : 1 inner_group_num : 1 eval_on_last_item_seq_only : True train_on_last_item_seq_only : False mf_constrained_embeddings : True item_embedding_dim : 448 numeric_features_project_to_embedding_dim : 0 numeric_features_soft_one_hot_encoding_num_embeddings : 0 stochastic_shared_embeddings_replacement_prob : 0.1 softmax_temperature : 1.0 label_smoothing : 0.0 embedding_dim_from_cardinality_multiplier : 2.0 item_id_embeddings_init_std : 0.11 other_embeddings_init_std : 0.02 layer_norm_featurewise : True attn_type : bi input_dropout : 0.1 loss_type : cross_entropy similarity_type : concat_mlp inp_merge : mlp learning_rate_warmup_steps : 0 avg_session_length : None output_dir : ./tmp/ overwrite_output_dir : 
True do_train : True do_eval : True do_predict : False prediction_loss_only : False per_device_train_batch_size : 128 per_device_eval_batch_size : 128 per_gpu_train_batch_size : None per_gpu_eval_batch_size : None gradient_accumulation_steps : 1 eval_accumulation_steps : None learning_rate : 0.0006667377132554976 weight_decay : 3.910060265627374e-05 adam_beta1 : 0.9 adam_beta2 : 0.999 adam_epsilon : 1e-08 max_grad_norm : 1.0 num_train_epochs : 5.0 max_steps : -1 lr_scheduler_type : linear warmup_ratio : 0.0 warmup_steps : 0 log_level : -1 log_level_replica : -1 log_on_each_node : True logging_dir : ./tmp/runs/Mar10_06-50-41_7dfa224f788e logging_first_step : False logging_steps : 500 logging_nan_inf_filter : True save_steps : 0 save_total_limit : None save_on_each_node : False no_cuda : False seed : 3 fp16 : True fp16_opt_level : O1 fp16_backend : auto fp16_full_eval : False local_rank : -1 xpu_backend : None tpu_num_cores : None tpu_metrics_debug : False debug : [] dataloader_drop_last : True eval_steps : None dataloader_num_workers : 0 past_index : -1 run_name : None disable_tqdm : False remove_unused_columns : True label_names : None load_best_model_at_end : False metric_for_best_model : None greater_is_better : None ignore_data_skip : False sharded_ddp : [] deepspeed : None label_smoothing_factor : 0.0 adafactor : False group_by_length : False length_column_name : length report_to : [] ddp_find_unused_parameters : None dataloader_pin_memory : True skip_memory_metrics : True use_legacy_prediction_loop : False push_to_hub : False resume_from_checkpoint : None hub_model_id : None hub_token : None gradient_checkpointing : False push_to_hub_model_id : None push_to_hub_organization : None push_to_hub_token : None _n_gpu : 1 mp_parameters : max_sequence_length : 20 shuffle_buffer_size : 0 data_loader_engine : merlin eval_on_test_set : True eval_steps_on_train_set : 20 predict_top_k : 0 learning_rate_num_cosine_cycles_by_epoch : 1.25 log_predictions : False 
compute_metrics_each_n_steps : 1 experiments_group : default session_seq_length_max : 20 learning_rate_schedule : linear_with_warmup validate_every : -1 \n", + "\n", + "***** Launch training for day 1: *****\n", + "{'loss': 10.1517, 'learning_rate': 0.0005681078740165187, 'epoch': 0.74}\n", + "{'loss': 9.1378, 'learning_rate': 0.00046947803477753974, 'epoch': 1.48}\n", + "{'loss': 8.8854, 'learning_rate': 0.0003708481955385607, 'epoch': 2.22}\n", + "{'loss': 8.668, 'learning_rate': 0.0002722183562995818, 'epoch': 2.96}\n", + "{'loss': 8.4887, 'learning_rate': 0.0001735885170606029, 'epoch': 3.7}\n", + "{'loss': 8.3778, 'learning_rate': 7.4958677821624e-05, 'epoch': 4.44}\n", + "{'train_runtime': 267.4244, 'train_samples_per_second': 0.019, 'train_steps_per_second': 12.639, 'train_loss': 8.881638241378512, 'epoch': 5.0}\n", + "\n", + "***** Evaluation results for day 2 (train set):*****\n", + "\n", + "{'train_/next-item/ndcg_at_10': 0.08869679272174835, 'train_/next-item/ndcg_at_20': 0.10546129196882248, 'train_/next-item/recall_at_10': 0.15507812798023224, 'train_/next-item/recall_at_20': 0.22148437798023224, 'train_/loss': 7.946734428405762, 'train_runtime': 0.6445, 'train_samples_per_second': 3972.268, 'train_steps_per_second': 31.033}\n", + "\n", + "***** Evaluation results for day 2 (eval set):*****\n", + "\n", + "{'eval_/next-item/ndcg_at_10': 0.08494117110967636, 'eval_/next-item/ndcg_at_20': 0.10106566548347473, 'eval_/next-item/recall_at_10': 0.15568523108959198, 'eval_/next-item/recall_at_20': 0.21940888464450836, 'eval_/loss': 8.320558547973633, 'eval_runtime': 2.2146, 'eval_samples_per_second': 4797.31, 'eval_steps_per_second': 37.479}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Task exception was never retrieved\n", + "future: ._handle_stream() done, defined at /usr/local/lib/python3.8/dist-packages/IPython/core/magics/script.py:211> exception=ValueError('Separator is not found, and chunk exceed the limit')>\n", + 
"Traceback (most recent call last):\n", + " File \"/usr/lib/python3.8/asyncio/streams.py\", line 540, in readline\n", + " line = await self.readuntil(sep)\n", + " File \"/usr/lib/python3.8/asyncio/streams.py\", line 618, in readuntil\n", + " raise exceptions.LimitOverrunError(\n", + "asyncio.exceptions.LimitOverrunError: Separator is not found, and chunk exceed the limit\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/usr/local/lib/python3.8/dist-packages/IPython/core/magics/script.py\", line 213, in _handle_stream\n", + " line = (await stream.readline()).decode(\"utf8\")\n", + " File \"/usr/lib/python3.8/asyncio/streams.py\", line 549, in readline\n", + " raise ValueError(e.args[0])\n", + "ValueError: Separator is not found, and chunk exceed the limit\n" + ] + } + ], + "source": [ + "%%bash\n", + "\n", + "FEATURE_SCHEMA_PATH=/workspace/examples/t4rec_paper_experiments/datasets_configs/ecom_rees46/rees46_schema.pbtxt\n", + "DATA_PATH=/transformers4rec/examples/t4rec_paper_experiments/t4r_paper_repro/\n", + "NUM_EPOCHS=5\n", + "SEED=3\n", + "\n", + "python3 ../transf_exp_main_modified.py --output_dir ./tmp/ --overwrite_output_dir --do_train --do_eval --save_steps 0 --data_path $DATA_PATH --features_schema_path $FEATURE_SCHEMA_PATH --fp16 --data_loader_engine merlin --start_time_window_index 1 --final_time_window_index 2 --time_window_folder_pad_digits 4 --model_type xlnet --loss_type cross_entropy --per_device_eval_batch_size 128 --similarity_type concat_mlp --tf_out_activation tanh --inp_merge mlp --learning_rate_warmup_steps 0 --learning_rate_schedule linear_with_warmup --hidden_act gelu --num_train_epochs $NUM_EPOCHS --dataloader_drop_last --compute_metrics_each_n_steps 1 --session_seq_length_max 20 --eval_on_last_item_seq_only --mf_constrained_embeddings --layer_norm_featurewise --attn_type bi --mlm --per_device_train_batch_size 128 --learning_rate 
0.0006667377132554976 --dropout 0.0 --input_dropout 0.1 --weight_decay 3.910060265627374e-05 --d_model 192 --item_embedding_dim 448 --n_layer 3 --n_head 16 --label_smoothing 0.0 --stochastic_shared_embeddings_replacement_prob 0.1 --item_id_embeddings_init_std 0.11 --other_embeddings_init_std 0.02 --mlm_probability 0.30000000000000004 --eval_on_test_set --seed $SEED --report_to none" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/transf_exp_main_modified.py b/transf_exp_main_modified.py new file mode 100644 index 0000000000..08b1859457 --- /dev/null +++ b/transf_exp_main_modified.py @@ -0,0 +1,480 @@ +# +# Copyright (c) 2021, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
#

import glob
import logging
import os
from functools import partial

import numpy as np
import pandas as pd
import torch
import transformers
from examples.t4rec_paper_experiments.t4r_paper_repro.exp_outputs import (
    config_dllogger,
    creates_output_dir,
    log_aot_metric_results,
    log_metric_results,
    log_parameters,
)
from merlin.io import Dataset
from examples.t4rec_paper_experiments.t4r_paper_repro.transf_exp_args import (
    DataArguments,
    ModelArguments,
    TrainingArguments,
)
from transformers import HfArgumentParser, set_seed
from transformers.trainer_utils import is_main_process

import transformers4rec.torch as t4r
from merlin_standard_lib import Schema, Tag
from transformers4rec.torch import Trainer
from transformers4rec.torch.utils.data_utils import MerlinDataLoader
from transformers4rec.torch.utils.examples_utils import wipe_memory

logger = logging.getLogger(__name__)


def main():
    """Train a Transformers4Rec next-item prediction model incrementally over
    time windows, evaluate it, simulate inference on the last window, and
    export the model as a Triton PyTorch ensemble for benchmarking.

    Side effects:
    - Pickles the parsed argument dataclasses to the current working directory
      (used later by the benchmarking notebook to rebuild the model).
    - Writes evaluation CSV/TXT results under ``training_args.output_dir``.
    - Exports the model + NVTabular workflow to ``/workspace/models_for_benchmarking/``.
    """
    # Parse command line arguments into the three dataclasses.
    (data_args, model_args, training_args) = parse_command_line_args()
    # Persist args so the benchmarking notebook can reconstruct the exact
    # same model configuration later.
    # NOTE(review): filenames say "with_side_features" regardless of whether
    # --use_side_information_features was set — confirm this is intended.
    pd.to_pickle(data_args, 'data_args_with_side_features.pkl')
    pd.to_pickle(model_args, 'model_args_with_side_features.pkl')
    pd.to_pickle(training_args, 'training_args_with_side_features.pkl')

    # Ensuring to set W&B run name to null, so that a nice run name is generated
    training_args.run_name = None

    # Loading the schema of the dataset; optionally restrict it to the
    # item-id column only when side features are disabled.
    schema = Schema().from_proto_text(data_args.features_schema_path)
    if not data_args.use_side_information_features:
        schema = schema.select_by_tag(Tag.ITEM_ID)

    item_id_col = schema.select_by_tag(Tag.ITEM_ID).column_names[0]
    col_names = schema.column_names
    logger.info("Column names: {}".format(col_names))

    creates_output_dir(training_args)
    config_logging(training_args)
    set_seed(training_args.seed)

    # Masking config (mlm/plm/rtd/clm) derived from model args.
    masking_kwargs = get_masking_kwargs(model_args)

    # Pre-aggregation transforms: Stochastic Shared Embeddings (swap noise).
    pre_transforms = []
    if model_args.stochastic_shared_embeddings_replacement_prob > 0:
        pre_transforms.append(
            t4r.StochasticSwapNoise(
                pad_token=0,
                replacement_prob=model_args.stochastic_shared_embeddings_replacement_prob,
                schema=schema,
            )
        )

    # Post-aggregation transforms: input dropout and feature-wise layer norm.
    post_transforms = []
    if model_args.input_dropout > 0:
        input_dropout = t4r.TabularDropout(dropout_rate=model_args.input_dropout)
        post_transforms.append(input_dropout)
    if model_args.layer_norm_featurewise:
        post_transforms.append("layer-norm")

    # Categorical embedding sizes: item id gets an explicit dim; other
    # columns are inferred from cardinality unless an element-wise
    # aggregation forces all embeddings to the same (item) dim.
    embedding_dims = {item_id_col: model_args.item_embedding_dim}
    embedding_dim_default = model_args.item_embedding_dim
    infer_embedding_sizes = not model_args.input_features_aggregation.startswith("element-wise")

    # Per-column normal initializers: item id uses a dedicated std.
    embeddings_initializers = {}
    for col in col_names:
        if col == item_id_col:
            std = model_args.item_id_embeddings_init_std
        else:
            std = model_args.other_embeddings_init_std
        embeddings_initializers[col] = partial(torch.nn.init.normal_, mean=0.0, std=std)

    # Input module: processes tabular sequence features and prepares masked
    # inputs for the chosen language-modeling task.
    input_module = t4r.TabularSequenceFeatures.from_schema(
        schema,
        max_sequence_length=training_args.max_sequence_length,
        aggregation=model_args.input_features_aggregation,
        d_output=model_args.d_model,
        pre=pre_transforms,
        post=post_transforms,
        # Embedding Features args
        embedding_dims=embedding_dims,
        embedding_dim_default=embedding_dim_default,
        infer_embedding_sizes=infer_embedding_sizes,
        infer_embedding_sizes_multiplier=model_args.embedding_dim_from_cardinality_multiplier,
        embeddings_initializers=embeddings_initializers,
        continuous_soft_embeddings=(
            model_args.numeric_features_soft_one_hot_encoding_num_embeddings > 0
        ),
        soft_embedding_cardinality_default=(
            model_args.numeric_features_soft_one_hot_encoding_num_embeddings
        ),
        soft_embedding_dim_default=model_args.numeric_features_project_to_embedding_dim,
        **masking_kwargs,
    )

    # Loss function: Cross-entropy with label smoothing
    label_smoothing_xe_loss = t4r.LabelSmoothCrossEntropyLoss(
        reduction="mean", smoothing=model_args.label_smoothing
    )

    # Metrics: NDCG@10, NDCG@20, Recall@10, Recall@20
    metrics = [
        t4r.ranking_metric.NDCGAt(top_ks=[10, 20], labels_onehot=True),
        t4r.ranking_metric.RecallAt(top_ks=[10, 20], labels_onehot=True),
    ]

    # Next-item prediction task (optionally with tied weights).
    prediction_task = t4r.NextItemPredictionTask(
        weight_tying=model_args.mf_constrained_embeddings,
        softmax_temperature=model_args.softmax_temperature,
        metrics=metrics,
        loss=label_smoothing_xe_loss,
    )

    model_config = get_model_config(training_args, model_args)

    # Generates the final PyTorch model
    model = model_config.to_torch_model(input_module, prediction_task)

    trainer = Trainer(
        model=model,
        args=training_args,
        schema=schema,
        compute_metrics=True,
        incremental_logging=True,
    )

    log_parameters(trainer, data_args, model_args, training_args)

    results_over_time = incremental_train_eval(
        trainer,
        start_time_index=data_args.start_time_window_index,
        end_time_index=data_args.final_time_window_index,
        input_dir=data_args.data_path,
        training_args=training_args,
        data_args=data_args,
    )

    if training_args.do_eval:
        logger.info("Computing and logging AOT (Average Over Time) metrics")
        results_df = pd.DataFrame.from_dict(results_over_time, orient="index")
        results_df.reset_index().to_csv(
            os.path.join(training_args.output_dir, "eval_train_results.csv"),
            index=False,
        )

        # Average each metric over all time windows.
        results_avg_time = dict(results_df.mean())
        results_avg_time = {f"{k}_AOT": v for k, v in results_avg_time.items()}
        # Logging to W&B / Tensorboard
        trainer.log(results_avg_time)

        log_aot_metric_results(training_args.output_dir, results_avg_time)

        # Mimic the inference by manually computing recall@10 using the
        # evaluation data of the last time-index.
        eval_path = os.path.join(
            data_args.data_path,
            str(
                data_args.final_time_window_index,
            ).zfill(data_args.time_window_folder_pad_digits),
            "test.parquet" if training_args.eval_on_test_set else "valid.parquet",
        )
        prediction_data = pd.read_parquet(eval_path)
        # Label: the last item id of each session sequence.
        labels = prediction_data["sess_pid_seq"].apply(lambda x: x[-1]).values

        # Truncate input sequences up to last item - 1 to mimic the inference
        def mask_last_interaction(x):
            return list(x[:-1])

        list_columns = schema.select_by_tag("list").column_names
        for col in list_columns:
            prediction_data[col] = prediction_data[col].apply(mask_last_interaction)
        # Get top-10 predictions
        test_loader = MerlinDataLoader.from_schema(
            schema,
            Dataset(prediction_data),
            training_args.per_device_eval_batch_size,
            max_sequence_length=training_args.max_sequence_length,
            shuffle=False,
        )
        trainer.test_dataloader = test_loader
        trainer.args.predict_top_k = 10
        topk_preds = trainer.predict(test_loader).predictions[0]
        # Compute recall@10
        recall_10 = recall(topk_preds, labels)

        logger.info(f"Recall@10 of manually masked test data = {str(recall_10)}")
        output_file = os.path.join(training_args.output_dir, "eval_results_over_time.txt")
        with open(output_file, "a") as writer:
            writer.write(f"\n***** Recall@10 of simulated inference = {recall_10} *****\n")
        # Verify that the recall@10 from trainer.evaluate() matches the
        # recall@10 calculated manually (within 10% relative tolerance).
        if not isinstance(input_module.masking, t4r.masking.PermutationLanguageModeling):
            # TODO fix inference discrepancy for permutation language modeling
            assert np.isclose(
                recall_10, results_over_time[2]["eval_/next-item/recall_at_10"], rtol=0.1
            )

        # Export model + preprocessing workflow as a Triton ensemble for
        # benchmarking. Imported lazily because nvtabular is only needed here.
        import nvtabular as nvt
        from merlin.schema.tags import Tags
        from nvtabular.inference.triton import export_pytorch_ensemble

        os.makedirs('/workspace/models_for_benchmarking/', exist_ok=True)
        ds = nvt.Dataset(prediction_data)
        sess_pid_seq = ['sess_pid_seq'] >> nvt.ops.AddMetadata(tags=[Tags.CATEGORICAL])
        wf = nvt.Workflow(sess_pid_seq)
        wf.fit(ds)
        # Property access presumably forces dtype inference on the fitted
        # workflow before export — TODO confirm it is required.
        wf.output_dtypes

        export_pytorch_ensemble(
            model,
            wf,
            sparse_max=trainer.get_train_dataloader().dataset.sparse_max,
            name="t4r_pytorch",
            model_path="/workspace/models_for_benchmarking/",
            label_columns=[],
        )


def recall(predicted_items: np.ndarray, real_items: np.ndarray) -> float:
    """Compute recall@k between top-k predicted item ids and label item ids.

    Parameters
    ----------
    predicted_items : np.ndarray
        Array of shape ``(batch_size, top_k)`` with predicted item ids.
    real_items : np.ndarray
        Array of label item ids; id 0 is treated as padding and those rows
        are excluded from the average.

    Returns
    -------
    float
        Mean recall over the valid (non-padded) rows.
    """
    bs, top_k = predicted_items.shape
    valid_rows = real_items != 0

    # reshape predictions and labels to compare
    # the top-k predicted item-ids with the label id.
    real_items = real_items.reshape(bs, 1, -1)
    predicted_items = predicted_items.reshape(bs, 1, top_k)

    num_relevant = real_items.shape[-1]
    predicted_correct_sum = (predicted_items == real_items).sum(-1)
    predicted_correct_sum = predicted_correct_sum[valid_rows]
    recall_per_row = predicted_correct_sum / num_relevant
    return np.mean(recall_per_row)


def incremental_train_eval(
    trainer, start_time_index, end_time_index, input_dir, training_args, data_args
):
    """
    Performs incremental training and evaluation.
    Iteratively train using data of a given window index and evaluate on the
    validation data of the following index.
    Parameters
    ----------
    start_time_index: int
        The start index for training, it should match the partitions of the data directory
    end_time_index: int
        The end index for training, it should match the partitions of the data directory
    input_dir: str
        The input directory where the parquet files were saved based on partition column
    Returns
    -------
    results_over_time: dict
        The average over time of ranking metrics, keyed by evaluation time index.
    """
    results_over_time = {}
    for time_index in range(start_time_index, end_time_index):
        # 1. Set data: train on window `time_index`, evaluate on `time_index + 1`.
        time_index_train = time_index
        time_index_eval = time_index + 1
        train_paths = glob.glob(
            os.path.join(
                input_dir,
                str(time_index_train).zfill(data_args.time_window_folder_pad_digits),
                "train.parquet",
            )
        )
        eval_paths = glob.glob(
            os.path.join(
                input_dir,
                str(time_index_eval).zfill(data_args.time_window_folder_pad_digits),
                "test.parquet" if training_args.eval_on_test_set else "valid.parquet",
            )
        )

        # 2. Train on train data of time_index
        if training_args.do_train:
            print("\n***** Launch training for day %s: *****" % time_index)
            trainer.train_dataset_or_path = train_paths
            # Restart the LR schedule for each incremental window.
            trainer.reset_lr_scheduler()
            trainer.train()

        if training_args.do_eval:

            # 3. Evaluate on train data of time_index
            trainer.eval_dataset_or_path = train_paths
            train_metrics = trainer.evaluate(metric_key_prefix="train")
            print("\n***** Evaluation results for day %s (train set):*****\n" % time_index_eval)
            print(train_metrics)

            log_metric_results(
                training_args.output_dir,
                train_metrics,
                prefix="train",
                time_index=time_index_eval,
            )

            # free GPU for next day training
            wipe_memory()

            # 4. Evaluate on valid/test data of time_index+1
            trainer.eval_dataset_or_path = eval_paths
            eval_metrics = trainer.evaluate(metric_key_prefix="eval")
            print("\n***** Evaluation results for day %s (eval set):*****\n" % time_index_eval)
            print(eval_metrics)

            log_metric_results(
                training_args.output_dir,
                eval_metrics,
                prefix="eval",
                time_index=time_index_eval,
            )

            # free GPU for next day training
            wipe_memory()

            results_over_time[time_index_eval] = {
                **eval_metrics,
                **train_metrics,
            }

    return results_over_time


def get_masking_kwargs(model_args):
    """Return the masking kwargs for TabularSequenceFeatures.from_schema().

    Precedence: plm > rtd > mlm > clm (the first enabled flag wins).
    """
    kwargs = {}
    if model_args.plm:
        kwargs = {
            "masking": "plm",
            "plm_probability": model_args.plm_probability,
            "max_span_length": model_args.plm_max_span_length,
            "permute_all": model_args.plm_permute_all,
        }
    elif model_args.rtd:
        kwargs = {
            "masking": "rtd",
            "sample_from_batch": model_args.rtd_sample_from_batch,
            # rtd_use_batch_interaction=?
            # rtd_discriminator_loss_weight=?
            # rtd_generator_loss_weight=?
            # rtd_tied_generator=?
        }
    elif model_args.mlm:
        kwargs = {"masking": "mlm", "mlm_probability": model_args.mlm_probability}
    else:
        kwargs = {"masking": "clm"}

    return kwargs


def get_model_config(training_args, model_args):
    """Build the transformer config matching ``model_args.model_type``.

    Raises
    ------
    ValueError
        If ``model_args.model_type`` is not one of the supported types.
        (Previously an unknown type surfaced as an opaque UnboundLocalError.)
    """
    kwargs = {}

    if model_args.model_type == "gpt2":
        model_build_fn = t4r.GPT2Config.build
    elif model_args.model_type == "xlnet":
        model_build_fn = t4r.XLNetConfig.build
        kwargs = {
            "summary_type": model_args.summary_type,
            "attn_type": model_args.attn_type,
        }
    elif model_args.model_type == "electra":
        model_build_fn = t4r.ElectraConfig.build
    elif model_args.model_type == "albert":
        model_build_fn = t4r.AlbertConfig.build
        # -1 means "one group per layer" (no parameter sharing across groups).
        num_hidden_groups = model_args.num_hidden_groups
        if model_args.num_hidden_groups == -1:
            num_hidden_groups = model_args.n_layer
        kwargs = {
            "num_hidden_groups": num_hidden_groups,
            "inner_group_num": model_args.inner_group_num,
        }
    elif model_args.model_type == "transfoxl":
        model_build_fn = t4r.TransfoXLConfig.build
    else:
        raise ValueError(
            "Unsupported model_type: {!r}. Expected one of: "
            "gpt2, xlnet, electra, albert, transfoxl".format(model_args.model_type)
        )

    model_config = model_build_fn(
        total_seq_length=training_args.max_sequence_length,
        d_model=model_args.d_model,
        n_head=model_args.n_head,
        n_layer=model_args.n_layer,
        hidden_act=model_args.hidden_act,
        initializer_range=model_args.initializer_range,
        layer_norm_eps=model_args.layer_norm_eps,
        dropout=model_args.dropout,
        pad_token=0,
        **kwargs,
    )

    return model_config


def parse_command_line_args():
    """Parse CLI args into (DataArguments, ModelArguments, TrainingArguments),
    adapting legacy paper-reproducibility argument names to the new ones.
    """
    parser = HfArgumentParser((DataArguments, ModelArguments, TrainingArguments))
    (
        data_args,
        model_args,
        training_args,
    ) = parser.parse_args_into_dataclasses()

    # Adapting arguments used in the original paper reproducibility script to the new ones
    if training_args.session_seq_length_max:
        training_args.max_sequence_length = training_args.session_seq_length_max

    if training_args.learning_rate_schedule:
        training_args.lr_scheduler_type = training_args.learning_rate_schedule.replace(
            "_with_warmup", ""
        )

    if model_args.input_features_aggregation == "elementwise_sum_multiply_item_embedding":
        model_args.input_features_aggregation = "element-wise-sum-item-multi"

    return data_args, model_args, training_args


def config_logging(training_args):
    """Configure python/transformers logging and DLLogger for this run.

    Only the main process (local_rank in {-1, 0}) logs at INFO level; replicas
    log at WARN to avoid duplicated output in distributed runs.
    """
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )

    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()

    config_dllogger(training_args.output_dir)


if __name__ == "__main__":
    main()