Update transformers example (#180)

* Move transformers example * Add new example based on more modern example * Update README link * Add Ray Train+Transformers example * Set correct minimum Comet version * Fix the test matrix * Try smaller batch size to reduce CPU memory pressure on CI * Fix last links
comet-ml · Jul 25, 2024 · b6fd213 · b6fd213
1 parent 4548e85
commit b6fd213
Show file tree

Hide file tree

Showing 11 changed files with 461 additions and 7 deletions.
diff --git a/.github/workflows/test-examples.yml b/.github/workflows/test-examples.yml
@@ -23,7 +23,6 @@ jobs:
           - integrations/model-optimization/ray-tune/notebooks/Comet_and_Ray.ipynb
           - integrations/model-training/composer/notebooks/comet_composer.ipynb
           - integrations/model-training/fastai/notebooks/fastai_hello_world.ipynb
-          - integrations/model-training/hugging_face/notebooks/Comet_with_Hugging_Face_Trainer.ipynb
           - integrations/model-training/keras/notebooks/Comet_with_Keras.ipynb
           - integrations/model-training/lightgbm/notebooks/Comet_and_LightGBM.ipynb
           - integrations/model-training/prophet/notebooks/Comet_and_Prophet.ipynb
@@ -33,9 +32,11 @@ jobs:
           - integrations/model-training/pytorch/notebooks/Comet_Pytorch_Tensorboard.ipynb
           - integrations/model-training/pytorch/notebooks/Comet_Pytorch_TensorboardX.ipynb
           - integrations/model-training/pytorch/notebooks/Histogram_Logging_Pytorch.ipynb
+          - integrations/model-training/ray-train/notebooks/Comet_with_ray_train_huggingface_transformers.ipynb
           - integrations/model-training/ray-train/notebooks/Comet_with_ray_train_keras.ipynb
           - integrations/model-training/ray-train/notebooks/Comet_with_ray_train_xgboost.ipynb
           - integrations/model-training/tensorflow/notebooks/Comet_and_Tensorflow.ipynb
+          - integrations/model-training/transformers/notebooks/Comet_with_Hugging_Face_Trainer.ipynb
           - integrations/model-training/yolov5/notebooks/Comet_and_YOLOv5.ipynb
           - integrations/model-training/yolov8/notebooks/YOLOv8_and_Comet.ipynb
           - integrations/reinforcement-learning/gymnasium/notebooks/comet_gymnasium_example.ipynb
@@ -107,7 +108,6 @@ jobs:
           - {script: "integrations/model-optimization/optuna/optuna-hello-world/optuna-hello-world.py", arg: ""}
           - {script: "integrations/model-training/composer/mosaicml-getting-started/mosaicml-getting-started.py", arg: ""}
           - {script: "integrations/model-training/fastai/fastai-hello-world/fastai_hello_world.py", arg: ""}
-          - {script: "integrations/model-training/hugging_face/transformers-distilbert-fine-tuning/transformers-distilbert-fine-tuning.py", arg: ""}
           - {script: "integrations/model-training/keras/keras-mnist-dnn/keras-mnist-dnn.py", arg: ""}
           - {script: "integrations/model-training/mlflow/mlflow-hello-world/mlflow-hello-world.py", arg: "run"}
           - {script: "integrations/model-training/pytorch-lightning/pytorch-lightning-optimizer/pytorch-lightning-optimizer.py", arg: ""}
@@ -116,6 +116,8 @@ jobs:
           - {script: "integrations/model-training/pytorch/pytorch-tensorboard/pytorch-tensorboard-example.py", arg: ""}
           - {script: "integrations/model-training/scikit-learn/sklearn-classification-example/comet-scikit-classification-example.py", arg: "run"}
           - {script: "integrations/model-training/scikit-learn/sklearn-nlp-example/comet-scikit-nlp-example.py", args: ""}
+          - {script: "integrations/model-training/transformers/transformers-distilbert-fine-tuning/transformers-distilbert-fine-tuning.py", arg: ""}
+          - {script: "integrations/model-training/transformers/transformers-google-bert-fine-tuning/transformers-google-bert-fine-tuning.py", arg: ""}
           - {script: "integrations/model-training/xgboost/xgboost-california/xgboost-california.py", arg: ""}
           - {script: "integrations/workflow-orchestration/metaflow/metaflow-hello-world/helloworld.py", arg: "run"}
           - {script: "integrations/workflow-orchestration/metaflow/metaflow-model-evaluation/metaflow-model-evaluation.py", arg: "run --max-workers 1 --n_samples 100"}

diff --git a/...ns/model-training/ray-train/notebooks/Comet_with_ray_train_huggingface_transformers.ipynb b/...ns/model-training/ray-train/notebooks/Comet_with_ray_train_huggingface_transformers.ipynb
@@ -0,0 +1,314 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<img src=\"https://cdn.comet.ml/img/notebook_logo.png\">"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "[Comet](https://www.comet.com/site/products/ml-experiment-tracking/?utm_campaign=ray_train&utm_medium=colab) is an MLOps Platform that is designed to help Data Scientists and Teams build better models faster! Comet provides tooling to track, Explain, Manage, and Monitor your models in a single place! It works with Jupyter Notebooks and Scripts and most importantly it's 100% free to get started!\n",
+    "\n",
+    "[Ray Train](https://docs.ray.io/en/latest/train/train.html) abstracts away the complexity of setting up a distributed training system.\n",
+    "\n",
+    "Instrument your runs with Comet to start managing experiments, create dataset versions and track hyperparameters for faster and easier reproducibility and collaboration.\n",
+    "\n",
+    "[Find more information about our integration with Ray Train](https://www.comet.ml/docs/v2/integrations/ml-frameworks/ray/)\n",
+    "\n",
+    "Get a preview for what's to come. Check out a completed experiment created from this notebook [here](https://www.comet.com/examples/comet-example-ray-train-keras/99d169308c854be7ac222c995a2bfa26?experiment-tab=systemMetrics).\n",
+    "\n",
+    "This example is based on the [Ray Train getting-started guide for Hugging Face Transformers](https://docs.ray.io/en/latest/train/getting-started-transformers.html)."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "ZYchV5RWwdv5"
+   },
+   "source": [
+    "# Install Dependencies"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "DJnmqphuY2eI"
+   },
+   "outputs": [],
+   "source": [
+    "%pip install \"comet_ml>=3.31.5\" \"ray[air]>=2.1.0\" \"transformers>=4.43.0\" \"accelerate>=0.12.0\" \"datasets\" \"sentencepiece\" scipy \"scikit-learn\" protobuf \"torch>=1.3\" evaluate"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "crOcPHobwhGL"
+   },
+   "source": [
+    "# Initialize Comet"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "HNQRM0U3caiY"
+   },
+   "outputs": [],
+   "source": [
+    "import comet_ml\n",
+    "import comet_ml.integration.ray\n",
+    "\n",
+    "comet_ml.init()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "cgqwGSwtzVWD"
+   },
+   "source": [
+    "# Import Dependencies"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "id": "e-5rRYaUw5AF"
+   },
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import threading\n",
+    "\n",
+    "import evaluate\n",
+    "import numpy as np\n",
+    "from datasets import load_dataset\n",
+    "\n",
+    "from transformers import (\n",
+    "    AutoModelForSequenceClassification,\n",
+    "    AutoTokenizer,\n",
+    "    Trainer,\n",
+    "    TrainingArguments,\n",
+    "    enable_full_determinism,\n",
+    ")\n",
+    "\n",
+    "import ray.train.huggingface.transformers\n",
+    "from ray.train import ScalingConfig, RunConfig\n",
+    "from ray.train.torch import TorchTrainer"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Models\n",
+    "PRE_TRAINED_MODEL_NAME = \"google-bert/bert-base-cased\"\n",
+    "SEED = 42\n",
+    "\n",
+    "enable_full_determinism(SEED)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Prepare your dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_dataset():\n",
+    "    \"\"\"Build small tokenized train and eval splits of the `yelp_review_full` dataset.\n",
+    "\n",
+    "    Returns:\n",
+    "        A `(train_dataset, eval_dataset)` tuple. Each split is shuffled with\n",
+    "        `SEED` and reduced to its first 100 examples before tokenization.\n",
+    "    \"\"\"\n",
+    "    # Use the shared constant so the tokenizer always matches the configured model\n",
+    "    tokenizer = AutoTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)\n",
+    "\n",
+    "    def tokenize_function(examples):\n",
+    "        # Pad to max_length and truncate longer texts\n",
+    "        return tokenizer(examples[\"text\"], padding=\"max_length\", truncation=True)\n",
+    "\n",
+    "    dataset = load_dataset(\"yelp_review_full\")\n",
+    "    # Keep only 100 shuffled examples per split\n",
+    "    dataset[\"train\"] = dataset[\"train\"].shuffle(seed=SEED).select(range(100))\n",
+    "    dataset[\"test\"] = dataset[\"test\"].shuffle(seed=SEED).select(range(100))\n",
+    "\n",
+    "    tokenized_datasets = dataset.map(tokenize_function, batched=True)\n",
+    "\n",
+    "    small_train_dataset = tokenized_datasets[\"train\"]\n",
+    "    small_eval_dataset = tokenized_datasets[\"test\"]\n",
+    "    return (small_train_dataset, small_eval_dataset)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "TJuThf1TxP_G"
+   },
+   "source": [
+    "# Define your distributed training function\n",
+    "\n",
+    "This function is sent to, and executed on, each Ray Train worker."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def train_func(config):\n",
+    "    \"\"\"Fine-tune the model on a Ray Train worker and log the run to Comet.\n",
+    "\n",
+    "    Args:\n",
+    "        config: Training loop config. It is passed to `comet_worker_logger`\n",
+    "            and `config[\"epochs\"]` sets the number of training epochs.\n",
+    "    \"\"\"\n",
+    "    # Imported inside the function so it does not rely on the notebook-level\n",
+    "    # `comet_ml` name being available where this function executes\n",
+    "    from comet_ml import get_running_experiment\n",
+    "    from comet_ml.integration.ray import comet_worker_logger\n",
+    "\n",
+    "    with comet_worker_logger(config):\n",
+    "        small_train_dataset, small_eval_dataset = get_dataset()\n",
+    "\n",
+    "        # Model (same checkpoint as the tokenizer; num_labels=5 presumably\n",
+    "        # matches the label count of the dataset from get_dataset() - verify)\n",
+    "        model = AutoModelForSequenceClassification.from_pretrained(\n",
+    "            PRE_TRAINED_MODEL_NAME, num_labels=5\n",
+    "        )\n",
+    "\n",
+    "        # Evaluation Metrics\n",
+    "        metric = evaluate.load(\"accuracy\")\n",
+    "\n",
+    "        def compute_metrics(eval_pred):\n",
+    "            logits, labels = eval_pred\n",
+    "            predictions = np.argmax(logits, axis=-1)\n",
+    "\n",
+    "            # Only log the confusion matrix when a Comet experiment is active\n",
+    "            running_experiment = get_running_experiment()\n",
+    "            if running_experiment:\n",
+    "                # Pass by keyword: the true labels come first in Comet's\n",
+    "                # signature, so positional (predictions, labels) swaps the axes\n",
+    "                running_experiment.log_confusion_matrix(\n",
+    "                    y_true=labels, y_predicted=predictions\n",
+    "                )\n",
+    "\n",
+    "            return metric.compute(predictions=predictions, references=labels)\n",
+    "\n",
+    "        # Hugging Face Trainer\n",
+    "        training_args = TrainingArguments(\n",
+    "            do_eval=True,\n",
+    "            do_train=True,\n",
+    "            eval_strategy=\"epoch\",\n",
+    "            num_train_epochs=config[\"epochs\"],\n",
+    "            output_dir=\"./results\",\n",
+    "            overwrite_output_dir=True,\n",
+    "            per_device_eval_batch_size=4,\n",
+    "            per_device_train_batch_size=4,\n",
+    "            report_to=[\"comet_ml\"],\n",
+    "            seed=SEED,\n",
+    "        )\n",
+    "        trainer = Trainer(\n",
+    "            model=model,\n",
+    "            args=training_args,\n",
+    "            train_dataset=small_train_dataset,\n",
+    "            eval_dataset=small_eval_dataset,\n",
+    "            compute_metrics=compute_metrics,\n",
+    "        )\n",
+    "\n",
+    "        # Report Metrics and Checkpoints to Ray Train\n",
+    "        callback = ray.train.huggingface.transformers.RayTrainReportCallback()\n",
+    "        trainer.add_callback(callback)\n",
+    "\n",
+    "        # Prepare Transformers Trainer\n",
+    "        trainer = ray.train.huggingface.transformers.prepare_trainer(trainer)\n",
+    "\n",
+    "        # Start Training\n",
+    "        trainer.train()\n",
+    "\n",
+    "    # End the experiment only if one is still running, to avoid calling\n",
+    "    # .end() on None\n",
+    "    running_experiment = get_running_experiment()\n",
+    "    if running_experiment:\n",
+    "        running_experiment.end()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Define the function that schedules the distributed job"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def train(num_workers: int = 2, use_gpu: bool = False, epochs=1):\n",
+    "    \"\"\"Schedule the distributed fine-tuning job with Ray Train.\n",
+    "\n",
+    "    Args:\n",
+    "        num_workers: Number of Ray Train workers to request.\n",
+    "        use_gpu: Whether the workers should use GPUs.\n",
+    "        epochs: Number of training epochs, forwarded to `train_func`\n",
+    "            through the training loop config.\n",
+    "\n",
+    "    Returns:\n",
+    "        The object returned by `TorchTrainer.fit()`.\n",
+    "    \"\"\"\n",
+    "    scaling_config = ScalingConfig(num_workers=num_workers, use_gpu=use_gpu)\n",
+    "    # Forward the caller's epoch count instead of a hard-coded value\n",
+    "    config = {\"use_gpu\": use_gpu, \"epochs\": epochs}\n",
+    "\n",
+    "    callback = comet_ml.integration.ray.CometTrainLoggerCallback(\n",
+    "        config, project_name=\"comet-example-ray-train-hugginface-transformers\"\n",
+    "    )\n",
+    "\n",
+    "    ray_trainer = TorchTrainer(\n",
+    "        train_func,\n",
+    "        scaling_config=scaling_config,\n",
+    "        train_loop_config=config,\n",
+    "        run_config=RunConfig(callbacks=[callback]),\n",
+    "    )\n",
+    "    return ray_trainer.fit()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Train the model\n",
+    "\n",
+    "Ray will wait indefinitely if we request more workers (`num_workers`) than the available resources allow. The code below ensures we never request more CPUs than are available locally."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ideal_num_workers = 2\n",
+    "\n",
+    "# os.cpu_count() returns None when the CPU count cannot be determined;\n",
+    "# fall back to 1 so the subtraction below cannot raise a TypeError\n",
+    "available_local_cpu_count = (os.cpu_count() or 1) - 1\n",
+    "\n",
+    "# Never request more workers than the locally available CPUs, but always\n",
+    "# start at least one worker\n",
+    "num_workers = max(1, min(ideal_num_workers, available_local_cpu_count))\n",
+    "\n",
+    "train(num_workers, use_gpu=False, epochs=5)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "colab": {
+   "provenance": []
+  },
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.1"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/...oks/Comet_with_Hugging_Face_Trainer.ipynb → ...oks/Comet_with_Hugging_Face_Trainer.ipynb b/...oks/Comet_with_Hugging_Face_Trainer.ipynb → ...oks/Comet_with_Hugging_Face_Trainer.ipynb
@@ -42,7 +42,7 @@
    },
    "outputs": [],
    "source": [
-    "%pip install -U \"comet_ml>=3.44.0\" torch datasets transformers scikit-learn accelerate"
+    "%pip install -U \"comet_ml>=3.44.0\" \"transformers>=4.42.2\" torch datasets scikit-learn accelerate"
    ]
   },
   {
@@ -320,6 +320,22 @@
     ")\n",
     "trainer.train()"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "comet_ml.get_running_experiment().end()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {
@@ -343,7 +359,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.12"
+   "version": "3.9.1"
   }
  },
  "nbformat": 4,