
Commit ddd63a9

Minor error handling
1 parent 49322c9 commit ddd63a9

File tree

1 file changed (+179, -43 lines)

src/target_tools/ollama/src/fine_tuning/llama_fine_tuning.ipynb

Lines changed: 179 additions & 43 deletions
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "id": "ae5c6dc9-d246-4f47-a4c0-c1c07da0b901",
    "metadata": {},
    "outputs": [],
@@ -12,7 +12,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "id": "b0ab721c-85f3-49b8-ac06-e8c2efce5d69",
    "metadata": {},
    "outputs": [],
@@ -22,17 +22,26 @@
   },
   {
    "cell_type": "markdown",
+   "id": "7bbf2852",
    "metadata": {},
    "source": [
     "### Test CUDA"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
    "id": "e1d9dc6d-bf38-41b5-a44a-77aea0a355cf",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Using device: cuda\n"
+     ]
+    }
+   ],
    "source": [
     "import torch\n",
     "use_cuda = torch.cuda.is_available()\n",
@@ -42,14 +51,15 @@
   },
   {
    "cell_type": "markdown",
+   "id": "e9252eac",
    "metadata": {},
    "source": [
     "## Imports"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 4,
    "id": "c1afd2e2-297e-4f67-ac7c-24b1473d5e06",
    "metadata": {},
    "outputs": [],
@@ -69,6 +79,7 @@
   },
   {
    "cell_type": "markdown",
+   "id": "06512f0f",
    "metadata": {},
    "source": [
     "## Login to huggingface\n",
@@ -77,28 +88,72 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 5,
    "id": "3a1cc9bd-17ca-4754-b36f-1d67e194d205",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      " _| _| _| _| _|_|_| _|_|_| _|_|_| _| _| _|_|_| _|_|_|_| _|_| _|_|_| _|_|_|_|\n",
+      " _| _| _| _| _| _| _| _|_| _| _| _| _| _| _| _|\n",
+      " _|_|_|_| _| _| _| _|_| _| _|_| _| _| _| _| _| _|_| _|_|_| _|_|_|_| _| _|_|_|\n",
+      " _| _| _| _| _| _| _| _| _| _| _|_| _| _| _| _| _| _| _|\n",
+      " _| _| _|_| _|_|_| _|_|_| _|_|_| _| _| _|_|_| _| _| _| _|_|_| _|_|_|_|\n",
+      "\n",
+      " A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.\n",
+      " Setting a new token will erase the existing one.\n",
+      " To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .\n"
+     ]
+    },
+    {
+     "name": "stdin",
+     "output_type": "stream",
+     "text": [
+      "Token: ········\n",
+      "Add token as git credential? (Y/n) n\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Token is valid (permission: write).\n",
+      "Your token has been saved to /pc2/users/a/ashwin/.cache/huggingface/token\n",
+      "Login successful\n"
+     ]
+    }
+   ],
    "source": [
     "from huggingface_hub import interpreter_login\n",
     "interpreter_login()"
    ]
   },
   {
    "cell_type": "markdown",
+   "id": "e27647e5",
    "metadata": {},
    "source": [
     "## Load dataset "
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 6,
    "id": "c70ed7a5-0b8c-41d9-8ad6-80c36d6a10da",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Number of prompts: 15\n",
+      "Column names are: ['text']\n"
+     ]
+    }
+   ],
    "source": [
     "dataset_name = \"ashwinprasadme/typeevalpy_finetuning\"\n",
     "dataset = load_dataset(dataset_name, split=\"train\", token=True)\n",
@@ -109,14 +164,15 @@
   },
   {
    "cell_type": "markdown",
+   "id": "d4ad7e49",
    "metadata": {},
    "source": [
     "## Dataset preparation functions"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 7,
    "id": "6a4f3f26-6c38-4074-804b-caa4e66aa1d5",
    "metadata": {},
    "outputs": [],
@@ -175,14 +231,15 @@
   },
   {
    "cell_type": "markdown",
+   "id": "dec144c8",
    "metadata": {},
    "source": [
     "## Training functions"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 8,
    "id": "e7a80306-3bcc-4b18-837f-08b883bf98e7",
    "metadata": {},
    "outputs": [],
@@ -270,7 +327,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 9,
+   "id": "427c7dea",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -353,14 +411,16 @@
   },
   {
    "cell_type": "markdown",
+   "id": "001b7997",
    "metadata": {},
    "source": [
     "## Load model"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 10,
+   "id": "1eb2cd1a",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -392,44 +452,120 @@
    "execution_count": null,
    "id": "fbd5d2d4-8c64-4266-a9cf-2fdeac5df0d4",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Processing Model: codellama/CodeLlama-7b-Python-hf\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "ff429292453744558773a62add62d543",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "config.json: 0%| | 0.00/644 [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "9f6cbe7009a644489cbc1d387c41c4d9",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "model.safetensors.index.json: 0%| | 0.00/25.1k [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "627db9bae4134e45858c3ea37652089b",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading shards: 0%| | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "9d3a2cba3b3b42e0b723f63392766fae",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "model-00001-of-00002.safetensors: 0%| | 0.00/9.98G [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
    "source": [
     "# Load model from HF with user's token and with bitsandbytes config\n",
-    "output_dir_str = \"/scratch/hpc-prf-hdgen/ashwin/finetuned_models/ft_{model_name}\"\n",
-    "output_dir_merged_str = \"/scratch/hpc-prf-hdgen/ashwin/finetuned_models/ft_{model_name}_merged\"\n",
+    "output_dir_str = \"/scratch/hpc-prf-hdgen/ashwin/finetuned_models/ft_v1_{model_name}\"\n",
+    "output_dir_merged_str = \"/scratch/hpc-prf-hdgen/ashwin/finetuned_models/ft_v1_{model_name}_merged\"\n",
     "\n",
     "start_time = time.time()\n",
     "for model_name in models_list:\n",
-    "    print(f\"Processing Model: {model_name}\")\n",
-    "    # model_name = \"meta-llama/Llama-2-7b-hf\" \n",
-    "    bnb_config = create_bnb_config()\n",
-    "    model, tokenizer = load_model(model_name, bnb_config)\n",
-    "\n",
-    "    # Preprocess dataset\n",
-    "    print(\"Preprocess dataset\")\n",
-    "    max_length = get_max_length(model)\n",
-    "    dataset = preprocess_dataset(tokenizer, max_length, 0, dataset)\n",
-    "\n",
-    "    # Start training\n",
-    "    print(\"Start training\")\n",
-    "    output_dir = output_dir_str.format(model_name=model_name)\n",
-    "    train(model, tokenizer, dataset, output_dir)\n",
-    "\n",
-    "    # Save and Merge Model\n",
-    "    print(\"Save and Merge Model\")\n",
-    "    model = AutoPeftModelForCausalLM.from_pretrained(output_dir, device_map=\"auto\", torch_dtype=torch.bfloat16)\n",
-    "    model = model.merge_and_unload()\n",
-    "\n",
-    "    output_merged_dir = output_dir_merged_str.format(model_name=model_name)\n",
-    "    os.makedirs(output_merged_dir, exist_ok=True)\n",
-    "    model.save_pretrained(output_merged_dir, safe_serialization=True)\n",
-    "\n",
-    "    # save tokenizer for easy inference\n",
-    "    tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
-    "    tokenizer.save_pretrained(output_merged_dir)\n",
+    "    try:\n",
+    "        print(f\"Processing Model: {model_name}\")\n",
+    "        # model_name = \"meta-llama/Llama-2-7b-hf\" \n",
+    "        bnb_config = create_bnb_config()\n",
+    "        model, tokenizer = load_model(model_name, bnb_config)\n",
+    " \n",
+    "        # Preprocess dataset\n",
+    "        print(\"Preprocess dataset\")\n",
+    "        max_length = get_max_length(model)\n",
+    "        dataset = preprocess_dataset(tokenizer, max_length, 0, dataset)\n",
+    " \n",
+    "        # Start training\n",
+    "        print(\"Start training\")\n",
+    "        output_dir = output_dir_str.format(model_name=model_name.split(\"/\")[1])\n",
+    "        train(model, tokenizer, dataset, output_dir)\n",
+    " \n",
+    "        # Save and Merge Model\n",
+    "        print(\"Save and Merge Model\")\n",
+    "        model = AutoPeftModelForCausalLM.from_pretrained(output_dir, device_map=\"auto\", torch_dtype=torch.bfloat16)\n",
+    "        model = model.merge_and_unload()\n",
+    " \n",
+    "        output_merged_dir = output_dir_merged_str.format(model_name=model_name.split(\"/\")[1])\n",
+    "        os.makedirs(output_merged_dir, exist_ok=True)\n",
+    "        model.save_pretrained(output_merged_dir, safe_serialization=True)\n",
+    " \n",
+    "        # save tokenizer for easy inference\n",
+    "        tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
+    "        tokenizer.save_pretrained(output_merged_dir)\n",
+    "    except Exception as e:\n",
+    "        print(f\"Error training: {model_name}\")\n",
+    "        print(e)\n",
     "\n",
     "print(f\"DONE! Took{time.time()-start_time}\")"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cfedb7a2-9a60-498a-bfed-86690ea6eb6b",
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
 "metadata": {
