
Commit ddd63a9

Minor error handling
1 parent 49322c9 commit ddd63a9

File tree

1 file changed (+179, -43 lines)

src/target_tools/ollama/src/fine_tuning/llama_fine_tuning.ipynb

Lines changed: 179 additions & 43 deletions
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "id": "ae5c6dc9-d246-4f47-a4c0-c1c07da0b901",
    "metadata": {},
    "outputs": [],
@@ -12,7 +12,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "id": "b0ab721c-85f3-49b8-ac06-e8c2efce5d69",
    "metadata": {},
    "outputs": [],
@@ -22,17 +22,26 @@
   },
   {
    "cell_type": "markdown",
+   "id": "7bbf2852",
    "metadata": {},
    "source": [
     "### Test CUDA"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
    "id": "e1d9dc6d-bf38-41b5-a44a-77aea0a355cf",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Using device: cuda\n"
+     ]
+    }
+   ],
    "source": [
     "import torch\n",
     "use_cuda = torch.cuda.is_available()\n",
@@ -42,14 +51,15 @@
   },
   {
    "cell_type": "markdown",
+   "id": "e9252eac",
    "metadata": {},
    "source": [
     "## Imports"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 4,
    "id": "c1afd2e2-297e-4f67-ac7c-24b1473d5e06",
    "metadata": {},
    "outputs": [],
@@ -69,6 +79,7 @@
   },
   {
    "cell_type": "markdown",
+   "id": "06512f0f",
    "metadata": {},
    "source": [
     "## Login to huggingface\n",
@@ -77,28 +88,72 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 5,
    "id": "3a1cc9bd-17ca-4754-b36f-1d67e194d205",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      " _| _| _| _| _|_|_| _|_|_| _|_|_| _| _| _|_|_| _|_|_|_| _|_| _|_|_| _|_|_|_|\n",
+      " _| _| _| _| _| _| _| _|_| _| _| _| _| _| _| _|\n",
+      " _|_|_|_| _| _| _| _|_| _| _|_| _| _| _| _| _| _|_| _|_|_| _|_|_|_| _| _|_|_|\n",
+      " _| _| _| _| _| _| _| _| _| _| _|_| _| _| _| _| _| _| _|\n",
+      " _| _| _|_| _|_|_| _|_|_| _|_|_| _| _| _|_|_| _| _| _| _|_|_| _|_|_|_|\n",
+      "\n",
+      " A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.\n",
+      " Setting a new token will erase the existing one.\n",
+      " To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .\n"
+     ]
+    },
+    {
+     "name": "stdin",
+     "output_type": "stream",
+     "text": [
+      "Token: ········\n",
+      "Add token as git credential? (Y/n) n\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Token is valid (permission: write).\n",
+      "Your token has been saved to /pc2/users/a/ashwin/.cache/huggingface/token\n",
+      "Login successful\n"
+     ]
+    }
+   ],
    "source": [
     "from huggingface_hub import interpreter_login\n",
     "interpreter_login()"
    ]
   },
   {
    "cell_type": "markdown",
+   "id": "e27647e5",
    "metadata": {},
    "source": [
     "## Load dataset "
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 6,
    "id": "c70ed7a5-0b8c-41d9-8ad6-80c36d6a10da",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Number of prompts: 15\n",
+      "Column names are: ['text']\n"
+     ]
+    }
+   ],
    "source": [
     "dataset_name = \"ashwinprasadme/typeevalpy_finetuning\"\n",
     "dataset = load_dataset(dataset_name, split=\"train\", token=True)\n",
@@ -109,14 +164,15 @@
   },
   {
    "cell_type": "markdown",
+   "id": "d4ad7e49",
    "metadata": {},
    "source": [
     "## Dataset preparation functions"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 7,
    "id": "6a4f3f26-6c38-4074-804b-caa4e66aa1d5",
    "metadata": {},
    "outputs": [],
@@ -175,14 +231,15 @@
   },
   {
    "cell_type": "markdown",
+   "id": "dec144c8",
    "metadata": {},
    "source": [
     "## Training functions"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 8,
    "id": "e7a80306-3bcc-4b18-837f-08b883bf98e7",
    "metadata": {},
    "outputs": [],
@@ -270,7 +327,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 9,
+   "id": "427c7dea",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -353,14 +411,16 @@
   },
   {
    "cell_type": "markdown",
+   "id": "001b7997",
    "metadata": {},
    "source": [
     "## Load model"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 10,
+   "id": "1eb2cd1a",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -392,44 +452,120 @@
    "execution_count": null,
    "id": "fbd5d2d4-8c64-4266-a9cf-2fdeac5df0d4",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Processing Model: codellama/CodeLlama-7b-Python-hf\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "ff429292453744558773a62add62d543",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "config.json: 0%| | 0.00/644 [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "9f6cbe7009a644489cbc1d387c41c4d9",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "model.safetensors.index.json: 0%| | 0.00/25.1k [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "627db9bae4134e45858c3ea37652089b",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Downloading shards: 0%| | 0/2 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "9d3a2cba3b3b42e0b723f63392766fae",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "model-00001-of-00002.safetensors: 0%| | 0.00/9.98G [00:00<?, ?B/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    }
+   ],
    "source": [
     "# Load model from HF with user's token and with bitsandbytes config\n",
-    "output_dir_str = \"/scratch/hpc-prf-hdgen/ashwin/finetuned_models/ft_{model_name}\"\n",
-    "output_dir_merged_str = \"/scratch/hpc-prf-hdgen/ashwin/finetuned_models/ft_{model_name}_merged\"\n",
+    "output_dir_str = \"/scratch/hpc-prf-hdgen/ashwin/finetuned_models/ft_v1_{model_name}\"\n",
+    "output_dir_merged_str = \"/scratch/hpc-prf-hdgen/ashwin/finetuned_models/ft_v1_{model_name}_merged\"\n",
     "\n",
     "start_time = time.time()\n",
     "for model_name in models_list:\n",
-    "    print(f\"Processing Model: {model_name}\")\n",
-    "    # model_name = \"meta-llama/Llama-2-7b-hf\" \n",
-    "    bnb_config = create_bnb_config()\n",
-    "    model, tokenizer = load_model(model_name, bnb_config)\n",
-    "\n",
-    "    # Preprocess dataset\n",
-    "    print(\"Preprocess dataset\")\n",
-    "    max_length = get_max_length(model)\n",
-    "    dataset = preprocess_dataset(tokenizer, max_length, 0, dataset)\n",
-    "\n",
-    "    # Start training\n",
-    "    print(\"Start training\")\n",
-    "    output_dir = output_dir_str.format(model_name=model_name)\n",
-    "    train(model, tokenizer, dataset, output_dir)\n",
-    "\n",
-    "    # Save and Merge Model\n",
-    "    print(\"Save and Merge Model\")\n",
-    "    model = AutoPeftModelForCausalLM.from_pretrained(output_dir, device_map=\"auto\", torch_dtype=torch.bfloat16)\n",
-    "    model = model.merge_and_unload()\n",
-    "\n",
-    "    output_merged_dir = output_dir_merged_str.format(model_name=model_name)\n",
-    "    os.makedirs(output_merged_dir, exist_ok=True)\n",
-    "    model.save_pretrained(output_merged_dir, safe_serialization=True)\n",
-    "\n",
-    "    # save tokenizer for easy inference\n",
-    "    tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
-    "    tokenizer.save_pretrained(output_merged_dir)\n",
+    "    try:\n",
+    "        print(f\"Processing Model: {model_name}\")\n",
+    "        # model_name = \"meta-llama/Llama-2-7b-hf\" \n",
+    "        bnb_config = create_bnb_config()\n",
+    "        model, tokenizer = load_model(model_name, bnb_config)\n",
+    " \n",
+    "        # Preprocess dataset\n",
+    "        print(\"Preprocess dataset\")\n",
+    "        max_length = get_max_length(model)\n",
+    "        dataset = preprocess_dataset(tokenizer, max_length, 0, dataset)\n",
+    " \n",
+    "        # Start training\n",
+    "        print(\"Start training\")\n",
+    "        output_dir = output_dir_str.format(model_name=model_name.split(\"/\")[1])\n",
+    "        train(model, tokenizer, dataset, output_dir)\n",
+    " \n",
+    "        # Save and Merge Model\n",
+    "        print(\"Save and Merge Model\")\n",
+    "        model = AutoPeftModelForCausalLM.from_pretrained(output_dir, device_map=\"auto\", torch_dtype=torch.bfloat16)\n",
+    "        model = model.merge_and_unload()\n",
+    " \n",
+    "        output_merged_dir = output_dir_merged_str.format(model_name=model_name.split(\"/\")[1])\n",
+    "        os.makedirs(output_merged_dir, exist_ok=True)\n",
+    "        model.save_pretrained(output_merged_dir, safe_serialization=True)\n",
+    " \n",
+    "        # save tokenizer for easy inference\n",
+    "        tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
+    "        tokenizer.save_pretrained(output_merged_dir)\n",
+    "    except Exception as e:\n",
+    "        print(f\"Error training: {model_name}\")\n",
+    "        print(e)\n",
     "\n",
     "print(f\"DONE! Took{time.time()-start_time}\")"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cfedb7a2-9a60-498a-bfed-86690ea6eb6b",
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
 "metadata": {
