From 8b555f4ec3cdef77b981cf110e1ae90218ba6cc9 Mon Sep 17 00:00:00 2001
From: HubGab-Git <97609337+HubGab-Git@users.noreply.github.com>
Date: Sun, 29 Sep 2024 16:21:13 +0200
Subject: [PATCH] #4725: Change model deployment to JumpStart

---
 ...estion_answering_langchain_jumpstart.ipynb | 522 +++++++++++-------
 1 file changed, 335 insertions(+), 187 deletions(-)

diff --git a/introduction_to_amazon_algorithms/jumpstart-foundation-models/question_answering_retrieval_augmented_generation/question_answering_langchain_jumpstart.ipynb b/introduction_to_amazon_algorithms/jumpstart-foundation-models/question_answering_retrieval_augmented_generation/question_answering_langchain_jumpstart.ipynb
index a3edd26314..e613850a04 100644
--- a/introduction_to_amazon_algorithms/jumpstart-foundation-models/question_answering_retrieval_augmented_generation/question_answering_langchain_jumpstart.ipynb
+++ b/introduction_to_amazon_algorithms/jumpstart-foundation-models/question_answering_retrieval_augmented_generation/question_answering_langchain_jumpstart.ipynb
@@ -43,7 +43,7 @@
 },
 {
  "cell_type": "code",
- "execution_count": null,
+ "execution_count": 3,
  "metadata": {
   "collapsed": false,
   "jupyter": {
@@ -57,65 +57,67 @@
    "outputs_hidden": false
   },
   "pycharm": {
    "name": "#%%\n"
   },
   "tags": []
  },
  "outputs": [],
  "source": [
   "!pip install --upgrade sagemaker --quiet\n",
-  "!pip install ipywidgets==7.0.0 --quiet\n",
+  "# !pip install ipywidgets==7.0.0 --quiet\n",
   "!pip install langchain==0.0.148 --quiet\n",
   "!pip install faiss-cpu --quiet"
  ]
 },
 {
  "cell_type": "code",
- "execution_count": null,
+ "execution_count": 4,
  "metadata": {
   "tags": []
  },
  "outputs": [],
  "source": [
-  "import time\n",
-  "import sagemaker, boto3, json\n",
-  "from sagemaker.session import Session\n",
-  "from sagemaker.model import Model\n",
-  "from sagemaker import image_uris, model_uris, script_uris, hyperparameters\n",
-  "from sagemaker.predictor import Predictor\n",
+  "# import time\n",
+  "# import sagemaker\n",
+  "# from sagemaker.session import Session\n",
+  "# from sagemaker.model import Model\n",
+  "# from sagemaker import image_uris, model_uris, script_uris, hyperparameters\n",
+  "# from sagemaker.predictor import Predictor\n",
+  "import boto3, json\n",
+  "from sagemaker import Session\n",
   "from sagemaker.utils import name_from_base\n",
   "from typing import Any, Dict, List, Optional\n",
   "from langchain.embeddings import SagemakerEndpointEmbeddings\n",
-  "from langchain.llms.sagemaker_endpoint import ContentHandlerBase\n",
+  "# from langchain.llms.sagemaker_endpoint import ContentHandlerBase\n",
+  "from sagemaker.jumpstart.model import JumpStartModel\n",
   "\n",
   "sagemaker_session = Session()\n",
-  "aws_role = sagemaker_session.get_caller_identity_arn()\n",
+  "# aws_role = sagemaker_session.get_caller_identity_arn()\n",
   "aws_region = boto3.Session().region_name\n",
-  "sess = sagemaker.Session()\n",
-  "model_version = \"1.*\""
+  "# sess = sagemaker.Session()\n",
+  "# model_version = \"1.*\""
  ]
 },
 {
  "cell_type": "code",
- "execution_count": null,
+ "execution_count": 5,
  "metadata": {
   "tags": []
  },
  "outputs": [],
  "source": [
-  "def query_endpoint_with_json_payload(encoded_json, endpoint_name, content_type=\"application/json\"):\n",
-  "    client = boto3.client(\"runtime.sagemaker\")\n",
-  "    response = client.invoke_endpoint(\n",
-  "        EndpointName=endpoint_name, ContentType=content_type, Body=encoded_json\n",
-  "    )\n",
-  "    return response\n",
-  "\n",
-  "\n",
-  "def parse_response_model_flan_t5(query_response):\n",
-  "    model_predictions = json.loads(query_response[\"Body\"].read())\n",
-  "    generated_text = model_predictions[\"generated_texts\"]\n",
-  "    return generated_text\n",
-  "\n",
-  "\n",
-  "def parse_response_multiple_texts_bloomz(query_response):\n",
-  "    generated_text = []\n",
-  "    model_predictions = json.loads(query_response[\"Body\"].read())\n",
-  "    for x in model_predictions[0]:\n",
-  "        generated_text.append(x[\"generated_text\"])\n",
-  "    return generated_text"
+  "# def query_endpoint_with_json_payload(encoded_json, endpoint_name, content_type=\"application/json\"):\n",
+  "#     client = boto3.client(\"runtime.sagemaker\")\n",
+  "#     response = client.invoke_endpoint(\n",
+  "#         EndpointName=endpoint_name, ContentType=content_type, Body=encoded_json\n",
+  "#     )\n",
+  "#     return response\n",
+  "\n",
+  "\n",
+  "# def parse_response_model_flan_t5(query_response):\n",
+  "#     model_predictions = json.loads(query_response[\"Body\"].read())\n",
+  "#     generated_text = model_predictions[\"generated_texts\"]\n",
+  "#     return generated_text\n",
+  "\n",
+  "\n",
+  "# def parse_response_multiple_texts_bloomz(query_response):\n",
+  "#     generated_text = []\n",
+  "#     model_predictions = json.loads(query_response[\"Body\"].read())\n",
+  "#     for x in model_predictions[0]:\n",
+  "#         generated_text.append(x[\"generated_text\"])\n",
+  "#     return generated_text"
  ]
 },
@@ -127,7 +130,7 @@
 },
 {
  "cell_type": "code",
- "execution_count": null,
+ "execution_count": 9,
  "metadata": {
   "tags": []
  },
@@ -135,74 +138,87 @@
  "source": [
   "_MODEL_CONFIG_ = {\n",
   "    \"huggingface-text2text-flan-t5-xxl\": {\n",
-  "        \"instance type\": \"ml.g5.12xlarge\",\n",
-  "        \"env\": {\"SAGEMAKER_MODEL_SERVER_WORKERS\": \"1\", \"TS_DEFAULT_WORKERS_PER_MODEL\": \"1\"},\n",
-  "        \"parse_function\": parse_response_model_flan_t5,\n",
-  "        \"prompt\": \"\"\"Answer based on context:\\n\\n{context}\\n\\n{question}\"\"\",\n",
+  "        \"model_version\": \"2.*\",\n",
+  "        \"instance type\": \"ml.g5.12xlarge\"\n",
   "    },\n",
-  "    \"huggingface-textembedding-gpt-j-6b\": {\n",
-  "        \"instance type\": \"ml.g5.24xlarge\",\n",
-  "        \"env\": {\"SAGEMAKER_MODEL_SERVER_WORKERS\": \"1\", \"TS_DEFAULT_WORKERS_PER_MODEL\": \"1\"},\n",
-  "    },\n",
-  "    # \"huggingface-textgeneration1-bloomz-7b1-fp16\": {\n",
-  "    #     \"instance type\": \"ml.g5.12xlarge\",\n",
-  "    #     \"env\": {},\n",
-  "    #     \"parse_function\": parse_response_multiple_texts_bloomz,\n",
-  "    #     \"prompt\": \"\"\"question: \\\"{question}\"\\\\n\\nContext: \\\"{context}\"\\\\n\\nAnswer:\"\"\",\n",
-  "    # },\n",
+  "    # The GPT-J embeddings model raised \"DeprecatedJumpStartModelError:\n",
+  "    # This model is no longer available. Please try another model.\"\n",
+  "    # at the time of testing, so all-MiniLM-L6-v2 is used instead.\n",
+  "    # \"huggingface-textembedding-gpt-j-6b\": {\n",
+  "    #     \"model_version\": \"1.*\",\n",
+  "    #     \"instance type\": \"ml.g5.24xlarge\"\n",
+  "    # },\n",
-  "    # \"huggingface-text2text-flan-ul2-bf16\": {\n",
-  "    #     \"instance type\": \"ml.g5.24xlarge\",\n",
-  "    #     \"env\": {\n",
-  "    #         \"SAGEMAKER_MODEL_SERVER_WORKERS\": \"1\",\n",
-  "    #         \"TS_DEFAULT_WORKERS_PER_MODEL\": \"1\"\n",
-  "    #     },\n",
-  "    #     \"parse_function\": parse_response_model_flan_t5,\n",
-  "    #     \"prompt\": \"\"\"Answer based on context:\\n\\n{context}\\n\\n{question}\"\"\",\n",
-  "    # },\n",
+  "    \"huggingface-textembedding-all-MiniLM-L6-v2\": {\n",
+  "        \"model_version\": \"1.*\",\n",
+  "        \"instance type\": \"ml.g5.24xlarge\"\n",
+  "    }\n",
+  "    # \"huggingface-textembedding-all-MiniLM-L6-v2\": {\n",
+  "    #     \"model_version\": \"3.*\",\n",
+  "    #     \"instance type\": \"ml.g5.12xlarge\"\n",
+  "    # },\n",
+  "    # \"huggingface-text2text-flan-ul2-bf16\": {\n",
+  "    #     \"model_version\": \"2.*\",\n",
+  "    #     \"instance type\": \"ml.g5.24xlarge\"\n",
+  "    # }\n",
   "}"
  ]
 },
 {
  "cell_type": "code",
- "execution_count": null,
+ "execution_count": 10,
  "metadata": {},
- "outputs": [],
+ "outputs": [
+  {
+   "name": "stdout",
+   "output_type": "stream",
+   "text": [
+    "Deploying huggingface-text2text-flan-t5-xxl...\n",
+    "---------!"
+   ]
+  },
+  {
+   "name": "stderr",
+   "output_type": "stream",
+   "text": [
+    "Using model 'huggingface-textembedding-all-MiniLM-L6-v2' with wildcard version identifier '1.*'. You can pin to version '1.0.0' for more stable results. Note that models may have different input/output signatures after a major version upgrade.\n"
+   ]
+  },
+  {
+   "name": "stdout",
+   "output_type": "stream",
+   "text": [
+    "Deployed endpoint: jumpstart-example-raglc-huggingface-tex-2024-09-29-13-08-36-631\n",
+    "Deploying huggingface-textembedding-all-MiniLM-L6-v2...\n",
+    "---------!Deployed endpoint: jumpstart-example-raglc-huggingface-tex-2024-09-29-13-13-39-117\n",
+    "Deployment process completed.\n"
+   ]
+  }
+ ],
  "source": [
-  "newline, bold, unbold = \"\\n\", \"\\033[1m\", \"\\033[0m\"\n",
-  "\n",
   "for model_id in _MODEL_CONFIG_:\n",
-  "    endpoint_name = name_from_base(f\"jumpstart-example-raglc-{model_id}\")\n",
-  "    inference_instance_type = _MODEL_CONFIG_[model_id][\"instance type\"]\n",
-  "\n",
-  "    # Retrieve the inference container uri. This is the base HuggingFace container image for the default model above.\n",
-  "    deploy_image_uri = image_uris.retrieve(\n",
-  "        region=None,\n",
-  "        framework=None,  # automatically inferred from model_id\n",
-  "        image_scope=\"inference\",\n",
+  "    endpoint_name = name_from_base(f'jumpstart-example-raglc-{model_id}')\n",
+  "    inference_instance_type = _MODEL_CONFIG_[model_id]['instance type']\n",
+  "    model_version = _MODEL_CONFIG_[model_id]['model_version']\n",
+  "\n",
+  "    print(f'Deploying {model_id}...')\n",
+  "\n",
+  "    model = JumpStartModel(\n",
   "        model_id=model_id,\n",
-  "        model_version=model_version,\n",
-  "        instance_type=inference_instance_type,\n",
-  "    )\n",
-  "    # Retrieve the model uri.\n",
-  "    model_uri = model_uris.retrieve(\n",
-  "        model_id=model_id, model_version=model_version, model_scope=\"inference\"\n",
+  "        model_version=model_version\n",
   "    )\n",
-  "    model_inference = Model(\n",
-  "        image_uri=deploy_image_uri,\n",
-  "        model_data=model_uri,\n",
-  "        role=aws_role,\n",
-  "        predictor_cls=Predictor,\n",
-  "        name=endpoint_name,\n",
-  "        env=_MODEL_CONFIG_[model_id][\"env\"],\n",
-  "    )\n",
-  "    model_predictor_inference = model_inference.deploy(\n",
-  "        initial_instance_count=1,\n",
-  "        instance_type=inference_instance_type,\n",
-  "        predictor_cls=Predictor,\n",
-  "        endpoint_name=endpoint_name,\n",
-  "    )\n",
-  "    print(f\"{bold}Model {model_id} has been deployed successfully.{unbold}{newline}\")\n",
-  "    _MODEL_CONFIG_[model_id][\"endpoint_name\"] = endpoint_name"
+  "\n",
+  "    try:\n",
+  "        predictor = model.deploy(\n",
+  "            initial_instance_count=1,\n",
+  "            instance_type=inference_instance_type,\n",
+  "            endpoint_name=endpoint_name  # reuse the name generated above\n",
+  "        )\n",
+  "        print(f\"Deployed endpoint: {predictor.endpoint_name}\")\n",
+  "        _MODEL_CONFIG_[model_id]['predictor'] = predictor\n",
+  "    except Exception as e:\n",
+  "        print(f\"Error deploying {model_id}: {str(e)}\")\n",
+  "\n",
+  "print(\"Deployment process completed.\")"
  ]
 },
@@ -216,7 +234,7 @@
 },
 {
  "cell_type": "code",
- "execution_count": null,
+ "execution_count": 11,
  "metadata": {},
  "outputs": [],
  "source": [
@@ -225,30 +243,31 @@
 },
 {
  "cell_type": "code",
- "execution_count": null,
+ "execution_count": 12,
  "metadata": {},
- "outputs": [],
+ "outputs": [
+  {
+   "name": "stdout",
+   "output_type": "stream",
+   "text": [
+    "For model: huggingface-text2text-flan-t5-xxl, the generated output is:\n",
+    "\n",
+    "ARM 64-bit instances of the SageMaker server.\n",
+    "\n"
+   ]
+  }
+ ],
  "source": [
-  "payload = {\n",
-  "    \"text_inputs\": question,\n",
-  "    \"max_length\": 100,\n",
-  "    \"num_return_sequences\": 1,\n",
-  "    \"top_k\": 50,\n",
-  "    \"top_p\": 0.95,\n",
-  "    \"do_sample\": True,\n",
-  "}\n",
-  "\n",
   "list_of_LLMs = list(_MODEL_CONFIG_.keys())\n",
-  "list_of_LLMs.remove(\"huggingface-textembedding-gpt-j-6b\")  # remove the embedding model\n",
-  "\n",
+  "list_of_LLMs = [model for model in list_of_LLMs if \"textembedding\" not in model]\n",
   "\n",
   "for model_id in list_of_LLMs:\n",
-  "    endpoint_name = _MODEL_CONFIG_[model_id][\"endpoint_name\"]\n",
-  "    query_response = query_endpoint_with_json_payload(\n",
-  "        json.dumps(payload).encode(\"utf-8\"), endpoint_name=endpoint_name\n",
-  "    )\n",
-  "    generated_texts = _MODEL_CONFIG_[model_id][\"parse_function\"](query_response)\n",
-  "    print(f\"For model: {model_id}, the generated output is: {generated_texts[0]}\\n\")"
+  "    predictor = _MODEL_CONFIG_[model_id][\"predictor\"]\n",
+  "    response = predictor.predict({\n",
+  "        \"inputs\": question\n",
+  "    })\n",
" print(f\"For model: {model_id}, the generated output is:\\n\")\n", + " print(f\"{response[0]['generated_text']}\\n\")" ] }, { @@ -270,7 +289,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -279,35 +298,39 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "For model: huggingface-text2text-flan-t5-xxl, the generated output is:\n", + "\n", + "all instances\n", + "\n" + ] + } + ], "source": [ - "parameters = {\n", - " \"max_length\": 200,\n", - " \"num_return_sequences\": 1,\n", - " \"top_k\": 250,\n", - " \"top_p\": 0.95,\n", - " \"do_sample\": False,\n", - " \"temperature\": 1,\n", - "}\n", - "\n", - "for model_id in list_of_LLMs:\n", - " endpoint_name = _MODEL_CONFIG_[model_id][\"endpoint_name\"]\n", + "# parameters = {\n", + "# \"max_length\": 200,\n", + "# \"num_return_sequences\": 1,\n", + "# \"top_k\": 250,\n", + "# \"top_p\": 0.95,\n", + "# \"do_sample\": False,\n", + "# \"temperature\": 1,\n", + "# }\n", "\n", - " prompt = _MODEL_CONFIG_[model_id][\"prompt\"]\n", + "prompt = f'Answer based on context:\\n\\n{context}\\n\\n{question}'\n", "\n", - " text_input = prompt.replace(\"{context}\", context)\n", - " text_input = text_input.replace(\"{question}\", question)\n", - " payload = {\"text_inputs\": text_input, **parameters}\n", - "\n", - " query_response = query_endpoint_with_json_payload(\n", - " json.dumps(payload).encode(\"utf-8\"), endpoint_name=endpoint_name\n", - " )\n", - " generated_texts = _MODEL_CONFIG_[model_id][\"parse_function\"](query_response)\n", - " print(\n", - " f\"{bold}For model: {model_id}, the generated output is: {generated_texts[0]}{unbold}{newline}\"\n", - " )" + "for model_id in list_of_LLMs:\n", + " predictor = _MODEL_CONFIG_[model_id][\"predictor\"]\n", + " response = predictor.predict({\n", + " \"inputs\": prompt\n", + " })\n", + " print(f\"For model: {model_id}, the generated output is:\\n\")\n", + " print(f\"{response[0]['generated_text']}\\n\")" ] }, { @@ -358,7 +381,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 34, "metadata": { "tags": [] }, @@ -405,9 +428,12 @@ "\n", "\n", "content_handler = ContentHandler()\n", + "endpoint_name=_MODEL_CONFIG_[\n", + " \"huggingface-textembedding-all-MiniLM-L6-v2\"\n", + " ][\"predictor\"].endpoint_name\n", "\n", "embeddings = SagemakerEndpointEmbeddingsJumpStart(\n", - " endpoint_name=_MODEL_CONFIG_[\"huggingface-textembedding-gpt-j-6b\"][\"endpoint_name\"],\n", + " endpoint_name=endpoint_name,\n", " region_name=aws_region,\n", " content_handler=content_handler,\n", ")" @@ -422,39 +448,33 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 50, "metadata": {}, "outputs": [], "source": [ "from langchain.llms.sagemaker_endpoint import LLMContentHandler, SagemakerEndpoint\n", "\n", - "parameters = {\n", - " \"max_length\": 200,\n", - " \"num_return_sequences\": 1,\n", - " \"top_k\": 250,\n", - " \"top_p\": 0.95,\n", - " \"do_sample\": False,\n", - " \"temperature\": 1,\n", - "}\n", - "\n", - "\n", "class ContentHandler(LLMContentHandler):\n", " content_type = \"application/json\"\n", " accepts = \"application/json\"\n", "\n", " def transform_input(self, prompt: str, model_kwargs={}) -> bytes:\n", - " input_str = json.dumps({\"text_inputs\": prompt, **model_kwargs})\n", + " input_str = json.dumps({\"inputs\": prompt, **model_kwargs})\n", " 
   "        return input_str.encode(\"utf-8\")\n",
   "\n",
   "    def transform_output(self, output: bytes) -> str:\n",
   "        response_json = json.loads(output.read().decode(\"utf-8\"))\n",
-  "        return response_json[\"generated_texts\"][0]\n",
+  "        return response_json[0][\"generated_text\"]\n",
   "\n",
   "\n",
   "content_handler = ContentHandler()\n",
+  "endpoint_name = _MODEL_CONFIG_[\n",
+  "    \"huggingface-text2text-flan-t5-xxl\"\n",
+  "][\"predictor\"].endpoint_name\n",
+  "\n",
   "\n",
   "sm_llm = SagemakerEndpoint(\n",
-  "    endpoint_name=_MODEL_CONFIG_[\"huggingface-text2text-flan-t5-xxl\"][\"endpoint_name\"],\n",
+  "    endpoint_name=endpoint_name,\n",
   "    region_name=aws_region,\n",
-  "    model_kwargs=parameters,\n",
+  "    model_kwargs={},  # the old parameters dict was removed; add generation kwargs here if needed\n",
   "    content_handler=content_handler,\n",
@@ -472,11 +492,19 @@
 },
 {
  "cell_type": "code",
- "execution_count": null,
+ "execution_count": 18,
  "metadata": {
   "tags": []
  },
- "outputs": [],
+ "outputs": [
+  {
+   "name": "stdout",
+   "output_type": "stream",
+   "text": [
+    "download: s3://jumpstart-cache-prod-us-east-2/training-datasets/Amazon_SageMaker_FAQs/Amazon_SageMaker_FAQs.csv to rag_data/Amazon_SageMaker_FAQs.csv\n"
+   ]
+  }
+ ],
  "source": [
   "original_data = \"s3://jumpstart-cache-prod-us-east-2/training-datasets/Amazon_SageMaker_FAQs/\"\n",
   "\n",
@@ -493,7 +521,7 @@
 },
 {
  "cell_type": "code",
- "execution_count": null,
+ "execution_count": 19,
  "metadata": {
   "tags": []
  },
@@ -521,7 +549,7 @@
 },
 {
  "cell_type": "code",
- "execution_count": null,
+ "execution_count": 20,
  "metadata": {
   "tags": []
  },
@@ -532,18 +560,81 @@
 },
 {
  "cell_type": "code",
- "execution_count": null,
+ "execution_count": 21,
  "metadata": {
   "tags": []
  },
- "outputs": [],
+ "outputs": [
+  {
+   "data": {
+    "text/html": [
\n", + " | Answer | \n", + "
---|---|
0 | \n", + "Amazon SageMaker is a fully managed service to... | \n", + "
1 | \n", + "For a list of the supported Amazon SageMaker A... | \n", + "
2 | \n", + "Amazon SageMaker is designed for high availabi... | \n", + "
3 | \n", + "Amazon SageMaker stores code in ML storage vol... | \n", + "
4 | \n", + "Amazon SageMaker ensures that ML model artifac... | \n", + "
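---

Reviewer note: the deployment pattern this patch introduces boils down to the short sketch below. It is illustrative rather than part of the patch; it assumes a recent `sagemaker` SDK (one that ships `sagemaker.jumpstart.model.JumpStartModel`), reuses the `huggingface-text2text-flan-t5-xxl` / `2.*` / `ml.g5.12xlarge` configuration from `_MODEL_CONFIG_`, and relies on the `2.*` text2text endpoints accepting `{"inputs": ...}` and returning `[{"generated_text": ...}]`, which is the same contract the updated `ContentHandler` classes assume.

```python
# Sketch (not part of the patch): deploy a JumpStart model, query it, clean up.
from sagemaker.jumpstart.model import JumpStartModel
from sagemaker.utils import name_from_base

# Model id, version, and instance type taken from _MODEL_CONFIG_ above.
model = JumpStartModel(
    model_id="huggingface-text2text-flan-t5-xxl",
    model_version="2.*",
)
predictor = model.deploy(
    initial_instance_count=1,
    instance_type="ml.g5.12xlarge",
    endpoint_name=name_from_base("jumpstart-example-raglc-flan-t5-xxl"),
)

# The endpoint takes {"inputs": ...} and answers [{"generated_text": ...}].
response = predictor.predict({"inputs": "What is Amazon SageMaker?"})
print(response[0]["generated_text"])

# Endpoints bill while running; delete the model and endpoint when done.
predictor.delete_model()
predictor.delete_endpoint()
```

Keeping the returned `Predictor` in `_MODEL_CONFIG_[model_id]['predictor']` (instead of only an endpoint name, as before) is what lets the later LangChain cells and this kind of cleanup stay a single attribute access away (`.endpoint_name`, `.delete_endpoint()`).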