diff --git a/IndianLawLLM.ipynb b/IndianLawLLM.ipynb
new file mode 100644
index 00000000..b8267056
--- /dev/null
+++ b/IndianLawLLM.ipynb
@@ -0,0 +1,819 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "57b3de00-61f6-418a-b837-57a207ee1c2c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip install langchain -q --user"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "c8c66634-87ee-4799-a29a-e9505b1edc4b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip install transformers -q --user"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "0064e947-4ded-4bce-925b-66042f25f063",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip install pypdf -q --user"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "367609ba-2daf-458d-9e31-b0bfc3b96f56",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip install sentence-transformers -q --user"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "4ea8f326-4535-41df-9c04-8b094c43072b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip install chromadb -q --user"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "id": "5102e563-bbd0-4cac-9c58-a4a64ba82e8b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip install google-cloud -q --user"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "42f71c23-935e-44a0-894a-34af021f5955",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Requirement already satisfied: intel_extension_for_pytorch in /home/ub9a04e0e61464b0670ffe393523c081/.local/lib/python3.9/site-packages (2.1.0)\n",
+      "Requirement already satisfied: psutil in /opt/intel/oneapi/intelpython/python3.9/lib/python3.9/site-packages (from intel_extension_for_pytorch) (5.9.0)\n",
+      "Requirement already satisfied: numpy in /opt/intel/oneapi/intelpython/python3.9/lib/python3.9/site-packages (from intel_extension_for_pytorch) (1.24.3)\n",
+      "Requirement already satisfied: packaging in /opt/intel/oneapi/intelpython/python3.9/lib/python3.9/site-packages (from intel_extension_for_pytorch) (23.1)\n"
+     ]
+    }
+   ],
+   "source": [
+    "!pip install intel_extension_for_pytorch --user"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "b1261631-421a-4df5-85ce-c6b1b64f642d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import logging\n",
+    "import os\n",
+    "import random\n",
+    "import re"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "id": "8132a8ff-2c53-4ed7-ab16-afc1695c1ca3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import warnings\n",
+    "\n",
+    "# Suppress warnings for a cleaner output\n",
+    "warnings.filterwarnings(\"ignore\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "id": "5b1fe6e1-fb17-40ba-9cd4-ccdf4905dc38",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import torch"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "id": "7bd92eb1-f9ed-4518-a536-95654b75f92e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import intel_extension_for_pytorch as ipex"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "id": "6e6c20bb-7299-4057-9b7e-2611a12d6105",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Import necessary modules and classes\n",
+    "from langchain.vectorstores import Chroma\n",
+    "from transformers import AutoTokenizer, AutoModelForSeq2SeqLM\n",
+    "from transformers import pipeline\n",
+    "from langchain.llms import HuggingFacePipeline\n",
+    "from langchain.embeddings import SentenceTransformerEmbeddings\n",
+    "from langchain.chains import RetrievalQA\n",
+    "from langchain.document_loaders import PyPDFLoader, DirectoryLoader\n",
+    "from langchain.text_splitter import RecursiveCharacterTextSplitter"
+   ]
+  },
HuggingFacePipeline\n", + "from langchain.embeddings import SentenceTransformerEmbeddings\n", + "from langchain.chains import RetrievalQA\n", + "from langchain.document_loaders import PyPDFLoader, DirectoryLoader\n", + "from langchain.text_splitter import RecursiveCharacterTextSplitter" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "7a389773-65bb-4318-9d17-b69d60459c38", + "metadata": {}, + "outputs": [], + "source": [ + "def get_device_type():\n", + " # random seed\n", + " if torch.cuda.is_available():\n", + " print(\"GPU is available\")\n", + " seed = 88\n", + " random.seed(seed)\n", + " torch.cuda.manual_seed(seed)\n", + " torch.cuda.manual_seed_all(seed)\n", + " return torch.device(\"gpu\")\n", + " elif torch.xpu.is_available():\n", + " print(\"GPU is not available\")\n", + " print(\"XPU is available\")\n", + " seed = 88\n", + " random.seed(seed)\n", + " torch.xpu.manual_seed(seed)\n", + " torch.xpu.manual_seed_all(seed)\n", + " return torch.device(\"xpu\")\n", + " else:\n", + " print(\"XPU or GPU not available - returning with CPU\")\n", + " return torch.device(\"cpu\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "447e72b8-c930-4e88-a823-4ebb1575d06a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "XPU or GPU not available - returning with CPU\n", + "cpu\n" + ] + } + ], + "source": [ + "device_type = get_device_type()\n", + "print(device_type)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "d4fad5a3-cd22-430f-8534-20281ca40b3b", + "metadata": {}, + "outputs": [], + "source": [ + "MODEL_CACHE_PATH = \"/home/common/data/Big_Data/GenAI/llm_models\"\n", + "\n", + "class ChatBotModel:\n", + " \"\"\"\n", + " ChatBotModel is a class for generating responses based on text prompts using a pretrained model.\n", + "\n", + " Attributes:\n", + " - model_id_or_path: model Id for text generation. Default is \"\"MBZUAI/LaMini-Flan-T5-783M\"\"\n", + " - torch_dtype: The data type to use in the model.\n", + " - optimize : If True Intel Optimizer for pytorch is used.\n", + " \"\"\"\n", + "\n", + " def __init__(\n", + " self,\n", + " model_id_or_path: str = \"MBZUAI/LaMini-Flan-T5-783M\", \n", + " torch_dtype: torch.dtype = torch.bfloat16,\n", + " optimize: bool = True,\n", + " ) -> None:\n", + " \"\"\"\n", + " The initializer for ChatBotModel class.\n", + "\n", + " Parameters:\n", + " - model_id_or_path: The identifier or path of the pretrained model.\n", + " - torch_dtype: The data type to use in the model. 
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fc526da0-e777-4396-b219-c7512de7490b",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "markdown",
+   "id": "962394e6-5b4d-4e2b-86dd-f4cd3b611568",
+   "metadata": {},
+   "source": [
+    "# Getting Data From PDFs - Save in Vector DB"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "id": "78f669e5-3be9-4e34-bb89-607018c24f5c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Define the directory where the Chroma database will persist its data\n",
+    "persist_directory = \"chroma-db\"\n",
+    "\n",
+    "# Make sure the directory exists; create it if it doesn't\n",
+    "if not os.path.exists(persist_directory):\n",
+    "    os.makedirs(persist_directory)\n",
+    "\n",
+    "# Initialize a directory loader to load PDF documents from a directory\n",
+    "loader = DirectoryLoader(\"law_data\", glob=\"./*.pdf\", loader_cls=PyPDFLoader)\n",
+    "documents = loader.load()\n",
+    "\n",
+    "# Initialize a text splitter to split documents into smaller chunks\n",
+    "text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=200)\n",
+    "\n",
+    "# Split the loaded documents into chunks\n",
+    "texts = text_splitter.split_documents(documents)\n"
+   ]
+  },
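+  {
+   "cell_type": "markdown",
+   "id": "sketch-chunk-check-md",
+   "metadata": {},
+   "source": [
+    "A quick sanity check on the split (illustrative addition, not from the original notebook):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "sketch-chunk-check",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Illustrative check: confirm how many chunks were produced and preview one.\n",
+    "print(f\"{len(documents)} pages loaded, {len(texts)} chunks created\")\n",
+    "print(texts[0].page_content[:200])"
+   ]
+  },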
+  {
+   "cell_type": "code",
+   "execution_count": 32,
+   "id": "eb5e7dc1-28f7-40ba-9fd4-2e3b15609a32",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2023-12-06 09:57:35,950 - sentence_transformers.SentenceTransformer - INFO - Load pretrained SentenceTransformer: multi-qa-mpnet-base-dot-v1\n",
+      "2023-12-06 09:57:38,427 - sentence_transformers.SentenceTransformer - INFO - Use pytorch device: cpu\n",
+      "2023-12-06 09:57:38,764 - chromadb.telemetry.product.posthog - INFO - Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.\n"
+     ]
+    },
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "2380f0d843b34cad85fa162353128cb7",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Batches: 0%| | 0/474 [00:00
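+  {
+   "cell_type": "markdown",
+   "id": "sketch-retrieval-qa-md",
+   "metadata": {},
+   "source": [
+    "The cell above is cut off before its source; it presumably builds the Chroma store from `texts` using the `multi-qa-mpnet-base-dot-v1` embedder reported in its logs. A minimal sketch of that step plus the downstream retrieval chain (every name, parameter, and query below is an assumption, not the notebook's actual code):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "sketch-retrieval-qa",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Hedged sketch of the likely next steps; all names below are assumptions.\n",
+    "# The embedding model matches the one reported in the logs above.\n",
+    "embeddings = SentenceTransformerEmbeddings(model_name=\"multi-qa-mpnet-base-dot-v1\")\n",
+    "db = Chroma.from_documents(texts, embeddings, persist_directory=persist_directory)\n",
+    "\n",
+    "# Build a RetrievalQA chain on top of the LangChain LLM sketched earlier.\n",
+    "qa_chain = RetrievalQA.from_chain_type(\n",
+    "    llm=llm,\n",
+    "    chain_type=\"stuff\",\n",
+    "    retriever=db.as_retriever(search_kwargs={\"k\": 3}),\n",
+    ")\n",
+    "print(qa_chain.run(\"What are the key provisions of the Indian Contract Act?\"))"
+   ]
+  },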